From ed9007d0d219726db01f211e9c9ab72fbfe4ecb1 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Sun, 19 May 2024 04:29:11 -0700
Subject: [PATCH 01/60] Revert "[Bounds-Safety] Temporarily relax a
 `counted_by` attribute restriction on flexible array members"

Together with 0ec3b972e58bcbcdc1bebe1696ea37f2931287c3
breaks https://lab.llvm.org/buildbot/#/builders/5/builds/43403

Issue #92687

This reverts commit cef6387e52578366c2332275dad88b9953b55336.
---
 clang/include/clang/Basic/DiagnosticGroups.td   |  4 ----
 .../include/clang/Basic/DiagnosticSemaKinds.td  |  8 +-------
 clang/lib/Sema/SemaDeclAttr.cpp                 | 17 ++---------------
 clang/test/Sema/attr-counted-by-vla.c           |  9 +++------
 4 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 4fad4d1a0eca72..4cb4f3d999f7ab 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1447,10 +1447,6 @@ def FunctionMultiVersioning
 
 def NoDeref : DiagGroup<"noderef">;
 
-// -fbounds-safety and bounds annotation related warnings
-def BoundsSafetyCountedByEltTyUnknownSize :
-  DiagGroup<"bounds-safety-counted-by-elt-type-unknown-size">;
-
 // A group for cross translation unit static analysis related warnings.
 def CrossTU : DiagGroup<"ctu">;
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 1efa3af121c109..8e6596410c5d05 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6552,7 +6552,7 @@ def err_counted_by_attr_refer_to_union : Error<
 def note_flexible_array_counted_by_attr_field : Note<
   "field %0 declared here">;
 def err_counted_by_attr_pointee_unknown_size : Error<
-  "'counted_by' %select{cannot|should not}3 be applied to %select{"
+  "'counted_by' cannot be applied to %select{"
     "a pointer with pointee|" // pointer
     "an array with element}0" // array
   " of unknown size because %1 is %select{"
@@ -6561,14 +6561,8 @@ def err_counted_by_attr_pointee_unknown_size : Error<
     "a function type|"     // CountedByInvalidPointeeTypeKind::FUNCTION
     // CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER
     "a struct type with a flexible array member"
-    "%select{|. This will be an error in a future compiler version}3"
-    ""
   "}2">;
 
-def warn_counted_by_attr_elt_type_unknown_size :
-  Warning<err_counted_by_attr_pointee_unknown_size.Summary>,
-  InGroup<BoundsSafetyCountedByEltTyUnknownSize>;
-
 let CategoryName = "ARC Semantic Issue" in {
 
 // ARC-mode diagnostics.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e816ea3647a7c3..c8b71631076baf 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8687,7 +8687,6 @@ static bool CheckCountedByAttrOnField(
   // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
   // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
   // when `FieldTy->isArrayType()`.
-  bool ShouldWarn = false;
   if (PointeeTy->isIncompleteType()) {
     InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
   } else if (PointeeTy->isSizelessType()) {
@@ -8695,25 +8694,13 @@ static bool CheckCountedByAttrOnField(
   } else if (PointeeTy->isFunctionType()) {
     InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
   } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
-    if (FieldTy->isArrayType()) {
-      // This is a workaround for the Linux kernel that has already adopted
-      // `counted_by` on a FAM where the pointee is a struct with a FAM. This
-      // should be an error because computing the bounds of the array cannot be
-      // done correctly without manually traversing every struct object in the
-      // array at runtime. To allow the code to be built this error is
-      // downgraded to a warning.
-      ShouldWarn = true;
-    }
     InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
   }
 
   if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
-    unsigned DiagID = ShouldWarn
-                          ? diag::warn_counted_by_attr_elt_type_unknown_size
-                          : diag::err_counted_by_attr_pointee_unknown_size;
-    S.Diag(FD->getBeginLoc(), DiagID)
+    S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_pointee_unknown_size)
         << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
-        << (ShouldWarn ? 1 : 0) << FD->getSourceRange();
+        << FD->getSourceRange();
     return true;
   }
 
diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c
index b25f719f3b95ab..3de6bd55e2d8eb 100644
--- a/clang/test/Sema/attr-counted-by-vla.c
+++ b/clang/test/Sema/attr-counted-by-vla.c
@@ -173,24 +173,21 @@ struct has_annotated_VLA {
 
 struct buffer_of_structs_with_unnannotated_vla {
   int count;
-  // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
-  // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member}}
   struct has_unannotated_VLA Arr[] __counted_by(count);
 };
 
 
 struct buffer_of_structs_with_annotated_vla {
   int count;
-  // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
-  // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member}}
   struct has_annotated_VLA Arr[] __counted_by(count);
 };
 
 struct buffer_of_const_structs_with_annotated_vla {
   int count;
-  // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
   // Make sure the `const` qualifier is printed when printing the element type.
-  // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member}}
   const struct has_annotated_VLA Arr[] __counted_by(count);
 };
 

From 6447abe067c8088a5cc093fe872719374e174068 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Sun, 19 May 2024 04:30:22 -0700
Subject: [PATCH 02/60] Revert "[BoundsSafety] Allow 'counted_by' attribute on
 pointers in structs in C (#90786)"

Memory leak: https://lab.llvm.org/buildbot/#/builders/5/builds/43403

Issue #92687

This reverts commit 0ec3b972e58bcbcdc1bebe1696ea37f2931287c3.
---
 clang/docs/ReleaseNotes.rst                   |  21 +-
 clang/include/clang/AST/Type.h                |   1 -
 clang/include/clang/Basic/Attr.td             |   3 +-
 .../clang/Basic/DiagnosticSemaKinds.td        |  17 +-
 clang/include/clang/Parse/Parser.h            |   7 +-
 clang/include/clang/Sema/Sema.h               |   3 +-
 clang/lib/AST/Type.cpp                        |  10 -
 clang/lib/Parse/ParseDecl.cpp                 | 104 +------
 clang/lib/Parse/ParseObjc.cpp                 |  10 +-
 clang/lib/Sema/SemaDeclAttr.cpp               |  82 ++----
 clang/lib/Sema/SemaType.cpp                   |   6 +-
 clang/lib/Sema/TreeTransform.h                |   2 +-
 .../attr-counted-by-late-parsed-struct-ptrs.c |  45 ----
 clang/test/AST/attr-counted-by-struct-ptrs.c  | 117 --------
 .../Sema/attr-counted-by-late-parsed-off.c    |  26 --
 .../attr-counted-by-late-parsed-struct-ptrs.c | 254 ------------------
 ...tr-counted-by-struct-ptrs-sizeless-types.c |  17 --
 clang/test/Sema/attr-counted-by-struct-ptrs.c | 224 ---------------
 .../Sema/attr-counted-by-vla-sizeless-types.c |  11 -
 clang/test/Sema/attr-counted-by-vla.c         | 193 -------------
 clang/test/Sema/attr-counted-by.c             | 112 ++++++++
 21 files changed, 148 insertions(+), 1117 deletions(-)
 delete mode 100644 clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
 delete mode 100644 clang/test/AST/attr-counted-by-struct-ptrs.c
 delete mode 100644 clang/test/Sema/attr-counted-by-late-parsed-off.c
 delete mode 100644 clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
 delete mode 100644 clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
 delete mode 100644 clang/test/Sema/attr-counted-by-struct-ptrs.c
 delete mode 100644 clang/test/Sema/attr-counted-by-vla-sizeless-types.c
 delete mode 100644 clang/test/Sema/attr-counted-by-vla.c
 create mode 100644 clang/test/Sema/attr-counted-by.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2f83f5c6d54e9c..7af5869d21768d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -317,8 +317,7 @@ New Compiler Flags
 
 - ``-fexperimental-late-parse-attributes`` enables an experimental feature to
   allow late parsing certain attributes in specific contexts where they would
-  not normally be late parsed. Currently this allows late parsing the
-  `counted_by` attribute in C. See `Attribute Changes in Clang`_.
+  not normally be late parsed.
 
 - ``-fseparate-named-sections`` uses separate unique sections for global
   symbols in named special sections (i.e. symbols annotated with
@@ -407,24 +406,6 @@ Attribute Changes in Clang
 - The ``clspv_libclc_builtin`` attribute has been added to allow clspv
   (`OpenCL-C to Vulkan SPIR-V compiler <https://github.com/google/clspv>`_) to identify functions coming from libclc
   (`OpenCL-C builtin library <https://libclc.llvm.org>`_).
-- The ``counted_by`` attribute is now allowed on pointers that are members of a
-  struct in C.
-
-- The ``counted_by`` attribute can now be late parsed in C when
-  ``-fexperimental-late-parse-attributes`` is passed but only when attribute is
-  used in the declaration attribute position. This allows using the
-  attribute on existing code where it previously impossible to do so without
-  re-ordering struct field declarations would break ABI as shown below.
-
-  .. code-block:: c
-
-     struct BufferTy {
-       /* Refering to `count` requires late parsing */
-       char* buffer __counted_by(count);
-       /* Swapping `buffer` and `count` to avoid late parsing would break ABI */
-       size_t count;
-     };
-
 
 Improvements to Clang's diagnostics
 -----------------------------------
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index c7a8e785913b36..da3834f19ca044 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2515,7 +2515,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isRecordType() const;
   bool isClassType() const;
   bool isStructureType() const;
-  bool isStructureTypeWithFlexibleArrayMember() const;
   bool isObjCBoxableRecordType() const;
   bool isInterfaceType() const;
   bool isStructureOrClassType() const;
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 7a7721239a28fa..38ee8356583bef 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2229,8 +2229,7 @@ def TypeNullUnspecified : TypeAttr {
 def CountedBy : DeclOrTypeAttr {
   let Spellings = [Clang<"counted_by">];
   let Subjects = SubjectList<[Field], ErrorDiag>;
-  let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel", 1>];
-  let LateParsed = LateAttrParseExperimentalExt;
+  let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel">];
   let ParseArgumentsAsUnevaluated = 1;
   let Documentation = [CountedByDocs];
   let LangOpts = [COnly];
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 8e6596410c5d05..09b1874f9fddd7 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6533,10 +6533,8 @@ def warn_superclass_variable_sized_type_not_at_end : Warning<
 
 def err_flexible_array_count_not_in_same_struct : Error<
   "'counted_by' field %0 isn't within the same struct as the flexible array">;
-def err_counted_by_attr_not_on_ptr_or_flexible_array_member : Error<
-  "'counted_by' only applies to pointers or C99 flexible array members">;
-def err_counted_by_attr_on_array_not_flexible_array_member : Error<
-  "'counted_by' on arrays only applies to C99 flexible array members">;
+def err_counted_by_attr_not_on_flexible_array_member : Error<
+  "'counted_by' only applies to C99 flexible array members">;
 def err_counted_by_attr_refer_to_itself : Error<
   "'counted_by' cannot refer to the flexible array member %0">;
 def err_counted_by_must_be_in_structure : Error<
@@ -6551,17 +6549,6 @@ def err_counted_by_attr_refer_to_union : Error<
   "'counted_by' argument cannot refer to a union member">;
 def note_flexible_array_counted_by_attr_field : Note<
   "field %0 declared here">;
-def err_counted_by_attr_pointee_unknown_size : Error<
-  "'counted_by' cannot be applied to %select{"
-    "a pointer with pointee|" // pointer
-    "an array with element}0" // array
-  " of unknown size because %1 is %select{"
-    "an incomplete type|"  // CountedByInvalidPointeeTypeKind::INCOMPLETE
-    "a sizeless type|"     // CountedByInvalidPointeeTypeKind::SIZELESS
-    "a function type|"     // CountedByInvalidPointeeTypeKind::FUNCTION
-    // CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER
-    "a struct type with a flexible array member"
-  "}2">;
 
 let CategoryName = "ARC Semantic Issue" in {
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index af50164a8f93fc..1e796e828b10a7 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1645,8 +1645,6 @@ class Parser : public CodeCompletionHandler {
                                bool EnterScope, bool OnDefinition);
   void ParseLexedAttribute(LateParsedAttribute &LA,
                            bool EnterScope, bool OnDefinition);
-  void ParseLexedCAttribute(LateParsedAttribute &LA,
-                            ParsedAttributes *OutAttrs = nullptr);
   void ParseLexedMethodDeclarations(ParsingClass &Class);
   void ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM);
   void ParseLexedMethodDefs(ParsingClass &Class);
@@ -2533,8 +2531,7 @@ class Parser : public CodeCompletionHandler {
 
   void ParseStructDeclaration(
       ParsingDeclSpec &DS,
-      llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
-      LateParsedAttrList *LateFieldAttrs = nullptr);
+      llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback);
 
   DeclGroupPtrTy ParseTopLevelStmtDecl();
 
@@ -3112,8 +3109,6 @@ class Parser : public CodeCompletionHandler {
                                  SourceLocation ScopeLoc,
                                  ParsedAttr::Form Form);
 
-  void DistributeCLateParsedAttrs(Decl *Dcl, LateParsedAttrList *LateAttrs);
-
   void ParseBoundsAttribute(IdentifierInfo &AttrName,
                             SourceLocation AttrNameLoc, ParsedAttributes &Attrs,
                             IdentifierInfo *ScopeName, SourceLocation ScopeLoc,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d4d4a82525a020..b16a304960d3fc 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11396,8 +11396,7 @@ class Sema final : public SemaBase {
   QualType BuildMatrixType(QualType T, Expr *NumRows, Expr *NumColumns,
                            SourceLocation AttrLoc);
 
-  QualType BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
-                                                  Expr *CountExpr);
+  QualType BuildCountAttributedArrayType(QualType WrappedTy, Expr *CountExpr);
 
   QualType BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
                                  SourceLocation AttrLoc);
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index f69a8f80a63938..e31741cd44240d 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -632,16 +632,6 @@ bool Type::isStructureType() const {
   return false;
 }
 
-bool Type::isStructureTypeWithFlexibleArrayMember() const {
-  const auto *RT = getAs<RecordType>();
-  if (!RT)
-    return false;
-  const auto *Decl = RT->getDecl();
-  if (!Decl->isStruct())
-    return false;
-  return Decl->hasFlexibleArrayMember();
-}
-
 bool Type::isObjCBoxableRecordType() const {
   if (const auto *RT = getAs<RecordType>())
     return RT->getDecl()->hasAttr<ObjCBoxableAttr>();
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 8405b44685ae49..2ce8fa98089f6d 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -3288,19 +3288,6 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
   }
 }
 
-void Parser::DistributeCLateParsedAttrs(Decl *Dcl,
-                                        LateParsedAttrList *LateAttrs) {
-  assert(Dcl && "Dcl cannot be null");
-
-  if (!LateAttrs)
-    return;
-
-  for (auto *LateAttr : *LateAttrs) {
-    if (LateAttr->Decls.empty())
-      LateAttr->addDecl(Dcl);
-  }
-}
-
 /// Bounds attributes (e.g., counted_by):
 ///   AttrName '(' expression ')'
 void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
@@ -4838,14 +4825,13 @@ static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS,
 ///
 void Parser::ParseStructDeclaration(
     ParsingDeclSpec &DS,
-    llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
-    LateParsedAttrList *LateFieldAttrs) {
+    llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback) {
 
   if (Tok.is(tok::kw___extension__)) {
     // __extension__ silences extension warnings in the subexpression.
     ExtensionRAIIObject O(Diags);  // Use RAII to do this.
     ConsumeToken();
-    return ParseStructDeclaration(DS, FieldsCallback, LateFieldAttrs);
+    return ParseStructDeclaration(DS, FieldsCallback);
   }
 
   // Parse leading attributes.
@@ -4910,12 +4896,10 @@ void Parser::ParseStructDeclaration(
     }
 
     // If attributes exist after the declarator, parse them.
-    MaybeParseGNUAttributes(DeclaratorInfo.D, LateFieldAttrs);
+    MaybeParseGNUAttributes(DeclaratorInfo.D);
 
     // We're done with this declarator;  invoke the callback.
-    Decl *Field = FieldsCallback(DeclaratorInfo);
-    if (Field)
-      DistributeCLateParsedAttrs(Field, LateFieldAttrs);
+    FieldsCallback(DeclaratorInfo);
 
     // If we don't have a comma, it is either the end of the list (a ';')
     // or an error, bail out.
@@ -4926,69 +4910,6 @@ void Parser::ParseStructDeclaration(
   }
 }
 
-/// Finish parsing an attribute for which parsing was delayed.
-/// This will be called at the end of parsing a class declaration
-/// for each LateParsedAttribute. We consume the saved tokens and
-/// create an attribute with the arguments filled in. We add this
-/// to the Attribute list for the decl.
-void Parser::ParseLexedCAttribute(LateParsedAttribute &LA,
-                                  ParsedAttributes *OutAttrs) {
-  // Create a fake EOF so that attribute parsing won't go off the end of the
-  // attribute.
-  Token AttrEnd;
-  AttrEnd.startToken();
-  AttrEnd.setKind(tok::eof);
-  AttrEnd.setLocation(Tok.getLocation());
-  AttrEnd.setEofData(LA.Toks.data());
-  LA.Toks.push_back(AttrEnd);
-
-  // Append the current token at the end of the new token stream so that it
-  // doesn't get lost.
-  LA.Toks.push_back(Tok);
-  PP.EnterTokenStream(LA.Toks, /*DisableMacroExpansion=*/true,
-                      /*IsReinject=*/true);
-  // Drop the current token and bring the first cached one. It's the same token
-  // as when we entered this function.
-  ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
-
-  ParsedAttributes Attrs(AttrFactory);
-
-  assert(LA.Decls.size() <= 1 &&
-         "late field attribute expects to have at most one declaration.");
-
-  // Dispatch based on the attribute and parse it
-  const AttributeCommonInfo::Form ParsedForm = ParsedAttr::Form::GNU();
-  IdentifierInfo *ScopeName = nullptr;
-  const ParsedAttr::Kind AttrKind =
-      ParsedAttr::getParsedKind(&LA.AttrName, /*ScopeName=*/ScopeName,
-                                /*SyntaxUsed=*/ParsedForm.getSyntax());
-  switch (AttrKind) {
-  case ParsedAttr::Kind::AT_CountedBy:
-    ParseBoundsAttribute(LA.AttrName, LA.AttrNameLoc, Attrs,
-                         /*ScopeName=*/ScopeName, SourceLocation(),
-                         /*Form=*/ParsedForm);
-    break;
-  default:
-    llvm_unreachable("Unhandled late parsed attribute");
-  }
-
-  for (auto *D : LA.Decls)
-    Actions.ActOnFinishDelayedAttribute(getCurScope(), D, Attrs);
-
-  // Due to a parsing error, we either went over the cached tokens or
-  // there are still cached tokens left, so we skip the leftover tokens.
-  while (Tok.isNot(tok::eof))
-    ConsumeAnyToken();
-
-  // Consume the fake EOF token if it's there
-  if (Tok.is(tok::eof) && Tok.getEofData() == AttrEnd.getEofData())
-    ConsumeAnyToken();
-
-  if (OutAttrs) {
-    OutAttrs->takeAllFrom(Attrs);
-  }
-}
-
 /// ParseStructUnionBody
 ///       struct-contents:
 ///         struct-declaration-list
@@ -5012,11 +4933,6 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
   ParseScope StructScope(this, Scope::ClassScope|Scope::DeclScope);
   Actions.ActOnTagStartDefinition(getCurScope(), TagDecl);
 
-  // `LateAttrParseExperimentalExtOnly=true` requests that only attributes
-  // marked with `LateAttrParseExperimentalExt` are late parsed.
-  LateParsedAttrList LateFieldAttrs(/*PSoon=*/false,
-                                    /*LateAttrParseExperimentalExtOnly=*/true);
-
   // While we still have something to read, read the declarations in the struct.
   while (!tryParseMisplacedModuleImport() && Tok.isNot(tok::r_brace) &&
          Tok.isNot(tok::eof)) {
@@ -5067,19 +4983,18 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
     }
 
     if (!Tok.is(tok::at)) {
-      auto CFieldCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
+      auto CFieldCallback = [&](ParsingFieldDeclarator &FD) {
         // Install the declarator into the current TagDecl.
         Decl *Field =
             Actions.ActOnField(getCurScope(), TagDecl,
                                FD.D.getDeclSpec().getSourceRange().getBegin(),
                                FD.D, FD.BitfieldSize);
         FD.complete(Field);
-        return Field;
       };
 
       // Parse all the comma separated declarators.
       ParsingDeclSpec DS(*this);
-      ParseStructDeclaration(DS, CFieldCallback, &LateFieldAttrs);
+      ParseStructDeclaration(DS, CFieldCallback);
     } else { // Handle @defs
       ConsumeToken();
       if (!Tok.isObjCAtKeyword(tok::objc_defs)) {
@@ -5120,12 +5035,7 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
 
   ParsedAttributes attrs(AttrFactory);
   // If attributes exist after struct contents, parse them.
-  MaybeParseGNUAttributes(attrs, &LateFieldAttrs);
-
-  // Late parse field attributes if necessary.
-  assert(!getLangOpts().CPlusPlus);
-  for (auto *LateAttr : LateFieldAttrs)
-    ParseLexedCAttribute(*LateAttr);
+  MaybeParseGNUAttributes(attrs);
 
   SmallVector<Decl *, 32> FieldDecls(TagDecl->fields());
 
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 6a2088a73c55b9..89f4acbd25e493 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -780,16 +780,16 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
       }
 
       bool addedToDeclSpec = false;
-      auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
+      auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) {
         if (FD.D.getIdentifier() == nullptr) {
           Diag(AtLoc, diag::err_objc_property_requires_field_name)
               << FD.D.getSourceRange();
-          return nullptr;
+          return;
         }
         if (FD.BitfieldSize) {
           Diag(AtLoc, diag::err_objc_property_bitfield)
               << FD.D.getSourceRange();
-          return nullptr;
+          return;
         }
 
         // Map a nullability property attribute to a context-sensitive keyword
@@ -818,7 +818,6 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
             MethodImplKind);
 
         FD.complete(Property);
-        return Property;
       };
 
       // Parse all the comma separated declarators.
@@ -2014,7 +2013,7 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
       continue;
     }
 
-    auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
+    auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) {
       assert(getObjCDeclContext() == interfaceDecl &&
              "Ivar should have interfaceDecl as its decl context");
       // Install the declarator into the interface decl.
@@ -2025,7 +2024,6 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
       if (Field)
         AllIvarDecls.push_back(Field);
       FD.complete(Field);
-      return Field;
     };
 
     // Parse all the comma separated declarators.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index c8b71631076baf..30776ff537fb50 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8633,82 +8633,31 @@ static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
   return RD;
 }
 
-enum class CountedByInvalidPointeeTypeKind {
-  INCOMPLETE,
-  SIZELESS,
-  FUNCTION,
-  FLEXIBLE_ARRAY_MEMBER,
-  VALID,
-};
-
-static bool CheckCountedByAttrOnField(
-    Sema &S, FieldDecl *FD, Expr *E,
-    llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
-  // Check the context the attribute is used in
-
+static bool
+CheckCountExpr(Sema &S, FieldDecl *FD, Expr *E,
+               llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
   if (FD->getParent()->isUnion()) {
     S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_in_union)
         << FD->getSourceRange();
     return true;
   }
 
-  const auto FieldTy = FD->getType();
-  if (!FieldTy->isArrayType() && !FieldTy->isPointerType()) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_counted_by_attr_not_on_ptr_or_flexible_array_member)
-        << FD->getLocation();
+  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+    S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
+        << E->getSourceRange();
     return true;
   }
 
   LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
       LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-  if (FieldTy->isArrayType() &&
-      !Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FieldTy,
-                                       StrictFlexArraysLevel, true)) {
-    S.Diag(FD->getBeginLoc(),
-           diag::err_counted_by_attr_on_array_not_flexible_array_member)
-        << FD->getLocation();
-    return true;
-  }
 
-  CountedByInvalidPointeeTypeKind InvalidTypeKind =
-      CountedByInvalidPointeeTypeKind::VALID;
-  QualType PointeeTy;
-  int SelectPtrOrArr = 0;
-  if (FieldTy->isPointerType()) {
-    PointeeTy = FieldTy->getPointeeType();
-    SelectPtrOrArr = 0;
-  } else {
-    assert(FieldTy->isArrayType());
-    const ArrayType *AT = S.getASTContext().getAsArrayType(FieldTy);
-    PointeeTy = AT->getElementType();
-    SelectPtrOrArr = 1;
-  }
-  // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
-  // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
-  // when `FieldTy->isArrayType()`.
-  if (PointeeTy->isIncompleteType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
-  } else if (PointeeTy->isSizelessType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
-  } else if (PointeeTy->isFunctionType()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
-  } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
-    InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
-  }
-
-  if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
-    S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_pointee_unknown_size)
-        << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
-        << FD->getSourceRange();
-    return true;
-  }
-
-  // Check the expression
-
-  if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
-    S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
-        << E->getSourceRange();
+  if (!Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FD->getType(),
+                                       StrictFlexArraysLevel, true)) {
+    // The "counted_by" attribute must be on a flexible array member.
+    SourceRange SR = FD->getLocation();
+    S.Diag(SR.getBegin(),
+           diag::err_counted_by_attr_not_on_flexible_array_member)
+        << SR;
     return true;
   }
 
@@ -8771,11 +8720,10 @@ static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
 
   llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
-  if (CheckCountedByAttrOnField(S, FD, CountExpr, Decls))
+  if (CheckCountExpr(S, FD, CountExpr, Decls))
     return;
 
-  QualType CAT =
-      S.BuildCountAttributedArrayOrPointerType(FD->getType(), CountExpr);
+  QualType CAT = S.BuildCountAttributedArrayType(FD->getType(), CountExpr);
   FD->setType(CAT);
 }
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index ef0b6b701a52c5..c19c8cc34dd3bd 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9345,9 +9345,9 @@ BuildTypeCoupledDecls(Expr *E,
   Decls.push_back(TypeCoupledDeclRefInfo(CountDecl, /*IsDref*/ false));
 }
 
-QualType Sema::BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
-                                                      Expr *CountExpr) {
-  assert(WrappedTy->isIncompleteArrayType() || WrappedTy->isPointerType());
+QualType Sema::BuildCountAttributedArrayType(QualType WrappedTy,
+                                             Expr *CountExpr) {
+  assert(WrappedTy->isIncompleteArrayType());
 
   llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
   BuildTypeCoupledDecls(CountExpr, Decls);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 29444f0edc2aed..b10e5ba65eb1c7 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7344,7 +7344,7 @@ QualType TreeTransform<Derived>::TransformCountAttributedType(
   if (getDerived().AlwaysRebuild() || InnerTy != OldTy->desugar() ||
       OldCount != NewCount) {
     // Currently, CountAttributedType can only wrap incomplete array types.
-    Result = SemaRef.BuildCountAttributedArrayOrPointerType(InnerTy, NewCount);
+    Result = SemaRef.BuildCountAttributedArrayType(InnerTy, NewCount);
   }
 
   TLB.push<CountAttributedTypeLoc>(Result);
diff --git a/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
deleted file mode 100644
index a585a45eeff03d..00000000000000
--- a/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_known {
-  int field;
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-struct on_member_pointer_complete_ty {
-  struct size_known *buf __counted_by(count);
-  int count;
-};
-// CHECK-LABEL: struct on_member_pointer_complete_ty definition
-// CHECK-NEXT: |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT: `-FieldDecl {{.*}} referenced count 'int'
-
-struct on_pointer_anon_count {
-  struct size_known *buf __counted_by(count);
-  struct {
-    int count;
-  };
-};
-
-// CHECK-LABEL: struct on_pointer_anon_count definition
-// CHECK-NEXT:  |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:  |-RecordDecl {{.*}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.*}} count 'int'
-// CHECK-NEXT:  |-FieldDecl {{.*}} implicit 'struct on_pointer_anon_count::(anonymous at {{.*}})'
-// CHECK-NEXT:  `-IndirectFieldDecl {{.*}} implicit referenced count 'int'
-// CHECK-NEXT:    |-Field {{.*}} '' 'struct on_pointer_anon_count::(anonymous at {{.*}})'
-// CHECK-NEXT:    `-Field {{.*}} 'count' 'int'
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute and is **not** late parsed resulting in the `count`
-// field being unavailable.
-//
-// See `clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c` for test
-// cases.
diff --git a/clang/test/AST/attr-counted-by-struct-ptrs.c b/clang/test/AST/attr-counted-by-struct-ptrs.c
deleted file mode 100644
index 79a453d239cd52..00000000000000
--- a/clang/test/AST/attr-counted-by-struct-ptrs.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_unknown;
-struct size_known {
-  int field;
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty definition
-// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_member_pointer_complete_ty {
-  int count;
-  struct size_known * buf __counted_by(count);
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  |-RecordDecl {{.+}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:  |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH:.+]])'
-// CHECK-NEXT:  `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:    |-Field {{.+}} '' 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH]])'
-// CHECK-NEXT:    `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_pointer_anon_buf {
-  int count;
-  struct {
-    struct size_known *buf __counted_by(count);
-  };
-};
-
-struct on_pointer_anon_count {
-  struct {
-    int count;
-  };
-  struct size_known *buf __counted_by(count);
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty_ty_pos definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_member_pointer_complete_ty_ty_pos {
-  int count;
-  struct size_known *__counted_by(count) buf;
-};
-
-// TODO: This should be forbidden but isn't due to counted_by being treated as a
-// declaration attribute. The attribute ends up on the outer most pointer
-// (allowed by sema) even though syntactically its supposed to be on the inner
-// pointer (would not allowed by sema due to pointee being a function type).
-// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_fn_ptr_ty_ty_pos_inner definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} fn_ptr 'void (** __counted_by(count))(void)':'void (**)(void)'
-struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
-  int count;
-  void (* __counted_by(count) * fn_ptr)(void);
-};
-
-// FIXME: The generated AST here is wrong. The attribute should be on the inner
-// pointer.
-// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_inner definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
-struct on_nested_pointer_inner {
-  int count;
-  // TODO: This should be disallowed because in the `-fbounds-safety` model
-  // `__counted_by` can only be nested when used in function parameters.
-  struct size_known *__counted_by(count) *buf;
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_outer definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
-struct on_nested_pointer_outer {
-  int count;
-  struct size_known **__counted_by(count) buf;
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf_ty_pos definition
-// CHECK-NEXT:  |-FieldDecl {{.+}} referenced count 'int'
-// CHECK-NEXT:  |-RecordDecl {{.+}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:  |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2:.+]])'
-// CHECK-NEXT:  `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
-// CHECK-NEXT:    |-Field {{.+}} '' 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2]])'
-// CHECK-NEXT:    `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
-struct on_pointer_anon_buf_ty_pos {
-  int count;
-  struct {
-    struct size_known * __counted_by(count) buf;
-  };
-};
-
-// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_count_ty_pos definition
-// CHECK-NEXT:  |-RecordDecl {{.+}} struct definition
-// CHECK-NEXT:  | `-FieldDecl {{.+}} count 'int'
-// CHECK-NEXT:  |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3:.+]])'
-// CHECK-NEXT:  |-IndirectFieldDecl {{.+}} implicit referenced count 'int'
-// CHECK-NEXT:  | |-Field {{.+}} '' 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3]])'
-// CHECK-NEXT:  | `-Field {{.+}} 'count' 'int'
-struct on_pointer_anon_count_ty_pos {
-  struct {
-    int count;
-  };
-  struct size_known *__counted_by(count) buf;
-};
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-off.c b/clang/test/Sema/attr-counted-by-late-parsed-off.c
deleted file mode 100644
index 34f51d10c08389..00000000000000
--- a/clang/test/Sema/attr-counted-by-late-parsed-off.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify %s
-// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fsyntax-only -verify %s
-
-// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify=ok %s
-// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fsyntax-only -verify=ok %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_known { int dummy; };
-
-#ifdef NEEDS_LATE_PARSING
-struct on_decl {
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *buf __counted_by(count);
-  int count;
-};
-
-#else
-
-// ok-no-diagnostics
-struct on_decl {
-  int count;
-  struct size_known *buf __counted_by(count);
-};
-
-#endif
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
deleted file mode 100644
index 9ff3b080f6576b..00000000000000
--- a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
+++ /dev/null
@@ -1,254 +0,0 @@
-// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_unknown;
-struct size_known {
-  int field;
-};
-
-typedef void(*fn_ptr_ty)(void);
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-struct on_member_pointer_complete_ty {
-  struct size_known * buf __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_incomplete_ty {
-  struct size_unknown * buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
-  int count;
-};
-
-struct on_member_pointer_const_incomplete_ty {
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
-  const struct size_unknown * buf __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_void_ty {
-  void* buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty {
-  // buffer of `count` function pointers is allowed
-  void (**fn_ptr)(void) __counted_by(count);
-  int count;
-};
-
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty {
-  // buffer of `count` function pointers is allowed
-  fn_ptr_ty* fn_ptr __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_fn_ty {
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  void (*fn_ptr)(void) __counted_by(count);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ty {
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  fn_ptr_ty fn_ptr __counted_by(count);
-  int count;
-};
-
-struct has_unannotated_vla {
-  int count;
-  int buffer[];
-};
-
-struct on_member_pointer_struct_with_vla {
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
-  struct has_unannotated_vla* objects __counted_by(count);
-  int count;
-};
-
-struct has_annotated_vla {
-  int count;
-  int buffer[] __counted_by(count);
-};
-
-// Currently prevented because computing the size of `objects` at runtime would
-// require an O(N) walk of `objects` to take into account the length of the VLA
-// in each struct instance.
-struct on_member_pointer_struct_with_annotated_vla {
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
-  struct has_annotated_vla* objects __counted_by(count);
-  int count;
-};
-
-struct on_pointer_anon_buf {
-  // TODO: Support referring to parent scope
-  struct {
-    // expected-error@+1{{use of undeclared identifier 'count'}}
-    struct size_known *buf __counted_by(count);
-  };
-  int count;
-};
-
-struct on_pointer_anon_count {
-  struct size_known *buf __counted_by(count);
-  struct {
-    int count;
-  };
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute and is **not** late parsed resulting in the `count`
-// field being unavailable.
-
-struct on_member_pointer_complete_ty_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *__counted_by(count) buf;
-  int count;
-};
-
-struct on_member_pointer_incomplete_ty_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_unknown * __counted_by(count) buf;
-  int count;
-};
-
-struct on_member_pointer_const_incomplete_ty_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  const struct size_unknown * __counted_by(count) buf;
-  int count;
-};
-
-struct on_member_pointer_void_ty_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being an incomplete type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void *__counted_by(count) buf;
-  int count;
-};
-
-// -
-
-struct on_member_pointer_fn_ptr_ty_pos {
-  // TODO: buffer of `count` function pointers should be allowed
-  // but fails because this isn't late parsed.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (** __counted_by(count) fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
-  // TODO: buffer of `count` function pointers should be allowed
-  // but fails because this isn't late parsed.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  fn_ptr_ty* __counted_by(count) fn_ptr;
-  int count;
-};
-
-struct on_member_pointer_fn_ty_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a function type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (* __counted_by(count) fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ty_pos {
-  // TODO: buffer of `count` function pointers should be allowed
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (** __counted_by(count) fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_typedef_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a function type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  fn_ptr_ty __counted_by(count) fn_ptr;
-  int count;
-};
-
-struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a function type.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  void (* __counted_by(count) * fn_ptr)(void);
-  int count;
-};
-
-struct on_member_pointer_struct_with_vla_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a struct type with a VLA.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct has_unannotated_vla *__counted_by(count) objects;
-  int count;
-};
-
-struct on_member_pointer_struct_with_annotated_vla_ty_pos {
-  // TODO: This should fail because the attribute is
-  // on a pointer with the pointee being a struct type with a VLA.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct has_annotated_vla* __counted_by(count) objects;
-  int count;
-};
-
-struct on_nested_pointer_inner {
-  // TODO: This should be disallowed because in the `-fbounds-safety` model
-  // `__counted_by` can only be nested when used in function parameters.
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *__counted_by(count) *buf;
-  int count;
-};
-
-struct on_nested_pointer_outer {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known **__counted_by(count) buf;
-  int count;
-};
-
-struct on_pointer_anon_buf_ty_pos {
-  struct {
-    // TODO: Support referring to parent scope
-    // expected-error@+1{{use of undeclared identifier 'count'}}
-    struct size_known * __counted_by(count) buf;
-  };
-  int count;
-};
-
-struct on_pointer_anon_count_ty_pos {
-  // TODO: Allow this
-  // expected-error@+1{{use of undeclared identifier 'count'}}
-  struct size_known *__counted_by(count) buf;
-  struct {
-    int count;
-  };
-};
-
-//==============================================================================
-// __counted_by on struct non-pointer members
-//==============================================================================
-
-struct on_pod_ty {
-  // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
-  int wrong_ty __counted_by(count);
-  int count;
-};
-
-struct on_void_ty {
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{field has incomplete type 'void'}}
-  void wrong_ty __counted_by(count);
-  int count;
-};
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
deleted file mode 100644
index 9b0f2eafb13c2b..00000000000000
--- a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
+++ /dev/null
@@ -1,17 +0,0 @@
-// __SVInt8_t is specific to ARM64 so specify that in the target triple
-// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct on_sizeless_pointee_ty {
-    int count;
-    // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because '__SVInt8_t' is a sizeless type}}
-    __SVInt8_t* member __counted_by(count);
-};
-
-struct on_sizeless_ty {
-    int count;
-    // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-    // expected-error@+1{{field has sizeless type '__SVInt8_t'}}
-    __SVInt8_t member __counted_by(count);
-};
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c
deleted file mode 100644
index cd2bfe36938b2e..00000000000000
--- a/clang/test/Sema/attr-counted-by-struct-ptrs.c
+++ /dev/null
@@ -1,224 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct size_unknown;
-struct size_known {
-  int field;
-};
-
-typedef void(*fn_ptr_ty)(void);
-
-//==============================================================================
-// __counted_by on struct member pointer in decl attribute position
-//==============================================================================
-
-struct on_member_pointer_complete_ty {
-  int count;
-  struct size_known * buf __counted_by(count);
-};
-
-struct on_member_pointer_incomplete_ty {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
-  struct size_unknown * buf __counted_by(count);
-};
-
-struct on_member_pointer_const_incomplete_ty {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
-  const struct size_unknown * buf __counted_by(count);
-};
-
-struct on_member_pointer_void_ty {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
-  void* buf __counted_by(count);
-};
-
-struct on_member_pointer_fn_ptr_ty {
-  int count;
-  // buffer of `count` function pointers is allowed
-  void (**fn_ptr)(void) __counted_by(count);
-};
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty {
-  int count;
-  // buffer of `count` function pointers is allowed
-  fn_ptr_ty* fn_ptr __counted_by(count);
-};
-
-struct on_member_pointer_fn_ty {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  void (*fn_ptr)(void) __counted_by(count);
-};
-
-struct on_member_pointer_fn_ptr_ty_ty {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  fn_ptr_ty fn_ptr __counted_by(count);
-};
-
-struct has_unannotated_vla {
-  int count;
-  int buffer[];
-};
-
-struct on_member_pointer_struct_with_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
-  struct has_unannotated_vla* objects __counted_by(count);
-};
-
-struct has_annotated_vla {
-  int count;
-  int buffer[] __counted_by(count);
-};
-
-// Currently prevented because computing the size of `objects` at runtime would
-// require an O(N) walk of `objects` to take into account the length of the VLA
-// in each struct instance.
-struct on_member_pointer_struct_with_annotated_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
-  struct has_annotated_vla* objects __counted_by(count);
-};
-
-struct on_pointer_anon_buf {
-  int count;
-  struct {
-    struct size_known *buf __counted_by(count);
-  };
-};
-
-struct on_pointer_anon_count {
-  struct {
-    int count;
-  };
-  struct size_known *buf __counted_by(count);
-};
-
-//==============================================================================
-// __counted_by on struct member pointer in type attribute position
-//==============================================================================
-// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
-// as a declaration attribute
-
-struct on_member_pointer_complete_ty_ty_pos {
-  int count;
-  struct size_known *__counted_by(count) buf;
-};
-
-struct on_member_pointer_incomplete_ty_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
-  struct size_unknown * __counted_by(count) buf;
-};
-
-struct on_member_pointer_const_incomplete_ty_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
-  const struct size_unknown * __counted_by(count) buf;
-};
-
-struct on_member_pointer_void_ty_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
-  void *__counted_by(count) buf;
-};
-
-// -
-
-struct on_member_pointer_fn_ptr_ty_pos {
-  int count;
-  // buffer of `count` function pointers is allowed
-  void (** __counted_by(count) fn_ptr)(void);
-};
-
-struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
-  int count;
-  // buffer of `count` function pointers is allowed
-  fn_ptr_ty* __counted_by(count) fn_ptr;
-};
-
-struct on_member_pointer_fn_ty_ty_pos {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  void (* __counted_by(count) fn_ptr)(void);
-};
-
-struct on_member_pointer_fn_ptr_ty_ty_pos {
-  int count;
-  // buffer of `count` functions is not allowed
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
-  fn_ptr_ty __counted_by(count) fn_ptr;
-};
-
-// TODO: This should be forbidden but isn't due to counted_by being treated
-// as a declaration attribute.
-struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
-  int count;
-  void (* __counted_by(count) * fn_ptr)(void);
-};
-
-struct on_member_pointer_struct_with_vla_ty_pos {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
-  struct has_unannotated_vla *__counted_by(count) objects;
-};
-
-// Currently prevented because computing the size of `objects` at runtime would
-// require an O(N) walk of `objects` to take into account the length of the VLA
-// in each struct instance.
-struct on_member_pointer_struct_with_annotated_vla_ty_pos {
-  int count;
-  // expected-error@+1{{counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
-  struct has_annotated_vla* __counted_by(count) objects;
-};
-
-struct on_nested_pointer_inner {
-  // TODO: This should be disallowed because in the `-fbounds-safety` model
-  // `__counted_by` can only be nested when used in function parameters.
-  int count;
-  struct size_known *__counted_by(count) *buf;
-};
-
-struct on_nested_pointer_outer {
-  int count;
-  struct size_known **__counted_by(count) buf;
-};
-
-struct on_pointer_anon_buf_ty_pos {
-  int count;
-  struct {
-    struct size_known * __counted_by(count) buf;
-  };
-};
-
-struct on_pointer_anon_count_ty_pos {
-  struct {
-    int count;
-  };
-  struct size_known *__counted_by(count) buf;
-};
-
-//==============================================================================
-// __counted_by on struct non-pointer members
-//==============================================================================
-
-struct on_pod_ty {
-  int count;
-  // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
-  int wrong_ty __counted_by(count);
-};
-
-struct on_void_ty {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{field has incomplete type 'void'}}
-  void wrong_ty __counted_by(count);
-};
diff --git a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
deleted file mode 100644
index 31c0007501c48d..00000000000000
--- a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// __SVInt8_t is specific to ARM64 so specify that in the target triple
-// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct on_sizeless_elt_ty {
-    int count;
-    // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-    // expected-error@+1{{array has sizeless element type '__SVInt8_t'}}
-    __SVInt8_t arr[] __counted_by(count);
-};
diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c
deleted file mode 100644
index 3de6bd55e2d8eb..00000000000000
--- a/clang/test/Sema/attr-counted-by-vla.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-#define __counted_by(f)  __attribute__((counted_by(f)))
-
-struct bar;
-
-struct not_found {
-  int count;
-  struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
-};
-
-struct no_found_count_not_in_substruct {
-  unsigned long flags;
-  unsigned char count; // expected-note {{'count' declared here}}
-  struct A {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
-  } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct {
-  unsigned char count; // expected-note {{'count' declared here}}
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
-  } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct_2 {
-  struct {
-    unsigned char count; // expected-note {{'count' declared here}}
-  };
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
-  } a;
-};
-
-struct not_found_count_in_other_unnamed_substruct {
-  struct {
-    unsigned char count;
-  } a1;
-
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-  };
-};
-
-struct not_found_count_in_other_substruct {
-  struct _a1 {
-    unsigned char count;
-  } a1;
-
-  struct {
-    int dummy;
-    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-  };
-};
-
-struct not_found_count_in_other_substruct_2 {
-  struct _a2 {
-    unsigned char count;
-  } a2;
-
-  int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-};
-
-struct not_found_suggest {
-  int bork;
-  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
-};
-
-int global; // expected-note {{'global' declared here}}
-
-struct found_outside_of_struct {
-  int bork;
-  struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
-};
-
-struct self_referrential {
-  int bork;
-  struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
-};
-
-struct non_int_count {
-  double dbl_count;
-  struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct array_of_ints_count {
-  int integers[2];
-  struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct not_a_fam {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct bar' is an incomplete type}}
-  struct bar *non_fam __counted_by(count);
-};
-
-struct not_a_c99_fam {
-  int count;
-  struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' on arrays only applies to C99 flexible array members}}
-};
-
-struct annotated_with_anon_struct {
-  unsigned long flags;
-  struct {
-    unsigned char count;
-    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
-  };
-};
-
-//==============================================================================
-// __counted_by on a struct VLA with element type that has unknown size
-//==============================================================================
-
-struct size_unknown; // expected-note 2{{forward declaration of 'struct size_unknown'}}
-struct on_member_arr_incomplete_ty_ty_pos {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{array has incomplete element type 'struct size_unknown'}}
-  struct size_unknown buf[] __counted_by(count);
-};
-
-struct on_member_arr_incomplete_const_ty_ty_pos {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{array has incomplete element type 'const struct size_unknown'}}
-  const struct size_unknown buf[] __counted_by(count);
-};
-
-struct on_member_arr_void_ty_ty_pos {
-  int count;
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{array has incomplete element type 'void'}}
-  void buf[] __counted_by(count);
-};
-
-typedef void(fn_ty)(int);
-
-struct on_member_arr_fn_ptr_ty {
-  int count;
-  // An Array of function pointers is allowed
-  fn_ty* buf[] __counted_by(count);
-};
-
-struct on_member_arr_fn_ty {
-  int count;
-  // An array of functions is not allowed.
-  // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
-  // expected-error@+1{{'buf' declared as array of functions of type 'fn_ty' (aka 'void (int)')}}
-  fn_ty buf[] __counted_by(count);
-};
-
-
-// `buffer_of_structs_with_unnannotated_vla`,
-// `buffer_of_structs_with_annotated_vla`, and
-// `buffer_of_const_structs_with_annotated_vla` are currently prevented because
-// computing the size of `Arr` at runtime would require an O(N) walk of `Arr`
-// elements to take into account the length of the VLA in each struct instance.
-
-struct has_unannotated_VLA {
-  int count;
-  char buffer[];
-};
-
-struct has_annotated_VLA {
-  int count;
-  char buffer[] __counted_by(count);
-};
-
-struct buffer_of_structs_with_unnannotated_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member}}
-  struct has_unannotated_VLA Arr[] __counted_by(count);
-};
-
-
-struct buffer_of_structs_with_annotated_vla {
-  int count;
-  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member}}
-  struct has_annotated_VLA Arr[] __counted_by(count);
-};
-
-struct buffer_of_const_structs_with_annotated_vla {
-  int count;
-  // Make sure the `const` qualifier is printed when printing the element type.
-  // expected-error@+1{{'counted_by' cannot be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member}}
-  const struct has_annotated_VLA Arr[] __counted_by(count);
-};
-
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
new file mode 100644
index 00000000000000..d5d4ebf5573922
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by.c
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define __counted_by(f)  __attribute__((counted_by(f)))
+
+struct bar;
+
+struct not_found {
+  int count;
+  struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
+};
+
+struct no_found_count_not_in_substruct {
+  unsigned long flags;
+  unsigned char count; // expected-note {{'count' declared here}}
+  struct A {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct {
+  unsigned char count; // expected-note {{'count' declared here}}
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct_2 {
+  struct {
+    unsigned char count; // expected-note {{'count' declared here}}
+  };
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+  } a;
+};
+
+struct not_found_count_in_other_unnamed_substruct {
+  struct {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct {
+  struct _a1 {
+    unsigned char count;
+  } a1;
+
+  struct {
+    int dummy;
+    int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+  };
+};
+
+struct not_found_count_in_other_substruct_2 {
+  struct _a2 {
+    unsigned char count;
+  } a2;
+
+  int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+};
+
+struct not_found_suggest {
+  int bork;
+  struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
+};
+
+int global; // expected-note {{'global' declared here}}
+
+struct found_outside_of_struct {
+  int bork;
+  struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
+};
+
+struct self_referrential {
+  int bork;
+  struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
+};
+
+struct non_int_count {
+  double dbl_count;
+  struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct array_of_ints_count {
+  int integers[2];
+  struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct not_a_fam {
+  int count;
+  struct bar *non_fam __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
+};
+
+struct not_a_c99_fam {
+  int count;
+  struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
+};
+
+struct annotated_with_anon_struct {
+  unsigned long flags;
+  struct {
+    unsigned char count;
+    int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
+  };
+};

From c587483da0b50efa04146fde205da1d16731e12e Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Sun, 19 May 2024 06:21:40 -0700
Subject: [PATCH 03/60] Revert "[Bounds-Safety] Fix
 `pragma-attribute-supported-attributes-list.test`"

Issue #92687

This reverts commit 112eadd55f06bee15caadff688ea0b45acbfa804.
---
 clang/test/Misc/pragma-attribute-supported-attributes-list.test | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 99732694f72a5f..fd0e6d71baa803 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -63,6 +63,7 @@
 // CHECK-NEXT: CoroOnlyDestroyWhenComplete (SubjectMatchRule_record)
 // CHECK-NEXT: CoroReturnType (SubjectMatchRule_record)
 // CHECK-NEXT: CoroWrapper (SubjectMatchRule_function)
+// CHECK-NEXT: CountedBy (SubjectMatchRule_field)
 // CHECK-NEXT: DLLExport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
 // CHECK-NEXT: DLLImport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
 // CHECK-NEXT: Destructor (SubjectMatchRule_function)

From 10edb4991c12738e60843d55cd9edbf6d702d9eb Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Sun, 19 May 2024 16:59:03 +0300
Subject: [PATCH 04/60] [Clang][CodeGen] Start migrating away from assuming the
 Default AS is 0 (#88182)

At the moment, Clang is rather liberal in assuming that 0 (and by extension unqualified) is always a safe default. This does not work for targets that actually use a different value for the default / generic AS (for example, the SPIRV that obtains from HIPSPV or SYCL). This patch is a first, fairly safe step towards trying to clear things up by querying a modules' default AS from the target, rather than assuming it's 0, alongside fixing a few places where things break / we encode the 0 == DefaultAS assumption. A bunch of existing tests are extended to check for non-zero default AS usage.
---
 clang/lib/CodeGen/CGException.cpp             |   5 +-
 clang/lib/CodeGen/CGExprCXX.cpp               |   7 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |   3 +-
 clang/lib/CodeGen/CodeGenTypeCache.h          |   2 +-
 .../CodeGenCXX/dynamic-cast-address-space.cpp | 123 ++++++++++++++++--
 clang/test/CodeGenCXX/eh.cpp                  |   6 +-
 clang/test/CodeGenCXX/nrvo.cpp                |   4 +-
 .../template-param-objects-address-space.cpp  |  10 ++
 ...w-expression-typeinfo-in-address-space.cpp |   2 +
 .../try-catch-with-address-space.cpp          |   7 +-
 .../typeid-cxx11-with-address-space.cpp       |   4 +
 .../CodeGenCXX/typeid-with-address-space.cpp  |  11 ++
 .../typeinfo-with-address-space.cpp           |   7 +
 .../vtable-assume-load-address-space.cpp      | 110 ++++++++++------
 ...e-pointer-initialization-address-space.cpp |   7 +
 clang/test/CodeGenCXX/vtt-address-space.cpp   |   7 +
 clang/test/CodeGenCXX/wasm-eh.cpp             |   4 +-
 llvm/examples/ExceptionDemo/ExceptionDemo.cpp |   2 +-
 llvm/include/llvm/IR/Intrinsics.td            |   4 +-
 .../WebAssembly/lower-em-exceptions.ll        |   6 +-
 .../GVNHoist/infinite-loop-indirect.ll        |   6 +-
 llvm/test/Transforms/Inline/inline_invoke.ll  |  10 +-
 .../Transforms/LICM/scalar-promote-unwind.ll  |   6 +-
 .../LowerTypeTests/cfi-unwind-direct-call.ll  |   6 +-
 .../Transforms/NewGVN/2011-09-07-TypeIdFor.ll |  14 +-
 .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td   |   2 +-
 mlir/test/Target/LLVMIR/Import/intrinsic.ll   |   4 +-
 .../test/Target/LLVMIR/llvmir-intrinsics.mlir |   2 +-
 28 files changed, 283 insertions(+), 98 deletions(-)

diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp
index 34f289334a7df9..8acda3f2eb864a 100644
--- a/clang/lib/CodeGen/CGException.cpp
+++ b/clang/lib/CodeGen/CGException.cpp
@@ -1052,7 +1052,8 @@ static void emitWasmCatchPadBlock(CodeGenFunction &CGF,
   CGF.Builder.CreateStore(Exn, CGF.getExceptionSlot());
   llvm::CallInst *Selector = CGF.Builder.CreateCall(GetSelectorFn, CPI);
 
-  llvm::Function *TypeIDFn = CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for);
+  llvm::Function *TypeIDFn =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for, {CGF.VoidPtrTy});
 
   // If there's only a single catch-all, branch directly to its handler.
   if (CatchScope.getNumHandlers() == 1 &&
@@ -1137,7 +1138,7 @@ static void emitCatchDispatchBlock(CodeGenFunction &CGF,
 
   // Select the right handler.
   llvm::Function *llvm_eh_typeid_for =
-    CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for);
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::eh_typeid_for, {CGF.VoidPtrTy});
   llvm::Type *argTy = llvm_eh_typeid_for->getArg(0)->getType();
   LangAS globAS = CGF.CGM.GetGlobalVarAddressSpace(nullptr);
 
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index c18c36d3f3f326..0cfdb7effe4703 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -2216,7 +2216,12 @@ static llvm::Value *EmitTypeidFromVTable(CodeGenFunction &CGF, const Expr *E,
 }
 
 llvm::Value *CodeGenFunction::EmitCXXTypeidExpr(const CXXTypeidExpr *E) {
-  llvm::Type *PtrTy = llvm::PointerType::getUnqual(getLLVMContext());
+  // Ideally, we would like to use GlobalsInt8PtrTy here, however, we cannot,
+  // primarily because the result of applying typeid is a value of type
+  // type_info, which is declared & defined by the standard library
+  // implementation and expects to operate on the generic (default) AS.
+  // https://reviews.llvm.org/D157452 has more context, and a possible solution.
+  llvm::Type *PtrTy = Int8PtrTy;
   LangAS GlobAS = CGM.GetGlobalVarAddressSpace(nullptr);
 
   auto MaybeASCast = [=](auto &&TypeInfo) {
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 489c08a4d4819b..227813ad44e8b8 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -368,7 +368,8 @@ CodeGenModule::CodeGenModule(ASTContext &C,
   IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth());
   IntPtrTy = llvm::IntegerType::get(LLVMContext,
     C.getTargetInfo().getMaxPointerWidth());
-  Int8PtrTy = llvm::PointerType::get(LLVMContext, 0);
+  Int8PtrTy = llvm::PointerType::get(LLVMContext,
+                                     C.getTargetAddressSpace(LangAS::Default));
   const llvm::DataLayout &DL = M.getDataLayout();
   AllocaInt8PtrTy =
       llvm::PointerType::get(LLVMContext, DL.getAllocaAddrSpace());
diff --git a/clang/lib/CodeGen/CodeGenTypeCache.h b/clang/lib/CodeGen/CodeGenTypeCache.h
index 083d69214fb3c2..e273ebe3b060f2 100644
--- a/clang/lib/CodeGen/CodeGenTypeCache.h
+++ b/clang/lib/CodeGen/CodeGenTypeCache.h
@@ -51,7 +51,7 @@ struct CodeGenTypeCache {
     llvm::IntegerType *PtrDiffTy;
   };
 
-  /// void*, void** in address space 0
+  /// void*, void** in the target's default address space (often 0)
   union {
     llvm::PointerType *UnqualPtrTy;
     llvm::PointerType *VoidPtrTy;
diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index 83a408984b7605..3d5e32516c7af2 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -1,24 +1,127 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --no-generate-body-for-unused-prefixes --version 4
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
+
 struct A { virtual void f(); };
 struct B : A { };
 
-// CHECK: {{define.*@_Z1fP1A}}
-// CHECK-SAME:  personality ptr @__gxx_personality_v0
 B fail;
+//.
+// CHECK: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8
+// CHECK: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8
+// CHECK: @_ZTI1A = external addrspace(1) constant ptr addrspace(1)
+// CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)]
+// CHECK: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1
+// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8
+// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
+//.
+// WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8
+// WITH-NONZERO-DEFAULT-AS: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZTI1A = external addrspace(1) constant ptr addrspace(1)
+// WITH-NONZERO-DEFAULT-AS: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)]
+// WITH-NONZERO-DEFAULT-AS: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1
+// WITH-NONZERO-DEFAULT-AS: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8
+//.
+// CHECK-LABEL: define dso_local noundef nonnull align 8 dereferenceable(8) ptr @_Z1fP1A(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[EXN_SLOT:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call ptr @__dynamic_cast(ptr [[TMP0]], ptr addrspace(1) @_ZTI1A, ptr addrspace(1) @_ZTI1B, i64 0) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null
+// CHECK-NEXT:    br i1 [[TMP2]], label [[DYNAMIC_CAST_BAD_CAST:%.*]], label [[DYNAMIC_CAST_END:%.*]]
+// CHECK:       dynamic_cast.bad_cast:
+// CHECK-NEXT:    invoke void @__cxa_bad_cast() #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+// CHECK:       invoke.cont:
+// CHECK-NEXT:    unreachable
+// CHECK:       dynamic_cast.end:
+// CHECK-NEXT:    br label [[TRY_CONT:%.*]]
+// CHECK:       lpad:
+// CHECK-NEXT:    [[TMP3:%.*]] = landingpad { ptr, i32 }
+// CHECK-NEXT:            catch ptr null
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0
+// CHECK-NEXT:    store ptr [[TMP4]], ptr addrspace(5) [[EXN_SLOT]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 1
+// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(5) [[EHSELECTOR_SLOT]], align 4
+// CHECK-NEXT:    br label [[CATCH:%.*]]
+// CHECK:       catch:
+// CHECK-NEXT:    [[EXN:%.*]] = load ptr, ptr addrspace(5) [[EXN_SLOT]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = call ptr @__cxa_begin_catch(ptr [[EXN]]) #[[ATTR3]]
+// CHECK-NEXT:    call void @__cxa_end_catch()
+// CHECK-NEXT:    br label [[TRY_CONT]]
+// CHECK:       try.cont:
+// CHECK-NEXT:    ret ptr addrspacecast (ptr addrspace(1) @fail to ptr)
+//
+// WITH-NONZERO-DEFAULT-AS-LABEL: define spir_func noundef align 8 dereferenceable(8) ptr addrspace(4) @_Z1fP1A(
+// WITH-NONZERO-DEFAULT-AS-SAME: ptr addrspace(4) noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 {
+// WITH-NONZERO-DEFAULT-AS-NEXT:  entry:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[RETVAL:%.*]] = alloca ptr addrspace(4), align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[A_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[EXN_SLOT:%.*]] = alloca ptr addrspace(4), align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[EHSELECTOR_SLOT:%.*]] = alloca i32, align 4
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// WITH-NONZERO-DEFAULT-AS-NEXT:    store ptr addrspace(4) [[A]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP1:%.*]] = call spir_func ptr addrspace(4) @__dynamic_cast(ptr addrspace(4) [[TMP0]], ptr addrspace(1) @_ZTI1A, ptr addrspace(1) @_ZTI1B, i64 0) #[[ATTR3:[0-9]+]]
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP2:%.*]] = icmp eq ptr addrspace(4) [[TMP1]], null
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br i1 [[TMP2]], label [[DYNAMIC_CAST_BAD_CAST:%.*]], label [[DYNAMIC_CAST_END:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       dynamic_cast.bad_cast:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    invoke spir_func void @__cxa_bad_cast() #[[ATTR4:[0-9]+]]
+// WITH-NONZERO-DEFAULT-AS-NEXT:            to label [[INVOKE_CONT:%.*]] unwind label [[LPAD:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       invoke.cont:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    unreachable
+// WITH-NONZERO-DEFAULT-AS:       dynamic_cast.end:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br label [[TRY_CONT:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       lpad:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP3:%.*]] = landingpad { ptr addrspace(4), i32 }
+// WITH-NONZERO-DEFAULT-AS-NEXT:            catch ptr addrspace(4) null
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP4:%.*]] = extractvalue { ptr addrspace(4), i32 } [[TMP3]], 0
+// WITH-NONZERO-DEFAULT-AS-NEXT:    store ptr addrspace(4) [[TMP4]], ptr [[EXN_SLOT]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP5:%.*]] = extractvalue { ptr addrspace(4), i32 } [[TMP3]], 1
+// WITH-NONZERO-DEFAULT-AS-NEXT:    store i32 [[TMP5]], ptr [[EHSELECTOR_SLOT]], align 4
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br label [[CATCH:%.*]]
+// WITH-NONZERO-DEFAULT-AS:       catch:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[EXN:%.*]] = load ptr addrspace(4), ptr [[EXN_SLOT]], align 8
+// WITH-NONZERO-DEFAULT-AS-NEXT:    [[TMP6:%.*]] = call spir_func ptr addrspace(4) @__cxa_begin_catch(ptr addrspace(4) [[EXN]]) #[[ATTR3]]
+// WITH-NONZERO-DEFAULT-AS-NEXT:    call spir_func void @__cxa_end_catch()
+// WITH-NONZERO-DEFAULT-AS-NEXT:    br label [[TRY_CONT]]
+// WITH-NONZERO-DEFAULT-AS:       try.cont:
+// WITH-NONZERO-DEFAULT-AS-NEXT:    ret ptr addrspace(4) addrspacecast (ptr addrspace(1) @fail to ptr addrspace(4))
+//
 const B& f(A *a) {
   try {
-    // CHECK: call ptr @__dynamic_cast
-    // CHECK: br i1
-    // CHECK: invoke void @__cxa_bad_cast() [[NR:#[0-9]+]]
     dynamic_cast<const B&>(*a);
   } catch (...) {
-    // CHECK:      landingpad { ptr, i32 }
-    // CHECK-NEXT:   catch ptr null
   }
   return fail;
 }
 
-// CHECK: declare ptr @__dynamic_cast(ptr, ptr addrspace(1), ptr addrspace(1), i64) [[NUW_RO:#[0-9]+]]
 
-// CHECK: attributes [[NUW_RO]] = { nounwind willreturn memory(read) }
-// CHECK: attributes [[NR]] = { noreturn }
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR3]] = { nounwind }
+// CHECK: attributes #[[ATTR4]] = { noreturn }
+//.
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
+// WITH-NONZERO-DEFAULT-AS: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// WITH-NONZERO-DEFAULT-AS: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
diff --git a/clang/test/CodeGenCXX/eh.cpp b/clang/test/CodeGenCXX/eh.cpp
index 5c592a96e27b73..f174b5d84fdf50 100644
--- a/clang/test/CodeGenCXX/eh.cpp
+++ b/clang/test/CodeGenCXX/eh.cpp
@@ -81,7 +81,7 @@ namespace test5 {
 // CHECK:      invoke void @__cxa_throw(ptr [[EXNOBJ]], ptr @_ZTIN5test51AE, ptr @_ZN5test51AD1Ev) [[NR]]
 // CHECK-NEXT:   to label {{%.*}} unwind label %[[HANDLER:[^ ]*]]
 //      :    [[HANDLER]]:  (can't check this in Release-Asserts builds)
-// CHECK:      {{%.*}} = call i32 @llvm.eh.typeid.for(ptr @_ZTIN5test51AE)
+// CHECK:      {{%.*}} = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIN5test51AE)
 }
 
 namespace test6 {
@@ -96,7 +96,7 @@ namespace test6 {
 
 // PR7127
 namespace test7 {
-// CHECK-LABEL:      define{{.*}} i32 @_ZN5test73fooEv() 
+// CHECK-LABEL:      define{{.*}} i32 @_ZN5test73fooEv()
 // CHECK-SAME:  personality ptr @__gxx_personality_v0
   int foo() {
 // CHECK:      [[CAUGHTEXNVAR:%.*]] = alloca ptr
@@ -119,7 +119,7 @@ namespace test7 {
 // CHECK-NEXT: store i32 [[SELECTOR]], ptr [[SELECTORVAR]]
 // CHECK-NEXT: br label
 // CHECK:      [[SELECTOR:%.*]] = load i32, ptr [[SELECTORVAR]]
-// CHECK-NEXT: [[T0:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+// CHECK-NEXT: [[T0:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
 // CHECK-NEXT: icmp eq i32 [[SELECTOR]], [[T0]]
 // CHECK-NEXT: br i1
 // CHECK:      [[T0:%.*]] = load ptr, ptr [[CAUGHTEXNVAR]]
diff --git a/clang/test/CodeGenCXX/nrvo.cpp b/clang/test/CodeGenCXX/nrvo.cpp
index 33dc4cf9dbc8d2..23ac04511514da 100644
--- a/clang/test/CodeGenCXX/nrvo.cpp
+++ b/clang/test/CodeGenCXX/nrvo.cpp
@@ -628,7 +628,7 @@ void may_throw();
 // CHECK-EH-03-NEXT:    br label [[CATCH_DISPATCH:%.*]]
 // CHECK-EH-03:       catch.dispatch:
 // CHECK-EH-03-NEXT:    [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK-EH-03-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTI1X) #[[ATTR7]]
+// CHECK-EH-03-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI1X) #[[ATTR7]]
 // CHECK-EH-03-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[SEL]], [[TMP3]]
 // CHECK-EH-03-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 // CHECK-EH-03:       catch:
@@ -707,7 +707,7 @@ void may_throw();
 // CHECK-EH-11-NEXT:    br label [[CATCH_DISPATCH:%.*]]
 // CHECK-EH-11:       catch.dispatch:
 // CHECK-EH-11-NEXT:    [[SEL:%.*]] = load i32, ptr [[EHSELECTOR_SLOT]], align 4
-// CHECK-EH-11-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTI1X) #[[ATTR6]]
+// CHECK-EH-11-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTI1X) #[[ATTR6]]
 // CHECK-EH-11-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[SEL]], [[TMP3]]
 // CHECK-EH-11-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 // CHECK-EH-11:       catch:
diff --git a/clang/test/CodeGenCXX/template-param-objects-address-space.cpp b/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
index b54dcfe77934ee..b3733decdb550f 100644
--- a/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
+++ b/clang/test/CodeGenCXX/template-param-objects-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -std=c++20 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-unknown-unknown -fsycl-is-device -std=c++20 %s -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct S { char buf[32]; };
 template<S s> constexpr const char *begin() { return s.buf; }
@@ -8,25 +9,34 @@ extern const void *callee(const S*);
 template<S s> constexpr const void* observable_addr() { return callee(&s); }
 
 // CHECK: [[HELLO:@_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE]]
+// WITH-NONZERO-DEFAULT-AS: [[HELLO:@_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE]]
 // CHECK-SAME: = linkonce_odr addrspace(1) constant { <{ [11 x i8], [21 x i8] }> } { <{ [11 x i8], [21 x i8] }> <{ [11 x i8] c"hello world", [21 x i8] zeroinitializer }> }, comdat
 
 // CHECK: @p
 // CHECK-SAME: addrspace(1) global ptr addrspacecast (ptr addrspace(1) [[HELLO]] to ptr)
+// WITH-NONZERO-DEFAULT-AS: addrspace(1) global ptr addrspace(4) addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4))
 const char *p = begin<S{"hello world"}>();
 
 // CHECK: @q
 // CHECK-SAME: addrspace(1) global ptr addrspacecast (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) [[HELLO]], i64 11) to ptr)
+// WITH-NONZERO-DEFAULT-AS: addrspace(1) global ptr addrspace(4) addrspacecast (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) [[HELLO]], i64 11) to ptr addrspace(4))
 const char *q = end<S{"hello world"}>();
 
 const void *(*r)() = &retval<S{"hello world"}>;
 
 // CHECK: @s
 // CHECK-SAME: addrspace(1) global ptr null
+// WITH-NONZERO-DEFAULT-AS: addrspace(1) global ptr addrspace(4) null
 const void *s = observable_addr<S{"hello world"}>();
 
 // CHECK: define linkonce_odr noundef ptr @_Z6retvalIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} noundef ptr addrspace(4) @_Z6retvalIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
 // CHECK: ret ptr addrspacecast (ptr addrspace(1) [[HELLO]] to ptr)
+// WITH-NONZERO-DEFAULT-AS: ret ptr addrspace(4) addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4))
 
 // CHECK: define linkonce_odr noundef ptr @_Z15observable_addrIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} noundef ptr addrspace(4) @_Z15observable_addrIXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEEEPKvv()
 // CHECK: %call = call noundef ptr @_Z6calleePK1S(ptr noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr))
+// WITH-NONZERO-DEFAULT-AS: %call = call {{.*}} noundef ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef addrspacecast (ptr addrspace(1) [[HELLO]] to ptr addrspace(4)))
 // CHECK: declare noundef ptr @_Z6calleePK1S(ptr noundef)
+// WITH-NONZERO-DEFAULT-AS: declare {{.*}} noundef ptr addrspace(4) @_Z6calleePK1S(ptr addrspace(4) noundef)
diff --git a/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp b/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
index d8c23d427e67a3..3acbdd8fd97ee0 100644
--- a/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
+++ b/clang/test/CodeGenCXX/throw-expression-typeinfo-in-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct X {
   ~X();
@@ -15,3 +16,4 @@ void f() {
 }
 
 // CHECK: declare void @__cxa_throw(ptr, ptr addrspace(1), ptr)
+// WITH-NONZERO-DEFAULT-AS: declare{{.*}} void @__cxa_throw(ptr addrspace(4), ptr addrspace(1), ptr addrspace(4))
diff --git a/clang/test/CodeGenCXX/try-catch-with-address-space.cpp b/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
index 279d29f50fd410..412ac6c2872587 100644
--- a/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/try-catch-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -emit-llvm -o - -fcxx-exceptions -fexceptions | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct X { };
 
@@ -10,7 +11,8 @@ void f() {
     // CHECK: ptr addrspace(1) @_ZTI1X
   } catch (const X x) {
     // CHECK: catch ptr addrspace(1) @_ZTI1X
-    // CHECK: call i32 @llvm.eh.typeid.for(ptr addrspacecast (ptr addrspace(1) @_ZTI1X to ptr))
+    // CHECK: call i32 @llvm.eh.typeid.for.p0(ptr addrspacecast (ptr addrspace(1) @_ZTI1X to ptr))
+    // WITH-NONZERO-DEFAULT-AS: call i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTI1X to ptr addrspace(4)))
   }
 }
 
@@ -20,6 +22,7 @@ void h() {
     // CHECK: ptr addrspace(1) @_ZTIPKc
   } catch (char const(&)[4]) {
     // CHECK: catch ptr addrspace(1) @_ZTIA4_c
-    // CHECK: call i32 @llvm.eh.typeid.for(ptr addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr))
+    // CHECK: call i32 @llvm.eh.typeid.for.p0(ptr addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr))
+    // WITH-NONZERO-DEFAULT-AS: call i32 @llvm.eh.typeid.for.p4(ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIA4_c to ptr addrspace(4)))
   }
 }
diff --git a/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp b/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
index c4e7d36acff130..f6dc38ec9f2925 100644
--- a/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeid-cxx11-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 #include <typeinfo>
 
 namespace Test1 {
@@ -19,14 +20,17 @@ struct B : virtual A {};
 struct C { int n; };
 
 // CHECK: @_ZN5Test15itemsE ={{.*}} constant [4 x {{.*}}] [{{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr), {{.*}} @_ZN5Test19make_implINS_1AEEEPvv {{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr), {{.*}} @_ZN5Test19make_implINS_1BEEEPvv {{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11CE to ptr), {{.*}} @_ZN5Test19make_implINS_1CEEEPvv {{.*}} ptr addrspacecast (ptr addrspace(1) @_ZTIi to ptr), {{.*}} @_ZN5Test19make_implIiEEPvv }]
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test15itemsE ={{.*}} addrspace(1) constant [4 x {{.*}}] [{{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implINS_1AEEEPvv {{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implINS_1BEEEPvv {{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11CE to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implINS_1CEEEPvv {{.*}} ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), {{.*}} @_ZN5Test19make_implIiEEPvv }]
 extern constexpr Item items[] = {
   item<A>("A"), item<B>("B"), item<C>("C"), item<int>("int")
 };
 
 // CHECK: @_ZN5Test11xE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test11xE ={{.*}} addrspace(1) constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr addrspace(4)), align 8
 constexpr auto &x = items[0].ti;
 
 // CHECK: @_ZN5Test11yE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test11yE ={{.*}} addrspace(1) constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11BE to ptr addrspace(4)), align 8
 constexpr auto &y = typeid(B{});
 
 }
diff --git a/clang/test/CodeGenCXX/typeid-with-address-space.cpp b/clang/test/CodeGenCXX/typeid-with-address-space.cpp
index b439770a8b631e..98af17f4fc8888 100644
--- a/clang/test/CodeGenCXX/typeid-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeid-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -fcxx-exceptions -fexceptions -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 #include <typeinfo>
 
 namespace Test1 {
@@ -7,19 +8,23 @@ namespace Test1 {
 struct A { virtual void f(); };
 
 // CHECK: @_ZN5Test16int_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIi to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test16int_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), align 8
 const std::type_info &int_ti = typeid(int);
 
 // CHECK: @_ZN5Test14A_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test14A_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIN5Test11AE to ptr addrspace(4)), align 8
 const std::type_info &A_ti = typeid(const volatile A &);
 
 volatile char c;
 
 // CHECK: @_ZN5Test14c_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIc to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test14c_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIc to ptr addrspace(4)), align 8
 const std::type_info &c_ti = typeid(c);
 
 extern const double &d;
 
 // CHECK: @_ZN5Test14d_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTId to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test14d_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTId to ptr addrspace(4)), align 8
 const std::type_info &d_ti = typeid(d);
 
 extern A &a;
@@ -28,18 +33,24 @@ extern A &a;
 const std::type_info &a_ti = typeid(a);
 
 // CHECK: @_ZN5Test18A10_c_tiE ={{.*}} constant ptr addrspacecast (ptr addrspace(1) @_ZTIA10_c to ptr), align 8
+// WITH-NONZERO-DEFAULT-AS: @_ZN5Test18A10_c_tiE ={{.*}} constant ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZTIA10_c to ptr addrspace(4)), align 8
 const std::type_info &A10_c_ti = typeid(char const[10]);
 
 // CHECK-LABEL: define{{.*}} ptr @_ZN5Test11fEv
 // CHECK-SAME:  personality ptr @__gxx_personality_v0
+// WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} ptr addrspace(4) @_ZN5Test11fEv
+// WITH-NONZERO-DEFAULT-AS-SAME:  personality ptr @__gxx_personality_v0
 const char *f() {
   try {
     // CHECK: br i1
     // CHECK: invoke void @__cxa_bad_typeid() [[NR:#[0-9]+]]
+    // WITH-NONZERO-DEFAULT-AS: invoke{{.*}} void @__cxa_bad_typeid() [[NR:#[0-9]+]]
     return typeid(*static_cast<A *>(0)).name();
   } catch (...) {
     // CHECK:      landingpad { ptr, i32 }
     // CHECK-NEXT:   catch ptr null
+    // WITH-NONZERO-DEFAULT-AS:      landingpad { ptr addrspace(4), i32 }
+    // WITH-NONZERO-DEFAULT-AS-NEXT:   catch ptr addrspace(4) null
   }
 
   return 0;
diff --git a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
index 80f6ab0903e51b..350303cc6e9b34 100644
--- a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
+++ b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -I%S %s -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck %s -check-prefix=AS
+// RUN: %clang_cc1 -I%S %s -triple spirv64-unknown-unknown -fsycl-is-device -emit-llvm -o - | FileCheck %s -check-prefix=NONZERO-DEFAULT-AS
 // RUN: %clang_cc1 -I%S %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s -check-prefix=NO-AS
 #include <typeinfo>
 
@@ -25,24 +26,30 @@ class B : A {
 
 unsigned long Fn(B& b) {
 // AS: %call = call noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr), ptr {{.*}} %2)
+// NONZERO-DEFAULT-AS: %call = call{{.*}} noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTISt9type_info to ptr addrspace(4)), ptr addrspace(4) {{.*}} %2)
 // NO-AS: %call = call noundef zeroext i1 @_ZNKSt9type_infoeqERKS_(ptr {{.*}} @_ZTISt9type_info, ptr {{.*}} %2)
     if (typeid(std::type_info) == typeid(b))
         return 42;
 // AS: %call2 = call noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr), ptr {{.*}} %5)
+// NONZERO-DEFAULT-AS: %call2 = call{{.*}} noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIi to ptr addrspace(4)), ptr addrspace(4) {{.*}} %5)
 // NO-AS: %call2 = call noundef zeroext i1 @_ZNKSt9type_infoneERKS_(ptr {{.*}} @_ZTIi, ptr {{.*}} %5)
     if (typeid(int) != typeid(b))
         return 1712;
 // AS: %call5 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr))
+// NONZERO-DEFAULT-AS: %call5 = call{{.*}} noundef ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTI1A to ptr addrspace(4)))
 // NO-AS: %call5 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} @_ZTI1A)
 // AS: %call7 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} %8)
+// NONZERO-DEFAULT-AS: %call7 = call{{.*}} noundef ptr addrspace(4) @_ZNKSt9type_info4nameEv(ptr addrspace(4) {{.*}} %8)
 // NO-AS: %call7 = call noundef ptr @_ZNKSt9type_info4nameEv(ptr {{.*}} %8)
     if (typeid(A).name() == typeid(b).name())
         return 0;
 // AS: %call11 = call noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr {{.*}} %11, ptr {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr))
+// NONZERO-DEFAULT-AS: %call11 = call{{.*}} noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr addrspace(4) {{.*}} %11, ptr addrspace(4) {{.*}} addrspacecast (ptr addrspace(1) @_ZTIf to ptr addrspace(4)))
 // NO-AS:   %call11 = call noundef zeroext i1 @_ZNKSt9type_info6beforeERKS_(ptr {{.*}} %11, ptr {{.*}} @_ZTIf)
     if (typeid(b).before(typeid(float)))
         return 1;
 // AS: %call15 = call noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr {{.*}} %14)
+// NONZERO-DEFAULT-AS: %call15 = call{{.*}} noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr addrspace(4) {{.*}} %14)
 // NO-AS: %call15 = call noundef i64 @_ZNKSt9type_info9hash_codeEv(ptr {{.*}} %14)
     return typeid(b).hash_code();
 }
diff --git a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
index d765fe94d9b084..ecafa99d8be00f 100644
--- a/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-assume-load-address-space.cpp
@@ -1,14 +1,17 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -emit-llvm -o %t.ms.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o %t.ll -O1 -disable-llvm-passes -fms-extensions -fstrict-vtable-pointers
 // FIXME: Assume load should not require -fstrict-vtable-pointers
 
 // RUN: FileCheck --check-prefix=CHECK1 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK2 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK3 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK4 --input-file=%t.ll %s
-// RUN: FileCheck --check-prefix=CHECK5 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK-MS --input-file=%t.ms.ll %s
 // RUN: FileCheck --check-prefix=CHECK6 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK7 --input-file=%t.ll %s
 // RUN: FileCheck --check-prefix=CHECK8 --input-file=%t.ll %s
+// RUN: FileCheck --check-prefix=CHECK9 --input-file=%t.ll %s
 namespace test1 {
 
 struct A {
@@ -23,8 +26,8 @@ struct B : A {
 void g(A *a) { a->foo(); }
 
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooAEv()
-// CHECK1: call void @_ZN5test11AC1Ev(ptr
-// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK1: call{{.*}} void @_ZN5test11AC1Ev(ptr {{((addrspace(4)){0,1})}}
+// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11AE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
@@ -35,8 +38,8 @@ void fooA() {
 }
 
 // CHECK1-LABEL: define{{.*}} void @_ZN5test14fooBEv()
-// CHECK1: call void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
-// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK1: call{{.*}} void @_ZN5test11BC1Ev(ptr {{[^,]*}} %{{.*}})
+// CHECK1: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK1: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test11BE, i32 0, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: {{^}}}
@@ -46,7 +49,7 @@ void fooB() {
   g(&b);
 }
 // there should not be any assumes in the ctor that calls base ctor
-// CHECK1-LABEL: define linkonce_odr void @_ZN5test11BC2Ev(ptr
+// CHECK1-LABEL: define linkonce_odr{{.*}} void @_ZN5test11BC2Ev(ptr
 // CHECK1-NOT: @llvm.assume(
 // CHECK1-LABEL: {{^}}}
 }
@@ -69,17 +72,17 @@ void g(A *a) { a->foo(); }
 void h(B *b) { b->bar(); }
 
 // CHECK2-LABEL: define{{.*}} void @_ZN5test24testEv()
-// CHECK2: call void @_ZN5test21CC1Ev(ptr
+// CHECK2: call{{.*}} void @_ZN5test21CC1Ev(ptr
 // CHECK2: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{.*}}
 // CHECK2: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 0, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP]])
 
-// CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 8
-// CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %[[ADD_PTR]]
+// CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}, i64 8
+// CHECK2: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%[[ADD_PTR]]
 // CHECK2: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)], [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test21CE, i32 0, i32 1, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP2]])
 
-// CHECK2: call void @_ZN5test21gEPNS_1AE(
+// CHECK2: call{{.*}} void @_ZN5test21gEPNS_1AE(
 // CHECK2-LABEL: {{^}}}
 
 void test() {
@@ -106,7 +109,7 @@ struct C : virtual A, B {
 void g(B *a) { a->foo(); }
 
 // CHECK3-LABEL: define{{.*}} void @_ZN5test34testEv()
-// CHECK3: call void @_ZN5test31CC1Ev(ptr
+// CHECK3: call{{.*}} void @_ZN5test31CC1Ev(ptr
 // CHECK3: %[[CMP:.*]] = icmp eq ptr addrspace(1) %{{.*}}, getelementptr inbounds inrange(-24, 8) ({ [4 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test31CE, i32 0, i32 0, i32 3)
 // CHECK3: call void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
@@ -134,12 +137,12 @@ struct C : B {
 void g(C *c) { c->foo(); }
 
 // CHECK4-LABEL: define{{.*}} void @_ZN5test44testEv()
-// CHECK4: call void @_ZN5test41CC1Ev(ptr
-// CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK4: call{{.*}} void @_ZN5test41CC1Ev(ptr
+// CHECK4: %[[VTABLE:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK4: %[[CMP:.*]] = icmp eq ptr addrspace(1) %[[VTABLE]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP]]
 
-// CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr %{{.*}}
+// CHECK4: %[[VTABLE2:.*]] = load ptr addrspace(1), ptr {{((addrspace(4)){0,1})}}{{.*}}%{{.*}}
 // CHECK4: %[[CMP2:.*]] = icmp eq ptr addrspace(1) %[[VTABLE2]], getelementptr inbounds inrange(-32, 8) ({ [5 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTVN5test41CE, i32 0, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: {{^}}}
@@ -150,6 +153,27 @@ void test() {
 }
 } // test4
 
+namespace testMS {
+
+struct __declspec(novtable) S {
+  virtual void foo();
+};
+
+void g(S &s) { s.foo(); }
+
+// if struct has novtable specifier, then we can't generate assumes
+// CHECK-MS-LABEL: define dso_local void @"?test@testMS@@YAXXZ"()
+// CHECK-MS: call x86_thiscallcc noundef ptr @"??0S@testMS@@QAE@XZ"(
+// CHECK-MS-NOT: @llvm.assume
+// CHECK-MS-LABEL: {{^}}}
+
+void test() {
+  S s;
+  g(s);
+}
+
+} // testMS
+
 namespace test6 {
 struct A {
   A();
@@ -161,17 +185,17 @@ struct B : A {
 };
 // FIXME: Because A's vtable is external, and no virtual functions are hidden,
 // it's safe to generate assumption loads.
-// CHECK5-LABEL: define{{.*}} void @_ZN5test61gEv()
-// CHECK5: call void @_ZN5test61AC1Ev(
-// CHECK5-NOT: call void @llvm.assume(
+// CHECK6-LABEL: define{{.*}} void @_ZN5test61gEv()
+// CHECK6: call{{.*}} void @_ZN5test61AC1Ev(
+// CHECK6-NOT: call void @llvm.assume(
 
 // We can't emit assumption loads for B, because if we would refer to vtable
 // it would refer to functions that will not be able to find (like implicit
 // inline destructor).
 
-// CHECK5-LABEL:   call void @_ZN5test61BC1Ev(
-// CHECK5-NOT: call void @llvm.assume(
-// CHECK5-LABEL: {{^}}}
+// CHECK6-LABEL:   call{{.*}} void @_ZN5test61BC1Ev(
+// CHECK6-NOT: call void @llvm.assume(
+// CHECK6-LABEL: {{^}}}
 void g() {
   A *a = new A;
   B *b = new B;
@@ -180,7 +204,7 @@ void g() {
 
 namespace test7 {
 // Because A's key function is defined here, vtable is generated in this TU
-// CHECK6: @_ZTVN5test71AE ={{.*}} unnamed_addr addrspace(1) constant
+// CHECK7: @_ZTVN5test71AE ={{.*}} unnamed_addr addrspace(1) constant
 struct A {
   A();
   virtual void foo();
@@ -188,10 +212,10 @@ struct A {
 };
 void A::foo() {}
 
-// CHECK6-LABEL: define{{.*}} void @_ZN5test71gEv()
-// CHECK6: call void @_ZN5test71AC1Ev(
-// CHECK6: call void @llvm.assume(
-// CHECK6-LABEL: {{^}}}
+// CHECK7-LABEL: define{{.*}} void @_ZN5test71gEv()
+// CHECK7: call{{.*}} void @_ZN5test71AC1Ev(
+// CHECK7: call void @llvm.assume(
+// CHECK7-LABEL: {{^}}}
 void g() {
   A *a = new A();
   a->bar();
@@ -205,14 +229,14 @@ struct A {
   virtual void bar();
 };
 
-// CHECK7-DAG: @_ZTVN5test81BE = available_externally unnamed_addr addrspace(1) constant
+// CHECK8-DAG: @_ZTVN5test81BE = available_externally unnamed_addr addrspace(1) constant
 struct B : A {
   B();
   void foo();
   void bar();
 };
 
-// CHECK7-DAG: @_ZTVN5test81CE = linkonce_odr unnamed_addr addrspace(1) constant
+// CHECK8-DAG: @_ZTVN5test81CE = linkonce_odr unnamed_addr addrspace(1) constant
 struct C : A {
   C();
   void bar();
@@ -227,14 +251,14 @@ struct D : A {
 };
 void D::bar() {}
 
-// CHECK7-DAG: @_ZTVN5test81EE = linkonce_odr unnamed_addr addrspace(1) constant
+// CHECK8-DAG: @_ZTVN5test81EE = linkonce_odr unnamed_addr addrspace(1) constant
 struct E : A {
   E();
 };
 
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81bEv()
-// CHECK7: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81bEv()
+// CHECK8: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void b() {
   B b;
   b.bar();
@@ -243,26 +267,26 @@ void b() {
 // FIXME: C has inline virtual functions which prohibits as from generating
 // assumption loads, but because vtable is generated in this TU (key function
 // defined here) it would be correct to refer to it.
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81cEv()
-// CHECK7-NOT: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81cEv()
+// CHECK8-NOT: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void c() {
   C c;
   c.bar();
 }
 
 // FIXME: We could generate assumption loads here.
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81dEv()
-// CHECK7-NOT: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81dEv()
+// CHECK8-NOT: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void d() {
   D d;
   d.bar();
 }
 
-// CHECK7-LABEL: define{{.*}} void @_ZN5test81eEv()
-// CHECK7: call void @llvm.assume(
-// CHECK7-LABEL: {{^}}}
+// CHECK8-LABEL: define{{.*}} void @_ZN5test81eEv()
+// CHECK8: call void @llvm.assume(
+// CHECK8-LABEL: {{^}}}
 void e() {
   E e;
   e.bar();
@@ -276,9 +300,9 @@ struct S {
   __attribute__((visibility("hidden"))) virtual void doStuff();
 };
 
-// CHECK8-LABEL: define{{.*}} void @_ZN5test94testEv()
-// CHECK8-NOT: @llvm.assume(
-// CHECK8: }
+// CHECK9-LABEL: define{{.*}} void @_ZN5test94testEv()
+// CHECK9-NOT: @llvm.assume(
+// CHECK9: }
 void test() {
   S *s = new S();
   s->doStuff();
diff --git a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
index a3f12f0ebfc87b..876d0845cc5157 100644
--- a/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-pointer-initialization-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 struct Field {
   Field();
@@ -24,6 +25,7 @@ struct A : Base {
 // CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1A, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev(
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} void @_ZN1AC2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 A::A() { }
 
 // CHECK-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr {{[^,]*}} %this) unnamed_addr
@@ -31,6 +33,7 @@ A::A() { }
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define{{.*}} void @_ZN1AD2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 A::~A() { }
 
 struct B : Base {
@@ -43,18 +46,22 @@ void f() { B b; }
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC1Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN1BC2Ev(
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BC1Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD1Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN1BD2Ev(
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BD1Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
 // CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldC1Ev
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BC2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(ptr {{[^,]*}} %this) unnamed_addr
 // CHECK: store ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
+// WITH-NONZERO-DEFAULT-AS-LABEL: define linkonce_odr{{.*}} void @_ZN1BD2Ev(ptr addrspace(4) {{[^,]*}} %this) unnamed_addr
diff --git a/clang/test/CodeGenCXX/vtt-address-space.cpp b/clang/test/CodeGenCXX/vtt-address-space.cpp
index 24f4e2a755da04..4c3d0a534611c7 100644
--- a/clang/test/CodeGenCXX/vtt-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtt-address-space.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=spirv64-unknown-unknown -fsycl-is-device -std=c++11 -emit-llvm -o - | FileCheck %s --check-prefix=WITH-NONZERO-DEFAULT-AS
 
 // This is the sample from the C++ Itanium ABI, p2.6.2.
 namespace Test {
@@ -25,3 +26,9 @@ namespace Test {
 // CHECK: define linkonce_odr void @_ZN4Test2V2C2Ev(ptr noundef nonnull align 8 dereferenceable(20) %this, ptr addrspace(1) noundef %vtt)
 // CHECK: define linkonce_odr void @_ZN4Test2C1C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
 // CHECK: define linkonce_odr void @_ZN4Test2C2C2Ev(ptr noundef nonnull align 8 dereferenceable(12) %this, ptr addrspace(1) noundef %vtt)
+// WITH-NONZERO-DEFAULT-AS: call {{.*}} void @_ZN4Test2V2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(20) %2, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 11))
+// WITH-NONZERO-DEFAULT-AS: call {{.*}} void @_ZN4Test2C1C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %this1, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 1))
+// WITH-NONZERO-DEFAULT-AS: call {{.*}} void @_ZN4Test2C2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %3, ptr addrspace(1) noundef getelementptr inbounds ([13 x ptr addrspace(1)], ptr addrspace(1) @_ZTTN4Test1DE, i64 0, i64 3))
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} void @_ZN4Test2V2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(20) %this, ptr addrspace(1) noundef %vtt)
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} void @_ZN4Test2C1C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %this, ptr addrspace(1) noundef %vtt)
+// WITH-NONZERO-DEFAULT-AS: define linkonce_odr {{.*}} void @_ZN4Test2C2C2Ev(ptr addrspace(4) noundef align 8 dereferenceable_or_null(12) %this, ptr addrspace(1) noundef %vtt)
diff --git a/clang/test/CodeGenCXX/wasm-eh.cpp b/clang/test/CodeGenCXX/wasm-eh.cpp
index af023f52191b97..1b17498ba9ce9f 100644
--- a/clang/test/CodeGenCXX/wasm-eh.cpp
+++ b/clang/test/CodeGenCXX/wasm-eh.cpp
@@ -34,7 +34,7 @@ void test0() {
 // CHECK-NEXT:   %[[EXN:.*]] = call ptr @llvm.wasm.get.exception(token %[[CATCHPAD]])
 // CHECK-NEXT:   store ptr %[[EXN]], ptr %exn.slot
 // CHECK-NEXT:   %[[SELECTOR:.*]] = call i32 @llvm.wasm.get.ehselector(token %[[CATCHPAD]])
-// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #7
+// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #7
 // CHECK-NEXT:   %[[MATCHES:.*]] = icmp eq i32 %[[SELECTOR]], %[[TYPEID]]
 // CHECK-NEXT:   br i1 %[[MATCHES]], label %[[CATCH_INT_BB:.*]], label %[[CATCH_FALLTHROUGH_BB:.*]]
 
@@ -51,7 +51,7 @@ void test0() {
 // CHECK-NEXT:   br label %[[TRY_CONT_BB:.*]]
 
 // CHECK: [[CATCH_FALLTHROUGH_BB]]
-// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTId) #7
+// CHECK-NEXT:   %[[TYPEID:.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTId) #7
 // CHECK-NEXT:   %[[MATCHES:.*]] = icmp eq i32 %[[SELECTOR]], %[[TYPEID]]
 // CHECK-NEXT:   br i1 %[[MATCHES]], label %[[CATCH_FLOAT_BB:.*]], label %[[RETHROW_BB:.*]]
 
diff --git a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp
index 0afc6b30d140e9..fdee76cb96146e 100644
--- a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -1865,7 +1865,7 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
 
   // llvm.eh.typeid.for intrinsic
 
-  getDeclaration(&module, llvm::Intrinsic::eh_typeid_for);
+  getDeclaration(&module, llvm::Intrinsic::eh_typeid_for, builder.getPtrTy());
 }
 
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 78f0dbec863e96..3019f68083d422 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1371,7 +1371,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
 
 // The result of eh.typeid.for depends on the enclosing function, but inside a
 // given function it is 'const' and may be CSE'd etc.
-def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>;
+def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty], [IntrNoMem]>;
 
 def int_eh_return_i32 : Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>;
 def int_eh_return_i64 : Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty]>;
@@ -1730,7 +1730,7 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic<
 
 ///===-------------------------- Other Intrinsics --------------------------===//
 //
-// TODO: We should introduce a new memory kind fo traps (and other side effects 
+// TODO: We should introduce a new memory kind fo traps (and other side effects
 //       we only model to keep things alive).
 def int_trap : Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrInaccessibleMemOnly,
                IntrWriteMem]>, ClangBuiltin<"__builtin_trap">;
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
index d17a5b419e3518..f6b36c56c6d3d3 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-exceptions.ll
@@ -44,7 +44,7 @@ lpad:                                             ; preds = %entry
 ; CHECK-NEXT: %[[CDR:.*]] = extractvalue { ptr, i32 } %[[IVI2]], 1
 
 catch.dispatch:                                   ; preds = %lpad
-  %3 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %3 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %matches = icmp eq i32 %2, %3
   br i1 %matches, label %catch1, label %catch
 ; CHECK: catch.dispatch:
@@ -139,7 +139,7 @@ lpad:                                             ; preds = %entry
   br label %catch.dispatch
 
 catch.dispatch:                                   ; preds = %lpad
-  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %matches = icmp eq i32 %3, %4
   br i1 %matches, label %catch1, label %catch
 
@@ -162,7 +162,7 @@ declare void @foo(i32)
 declare ptr @bar(i8, i8)
 
 declare i32 @__gxx_personality_v0(...)
-declare i32 @llvm.eh.typeid.for(ptr)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
 declare ptr @__cxa_begin_catch(ptr)
 declare void @__cxa_end_catch()
 declare void @__cxa_call_unexpected(ptr)
diff --git a/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll b/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll
index aef55af81dcac0..a7e6ff30d8b2f6 100644
--- a/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll
+++ b/llvm/test/Transforms/GVNHoist/infinite-loop-indirect.ll
@@ -292,7 +292,7 @@ define i32 @foo2(ptr nocapture readonly %i) local_unnamed_addr personality ptr @
 ; CHECK-NEXT:    [[BC1:%.*]] = add i32 [[TMP0]], 10
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP2]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #[[ATTR1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #[[ATTR1]]
 ; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[BC7:%.*]] = add i32 [[TMP0]], 10
 ; CHECK-NEXT:    [[TMP6:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP3]]) #[[ATTR1]]
@@ -340,7 +340,7 @@ lpad:
   %bc1 = add i32 %0, 10
   %3 = extractvalue { ptr, i32 } %2, 0
   %4 = extractvalue { ptr, i32 } %2, 1
-  %5 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #2
+  %5 = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #2
   %matches = icmp eq i32 %4, %5
   %bc7 = add i32 %0, 10
   %6 = tail call ptr @__cxa_begin_catch(ptr %3) #2
@@ -383,7 +383,7 @@ declare void @__cxa_throw(ptr, ptr, ptr) local_unnamed_addr
 declare i32 @__gxx_personality_v0(...)
 
 ; Function Attrs: nounwind readnone
-declare i32 @llvm.eh.typeid.for(ptr) #1
+declare i32 @llvm.eh.typeid.for.p0(ptr) #1
 
 declare ptr @__cxa_begin_catch(ptr) local_unnamed_addr
 
diff --git a/llvm/test/Transforms/Inline/inline_invoke.ll b/llvm/test/Transforms/Inline/inline_invoke.ll
index 89c56447c07bd4..5441e2a9e63b92 100644
--- a/llvm/test/Transforms/Inline/inline_invoke.ll
+++ b/llvm/test/Transforms/Inline/inline_invoke.ll
@@ -19,7 +19,7 @@ declare void @use(i32) nounwind
 
 declare void @opaque()
 
-declare i32 @llvm.eh.typeid.for(ptr) nounwind
+declare i32 @llvm.eh.typeid.for.p0(ptr) nounwind
 
 declare i32 @__gxx_personality_v0(...)
 
@@ -74,7 +74,7 @@ lpad:                                             ; preds = %entry
             catch ptr @_ZTIi
   %eh.exc = extractvalue { ptr, i32 } %exn, 0
   %eh.selector = extractvalue { ptr, i32 } %exn, 1
-  %0 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) nounwind
+  %0 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) nounwind
   %1 = icmp eq i32 %eh.selector, %0
   br i1 %1, label %catch, label %eh.resume
 
@@ -109,7 +109,7 @@ eh.resume:
 ; CHECK-NEXT: phi { ptr, i32 } [
 ; CHECK-NEXT: extractvalue { ptr, i32 }
 ; CHECK-NEXT: extractvalue { ptr, i32 }
-; CHECK-NEXT: call i32 @llvm.eh.typeid.for(
+; CHECK-NEXT: call i32 @llvm.eh.typeid.for.p0(
 
 
 ;; Test 1 - Correctly handle phis in outer landing pads.
@@ -133,7 +133,7 @@ lpad:
             catch ptr @_ZTIi
   %eh.exc = extractvalue { ptr, i32 } %exn, 0
   %eh.selector = extractvalue { ptr, i32 } %exn, 1
-  %0 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) nounwind
+  %0 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) nounwind
   %1 = icmp eq i32 %eh.selector, %0
   br i1 %1, label %catch, label %eh.resume
 
@@ -212,7 +212,7 @@ eh.resume:
 ; CHECK-NEXT: [[EXNJ1:%.*]] = phi { ptr, i32 } [ [[EXNJ2]], %[[LPAD_JOIN2]] ], [ [[LPADVAL1]], %[[RESUME1]] ]
 ; CHECK-NEXT: extractvalue { ptr, i32 } [[EXNJ1]], 0
 ; CHECK-NEXT: [[SELJ1:%.*]] = extractvalue { ptr, i32 } [[EXNJ1]], 1
-; CHECK-NEXT: [[T:%.*]] = call i32 @llvm.eh.typeid.for(
+; CHECK-NEXT: [[T:%.*]] = call i32 @llvm.eh.typeid.for.p0(
 ; CHECK-NEXT: icmp eq i32 [[SELJ1]], [[T]]
 
 ; CHECK:      call void @use(i32 [[XJ1]])
diff --git a/llvm/test/Transforms/LICM/scalar-promote-unwind.ll b/llvm/test/Transforms/LICM/scalar-promote-unwind.ll
index be11722d2d5676..f7829c4d6e4d97 100644
--- a/llvm/test/Transforms/LICM/scalar-promote-unwind.ll
+++ b/llvm/test/Transforms/LICM/scalar-promote-unwind.ll
@@ -304,7 +304,7 @@ define void @loop_within_tryblock() personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
 ; CHECK-NEXT:    br label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
 ; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 ; CHECK:       catch:
@@ -355,7 +355,7 @@ lpad:
   br label %catch.dispatch
 
 catch.dispatch:
-  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi) #3
+  %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi) #3
   %matches = icmp eq i32 %3, %4
   br i1 %matches, label %catch, label %eh.resume
 
@@ -564,6 +564,6 @@ declare ptr @__cxa_begin_catch(ptr)
 
 declare void @__cxa_end_catch()
 
-declare i32 @llvm.eh.typeid.for(ptr)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
 
 declare void @f() uwtable
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll b/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
index 3e1f8b97e98b82..4d5055cc5a7606 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-unwind-direct-call.ll
@@ -65,7 +65,7 @@ lpad:                                             ; preds = %cfi.cont
   %1 = landingpad { ptr, i32 }
   catch ptr @_ZTIi
   %2 = extractvalue { ptr, i32 } %1, 1
-  %3 = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTIi) #5
+  %3 = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi) #5
   %matches = icmp eq i32 %2, %3
   br i1 %matches, label %catch, label %eh.resume
 
@@ -90,7 +90,7 @@ declare void @__cfi_slowpath(i64, ptr) local_unnamed_addr
 declare i32 @__gxx_personality_v0(...)
 
 ; Function Attrs: nofree nosync nounwind memory(none)
-declare i32 @llvm.eh.typeid.for(ptr) #2
+declare i32 @llvm.eh.typeid.for.p0(ptr) #2
 
 declare ptr @__cxa_begin_catch(ptr) local_unnamed_addr
 
@@ -181,7 +181,7 @@ attributes #8 = { noreturn nounwind }
 ; CHECK-NEXT:    [[TMP0:%.*]] = landingpad { ptr, i32 }
 ; CHECK-NEXT:            catch ptr @_ZTIi
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr nonnull @_ZTIi) #[[ATTR6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi) #[[ATTR6]]
 ; CHECK-NEXT:    [[MATCHES:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EH_RESUME:%.*]]
 ; CHECK:       catch:
diff --git a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
index 675e7da26a1059..afd7610b716240 100644
--- a/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
+++ b/llvm/test/Transforms/NewGVN/2011-09-07-TypeIdFor.ll
@@ -10,7 +10,7 @@ declare void @_Z4barv()
 
 declare void @_Z7cleanupv()
 
-declare i32 @llvm.eh.typeid.for(ptr) nounwind readonly
+declare i32 @llvm.eh.typeid.for.p0(ptr) nounwind readonly
 
 declare ptr @__cxa_begin_catch(ptr) nounwind
 
@@ -32,11 +32,11 @@ define void @_Z3foov() uwtable personality ptr @__gxx_personality_v0 {
 ; CHECK-NEXT:            catch ptr @_ZTIb
 ; CHECK-NEXT:    [[EXC_PTR2_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 0
 ; CHECK-NEXT:    [[FILTER3_I:%.*]] = extractvalue { ptr, i32 } [[TMP0]], 1
-; CHECK-NEXT:    [[TYPEID_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+; CHECK-NEXT:    [[TYPEID_I:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID_I]]
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[PPAD:%.*]], label [[NEXT:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[TYPEID1_I:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+; CHECK-NEXT:    [[TYPEID1_I:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIb)
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[FILTER3_I]], [[TYPEID1_I]]
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[PPAD2:%.*]], label [[NEXT2:%.*]]
 ; CHECK:       ppad:
@@ -77,12 +77,12 @@ lpad:                                             ; preds = %entry
   catch ptr @_ZTIb
   %exc_ptr2.i = extractvalue { ptr, i32 } %0, 0
   %filter3.i = extractvalue { ptr, i32 } %0, 1
-  %typeid.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %typeid.i = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %1 = icmp eq i32 %filter3.i, %typeid.i
   br i1 %1, label %ppad, label %next
 
 next:                                             ; preds = %lpad
-  %typeid1.i = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+  %typeid1.i = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIb)
   %2 = icmp eq i32 %filter3.i, %typeid1.i
   br i1 %2, label %ppad2, label %next2
 
@@ -98,12 +98,12 @@ ppad2:                                            ; preds = %next
 
 next2:                                            ; preds = %next
   call void @_Z7cleanupv()
-  %typeid = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %typeid = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
   %4 = icmp eq i32 %filter3.i, %typeid
   br i1 %4, label %ppad3, label %next3
 
 next3:                                            ; preds = %next2
-  %typeid1 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIb)
+  %typeid1 = tail call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIb)
   %5 = icmp eq i32 %filter3.i, %typeid1
   br i1 %5, label %ppad4, label %unwind
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
index bd347d0cf6308a..57af89f5dbf8d6 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td
@@ -635,7 +635,7 @@ def LLVM_VaEndOp : LLVM_ZeroResultIntrOp<"vaend", [0]>,
 // Exception handling intrinsics.
 //
 
-def LLVM_EhTypeidForOp : LLVM_OneResultIntrOp<"eh.typeid.for"> {
+def LLVM_EhTypeidForOp : LLVM_OneResultIntrOp<"eh.typeid.for", [], [0]> {
     let arguments = (ins LLVM_AnyPointer:$type_info);
     let assemblyFormat = "$type_info attr-dict `:` functional-type(operands, results)";
 }
diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
index e43024ff868e56..9a5528002ef5e9 100644
--- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll
+++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll
@@ -732,7 +732,7 @@ define void @coro_promise(ptr %0, i32 %1, i1 %2) {
 ; CHECK-LABEL:  llvm.func @eh_typeid_for
 define void @eh_typeid_for(ptr %0) {
   ; CHECK: llvm.intr.eh.typeid.for %{{.*}} : (!llvm.ptr) -> i32
-  %2 = call i32 @llvm.eh.typeid.for(ptr %0)
+  %2 = call i32 @llvm.eh.typeid.for.p0(ptr %0)
   ret void
 }
 
@@ -1082,7 +1082,7 @@ declare i1 @llvm.coro.end(ptr, i1, token)
 declare ptr @llvm.coro.free(token, ptr nocapture readonly)
 declare void @llvm.coro.resume(ptr)
 declare ptr @llvm.coro.promise(ptr nocapture, i32, i1)
-declare i32 @llvm.eh.typeid.for(ptr)
+declare i32 @llvm.eh.typeid.for.p0(ptr)
 declare ptr @llvm.stacksave.p0()
 declare ptr addrspace(1) @llvm.stacksave.p1()
 declare void @llvm.stackrestore.p0(ptr)
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index 238c3e4263cb05..1e533aeacfb49d 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -724,7 +724,7 @@ llvm.func @coro_promise(%arg0: !llvm.ptr, %arg1 : i32, %arg2 : i1) {
 
 // CHECK-LABEL: @eh_typeid_for
 llvm.func @eh_typeid_for(%arg0 : !llvm.ptr) {
-    // CHECK: call i32 @llvm.eh.typeid.for
+    // CHECK: call i32 @llvm.eh.typeid.for.p0
     %0 = llvm.intr.eh.typeid.for %arg0 : (!llvm.ptr) -> i32
     llvm.return
 }

From 0c7d268ba72767b70c7bf0bc8ae6422c509f94d8 Mon Sep 17 00:00:00 2001
From: aengelke <engelke@in.tum.de>
Date: Sun, 19 May 2024 16:38:53 +0200
Subject: [PATCH 05/60] [CodeGen][SDAG] Skip preferred extend at O0 (#92643)

This is a pure optimization to avoid redundant extensions, but iterating
over all users is expensive, so don't do this at -O0.
---
 llvm/include/llvm/CodeGen/SelectionDAG.h               | 1 +
 llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 979ef8033eb5e7..ed6962685f7b04 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -469,6 +469,7 @@ class SelectionDAG {
   MachineFunction &getMachineFunction() const { return *MF; }
   const Pass *getPass() const { return SDAGISelPass; }
 
+  CodeGenOptLevel getOptLevel() const { return OptLevel; }
   const DataLayout &getDataLayout() const { return MF->getDataLayout(); }
   const TargetMachine &getTarget() const { return TM; }
   const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 8fb6b11b8805c5..35f840201e4ba1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -222,8 +222,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
         if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
           InitializeRegForValue(&I);
 
-      // Decide the preferred extend type for a value.
-      PreferredExtendType[&I] = getPreferredExtendForValue(&I);
+      // Decide the preferred extend type for a value. This iterates over all
+      // users and therefore isn't cheap, so don't do this at O0.
+      if (DAG->getOptLevel() != CodeGenOptLevel::None)
+        PreferredExtendType[&I] = getPreferredExtendForValue(&I);
     }
   }
 

From 9e4ef0dee18c0c99325e8d56f16c149020e89d37 Mon Sep 17 00:00:00 2001
From: aengelke <engelke@in.tum.de>
Date: Sun, 19 May 2024 16:39:19 +0200
Subject: [PATCH 06/60] [CodeGen][SDAG] Track returntwice in lowering info
 (#92640)

This saves an extra iteration over the all instructions of the function.
---
 llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 4 ++++
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp     | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 35f840201e4ba1..de22d230b1c32d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -214,6 +214,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
           if (CI->isMustTailCall() && Fn->isVarArg())
             MF->getFrameInfo().setHasMustTailInVarArgFunc(true);
         }
+
+        // Determine if there is a call to setjmp in the machine function.
+        if (Call->hasFnAttr(Attribute::ReturnsTwice))
+          MF->setExposesReturnsTwice(true);
       }
 
       // Mark values used outside their block as exported, by allocating
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index b5694c955b8c8f..8addaf1ae3e540 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -680,9 +680,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     }
   }
 
-  // Determine if there is a call to setjmp in the machine function.
-  MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());
-
   // Determine if floating point is used for msvc
   computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI());
 

From eab92cb7f33be16a6a17549182e9237112b7a183 Mon Sep 17 00:00:00 2001
From: Nhat Nguyen <nhat7203@gmail.com>
Date: Sun, 19 May 2024 10:57:11 -0400
Subject: [PATCH 07/60] [llvm] Add KnownBits implementations for avgFloor and
 avgCeil (#86445)

This PR is to address the issue #84640
---
 llvm/include/llvm/Support/KnownBits.h         | 12 +++++++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 29 +++++++++++------
 llvm/lib/Support/KnownBits.cpp                | 31 +++++++++++++++++++
 llvm/unittests/Support/KnownBitsTest.cpp      | 12 +++++++
 4 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
index 9b7f405b625642..ba4a5f01036ca6 100644
--- a/llvm/include/llvm/Support/KnownBits.h
+++ b/llvm/include/llvm/Support/KnownBits.h
@@ -354,6 +354,18 @@ struct KnownBits {
   /// Compute knownbits resulting from llvm.usub.sat(LHS, RHS)
   static KnownBits usub_sat(const KnownBits &LHS, const KnownBits &RHS);
 
+  /// Compute knownbits resulting from APIntOps::avgFloorS
+  static KnownBits avgFloorS(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute knownbits resulting from APIntOps::avgFloorU
+  static KnownBits avgFloorU(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute knownbits resulting from APIntOps::avgCeilS
+  static KnownBits avgCeilS(const KnownBits &LHS, const KnownBits &RHS);
+
+  /// Compute knownbits resulting from APIntOps::avgCeilU
+  static KnownBits avgCeilU(const KnownBits &LHS, const KnownBits &RHS);
+
   /// Compute known bits resulting from multiplying LHS and RHS.
   static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS,
                        bool NoUndefSelfMultiply = false);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2e1f4b7e5b3742..72685a2d772164 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3468,19 +3468,28 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       Known = KnownBits::mulhs(Known, Known2);
     break;
   }
-  case ISD::AVGFLOORU:
-  case ISD::AVGCEILU:
-  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU: {
+    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::avgFloorU(Known, Known2);
+    break;
+  }
+  case ISD::AVGCEILU: {
+    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::avgCeilU(Known, Known2);
+    break;
+  }
+  case ISD::AVGFLOORS: {
+    Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known = KnownBits::avgFloorS(Known, Known2);
+    break;
+  }
   case ISD::AVGCEILS: {
-    bool IsCeil = Opcode == ISD::AVGCEILU || Opcode == ISD::AVGCEILS;
-    bool IsSigned = Opcode == ISD::AVGFLOORS || Opcode == ISD::AVGCEILS;
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = IsSigned ? Known.sext(BitWidth + 1) : Known.zext(BitWidth + 1);
-    Known2 = IsSigned ? Known2.sext(BitWidth + 1) : Known2.zext(BitWidth + 1);
-    KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
-    Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
-    Known = Known.extractBits(BitWidth, 1);
+    Known = KnownBits::avgCeilS(Known, Known2);
     break;
   }
   case ISD::SELECT:
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index fe47884f3e55ac..d6012a8eea8a6e 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -774,6 +774,37 @@ KnownBits KnownBits::usub_sat(const KnownBits &LHS, const KnownBits &RHS) {
   return computeForSatAddSub(/*Add*/ false, /*Signed*/ false, LHS, RHS);
 }
 
+static KnownBits avgCompute(KnownBits LHS, KnownBits RHS, bool IsCeil,
+                            bool IsSigned) {
+  unsigned BitWidth = LHS.getBitWidth();
+  LHS = IsSigned ? LHS.sext(BitWidth + 1) : LHS.zext(BitWidth + 1);
+  RHS = IsSigned ? RHS.sext(BitWidth + 1) : RHS.zext(BitWidth + 1);
+  KnownBits Carry = KnownBits::makeConstant(APInt(1, IsCeil ? 1 : 0));
+  LHS = KnownBits::computeForAddCarry(LHS, RHS, Carry);
+  LHS = LHS.extractBits(BitWidth, 1);
+  return LHS;
+}
+
+KnownBits KnownBits::avgFloorS(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ false,
+                    /* IsSigned */ true);
+}
+
+KnownBits KnownBits::avgFloorU(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ false,
+                    /* IsSigned */ false);
+}
+
+KnownBits KnownBits::avgCeilS(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ true,
+                    /* IsSigned */ true);
+}
+
+KnownBits KnownBits::avgCeilU(const KnownBits &LHS, const KnownBits &RHS) {
+  return avgCompute(LHS, RHS, /* IsCeil */ true,
+                    /* IsSigned */ false);
+}
+
 KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS,
                          bool NoUndefSelfMultiply) {
   unsigned BitWidth = LHS.getBitWidth();
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index d7407070271662..824cf7501fd44f 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -501,6 +501,18 @@ TEST(KnownBitsTest, BinaryExhaustive) {
       "mulhu", KnownBits::mulhu,
       [](const APInt &N1, const APInt &N2) { return APIntOps::mulhu(N1, N2); },
       /*CheckOptimality=*/false);
+
+  testBinaryOpExhaustive("avgFloorS", KnownBits::avgFloorS, APIntOps::avgFloorS,
+                         false);
+
+  testBinaryOpExhaustive("avgFloorU", KnownBits::avgFloorU, APIntOps::avgFloorU,
+                         false);
+
+  testBinaryOpExhaustive("avgCeilU", KnownBits::avgCeilU, APIntOps::avgCeilU,
+                         false);
+
+  testBinaryOpExhaustive("avgCeilS", KnownBits::avgCeilS, APIntOps::avgCeilS,
+                         false);
 }
 
 TEST(KnownBitsTest, UnaryExhaustive) {

From c1c1567d60983298a0db0efefd78899985464f19 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 19 May 2024 17:35:42 +0200
Subject: [PATCH 08/60] SimplifyLibCalls: Permit pow(2, x) -> ldexp(1, x) fold
 for vectors (#92532)

---
 .../lib/Transforms/Utils/SimplifyLibCalls.cpp |  7 +-
 .../Transforms/InstCombine/pow-to-ldexp.ll    | 69 ++++++-------------
 2 files changed, 24 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c9567b740026bf..eb1224abf00e29 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -2087,15 +2087,16 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
 
   AttributeList NoAttrs; // Attributes are only meaningful on the original call
 
+  const bool UseIntrinsic = Pow->doesNotAccessMemory();
+
   // pow(2.0, itofp(x)) -> ldexp(1.0, x)
-  // TODO: This does not work for vectors because there is no ldexp intrinsic.
-  if (!Ty->isVectorTy() && match(Base, m_SpecificFP(2.0)) &&
+  if ((UseIntrinsic || !Ty->isVectorTy()) && match(Base, m_SpecificFP(2.0)) &&
       (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
       hasFloatFn(M, TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
     if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) {
       Constant *One = ConstantFP::get(Ty, 1.0);
 
-      if (Pow->doesNotAccessMemory()) {
+      if (UseIntrinsic) {
         return copyFlags(*Pow, B.CreateIntrinsic(Intrinsic::ldexp,
                                                  {Ty, ExpoI->getType()},
                                                  {One, ExpoI}, Pow, "exp2"));
diff --git a/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll b/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
index 27249dd5d72aec..b61f8809bd2598 100644
--- a/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
+++ b/llvm/test/Transforms/InstCombine/pow-to-ldexp.ll
@@ -144,16 +144,10 @@ define half @pow_sitofp_f16_const_base_2(i32 %x) {
 }
 
 define <2 x float> @pow_sitofp_v2f32_const_base_2(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x float> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x float>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <2 x float> @llvm.pow.v2f32(<2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x float> [[POW]]
+; LDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x float> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
@@ -205,15 +199,10 @@ define <2 x float> @pow_sitofp_v2f32_const_base_mixed_2(<2 x i32> %x) {
 }
 
 define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call nsz afn <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x float> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call nsz afn <2 x float> @llvm.powi.v2f32.v2i32(<2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x i32> [[X]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x float> [[POW]]
+; LDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call nsz afn <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x float> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
@@ -227,16 +216,10 @@ define <2 x float> @pow_sitofp_v2f32_const_base_2__flags(<2 x i32> %x) {
 }
 
 define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(<vscale x 4 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
-; LDEXP-EXP2-SAME: <vscale x 4 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <vscale x 4 x float> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
-; LDEXP-NOEXP2-SAME: <vscale x 4 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <vscale x 4 x i32> [[X]] to <vscale x 4 x float>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <vscale x 4 x float> [[POW]]
+; LDEXP-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
+; LDEXP-SAME: <vscale x 4 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <vscale x 4 x float> @llvm.ldexp.nxv4f32.nxv4i32(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[X]])
+; LDEXP-NEXT:    ret <vscale x 4 x float> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(
 ; NOLDEXP-SAME: <vscale x 4 x i32> [[X:%.*]]) {
@@ -250,16 +233,10 @@ define <vscale x 4 x float> @pow_sitofp_nxv4f32_const_base_2(<vscale x 4 x i32>
 }
 
 define <2 x half> @pow_sitofp_v2f16_const_base_2(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> <half 0xH3C00, half 0xH3C00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x half> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x half>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <2 x half> @llvm.pow.v2f16(<2 x half> <half 0xH4000, half 0xH4000>, <2 x half> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x half> [[POW]]
+; LDEXP-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> <half 0xH3C00, half 0xH3C00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x half> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x half> @pow_sitofp_v2f16_const_base_2(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {
@@ -273,16 +250,10 @@ define <2 x half> @pow_sitofp_v2f16_const_base_2(<2 x i32> %x) {
 }
 
 define <2 x double> @pow_sitofp_v2f64_const_base_2(<2 x i32> %x) {
-; LDEXP-EXP2-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
-; LDEXP-EXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-EXP2-NEXT:    [[EXP2:%.*]] = tail call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x i32> [[X]])
-; LDEXP-EXP2-NEXT:    ret <2 x double> [[EXP2]]
-;
-; LDEXP-NOEXP2-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
-; LDEXP-NOEXP2-SAME: <2 x i32> [[X:%.*]]) {
-; LDEXP-NOEXP2-NEXT:    [[ITOFP:%.*]] = sitofp <2 x i32> [[X]] to <2 x double>
-; LDEXP-NOEXP2-NEXT:    [[POW:%.*]] = tail call <2 x double> @llvm.pow.v2f64(<2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> [[ITOFP]])
-; LDEXP-NOEXP2-NEXT:    ret <2 x double> [[POW]]
+; LDEXP-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
+; LDEXP-SAME: <2 x i32> [[X:%.*]]) {
+; LDEXP-NEXT:    [[EXP2:%.*]] = tail call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> <double 1.000000e+00, double 1.000000e+00>, <2 x i32> [[X]])
+; LDEXP-NEXT:    ret <2 x double> [[EXP2]]
 ;
 ; NOLDEXP-LABEL: define <2 x double> @pow_sitofp_v2f64_const_base_2(
 ; NOLDEXP-SAME: <2 x i32> [[X:%.*]]) {

From b050048d35f6580fb427e6de9063444aa85625c6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 19 May 2024 16:45:23 +0100
Subject: [PATCH 09/60] [VPlan] Simplify (X && Y) || (X && !Y) -> X. (#89386)

Simplify a common pattern generated for masks when folding the tail.

PR: https://github.com/llvm/llvm-project/pull/89386
---
 .../Transforms/Vectorize/VPlanPatternMatch.h  |  8 +++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 15 +++++++++++++-
 .../LoopVectorize/AArch64/masked-call.ll      |  9 +++------
 .../AArch64/scalable-strict-fadd.ll           |  3 +--
 .../LoopVectorize/RISCV/uniform-load-store.ll | 20 ++++---------------
 .../Transforms/LoopVectorize/uniform-blend.ll |  5 +----
 .../unused-blend-mask-for-first-operand.ll    | 12 ++---------
 .../vplan-sink-scalars-and-merge.ll           | 19 +++++-------------
 8 files changed, 37 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 50b08bbb7ebf71..56cbaa42012978 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -270,9 +270,15 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
 
 template <typename Op0_t, typename Op1_t>
 inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
-m_Or(const Op0_t &Op0, const Op1_t &Op1) {
+m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
 }
+
+template <typename Op0_t, typename Op1_t>
+inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::LogicalAnd>
+m_LogicalAnd(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_VPInstruction<VPInstruction::LogicalAnd, Op0_t, Op1_t>(Op0, Op1);
+}
 } // namespace VPlanPatternMatch
 } // namespace llvm
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c0eb6d710ad34b..4c968c2834b10d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -935,6 +935,19 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
 #endif
   }
 
+  // Simplify (X && Y) || (X && !Y) -> X.
+  // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
+  // && (Y || Z) and (X || !X) into true. This requires queuing newly created
+  // recipes to be visited during simplification.
+  VPValue *X, *Y, *X1, *Y1;
+  if (match(&R,
+            m_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                       m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+      X == X1 && Y == Y1) {
+    R.getVPSingleValue()->replaceAllUsesWith(X);
+    return;
+  }
+
   if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)),
                             m_Mul(m_SpecificInt(1), m_VPValue(A)))))
     return R.getVPSingleValue()->replaceAllUsesWith(A);
@@ -1402,7 +1415,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
         // for dependence analysis). Instead, replace it with an equivalent Add.
         // This is possible as all users of the disjoint OR only access lanes
         // where the operands are disjoint or poison otherwise.
-        if (match(RecWithFlags, m_Or(m_VPValue(A), m_VPValue(B))) &&
+        if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
             RecWithFlags->isDisjoint()) {
           VPBuilder Builder(RecWithFlags);
           VPInstruction *New = Builder.createOverflowingOp(
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index b91579106261a6..d335ac4b697099 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -223,10 +223,9 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT:    [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP9]])
 ; TFCOMMON-NEXT:    [[TMP11:%.*]] = xor <vscale x 2 x i1> [[TMP8]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFCOMMON-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer
-; TFCOMMON-NEXT:    [[TMP13:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP12]]
 ; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP10]]
 ; TFCOMMON-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[TMP13]])
+; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT:    [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -272,16 +271,14 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP14]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFA_INTERLEAVE-NEXT:    [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP19]], <vscale x 2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = or <vscale x 2 x i1> [[TMP15]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = or <vscale x 2 x i1> [[TMP16]], [[TMP22]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP17]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP18]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = mul i64 [[TMP26]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]]
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[TMP23]])
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[TMP24]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP28]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ddc004657ed5bf..bcf8096f1b7383 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1241,9 +1241,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x float> poison)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; CHECK-ORDERED-TF-NEXT:    [[TMP19:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP18]]
 ; CHECK-ORDERED-TF-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED-TF-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT:    [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP20]])
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 1ce4cb928e8085..ee70f4aa35850f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -462,13 +462,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> poison)
-; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = or <vscale x 2 x i1> [[TMP13]], [[TMP15]]
 ; TF-SCALABLE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP13]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], <vscale x 2 x i64> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[TMP17]])
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -510,13 +507,10 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
 ; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison)
-; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP4]]
 ; TF-FIXEDLEN-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; TF-FIXEDLEN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
+; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr [[TMP7]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; TF-FIXEDLEN-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
@@ -1296,12 +1290,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 10, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
 ; TF-SCALABLE-NEXT:    [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP12]], <vscale x 2 x i1> zeroinitializer
 ; TF-SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[BROADCAST_SPLAT2]], i32 8, <vscale x 2 x i1> [[TMP13]])
-; TF-SCALABLE-NEXT:    [[TMP15:%.*]] = xor <vscale x 2 x i1> [[TMP12]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; TF-SCALABLE-NEXT:    [[TMP16:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP15]], <vscale x 2 x i1> zeroinitializer
-; TF-SCALABLE-NEXT:    [[TMP17:%.*]] = or <vscale x 2 x i1> [[TMP13]], [[TMP16]]
 ; TF-SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
 ; TF-SCALABLE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
-; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[TMP17]])
+; TF-SCALABLE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-SCALABLE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1344,12 +1335,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-FIXEDLEN-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], <i64 10, i64 10, i64 10, i64 10>
 ; TF-FIXEDLEN-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
 ; TF-FIXEDLEN-NEXT:    call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP2]])
-; TF-FIXEDLEN-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; TF-FIXEDLEN-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; TF-FIXEDLEN-NEXT:    [[TMP6:%.*]] = or <4 x i1> [[TMP2]], [[TMP5]]
 ; TF-FIXEDLEN-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; TF-FIXEDLEN-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <4 x i1> [[TMP6]])
+; TF-FIXEDLEN-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <4 x i1> [[ACTIVE_LANE_MASK]])
 ; TF-FIXEDLEN-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; TF-FIXEDLEN-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; TF-FIXEDLEN-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 19cbcac6090c61..f33ec1419b1147 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -86,11 +86,8 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[MASK1]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> undef
-; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[TMP8]], <4 x i64> [[PREDPHI]], <4 x i64> undef
+; CHECK-NEXT:    [[PREDPHI1:%.*]] = select <4 x i1> [[MASK1]], <4 x i64> [[PREDPHI]], <4 x i64> undef
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
index 0f7bd3d71feb4f..d79b4a7cefc257 100644
--- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
+++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
@@ -172,8 +172,6 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[X]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[Y]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -184,14 +182,8 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult <4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP7]], <4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP11]], ptr [[B]], ptr poison
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select i1 [[TMP9]], ptr [[B]], ptr poison
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[PREDPHI]], align 2
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT3]], <4 x i16> poison, <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 1e60e57a5409d4..ae5879bb2bae9c 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -361,15 +361,12 @@ define void @pred_cfg1(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   EMIT vp<[[NOT:%.+]]> = not ir<%c.1>
-; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, vp<[[NOT]]>
-; CHECK-NEXT:   EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]>, vp<[[MASK3]]>
 ; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT:   pred.store.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[OR]]>
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK1]]>
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
@@ -462,16 +459,13 @@ define void @pred_cfg2(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   EMIT vp<[[NOT:%.+]]> = not ir<%c.0>
-; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, vp<[[NOT]]>
-; CHECK-NEXT:   EMIT vp<[[OR:%.+]]> = or vp<[[MASK2]]>, vp<[[MASK3]]>
 ; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
-; CHECK-NEXT:   EMIT vp<[[MASK4:%.+]]> = logical-and vp<[[OR]]>, ir<%c.1>
+; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.1>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT:   pred.store.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK4]]>
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK3]]>
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:
@@ -570,16 +564,13 @@ define void @pred_cfg3(i32 %k, i32 %j) {
 ; CHECK-NEXT: Successor(s): then.0.0
 ; CHECK-EMPTY:
 ; CHECK-NEXT: then.0.0:
-; CHECK-NEXT:   EMIT vp<[[NOT:%.+]]> = not ir<%c.0>
-; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, vp<[[NOT]]>
-; CHECK-NEXT:   EMIT vp<[[MASK4:%.+]]> = or vp<[[MASK2]]>, vp<[[MASK3]]>
 ; CHECK-NEXT:   BLEND ir<%p> = ir<0> vp<[[PRED]]>/vp<[[MASK2]]>
-; CHECK-NEXT:   EMIT vp<[[MASK5:%.+]]> = logical-and vp<[[MASK4]]>, ir<%c.0>
+; CHECK-NEXT:   EMIT vp<[[MASK3:%.+]]> = logical-and vp<[[MASK1]]>, ir<%c.0>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: <xVFxUF> pred.store: {
 ; CHECK-NEXT:   pred.store.entry:
-; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK5]]>
+; CHECK-NEXT:     BRANCH-ON-MASK vp<[[MASK3]]>
 ; CHECK-NEXT:   Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:   pred.store.if:

From 643f36184bd3d9a95cbfd608af6f1cccc69e0187 Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas@microsoft.com>
Date: Sun, 19 May 2024 09:27:56 -0700
Subject: [PATCH 10/60] HLSL availability diagnostics design doc (#92207)

Design document for the HLSL availability diagnostic modes

Fixes microsoft/hlsl-specs#190

---------

Co-authored-by: Xiang Li <python3kgae@outlook.com>
---
 clang/docs/HLSL/AvailabilityDiagnostics.rst | 137 ++++++++++++++++++++
 clang/docs/HLSL/HLSLDocs.rst                |   1 +
 2 files changed, 138 insertions(+)
 create mode 100644 clang/docs/HLSL/AvailabilityDiagnostics.rst

diff --git a/clang/docs/HLSL/AvailabilityDiagnostics.rst b/clang/docs/HLSL/AvailabilityDiagnostics.rst
new file mode 100644
index 00000000000000..bb9d02f21dde62
--- /dev/null
+++ b/clang/docs/HLSL/AvailabilityDiagnostics.rst
@@ -0,0 +1,137 @@
+=============================
+HLSL Availability Diagnostics
+=============================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+HLSL availability diagnostics emits errors or warning when unavailable shader APIs are used. Unavailable shader APIs are APIs that are exposed in HLSL code but are not available in the target shader stage or shader model version.
+
+There are three modes of HLSL availability diagnostic:
+
+#. **Default mode** - compiler emits an error when an unavailable API is found in a code that is reachable from the shader entry point function or from an exported library function (when compiling a shader library)
+
+#. **Relaxed mode** - same as default mode except the compiler emits a warning. This mode is enabled by ``-Wno-error=hlsl-availability``.
+
+#. **Strict mode** - compiler emits an error when an unavailable API is found in parsed code regardless of whether it can be reached from the shader entry point or exported functions, or not. This mode is enabled by ``-fhlsl-strict-availability``.
+
+Implementation Details
+======================
+
+Environment Parameter
+---------------------
+
+In order to encode API availability based on the shader model version and shader model stage a new ``environment`` parameter was added to the existing Clang ``availability`` attribute.
+
+The values allowed for this parameter are a subset of values allowed as the ``llvm::Triple`` environment component. If the environment parameters is present, the declared availability attribute applies only to targets with the same platform and environment.
+
+Default and Relaxed Diagnostic Modes
+------------------------------------
+
+This mode is implemented in ``DiagnoseHLSLAvailability`` class in ``SemaHLSL.cpp`` and it is invoked after the whole translation unit is parsed (from ``Sema::ActOnEndOfTranslationUnit``). The implementation iterates over all shader entry points and exported library functions in the translation unit and performs an AST traversal of each function body.
+
+When a reference to another function or member method is found (``DeclRefExpr`` or ``MemberExpr``) and it has a body, the AST of the referenced function is also scanned. This chain of AST traversals will reach all of the code that is reachable from the initial shader entry point or exported library function and avoids the need to generate a call graph.
+
+All shader APIs have an availability attribute that specifies the shader model version (and environment, if applicable) when this API was first introduced.When a reference to a function without a definition is found and it has an availability attribute, the version of the attribute is checked against the target shader model version and shader stage (if shader stage context is known), and an appropriate diagnostic is generated as needed.
+
+All shader entry functions have ``HLSLShaderAttr`` attribute that specifies what type of shader this function represents. However, for exported library functions the target shader stage is unknown, so in this case the HLSL API availability will be only checked against the shader model version. It means that for exported library functions the diagnostic of APIs with availability specific to shader stage will be deferred until DXIL linking time.
+
+A list of functions that were already scanned is kept in order to avoid duplicate scans and diagnostics (see ``DiagnoseHLSLAvailability::ScannedDecls``). It might happen that a shader library has multiple shader entry points for different shader stages that all call into the same shared function. It is therefore important to record not just that a function has been scanned, but also in which shader stage context. This is done by using ``llvm::DenseMap`` that maps ``FunctionDecl *`` to a ``unsigned`` bitmap that represents a set of shader stages (or environments) the function has been scanned for. The ``N``'th bit in the set is set if the function has been scanned in shader environment whose ``HLSLShaderAttr::ShaderType`` integer value equals ``N``.
+
+The emitted diagnostic messages belong to ``hlsl-availability`` diagnostic group and are reported as errors by default. With ``-Wno-error=hlsl-availability`` flag they become warning, making it relaxed HLSL diagnostics mode.
+
+Strict Diagnostic Mode
+----------------------
+
+When strict HLSL availability diagnostic mode is enabled the compiler must report all HLSL API availability issues regardless of code reachability. The implementation of this mode takes advantage of an existing diagnostic scan in ``DiagnoseUnguardedAvailability`` class which is already traversing AST of each function as soon as the function body has been parsed. For HLSL, this pass was only slightly modified, such as making sure diagnostic messages are in the ``hlsl-availability`` group and that availability checks based on shader stage are not included if the shader stage context is unknown.
+
+If the compilation target is a shader library, only availability based on shader model version can be diagnosed during this scan. To diagnose availability based on shader stage, the compiler needs to run the AST traversals implementated in ``DiagnoseHLSLAvailability`` at the end of the translation unit as described above.
+
+As a result, availability based on specific shader stage will only be diagnosed in code that is reachable from a shader entry point or library export function. It also means that function bodies might be scanned multiple time. When that happens, care should be taken not to produce duplicated diagnostics.
+
+========
+Examples
+========
+
+**Note**
+For the example below, the ``WaveActiveCountBits`` API function became available in shader model 6.0 and ``WaveMultiPrefixSum`` in shader model 6.5.
+
+The availability of ``ddx`` function depends on a shader stage. It is available for pixel shaders in shader model 2.1 and higher, for compute, mesh and amplification shaders in shader model 6.6 and higher. For any other shader stages it is not available.
+
+Compute shader example
+======================
+
+.. code-block:: c++
+
+   float unusedFunction(float f) {
+     return ddx(f);
+   }
+
+   [numthreads(4, 4, 1)]
+   void main(uint3 threadId : SV_DispatchThreadId) {
+     float f1 = ddx(threadId.x);
+     float f2 = WaveActiveCountBits(threadId.y == 1.0);
+   }
+
+When compiled as compute shader for shader model version 5.0, Clang will emit the following error by default:
+
+.. code-block:: console
+
+   <>:7:13: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+   <>:8:13: error: 'WaveActiveCountBits' is only available on Shader Model 6.5 or newer
+
+With relaxed diagnostic mode this errors will become warnings.
+
+With strict diagnostic mode, in addition to the 2 errors above Clang will also emit error for the ``ddx`` call in ``unusedFunction``.:
+
+.. code-block:: console
+
+   <>:2:9: error: 'ddx' is only available in compute shader environment on Shader Model 6.5 or newer
+   <>:7:13: error: 'ddx' is only available in compute shader environment on Shader Model 6.5 or newer
+   <>:7:13: error: 'WaveActiveCountBits' is only available on Shader Model 6.5 or newer
+
+Shader library example
+======================
+
+.. code-block:: c++
+
+   float myFunction(float f) {
+     return ddx(f);
+   }
+
+   float unusedFunction(float f) {
+     return WaveMultiPrefixSum(f, 1.0);
+   }
+
+   [shader("compute")]
+   [numthreads(4, 4, 1)]
+   void main(uint3 threadId : SV_DispatchThreadId) {
+      float f = 3;
+      float e = myFunction(f);
+   }
+
+   [shader("pixel")]
+   void main() {
+      float f = 3;
+      float e = myFunction(f);
+   }
+
+When compiled as shader library vshader model version 6.4, Clang will emit the following error by default:
+
+.. code-block:: console
+
+   <>:2:9: error: 'ddx' is only available in compute shader environment on Shader Model 6.5 or newer
+
+With relaxed diagnostic mode this errors will become warnings.
+
+With strict diagnostic mode Clang will also emit errors for availability issues in code that is not used by any of the entry points:
+
+.. code-block:: console
+
+   <>2:9: error: 'ddx' is only available in compute shader environment on Shader Model 6.6 or newer
+   <>:6:9: error: 'WaveActiveCountBits' is only available on Shader Model 6.5 or newer
+
+Note that ``myFunction`` is reachable from both pixel and compute shader entry points is therefore scanned twice - once for each context. The diagnostic is emitted only for the compute shader context.
diff --git a/clang/docs/HLSL/HLSLDocs.rst b/clang/docs/HLSL/HLSLDocs.rst
index 97b2425f013b34..1e50a66d984b53 100644
--- a/clang/docs/HLSL/HLSLDocs.rst
+++ b/clang/docs/HLSL/HLSLDocs.rst
@@ -16,3 +16,4 @@ HLSL Design and Implementation
    ResourceTypes
    EntryFunctions
    FunctionCalls
+   AvailabilityDiagnostics

From c34079c9455515fd1eb4feaa7613a57e88b7209d Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Sun, 19 May 2024 11:39:46 -0500
Subject: [PATCH 11/60] [DOCS] ORCv2.rst Typo (#89482)

---
 llvm/docs/ORCv2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/ORCv2.rst b/llvm/docs/ORCv2.rst
index 910ef5b9f3d02f..333977a0aaa664 100644
--- a/llvm/docs/ORCv2.rst
+++ b/llvm/docs/ORCv2.rst
@@ -780,7 +780,7 @@ constructs a new ThreadSafeContext value from a std::unique_ptr<LLVMContext>:
     // separate context.
     for (const auto &IRPath : IRPaths) {
       auto Ctx = std::make_unique<LLVMContext>();
-      auto M = std::make_unique<LLVMContext>("M", *Ctx);
+      auto M = std::make_unique<Module>("M", *Ctx);
       CompileLayer.add(MainJD, ThreadSafeModule(std::move(M), std::move(Ctx)));
     }
 

From 3f33c4c14e79e68007cf1460e4a0e606eb199da5 Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas@microsoft.com>
Date: Sun, 19 May 2024 10:46:12 -0700
Subject: [PATCH 12/60] [Clang][HLSL] Add environment parameter to availability
 attribute (#89809)

Add `environment` parameter to Clang availability attribute. The allowed
values for this parameter are a subset of values allowed in the
`llvm::Triple` environment component. If the `environment` parameters is
present, the declared availability attribute applies only to targets
with the same platform and environment.

This new parameter will be initially used for annotating HLSL functions
for the `shadermodel` platform because in HLSL built-in function
availability can depend not just on the shader model version (mapped to
`llvm::Triple::OSType`) but also on the target shader stage (mapped to
`llvm::Triple::EnvironmentType`). See example in #89802 and
microsoft/hlsl-specs#204 for more details.

The environment parameter is currently supported only for HLSL.

Fixes #89802
---
 clang/include/clang/Basic/Attr.td             |  33 ++++-
 clang/include/clang/Basic/AttrDocs.td         |   5 +
 .../clang/Basic/DiagnosticParseKinds.td       |   2 +
 .../clang/Basic/DiagnosticSemaKinds.td        |  17 ++-
 clang/include/clang/Parse/Parser.h            |   3 +
 clang/include/clang/Sema/ParsedAttr.h         |  42 ++++--
 clang/include/clang/Sema/Sema.h               |  15 +-
 clang/lib/AST/DeclBase.cpp                    |  28 +++-
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  15 +-
 clang/lib/Index/CommentToXML.cpp              |   6 +
 clang/lib/Parse/ParseDecl.cpp                 |  20 ++-
 clang/lib/Sema/SemaAPINotes.cpp               |   3 +-
 clang/lib/Sema/SemaAvailability.cpp           | 128 +++++++++++++-----
 clang/lib/Sema/SemaDecl.cpp                   |   2 +-
 clang/lib/Sema/SemaDeclAttr.cpp               |  48 +++++--
 clang/test/Parser/attr-availability.c         |   2 +
 clang/test/Sema/attr-availability-ios.c       |   1 +
 .../attr-availability-compute.hlsl            |  73 ++++++++++
 .../attr-availability-errors.hlsl             |  11 ++
 .../Availability/attr-availability-mesh.hlsl  |  73 ++++++++++
 .../Availability/attr-availability-pixel.hlsl |  63 +++++++++
 clang/test/SemaHLSL/AvailabilityMarkup.hlsl   |  25 ----
 .../SemaHLSL/WaveBuiltinAvailability.hlsl     |   4 +-
 23 files changed, 508 insertions(+), 111 deletions(-)
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
 create mode 100644 clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
 delete mode 100644 clang/test/SemaHLSL/AvailabilityMarkup.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 38ee8356583bef..7008bea483c872 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -999,7 +999,7 @@ def Availability : InheritableAttr {
               VersionArgument<"deprecated">, VersionArgument<"obsoleted">,
               BoolArgument<"unavailable">, StringArgument<"message">,
               BoolArgument<"strict">, StringArgument<"replacement">,
-              IntArgument<"priority">];
+              IntArgument<"priority">, IdentifierArgument<"environment">];
   let AdditionalMembers =
 [{static llvm::StringRef getPrettyPlatformName(llvm::StringRef Platform) {
     return llvm::StringSwitch<llvm::StringRef>(Platform)
@@ -1019,7 +1019,7 @@ def Availability : InheritableAttr {
              .Case("xros", "visionOS")
              .Case("xros_app_extension", "visionOS (App Extension)")
              .Case("swift", "Swift")
-             .Case("shadermodel", "HLSL ShaderModel")
+             .Case("shadermodel", "Shader Model")
              .Case("ohos", "OpenHarmony OS")
              .Default(llvm::StringRef());
 }
@@ -1059,7 +1059,34 @@ static llvm::StringRef canonicalizePlatformName(llvm::StringRef Platform) {
              .Case("visionos_app_extension", "xros_app_extension")
              .Case("ShaderModel", "shadermodel")
              .Default(Platform);
-} }];
+}
+static llvm::StringRef getPrettyEnviromentName(llvm::StringRef Environment) {
+    return llvm::StringSwitch<llvm::StringRef>(Environment)
+             .Case("pixel", "pixel shader")
+             .Case("vertex", "vertex shader")
+             .Case("geometry", "geometry shader")
+             .Case("hull", "hull shader")
+             .Case("domain", "domain shader")
+             .Case("compute", "compute shader")
+             .Case("mesh", "mesh shader")
+             .Case("amplification", "amplification shader")
+             .Case("library", "shader library")
+             .Default(Environment);
+}
+static llvm::Triple::EnvironmentType getEnvironmentType(llvm::StringRef Environment) {
+    return llvm::StringSwitch<llvm::Triple::EnvironmentType>(Environment)
+             .Case("pixel", llvm::Triple::Pixel)
+             .Case("vertex", llvm::Triple::Vertex)
+             .Case("geometry", llvm::Triple::Geometry)
+             .Case("hull", llvm::Triple::Hull)
+             .Case("domain", llvm::Triple::Domain)
+             .Case("compute", llvm::Triple::Compute)
+             .Case("mesh", llvm::Triple::Mesh)
+             .Case("amplification", llvm::Triple::Amplification)
+             .Case("library", llvm::Triple::Library)
+             .Default(llvm::Triple::UnknownEnvironment);
+}
+}];
   let HasCustomParsing = 1;
   let InheritEvenIfAlreadyPresent = 1;
   let Subjects = SubjectList<[Named]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index b48aaf65558acc..54197d588eb450 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -1593,6 +1593,11 @@ replacement=\ *string-literal*
   a warning about use of a deprecated declaration. The Fix-It will replace
   the deprecated declaration with the new declaration specified.
 
+environment=\ *identifier*
+  Target environment in which this declaration is available. If present,
+  the availability attribute applies only to targets with the same platform
+  and environment. The parameter is currently supported only in HLSL.
+
 Multiple availability attributes can be placed on a declaration, which may
 correspond to different platforms. For most platforms, the availability
 attribute with the platform corresponding to the target platform will be used;
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 8316845844cb2e..46656fc66044d2 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1112,6 +1112,8 @@ def err_zero_version : Error<
   "version number must have non-zero major, minor, or sub-minor version">;
 def err_availability_expected_platform : Error<
   "expected a platform name, e.g., 'macos'">;
+def err_availability_expected_environment : Error<
+  "expected an environment name, e.g., 'compute'">;
 
 // objc_bridge_related attribute
 def err_objcbridge_related_expected_related_class : Error<
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 09b1874f9fddd7..e3b4186f1b06fa 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3837,6 +3837,9 @@ def note_cannot_use_trivial_abi_reason : Note<
 // Availability attribute
 def warn_availability_unknown_platform : Warning<
   "unknown platform %0 in availability macro">, InGroup<Availability>;
+def warn_availability_unknown_environment : Warning<
+  "unknown environment %0 in availability macro">, InGroup<Availability>;
+
 def warn_availability_version_ordering : Warning<
   "feature cannot be %select{introduced|deprecated|obsoleted}0 in %1 version "
   "%2 before it was %select{introduced|deprecated|obsoleted}3 in version %4; "
@@ -3867,13 +3870,21 @@ def note_protocol_method : Note<
 def warn_availability_fuchsia_unavailable_minor : Warning<
   "Fuchsia API Level prohibits specifying a minor or sub-minor version">,
   InGroup<Availability>;
+def err_availability_unexpected_parameter: Error<
+  "unexpected parameter '%0' in availability attribute, not permitted in %select{HLSL|C/C++}1">;
 
 def warn_unguarded_availability :
-  Warning<"%0 is only available on %1 %2 or newer">,
+  Warning<"%0 is only available %select{|in %4 environment }3on %1 %2 or newer">,
+  InGroup<UnguardedAvailability>, DefaultIgnore;
+def warn_unguarded_availability_unavailable :
+  Warning<"%0 is unavailable">,
   InGroup<UnguardedAvailability>, DefaultIgnore;
 def warn_unguarded_availability_new :
   Warning<warn_unguarded_availability.Summary>,
   InGroup<UnguardedAvailabilityNew>;
+def warn_unguarded_availability_unavailable_new :
+  Warning<warn_unguarded_availability_unavailable.Summary>,
+  InGroup<UnguardedAvailabilityNew>;
 def note_decl_unguarded_availability_silence : Note<
   "annotate %select{%1|anonymous %1}0 with an availability attribute to silence this warning">;
 def note_unguarded_available_silence : Note<
@@ -5870,8 +5881,8 @@ def note_availability_specified_here : Note<
   "%0 has been explicitly marked "
   "%select{unavailable|deleted|deprecated}1 here">;
 def note_partial_availability_specified_here : Note<
-  "%0 has been marked as being introduced in %1 %2 here, "
-  "but the deployment target is %1 %3">;
+  "%0 has been marked as being introduced in %1 %2 %select{|in %5 environment }4here, "
+  "but the deployment target is %1 %3%select{| %6 environment }4">;
 def note_implicitly_deleted : Note<
   "explicitly defaulted function was implicitly deleted here">;
 def warn_not_enough_argument : Warning<
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 1e796e828b10a7..5f04664141d29f 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -153,6 +153,9 @@ class Parser : public CodeCompletionHandler {
   /// Identifier for "replacement".
   IdentifierInfo *Ident_replacement;
 
+  /// Identifier for "environment".
+  IdentifierInfo *Ident_environment;
+
   /// Identifiers used by the 'external_source_symbol' attribute.
   IdentifierInfo *Ident_language, *Ident_defined_in,
       *Ident_generated_declaration, *Ident_USR;
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 8368d9ce61466d..22cbd0d90ee432 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -40,6 +40,7 @@ class LangOptions;
 class Sema;
 class Stmt;
 class TargetInfo;
+struct IdentifierLoc;
 
 /// Represents information about a change in availability for
 /// an entity, which is part of the encoding of the 'availability'
@@ -68,12 +69,14 @@ struct AvailabilityData {
   AvailabilityChange Changes[NumAvailabilitySlots];
   SourceLocation StrictLoc;
   const Expr *Replacement;
+  const IdentifierLoc *EnvironmentLoc;
 
   AvailabilityData(const AvailabilityChange &Introduced,
                    const AvailabilityChange &Deprecated,
-                   const AvailabilityChange &Obsoleted,
-                   SourceLocation Strict, const Expr *ReplaceExpr)
-    : StrictLoc(Strict), Replacement(ReplaceExpr) {
+                   const AvailabilityChange &Obsoleted, SourceLocation Strict,
+                   const Expr *ReplaceExpr, const IdentifierLoc *EnvironmentLoc)
+      : StrictLoc(Strict), Replacement(ReplaceExpr),
+        EnvironmentLoc(EnvironmentLoc) {
     Changes[IntroducedSlot] = Introduced;
     Changes[DeprecatedSlot] = Deprecated;
     Changes[ObsoletedSlot] = Obsoleted;
@@ -234,7 +237,7 @@ class ParsedAttr final
              const AvailabilityChange &deprecated,
              const AvailabilityChange &obsoleted, SourceLocation unavailable,
              const Expr *messageExpr, Form formUsed, SourceLocation strict,
-             const Expr *replacementExpr)
+             const Expr *replacementExpr, const IdentifierLoc *environmentLoc)
       : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
         NumArgs(1), Invalid(false), UsedAsTypeAttr(false), IsAvailability(true),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
@@ -243,8 +246,9 @@ class ParsedAttr final
         Info(ParsedAttrInfo::get(*this)) {
     ArgsUnion PVal(Parm);
     memcpy(getArgsBuffer(), &PVal, sizeof(ArgsUnion));
-    new (getAvailabilityData()) detail::AvailabilityData(
-        introduced, deprecated, obsoleted, strict, replacementExpr);
+    new (getAvailabilityData())
+        detail::AvailabilityData(introduced, deprecated, obsoleted, strict,
+                                 replacementExpr, environmentLoc);
   }
 
   /// Constructor for objc_bridge_related attributes.
@@ -445,6 +449,12 @@ class ParsedAttr final
     return getAvailabilityData()->Replacement;
   }
 
+  const IdentifierLoc *getEnvironment() const {
+    assert(getParsedKind() == AT_Availability &&
+           "Not an availability attribute");
+    return getAvailabilityData()->EnvironmentLoc;
+  }
+
   const ParsedType &getMatchingCType() const {
     assert(getParsedKind() == AT_TypeTagForDatatype &&
            "Not a type_tag_for_datatype attribute");
@@ -759,11 +769,13 @@ class AttributePool {
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
                      ParsedAttr::Form form, SourceLocation strict,
-                     const Expr *ReplacementExpr) {
+                     const Expr *ReplacementExpr,
+                     IdentifierLoc *EnvironmentLoc) {
     void *memory = allocate(AttributeFactory::AvailabilityAllocSize);
-    return add(new (memory) ParsedAttr(
-        attrName, attrRange, scopeName, scopeLoc, Param, introduced, deprecated,
-        obsoleted, unavailable, MessageExpr, form, strict, ReplacementExpr));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
+                                       Param, introduced, deprecated, obsoleted,
+                                       unavailable, MessageExpr, form, strict,
+                                       ReplacementExpr, EnvironmentLoc));
   }
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
@@ -994,10 +1006,12 @@ class ParsedAttributes : public ParsedAttributesView {
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
                      ParsedAttr::Form form, SourceLocation strict,
-                     const Expr *ReplacementExpr) {
-    ParsedAttr *attr = pool.create(
-        attrName, attrRange, scopeName, scopeLoc, Param, introduced, deprecated,
-        obsoleted, unavailable, MessageExpr, form, strict, ReplacementExpr);
+                     const Expr *ReplacementExpr,
+                     IdentifierLoc *EnvironmentLoc) {
+    ParsedAttr *attr =
+        pool.create(attrName, attrRange, scopeName, scopeLoc, Param, introduced,
+                    deprecated, obsoleted, unavailable, MessageExpr, form,
+                    strict, ReplacementExpr, EnvironmentLoc);
     addAtEnd(attr);
     return attr;
   }
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b16a304960d3fc..6c89d275215de3 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -39,6 +39,7 @@
 #include "clang/Basic/Cuda.h"
 #include "clang/Basic/DarwinSDKInfo.h"
 #include "clang/Basic/ExpressionTraits.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/Module.h"
 #include "clang/Basic/OpenCLOptions.h"
 #include "clang/Basic/PragmaKinds.h"
@@ -3580,13 +3581,13 @@ class Sema final : public SemaBase {
   bool CheckAttrTarget(const ParsedAttr &CurrAttr);
   bool CheckAttrNoArgs(const ParsedAttr &CurrAttr);
 
-  AvailabilityAttr *
-  mergeAvailabilityAttr(NamedDecl *D, const AttributeCommonInfo &CI,
-                        IdentifierInfo *Platform, bool Implicit,
-                        VersionTuple Introduced, VersionTuple Deprecated,
-                        VersionTuple Obsoleted, bool IsUnavailable,
-                        StringRef Message, bool IsStrict, StringRef Replacement,
-                        AvailabilityMergeKind AMK, int Priority);
+  AvailabilityAttr *mergeAvailabilityAttr(
+      NamedDecl *D, const AttributeCommonInfo &CI, IdentifierInfo *Platform,
+      bool Implicit, VersionTuple Introduced, VersionTuple Deprecated,
+      VersionTuple Obsoleted, bool IsUnavailable, StringRef Message,
+      bool IsStrict, StringRef Replacement, AvailabilityMergeKind AMK,
+      int Priority, IdentifierInfo *IIEnvironment);
+
   TypeVisibilityAttr *
   mergeTypeVisibilityAttr(Decl *D, const AttributeCommonInfo &CI,
                           TypeVisibilityAttr::VisibilityType Vis);
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 03e1055251c24f..65d5eeb6354eba 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -666,12 +666,28 @@ static AvailabilityResult CheckAvailability(ASTContext &Context,
   // Make sure that this declaration has already been introduced.
   if (!A->getIntroduced().empty() &&
       EnclosingVersion < A->getIntroduced()) {
-    if (Message) {
-      Message->clear();
-      llvm::raw_string_ostream Out(*Message);
-      VersionTuple VTI(A->getIntroduced());
-      Out << "introduced in " << PrettyPlatformName << ' '
-          << VTI << HintMessage;
+    IdentifierInfo *IIEnv = A->getEnvironment();
+    StringRef TargetEnv =
+        Context.getTargetInfo().getTriple().getEnvironmentName();
+    StringRef EnvName = AvailabilityAttr::getPrettyEnviromentName(TargetEnv);
+    // Matching environment or no environment on attribute
+    if (!IIEnv || (!TargetEnv.empty() && IIEnv->getName() == TargetEnv)) {
+      if (Message) {
+        Message->clear();
+        llvm::raw_string_ostream Out(*Message);
+        VersionTuple VTI(A->getIntroduced());
+        Out << "introduced in " << PrettyPlatformName << " " << VTI << " "
+            << EnvName << HintMessage;
+      }
+    }
+    // Non-matching environment or no environment on target
+    else {
+      if (Message) {
+        Message->clear();
+        llvm::raw_string_ostream Out(*Message);
+        Out << "not available on " << PrettyPlatformName << " " << EnvName
+            << HintMessage;
+      }
     }
 
     return A->getStrict() ? AR_Unavailable : AR_NotYetIntroduced;
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 3390f0962f67d0..bc72e8a00e0d56 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -18,14 +18,21 @@ namespace hlsl {
 
 #define _HLSL_BUILTIN_ALIAS(builtin)                                           \
   __attribute__((clang_builtin_alias(builtin)))
-#define _HLSL_AVAILABILITY(environment, version)                               \
-  __attribute__((availability(environment, introduced = version)))
+#define _HLSL_AVAILABILITY(platform, version)                                  \
+  __attribute__((availability(platform, introduced = version)))
+#define _HLSL_AVAILABILITY_STAGE(platform, version, stage)                     \
+  __attribute__((                                                              \
+      availability(platform, introduced = version, environment = stage)))
 
 #ifdef __HLSL_ENABLE_16_BIT
-#define _HLSL_16BIT_AVAILABILITY(environment, version)                         \
-  __attribute__((availability(environment, introduced = version)))
+#define _HLSL_16BIT_AVAILABILITY(platform, version)                            \
+  __attribute__((availability(platform, introduced = version)))
+#define _HLSL_16BIT_AVAILABILITY_STAGE(platform, version, stage)               \
+  __attribute__((                                                              \
+      availability(platform, introduced = version, environment = stage)))
 #else
 #define _HLSL_16BIT_AVAILABILITY(environment, version)
+#define _HLSL_16BIT_AVAILABILITY_STAGE(environment, version, stage)
 #endif
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Index/CommentToXML.cpp b/clang/lib/Index/CommentToXML.cpp
index 295f3f228ff79b..3372fbba438317 100644
--- a/clang/lib/Index/CommentToXML.cpp
+++ b/clang/lib/Index/CommentToXML.cpp
@@ -12,6 +12,7 @@
 #include "clang/AST/Comment.h"
 #include "clang/AST/CommentVisitor.h"
 #include "clang/Basic/FileManager.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
 #include "clang/Index/USRGeneration.h"
@@ -1052,6 +1053,11 @@ void CommentASTToXMLConverter::visitFullComment(const FullComment *C) {
       }
       if (AA->getUnavailable())
         Result << "<Unavailable/>";
+
+      IdentifierInfo *Environment = AA->getEnvironment();
+      if (Environment) {
+        Result << "<Environment>" << Environment->getName() << "</Environment>";
+      }
       Result << "</Availability>";
     }
   }
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 2ce8fa98089f6d..445d3fd66e3879 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -1256,6 +1256,7 @@ void Parser::ParseAvailabilityAttribute(
   enum { Introduced, Deprecated, Obsoleted, Unknown };
   AvailabilityChange Changes[Unknown];
   ExprResult MessageExpr, ReplacementExpr;
+  IdentifierLoc *EnvironmentLoc = nullptr;
 
   // Opening '('.
   BalancedDelimiterTracker T(*this, tok::l_paren);
@@ -1303,6 +1304,7 @@ void Parser::ParseAvailabilityAttribute(
     Ident_message = PP.getIdentifierInfo("message");
     Ident_strict = PP.getIdentifierInfo("strict");
     Ident_replacement = PP.getIdentifierInfo("replacement");
+    Ident_environment = PP.getIdentifierInfo("environment");
   }
 
   // Parse the optional "strict", the optional "replacement" and the set of
@@ -1350,6 +1352,13 @@ void Parser::ParseAvailabilityAttribute(
       continue;
     }
 
+    if (Keyword == Ident_environment) {
+      if (EnvironmentLoc != nullptr) {
+        Diag(KeywordLoc, diag::err_availability_redundant)
+            << Keyword << SourceRange(EnvironmentLoc->Loc);
+      }
+    }
+
     if (Tok.isNot(tok::equal)) {
       Diag(Tok, diag::err_expected_after) << Keyword << tok::equal;
       SkipUntil(tok::r_paren, StopAtSemi);
@@ -1371,6 +1380,15 @@ void Parser::ParseAvailabilityAttribute(
         continue;
       }
     }
+    if (Keyword == Ident_environment) {
+      if (Tok.isNot(tok::identifier)) {
+        Diag(Tok, diag::err_availability_expected_environment);
+        SkipUntil(tok::r_paren, StopAtSemi);
+        return;
+      }
+      EnvironmentLoc = ParseIdentifierLoc();
+      continue;
+    }
 
     // Special handling of 'NA' only when applied to introduced or
     // deprecated.
@@ -1452,7 +1470,7 @@ void Parser::ParseAvailabilityAttribute(
                SourceRange(AvailabilityLoc, T.getCloseLocation()), ScopeName,
                ScopeLoc, Platform, Changes[Introduced], Changes[Deprecated],
                Changes[Obsoleted], UnavailableLoc, MessageExpr.get(), Form,
-               StrictLoc, ReplacementExpr.get());
+               StrictLoc, ReplacementExpr.get(), EnvironmentLoc);
 }
 
 /// Parse the contents of the "external_source_symbol" attribute.
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 443bf162044ff7..c80b08e361cfa6 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -269,7 +269,8 @@ static void ProcessAPINotes(Sema &S, Decl *D,
               ASTAllocateString(S.Context, Info.UnavailableMsg),
               /*Strict=*/false,
               /*Replacement=*/StringRef(),
-              /*Priority=*/Sema::AP_Explicit);
+              /*Priority=*/Sema::AP_Explicit,
+              /*Environment=*/nullptr);
         },
         [](const Decl *D) {
           return llvm::find_if(D->attrs(), [](const Attr *next) -> bool {
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index 5ebc25317bf371..663b6f35b869d5 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -14,20 +14,37 @@
 #include "clang/AST/Decl.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/DelayedDiagnostic.h"
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaObjC.h"
+#include "llvm/ADT/StringRef.h"
 #include <optional>
 
 using namespace clang;
 using namespace sema;
 
+static bool hasMatchingEnvironmentOrNone(const ASTContext &Context,
+                                         const AvailabilityAttr *AA) {
+  IdentifierInfo *IIEnvironment = AA->getEnvironment();
+  auto Environment = Context.getTargetInfo().getTriple().getEnvironment();
+  if (!IIEnvironment || Environment == llvm::Triple::UnknownEnvironment)
+    return true;
+
+  llvm::Triple::EnvironmentType ET =
+      AvailabilityAttr::getEnvironmentType(IIEnvironment->getName());
+  return Environment == ET;
+}
+
 static const AvailabilityAttr *getAttrForPlatform(ASTContext &Context,
                                                   const Decl *D) {
+  AvailabilityAttr const *PartialMatch = nullptr;
   // Check each AvailabilityAttr to find the one for this platform.
+  // For multiple attributes with the same platform try to find one for this
+  // environment.
   for (const auto *A : D->attrs()) {
     if (const auto *Avail = dyn_cast<AvailabilityAttr>(A)) {
       // FIXME: this is copied from CheckAvailability. We should try to
@@ -46,11 +63,15 @@ static const AvailabilityAttr *getAttrForPlatform(ASTContext &Context,
       StringRef TargetPlatform = Context.getTargetInfo().getPlatformName();
 
       // Match the platform name.
-      if (RealizedPlatform == TargetPlatform)
-        return Avail;
+      if (RealizedPlatform == TargetPlatform) {
+        // Find the best matching attribute for this environment
+        if (hasMatchingEnvironmentOrNone(Context, Avail))
+          return Avail;
+        PartialMatch = Avail;
+      }
     }
   }
-  return nullptr;
+  return PartialMatch;
 }
 
 /// The diagnostic we should emit for \c D, and the declaration that
@@ -118,10 +139,9 @@ ShouldDiagnoseAvailabilityOfDecl(Sema &S, const NamedDecl *D,
 /// whether we should emit a diagnostic for \c K and \c DeclVersion in
 /// the context of \c Ctx. For example, we should emit an unavailable diagnostic
 /// in a deprecated context, but not the other way around.
-static bool
-ShouldDiagnoseAvailabilityInContext(Sema &S, AvailabilityResult K,
-                                    VersionTuple DeclVersion, Decl *Ctx,
-                                    const NamedDecl *OffendingDecl) {
+static bool ShouldDiagnoseAvailabilityInContext(
+    Sema &S, AvailabilityResult K, VersionTuple DeclVersion,
+    const IdentifierInfo *DeclEnv, Decl *Ctx, const NamedDecl *OffendingDecl) {
   assert(K != AR_Available && "Expected an unavailable declaration here!");
 
   // If this was defined using CF_OPTIONS, etc. then ignore the diagnostic.
@@ -140,7 +160,8 @@ ShouldDiagnoseAvailabilityInContext(Sema &S, AvailabilityResult K,
   auto CheckContext = [&](const Decl *C) {
     if (K == AR_NotYetIntroduced) {
       if (const AvailabilityAttr *AA = getAttrForPlatform(S.Context, C))
-        if (AA->getIntroduced() >= DeclVersion)
+        if (AA->getIntroduced() >= DeclVersion &&
+            AA->getEnvironment() == DeclEnv)
           return true;
     } else if (K == AR_Deprecated) {
       if (C->isDeprecated())
@@ -344,10 +365,14 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
   unsigned available_here_select_kind;
 
   VersionTuple DeclVersion;
-  if (const AvailabilityAttr *AA = getAttrForPlatform(S.Context, OffendingDecl))
+  const AvailabilityAttr *AA = getAttrForPlatform(S.Context, OffendingDecl);
+  const IdentifierInfo *IIEnv = nullptr;
+  if (AA) {
     DeclVersion = AA->getIntroduced();
+    IIEnv = AA->getEnvironment();
+  }
 
-  if (!ShouldDiagnoseAvailabilityInContext(S, K, DeclVersion, Ctx,
+  if (!ShouldDiagnoseAvailabilityInContext(S, K, DeclVersion, IIEnv, Ctx,
                                            OffendingDecl))
     return;
 
@@ -355,8 +380,7 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
 
   // The declaration can have multiple availability attributes, we are looking
   // at one of them.
-  const AvailabilityAttr *A = getAttrForPlatform(S.Context, OffendingDecl);
-  if (A && A->isInherited()) {
+  if (AA && AA->isInherited()) {
     for (const Decl *Redecl = OffendingDecl->getMostRecentDecl(); Redecl;
          Redecl = Redecl->getPreviousDecl()) {
       const AvailabilityAttr *AForRedecl =
@@ -376,26 +400,43 @@ static void DoEmitAvailabilityWarning(Sema &S, AvailabilityResult K,
     // not specified for deployment targets >= to iOS 11 or equivalent or
     // for declarations that were introduced in iOS 11 (macOS 10.13, ...) or
     // later.
-    const AvailabilityAttr *AA =
-        getAttrForPlatform(S.getASTContext(), OffendingDecl);
+    assert(AA != nullptr && "expecting valid availability attribute");
     VersionTuple Introduced = AA->getIntroduced();
+    bool EnvironmentMatchesOrNone =
+        hasMatchingEnvironmentOrNone(S.getASTContext(), AA);
+
+    const TargetInfo &TI = S.getASTContext().getTargetInfo();
+    std::string PlatformName(
+        AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
+    llvm::StringRef TargetEnvironment(AvailabilityAttr::getPrettyEnviromentName(
+        TI.getTriple().getEnvironmentName()));
+    llvm::StringRef AttrEnvironment =
+        AA->getEnvironment() ? AvailabilityAttr::getPrettyEnviromentName(
+                                   AA->getEnvironment()->getName())
+                             : "";
+    bool UseEnvironment =
+        (!AttrEnvironment.empty() && !TargetEnvironment.empty());
 
     bool UseNewWarning = shouldDiagnoseAvailabilityByDefault(
         S.Context, S.Context.getTargetInfo().getPlatformMinVersion(),
         Introduced);
-    unsigned Warning = UseNewWarning ? diag::warn_unguarded_availability_new
-                                     : diag::warn_unguarded_availability;
 
-    std::string PlatformName(AvailabilityAttr::getPrettyPlatformName(
-        S.getASTContext().getTargetInfo().getPlatformName()));
+    unsigned DiagKind =
+        EnvironmentMatchesOrNone
+            ? (UseNewWarning ? diag::warn_unguarded_availability_new
+                             : diag::warn_unguarded_availability)
+            : (UseNewWarning ? diag::warn_unguarded_availability_unavailable_new
+                             : diag::warn_unguarded_availability_unavailable);
 
-    S.Diag(Loc, Warning) << OffendingDecl << PlatformName
-                         << Introduced.getAsString();
+    S.Diag(Loc, DiagKind) << OffendingDecl << PlatformName
+                          << Introduced.getAsString() << UseEnvironment
+                          << TargetEnvironment;
 
     S.Diag(OffendingDecl->getLocation(),
            diag::note_partial_availability_specified_here)
         << OffendingDecl << PlatformName << Introduced.getAsString()
-        << S.Context.getTargetInfo().getPlatformMinVersion().getAsString();
+        << S.Context.getTargetInfo().getPlatformMinVersion().getAsString()
+        << UseEnvironment << AttrEnvironment << TargetEnvironment;
 
     if (const auto *Enclosing = findEnclosingDeclToAnnotate(Ctx)) {
       if (const auto *TD = dyn_cast<TagDecl>(Enclosing))
@@ -772,14 +813,17 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
 
     const AvailabilityAttr *AA =
       getAttrForPlatform(SemaRef.getASTContext(), OffendingDecl);
+    bool EnvironmentMatchesOrNone =
+        hasMatchingEnvironmentOrNone(SemaRef.getASTContext(), AA);
     VersionTuple Introduced = AA->getIntroduced();
 
-    if (AvailabilityStack.back() >= Introduced)
+    if (EnvironmentMatchesOrNone && AvailabilityStack.back() >= Introduced)
       return;
 
     // If the context of this function is less available than D, we should not
     // emit a diagnostic.
-    if (!ShouldDiagnoseAvailabilityInContext(SemaRef, Result, Introduced, Ctx,
+    if (!ShouldDiagnoseAvailabilityInContext(SemaRef, Result, Introduced,
+                                             AA->getEnvironment(), Ctx,
                                              OffendingDecl))
       return;
 
@@ -787,25 +831,39 @@ void DiagnoseUnguardedAvailability::DiagnoseDeclAvailability(
     // not specified for deployment targets >= to iOS 11 or equivalent or
     // for declarations that were introduced in iOS 11 (macOS 10.13, ...) or
     // later.
-    unsigned DiagKind =
-        shouldDiagnoseAvailabilityByDefault(
-            SemaRef.Context,
-            SemaRef.Context.getTargetInfo().getPlatformMinVersion(), Introduced)
-            ? diag::warn_unguarded_availability_new
-            : diag::warn_unguarded_availability;
+    bool UseNewDiagKind = shouldDiagnoseAvailabilityByDefault(
+        SemaRef.Context,
+        SemaRef.Context.getTargetInfo().getPlatformMinVersion(), Introduced);
+
+    const TargetInfo &TI = SemaRef.getASTContext().getTargetInfo();
+    std::string PlatformName(
+        AvailabilityAttr::getPrettyPlatformName(TI.getPlatformName()));
+    llvm::StringRef TargetEnvironment(AvailabilityAttr::getPrettyEnviromentName(
+        TI.getTriple().getEnvironmentName()));
+    llvm::StringRef AttrEnvironment =
+        AA->getEnvironment() ? AvailabilityAttr::getPrettyEnviromentName(
+                                   AA->getEnvironment()->getName())
+                             : "";
+    bool UseEnvironment =
+        (!AttrEnvironment.empty() && !TargetEnvironment.empty());
 
-    std::string PlatformName(AvailabilityAttr::getPrettyPlatformName(
-        SemaRef.getASTContext().getTargetInfo().getPlatformName()));
+    unsigned DiagKind =
+        EnvironmentMatchesOrNone
+            ? (UseNewDiagKind ? diag::warn_unguarded_availability_new
+                              : diag::warn_unguarded_availability)
+            : (UseNewDiagKind
+                   ? diag::warn_unguarded_availability_unavailable_new
+                   : diag::warn_unguarded_availability_unavailable);
 
     SemaRef.Diag(Range.getBegin(), DiagKind)
-        << Range << D << PlatformName << Introduced.getAsString();
+        << Range << D << PlatformName << Introduced.getAsString()
+        << UseEnvironment << TargetEnvironment;
 
     SemaRef.Diag(OffendingDecl->getLocation(),
                  diag::note_partial_availability_specified_here)
         << OffendingDecl << PlatformName << Introduced.getAsString()
-        << SemaRef.Context.getTargetInfo()
-               .getPlatformMinVersion()
-               .getAsString();
+        << SemaRef.Context.getTargetInfo().getPlatformMinVersion().getAsString()
+        << UseEnvironment << AttrEnvironment << TargetEnvironment;
 
     auto FixitDiag =
         SemaRef.Diag(Range.getBegin(), diag::note_unguarded_available_silence)
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index f2b9202255cd44..557fe10619c354 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2879,7 +2879,7 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
         D, *AA, AA->getPlatform(), AA->isImplicit(), AA->getIntroduced(),
         AA->getDeprecated(), AA->getObsoleted(), AA->getUnavailable(),
         AA->getMessage(), AA->getStrict(), AA->getReplacement(), AMK,
-        AA->getPriority());
+        AA->getPriority(), AA->getEnvironment());
   else if (const auto *VA = dyn_cast<VisibilityAttr>(Attr))
     NewAttr = S.mergeVisibilityAttr(D, *VA, VA->getVisibility());
   else if (const auto *VA = dyn_cast<TypeVisibilityAttr>(Attr))
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 30776ff537fb50..ca5938083917f7 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -26,6 +26,7 @@
 #include "clang/Basic/Cuda.h"
 #include "clang/Basic/DarwinSDKInfo.h"
 #include "clang/Basic/HLSLRuntime.h"
+#include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
@@ -52,6 +53,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
 #include <optional>
 
 using namespace clang;
@@ -2495,7 +2497,7 @@ AvailabilityAttr *Sema::mergeAvailabilityAttr(
     bool Implicit, VersionTuple Introduced, VersionTuple Deprecated,
     VersionTuple Obsoleted, bool IsUnavailable, StringRef Message,
     bool IsStrict, StringRef Replacement, AvailabilityMergeKind AMK,
-    int Priority) {
+    int Priority, IdentifierInfo *Environment) {
   VersionTuple MergedIntroduced = Introduced;
   VersionTuple MergedDeprecated = Deprecated;
   VersionTuple MergedObsoleted = Obsoleted;
@@ -2529,6 +2531,12 @@ AvailabilityAttr *Sema::mergeAvailabilityAttr(
         continue;
       }
 
+      IdentifierInfo *OldEnvironment = OldAA->getEnvironment();
+      if (OldEnvironment != Environment) {
+        ++i;
+        continue;
+      }
+
       // If there is an existing availability attribute for this platform that
       // has a lower priority use the existing one and discard the new
       // attribute.
@@ -2647,7 +2655,7 @@ AvailabilityAttr *Sema::mergeAvailabilityAttr(
       !OverrideOrImpl) {
     auto *Avail = ::new (Context) AvailabilityAttr(
         Context, CI, Platform, Introduced, Deprecated, Obsoleted, IsUnavailable,
-        Message, IsStrict, Replacement, Priority);
+        Message, IsStrict, Replacement, Priority, Environment);
     Avail->setImplicit(Implicit);
     return Avail;
   }
@@ -2706,13 +2714,34 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     }
   }
 
+  if (S.getLangOpts().HLSL && IsStrict)
+    S.Diag(AL.getStrictLoc(), diag::err_availability_unexpected_parameter)
+        << "strict" << /* HLSL */ 0;
+
   int PriorityModifier = AL.isPragmaClangAttribute()
                              ? Sema::AP_PragmaClangAttribute
                              : Sema::AP_Explicit;
+
+  const IdentifierLoc *EnvironmentLoc = AL.getEnvironment();
+  IdentifierInfo *IIEnvironment = nullptr;
+  if (EnvironmentLoc) {
+    if (S.getLangOpts().HLSL) {
+      IIEnvironment = EnvironmentLoc->Ident;
+      if (AvailabilityAttr::getEnvironmentType(
+              EnvironmentLoc->Ident->getName()) ==
+          llvm::Triple::EnvironmentType::UnknownEnvironment)
+        S.Diag(EnvironmentLoc->Loc, diag::warn_availability_unknown_environment)
+            << EnvironmentLoc->Ident;
+    } else {
+      S.Diag(EnvironmentLoc->Loc, diag::err_availability_unexpected_parameter)
+          << "environment" << /* C/C++ */ 1;
+    }
+  }
+
   AvailabilityAttr *NewAttr = S.mergeAvailabilityAttr(
       ND, AL, II, false /*Implicit*/, Introduced.Version, Deprecated.Version,
       Obsoleted.Version, IsUnavailable, Str, IsStrict, Replacement,
-      Sema::AMK_None, PriorityModifier);
+      Sema::AMK_None, PriorityModifier, IIEnvironment);
   if (NewAttr)
     D->addAttr(NewAttr);
 
@@ -2768,8 +2797,8 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       AvailabilityAttr *NewAttr = S.mergeAvailabilityAttr(
           ND, AL, NewII, true /*Implicit*/, NewIntroduced, NewDeprecated,
           NewObsoleted, IsUnavailable, Str, IsStrict, Replacement,
-          Sema::AMK_None,
-          PriorityModifier + Sema::AP_InferredFromOtherPlatform);
+          Sema::AMK_None, PriorityModifier + Sema::AP_InferredFromOtherPlatform,
+          IIEnvironment);
       if (NewAttr)
         D->addAttr(NewAttr);
     }
@@ -2810,8 +2839,8 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       AvailabilityAttr *NewAttr = S.mergeAvailabilityAttr(
           ND, AL, NewII, true /*Implicit*/, NewIntroduced, NewDeprecated,
           NewObsoleted, IsUnavailable, Str, IsStrict, Replacement,
-          Sema::AMK_None,
-          PriorityModifier + Sema::AP_InferredFromOtherPlatform);
+          Sema::AMK_None, PriorityModifier + Sema::AP_InferredFromOtherPlatform,
+          IIEnvironment);
       if (NewAttr)
         D->addAttr(NewAttr);
     }
@@ -2844,7 +2873,7 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
           MinMacCatalystVersion(Deprecated.Version),
           MinMacCatalystVersion(Obsoleted.Version), IsUnavailable, Str,
           IsStrict, Replacement, Sema::AMK_None,
-          PriorityModifier + Sema::AP_InferredFromOtherPlatform);
+          PriorityModifier + Sema::AP_InferredFromOtherPlatform, IIEnvironment);
       if (NewAttr)
         D->addAttr(NewAttr);
     } else if (II->getName() == "macos" && GetSDKInfo() &&
@@ -2887,7 +2916,8 @@ static void handleAvailabilityAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
               VersionOrEmptyVersion(NewObsoleted), /*IsUnavailable=*/false, Str,
               IsStrict, Replacement, Sema::AMK_None,
               PriorityModifier + Sema::AP_InferredFromOtherPlatform +
-                  Sema::AP_InferredFromOtherPlatform);
+                  Sema::AP_InferredFromOtherPlatform,
+              IIEnvironment);
           if (NewAttr)
             D->addAttr(NewAttr);
         }
diff --git a/clang/test/Parser/attr-availability.c b/clang/test/Parser/attr-availability.c
index aab0f2f3a852a9..9d84d9c1df363f 100644
--- a/clang/test/Parser/attr-availability.c
+++ b/clang/test/Parser/attr-availability.c
@@ -30,6 +30,8 @@ void f11(void) __attribute__((availability(macosx,message=u"b"))); // expected-w
 
 void f12(void) __attribute__((availability(macosx,message="a" u"b"))); // expected-warning {{encoding prefix 'u' on an unevaluated string literal has no effect}}
 
+void f13(void) __attribute__((availability(shadermodel, introduced = 6.0, environment=pixel))); // expected-error {{unexpected parameter 'environment' in availability attribute, not permitted in C/C++}}
+
 enum E{
     gorf __attribute__((availability(macosx,introduced=8.5, message = 10.0))), // expected-error {{expected string literal for optional message in 'availability' attribute}}
     garf __attribute__((availability(macosx,introduced=8.5, message))), // expected-error {{expected '=' after 'message'}}
diff --git a/clang/test/Sema/attr-availability-ios.c b/clang/test/Sema/attr-availability-ios.c
index b97b7e688cc615..b001e70b5ff5c3 100644
--- a/clang/test/Sema/attr-availability-ios.c
+++ b/clang/test/Sema/attr-availability-ios.c
@@ -9,6 +9,7 @@ void f4(int) __attribute__((availability(macosx,introduced=10.1,deprecated=10.3,
 void f5(int) __attribute__((availability(ios,introduced=2.0))) __attribute__((availability(ios,deprecated=3.0))); // expected-note {{'f5' has been explicitly marked deprecated here}}
 void f6(int) __attribute__((availability(ios,deprecated=3.0))); // expected-note {{'f6' has been explicitly marked deprecated here}}
 void f6(int) __attribute__((availability(iOS,introduced=2.0)));
+void f7(int) __attribute__((availability(ios,introduced=2.0, environment=e))); // expected-error {{unexpected parameter 'environment' in availability attribute, not permitted in C/C++}}
 
 void test(void) {
   f0(0); // expected-warning{{'f0' is deprecated: first deprecated in iOS 2.1}}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
new file mode 100644
index 00000000000000..8fa696ea116498
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-compute.hlsl
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-compute -fsyntax-only -verify %s
+
+// Platform shader model, no environment parameter
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f1(); // #f1
+
+__attribute__((availability(shadermodel, introduced = 5.1)))
+unsigned f2(); // #f2
+
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f3();
+
+// Platform shader model, environment parameter restricting earlier version,
+// available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f4(); // #f4
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f5();
+
+// Platform shader model, environment parameter restricting earlier version,
+// never available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = mesh)))
+unsigned f6();  // #f6
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f7(); // #f7
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f8();
+
+[numthreads(4,1,1)]
+int main() {
+    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
+    unsigned A = f1(); // #f1_call
+
+    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
+    unsigned B = f2(); // #f2_call
+
+    unsigned C = f3();
+
+    // expected-warning@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f4 {{'f4' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f4_call {{enclose 'f4' in a __builtin_available check to silence this warning}}
+    unsigned D = f4(); // #f4_call
+
+    unsigned E = f5();
+
+    // expected-warning@#f6_call {{'f6' is only available in compute shader environment on Shader Model 6.0 or newer}}
+    // expected-note@#f6 {{'f6' has been marked as being introduced in Shader Model 6.0 in compute shader environment here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f6_call {{enclose 'f6' in a __builtin_available check to silence this warning}}
+    unsigned F = f6(); // #f6_call
+
+    // expected-warning@#f7_call {{'f7' is unavailable}}
+    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 compute shader environment}}
+    // expected-note@#f7_call {{enclose 'f7' in a __builtin_available check to silence this warning}}
+    unsigned G = f7(); // #f7_call
+
+    unsigned H = f8();
+
+    return 0;
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl
new file mode 100644
index 00000000000000..2682eb5fbb5c2f
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-errors.hlsl
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.5-library -fsyntax-only -verify %s
+
+
+void f1(void) __attribute__((availability(shadermodel, introduced = 6.0, environment="pixel"))); // expected-error {{expected an environment name, e.g., 'compute'}}
+
+void f2(void) __attribute__((availability(shadermodel, introduced = 6.0, environment=pixel, environment=compute))); // expected-error {{redundant 'environment' availability change; only the last specified change will be used}}
+
+void f3(void) __attribute__((availability(shadermodel, strict, introduced = 6.0, environment = mesh))); // expected-error {{unexpected parameter 'strict' in availability attribute, not permitted in HLSL}}
+
+int main() {
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
new file mode 100644
index 00000000000000..40a7ddbb1de988
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-mesh.hlsl
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-mesh -fsyntax-only -verify %s
+
+// Platform shader model, no environment parameter
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f1(); // #f1
+
+__attribute__((availability(shadermodel, introduced = 5.1)))
+unsigned f2(); // #f2
+
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f3();
+
+// Platform shader model, environment parameter restricting earlier version,
+// available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f4(); // #f4
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f5(); // #f5
+
+// Platform shader model, environment parameter restricting earlier version,
+// never available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = mesh)))
+unsigned f6();  // #f6
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f7(); // #f7
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f8(); // #f8
+
+[numthreads(4,1,1)]
+int main() {
+    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
+    unsigned A = f1(); // #f1_call
+
+    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
+    unsigned B = f2(); // #f2_call
+
+    unsigned C = f3();
+
+    // expected-warning@#f4_call {{'f4' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f4 {{'f4' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f4_call {{enclose 'f4' in a __builtin_available check to silence this warning}}
+    unsigned D = f4(); // #f4_call
+
+    unsigned E = f5(); // #f5_call
+
+    unsigned F = f6(); // #f6_call
+
+    // expected-warning@#f7_call {{'f7' is only available in mesh shader environment on Shader Model 6.0 or newer}}
+    // expected-note@#f7 {{'f7' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 mesh shader environment}}
+    // expected-note@#f7_call {{enclose 'f7' in a __builtin_available check to silence this warning}}
+    unsigned G = f7(); // #f7_call
+
+    // expected-warning@#f8_call {{'f8' is only available in mesh shader environment on Shader Model 6.0 or newer}}
+    // expected-note@#f8 {{'f8' has been marked as being introduced in Shader Model 6.0 in mesh shader environment here, but the deployment target is Shader Model 5.0 mesh shader environment}}
+    // expected-note@#f8_call {{enclose 'f8' in a __builtin_available check to silence this warning}}
+    unsigned H = f8(); // #f8_call
+
+    return 0;
+}
diff --git a/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl b/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
new file mode 100644
index 00000000000000..59d09a9cd276f9
--- /dev/null
+++ b/clang/test/SemaHLSL/Availability/attr-availability-pixel.hlsl
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-pixel -fsyntax-only -verify %s
+
+// Platform shader model, no environment parameter
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f1(); // #f1
+
+__attribute__((availability(shadermodel, introduced = 5.1)))
+unsigned f2(); // #f2
+
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f3();
+
+// Platform shader model, environment parameter restricting earlier version,
+// available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0)))
+unsigned f4(); // #f4
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0)))
+unsigned f5();
+
+// Platform shader model, environment parameter restricting earlier version,
+// never available in all environments in higher versions
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = mesh)))
+unsigned f6();  // #f6
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f7(); // #f7
+
+__attribute__((availability(shadermodel, introduced = 2.0, environment = pixel)))
+__attribute__((availability(shadermodel, introduced = 5.0, environment = compute)))
+__attribute__((availability(shadermodel, introduced = 6.0, environment = mesh)))
+unsigned f8();
+
+int main() {
+    // expected-warning@#f1_call {{'f1' is only available on Shader Model 6.0 or newer}}
+    // expected-note@#f1 {{'f1' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f1_call {{enclose 'f1' in a __builtin_available check to silence this warning}}
+    unsigned A = f1(); // #f1_call
+
+    // expected-warning@#f2_call {{'f2' is only available on Shader Model 5.1 or newer}}
+    // expected-note@#f2 {{'f2' has been marked as being introduced in Shader Model 5.1 here, but the deployment target is Shader Model 5.0}}
+    // expected-note@#f2_call {{enclose 'f2' in a __builtin_available check to silence this warning}}
+    unsigned B = f2(); // #f2_call
+
+    unsigned C = f3();
+
+    unsigned D = f4(); // #f4_call
+
+    unsigned E = f5();
+
+    unsigned F = f6(); // #f6_call
+
+    unsigned G = f7(); // #f7_call
+
+    unsigned H = f8();
+
+    return 0;
+}
diff --git a/clang/test/SemaHLSL/AvailabilityMarkup.hlsl b/clang/test/SemaHLSL/AvailabilityMarkup.hlsl
deleted file mode 100644
index b883957af08712..00000000000000
--- a/clang/test/SemaHLSL/AvailabilityMarkup.hlsl
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel5.0-library -verify %s
-
-__attribute__((availability(shadermodel, introduced = 6.0)))
-unsigned fn6_0(); // #fn6_0
-
-__attribute__((availability(shadermodel, introduced = 5.1)))
-unsigned fn5_1(); // #fn5_1
-
-__attribute__((availability(shadermodel, introduced = 5.0)))
-unsigned fn5_0();
-
-void fn() {
-    // expected-warning@#fn6_0_site {{'fn6_0' is only available on HLSL ShaderModel 6.0 or newer}}
-    // expected-note@#fn6_0 {{'fn6_0' has been marked as being introduced in HLSL ShaderModel 6.0 here, but the deployment target is HLSL ShaderModel 5.0}}
-    // expected-note@#fn6_0_site {{enclose 'fn6_0' in a __builtin_available check to silence this warning}}
-    unsigned A = fn6_0(); // #fn6_0_site
-
-    // expected-warning@#fn5_1_site {{'fn5_1' is only available on HLSL ShaderModel 5.1 or newer}}
-    // expected-note@#fn5_1 {{'fn5_1' has been marked as being introduced in HLSL ShaderModel 5.1 here, but the deployment target is HLSL ShaderModel 5.0}}
-    // expected-note@#fn5_1_site {{enclose 'fn5_1' in a __builtin_available check to silence this warning}}
-    unsigned B = fn5_1(); // #fn5_1_site
-
-    unsigned C = fn5_0();
-}
-
diff --git a/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl b/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
index 0e45edc6a4c860..185b79be37be5b 100644
--- a/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
+++ b/clang/test/SemaHLSL/WaveBuiltinAvailability.hlsl
@@ -2,8 +2,8 @@
 // WaveActiveCountBits is unavailable before ShaderModel 6.0.
 
 unsigned foo(bool b) {
-    // expected-warning@#site {{'WaveActiveCountBits' is only available on HLSL ShaderModel 6.0 or newer}}
-    // expected-note@hlsl/hlsl_intrinsics.h:* {{'WaveActiveCountBits' has been marked as being introduced in HLSL ShaderModel 6.0 here, but the deployment target is HLSL ShaderModel 5.0}}
+    // expected-warning@#site {{'WaveActiveCountBits' is only available on Shader Model 6.0 or newer}}
+    // expected-note@hlsl/hlsl_intrinsics.h:* {{'WaveActiveCountBits' has been marked as being introduced in Shader Model 6.0 here, but the deployment target is Shader Model 5.0}}
     // expected-note@#site {{enclose 'WaveActiveCountBits' in a __builtin_available check to silence this warning}}
     return hlsl::WaveActiveCountBits(b); // #site
 }

From 0cd2bf3521a52f255c2b0d466f2f48f15d4a89a9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sun, 19 May 2024 20:56:21 +0200
Subject: [PATCH 13/60] ValueTracking: Correct undef handling for constant FP
 vectors (#92557)

Treat undef as unknown, and poison as ignorable.
---
 llvm/lib/Analysis/ValueTracking.cpp           |   2 +-
 .../AMDGPU/amdgpu-codegenprepare-fdiv.ll      | 130 +++++++++---------
 llvm/test/Transforms/Attributor/nofpclass.ll  |   2 +-
 llvm/test/Transforms/InstCombine/and-fcmp.ll  |  27 +++-
 llvm/test/Transforms/InstCombine/or-fcmp.ll   |  49 ++++++-
 5 files changed, 135 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index e8c5f9b3dc25d3..2d1486d252c3ea 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -4751,7 +4751,7 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
         Known = KnownFPClass();
         return;
       }
-      if (isa<UndefValue>(Elt))
+      if (isa<PoisonValue>(Elt))
         continue;
       auto *CElt = dyn_cast<ConstantFP>(Elt);
       if (!CElt) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 6bda962d1b9cac..b69afa3ab1f3d6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -2151,7 +2151,7 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 ; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
@@ -2222,9 +2222,9 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 ; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP19]])
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]])
@@ -2281,7 +2281,7 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 ; DAZ-NEXT:    [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0
 ; DAZ-NEXT:    [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1
 ; DAZ-NEXT:    [[TMP19:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]])
-; DAZ-NEXT:    [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0
 ; DAZ-NEXT:    [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1
 ; DAZ-NEXT:    [[TMP23:%.*]] = fmul contract float [[TMP21]], [[TMP19]]
@@ -2313,7 +2313,7 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
 
   ; Matches the rsq instruction accuracy
   %sqrt.md.1ulp.undef = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
-  %md.1ulp.undef = fdiv contract <2 x float> <float 1.0, float undef>, %sqrt.md.1ulp.undef, !fpmath !2
+  %md.1ulp.undef = fdiv contract <2 x float> <float 1.0, float poison>, %sqrt.md.1ulp.undef, !fpmath !2
   store volatile <2 x float> %md.1ulp.undef, ptr addrspace(1) %out, align 4
 
   ; Test mismatched metadata/flags between the sqrt and fdiv
@@ -3121,7 +3121,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
@@ -3170,9 +3170,9 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]])
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
 ; IEEE-BADFREXP-NEXT:    [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]]
 ; IEEE-BADFREXP-NEXT:    [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]])
@@ -3217,7 +3217,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; DAZ-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
 ; DAZ-NEXT:    [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1
 ; DAZ-NEXT:    [[TMP32:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]])
-; DAZ-NEXT:    [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP33:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP33]], 0
 ; DAZ-NEXT:    [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP33]], 1
 ; DAZ-NEXT:    [[TMP36:%.*]] = fmul contract float [[TMP34]], [[TMP32]]
@@ -3230,7 +3230,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3272,7 +3272,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
@@ -3321,9 +3321,9 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]])
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]])
-; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]]
 ; IEEE-BADFREXP-NEXT:    [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]]
 ; IEEE-BADFREXP-NEXT:    [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]])
@@ -3361,7 +3361,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; DAZ-NEXT:    [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0
 ; DAZ-NEXT:    [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1
 ; DAZ-NEXT:    [[TMP25:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP23]])
-; DAZ-NEXT:    [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
 ; DAZ-NEXT:    [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1
 ; DAZ-NEXT:    [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]]
@@ -3374,7 +3374,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float>
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg)
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3382,7 +3382,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float>
 ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(
 ; IEEE-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath [[META2:![0-9]+]]
-; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; IEEE-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
 ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(
@@ -3399,11 +3399,11 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_div(<4 x float>
 ; DAZ-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1
 ; DAZ-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2
 ; DAZ-NEXT:    [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3
-; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract afn <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract afn <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom
+  %partial.rsq = fdiv contract afn <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom
   ret <4 x float> %partial.rsq
 }
 
@@ -3411,7 +3411,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(<4 x fl
 ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(
 ; IEEE-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath [[META2]]
-; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; IEEE-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
 ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(
@@ -3428,11 +3428,11 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_fdiv(<4 x fl
 ; DAZ-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1
 ; DAZ-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2
 ; DAZ-NEXT:    [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3
-; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom
   ret <4 x float> %partial.rsq
 }
 
@@ -3471,7 +3471,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
@@ -3517,9 +3517,9 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]])
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
-; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]]
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]])
@@ -3553,7 +3553,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; DAZ-NEXT:    [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
 ; DAZ-NEXT:    [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1
 ; DAZ-NEXT:    [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]])
-; DAZ-NEXT:    [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0
 ; DAZ-NEXT:    [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1
 ; DAZ-NEXT:    [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]]
@@ -3566,7 +3566,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg)
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3607,7 +3607,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = sub i32 0, [[TMP30]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]])
-; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]]
+; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float poison, [[TMP33]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2
@@ -3650,7 +3650,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = sub i32 0, [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP29]])
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = call arcp contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]])
-; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float undef, [[TMP33]]
+; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = fmul arcp contract float poison, [[TMP33]]
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0
 ; IEEE-BADFREXP-NEXT:    [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP20]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP27]], i64 2
@@ -3681,7 +3681,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; DAZ-NEXT:    [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]])
 ; DAZ-NEXT:    [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]]
 ; DAZ-NEXT:    [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]])
-; DAZ-NEXT:    [[TMP22:%.*]] = fmul arcp contract float undef, [[TMP21]]
+; DAZ-NEXT:    [[TMP22:%.*]] = fmul arcp contract float poison, [[TMP21]]
 ; DAZ-NEXT:    [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i64 0
 ; DAZ-NEXT:    [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP18]], i64 1
 ; DAZ-NEXT:    [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP20]], i64 2
@@ -3689,7 +3689,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom, !fpmath !2
+  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
@@ -3697,7 +3697,7 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(<4 x fl
 ; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(
 ; IEEE-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
 ; IEEE-NEXT:    [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath [[META2]]
-; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; IEEE-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; IEEE-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
 ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(
@@ -3714,11 +3714,11 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp_correct(<4 x fl
 ; DAZ-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP6]], i64 1
 ; DAZ-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 2
 ; DAZ-NEXT:    [[DENOM:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 3
-; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[DENOM]]
+; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = fdiv arcp contract <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[DENOM]]
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
   %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2
-  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %denom
+  %partial.rsq = fdiv contract arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %denom
   ret <4 x float> %partial.rsq
 }
 
@@ -3755,7 +3755,7 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = sub i32 0, [[TMP27]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP26]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]])
-; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float undef, [[TMP30]]
+; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float poison, [[TMP30]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP17]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP24]], i64 2
@@ -3794,7 +3794,7 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = sub i32 0, [[TMP27]]
 ; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP26]])
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP28]])
-; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float undef, [[TMP30]]
+; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = fmul arcp float poison, [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP17]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP24]], i64 2
@@ -3813,24 +3813,24 @@ define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
 ; DAZ-NEXT:    [[TMP8:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP3]])
 ; DAZ-NEXT:    [[TMP9:%.*]] = fmul arcp float 4.000000e+00, [[TMP8]]
 ; DAZ-NEXT:    [[TMP10:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP4]])
-; DAZ-NEXT:    [[TMP11:%.*]] = fmul arcp float undef, [[TMP10]]
+; DAZ-NEXT:    [[TMP11:%.*]] = fmul arcp float poison, [[TMP10]]
 ; DAZ-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0
 ; DAZ-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP7]], i64 1
 ; DAZ-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP9]], i64 2
 ; DAZ-NEXT:    [[PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 3
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RCP]]
 ;
-  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %arg, !fpmath !2
+  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %arg, !fpmath !2
   ret <4 x float> %partial.rcp
 }
 
 define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp_correct(<4 x float> %arg) {
 ; CHECK-LABEL: define <4 x float> @rcp_f32_vector_mixed_constant_numerator_arcp_correct(
 ; CHECK-SAME: <4 x float> [[ARG:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:    [[PARTIAL_RCP:%.*]] = fdiv arcp <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float undef>, [[ARG]]
+; CHECK-NEXT:    [[PARTIAL_RCP:%.*]] = fdiv arcp <4 x float> <float 1.000000e+00, float -1.000000e+00, float 4.000000e+00, float poison>, [[ARG]]
 ; CHECK-NEXT:    ret <4 x float> [[PARTIAL_RCP]]
 ;
-  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float undef>, %arg
+  %partial.rcp = fdiv arcp <4 x float> <float 1.0, float -1.0, float 4.0, float poison>, %arg
   ret <4 x float> %partial.rcp
 }
 
@@ -3841,7 +3841,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-GOODFREXP-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00)
 ; IEEE-GOODFREXP-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00)
 ; IEEE-GOODFREXP-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00)
-; IEEE-GOODFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float poison)
 ; IEEE-GOODFREXP-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2
@@ -3857,21 +3857,21 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-GOODFREXP-NEXT:    [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-GOODFREXP-NEXT:    [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-GOODFREXP-NEXT:    [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0
-; IEEE-GOODFREXP-NEXT:    [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP22:%.*]] = sub i32 0, [[TMP27]]
-; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]])
-; IEEE-GOODFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP22]])
-; IEEE-GOODFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
+; IEEE-GOODFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1
+; IEEE-GOODFREXP-NEXT:    [[TMP22:%.*]] = sub i32 0, [[TMP50]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP51:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-GOODFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP22]])
+; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP51]]
-; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP50]]
+; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]])
+; IEEE-GOODFREXP-NEXT:    [[TMP52:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
+; IEEE-GOODFREXP-NEXT:    [[TMP53:%.*]] = extractvalue { float, i32 } [[TMP52]], 0
+; IEEE-GOODFREXP-NEXT:    [[TMP54:%.*]] = extractvalue { float, i32 } [[TMP52]], 1
+; IEEE-GOODFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP53]], [[TMP28]]
+; IEEE-GOODFREXP-NEXT:    [[TMP33:%.*]] = sub i32 [[TMP54]], [[TMP31]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0
@@ -3894,7 +3894,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-BADFREXP-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00)
 ; IEEE-BADFREXP-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00)
 ; IEEE-BADFREXP-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00)
-; IEEE-BADFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float poison)
 ; IEEE-BADFREXP-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
 ; IEEE-BADFREXP-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2
@@ -3910,20 +3910,20 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; IEEE-BADFREXP-NEXT:    [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
 ; IEEE-BADFREXP-NEXT:    [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
 ; IEEE-BADFREXP-NEXT:    [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-BADFREXP-NEXT:    [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-BADFREXP-NEXT:    [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0
+; IEEE-BADFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
+; IEEE-BADFREXP-NEXT:    [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]])
 ; IEEE-BADFREXP-NEXT:    [[TMP22:%.*]] = sub i32 0, [[TMP21]]
-; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP26]])
-; IEEE-BADFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP22]])
-; IEEE-BADFREXP-NEXT:    [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
-; IEEE-BADFREXP-NEXT:    [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]])
 ; IEEE-BADFREXP-NEXT:    [[TMP50:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; IEEE-BADFREXP-NEXT:    [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP22]])
+; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
 ; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef)
-; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP30]], [[TMP50]]
+; IEEE-BADFREXP-NEXT:    [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]])
+; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP30]])
+; IEEE-BADFREXP-NEXT:    [[TMP51:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
+; IEEE-BADFREXP-NEXT:    [[TMP52:%.*]] = extractvalue { float, i32 } [[TMP51]], 0
+; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float poison)
+; IEEE-BADFREXP-NEXT:    [[TMP32:%.*]] = fmul contract float [[TMP52]], [[TMP28]]
 ; IEEE-BADFREXP-NEXT:    [[TMP33:%.*]] = sub i32 [[TMP31]], [[TMP27]]
 ; IEEE-BADFREXP-NEXT:    [[TMP34:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP33]])
 ; IEEE-BADFREXP-NEXT:    [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
@@ -3947,7 +3947,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; DAZ-NEXT:    [[TMP1:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 4.000000e+00)
 ; DAZ-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 2.000000e+00)
 ; DAZ-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 8.000000e+00)
-; DAZ-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float undef)
+; DAZ-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.sqrt.f32(float poison)
 ; DAZ-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0
 ; DAZ-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[TMP2]], i64 1
 ; DAZ-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i64 2
@@ -3963,7 +3963,7 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; DAZ-NEXT:    [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0
 ; DAZ-NEXT:    [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1
 ; DAZ-NEXT:    [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP16]])
-; DAZ-NEXT:    [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef)
+; DAZ-NEXT:    [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float poison)
 ; DAZ-NEXT:    [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0
 ; DAZ-NEXT:    [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1
 ; DAZ-NEXT:    [[TMP22:%.*]] = fmul contract float [[TMP20]], [[TMP18]]
@@ -3985,8 +3985,8 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
 ; DAZ-NEXT:    [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i64 3
 ; DAZ-NEXT:    ret <4 x float> [[PARTIAL_RSQ]]
 ;
-  %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> <float 4.0, float 2.0, float 8.0, float undef>), !fpmath !2
-  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float undef, float 2.0>, %sqrt, !fpmath !2
+  %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> <float 4.0, float 2.0, float 8.0, float poison>), !fpmath !2
+  %partial.rsq = fdiv contract <4 x float> <float 1.0, float -1.0, float poison, float 2.0>, %sqrt, !fpmath !2
   ret <4 x float> %partial.rsq
 }
 
diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll
index 5945fc5e7b0bf4..b38f9bae50ccca 100644
--- a/llvm/test/Transforms/Attributor/nofpclass.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass.ll
@@ -114,7 +114,7 @@ define <2 x double> @returned_strange_constant_vector_elt() {
 
 ; Test a vector element that's undef
 define <3 x double> @returned_undef_constant_vector_elt() {
-; CHECK-LABEL: define nofpclass(nan inf sub norm) <3 x double> @returned_undef_constant_vector_elt() {
+; CHECK-LABEL: define <3 x double> @returned_undef_constant_vector_elt() {
 ; CHECK-NEXT:    call void @unknown()
 ; CHECK-NEXT:    ret <3 x double> <double -0.000000e+00, double 0.000000e+00, double undef>
 ;
diff --git a/llvm/test/Transforms/InstCombine/and-fcmp.ll b/llvm/test/Transforms/InstCombine/and-fcmp.ll
index f1ae2e74ac2e4e..c163802fcc935c 100644
--- a/llvm/test/Transforms/InstCombine/and-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/and-fcmp.ll
@@ -39,7 +39,9 @@ define i1 @PR1738_logical_noundef(double %x, double noundef %y) {
 
 define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @PR1738_vec_undef(
-; CHECK-NEXT:    [[OR:%.*]] = fcmp ord <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp ord <2 x double> [[X:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ord <2 x double> [[Y:%.*]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[OR:%.*]] = and <2 x i1> [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    ret <2 x i1> [[OR]]
 ;
   %cmp1 = fcmp ord <2 x double> %x, <double 0.0, double undef>
@@ -48,6 +50,17 @@ define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %or
 }
 
+define <2 x i1> @PR1738_vec_poison(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @PR1738_vec_poison(
+; CHECK-NEXT:    [[OR:%.*]] = fcmp ord <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[OR]]
+;
+  %cmp1 = fcmp ord <2 x double> %x, <double 0.0, double poison>
+  %cmp2 = fcmp ord <2 x double> %y, <double poison, double 0.0>
+  %or = and <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %or
+}
+
 define i1 @PR41069(i1 %z, float %c, float %d) {
 ; CHECK-LABEL: @PR41069(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord float [[D:%.*]], [[C:%.*]]
@@ -111,8 +124,10 @@ define i1 @PR41069_commute_logical(i1 %z, float %c, float %d) {
 define <2 x i1> @PR41069_vec(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
 ; CHECK-LABEL: @PR41069_vec(
 ; CHECK-NEXT:    [[ORD1:%.*]] = fcmp ord <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord <2 x double> [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[TMP1]], [[ORD1]]
+; CHECK-NEXT:    [[ORD2:%.*]] = fcmp ord <2 x double> [[C:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i1> [[ORD1]], [[ORD2]]
+; CHECK-NEXT:    [[ORD3:%.*]] = fcmp ord <2 x double> [[D:%.*]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[AND]], [[ORD3]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %ord1 = fcmp ord <2 x double> %a, %b
@@ -126,8 +141,10 @@ define <2 x i1> @PR41069_vec(<2 x double> %a, <2 x double> %b, <2 x double> %c,
 define <2 x i1> @PR41069_vec_commute(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
 ; CHECK-LABEL: @PR41069_vec_commute(
 ; CHECK-NEXT:    [[ORD1:%.*]] = fcmp ord <2 x double> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ord <2 x double> [[D:%.*]], [[C:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[TMP1]], [[ORD1]]
+; CHECK-NEXT:    [[ORD2:%.*]] = fcmp ord <2 x double> [[C:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i1> [[ORD1]], [[ORD2]]
+; CHECK-NEXT:    [[ORD3:%.*]] = fcmp ord <2 x double> [[D:%.*]], zeroinitializer
+; CHECK-NEXT:    [[R:%.*]] = and <2 x i1> [[ORD3]], [[AND]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %ord1 = fcmp ord <2 x double> %a, %b
diff --git a/llvm/test/Transforms/InstCombine/or-fcmp.ll b/llvm/test/Transforms/InstCombine/or-fcmp.ll
index ffd927672b413b..285b2d958abd8c 100644
--- a/llvm/test/Transforms/InstCombine/or-fcmp.ll
+++ b/llvm/test/Transforms/InstCombine/or-fcmp.ll
@@ -28,7 +28,9 @@ define i1 @PR1738_logical(double %x, double %y) {
 
 define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @PR1738_vec_undef(
-; CHECK-NEXT:    [[OR:%.*]] = fcmp uno <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp uno <2 x double> [[X:%.*]], <double 0.000000e+00, double undef>
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp uno <2 x double> [[Y:%.*]], <double undef, double 0.000000e+00>
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i1> [[CMP1]], [[CMP2]]
 ; CHECK-NEXT:    ret <2 x i1> [[OR]]
 ;
   %cmp1 = fcmp uno <2 x double> %x, <double 0.0, double undef>
@@ -37,6 +39,17 @@ define <2 x i1> @PR1738_vec_undef(<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %or
 }
 
+define <2 x i1> @PR1738_vec_poison(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @PR1738_vec_poison(
+; CHECK-NEXT:    [[OR:%.*]] = fcmp uno <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[OR]]
+;
+  %cmp1 = fcmp uno <2 x double> %x, <double 0.0, double poison>
+  %cmp2 = fcmp uno <2 x double> %y, <double poison, double 0.0>
+  %or = or <2 x i1> %cmp1, %cmp2
+  ret <2 x i1> %or
+}
+
 define i1 @PR41069(double %a, double %b, double %c, double %d) {
 ; CHECK-LABEL: @PR41069(
 ; CHECK-NEXT:    [[UNO1:%.*]] = fcmp uno double [[A:%.*]], [[B:%.*]]
@@ -105,26 +118,56 @@ define i1 @PR41069_commute_logical(double %a, double %b, double %c, double %d) {
 
 define <2 x i1> @PR41069_vec(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
 ; CHECK-LABEL: @PR41069_vec(
+; CHECK-NEXT:    [[UNO1:%.*]] = fcmp uno <2 x float> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i1> [[UNO1]], [[Z:%.*]]
+; CHECK-NEXT:    [[UNO2:%.*]] = fcmp uno <2 x float> [[D:%.*]], <float 0.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[OR]], [[UNO2]]
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %uno1 = fcmp uno <2 x float> %c, zeroinitializer
+  %or = or <2 x i1> %uno1, %z
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %r = or <2 x i1> %or, %uno2
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @PR41069_vec_poison(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @PR41069_vec_poison(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp uno <2 x float> [[D:%.*]], [[C:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[TMP1]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %uno1 = fcmp uno <2 x float> %c, zeroinitializer
   %or = or <2 x i1> %uno1, %z
-  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float poison>
   %r = or <2 x i1> %or, %uno2
   ret <2 x i1> %r
 }
 
 define <2 x i1> @PR41069_vec_commute(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
 ; CHECK-LABEL: @PR41069_vec_commute(
+; CHECK-NEXT:    [[UNO1:%.*]] = fcmp uno <2 x float> [[C:%.*]], zeroinitializer
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i1> [[UNO1]], [[Z:%.*]]
+; CHECK-NEXT:    [[UNO2:%.*]] = fcmp uno <2 x float> [[D:%.*]], <float 0.000000e+00, float undef>
+; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[UNO2]], [[OR]]
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %uno1 = fcmp uno <2 x float> %c, zeroinitializer
+  %or = or <2 x i1> %uno1, %z
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %r = or <2 x i1> %uno2, %or
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @PR41069_vec_commute_poison(<2 x i1> %z, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: @PR41069_vec_commute_poison(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fcmp uno <2 x float> [[D:%.*]], [[C:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = or <2 x i1> [[TMP1]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[R]]
 ;
   %uno1 = fcmp uno <2 x float> %c, zeroinitializer
   %or = or <2 x i1> %uno1, %z
-  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float undef>
+  %uno2 = fcmp uno <2 x float> %d, <float 0.0, float poison>
   %r = or <2 x i1> %uno2, %or
   ret <2 x i1> %r
 }

From 878642954f5178c55b337afe2bff4e6a92a67a5b Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Sun, 19 May 2024 13:23:04 -0700
Subject: [PATCH 14/60] [BOLT] Fix preserved offset in fixDoubleJumps (#92485)

---
 bolt/lib/Passes/BinaryPasses.cpp       | 14 +++++++++-----
 bolt/test/X86/bb-with-two-tail-calls.s |  8 ++++----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 867f977cebca72..298ba29ff5b3ff 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -674,7 +674,8 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
   for (BinaryBasicBlock &BB : Function) {
     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
-                             const MCSymbol *SuccSym) {
+                             const MCSymbol *SuccSym,
+                             std::optional<uint32_t> Offset) {
       // Ignore infinite loop jumps or fallthrough tail jumps.
       if (Pred == Succ || Succ == &BB)
         return false;
@@ -715,9 +716,11 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
           Pred->removeSuccessor(&BB);
           Pred->eraseInstruction(Pred->findInstruction(Branch));
           Pred->addTailCallInstruction(SuccSym);
-          MCInst *TailCall = Pred->getLastNonPseudoInstr();
-          assert(TailCall);
-          MIB->setOffset(*TailCall, BB.getOffset());
+          if (Offset) {
+            MCInst *TailCall = Pred->getLastNonPseudoInstr();
+            assert(TailCall);
+            MIB->setOffset(*TailCall, *Offset);
+          }
         } else {
           return false;
         }
@@ -760,7 +763,8 @@ static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
       if (Pred->getSuccessor() == &BB ||
           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
           Pred->getConditionalSuccessor(false) == &BB)
-        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
+        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
+            MarkInvalid)
           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                        BB.isEntryPoint());
     }
diff --git a/bolt/test/X86/bb-with-two-tail-calls.s b/bolt/test/X86/bb-with-two-tail-calls.s
index bb2b0cd4cc23a5..b6703e352ff4bf 100644
--- a/bolt/test/X86/bb-with-two-tail-calls.s
+++ b/bolt/test/X86/bb-with-two-tail-calls.s
@@ -1,8 +1,6 @@
 # This reproduces a bug with dynostats when trying to compute branch stats
 # at a block with two tails calls (one conditional and one unconditional).
 
-# REQUIRES: system-linux
-
 # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
 # RUN:   %s -o %t.o
 # RUN: link_fdata %s %t.o %t.fdata
@@ -13,7 +11,7 @@
 # CHECK-NOT: Assertion `BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"' failed.
 # Two tail calls in the same basic block after SCTC:
 # CHECK:         {{.*}}:   ja      {{.*}} # TAILCALL # Offset: 7 # CTCTakenCount: 4
-# CHECK-NEXT:    {{.*}}:   jmp     {{.*}} # TAILCALL # Offset: 12
+# CHECK-NEXT:    {{.*}}:   jmp     {{.*}} # TAILCALL # Offset: 13
 
   .globl _start
 _start:
@@ -23,7 +21,9 @@ a:  ja b
 x:  ret
 # FDATA: 1 _start #a# 1 _start #b# 2 4
 b:  jmp e
-c:  jmp f
+c:
+    .nops 1
+    jmp f
 
   .globl e
 e:

From fb2c6597e39e9e1a775525ea0236b2f89e46acff Mon Sep 17 00:00:00 2001
From: Leon Clark <PeddleSpam@users.noreply.github.com>
Date: Sun, 19 May 2024 21:45:24 +0100
Subject: [PATCH 15/60] [AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16
 (#88512)

Use LSH to lower ctlz_zero_undef instead of subtracting leading zeros
for i8 and i16.

Related to [77615](https://github.com/llvm/llvm-project/pull/77615).

---------

Co-authored-by: Leon Clark <leoclark@amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  22 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  44 +++-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |   2 +
 .../GlobalISel/legalize-ctlz-zero-undef.mir   |  47 ++--
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 232 +++++++-----------
 5 files changed, 169 insertions(+), 178 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d35a022ad68067..980e58510ceb7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3117,20 +3117,30 @@ static bool isCttzOpc(unsigned Opc) {
 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
                                                SelectionDAG &DAG) const {
   auto SL = SDLoc(Op);
+  auto Opc = Op.getOpcode();
   auto Arg = Op.getOperand(0u);
   auto ResultVT = Op.getValueType();
 
   if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
     return {};
 
-  assert(isCtlzOpc(Op.getOpcode()));
+  assert(isCtlzOpc(Opc));
   assert(ResultVT == Arg.getValueType());
 
-  auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
-  auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
-  auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
-  NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
-  NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
+  const uint64_t NumBits = ResultVT.getFixedSizeInBits();
+  SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
+  SDValue NewOp;
+
+  if (Opc == ISD::CTLZ_ZERO_UNDEF) {
+    NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
+    NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
+    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+  } else {
+    NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+    NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+    NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
+  }
+
   return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bd7bf78c4c0bdc..15a4b6796880f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1270,13 +1270,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .custom();
 
   // The 64-bit versions produce 32-bit results, but only on the SALU.
-  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
-    .legalFor({{S32, S32}, {S32, S64}})
-    .clampScalar(0, S32, S32)
-    .clampScalar(1, S32, S64)
-    .scalarize(0)
-    .widenScalarToNextPow2(0, 32)
-    .widenScalarToNextPow2(1, 32);
+  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+      .legalFor({{S32, S32}, {S32, S64}})
+      .customIf(scalarNarrowerThan(1, 32))
+      .clampScalar(0, S32, S32)
+      .clampScalar(1, S32, S64)
+      .scalarize(0)
+      .widenScalarToNextPow2(0, 32)
+      .widenScalarToNextPow2(1, 32);
+
+  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
+      .legalFor({{S32, S32}, {S32, S64}})
+      .clampScalar(0, S32, S32)
+      .clampScalar(1, S32, S64)
+      .scalarize(0)
+      .widenScalarToNextPow2(0, 32)
+      .widenScalarToNextPow2(1, 32);
 
   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
   // RegBankSelect.
@@ -2128,6 +2137,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
   case TargetOpcode::G_CTLZ:
   case TargetOpcode::G_CTTZ:
     return legalizeCTLZ_CTTZ(MI, MRI, B);
+  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
     return legalizeFPTruncRound(MI, B);
   case TargetOpcode::G_STACKSAVE:
@@ -4145,6 +4156,25 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
+                                                  MachineRegisterInfo &MRI,
+                                                  MachineIRBuilder &B) const {
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+  LLT SrcTy = MRI.getType(Src);
+  TypeSize NumBits = SrcTy.getSizeInBits();
+
+  assert(NumBits < 32u);
+
+  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
+  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
+  auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt);
+  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
+  B.buildTrunc(Dst, Ctlz);
+  MI.eraseFromParent();
+  return true;
+}
+
 // Check that this is a G_XOR x, -1
 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
   if (MI.getOpcode() != TargetOpcode::G_XOR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8a..4b1d821dadc215 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
   bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const;
+  bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
+                               MachineIRBuilder &B) const;
 
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                       const ArgDescriptor *Arg,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index fed277d7d10d08..7748b481cf5b77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -81,14 +81,12 @@ body: |
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s16) = G_TRUNC %0
     %2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -149,18 +147,15 @@ body: |
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
+    ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
+    ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
-    ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
-    ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
-    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -179,14 +174,12 @@ body: |
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
-    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
-    ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
-    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
-    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
-    ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s7) = G_TRUNC %0
     %2:_(s7) = G_CTLZ_ZERO_UNDEF %1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 54adde38d6d229..d94a27e8c02000 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -322,9 +322,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s2, 0xff
-; SI-NEXT:    s_flbit_i32_b32 s2, s2
-; SI-NEXT:    s_sub_i32 s4, s2, 24
+; SI-NEXT:    s_lshl_b32 s2, s2, 24
+; SI-NEXT:    s_flbit_i32_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
@@ -335,9 +334,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s2, s2, 24
 ; VI-NEXT:    s_flbit_i32_b32 s2, s2
-; VI-NEXT:    s_sub_i32 s2, s2, 24
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -357,13 +355,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, 0.0,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT T0.W, T0.X,
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT:    -24(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:     LSHL * T1.W, PS, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
 ; EG-NEXT:     LSHL T0.X, PV.W, PS,
 ; EG-NEXT:     LSHL * T0.W, literal.x, PS,
@@ -379,9 +377,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0xff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s0, s4, 24
 ; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -399,9 +396,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_and_b32 s2, s2, 0xffff
-; SI-NEXT:    s_flbit_i32_b32 s2, s2
-; SI-NEXT:    s_add_i32 s4, s2, -16
+; SI-NEXT:    s_lshl_b32 s2, s2, 16
+; SI-NEXT:    s_flbit_i32_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -434,13 +430,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, 0.0,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT T0.W, T0.X,
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT:    -16(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:     LSHL * T1.W, PS, literal.y,
 ; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
 ; EG-NEXT:     LSHL T0.X, PV.W, PS,
 ; EG-NEXT:     LSHL * T0.W, literal.x, PS,
@@ -456,9 +452,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s0, s4, 16
 ; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 16
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-GISEL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -598,8 +593,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_subrev_i32_e32 v1, vcc, 24, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
@@ -613,8 +608,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT:    v_subrev_u32_e32 v1, vcc, 24, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -626,7 +621,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -635,10 +630,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
-; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    -24(nan), 3(4.203895e-45)
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
@@ -659,8 +655,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 24, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -693,8 +688,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_ffbh_u32_e32 v1, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -729,7 +724,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG:       ; %bb.0:
 ; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 0 @6
-; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
 ; EG-NEXT:    CF_END
 ; EG-NEXT:    PAD
@@ -738,10 +733,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     MOV * T0.X, KC0[2].Z,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
-; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT:    -16(nan), 3(4.203895e-45)
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
+; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
 ; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
 ; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
@@ -764,8 +760,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1110,8 +1105,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
 ; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1124,8 +1119,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v0, v0
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; VI-NEXT:    v_ffbh_u32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -1144,13 +1139,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; EG-NEXT:    ALU clause starting at 8:
 ; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
 ; EG-NEXT:    ALU clause starting at 9:
-; EG-NEXT:     FFBH_UINT T0.W, T0.X,
+; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     FFBH_UINT T0.W, PV.W,
 ; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
 ; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT:    -24(nan), 0(0.000000e+00)
 ; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT:     LSHL * T1.W, PS, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
 ; EG-NEXT:     LSHL T0.X, PV.W, PS,
 ; EG-NEXT:     LSHL * T0.W, literal.x, PS,
@@ -1172,8 +1167,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1709,12 +1703,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, s[2:3]
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2193,9 +2186,8 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
   ret i7 %ctlz
@@ -2286,9 +2278,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_and_b32 s0, s4, 0x3ffff
+; GFX9-GISEL-NEXT:    s_lshr_b32 s0, s4, 14
 ; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT:    s_sub_i32 s0, s0, 14
 ; GFX9-GISEL-NEXT:    s_and_b32 s0, s0, 0x3ffff
 ; GFX9-GISEL-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
@@ -2326,9 +2317,8 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 14, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
   ret i18 %ctlz
@@ -2365,12 +2355,10 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ffff, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ffff, v1
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 14, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 14, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 14, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
   ret <2 x i18> %ctlz
@@ -2380,16 +2368,12 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v2i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
-; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, -16, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v2i16:
@@ -2410,12 +2394,10 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
   ret <2 x i16> %ctlz
@@ -2425,20 +2407,15 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v3i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_ffbh_u32_e32 v2, v2
+; SI-NEXT:    v_ffbh_u32_e32 v3, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
-; SI-NEXT:    v_add_i32_e32 v3, vcc, -16, v2
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
-; SI-NEXT:    v_or_b32_e32 v2, 0x100000, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_or_b32_e32 v2, 0x200000, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v3, v0, 16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2462,14 +2439,11 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
   ret <3 x i16> %ctlz
@@ -2479,24 +2453,18 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v4i16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; SI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, -16, v0
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0xfff00000, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -2524,18 +2492,13 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v2, 16, v2
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 16, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v3, 16, v3
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 16, v1
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s4, 0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
+; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, s4, 16, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
   ret <4 x i16> %ctlz
@@ -2545,27 +2508,24 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
 ; SI-LABEL: v_ctlz_zero_undef_v2i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; SI-NEXT:    v_ffbh_u32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
 ; SI-NEXT:    v_ffbh_u32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
-; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0xffffe800, v0
-; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_ctlz_zero_undef_v2i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v1, 0xe800, v1
-; VI-NEXT:    v_subrev_u16_e32 v0, 24, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; VI-NEXT:    v_ffbh_u32_e32 v1, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_ctlz_zero_undef_v2i8:
@@ -2576,10 +2536,8 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 24, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-GISEL-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
   ret <2 x i8> %ctlz
@@ -2621,12 +2579,10 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
 ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0x7f, v1
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v0, 25, v0
-; GFX9-GISEL-NEXT:    v_subrev_u32_e32 v1, 25, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
   ret <2 x i7> %ctlz

From ad625a407622ba5817ef58e30357139a40cf929e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 19 May 2024 14:51:13 -0700
Subject: [PATCH 16/60] [TableGen] Avoid std::string copy. NFC

Fix #92702
---
 llvm/utils/TableGen/ARMTargetDefEmitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
index 491011643bbfb1..b79458529623f3 100644
--- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp
@@ -170,7 +170,7 @@ static void EmitARMTargetDef(RecordKeeper &RK, raw_ostream &OS) {
      << "/// The set of all architectures\n"
      << "static constexpr std::array<const ArchInfo *, " << CppSpellings.size()
      << "> ArchInfos = {\n";
-  for (auto CppSpelling : CppSpellings)
+  for (StringRef CppSpelling : CppSpellings)
     OS << "  &" << CppSpelling << ",\n";
   OS << "};\n";
 

From 7892d434741ba0ac755e00ae96ca7cdcfaf82d35 Mon Sep 17 00:00:00 2001
From: Ryuichi Watanabe <ryucrosskey@gmail.com>
Date: Mon, 20 May 2024 07:01:47 +0900
Subject: [PATCH 17/60] Update llvm-bugs.yml (#77243)

---
 .github/workflows/llvm-bugs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml
index f592dd6ccd9033..c392078fa45251 100644
--- a/.github/workflows/llvm-bugs.yml
+++ b/.github/workflows/llvm-bugs.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.repository == 'llvm/llvm-project'
     steps:
-      - uses: actions/setup-node@v3
+      - uses: actions/setup-node@v4
         with:
           node-version: 18
           check-latest: true

From b603237b6c067e82a7c6b73adb7e18c8edfb40dd Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 19 May 2024 15:02:44 -0700
Subject: [PATCH 18/60] [llvm] Use operator==(StringRef, StringRef) (NFC)
 (#92705)

---
 llvm/lib/Option/OptTable.cpp                   |  2 +-
 llvm/lib/ProfileData/InstrProfCorrelator.cpp   | 10 ++++------
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp    |  6 +++---
 llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp |  2 +-
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp
index b8b6b90c253f23..3eceb0fbdfc47b 100644
--- a/llvm/lib/Option/OptTable.cpp
+++ b/llvm/lib/Option/OptTable.cpp
@@ -197,7 +197,7 @@ OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
 
     std::vector<std::string> Result;
     for (StringRef Val : Candidates)
-      if (Val.starts_with(Arg) && Arg.compare(Val))
+      if (Val.starts_with(Arg) && Arg != Val)
         Result.push_back(std::string(Val));
     return Result;
   }
diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
index cf80a58f43bd90..44e2aeb00d8cc8 100644
--- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp
+++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
@@ -350,16 +350,14 @@ void DwarfInstrProfCorrelator<IntPtrT>::correlateProfileDataImpl(
         continue;
       }
       StringRef AnnotationName = *AnnotationNameOrErr;
-      if (AnnotationName.compare(
-              InstrProfCorrelator::FunctionNameAttributeName) == 0) {
+      if (AnnotationName == InstrProfCorrelator::FunctionNameAttributeName) {
         if (auto EC =
                 AnnotationFormValue->getAsCString().moveInto(FunctionName))
           consumeError(std::move(EC));
-      } else if (AnnotationName.compare(
-                     InstrProfCorrelator::CFGHashAttributeName) == 0) {
+      } else if (AnnotationName == InstrProfCorrelator::CFGHashAttributeName) {
         CFGHash = AnnotationFormValue->getAsUnsignedConstant();
-      } else if (AnnotationName.compare(
-                     InstrProfCorrelator::NumCountersAttributeName) == 0) {
+      } else if (AnnotationName ==
+                 InstrProfCorrelator::NumCountersAttributeName) {
         NumCounters = AnnotationFormValue->getAsUnsignedConstant();
       }
     }
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 727e4e584c053f..f4daab7d06eb53 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -171,9 +171,9 @@ getArgAccessQual(const Function &F, unsigned ArgIdx) {
   if (!ArgAttribute)
     return SPIRV::AccessQualifier::ReadWrite;
 
-  if (ArgAttribute->getString().compare("read_only") == 0)
+  if (ArgAttribute->getString() == "read_only")
     return SPIRV::AccessQualifier::ReadOnly;
-  if (ArgAttribute->getString().compare("write_only") == 0)
+  if (ArgAttribute->getString() == "write_only")
     return SPIRV::AccessQualifier::WriteOnly;
   return SPIRV::AccessQualifier::ReadWrite;
 }
@@ -181,7 +181,7 @@ getArgAccessQual(const Function &F, unsigned ArgIdx) {
 static std::vector<SPIRV::Decoration::Decoration>
 getKernelArgTypeQual(const Function &F, unsigned ArgIdx) {
   MDString *ArgAttribute = getOCLKernelArgTypeQual(F, ArgIdx);
-  if (ArgAttribute && ArgAttribute->getString().compare("volatile") == 0)
+  if (ArgAttribute && ArgAttribute->getString() == "volatile")
     return {SPIRV::Decoration::Volatile};
   return {};
 }
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 62b4a9278954ce..6623106109316b 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1802,7 +1802,7 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
                                            bool &ParseError, SMLoc &End) {
   // A named operator should be either lower or upper case, but not a mix...
   // except in MASM, which uses full case-insensitivity.
-  if (Name.compare(Name.lower()) && Name.compare(Name.upper()) &&
+  if (Name != Name.lower() && Name != Name.upper() &&
       !getParser().isParsingMasm())
     return false;
   if (Name.equals_insensitive("not")) {

From 2d5e488c98225108aebfe4aa4acfe6ec1f234a37 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sun, 19 May 2024 15:09:03 -0700
Subject: [PATCH 19/60] [clang-format][NFC] Clean up SortIncludesTest.cpp

Wherever applicable, replace EXPECT_EQ with verifyFormat and std::string
with StringRef. Also, change a raw string literal to a regular one.
---
 clang/unittests/Format/SortIncludesTest.cpp | 1942 +++++++++----------
 1 file changed, 970 insertions(+), 972 deletions(-)

diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp
index 824fa0078cd037..52ba19627182b8 100644
--- a/clang/unittests/Format/SortIncludesTest.cpp
+++ b/clang/unittests/Format/SortIncludesTest.cpp
@@ -53,35 +53,35 @@ class SortIncludesTest : public test::FormatTestBase {
 };
 
 TEST_F(SortIncludesTest, BasicSorting) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\""));
-
-  EXPECT_EQ("// comment\n"
-            "#include <a>\n"
-            "#include <b>",
-            sort("// comment\n"
-                 "#include <b>\n"
-                 "#include <a>",
-                 {tooling::Range(25, 1)}));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\""));
+
+  verifyFormat("// comment\n"
+               "#include <a>\n"
+               "#include <b>",
+               sort("// comment\n"
+                    "#include <b>\n"
+                    "#include <a>",
+                    {tooling::Range(25, 1)}));
 }
 
 TEST_F(SortIncludesTest, TrailingComments) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\" /* long\n"
-            "                  * long\n"
-            "                  * comment*/\n"
-            "#include \"c.h\"\n"
-            "#include \"d.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\" /* long\n"
-                 "                  * long\n"
-                 "                  * comment*/\n"
-                 "#include \"d.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\" /* long\n"
+               "                  * long\n"
+               "                  * comment*/\n"
+               "#include \"c.h\"\n"
+               "#include \"d.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\" /* long\n"
+                    "                  * long\n"
+                    "                  * comment*/\n"
+                    "#include \"d.h\""));
 }
 
 TEST_F(SortIncludesTest, SortedIncludesUsingSortPriorityAttribute) {
@@ -100,531 +100,531 @@ TEST_F(SortIncludesTest, SortedIncludesUsingSortPriorityAttribute) {
       {"<path", 9, 11, false},
       {"^<[^/].*\\.h>", 8, 10, false},
       {"^\".*\\.h\"", 10, 12, false}};
-  EXPECT_EQ("#include <sys/param.h>\n"
-            "#include <sys/types.h>\n"
-            "#include <sys/ioctl.h>\n"
-            "#include <sys/socket.h>\n"
-            "#include <sys/stat.h>\n"
-            "#include <sys/wait.h>\n"
-            "\n"
-            "#include <net/if.h>\n"
-            "#include <net/if_dl.h>\n"
-            "#include <net/route.h>\n"
-            "#include <netinet/in.h>\n"
-            "#include <protocols/rwhod.h>\n"
-            "\n"
-            "#include <assert.h>\n"
-            "#include <errno.h>\n"
-            "#include <inttypes.h>\n"
-            "#include <stdio.h>\n"
-            "#include <stdlib.h>\n"
-            "\n"
-            "#include <paths.h>\n"
-            "\n"
-            "#include \"pathnames.h\"",
-            sort("#include <sys/param.h>\n"
-                 "#include <sys/types.h>\n"
-                 "#include <sys/ioctl.h>\n"
-                 "#include <net/if_dl.h>\n"
-                 "#include <net/route.h>\n"
-                 "#include <netinet/in.h>\n"
-                 "#include <sys/socket.h>\n"
-                 "#include <sys/stat.h>\n"
-                 "#include <sys/wait.h>\n"
-                 "#include <net/if.h>\n"
-                 "#include <protocols/rwhod.h>\n"
-                 "#include <assert.h>\n"
-                 "#include <paths.h>\n"
-                 "#include \"pathnames.h\"\n"
-                 "#include <errno.h>\n"
-                 "#include <inttypes.h>\n"
-                 "#include <stdio.h>\n"
-                 "#include <stdlib.h>"));
+  verifyFormat("#include <sys/param.h>\n"
+               "#include <sys/types.h>\n"
+               "#include <sys/ioctl.h>\n"
+               "#include <sys/socket.h>\n"
+               "#include <sys/stat.h>\n"
+               "#include <sys/wait.h>\n"
+               "\n"
+               "#include <net/if.h>\n"
+               "#include <net/if_dl.h>\n"
+               "#include <net/route.h>\n"
+               "#include <netinet/in.h>\n"
+               "#include <protocols/rwhod.h>\n"
+               "\n"
+               "#include <assert.h>\n"
+               "#include <errno.h>\n"
+               "#include <inttypes.h>\n"
+               "#include <stdio.h>\n"
+               "#include <stdlib.h>\n"
+               "\n"
+               "#include <paths.h>\n"
+               "\n"
+               "#include \"pathnames.h\"",
+               sort("#include <sys/param.h>\n"
+                    "#include <sys/types.h>\n"
+                    "#include <sys/ioctl.h>\n"
+                    "#include <net/if_dl.h>\n"
+                    "#include <net/route.h>\n"
+                    "#include <netinet/in.h>\n"
+                    "#include <sys/socket.h>\n"
+                    "#include <sys/stat.h>\n"
+                    "#include <sys/wait.h>\n"
+                    "#include <net/if.h>\n"
+                    "#include <protocols/rwhod.h>\n"
+                    "#include <assert.h>\n"
+                    "#include <paths.h>\n"
+                    "#include \"pathnames.h\"\n"
+                    "#include <errno.h>\n"
+                    "#include <inttypes.h>\n"
+                    "#include <stdio.h>\n"
+                    "#include <stdlib.h>"));
 }
 TEST_F(SortIncludesTest, SortPriorityNotDefined) {
   FmtStyle = getLLVMStyle();
-  EXPECT_EQ("#include \"FormatTestUtils.h\"\n"
-            "#include \"clang/Format/Format.h\"\n"
-            "#include \"llvm/ADT/None.h\"\n"
-            "#include \"llvm/Support/Debug.h\"\n"
-            "#include \"gtest/gtest.h\"",
-            sort("#include \"clang/Format/Format.h\"\n"
-                 "#include \"llvm/ADT/None.h\"\n"
-                 "#include \"FormatTestUtils.h\"\n"
-                 "#include \"gtest/gtest.h\"\n"
-                 "#include \"llvm/Support/Debug.h\""));
+  verifyFormat("#include \"FormatTestUtils.h\"\n"
+               "#include \"clang/Format/Format.h\"\n"
+               "#include \"llvm/ADT/None.h\"\n"
+               "#include \"llvm/Support/Debug.h\"\n"
+               "#include \"gtest/gtest.h\"",
+               sort("#include \"clang/Format/Format.h\"\n"
+                    "#include \"llvm/ADT/None.h\"\n"
+                    "#include \"FormatTestUtils.h\"\n"
+                    "#include \"gtest/gtest.h\"\n"
+                    "#include \"llvm/Support/Debug.h\""));
 }
 
 TEST_F(SortIncludesTest, NoReplacementsForValidIncludes) {
   // Identical #includes have led to a failure with an unstable sort.
-  std::string Code = "#include <a>\n"
-                     "#include <b>\n"
-                     "#include <c>\n"
-                     "#include <d>\n"
-                     "#include <e>\n"
-                     "#include <f>\n";
+  StringRef Code = "#include <a>\n"
+                   "#include <b>\n"
+                   "#include <c>\n"
+                   "#include <d>\n"
+                   "#include <e>\n"
+                   "#include <f>\n";
   EXPECT_TRUE(sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a.cc").empty());
 }
 
 TEST_F(SortIncludesTest, MainFileHeader) {
-  std::string Code = "#include <string>\n"
-                     "\n"
-                     "#include \"a/extra_action.proto.h\"\n";
+  StringRef Code = "#include <string>\n"
+                   "\n"
+                   "#include \"a/extra_action.proto.h\"\n";
   FmtStyle = getGoogleStyle(FormatStyle::LK_Cpp);
   EXPECT_TRUE(
       sortIncludes(FmtStyle, Code, GetCodeRange(Code), "a/extra_action.cc")
           .empty());
 
-  EXPECT_EQ("#include \"foo.bar.h\"\n"
-            "\n"
-            "#include \"a.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"foo.bar.h\"",
-                 "foo.bar.cc"));
+  verifyFormat("#include \"foo.bar.h\"\n"
+               "\n"
+               "#include \"a.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"foo.bar.h\"",
+                    "foo.bar.cc"));
 }
 
 TEST_F(SortIncludesTest, SortedIncludesInMultipleBlocksAreMerged) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "\n"
+                    "#include \"b.h\""));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, SupportClangFormatOff) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "// clang-format off\n"
-            "#include <b>\n"
-            "#include <a>\n"
-            "#include <c>\n"
-            "// clang-format on",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "// clang-format off\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "// clang-format on"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "// clang-format off\n"
+               "#include <b>\n"
+               "#include <a>\n"
+               "#include <c>\n"
+               "// clang-format on",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "// clang-format off\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "// clang-format on"));
 
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "// clang-format off\r\n"
-                     "#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "// clang-format on\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "// clang-format off\r\n"
-                         "#include \"d.h\"\r\n"
-                         "#include \"b.h\"\r\n"
-                         "// clang-format on\r\n"
-                         "\r\n"
-                         "#include \"e.h\"\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"c.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 1));
+  StringRef Code = "// clang-format off\r\n"
+                   "#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "// clang-format on\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "// clang-format off\r\n"
+                       "#include \"d.h\"\r\n"
+                       "#include \"b.h\"\r\n"
+                       "// clang-format on\r\n"
+                       "\r\n"
+                       "#include \"e.h\"\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"c.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 1));
 }
 
 TEST_F(SortIncludesTest, SupportClangFormatOffCStyle) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "/* clang-format off */\n"
-            "#include <b>\n"
-            "#include <a>\n"
-            "#include <c>\n"
-            "/* clang-format on */",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format off */\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format on */"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "/* clang-format off */\n"
+               "#include <b>\n"
+               "#include <a>\n"
+               "#include <c>\n"
+               "/* clang-format on */",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format off */\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format on */"));
 
   // Not really turning it off
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "/* clang-format offically */\n"
-            "#include <a>\n"
-            "#include <b>\n"
-            "#include <c>\n"
-            "/* clang-format onwards */",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format offically */\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "#include <c>\n"
-                 "/* clang-format onwards */",
-                 "input.h", 2));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "/* clang-format offically */\n"
+               "#include <a>\n"
+               "#include <b>\n"
+               "#include <c>\n"
+               "/* clang-format onwards */",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format offically */\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "#include <c>\n"
+                    "/* clang-format onwards */",
+                    "input.h", 2));
 }
 
 TEST_F(SortIncludesTest, IncludeSortingCanBeDisabled) {
   FmtStyle.SortIncludes = FormatStyle::SI_Never;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"b.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "input.h", 0));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"b.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, MixIncludeAndImport) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#import \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#import \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#import \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#import \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, FixTrailingComments) {
-  EXPECT_EQ("#include \"a.h\"  // comment\n"
-            "#include \"bb.h\" // comment\n"
-            "#include \"ccc.h\"",
-            sort("#include \"a.h\" // comment\n"
-                 "#include \"ccc.h\"\n"
-                 "#include \"bb.h\" // comment"));
+  verifyFormat("#include \"a.h\"  // comment\n"
+               "#include \"bb.h\" // comment\n"
+               "#include \"ccc.h\"",
+               sort("#include \"a.h\" // comment\n"
+                    "#include \"ccc.h\"\n"
+                    "#include \"bb.h\" // comment"));
 }
 
 TEST_F(SortIncludesTest, LeadingWhitespace) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort(" #include \"a.h\"\n"
-                 "  #include \"c.h\"\n"
-                 "   #include \"b.h\""));
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("# include \"a.h\"\n"
-                 "#  include \"c.h\"\n"
-                 "#   include \"b.h\""));
-  EXPECT_EQ("#include \"a.h\"", sort("#include \"a.h\"\n"
-                                     " #include \"a.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort(" #include \"a.h\"\n"
+                    "  #include \"c.h\"\n"
+                    "   #include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("# include \"a.h\"\n"
+                    "#  include \"c.h\"\n"
+                    "#   include \"b.h\""));
+  verifyFormat("#include \"a.h\"", sort("#include \"a.h\"\n"
+                                        " #include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, TrailingWhitespace) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\" \n"
-                 "#include \"c.h\"  \n"
-                 "#include \"b.h\"   "));
-  EXPECT_EQ("#include \"a.h\"", sort("#include \"a.h\"\n"
-                                     "#include \"a.h\" "));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\" \n"
+                    "#include \"c.h\"  \n"
+                    "#include \"b.h\"   "));
+  verifyFormat("#include \"a.h\"", sort("#include \"a.h\"\n"
+                                        "#include \"a.h\" "));
 }
 
 TEST_F(SortIncludesTest, GreaterInComment) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\" // >\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\" // >"));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\" // >\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\" // >"));
 }
 
 TEST_F(SortIncludesTest, SortsLocallyInEachBlock) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "\n"
-            "#include \"b.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "#include \"b.h\"",
-                 "input.h", 0));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "\n"
+               "#include \"b.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "#include \"b.h\"",
+                    "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, SortsAllBlocksWhenMerging) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, CommentsAlwaysSeparateGroups) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "// comment\n"
-            "#include \"b.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"a.h\"\n"
-                 "// comment\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "// comment\n"
+               "#include \"b.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"a.h\"\n"
+                    "// comment\n"
+                    "#include \"b.h\""));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "// comment\n"
-            "#include \"b.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"a.h\"\n"
-                 "// comment\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "// comment\n"
+               "#include \"b.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"a.h\"\n"
+                    "// comment\n"
+                    "#include \"b.h\""));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "// comment\n"
-            "#include \"b.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"a.h\"\n"
-                 "// comment\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "// comment\n"
+               "#include \"b.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"a.h\"\n"
+                    "// comment\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, HandlesAngledIncludesAsSeparateBlocks) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "#include <array>\n"
-            "#include <b.h>\n"
-            "#include <d.h>\n"
-            "#include <vector>",
-            sort("#include <vector>\n"
-                 "#include <d.h>\n"
-                 "#include <array>\n"
-                 "#include <b.h>\n"
-                 "#include \"c.h\"\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "#include <array>\n"
+               "#include <b.h>\n"
+               "#include <d.h>\n"
+               "#include <vector>",
+               sort("#include <vector>\n"
+                    "#include <d.h>\n"
+                    "#include <array>\n"
+                    "#include <b.h>\n"
+                    "#include \"c.h\"\n"
+                    "#include \"a.h\""));
 
   FmtStyle = getGoogleStyle(FormatStyle::LK_Cpp);
-  EXPECT_EQ("#include <b.h>\n"
-            "#include <d.h>\n"
-            "\n"
-            "#include <array>\n"
-            "#include <vector>\n"
-            "\n"
-            "#include \"a.h\"\n"
-            "#include \"c.h\"",
-            sort("#include <vector>\n"
-                 "#include <d.h>\n"
-                 "#include <array>\n"
-                 "#include <b.h>\n"
-                 "#include \"c.h\"\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include <b.h>\n"
+               "#include <d.h>\n"
+               "\n"
+               "#include <array>\n"
+               "#include <vector>\n"
+               "\n"
+               "#include \"a.h\"\n"
+               "#include \"c.h\"",
+               sort("#include <vector>\n"
+                    "#include <d.h>\n"
+                    "#include <array>\n"
+                    "#include <b.h>\n"
+                    "#include \"c.h\"\n"
+                    "#include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, RegroupsAngledIncludesInSeparateBlocks) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"c.h\"\n"
-            "\n"
-            "#include <b.h>\n"
-            "#include <d.h>",
-            sort("#include <d.h>\n"
-                 "#include <b.h>\n"
-                 "#include \"c.h\"\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"c.h\"\n"
+               "\n"
+               "#include <b.h>\n"
+               "#include <d.h>",
+               sort("#include <d.h>\n"
+                    "#include <b.h>\n"
+                    "#include \"c.h\"\n"
+                    "#include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, HandlesMultilineIncludes) {
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"a.h\"\n"
-                 "#include \\\n"
-                 "\"c.h\"\n"
-                 "#include \"b.h\""));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"a.h\"\n"
+                    "#include \\\n"
+                    "\"c.h\"\n"
+                    "#include \"b.h\""));
 }
 
 TEST_F(SortIncludesTest, HandlesTrailingCommentsWithAngleBrackets) {
   // Regression test from the discussion at https://reviews.llvm.org/D121370.
-  EXPECT_EQ("#include <cstdint>\n"
-            "\n"
-            "#include \"util/bar.h\"\n"
-            "#include \"util/foo/foo.h\" // foo<T>",
-            sort("#include <cstdint>\n"
-                 "\n"
-                 "#include \"util/bar.h\"\n"
-                 "#include \"util/foo/foo.h\" // foo<T>",
-                 /*FileName=*/"input.cc",
-                 /*ExpectedNumRanges=*/0));
+  verifyFormat("#include <cstdint>\n"
+               "\n"
+               "#include \"util/bar.h\"\n"
+               "#include \"util/foo/foo.h\" // foo<T>",
+               sort("#include <cstdint>\n"
+                    "\n"
+                    "#include \"util/bar.h\"\n"
+                    "#include \"util/foo/foo.h\" // foo<T>",
+                    /*FileName=*/"input.cc",
+                    /*ExpectedNumRanges=*/0));
 }
 
 TEST_F(SortIncludesTest, LeavesMainHeaderFirst) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.cc"));
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.cc"));
-  EXPECT_EQ("#include \"llvm/input.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/input.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "input.mm"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.cc"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.cc"));
+  verifyFormat("#include \"llvm/input.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/input.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "input.mm"));
 
   // Don't allow prefixes.
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/not_a.h\"",
-            sort("#include \"llvm/not_a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.cc"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/not_a.h\"",
+               sort("#include \"llvm/not_a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.cc"));
 
   // Don't do this for _main and other suffixes.
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_main.cc"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_main.cc"));
 
   // Don't do this in headers.
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.h"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.h"));
 
   // Only do this in the first #include block.
-  EXPECT_EQ("#include <a>\n"
-            "\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include <a>\n"
-                 "\n"
-                 "#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a.cc"));
+  verifyFormat("#include <a>\n"
+               "\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include <a>\n"
+                    "\n"
+                    "#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a.cc"));
 
   // Only recognize the first #include with a matching basename as main include.
-  EXPECT_EQ("#include \"a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"b.h\"\n"
-                 "#include \"a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"llvm/a.h\"",
-                 "a.cc"));
+  verifyFormat("#include \"a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"b.h\"\n"
+                    "#include \"a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"llvm/a.h\"",
+                    "a.cc"));
 }
 
 TEST_F(SortIncludesTest, LeavesMainHeaderFirstInAdditionalExtensions) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?|(Impl)?$";
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.xxx"));
-  EXPECT_EQ("#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"llvm/a.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "aImpl.hpp"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.xxx"));
+  verifyFormat("#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"llvm/a.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "aImpl.hpp"));
 
   // .cpp extension is considered "main" by default
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "aImpl.cpp"));
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.cpp"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "aImpl.cpp"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.cpp"));
 
   // Allow additional filenames / extensions
   Style.IncludeIsMainSourceRegex = "(Impl\\.hpp)|(\\.xxx)$";
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "a_test.xxx"));
-  EXPECT_EQ("#include \"llvm/a.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"llvm/a.h\"\n"
-                 "#include \"c.h\"\n"
-                 "#include \"b.h\"",
-                 "aImpl.hpp"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "a_test.xxx"));
+  verifyFormat("#include \"llvm/a.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"llvm/a.h\"\n"
+                    "#include \"c.h\"\n"
+                    "#include \"b.h\"",
+                    "aImpl.hpp"));
 }
 
 TEST_F(SortIncludesTest, RecognizeMainHeaderInAllGroups) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
 
-  EXPECT_EQ("#include \"c.h\"\n"
-            "#include \"a.h\"\n"
-            "#include \"b.h\"",
-            sort("#include \"b.h\"\n"
-                 "\n"
-                 "#include \"a.h\"\n"
-                 "#include \"c.h\"",
-                 "c.cc"));
+  verifyFormat("#include \"c.h\"\n"
+               "#include \"a.h\"\n"
+               "#include \"b.h\"",
+               sort("#include \"b.h\"\n"
+                    "\n"
+                    "#include \"a.h\"\n"
+                    "#include \"c.h\"",
+                    "c.cc"));
 }
 
 TEST_F(SortIncludesTest, MainHeaderIsSeparatedWhenRegroupping) {
   Style.IncludeIsMainRegex = "([-_](test|unittest))?$";
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
 
-  EXPECT_EQ("#include \"a.h\"\n"
-            "\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"",
-            sort("#include \"b.h\"\n"
-                 "\n"
-                 "#include \"a.h\"\n"
-                 "#include \"c.h\"",
-                 "a.cc"));
+  verifyFormat("#include \"a.h\"\n"
+               "\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"",
+               sort("#include \"b.h\"\n"
+                    "\n"
+                    "#include \"a.h\"\n"
+                    "#include \"c.h\"",
+                    "a.cc"));
 }
 
 TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveSorting) {
@@ -632,17 +632,17 @@ TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveSorting) {
 
   FmtStyle.SortIncludes = FormatStyle::SI_CaseInsensitive;
 
-  EXPECT_EQ("#include \"A/B.h\"\n"
-            "#include \"A/b.h\"\n"
-            "#include \"a/b.h\"\n"
-            "#include \"B/A.h\"\n"
-            "#include \"B/a.h\"",
-            sort("#include \"B/a.h\"\n"
-                 "#include \"B/A.h\"\n"
-                 "#include \"A/B.h\"\n"
-                 "#include \"a/b.h\"\n"
-                 "#include \"A/b.h\"",
-                 "a.h"));
+  verifyFormat("#include \"A/B.h\"\n"
+               "#include \"A/b.h\"\n"
+               "#include \"a/b.h\"\n"
+               "#include \"B/A.h\"\n"
+               "#include \"B/a.h\"",
+               sort("#include \"B/a.h\"\n"
+                    "#include \"B/A.h\"\n"
+                    "#include \"A/B.h\"\n"
+                    "#include \"a/b.h\"\n"
+                    "#include \"A/b.h\"",
+                    "a.h"));
 
   Style.IncludeBlocks = clang::tooling::IncludeStyle::IBS_Regroup;
   Style.IncludeCategories = {
@@ -657,17 +657,17 @@ TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveSorting) {
                            "#include \"Vlib.h\"\n"
                            "#include \"AST.h\"";
 
-  EXPECT_EQ("#include \"AST.h\"\n"
-            "#include \"qt.h\"\n"
-            "#include \"Vlib.h\"\n"
-            "#include \"vlib.h\"\n"
-            "\n"
-            "#include <Qtwhatever.h>\n"
-            "#include <qtwhatever.h>\n"
-            "\n"
-            "#include <Algorithm>\n"
-            "#include <algorithm>",
-            sort(UnsortedCode));
+  verifyFormat("#include \"AST.h\"\n"
+               "#include \"qt.h\"\n"
+               "#include \"Vlib.h\"\n"
+               "#include \"vlib.h\"\n"
+               "\n"
+               "#include <Qtwhatever.h>\n"
+               "#include <qtwhatever.h>\n"
+               "\n"
+               "#include <Algorithm>\n"
+               "#include <algorithm>",
+               sort(UnsortedCode));
 }
 
 TEST_F(SortIncludesTest, SupportCaseInsensitiveMatching) {
@@ -676,21 +676,21 @@ TEST_F(SortIncludesTest, SupportCaseInsensitiveMatching) {
 
   // Ensure both main header detection and grouping work in a case insensitive
   // manner.
-  EXPECT_EQ("#include \"llvm/A.h\"\n"
-            "#include \"b.h\"\n"
-            "#include \"c.h\"\n"
-            "#include \"LLVM/z.h\"\n"
-            "#include \"llvm/X.h\"\n"
-            "#include \"GTest/GTest.h\"\n"
-            "#include \"gmock/gmock.h\"",
-            sort("#include \"c.h\"\n"
-                 "#include \"b.h\"\n"
-                 "#include \"GTest/GTest.h\"\n"
-                 "#include \"llvm/A.h\"\n"
-                 "#include \"gmock/gmock.h\"\n"
-                 "#include \"llvm/X.h\"\n"
-                 "#include \"LLVM/z.h\"",
-                 "a_TEST.cc"));
+  verifyFormat("#include \"llvm/A.h\"\n"
+               "#include \"b.h\"\n"
+               "#include \"c.h\"\n"
+               "#include \"LLVM/z.h\"\n"
+               "#include \"llvm/X.h\"\n"
+               "#include \"GTest/GTest.h\"\n"
+               "#include \"gmock/gmock.h\"",
+               sort("#include \"c.h\"\n"
+                    "#include \"b.h\"\n"
+                    "#include \"GTest/GTest.h\"\n"
+                    "#include \"llvm/A.h\"\n"
+                    "#include \"gmock/gmock.h\"\n"
+                    "#include \"llvm/X.h\"\n"
+                    "#include \"LLVM/z.h\"",
+                    "a_TEST.cc"));
 }
 
 TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveMachting) {
@@ -711,57 +711,57 @@ TEST_F(SortIncludesTest, SupportOptionalCaseSensitiveMachting) {
                            "#include <qtwhatever.h>\n"
                            "#include <QtGlobal>";
 
-  EXPECT_EQ("#include \"qa.h\"\n"
-            "#include \"qt.h\"\n"
-            "\n"
-            "#include <qtwhatever.h>\n"
-            "#include <windows.h>\n"
-            "\n"
-            "#include <QLabel>\n"
-            "#include <QWidget>\n"
-            "#include <QtGlobal>\n"
-            "#include <queue>\n"
-            "\n"
-            "#include <algorithm>",
-            sort(UnsortedCode));
+  verifyFormat("#include \"qa.h\"\n"
+               "#include \"qt.h\"\n"
+               "\n"
+               "#include <qtwhatever.h>\n"
+               "#include <windows.h>\n"
+               "\n"
+               "#include <QLabel>\n"
+               "#include <QWidget>\n"
+               "#include <QtGlobal>\n"
+               "#include <queue>\n"
+               "\n"
+               "#include <algorithm>",
+               sort(UnsortedCode));
 
   Style.IncludeCategories[2].RegexIsCaseSensitive = true;
   Style.IncludeCategories[3].RegexIsCaseSensitive = true;
-  EXPECT_EQ("#include \"qa.h\"\n"
-            "#include \"qt.h\"\n"
-            "\n"
-            "#include <qtwhatever.h>\n"
-            "#include <windows.h>\n"
-            "\n"
-            "#include <QLabel>\n"
-            "#include <QWidget>\n"
-            "\n"
-            "#include <QtGlobal>\n"
-            "\n"
-            "#include <algorithm>\n"
-            "#include <queue>",
-            sort(UnsortedCode));
+  verifyFormat("#include \"qa.h\"\n"
+               "#include \"qt.h\"\n"
+               "\n"
+               "#include <qtwhatever.h>\n"
+               "#include <windows.h>\n"
+               "\n"
+               "#include <QLabel>\n"
+               "#include <QWidget>\n"
+               "\n"
+               "#include <QtGlobal>\n"
+               "\n"
+               "#include <algorithm>\n"
+               "#include <queue>",
+               sort(UnsortedCode));
 }
 
 TEST_F(SortIncludesTest, NegativePriorities) {
   Style.IncludeCategories = {{".*important_os_header.*", -1, 0, false},
                              {".*", 1, 0, false}};
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "#include \"c_main.h\"\n"
-            "#include \"a_other.h\"",
-            sort("#include \"c_main.h\"\n"
-                 "#include \"a_other.h\"\n"
-                 "#include \"important_os_header.h\"",
-                 "c_main.cc"));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "#include \"c_main.h\"\n"
+               "#include \"a_other.h\"",
+               sort("#include \"c_main.h\"\n"
+                    "#include \"a_other.h\"\n"
+                    "#include \"important_os_header.h\"",
+                    "c_main.cc"));
 
   // check stable when re-run
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "#include \"c_main.h\"\n"
-            "#include \"a_other.h\"",
-            sort("#include \"important_os_header.h\"\n"
-                 "#include \"c_main.h\"\n"
-                 "#include \"a_other.h\"",
-                 "c_main.cc", 0));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "#include \"c_main.h\"\n"
+               "#include \"a_other.h\"",
+               sort("#include \"important_os_header.h\"\n"
+                    "#include \"c_main.h\"\n"
+                    "#include \"a_other.h\"",
+                    "c_main.cc", 0));
 }
 
 TEST_F(SortIncludesTest, PriorityGroupsAreSeparatedWhenRegroupping) {
@@ -769,34 +769,34 @@ TEST_F(SortIncludesTest, PriorityGroupsAreSeparatedWhenRegroupping) {
                              {".*", 1, 0, false}};
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
 
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "\n"
-            "#include \"c_main.h\"\n"
-            "\n"
-            "#include \"a_other.h\"",
-            sort("#include \"c_main.h\"\n"
-                 "#include \"a_other.h\"\n"
-                 "#include \"important_os_header.h\"",
-                 "c_main.cc"));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "\n"
+               "#include \"c_main.h\"\n"
+               "\n"
+               "#include \"a_other.h\"",
+               sort("#include \"c_main.h\"\n"
+                    "#include \"a_other.h\"\n"
+                    "#include \"important_os_header.h\"",
+                    "c_main.cc"));
 
   // check stable when re-run
-  EXPECT_EQ("#include \"important_os_header.h\"\n"
-            "\n"
-            "#include \"c_main.h\"\n"
-            "\n"
-            "#include \"a_other.h\"",
-            sort("#include \"important_os_header.h\"\n"
-                 "\n"
-                 "#include \"c_main.h\"\n"
-                 "\n"
-                 "#include \"a_other.h\"",
-                 "c_main.cc", 0));
+  verifyFormat("#include \"important_os_header.h\"\n"
+               "\n"
+               "#include \"c_main.h\"\n"
+               "\n"
+               "#include \"a_other.h\"",
+               sort("#include \"important_os_header.h\"\n"
+                    "\n"
+                    "#include \"c_main.h\"\n"
+                    "\n"
+                    "#include \"a_other.h\"",
+                    "c_main.cc", 0));
 }
 
 TEST_F(SortIncludesTest, CalculatesCorrectCursorPosition) {
-  std::string Code = "#include <ccc>\n"    // Start of line: 0
-                     "#include <bbbbbb>\n" // Start of line: 15
-                     "#include <a>\n";     // Start of line: 33
+  StringRef Code = "#include <ccc>\n"    // Start of line: 0
+                   "#include <bbbbbb>\n" // Start of line: 15
+                   "#include <a>\n";     // Start of line: 33
   EXPECT_EQ(31u, newCursor(Code, 0));
   EXPECT_EQ(13u, newCursor(Code, 15));
   EXPECT_EQ(0u, newCursor(Code, 33));
@@ -808,14 +808,14 @@ TEST_F(SortIncludesTest, CalculatesCorrectCursorPosition) {
 
 TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionWithRegrouping) {
   Style.IncludeBlocks = Style.IBS_Regroup;
-  std::string Code = "#include \"b\"\n"      // Start of line: 0
-                     "\n"                    // Start of line: 13
-                     "#include \"aa\"\n"     // Start of line: 14
-                     "int i;";               // Start of line: 28
-  std::string Expected = "#include \"aa\"\n" // Start of line: 0
-                         "#include \"b\"\n"  // Start of line: 14
-                         "int i;";           // Start of line: 27
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"b\"\n"      // Start of line: 0
+                   "\n"                    // Start of line: 13
+                   "#include \"aa\"\n"     // Start of line: 14
+                   "int i;";               // Start of line: 28
+  StringRef Expected = "#include \"aa\"\n" // Start of line: 0
+                       "#include \"b\"\n"  // Start of line: 14
+                       "int i;";           // Start of line: 27
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(12u, newCursor(Code, 26)); // Closing quote of "aa"
   EXPECT_EQ(26u, newCursor(Code, 27)); // Newline after "aa"
   EXPECT_EQ(27u, newCursor(Code, 28)); // Start of last line
@@ -827,14 +827,14 @@ TEST_F(SortIncludesTest,
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {
       {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
-  std::string Code = "#include \"a\"\r\n" // Start of line: 0
-                     "\r\n"               // Start of line: 14
-                     "#include \"b\"\r\n" // Start of line: 16
-                     "\r\n"               // Start of line: 30
-                     "#include \"c\"\r\n" // Start of line: 32
-                     "\r\n"               // Start of line: 46
-                     "int i;";            // Start of line: 48
-  verifyNoChange(Code);
+  StringRef Code = "#include \"a\"\r\n" // Start of line: 0
+                   "\r\n"               // Start of line: 14
+                   "#include \"b\"\r\n" // Start of line: 16
+                   "\r\n"               // Start of line: 30
+                   "#include \"c\"\r\n" // Start of line: 32
+                   "\r\n"               // Start of line: 46
+                   "int i;";            // Start of line: 48
+  verifyFormat(Code);
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(14u, newCursor(Code, 14));
   EXPECT_EQ(16u, newCursor(Code, 16));
@@ -850,19 +850,19 @@ TEST_F(
   Style.IncludeBlocks = Style.IBS_Regroup;
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {{".*", 0, 0, false}};
-  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
-                     "\r\n"                   // Start of line: 14
-                     "#include \"b\"\r\n"     // Start of line: 16
-                     "\r\n"                   // Start of line: 30
-                     "#include \"c\"\r\n"     // Start of line: 32
-                     "\r\n"                   // Start of line: 46
-                     "int i;";                // Start of line: 48
-  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
-                         "#include \"b\"\r\n" // Start of line: 14
-                         "#include \"c\"\r\n" // Start of line: 28
-                         "\r\n"               // Start of line: 42
-                         "int i;";            // Start of line: 44
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"a\"\r\n"     // Start of line: 0
+                   "\r\n"                   // Start of line: 14
+                   "#include \"b\"\r\n"     // Start of line: 16
+                   "\r\n"                   // Start of line: 30
+                   "#include \"c\"\r\n"     // Start of line: 32
+                   "\r\n"                   // Start of line: 46
+                   "int i;";                // Start of line: 48
+  StringRef Expected = "#include \"a\"\r\n" // Start of line: 0
+                       "#include \"b\"\r\n" // Start of line: 14
+                       "#include \"c\"\r\n" // Start of line: 28
+                       "\r\n"               // Start of line: 42
+                       "int i;";            // Start of line: 44
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(
       14u,
@@ -885,19 +885,19 @@ TEST_F(
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {
       {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
-  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
-                     "#include \"b\"\r\n"     // Start of line: 14
-                     "#include \"c\"\r\n"     // Start of line: 28
-                     "\r\n"                   // Start of line: 42
-                     "int i;";                // Start of line: 44
-  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
-                         "\r\n"               // Start of line: 14
-                         "#include \"b\"\r\n" // Start of line: 16
-                         "\r\n"               // Start of line: 30
-                         "#include \"c\"\r\n" // Start of line: 32
-                         "\r\n"               // Start of line: 46
-                         "int i;";            // Start of line: 48
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"a\"\r\n"     // Start of line: 0
+                   "#include \"b\"\r\n"     // Start of line: 14
+                   "#include \"c\"\r\n"     // Start of line: 28
+                   "\r\n"                   // Start of line: 42
+                   "int i;";                // Start of line: 44
+  StringRef Expected = "#include \"a\"\r\n" // Start of line: 0
+                       "\r\n"               // Start of line: 14
+                       "#include \"b\"\r\n" // Start of line: 16
+                       "\r\n"               // Start of line: 30
+                       "#include \"c\"\r\n" // Start of line: 32
+                       "\r\n"               // Start of line: 46
+                       "int i;";            // Start of line: 48
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(15u, newCursor(Code, 16));
   EXPECT_EQ(30u, newCursor(Code, 32));
@@ -912,21 +912,21 @@ TEST_F(
   FmtStyle.LineEnding = FormatStyle::LE_CRLF;
   Style.IncludeCategories = {
       {"^\"a\"", 0, 0, false}, {"^\"b\"", 1, 1, false}, {".*", 2, 2, false}};
-  std::string Code = "#include \"a\"\r\n"     // Start of line: 0
-                     "\r\n"                   // Start of line: 14
-                     "#include \"c\"\r\n"     // Start of line: 16
-                     "\r\n"                   // Start of line: 30
-                     "#include \"b\"\r\n"     // Start of line: 32
-                     "\r\n"                   // Start of line: 46
-                     "int i;";                // Start of line: 48
-  std::string Expected = "#include \"a\"\r\n" // Start of line: 0
-                         "\r\n"               // Start of line: 14
-                         "#include \"b\"\r\n" // Start of line: 16
-                         "\r\n"               // Start of line: 30
-                         "#include \"c\"\r\n" // Start of line: 32
-                         "\r\n"               // Start of line: 46
-                         "int i;";            // Start of line: 48
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include \"a\"\r\n"     // Start of line: 0
+                   "\r\n"                   // Start of line: 14
+                   "#include \"c\"\r\n"     // Start of line: 16
+                   "\r\n"                   // Start of line: 30
+                   "#include \"b\"\r\n"     // Start of line: 32
+                   "\r\n"                   // Start of line: 46
+                   "int i;";                // Start of line: 48
+  StringRef Expected = "#include \"a\"\r\n" // Start of line: 0
+                       "\r\n"               // Start of line: 14
+                       "#include \"b\"\r\n" // Start of line: 16
+                       "\r\n"               // Start of line: 30
+                       "#include \"c\"\r\n" // Start of line: 32
+                       "\r\n"               // Start of line: 46
+                       "int i;";            // Start of line: 48
+  verifyFormat(Expected, sort(Code));
   EXPECT_EQ(0u, newCursor(Code, 0));
   EXPECT_EQ(14u, newCursor(Code, 14));
   EXPECT_EQ(30u, newCursor(Code, 32));
@@ -938,88 +938,88 @@ TEST_F(
 #endif
 
 TEST_F(SortIncludesTest, DeduplicateIncludes) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <c>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <c>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "#include <c>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "#include <c>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "#include <c>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "#include <c>"));
 }
 
 TEST_F(SortIncludesTest, SortAndDeduplicateIncludes) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "#include <b>\n"
-                 "#include <b>\n"
-                 "#include <c>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "#include <b>\n"
+                    "#include <b>\n"
+                    "#include <c>\n"
+                    "#include <b>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Merge;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <c>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <c>\n"
+                    "#include <b>"));
 
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <b>\n"
-                 "#include <a>\n"
-                 "\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <c>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <b>\n"
+                    "#include <a>\n"
+                    "\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <c>\n"
+                    "#include <b>"));
 }
 
 TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionAfterDeduplicate) {
-  std::string Code = "#include <b>\n"      // Start of line: 0
-                     "#include <a>\n"      // Start of line: 13
-                     "#include <b>\n"      // Start of line: 26
-                     "#include <b>\n"      // Start of line: 39
-                     "#include <c>\n"      // Start of line: 52
-                     "#include <b>\n";     // Start of line: 65
-  std::string Expected = "#include <a>\n"  // Start of line: 0
-                         "#include <b>\n"  // Start of line: 13
-                         "#include <c>\n"; // Start of line: 26
-  EXPECT_EQ(Expected, sort(Code));
+  StringRef Code = "#include <b>\n"      // Start of line: 0
+                   "#include <a>\n"      // Start of line: 13
+                   "#include <b>\n"      // Start of line: 26
+                   "#include <b>\n"      // Start of line: 39
+                   "#include <c>\n"      // Start of line: 52
+                   "#include <b>\n";     // Start of line: 65
+  StringRef Expected = "#include <a>\n"  // Start of line: 0
+                       "#include <b>\n"  // Start of line: 13
+                       "#include <c>\n"; // Start of line: 26
+  verifyFormat(Expected, sort(Code));
   // Cursor on 'i' in "#include <a>".
   EXPECT_EQ(1u, newCursor(Code, 14));
   // Cursor on 'b' in "#include <b>".
@@ -1033,26 +1033,26 @@ TEST_F(SortIncludesTest, CalculatesCorrectCursorPositionAfterDeduplicate) {
 }
 
 TEST_F(SortIncludesTest, DeduplicateLocallyInEachBlock) {
-  EXPECT_EQ("#include <a>\n"
-            "#include <b>\n"
-            "\n"
-            "#include <b>\n"
-            "#include <c>",
-            sort("#include <a>\n"
-                 "#include <b>\n"
-                 "\n"
-                 "#include <c>\n"
-                 "#include <b>\n"
-                 "#include <b>"));
+  verifyFormat("#include <a>\n"
+               "#include <b>\n"
+               "\n"
+               "#include <b>\n"
+               "#include <c>",
+               sort("#include <a>\n"
+                    "#include <b>\n"
+                    "\n"
+                    "#include <c>\n"
+                    "#include <b>\n"
+                    "#include <b>"));
 }
 
 TEST_F(SortIncludesTest, ValidAffactedRangesAfterDeduplicatingIncludes) {
-  std::string Code = "#include <a>\n"
-                     "#include <b>\n"
-                     "#include <a>\n"
-                     "#include <a>\n"
-                     "\n"
-                     "   int     x ;";
+  StringRef Code = "#include <a>\n"
+                   "#include <b>\n"
+                   "#include <a>\n"
+                   "#include <a>\n"
+                   "\n"
+                   "   int     x ;";
   std::vector<tooling::Range> Ranges = {tooling::Range(0, 52)};
   auto Replaces = sortIncludes(FmtStyle, Code, Ranges, "input.cpp");
   Ranges = tooling::calculateRangesAfterReplacements(Replaces, Ranges);
@@ -1062,80 +1062,78 @@ TEST_F(SortIncludesTest, ValidAffactedRangesAfterDeduplicatingIncludes) {
 }
 
 TEST_F(SortIncludesTest, DoNotSortLikelyXml) {
-  EXPECT_EQ("<!--;\n"
-            "#include <b>\n"
-            "#include <a>\n"
-            "-->",
-            sort("<!--;\n"
-                 "#include <b>\n"
-                 "#include <a>\n"
-                 "-->",
-                 "input.h", 0));
+  verifyFormat("<!--;\n"
+               "#include <b>\n"
+               "#include <a>\n"
+               "-->",
+               sort("<!--;\n"
+                    "#include <b>\n"
+                    "#include <a>\n"
+                    "-->",
+                    "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, DoNotOutputReplacementsForSortedBlocksWithRegrouping) {
   Style.IncludeBlocks = Style.IBS_Regroup;
-  std::string Code = R"(
-#include "b.h"
-
-#include <a.h>
-)";
-  EXPECT_EQ(Code, sort(Code, "input.h", 0));
+  StringRef Code = "#include \"b.h\"\n"
+                   "\n"
+                   "#include <a.h>";
+  verifyFormat(Code, sort(Code, "input.h", 0));
 }
 
 TEST_F(SortIncludesTest,
        DoNotOutputReplacementsForSortedBlocksWithRegroupingWindows) {
   Style.IncludeBlocks = Style.IBS_Regroup;
-  std::string Code = "#include \"b.h\"\r\n"
-                     "\r\n"
-                     "#include <a.h>\r\n";
-  EXPECT_EQ(Code, sort(Code, "input.h", 0));
+  StringRef Code = "#include \"b.h\"\r\n"
+                   "\r\n"
+                   "#include <a.h>\r\n";
+  verifyFormat(Code, sort(Code, "input.h", 0));
 }
 
 TEST_F(SortIncludesTest, MainIncludeChar) {
-  std::string Code = "#include <a>\n"
-                     "#include \"quote/input.h\"\n"
-                     "#include <angle-bracket/input.h>\n";
+  StringRef Code = "#include <a>\n"
+                   "#include \"quote/input.h\"\n"
+                   "#include <angle-bracket/input.h>\n";
 
   // Default behavior
-  EXPECT_EQ("#include \"quote/input.h\"\n"
-            "#include <a>\n"
-            "#include <angle-bracket/input.h>\n",
-            sort(Code, "input.cc", 1));
+  verifyFormat("#include \"quote/input.h\"\n"
+               "#include <a>\n"
+               "#include <angle-bracket/input.h>\n",
+               sort(Code, "input.cc", 1));
 
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Quote;
-  EXPECT_EQ("#include \"quote/input.h\"\n"
-            "#include <a>\n"
-            "#include <angle-bracket/input.h>\n",
-            sort(Code, "input.cc", 1));
+  verifyFormat("#include \"quote/input.h\"\n"
+               "#include <a>\n"
+               "#include <angle-bracket/input.h>\n",
+               sort(Code, "input.cc", 1));
 
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_AngleBracket;
-  EXPECT_EQ("#include <angle-bracket/input.h>\n"
-            "#include \"quote/input.h\"\n"
-            "#include <a>\n",
-            sort(Code, "input.cc", 1));
+  verifyFormat("#include <angle-bracket/input.h>\n"
+               "#include \"quote/input.h\"\n"
+               "#include <a>\n",
+               sort(Code, "input.cc", 1));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharAnyPickQuote) {
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Any;
-  EXPECT_EQ("#include \"input.h\"\n"
-            "#include <a>\n"
-            "#include <b>\n",
-            sort("#include <a>\n"
-                 "#include \"input.h\"\n"
-                 "#include <b>\n",
-                 "input.cc", 1));
+  verifyFormat("#include \"input.h\"\n"
+               "#include <a>\n"
+               "#include <b>\n",
+               sort("#include <a>\n"
+                    "#include \"input.h\"\n"
+                    "#include <b>\n",
+                    "input.cc", 1));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharAnyPickAngleBracket) {
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Any;
-  EXPECT_EQ("#include <input.h>\n"
-            "#include <a>\n"
-            "#include <b>\n",
-            sort("#include <a>\n"
-                 "#include <input.h>\n"
-                 "#include <b>\n",
-                 "input.cc", 1));
+  verifyFormat("#include <input.h>\n"
+               "#include <a>\n"
+               "#include <b>\n",
+               sort("#include <a>\n"
+                    "#include <input.h>\n"
+                    "#include <b>\n",
+                    "input.cc", 1));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharQuoteAndRegroup) {
@@ -1144,28 +1142,28 @@ TEST_F(SortIncludesTest, MainIncludeCharQuoteAndRegroup) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_Quote;
 
-  EXPECT_EQ("#include \"lib-b/input.h\"\n"
-            "\n"
-            "#include <lib-a/h-1.h>\n"
-            "#include <lib-a/h-3.h>\n"
-            "#include <lib-a/input.h>\n"
-            "\n"
-            "#include <lib-b/h-1.h>\n"
-            "#include <lib-b/h-3.h>\n"
-            "\n"
-            "#include <lib-c/h-1.h>\n"
-            "#include <lib-c/h-2.h>\n"
-            "#include <lib-c/h-3.h>\n",
-            sort("#include <lib-c/h-1.h>\n"
-                 "#include <lib-c/h-2.h>\n"
-                 "#include <lib-c/h-3.h>\n"
-                 "#include <lib-b/h-1.h>\n"
-                 "#include \"lib-b/input.h\"\n"
-                 "#include <lib-b/h-3.h>\n"
-                 "#include <lib-a/h-1.h>\n"
-                 "#include <lib-a/input.h>\n"
-                 "#include <lib-a/h-3.h>\n",
-                 "input.cc"));
+  verifyFormat("#include \"lib-b/input.h\"\n"
+               "\n"
+               "#include <lib-a/h-1.h>\n"
+               "#include <lib-a/h-3.h>\n"
+               "#include <lib-a/input.h>\n"
+               "\n"
+               "#include <lib-b/h-1.h>\n"
+               "#include <lib-b/h-3.h>\n"
+               "\n"
+               "#include <lib-c/h-1.h>\n"
+               "#include <lib-c/h-2.h>\n"
+               "#include <lib-c/h-3.h>\n",
+               sort("#include <lib-c/h-1.h>\n"
+                    "#include <lib-c/h-2.h>\n"
+                    "#include <lib-c/h-3.h>\n"
+                    "#include <lib-b/h-1.h>\n"
+                    "#include \"lib-b/input.h\"\n"
+                    "#include <lib-b/h-3.h>\n"
+                    "#include <lib-a/h-1.h>\n"
+                    "#include <lib-a/input.h>\n"
+                    "#include <lib-a/h-3.h>\n",
+                    "input.cc"));
 }
 
 TEST_F(SortIncludesTest, MainIncludeCharAngleBracketAndRegroup) {
@@ -1174,60 +1172,60 @@ TEST_F(SortIncludesTest, MainIncludeCharAngleBracketAndRegroup) {
   Style.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   Style.MainIncludeChar = tooling::IncludeStyle::MICD_AngleBracket;
 
-  EXPECT_EQ("#include <lib-a/input.h>\n"
-            "\n"
-            "#include <lib-a/h-1.h>\n"
-            "#include <lib-a/h-3.h>\n"
-            "\n"
-            "#include \"lib-b/input.h\"\n"
-            "#include <lib-b/h-1.h>\n"
-            "#include <lib-b/h-3.h>\n"
-            "\n"
-            "#include <lib-c/h-1.h>\n"
-            "#include <lib-c/h-2.h>\n"
-            "#include <lib-c/h-3.h>\n",
-            sort("#include <lib-c/h-1.h>\n"
-                 "#include <lib-c/h-2.h>\n"
-                 "#include <lib-c/h-3.h>\n"
-                 "#include <lib-b/h-1.h>\n"
-                 "#include \"lib-b/input.h\"\n"
-                 "#include <lib-b/h-3.h>\n"
-                 "#include <lib-a/h-1.h>\n"
-                 "#include <lib-a/input.h>\n"
-                 "#include <lib-a/h-3.h>\n",
-                 "input.cc"));
+  verifyFormat("#include <lib-a/input.h>\n"
+               "\n"
+               "#include <lib-a/h-1.h>\n"
+               "#include <lib-a/h-3.h>\n"
+               "\n"
+               "#include \"lib-b/input.h\"\n"
+               "#include <lib-b/h-1.h>\n"
+               "#include <lib-b/h-3.h>\n"
+               "\n"
+               "#include <lib-c/h-1.h>\n"
+               "#include <lib-c/h-2.h>\n"
+               "#include <lib-c/h-3.h>\n",
+               sort("#include <lib-c/h-1.h>\n"
+                    "#include <lib-c/h-2.h>\n"
+                    "#include <lib-c/h-3.h>\n"
+                    "#include <lib-b/h-1.h>\n"
+                    "#include \"lib-b/input.h\"\n"
+                    "#include <lib-b/h-3.h>\n"
+                    "#include <lib-a/h-1.h>\n"
+                    "#include <lib-a/input.h>\n"
+                    "#include <lib-a/h-3.h>\n",
+                    "input.cc"));
 }
 
 TEST_F(SortIncludesTest, DoNotRegroupGroupsInGoogleObjCStyle) {
   FmtStyle = getGoogleStyle(FormatStyle::LK_ObjC);
 
-  EXPECT_EQ("#include <a.h>\n"
-            "#include <b.h>\n"
-            "#include \"a.h\"",
-            sort("#include <b.h>\n"
-                 "#include <a.h>\n"
-                 "#include \"a.h\""));
+  verifyFormat("#include <a.h>\n"
+               "#include <b.h>\n"
+               "#include \"a.h\"",
+               sort("#include <b.h>\n"
+                    "#include <a.h>\n"
+                    "#include \"a.h\""));
 }
 
 TEST_F(SortIncludesTest, DoNotTreatPrecompiledHeadersAsFirstBlock) {
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "#pragma hdrstop\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "#include \"b.h\"\r\n"
-                         "#include \"d.h\"\r\n"
-                         "#pragma hdrstop\r\n"
-                         "\r\n"
-                         "#include \"e.h\"\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"c.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 2));
+  StringRef Code = "#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "#pragma hdrstop\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "#include \"b.h\"\r\n"
+                       "#include \"d.h\"\r\n"
+                       "#pragma hdrstop\r\n"
+                       "\r\n"
+                       "#include \"e.h\"\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"c.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 2));
 
   Code = "#include \"d.h\"\n"
          "#include \"b.h\"\n"
@@ -1245,59 +1243,59 @@ TEST_F(SortIncludesTest, DoNotTreatPrecompiledHeadersAsFirstBlock) {
              "#include \"a.h\"\n"
              "#include \"c.h\"\n";
 
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 2));
+  verifyFormat(Expected, sort(Code, "e.cpp", 2));
 }
 
 TEST_F(SortIncludesTest, skipUTF8ByteOrderMarkMerge) {
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "\xEF\xBB\xBF#include \"e.h\"\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"b.h\"\r\n"
-                         "#include \"c.h\"\r\n"
-                         "#include \"d.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 1));
+  StringRef Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "\xEF\xBB\xBF#include \"e.h\"\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"b.h\"\r\n"
+                       "#include \"c.h\"\r\n"
+                       "#include \"d.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 1));
 }
 
 TEST_F(SortIncludesTest, skipUTF8ByteOrderMarkPreserve) {
   Style.IncludeBlocks = Style.IBS_Preserve;
-  std::string Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
-                     "#include \"b.h\"\r\n"
-                     "\r\n"
-                     "#include \"c.h\"\r\n"
-                     "#include \"a.h\"\r\n"
-                     "#include \"e.h\"\r\n";
-
-  std::string Expected = "\xEF\xBB\xBF#include \"b.h\"\r\n"
-                         "#include \"d.h\"\r\n"
-                         "\r\n"
-                         "#include \"a.h\"\r\n"
-                         "#include \"c.h\"\r\n"
-                         "#include \"e.h\"\r\n";
-
-  EXPECT_EQ(Expected, sort(Code, "e.cpp", 2));
+  StringRef Code = "\xEF\xBB\xBF#include \"d.h\"\r\n"
+                   "#include \"b.h\"\r\n"
+                   "\r\n"
+                   "#include \"c.h\"\r\n"
+                   "#include \"a.h\"\r\n"
+                   "#include \"e.h\"\r\n";
+
+  StringRef Expected = "\xEF\xBB\xBF#include \"b.h\"\r\n"
+                       "#include \"d.h\"\r\n"
+                       "\r\n"
+                       "#include \"a.h\"\r\n"
+                       "#include \"c.h\"\r\n"
+                       "#include \"e.h\"\r\n";
+
+  verifyFormat(Expected, sort(Code, "e.cpp", 2));
 }
 
 TEST_F(SortIncludesTest, MergeLines) {
   Style.IncludeBlocks = Style.IBS_Merge;
-  std::string Code = "#include \"c.h\"\r\n"
-                     "#include \"b\\\r\n"
-                     ".h\"\r\n"
-                     "#include \"a.h\"\r\n";
+  StringRef Code = "#include \"c.h\"\r\n"
+                   "#include \"b\\\r\n"
+                   ".h\"\r\n"
+                   "#include \"a.h\"\r\n";
 
-  std::string Expected = "#include \"a.h\"\r\n"
-                         "#include \"b\\\r\n"
-                         ".h\"\r\n"
-                         "#include \"c.h\"\r\n";
+  StringRef Expected = "#include \"a.h\"\r\n"
+                       "#include \"b\\\r\n"
+                       ".h\"\r\n"
+                       "#include \"c.h\"\r\n";
 
-  EXPECT_EQ(Expected, sort(Code, "a.cpp", 1));
+  verifyFormat(Expected, sort(Code, "a.cpp", 1));
 }
 
 TEST_F(SortIncludesTest, DisableFormatDisablesIncludeSorting) {
@@ -1305,154 +1303,154 @@ TEST_F(SortIncludesTest, DisableFormatDisablesIncludeSorting) {
                      "#include <b.h>\n";
   StringRef Unsorted = "#include <b.h>\n"
                        "#include <a.h>\n";
-  EXPECT_EQ(Sorted, sort(Unsorted));
+  verifyFormat(Sorted, sort(Unsorted));
   FmtStyle.DisableFormat = true;
-  EXPECT_EQ(Unsorted, sort(Unsorted, "input.cpp", 0));
+  verifyFormat(Unsorted, sort(Unsorted, "input.cpp", 0));
 }
 
 TEST_F(SortIncludesTest, DisableRawStringLiteralSorting) {
 
-  EXPECT_EQ("const char *t = R\"(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")\";",
-            sort("const char *t = R\"(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")\";",
-                 "test.cxx", 0));
-  EXPECT_EQ("const char *t = R\"x(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")x\";",
-            sort("const char *t = R\"x(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")x\";",
-                 "test.cxx", 0));
-  EXPECT_EQ("const char *t = R\"xyz(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")xyz\";",
-            sort("const char *t = R\"xyz(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")xyz\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("#include <a.h>\n"
-            "#include <b.h>\n"
-            "const char *t = R\"(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")\";\n"
-            "#include <c.h>\n"
-            "#include <d.h>\n"
-            "const char *t = R\"x(\n"
-            "#include <f.h>\n"
-            "#include <e.h>\n"
-            ")x\";\n"
-            "#include <g.h>\n"
-            "#include <h.h>\n"
-            "const char *t = R\"xyz(\n"
-            "#include <j.h>\n"
-            "#include <i.h>\n"
-            ")xyz\";\n"
-            "#include <k.h>\n"
-            "#include <l.h>",
-            sort("#include <b.h>\n"
-                 "#include <a.h>\n"
-                 "const char *t = R\"(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")\";\n"
-                 "#include <d.h>\n"
-                 "#include <c.h>\n"
-                 "const char *t = R\"x(\n"
-                 "#include <f.h>\n"
-                 "#include <e.h>\n"
-                 ")x\";\n"
-                 "#include <h.h>\n"
-                 "#include <g.h>\n"
-                 "const char *t = R\"xyz(\n"
-                 "#include <j.h>\n"
-                 "#include <i.h>\n"
-                 ")xyz\";\n"
-                 "#include <l.h>\n"
-                 "#include <k.h>",
-                 "test.cc", 4));
-
-  EXPECT_EQ("const char *t = R\"AMZ029amz(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AMZ029amz\";",
-            sort("const char *t = R\"AMZ029amz(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AMZ029amz\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"-AMZ029amz(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")-AMZ029amz\";",
-            sort("const char *t = R\"-AMZ029amz(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")-AMZ029amz\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AMZ029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AMZ029amz-\";",
-            sort("const char *t = R\"AMZ029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AMZ029amz-\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AM|029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AM|029amz-\";",
-            sort("const char *t = R\"AM|029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AM|029amz-\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AM[029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AM[029amz-\";",
-            sort("const char *t = R\"AM[029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AM[029amz-\";",
-                 "test.cxx", 0));
-
-  EXPECT_EQ("const char *t = R\"AM]029amz-(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")AM]029amz-\";",
-            sort("const char *t = R\"AM]029amz-(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")AM]029amz-\";",
-                 "test.cxx", 0));
+  verifyFormat("const char *t = R\"(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")\";",
+               sort("const char *t = R\"(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")\";",
+                    "test.cxx", 0));
+  verifyFormat("const char *t = R\"x(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")x\";",
+               sort("const char *t = R\"x(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")x\";",
+                    "test.cxx", 0));
+  verifyFormat("const char *t = R\"xyz(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")xyz\";",
+               sort("const char *t = R\"xyz(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")xyz\";",
+                    "test.cxx", 0));
+
+  verifyFormat("#include <a.h>\n"
+               "#include <b.h>\n"
+               "const char *t = R\"(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")\";\n"
+               "#include <c.h>\n"
+               "#include <d.h>\n"
+               "const char *t = R\"x(\n"
+               "#include <f.h>\n"
+               "#include <e.h>\n"
+               ")x\";\n"
+               "#include <g.h>\n"
+               "#include <h.h>\n"
+               "const char *t = R\"xyz(\n"
+               "#include <j.h>\n"
+               "#include <i.h>\n"
+               ")xyz\";\n"
+               "#include <k.h>\n"
+               "#include <l.h>",
+               sort("#include <b.h>\n"
+                    "#include <a.h>\n"
+                    "const char *t = R\"(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")\";\n"
+                    "#include <d.h>\n"
+                    "#include <c.h>\n"
+                    "const char *t = R\"x(\n"
+                    "#include <f.h>\n"
+                    "#include <e.h>\n"
+                    ")x\";\n"
+                    "#include <h.h>\n"
+                    "#include <g.h>\n"
+                    "const char *t = R\"xyz(\n"
+                    "#include <j.h>\n"
+                    "#include <i.h>\n"
+                    ")xyz\";\n"
+                    "#include <l.h>\n"
+                    "#include <k.h>",
+                    "test.cc", 4));
+
+  verifyFormat("const char *t = R\"AMZ029amz(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AMZ029amz\";",
+               sort("const char *t = R\"AMZ029amz(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AMZ029amz\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"-AMZ029amz(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")-AMZ029amz\";",
+               sort("const char *t = R\"-AMZ029amz(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")-AMZ029amz\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AMZ029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AMZ029amz-\";",
+               sort("const char *t = R\"AMZ029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AMZ029amz-\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AM|029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AM|029amz-\";",
+               sort("const char *t = R\"AM|029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AM|029amz-\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AM[029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AM[029amz-\";",
+               sort("const char *t = R\"AM[029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AM[029amz-\";",
+                    "test.cxx", 0));
+
+  verifyFormat("const char *t = R\"AM]029amz-(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")AM]029amz-\";",
+               sort("const char *t = R\"AM]029amz-(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")AM]029amz-\";",
+                    "test.cxx", 0));
 
 #define X "AMZ029amz{}+!%*=_:;',.<>|/?#~-$"
 
-  EXPECT_EQ("const char *t = R\"" X "(\n"
-            "#include <b.h>\n"
-            "#include <a.h>\n"
-            ")" X "\";",
-            sort("const char *t = R\"" X "(\n"
-                 "#include <b.h>\n"
-                 "#include <a.h>\n"
-                 ")" X "\";",
-                 "test.cxx", 0));
+  verifyFormat("const char *t = R\"" X "(\n"
+               "#include <b.h>\n"
+               "#include <a.h>\n"
+               ")" X "\";",
+               sort("const char *t = R\"" X "(\n"
+                    "#include <b.h>\n"
+                    "#include <a.h>\n"
+                    ")" X "\";",
+                    "test.cxx", 0));
 
 #undef X
 }

From 2f52bbeb6f6f3b7abef19cb5297773d95aa0b434 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 19 May 2024 15:20:46 -0700
Subject: [PATCH 20/60] [mlir] Use operator==(StringRef, StringRef) (NFC)
 (#92706)

---
 .../SparseTensor/IR/Detail/LvlTypeParser.cpp       | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
index 39f5cf1a750828..bb6c65a6f6ca07 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp
@@ -37,7 +37,7 @@ FailureOr<uint64_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
   uint64_t properties = 0;
   SmallVector<unsigned> structured;
 
-  if (base.compare("structured") == 0) {
+  if (base == "structured") {
     ParseResult res = parser.parseCommaSeparatedList(
         mlir::OpAsmParser::Delimiter::OptionalSquare,
         [&]() -> ParseResult { return parseStructured(parser, &structured); },
@@ -60,18 +60,18 @@ FailureOr<uint64_t> LvlTypeParser::parseLvlType(AsmParser &parser) const {
   FAILURE_IF_FAILED(res)
 
   // Set the base bit for properties.
-  if (base.compare("dense") == 0) {
+  if (base == "dense") {
     properties |= static_cast<uint64_t>(LevelFormat::Dense);
-  } else if (base.compare("batch") == 0) {
+  } else if (base == "batch") {
     properties |= static_cast<uint64_t>(LevelFormat::Batch);
-  } else if (base.compare("compressed") == 0) {
+  } else if (base == "compressed") {
     properties |= static_cast<uint64_t>(LevelFormat::Compressed);
-  } else if (base.compare("structured") == 0) {
+  } else if (base == "structured") {
     properties |= static_cast<uint64_t>(LevelFormat::NOutOfM);
     properties |= nToBits(structured[0]) | mToBits(structured[1]);
-  } else if (base.compare("loose_compressed") == 0) {
+  } else if (base == "loose_compressed") {
     properties |= static_cast<uint64_t>(LevelFormat::LooseCompressed);
-  } else if (base.compare("singleton") == 0) {
+  } else if (base == "singleton") {
     properties |= static_cast<uint64_t>(LevelFormat::Singleton);
   } else {
     parser.emitError(loc, "unknown level format: ") << base;

From 5d3f296733b66281a53dd451a983e69ae0bb482f Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Sun, 19 May 2024 16:33:17 -0700
Subject: [PATCH 21/60] [CallPromotionUtils]Implement conditional indirect call
 promotion with vtable-based comparison (#81378)

* Given the code sequence
   ```
   bb:
     %vtable = load ptr, ptr %d, !prof !8
     %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
     %1 = load ptr, ptr %vfn
     %call = tail call i32 %1(ptr %d), !prof !9
  ```
   The transformation looks like

   ```
   bb:
    %vtable = load ptr, ptr %d, align 8
    %vfn = getelementptr inbounds i8, ptr %vtable, i64 8  <-- Inst 1
    %func-addr = load ptr, ptr %vfn, align 8  <-- Inst 2
    # compare loaded pointers with address point of vtables
%1 = icmp eq ptr %vtable, getelementptr inbounds (i8, ptr @_ZTV<VTable>,
i32 16)
br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect,
!prof !18

  if.true.direct_targ:                              ; preds = %bb
    %2 = tail call i32 @<direct-call>(ptr nonnull %d)
    br label %if.end.icp

  if.false.orig_indirect:                           ; preds = %bb
    %call = tail call i32 %func-addr(ptr nonnull %d)
    br label %if.end.icp

if.end.icp: ; preds = %if.false.orig_indirect, %if.true.direct_targ
%4 = phi i32 [ %call, %if.false.orig_indirect ], [ %2,
%if.true.direct_targ ]

   ```
It's intentional that `Inst 1` and `Inst2` remains in `bb` (not in
`if.false.orig_indirect`). A follow up patch will implement code to sink
them (something like how `instcombine` would
[sink](https://github.com/llvm/llvm-project/blob/2fcfc9754a16805b81e541dc8222a8b5cf17a121/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp#L4293)
instructions along with [debug
intrinsics](https://github.com/llvm/llvm-project/blob/2fcfc9754a16805b81e541dc8222a8b5cf17a121/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp#L4356-L4368)
if possible)

* The parent patch is https://github.com/llvm/llvm-project/pull/81181
---
 .../Transforms/Utils/CallPromotionUtils.h     | 33 +++++--
 .../Transforms/Utils/CallPromotionUtils.cpp   | 32 ++++++-
 .../Utils/CallPromotionUtilsTest.cpp          | 88 +++++++++++++++++++
 3 files changed, 143 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
index fcb384ec361339..385831f457038d 100644
--- a/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/CallPromotionUtils.h
@@ -15,9 +15,12 @@
 #define LLVM_TRANSFORMS_UTILS_CALLPROMOTIONUTILS_H
 
 namespace llvm {
+template <typename T> class ArrayRef;
+class Constant;
 class CallBase;
 class CastInst;
 class Function;
+class Instruction;
 class MDNode;
 class Value;
 
@@ -41,7 +44,9 @@ bool isLegalToPromote(const CallBase &CB, Function *Callee,
 CallBase &promoteCall(CallBase &CB, Function *Callee,
                       CastInst **RetBitCast = nullptr);
 
-/// Promote the given indirect call site to conditionally call \p Callee.
+/// Promote the given indirect call site to conditionally call \p Callee. The
+/// promoted direct call instruction is predicated on `CB.getCalledOperand() ==
+/// Callee`.
 ///
 /// This function creates an if-then-else structure at the location of the call
 /// site. The original call site is moved into the "else" block. A clone of the
@@ -51,6 +56,22 @@ CallBase &promoteCall(CallBase &CB, Function *Callee,
 CallBase &promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
                                     MDNode *BranchWeights = nullptr);
 
+/// This is similar to `promoteCallWithIfThenElse` except that the condition to
+/// promote a virtual call is that \p VPtr is the same as any of \p
+/// AddressPoints.
+///
+/// This function is expected to be used on virtual calls (a subset of indirect
+/// calls). \p VPtr is the virtual table address stored in the objects, and
+/// \p AddressPoints contains vtable address points. A vtable address point is
+/// a location inside the vtable that's referenced by vpointer in C++ objects.
+///
+/// TODO: sink the address-calculation instructions of indirect callee to the
+/// indirect call fallback after transformation.
+CallBase &promoteCallWithVTableCmp(CallBase &CB, Instruction *VPtr,
+                                   Function *Callee,
+                                   ArrayRef<Constant *> AddressPoints,
+                                   MDNode *BranchWeights);
+
 /// Try to promote (devirtualize) a virtual call on an Alloca. Return true on
 /// success.
 ///
@@ -76,11 +97,11 @@ bool tryPromoteCall(CallBase &CB);
 
 /// Predicate and clone the given call site.
 ///
-/// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition compares the call site's called value to the given
-/// callee. The original call site is moved into the "else" block, and a clone
-/// of the call site is placed in the "then" block. The cloned instruction is
-/// returned.
+/// This function creates an if-then-else structure at the location of the
+/// call site. The "if" condition compares the call site's called value to
+/// the given callee. The original call site is moved into the "else" block,
+/// and a clone of the call site is placed in the "then" block. The cloned
+/// instruction is returned.
 CallBase &versionCallSite(CallBase &CB, Value *Callee, MDNode *BranchWeights);
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index 9ca9aaf9ee9dff..dda80d419999df 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -12,9 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -188,9 +190,9 @@ static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
 /// Predicate and clone the given call site.
 ///
 /// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition is specified by `Cond`. The original call site is
-/// moved into the "else" block, and a clone of the call site is placed in the
-/// "then" block. The cloned instruction is returned.
+/// site. The "if" condition is specified by `Cond`.
+/// The original call site is moved into the "else" block, and a clone of the
+/// call site is placed in the "then" block. The cloned instruction is returned.
 ///
 /// For example, the call instruction below:
 ///
@@ -518,7 +520,8 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CB.getIterator());
+      auto *Cast =
+          CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", CB.getIterator());
       CB.setArgOperand(ArgNo, Cast);
 
       // Remove any incompatible attributes for the argument.
@@ -568,6 +571,27 @@ CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
   return promoteCall(NewInst, Callee);
 }
 
+CallBase &llvm::promoteCallWithVTableCmp(CallBase &CB, Instruction *VPtr,
+                                         Function *Callee,
+                                         ArrayRef<Constant *> AddressPoints,
+                                         MDNode *BranchWeights) {
+  assert(!AddressPoints.empty() && "Caller should guarantee");
+  IRBuilder<> Builder(&CB);
+  SmallVector<Value *, 2> ICmps;
+  for (auto &AddressPoint : AddressPoints)
+    ICmps.push_back(Builder.CreateICmpEQ(VPtr, AddressPoint));
+
+  // TODO: Perform tree height reduction if the number of ICmps is high.
+  Value *Cond = Builder.CreateOr(ICmps);
+
+  // Version the indirect call site. If Cond is true, 'NewInst' will be
+  // executed, otherwise the original call site will be executed.
+  CallBase &NewInst = versionCallSiteWithCond(CB, Cond, BranchWeights);
+
+  // Promote 'NewInst' so that it directly calls the desired function.
+  return promoteCall(NewInst, Callee);
+}
+
 bool llvm::tryPromoteCall(CallBase &CB) {
   assert(!CB.getCalledFunction());
   Module *M = CB.getCaller()->getParent();
diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
index 0e9641c5846f35..2d457eb3b678aa 100644
--- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
@@ -8,9 +8,12 @@
 
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
@@ -24,6 +27,21 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
   return Mod;
 }
 
+// Returns a constant representing the vtable's address point specified by the
+// offset.
+static Constant *getVTableAddressPointOffset(GlobalVariable *VTable,
+                                             uint32_t AddressPointOffset) {
+  Module &M = *VTable->getParent();
+  LLVMContext &Context = M.getContext();
+  assert(AddressPointOffset <
+             M.getDataLayout().getTypeAllocSize(VTable->getValueType()) &&
+         "Out-of-bound access");
+
+  return ConstantExpr::getInBoundsGetElementPtr(
+      Type::getInt8Ty(Context), VTable,
+      llvm::ConstantInt::get(Type::getInt32Ty(Context), AddressPointOffset));
+}
+
 TEST(CallPromotionUtilsTest, TryPromoteCall) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C,
@@ -368,3 +386,73 @@ declare %struct2 @_ZN4Impl3RunEv(%class.Impl* %this)
   bool IsPromoted = tryPromoteCall(*CI);
   EXPECT_FALSE(IsPromoted);
 }
+
+TEST(CallPromotionUtilsTest, promoteCallWithVTableCmp) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C,
+                                      R"IR(
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTV5Base1 = constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }, !type !0
+@_ZTV8Derived1 = constant { [4 x ptr], [3 x ptr] } { [4 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev], [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base25func2Ev] }, !type !0, !type !1, !type !2
+@_ZTV8Derived2 = constant { [3 x ptr], [3 x ptr], [4 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN5Base35func3Ev], [3 x ptr] [ptr inttoptr (i64 -8 to ptr), ptr null, ptr @_ZN5Base25func2Ev], [4 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN5Base15func0Ev, ptr @_ZN5Base15func1Ev] }, !type !3, !type !4, !type !5, !type !6
+
+define i32 @testfunc(ptr %d) {
+entry:
+  %vtable = load ptr, ptr %d, !prof !7
+  %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
+  %0 = load ptr, ptr %vfn
+  %call = tail call i32 %0(ptr %d), !prof !8
+  ret i32 %call
+}
+
+define i32 @_ZN5Base15func1Ev(ptr %this) {
+entry:
+  ret i32 2
+}
+
+declare i32 @_ZN5Base25func2Ev(ptr)
+declare i32 @_ZN5Base15func0Ev(ptr)
+declare void @_ZN5Base35func3Ev(ptr)
+
+!0 = !{i64 16, !"_ZTS5Base1"}
+!1 = !{i64 48, !"_ZTS5Base2"}
+!2 = !{i64 16, !"_ZTS8Derived1"}
+!3 = !{i64 64, !"_ZTS5Base1"}
+!4 = !{i64 40, !"_ZTS5Base2"}
+!5 = !{i64 16, !"_ZTS5Base3"}
+!6 = !{i64 16, !"_ZTS8Derived2"}
+!7 = !{!"VP", i32 2, i64 1600, i64 -9064381665493407289, i64 800, i64 5035968517245772950, i64 500, i64 3215870116411581797, i64 300}
+!8 = !{!"VP", i32 0, i64 1600, i64 6804820478065511155, i64 1600})IR");
+
+  Function *F = M->getFunction("testfunc");
+  CallInst *CI = dyn_cast<CallInst>(&*std::next(F->front().rbegin()));
+  ASSERT_TRUE(CI && CI->isIndirectCall());
+
+  // Create the constant and the branch weights
+  SmallVector<Constant *, 3> VTableAddressPoints;
+
+  for (auto &[VTableName, AddressPointOffset] : {std::pair{"_ZTV5Base1", 16},
+                                                 {"_ZTV8Derived1", 16},
+                                                 {"_ZTV8Derived2", 64}})
+    VTableAddressPoints.push_back(getVTableAddressPointOffset(
+        M->getGlobalVariable(VTableName), AddressPointOffset));
+
+  MDBuilder MDB(C);
+  MDNode *BranchWeights = MDB.createBranchWeights(1600, 0);
+
+  size_t OrigEntryBBSize = F->front().size();
+
+  LoadInst *VPtr = dyn_cast<LoadInst>(&*F->front().begin());
+
+  Function *Callee = M->getFunction("_ZN5Base15func1Ev");
+  // Tests that promoted direct call is returned.
+  CallBase &DirectCB = promoteCallWithVTableCmp(
+      *CI, VPtr, Callee, VTableAddressPoints, BranchWeights);
+  EXPECT_EQ(DirectCB.getCalledOperand(), Callee);
+
+  // Promotion inserts 3 icmp instructions and 2 or instructions, and removes
+  // 1 call instruction from the entry block.
+  EXPECT_EQ(F->front().size(), OrigEntryBBSize + 4);
+}

From d102ee63e849cdaa586fd1aaae900c1399bf2b76 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 19 May 2024 16:51:07 -0700
Subject: [PATCH 22/60] [clang] Use operator==(StringRef, StringRef) (NFC)
 (#92708)

---
 clang-tools-extra/modularize/ModularizeUtilities.cpp | 6 ++----
 clang/lib/Driver/ToolChains/Clang.cpp                | 2 +-
 clang/utils/TableGen/ClangAttrEmitter.cpp            | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index 53e8a49d1a5489..b202b3aae8f8a3 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -435,11 +435,9 @@ static std::string replaceDotDot(StringRef Path) {
   llvm::sys::path::const_iterator B = llvm::sys::path::begin(Path),
     E = llvm::sys::path::end(Path);
   while (B != E) {
-    if (B->compare(".") == 0) {
-    }
-    else if (B->compare("..") == 0)
+    if (*B == "..")
       llvm::sys::path::remove_filename(Buffer);
-    else
+    else if (*B != ".")
       llvm::sys::path::append(Buffer, *B);
     ++B;
   }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c3e6d563f3bd21..6d2015b2cd1566 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1522,7 +1522,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args,
       auto isPAuthLR = [](const char *member) {
         llvm::AArch64::ExtensionInfo pauthlr_extension =
             llvm::AArch64::getExtensionByID(llvm::AArch64::AEK_PAUTHLR);
-        return (pauthlr_extension.Feature.compare(member) == 0);
+        return pauthlr_extension.Feature == member;
       };
 
       if (std::any_of(CmdArgs.begin(), CmdArgs.end(), isPAuthLR))
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index aafbf1f40949a2..ca7630adfbb7b5 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -1845,7 +1845,7 @@ static LateAttrParseKind getLateAttrParseKind(const Record *Attr) {
     PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) +
                               "`should only have one super class");
 
-  if (SuperClasses[0]->getName().compare(LateAttrParseKindStr) != 0)
+  if (SuperClasses[0]->getName() != LateAttrParseKindStr)
     PrintFatalError(Attr, "Field `" + llvm::Twine(LateParsedStr) +
                               "`should only have type `" +
                               llvm::Twine(LateAttrParseKindStr) +

From 0bced10f290bb96d675874a89f1b6789a2384e30 Mon Sep 17 00:00:00 2001
From: Freddy Ye <freddy.ye@intel.com>
Date: Mon, 20 May 2024 08:53:21 +0800
Subject: [PATCH 23/60] [SDAG][X86] Extend SplitVecOp_VSETCC for STRICT_FSETCC.
 (#92509)

---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 19 +++++++--
 .../CodeGen/X86/vec-strict-cmp-512-skx.ll     | 40 +++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index cd858003cf03bc..dca5a481fbd0e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3033,6 +3033,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
                        "operand!\n");
 
   case ISD::VP_SETCC:
+  case ISD::STRICT_FSETCC:
   case ISD::SETCC:             Res = SplitVecOp_VSETCC(N); break;
   case ISD::BITCAST:           Res = SplitVecOp_BITCAST(N); break;
   case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
@@ -3997,14 +3998,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
+  bool isStrict = N->getOpcode() == ISD::STRICT_FSETCC;
   assert(N->getValueType(0).isVector() &&
-         N->getOperand(0).getValueType().isVector() &&
+         N->getOperand(isStrict ? 1 : 0).getValueType().isVector() &&
          "Operand types must be vectors");
   // The result has a legal vector type, but the input needs splitting.
   SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
   SDLoc DL(N);
-  GetSplitVector(N->getOperand(0), Lo0, Hi0);
-  GetSplitVector(N->getOperand(1), Lo1, Hi1);
+  GetSplitVector(N->getOperand(isStrict ? 1 : 0), Lo0, Hi0);
+  GetSplitVector(N->getOperand(isStrict ? 2 : 1), Lo1, Hi1);
+
   auto PartEltCnt = Lo0.getValueType().getVectorElementCount();
 
   LLVMContext &Context = *DAG.getContext();
@@ -4014,6 +4017,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
   if (N->getOpcode() == ISD::SETCC) {
     LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
     HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
+  } else if (N->getOpcode() == ISD::STRICT_FSETCC) {
+    LoRes = DAG.getNode(ISD::STRICT_FSETCC, DL,
+                        DAG.getVTList(PartResVT, N->getValueType(1)),
+                        N->getOperand(0), Lo0, Lo1, N->getOperand(3));
+    HiRes = DAG.getNode(ISD::STRICT_FSETCC, DL,
+                        DAG.getVTList(PartResVT, N->getValueType(1)),
+                        N->getOperand(0), Hi0, Hi1, N->getOperand(3));
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                   LoRes.getValue(1), HiRes.getValue(1));
+    ReplaceValueWith(SDValue(N, 1), NewChain);
   } else {
     assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode");
     SDValue MaskLo, MaskHi, EVLLo, EVLHi;
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll
new file mode 100644
index 00000000000000..3028b749673784
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-512-skx.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -mcpu=skx | FileCheck %s --check-prefixes=SKX
+
+;; Test no crash for AVX512 targets without prefer-vector-width=512.
+
+define <16 x i32> @test_v16f32_oeq_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
+; SKX-LABEL: test_v16f32_oeq_q:
+; SKX:       # %bb.0:
+; SKX-NEXT:    vcmpeqps %ymm7, %ymm5, %k1
+; SKX-NEXT:    vcmpeqps %ymm6, %ymm4, %k2
+; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k2}
+; SKX-NEXT:    vpblendmd %ymm1, %ymm3, %ymm1 {%k1}
+; SKX-NEXT:    retq
+  %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
+                                               <16 x float> %f1, <16 x float> %f2, metadata !"oeq",
+                                               metadata !"fpexcept.strict") #0
+  %res = select <16 x i1> %cond, <16 x i32> %a, <16 x i32> %b
+  ret <16 x i32> %res
+}
+
+define <8 x i32> @test_v8f64_oeq_q(<8 x i32> %a, <8 x i32> %b, <8 x double> %f1, <8 x double> %f2) #0 {
+; SKX-LABEL: test_v8f64_oeq_q:
+; SKX:       # %bb.0:
+; SKX-NEXT:    vcmpeqpd %ymm4, %ymm2, %k0
+; SKX-NEXT:    vcmpeqpd %ymm5, %ymm3, %k1
+; SKX-NEXT:    kshiftlb $4, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k1
+; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT:    retq
+  %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
+                                               <8 x double> %f1, <8 x double> %f2, metadata !"oeq",
+                                               metadata !"fpexcept.strict") #0
+  %res = select <8 x i1> %cond, <8 x i32> %a, <8 x i32> %b
+  ret <8 x i32> %res
+}
+
+declare <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float>, <16 x float>, metadata, metadata)
+declare <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double>, <8 x double>, metadata, metadata)
+
+attributes #0 = { nounwind strictfp "min-legal-vector-width"="0" }

From 89d0937348ebd4b55f17d503910be9300aa44a13 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 19 May 2024 18:17:53 -0700
Subject: [PATCH 24/60] [llvm] Use StringRef::contains (NFC) (#92710)

---
 llvm/lib/IR/Mangler.cpp                 | 2 +-
 llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 2 +-
 llvm/lib/TextAPI/Utils.cpp              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp
index 72e2bc1f24ac97..019fe844e286c8 100644
--- a/llvm/lib/IR/Mangler.cpp
+++ b/llvm/lib/IR/Mangler.cpp
@@ -292,7 +292,7 @@ void llvm::emitLinkerFlagsForUsedCOFF(raw_ostream &OS, const GlobalValue *GV,
 
 std::optional<std::string> llvm::getArm64ECMangledFunctionName(StringRef Name) {
   bool IsCppFn = Name[0] == '?';
-  if (IsCppFn && Name.find("$$h") != std::string::npos)
+  if (IsCppFn && Name.contains("$$h"))
     return std::nullopt;
   if (!IsCppFn && Name[0] == '#')
     return std::nullopt;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 32de8b9587b46f..9fde26c900f510 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1886,7 +1886,7 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call,
   // Local sizes arguments: Sizes of block invoke arguments. Clang generates
   // local size operands as an array, so we need to unpack them.
   SmallVector<Register, 16> LocalSizes;
-  if (Call->Builtin->Name.find("_varargs") != StringRef::npos || IsSpirvOp) {
+  if (Call->Builtin->Name.contains("_varargs") || IsSpirvOp) {
     const unsigned LocalSizeArrayIdx = HasEvents ? 9 : 6;
     Register GepReg = Call->Arguments[LocalSizeArrayIdx];
     MachineInstr *GepMI = MRI->getUniqueVRegDef(GepReg);
diff --git a/llvm/lib/TextAPI/Utils.cpp b/llvm/lib/TextAPI/Utils.cpp
index 08f14f65177ed3..01021e3a264dd8 100644
--- a/llvm/lib/TextAPI/Utils.cpp
+++ b/llvm/lib/TextAPI/Utils.cpp
@@ -184,7 +184,7 @@ llvm::Expected<Regex> llvm::MachO::createRegexFromGlob(StringRef Glob) {
       break;
     }
     default:
-      if (RegexMetachars.find(C) != StringRef::npos)
+      if (RegexMetachars.contains(C))
         RegexString.push_back('\\');
       RegexString.push_back(C);
     }

From fc0144a30cf20d6405411da141d11bfde143d3d2 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Mon, 20 May 2024 10:36:03 +0800
Subject: [PATCH 25/60] [Serialization] Read the initializer for interesting
 static variables before consuming it (#92353)

Close https://github.com/llvm/llvm-project/issues/91418

Since we load the variable's initializers lazily, it'd be problematic if
the initializers dependent on each other. So here we try to load the
initializers of static variables to make sure they are passed to code
generator by order. If we read any thing interesting, we would consume
that before emitting the current declaration.
---
 clang/lib/Serialization/ASTReaderDecl.cpp    |  29 ++-
 clang/test/Modules/pr91418.cppm              |  65 +++++
 clang/test/OpenMP/nvptx_lambda_capturing.cpp | 246 +++++++++----------
 3 files changed, 214 insertions(+), 126 deletions(-)
 create mode 100644 clang/test/Modules/pr91418.cppm

diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 0c647086e304a3..a6254b70560c36 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -4186,12 +4186,35 @@ void ASTReader::PassInterestingDeclsToConsumer() {
     GetDecl(ID);
   EagerlyDeserializedDecls.clear();
 
-  while (!PotentiallyInterestingDecls.empty()) {
-    Decl *D = PotentiallyInterestingDecls.front();
-    PotentiallyInterestingDecls.pop_front();
+  auto ConsumingPotentialInterestingDecls = [this]() {
+    while (!PotentiallyInterestingDecls.empty()) {
+      Decl *D = PotentiallyInterestingDecls.front();
+      PotentiallyInterestingDecls.pop_front();
+      if (isConsumerInterestedIn(D))
+        PassInterestingDeclToConsumer(D);
+    }
+  };
+  std::deque<Decl *> MaybeInterestingDecls =
+      std::move(PotentiallyInterestingDecls);
+  assert(PotentiallyInterestingDecls.empty());
+  while (!MaybeInterestingDecls.empty()) {
+    Decl *D = MaybeInterestingDecls.front();
+    MaybeInterestingDecls.pop_front();
+    // Since we load the variable's initializers lazily, it'd be problematic
+    // if the initializers dependent on each other. So here we try to load the
+    // initializers of static variables to make sure they are passed to code
+    // generator by order. If we read anything interesting, we would consume
+    // that before emitting the current declaration.
+    if (auto *VD = dyn_cast<VarDecl>(D);
+        VD && VD->isFileVarDecl() && !VD->isExternallyVisible())
+      VD->getInit();
+    ConsumingPotentialInterestingDecls();
     if (isConsumerInterestedIn(D))
       PassInterestingDeclToConsumer(D);
   }
+
+  // If we add any new potential interesting decl in the last call, consume it.
+  ConsumingPotentialInterestingDecls();
 }
 
 void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) {
diff --git a/clang/test/Modules/pr91418.cppm b/clang/test/Modules/pr91418.cppm
new file mode 100644
index 00000000000000..b507df162643ba
--- /dev/null
+++ b/clang/test/Modules/pr91418.cppm
@@ -0,0 +1,65 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 -x c++-header %t/foo.h \
+// RUN:     -emit-pch -o %t/foo.pch
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++20 %t/use.cpp -include-pch \
+// RUN:     %t/foo.pch -emit-llvm -o - | FileCheck %t/use.cpp
+
+//--- foo.h
+#ifndef FOO_H
+#define FOO_H
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+
+static __inline__ __m128 __attribute__((__always_inline__, __min_vector_width__(128)))
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+  return __extension__ (__m128){ __z, __y, __x, __w };
+}
+
+typedef __m128 VR;
+
+inline VR MakeVR( float X, float Y, float Z, float W )
+{
+ return _mm_setr_ps( X, Y, Z, W );
+}
+
+extern "C" float sqrtf(float);
+
+namespace VectorSinConstantsSSE
+{
+  float a = (16 * sqrtf(0.225f));
+  VR A = MakeVR(a, a, a, a);
+  static const float b = (16 * sqrtf(0.225f));
+  static const VR B = MakeVR(b, b, b, b);
+}
+
+#endif // FOO_H
+
+//--- use.cpp
+#include "foo.h"
+float use() {
+    return VectorSinConstantsSSE::A[0] + VectorSinConstantsSSE::A[1] +
+           VectorSinConstantsSSE::A[2] + VectorSinConstantsSSE::A[3] +
+           VectorSinConstantsSSE::B[0] + VectorSinConstantsSSE::B[1] +
+           VectorSinConstantsSSE::B[2] + VectorSinConstantsSSE::B[3];
+}
+
+// CHECK: define{{.*}}@__cxx_global_var_init(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSE1aE
+
+// CHECK: define{{.*}}@__cxx_global_var_init.1(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSE1AE
+
+// CHECK: define{{.*}}@__cxx_global_var_init.2(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSEL1BE
+
+// CHECK: define{{.*}}@__cxx_global_var_init.3(
+// CHECK: store{{.*}}, ptr @_ZN21VectorSinConstantsSSEL1bE
+
+// CHECK: @_GLOBAL__sub_I_use.cpp
+// CHECK: call{{.*}}@__cxx_global_var_init(
+// CHECK: call{{.*}}@__cxx_global_var_init.1(
+// CHECK: call{{.*}}@__cxx_global_var_init.3(
+// CHECK: call{{.*}}@__cxx_global_var_init.2(
diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
index 641fbc38dd6bc4..efea8d4a056181 100644
--- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp
+++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp
@@ -1165,8 +1165,113 @@ int main(int argc, char **argv) {
 // CHECK2-NEXT:    ret void
 //
 //
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_kernel_environment, ptr [[DYN_PTR]])
+// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3:       user_code.entry:
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP3]], i64 8, i1 false)
+// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
+// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP6]]) #[[ATTR7:[0-9]+]]
+// CHECK3-NEXT:    call void @__kmpc_target_deinit()
+// CHECK3-NEXT:    ret void
+// CHECK3:       worker.exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv
+// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8
+// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP1]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT:    ret i32 [[TMP2]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8
+// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_kernel_environment, ptr [[DYN_PTR]])
+// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
+// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3:       user_code.entry:
+// CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
+// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK3-NEXT:    store ptr [[TMP4]], ptr [[TMP6]], align 8
+// CHECK3-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 2)
+// CHECK3-NEXT:    call void @__kmpc_target_deinit()
+// CHECK3-NEXT:    ret void
+// CHECK3:       worker.exit:
+// CHECK3-NEXT:    ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT:  entry:
+// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
+// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
+// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP2]], i64 8, i1 false)
+// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP4]], align 8
+// CHECK3-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP2]], align 8
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR7]]
+// CHECK3-NEXT:    ret void
+//
+//
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR0]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca i64, align 8
@@ -1178,7 +1283,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
 // CHECK3-NEXT:    [[_TMP4:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[B5:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[_TMP6:%.*]] = alloca ptr, align 8
@@ -1214,20 +1319,20 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    store i32 [[TMP9]], ptr [[C7]], align 4
 // CHECK3-NEXT:    store ptr [[C7]], ptr [[_TMP8]], align 8
 // CHECK3-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[_TMP4]], align 8
-// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 0
 // CHECK3-NEXT:    store ptr [[ARGC_ADDR]], ptr [[TMP11]], align 8
-// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[_TMP6]], align 8
 // CHECK3-NEXT:    store ptr [[TMP13]], ptr [[TMP12]], align 8
-// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 2
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[_TMP8]], align 8
 // CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
-// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 3
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[D_ADDR]], ptr [[TMP16]], align 8
-// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP10]], i32 0, i32 4
+// CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP10]], i32 0, i32 4
 // CHECK3-NEXT:    store ptr [[TMP2]], ptr [[TMP17]], align 8
 // CHECK3-NEXT:    [[TMP18:%.*]] = load ptr, ptr [[_TMP4]], align 8
-// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP18]]) #[[ATTR7:[0-9]+]]
+// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP18]]) #[[ATTR7]]
 // CHECK3-NEXT:    call void @__kmpc_target_deinit()
 // CHECK3-NEXT:    ret void
 // CHECK3:       worker.exit:
@@ -1235,7 +1340,7 @@ int main(int argc, char **argv) {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR3]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[ARGC_ADDR:%.*]] = alloca ptr, align 8
@@ -1267,7 +1372,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP5]], -1
 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK3:       user_code.entry:
-// CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// CHECK3-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
 // CHECK3-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 8
 // CHECK3-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[D_ADDR]], align 8
@@ -1292,7 +1397,7 @@ int main(int argc, char **argv) {
 //
 //
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l43_omp_outlined
-// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(40) [[L:%.*]]) #[[ATTR4]] {
 // CHECK3-NEXT:  entry:
 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -1305,7 +1410,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP1:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK3-NEXT:    [[L3:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
 // CHECK3-NEXT:    [[_TMP4:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[ARGC5:%.*]] = alloca i32, align 4
 // CHECK3-NEXT:    [[B6:%.*]] = alloca i32, align 4
@@ -1345,128 +1450,23 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP3]], align 4
 // CHECK3-NEXT:    store i32 [[TMP11]], ptr [[A10]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[_TMP4]], align 8
-// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 0
+// CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 0
 // CHECK3-NEXT:    store ptr [[ARGC5]], ptr [[TMP13]], align 8
-// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 1
+// CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8
 // CHECK3-NEXT:    store ptr [[TMP15]], ptr [[TMP14]], align 8
-// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 2
+// CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 2
 // CHECK3-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[_TMP9]], align 8
 // CHECK3-NEXT:    store ptr [[TMP17]], ptr [[TMP16]], align 8
-// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 3
+// CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 3
 // CHECK3-NEXT:    store ptr [[D_ADDR]], ptr [[TMP18]], align 8
-// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[TMP12]], i32 0, i32 4
+// CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP12]], i32 0, i32 4
 // CHECK3-NEXT:    store ptr [[A10]], ptr [[TMP19]], align 8
 // CHECK3-NEXT:    [[TMP20:%.*]] = load ptr, ptr [[_TMP4]], align 8
 // CHECK3-NEXT:    [[CALL:%.*]] = call noundef i64 @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(40) [[TMP20]]) #[[ATTR7]]
 // CHECK3-NEXT:    ret void
 //
 //
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR0]] {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
-// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l27_kernel_environment, ptr [[DYN_PTR]])
-// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
-// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK3:       user_code.entry:
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8
-// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP3]], i64 8, i1 false)
-// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP4]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
-// CHECK3-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP6]]) #[[ATTR7]]
-// CHECK3-NEXT:    call void @__kmpc_target_deinit()
-// CHECK3-NEXT:    ret void
-// CHECK3:       worker.exit:
-// CHECK3-NEXT:    ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@_ZZN1S3fooEvENKUlvE_clEv
-// CHECK3-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]]) #[[ATTR2:[0-9]+]] comdat align 2 {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[CLASS_ANON_1:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8
-// CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[TMP1]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
-// CHECK3-NEXT:    ret i32 [[TMP2]]
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29
-// CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR3]] {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8
-// CHECK3-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_kernel_environment, ptr [[DYN_PTR]])
-// CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1
-// CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// CHECK3:       user_code.entry:
-// CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK3-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP5]], align 8
-// CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK3-NEXT:    store ptr [[TMP4]], ptr [[TMP6]], align 8
-// CHECK3-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 2)
-// CHECK3-NEXT:    call void @__kmpc_target_deinit()
-// CHECK3-NEXT:    ret void
-// CHECK3:       worker.exit:
-// CHECK3-NEXT:    ret void
-//
-//
-// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN1S3fooEv_l29_omp_outlined
-// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[L:%.*]]) #[[ATTR4]] {
-// CHECK3-NEXT:  entry:
-// CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L_ADDR:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[L1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
-// CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[L]], ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK3-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[L_ADDR]], align 8
-// CHECK3-NEXT:    store ptr [[TMP1]], ptr [[TMP]], align 8
-// CHECK3-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8
-// CHECK3-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[L1]], ptr align 8 [[TMP2]], i64 8, i1 false)
-// CHECK3-NEXT:    store ptr [[L1]], ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[CLASS_ANON_1]], ptr [[TMP3]], i32 0, i32 0
-// CHECK3-NEXT:    store ptr [[TMP0]], ptr [[TMP4]], align 8
-// CHECK3-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[_TMP2]], align 8
-// CHECK3-NEXT:    [[CALL:%.*]] = call noundef i32 @_ZZN1S3fooEvENKUlvE_clEv(ptr noundef nonnull align 8 dereferenceable(8) [[TMP5]]) #[[ATTR7]]
-// CHECK3-NEXT:    ret void
-//
-//
 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooIZN1S3fooEvEUlvE_EiRKT__l18
 // CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[T:%.*]]) #[[ATTR3]] {
 // CHECK3-NEXT:  entry:
@@ -1500,7 +1500,7 @@ int main(int argc, char **argv) {
 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[T_ADDR:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
-// CHECK3-NEXT:    [[T1:%.*]] = alloca [[CLASS_ANON_1:%.*]], align 8
+// CHECK3-NEXT:    [[T1:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
 // CHECK3-NEXT:    [[_TMP2:%.*]] = alloca ptr, align 8
 // CHECK3-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
 // CHECK3-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8

From 91423d71938d7a1dba27188e6d854148a750a3dd Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Sun, 19 May 2024 20:15:31 -0700
Subject: [PATCH 26/60] [BOLT][NFC] Don't assign YAML profile to functions with
 no CFG (#92487)

YAML profile for non-simple functions without CFG is
  1) useless for optimizations,
  2) can't be attached, similar to fdata profile,
  3) would be reported as invalid/stale even if the profile is valid.

Don't attempt to attach the profile in this case, aligning the behavior
to DataReader.

Test Plan: added yaml-non-simple.test
---
 bolt/lib/Profile/YAMLProfileReader.cpp |  3 ++
 bolt/test/X86/yaml-non-simple.test     | 71 ++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 bolt/test/X86/yaml-non-simple.test

diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 978a7cadfe798f..29d94067f459fc 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -99,6 +99,9 @@ bool YAMLProfileReader::parseFunctionProfile(
       FuncRawBranchCount += YamlSI.Count;
   BF.setRawBranchCount(FuncRawBranchCount);
 
+  if (BF.empty())
+    return true;
+
   if (!opts::IgnoreHash &&
       YamlBF.Hash != BF.computeHash(IsDFSOrder, HashFunction)) {
     if (opts::Verbosity >= 1)
diff --git a/bolt/test/X86/yaml-non-simple.test b/bolt/test/X86/yaml-non-simple.test
new file mode 100644
index 00000000000000..fef98f692a7103
--- /dev/null
+++ b/bolt/test/X86/yaml-non-simple.test
@@ -0,0 +1,71 @@
+## Check that YAML profile for non-simple function is not reported as stale.
+
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %t/main.s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -nostdlib
+# RUN: llvm-bolt %t.exe -o %t.out --data %t/yaml --profile-ignore-hash -v=1 \
+# RUN:   --report-stale 2>&1 | FileCheck %s
+
+# CHECK: BOLT-INFO: could not disassemble function main. Will ignore.
+# CHECK: BOLT-INFO: could not disassemble function main.cold. Will ignore.
+# CHECK: BOLT-INFO: 0 out of 2 functions in the binary (0.0%) have non-empty execution profile
+# CHECK: BOLT-INFO: 1 function with profile could not be optimized
+
+#--- main.s
+.globl main
+.type	main, @function
+main:
+  .cfi_startproc
+.LBB00:
+  pushq   %rbp
+  movq    %rsp, %rbp
+  subq    $16, %rsp
+  testq   %rax, %rax
+  js      .LBB03
+.LBB01:
+  jne     .LBB04
+.LBB02:
+  nop
+.LBB03:
+  xorl    %eax, %eax
+  addq    $16, %rsp
+  popq    %rbp
+  retq
+.LBB04:
+  xorl    %eax, %eax
+  addq    $16, %rsp
+  popq    %rbp
+  retq
+  .cfi_endproc
+  .size	main, .-main
+
+.globl main.cold
+.type	main.cold, @function
+main.cold:
+  .cfi_startproc
+  nop
+  .cfi_endproc
+  .size	main.cold, .-main.cold
+
+#--- yaml
+---
+header:
+  profile-version: 1
+  binary-name:     'yaml-non-simple.s.tmp.exe'
+  binary-build-id: '<unknown>'
+  profile-flags:   [ lbr ]
+  profile-origin:  branch profile reader
+  profile-events:  ''
+  dfs-order:       false
+  hash-func:       xxh3
+functions:
+  - name:            main
+    fid:             0
+    hash:            0x0000000000000000
+    exec:            1
+    nblocks:         5
+    blocks:
+      - bid:             1
+        insns:           1
+        succ:            [ { bid: 3, cnt: 1} ]
+...

From 6bf1601a0d9a01fe663442096466d46800483e0c Mon Sep 17 00:00:00 2001
From: Monad <yanwqmonad@gmail.com>
Date: Mon, 20 May 2024 12:20:47 +0800
Subject: [PATCH 27/60] [InstCombine] Fold pointer adding in integer to
 arithmetic add (#91596)

Fold
``` llvm
define i32 @src(i32 %x, i32 %y) {
  %base = inttoptr i32 %x to ptr
  %ptr = getelementptr inbounds i8, ptr %base, i32 %y
  %r = ptrtoint ptr %ptr to i32
  ret i32 %r
}
```
where both `%base` and `%ptr` have only one use, to
``` llvm
define i32 @tgt(i32 %x, i32 %y) {
  %r = add i32 %x, %y
  ret i32 %r
}
```

The `add` can be `nuw` if the GEP is `inbounds` and the offset is
non-negative. The relevant Alive2 proof is
https://alive2.llvm.org/ce/z/nP3RWy.

### Motivation

It seems unnecessary to convert `int` to `ptr` just to get its offset.
In most cases, they generates the same assembly, but sometimes it may
miss some optimizations since the analysis of `GEP` is not as perfect as
that of arithmetic operation. One example is


https://github.com/dtcxzyw/llvm-opt-benchmark/blob/e3c822bf41df3a88ca38eba884a52b0cc7e70bf2/bench/protobuf/optimized/generated_message_reflection.cc.ll#L39860-L39873

``` llvm
  %conv.i188 = zext i32 %145 to i64
  %add.i189 = add i64 %conv.i188, %125
  %146 = load i16, ptr %num_aux_entries10.i, align 2
  %conv2.i191 = zext i16 %146 to i64
  %mul.i192 = shl nuw nsw i64 %conv2.i191, 3
  %add3.i193 = add i64 %add.i189, %mul.i192
  %147 = inttoptr i64 %add3.i193 to ptr
  %sub.ptr.lhs.cast.i195 = ptrtoint ptr %144 to i64
  %sub.ptr.rhs.cast.i196 = ptrtoint ptr %143 to i64
  %sub.ptr.sub.i197 = sub i64 %sub.ptr.lhs.cast.i195, %sub.ptr.rhs.cast.i196
  %add.ptr = getelementptr inbounds i8, ptr %147, i64 %sub.ptr.sub.i197
  %sub.ptr.lhs.cast = ptrtoint ptr %add.ptr to i64
  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %125
```

where `%conv.i188` first adds `%125` and then subtracts `%125` (the
result is `%sub.ptr.sub`), which can be optimized.
---
 .../InstCombine/InstCombineCasts.cpp          |  20 ++-
 llvm/test/Transforms/InstCombine/cast_ptr.ll  | 151 ++++++++++++++++++
 2 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 11e31877de38c2..1b4c319032cab8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2049,16 +2049,28 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
       Mask->getType() == Ty)
     return BinaryOperator::CreateAnd(Builder.CreatePtrToInt(Ptr, Ty), Mask);
 
-  if (auto *GEP = dyn_cast<GetElementPtrInst>(SrcOp)) {
+  if (auto *GEP = dyn_cast<GEPOperator>(SrcOp)) {
     // Fold ptrtoint(gep null, x) to multiply + constant if the GEP has one use.
     // While this can increase the number of instructions it doesn't actually
     // increase the overall complexity since the arithmetic is just part of
     // the GEP otherwise.
     if (GEP->hasOneUse() &&
         isa<ConstantPointerNull>(GEP->getPointerOperand())) {
-      return replaceInstUsesWith(
-          CI, Builder.CreateIntCast(EmitGEPOffset(cast<GEPOperator>(GEP)), Ty,
-                                    /*isSigned=*/false));
+      return replaceInstUsesWith(CI,
+                                 Builder.CreateIntCast(EmitGEPOffset(GEP), Ty,
+                                                       /*isSigned=*/false));
+    }
+
+    // (ptrtoint (gep (inttoptr Base), ...)) -> Base + Offset
+    Value *Base;
+    if (GEP->hasOneUse() &&
+        match(GEP->getPointerOperand(), m_OneUse(m_IntToPtr(m_Value(Base)))) &&
+        Base->getType() == Ty) {
+      Value *Offset = EmitGEPOffset(GEP);
+      auto *NewOp = BinaryOperator::CreateAdd(Base, Offset);
+      if (GEP->isInBounds() && isKnownNonNegative(Offset, SQ))
+        NewOp->setHasNoUnsignedWrap(true);
+      return NewOp;
     }
   }
 
diff --git a/llvm/test/Transforms/InstCombine/cast_ptr.ll b/llvm/test/Transforms/InstCombine/cast_ptr.ll
index 5c6c012064e05b..786ea876ddea7a 100644
--- a/llvm/test/Transforms/InstCombine/cast_ptr.ll
+++ b/llvm/test/Transforms/InstCombine/cast_ptr.ll
@@ -244,3 +244,154 @@ define <2 x i32> @insertelt_extra_use2(<2 x i32> %x, ptr %p) {
   %r = ptrtoint <2 x ptr> %i to <2 x i32>
   ret <2 x i32> %r
 }
+
+define i32 @ptr_add_in_int(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_2(
+; CHECK-NEXT:    [[P2_IDX:%.*]] = shl nsw i32 [[Y:%.*]], 2
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[P2_IDX]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i32, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_nneg(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_nneg(
+; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.abs.i32(i32 [[Y:%.*]], i1 true)
+; CHECK-NEXT:    [[R:%.*]] = add nuw i32 [[Z]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.abs.i32(i32 %y, i1 true)
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 %z
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i64 @ptr_add_in_int_different_type_1(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    ret i64 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i64
+  ret i64 %r
+}
+
+define i16 @ptr_add_in_int_different_type_2(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i16
+  ret i16 %r
+}
+
+define i32 @ptr_add_in_int_different_type_3(i16 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i16 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_different_type_4(i64 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_different_type_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[TMP1]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i64 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %y
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_not_inbounds(i32 %x, i32 %y) {
+; CHECK-LABEL: @ptr_add_in_int_not_inbounds(
+; CHECK-NEXT:    [[Z:%.*]] = call i32 @llvm.abs.i32(i32 [[Y:%.*]], i1 true)
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Z]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %z = call i32 @llvm.abs.i32(i32 %y, i1 true)
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr i8, ptr %ptr, i32 %z
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_const(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_const(
+; CHECK-NEXT:    [[R:%.*]] = add nuw i32 [[X:%.*]], 4096
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 4096
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_const_negative(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_const_negative(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X:%.*]], -4096
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 -4096
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+declare void @use_ptr(ptr)
+
+define i32 @ptr_add_in_int_extra_use1(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_extra_use1(
+; CHECK-NEXT:    [[PTR:%.*]] = inttoptr i32 [[X:%.*]] to ptr
+; CHECK-NEXT:    call void @use_ptr(ptr [[PTR]])
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 4096
+; CHECK-NEXT:    [[R:%.*]] = ptrtoint ptr [[P2]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  call void @use_ptr(ptr %ptr)
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 4096
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}
+
+define i32 @ptr_add_in_int_extra_use2(i32 %x) {
+; CHECK-LABEL: @ptr_add_in_int_extra_use2(
+; CHECK-NEXT:    [[PTR:%.*]] = inttoptr i32 [[X:%.*]] to ptr
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 4096
+; CHECK-NEXT:    call void @use_ptr(ptr nonnull [[P2]])
+; CHECK-NEXT:    [[R:%.*]] = ptrtoint ptr [[P2]] to i32
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ptr = inttoptr i32 %x to ptr
+  %p2 = getelementptr inbounds i8, ptr %ptr, i32 4096
+  call void @use_ptr(ptr %p2)
+  %r = ptrtoint ptr %p2 to i32
+  ret i32 %r
+}

From ebbbc73667a68dcfbe09392a1d34050592b234fd Mon Sep 17 00:00:00 2001
From: Chaitanya <Krishna.Sankisa@amd.com>
Date: Mon, 20 May 2024 10:24:40 +0530
Subject: [PATCH 28/60] [AMDGPU] Use removeFnAttrFromReachable in
 lower-module-lds pass. (#92686)

---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       | 44 +------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 2c7163a7753725..625ac0230f1606 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -862,48 +862,6 @@ class AMDGPULowerModuleLDS {
     return N;
   }
 
-  /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
-  /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
-  /// the lack of llvm.amdgcn.lds.kernel.id calls.
-  void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
-    KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");
-
-    SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
-    SmallPtrSet<Function *, 8> Visited;
-    bool SeenUnknownCall = false;
-
-    while (!WorkList.empty()) {
-      Function *F = WorkList.pop_back_val();
-
-      for (auto &CallRecord : *CG[F]) {
-        if (!CallRecord.second)
-          continue;
-
-        Function *Callee = CallRecord.second->getFunction();
-        if (!Callee) {
-          if (!SeenUnknownCall) {
-            SeenUnknownCall = true;
-
-            // If we see any indirect calls, assume nothing about potential
-            // targets.
-            // TODO: This could be refined to possible LDS global users.
-            for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
-              Function *PotentialCallee =
-                  ExternalCallRecord.second->getFunction();
-              assert(PotentialCallee);
-              if (!isKernelLDS(PotentialCallee))
-                PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
-            }
-          }
-        } else {
-          Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
-          if (Visited.insert(Callee).second)
-            WorkList.push_back(Callee);
-        }
-      }
-    }
-  }
-
   DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
       Module &M, LDSUsesInfoTy &LDSUsesInfo,
       DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
@@ -1059,7 +1017,7 @@ class AMDGPULowerModuleLDS {
       //
       // TODO: We could filter out subgraphs that do not access LDS globals.
       for (Function *F : KernelsThatAllocateTableLDS)
-        removeNoLdsKernelIdFromReachable(CG, F);
+        removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id");
     }
 
     DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =

From f6527774569790b5a5236f6e84f3f839ce6c2fff Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow@amd.com>
Date: Sun, 19 May 2024 22:01:10 -0700
Subject: [PATCH 29/60] [AMDGPU] Fix kernarg preloading crash with some types
 and alignments (#91625)

Lowering of preloded arguments would fail with half/bfloat if they were
dword aligned in the kernarg segment and not part of a vector. Added
more tests with different alignments and types.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |   20 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 2231 ++++++++++--------
 2 files changed, 1231 insertions(+), 1020 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 89e83babcfef42..c7c4a8faa2fb05 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2976,12 +2976,20 @@ SDValue SITargetLowering::LowerFormalArguments(
                                    DL, Elts);
           }
 
-          SDValue CMemVT;
-          if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
-            CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
-          else
-            CMemVT = DAG.getBitcast(MemVT, NewArg);
-          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
+          // If the argument was preloaded to multiple consecutive 32-bit
+          // registers because of misalignment between addressable SGPR tuples
+          // and the argument size, we can still assume that because of kernarg
+          // segment alignment restrictions that NewArg's size is the same as
+          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
+          // truncate since we cannot preload to less than a single SGPR and the
+          // MemVT may be smaller.
+          EVT MemVTInt =
+              EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+          if (MemVT.bitsLT(NewArg.getSimpleValueType()))
+            NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
+
+          NewArg = DAG.getBitcast(MemVT, NewArg);
+          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                   Ins[i].Flags.isSExt(), &Ins[i]);
           NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
         }
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index f0e709b5a17279..857bb897ead2a3 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -1,18 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-1 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-4 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-PRELOAD-8 %s
 
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-NO-PRELOAD %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-1 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-2 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=4 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-4 %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=8 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a-PRELOAD-8 %s
 
-define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8:
+define amdgpu_kernel void @ptr1_i8_kernel_preload_arg(ptr addrspace(1) %out, i8 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -23,19 +19,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8:
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -45,17 +29,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8:
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -65,7 +39,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -76,19 +50,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -98,17 +60,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -122,8 +74,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+define amdgpu_kernel void @ptr1_i8_zext_kernel_preload_arg(ptr addrspace(1) %out, i8 zeroext %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -134,19 +86,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -157,18 +97,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -179,7 +108,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -190,19 +119,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -213,18 +130,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -239,8 +145,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -251,19 +157,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -273,17 +167,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s0, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -293,7 +177,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -304,19 +188,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -326,17 +198,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s0, s8, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -350,8 +212,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+define amdgpu_kernel void @ptr1_i32_kernel_preload_arg(ptr addrspace(1) %out, i32 %arg0) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -361,18 +223,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -381,16 +232,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -399,7 +241,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -409,18 +251,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -429,16 +260,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -451,8 +273,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
 }
 
 
-define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
-; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+define amdgpu_kernel void @i32_ptr1_i32_kernel_preload_arg(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s5, s[0:1], 0x0
@@ -464,20 +286,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s2, s3
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -489,17 +298,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -509,7 +308,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x0
@@ -521,20 +320,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s2, s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s2, s6, s2
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -546,17 +332,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[8:9]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s6, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[8:9]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -570,8 +346,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
   ret void
 }
 
-define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+define amdgpu_kernel void @ptr1_i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -584,21 +360,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX940-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX940-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -612,19 +374,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    s_and_b32 s1, s4, 0xffff
-; GFX940-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -636,7 +386,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -649,21 +399,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX90a-PRELOAD-1-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX90a-PRELOAD-1-NEXT:    s_add_i32 s0, s0, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -677,19 +413,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    s_and_b32 s1, s8, 0xffff
-; GFX90a-PRELOAD-4-NEXT:    s_add_i32 s0, s1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -707,8 +431,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
   ret void
 }
 
-define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
-; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+define amdgpu_kernel void @ptr1_v2i8_kernel_preload_arg(ptr addrspace(1) %out, <2 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -718,18 +442,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dword s0, s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -740,18 +453,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-PRELOAD-2-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -762,7 +464,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -772,18 +474,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dword s0, s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -794,18 +485,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 ; GFX90a-PRELOAD-2-NEXT:    global_store_short v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v1, v0, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -820,8 +500,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
 }
 
 
-define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
-; GFX940-NO-PRELOAD-LABEL: byref_preload_arg:
+define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GFX940-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x100
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
@@ -835,22 +515,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-2-LABEL: byref_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -865,22 +530,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x100
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX940-PRELOAD-8-LABEL: byref_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -895,7 +545,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX940-PRELOAD-8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: byref_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: byref_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
@@ -909,22 +559,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: byref_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -939,22 +574,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 ; GFX90a-PRELOAD-2-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt vmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: byref_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -975,8 +595,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
 }
 
 
-define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v8i32_arg:
+define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
@@ -995,27 +615,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i32_arg:
+; GFX940-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1035,27 +635,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x20
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i32_arg:
+; GFX940-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1075,7 +655,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i32_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1094,27 +674,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-2-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1134,27 +694,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
+; GFX90a-PRELOAD-8-LABEL: v8i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1177,8 +717,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
   ret void
 }
 
-define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3i16_preload_arg:
+define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1189,20 +729,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1213,18 +740,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s4
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1235,7 +751,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i16_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1246,20 +762,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1270,18 +773,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s8
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v0, v1, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3i16_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1295,8 +787,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3i32_preload_arg:
+define amdgpu_kernel void @v3i32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1308,20 +800,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1332,18 +811,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1354,7 +822,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3i32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1366,20 +834,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1390,18 +845,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3i32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1415,8 +859,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v3f32_preload_arg:
+define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
@@ -1428,20 +872,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1452,18 +883,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s7
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1474,7 +894,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v3f32_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -1486,20 +906,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s2
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1510,18 +917,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s11
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v3f32_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1535,8 +931,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
   ret void
 }
 
-define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v5i8_preload_arg:
+define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1547,20 +943,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX940-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1578,25 +961,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s5
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1614,7 +979,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX940-PRELOAD-8-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
@@ -1625,20 +990,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s1
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s0
-; GFX90a-PRELOAD-1-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
-; GFX90a-PRELOAD-1-NEXT:    global_store_dword v0, v2, s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1656,25 +1008,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s9
-; GFX90a-PRELOAD-4-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
-; GFX90a-PRELOAD-4-NEXT:    global_store_dword v1, v0, s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v5i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1695,8 +1029,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
   ret void
 }
 
-define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
-; GFX940-NO-PRELOAD-LABEL: v5f64_arg:
+define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind {
+; GFX940-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x60
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -1718,30 +1052,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[12:13] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_nop 1
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v5f64_arg:
+; GFX940-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1764,30 +1075,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x60
-; GFX940-PRELOAD-4-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x40
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX940-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_nop 1
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s4
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s5
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s6
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s7
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v5f64_arg:
+; GFX940-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1810,7 +1098,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v5f64_arg:
+; GFX90a-NO-PRELOAD-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -1832,30 +1120,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-1-NEXT:    s_nop 0
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-2-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1878,30 +1143,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-PRELOAD-4-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90a-PRELOAD-4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s12
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s13
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s14
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s15
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-PRELOAD-4-NEXT:    s_nop 0
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v0, s8
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v1, s9
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, s10
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
+; GFX90a-PRELOAD-8-LABEL: v5f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -1927,8 +1169,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
   ret void
 }
 
-define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
-; GFX940-NO-PRELOAD-LABEL: v8i8_preload_arg:
+define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-NO-PRELOAD:       ; %bb.0:
 ; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 ; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -1937,18 +1179,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
@@ -1973,32 +1204,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s5, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX940-PRELOAD-4-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    s_nop 0
-; GFX940-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX940-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
@@ -2023,7 +1229,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX940-PRELOAD-8-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-8-NEXT:    s_endpgm
 ;
-; GFX90a-NO-PRELOAD-LABEL: v8i8_preload_arg:
+; GFX90a-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-NO-PRELOAD:       ; %bb.0:
 ; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, 0
@@ -2032,18 +1238,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-2-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
@@ -2067,31 +1262,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 8
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-PRELOAD-4-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX90a-PRELOAD-4-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
-; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
+; GFX90a-PRELOAD-8-LABEL: v8i8_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
@@ -2129,17 +1300,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2149,15 +1309,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2177,17 +1328,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2197,15 +1337,6 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2229,17 +1360,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
 ; GFX940-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x8
-; GFX940-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-1-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
 ; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2249,15 +1369,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX940-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX940-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX940-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-PRELOAD-4-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX940-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
-; GFX940-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
 ; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2277,17 +1388,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-1-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-1-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX90a-PRELOAD-1-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-1-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90a-PRELOAD-1-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-PRELOAD-1-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-1-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2297,15 +1397,6 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
 ; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-PRELOAD-2-NEXT:    s_endpgm
 ;
-; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
-; GFX90a-PRELOAD-4-NEXT:  ; %bb.0:
-; GFX90a-PRELOAD-4-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-PRELOAD-4-NEXT:    v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-PRELOAD-4-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
-; GFX90a-PRELOAD-4-NEXT:    s_endpgm
-;
 ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
 ; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
@@ -2317,3 +1408,1115 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
   store double %in, ptr addrspace(1) %out
   ret void
 }
+
+define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) %out, half %in) {
+; GFX940-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) %out, bfloat %in) {
+; GFX940-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store bfloat %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v2bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <2 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v3bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <3 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) %out, <6 x bfloat> %in) {
+; GFX940-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v6bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <6 x bfloat> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s10, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s11
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s10, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s3
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[10:11] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: half_v7bfloat_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[0:1] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store half %in, ptr addrspace(1) %out
+  store <7 x bfloat> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) %out, i1 %in) {
+; GFX940-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s4, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_and_b32 s0, s4, 1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i1_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_and_b32 s2, s2, 1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i1_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_and_b32 s0, s8, 1
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i1 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 %in) {
+; GFX940-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: fp128_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: fp128_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store fp128 %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 8
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1] offset:4
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-2-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-2-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 8
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s9, 8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v2, v1, s[6:7] offset:4
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x i8> %in, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) %out, <7 x half> %in) {
+; GFX940-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[2:3] offset:12 sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: v7half_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v1, s[6:7] offset:12
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: v7half_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store <7 x half> %in, ptr addrspace(1) %out
+  ret void
+}
+
+; Test when previous argument was not dword aligned.
+define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s7
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0xc
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s3
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dword v0, v2, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0xc
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-PRELOAD-8-NEXT:    global_store_dword v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i32 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s7, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x20
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s7
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s4
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x20
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s8
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s9
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s10
+; GFX940-PRELOAD-2-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-PRELOAD-8-NEXT:    global_store_short v3, v4, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s3, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v4, s3
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_dwordx3 v3, v[0:2], s[8:9]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    global_store_dwordx3 v3, v[0:2], s[4:5]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v3i32_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v3, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v3, v4, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-8-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <3 x i32> %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_i16_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store i16 %in2, ptr addrspace(1) %out2
+  ret void
+}
+
+define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) {
+; GFX940-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-NO-PRELOAD:       ; %bb.0:
+; GFX940-NO-PRELOAD-NEXT:    s_load_dword s6, s[0:1], 0x8
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NO-PRELOAD-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x10
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX940-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[4:5] sc0 sc1
+; GFX940-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-2-NEXT:    s_load_dword s5, s[0:1], 0x8
+; GFX940-PRELOAD-2-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x10
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
+; GFX940-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX940-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX940-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX940-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 24
+; GFX940-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX940-PRELOAD-8-NEXT:    s_lshr_b32 s0, s4, 16
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX940-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v2, s[2:3] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-PRELOAD-8-NEXT:    s_endpgm
+;
+; GFX90a-NO-PRELOAD-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-NO-PRELOAD:       ; %bb.0:
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90a-NO-PRELOAD-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NO-PRELOAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-NO-PRELOAD-NEXT:    v_mov_b32_e32 v1, s6
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX90a-NO-PRELOAD-NEXT:    global_store_short_d16_hi v0, v1, s[2:3]
+; GFX90a-NO-PRELOAD-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-2-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-2:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-2-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-2-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX90a-PRELOAD-2-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-PRELOAD-2-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-PRELOAD-2-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90a-PRELOAD-2-NEXT:    v_mov_b32_e32 v1, s2
+; GFX90a-PRELOAD-2-NEXT:    global_store_short_d16_hi v0, v1, s[0:1]
+; GFX90a-PRELOAD-2-NEXT:    s_endpgm
+;
+; GFX90a-PRELOAD-8-LABEL: i16_v2i8_kernel_preload_arg:
+; GFX90a-PRELOAD-8:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
+; GFX90a-PRELOAD-8-NEXT:  ; %bb.0:
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 24
+; GFX90a-PRELOAD-8-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
+; GFX90a-PRELOAD-8-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90a-PRELOAD-8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90a-PRELOAD-8-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v2, s[6:7]
+; GFX90a-PRELOAD-8-NEXT:    global_store_short v1, v0, s[10:11]
+; GFX90a-PRELOAD-8-NEXT:    s_endpgm
+  store i16 %in, ptr addrspace(1) %out
+  store <2 x i8> %in2, ptr addrspace(1) %out2
+  ret void
+}

From 8de7890572296830b27b6e6db39b36810bc98c31 Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Sun, 19 May 2024 22:22:47 -0700
Subject: [PATCH 30/60] [ThinLTO] Populate declaration import status except for
 distributed ThinLTO under a default-off new option (#88024)

The goal is to populate `declaration` import status if a new flag`-import-declaration` is on.

* For in-process ThinLTO, the `declaration` status is visible to backend
`function-import` pass, so `FunctionImporter::importFunctions` should
read the import status and be no-op for declaration summaries.
Basically, the postlink pipeline is updated to keep its current behavior
(import definitions), but not updated to handle `declaration` summaries.
Two use cases (better call-graph sort and cross-module auto-init)
would use this bit differently.

* For distributed ThinLTO, the `declaration` status is not serialized to
bitcode. As discussed, https://github.com/llvm/llvm-project/pull/87600
will do this.

[1] https://discourse.llvm.org/t/rfc-for-better-call-graph-sort-build-a-more-complete-call-graph-by-adding-more-indirect-call-edges/74029#support-cross-module-function-declaration-import-5
[2] https://github.com/llvm/llvm-project/pull/87597#discussion_r1556067195
---
 llvm/include/llvm/IR/ModuleSummaryIndex.h     |   7 +
 .../llvm/Transforms/IPO/FunctionImport.h      |  15 +-
 llvm/lib/LTO/LTO.cpp                          |  32 ++-
 llvm/lib/LTO/LTOBackend.cpp                   |   9 +-
 llvm/lib/Transforms/IPO/FunctionImport.cpp    | 270 ++++++++++++++----
 llvm/test/ThinLTO/X86/funcimport-stats.ll     |   4 +-
 .../ThinLTO/X86/import_callee_declaration.ll  | 180 ++++++++++++
 .../Transforms/FunctionImport/funcimport.ll   |   5 +-
 llvm/tools/llvm-link/llvm-link.cpp            |   6 +-
 9 files changed, 443 insertions(+), 85 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/import_callee_declaration.ll

diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 5d137d4b3553cf..a6bb261af75226 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -587,6 +587,10 @@ class GlobalValueSummary {
 
   void setImportKind(ImportKind IK) { Flags.ImportType = IK; }
 
+  GlobalValueSummary::ImportKind importType() const {
+    return static_cast<ImportKind>(Flags.ImportType);
+  }
+
   GlobalValue::VisibilityTypes getVisibility() const {
     return (GlobalValue::VisibilityTypes)Flags.Visibility;
   }
@@ -1272,6 +1276,9 @@ using ModulePathStringTableTy = StringMap<ModuleHash>;
 /// a particular module, and provide efficient access to their summary.
 using GVSummaryMapTy = DenseMap<GlobalValue::GUID, GlobalValueSummary *>;
 
+/// A set of global value summary pointers.
+using GVSummaryPtrSet = SmallPtrSet<GlobalValueSummary *, 4>;
+
 /// Map of a type GUID to type id string and summary (multimap used
 /// in case of GUID conflicts).
 using TypeIdSummaryMapTy =
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index c4d19e8641eca2..024bba8105b897 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -31,9 +31,9 @@ class Module;
 /// based on the provided summary informations.
 class FunctionImporter {
 public:
-  /// Set of functions to import from a source module. Each entry is a set
-  /// containing all the GUIDs of all functions to import for a source module.
-  using FunctionsToImportTy = std::unordered_set<GlobalValue::GUID>;
+  /// The functions to import from a source module and their import type.
+  using FunctionsToImportTy =
+      DenseMap<GlobalValue::GUID, GlobalValueSummary::ImportKind>;
 
   /// The different reasons selectCallee will chose not to import a
   /// candidate.
@@ -99,8 +99,13 @@ class FunctionImporter {
   /// index's module path string table).
   using ImportMapTy = DenseMap<StringRef, FunctionsToImportTy>;
 
-  /// The set contains an entry for every global value the module exports.
-  using ExportSetTy = DenseSet<ValueInfo>;
+  /// The map contains an entry for every global value the module exports.
+  /// The key is ValueInfo, and the value indicates whether the definition
+  /// or declaration is visible to another module. If a function's definition is
+  /// visible to other modules, the global values this function referenced are
+  /// visible and shouldn't be internalized.
+  /// TODO: Rename to `ExportMapTy`.
+  using ExportSetTy = DenseMap<ValueInfo, GlobalValueSummary::ImportKind>;
 
   /// A function of this type is used to load modules referenced by the index.
   using ModuleLoaderTy =
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 5c603ac6ab4728..e2754d74979e8c 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -121,6 +121,9 @@ void llvm::computeLTOCacheKey(
     support::endian::write64le(Data, I);
     Hasher.update(Data);
   };
+  auto AddUint8 = [&](const uint8_t I) {
+    Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&I, 1));
+  };
   AddString(Conf.CPU);
   // FIXME: Hash more of Options. For now all clients initialize Options from
   // command-line flags (which is unsupported in production), but may set
@@ -156,18 +159,18 @@ void llvm::computeLTOCacheKey(
   auto ModHash = Index.getModuleHash(ModuleID);
   Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
 
-  std::vector<uint64_t> ExportsGUID;
+  std::vector<std::pair<uint64_t, uint8_t>> ExportsGUID;
   ExportsGUID.reserve(ExportList.size());
-  for (const auto &VI : ExportList) {
-    auto GUID = VI.getGUID();
-    ExportsGUID.push_back(GUID);
-  }
+  for (const auto &[VI, ExportType] : ExportList)
+    ExportsGUID.push_back(
+        std::make_pair(VI.getGUID(), static_cast<uint8_t>(ExportType)));
 
   // Sort the export list elements GUIDs.
   llvm::sort(ExportsGUID);
-  for (uint64_t GUID : ExportsGUID) {
+  for (auto [GUID, ExportType] : ExportsGUID) {
     // The export list can impact the internalization, be conservative here
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));
+    AddUint8(ExportType);
   }
 
   // Include the hash for every module we import functions from. The set of
@@ -199,7 +202,7 @@ void llvm::computeLTOCacheKey(
              [](const ImportModule &Lhs, const ImportModule &Rhs) -> bool {
                return Lhs.getHash() < Rhs.getHash();
              });
-  std::vector<uint64_t> ImportedGUIDs;
+  std::vector<std::pair<uint64_t, uint8_t>> ImportedGUIDs;
   for (const ImportModule &Entry : ImportModulesVector) {
     auto ModHash = Entry.getHash();
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
@@ -207,11 +210,13 @@ void llvm::computeLTOCacheKey(
     AddUint64(Entry.getFunctions().size());
 
     ImportedGUIDs.clear();
-    for (auto &Fn : Entry.getFunctions())
-      ImportedGUIDs.push_back(Fn);
+    for (auto &[Fn, ImportType] : Entry.getFunctions())
+      ImportedGUIDs.push_back(std::make_pair(Fn, ImportType));
     llvm::sort(ImportedGUIDs);
-    for (auto &GUID : ImportedGUIDs)
+    for (auto &[GUID, Type] : ImportedGUIDs) {
       AddUint64(GUID);
+      AddUint8(Type);
+    }
   }
 
   // Include the hash for the resolved ODR.
@@ -281,9 +286,9 @@ void llvm::computeLTOCacheKey(
   // Imported functions may introduce new uses of type identifier resolutions,
   // so we need to collect their used resolutions as well.
   for (const ImportModule &ImpM : ImportModulesVector)
-    for (auto &ImpF : ImpM.getFunctions()) {
+    for (auto &[GUID, UnusedImportType] : ImpM.getFunctions()) {
       GlobalValueSummary *S =
-          Index.findSummaryInModule(ImpF, ImpM.getIdentifier());
+          Index.findSummaryInModule(GUID, ImpM.getIdentifier());
       AddUsedThings(S);
       // If this is an alias, we also care about any types/etc. that the aliasee
       // may reference.
@@ -1395,6 +1400,7 @@ class lto::ThinBackendProc {
                   llvm::StringRef ModulePath,
                   const std::string &NewModulePath) {
     std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+
     std::error_code EC;
     gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
                                      ImportList, ModuleToSummariesForIndex);
@@ -1403,6 +1409,8 @@ class lto::ThinBackendProc {
                       sys::fs::OpenFlags::OF_None);
     if (EC)
       return errorCodeToError(EC);
+
+    // TODO: Serialize declaration bits to bitcode.
     writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
 
     if (ShouldEmitImportsFiles) {
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d4b89ede2d7134..58434feec6f969 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -721,7 +721,14 @@ bool lto::initImportList(const Module &M,
       if (Summary->modulePath() == M.getModuleIdentifier())
         continue;
       // Add an entry to provoke importing by thinBackend.
-      ImportList[Summary->modulePath()].insert(GUID);
+      // Try emplace the entry first. If an entry with the same key already
+      // exists, set the value to 'std::min(existing-value, new-value)' to make
+      // sure a definition takes precedence over a declaration.
+      auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace(
+          GUID, Summary->importType());
+
+      if (!Inserted)
+        Iter->second = std::min(Iter->second, Summary->importType());
     }
   }
   return true;
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 68f9799616ae6d..a116fd65353471 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -140,6 +140,17 @@ static cl::opt<bool>
     ImportAllIndex("import-all-index",
                    cl::desc("Import all external functions in index."));
 
+/// This is a test-only option.
+/// If this option is enabled, the ThinLTO indexing step will import each
+/// function declaration as a fallback. In a real build this may increase ram
+/// usage of the indexing step unnecessarily.
+/// TODO: Implement selective import (based on combined summary analysis) to
+/// ensure the imported function has a use case in the postlink pipeline.
+static cl::opt<bool> ImportDeclaration(
+    "import-declaration", cl::init(false), cl::Hidden,
+    cl::desc("If true, import function declaration as fallback if the function "
+             "definition is not imported."));
+
 /// Pass a workload description file - an example of workload would be the
 /// functions executed to satisfy a RPC request. A workload is defined by a root
 /// function and the list of functions that are (frequently) needed to satisfy
@@ -245,8 +256,12 @@ static auto qualifyCalleeCandidates(
 }
 
 /// Given a list of possible callee implementation for a call site, select one
-/// that fits the \p Threshold. If none are found, the Reason will give the last
-/// reason for the failure (last, in the order of CalleeSummaryList entries).
+/// that fits the \p Threshold for function definition import. If none are
+/// found, the Reason will give the last reason for the failure (last, in the
+/// order of CalleeSummaryList entries). While looking for a callee definition,
+/// sets \p TooLargeOrNoInlineSummary to the last seen too-large or noinline
+/// candidate; other modules may want to know the function summary or
+/// declaration even if a definition is not needed.
 ///
 /// FIXME: select "best" instead of first that fits. But what is "best"?
 /// - The smallest: more likely to be inlined.
@@ -259,24 +274,32 @@ static const GlobalValueSummary *
 selectCallee(const ModuleSummaryIndex &Index,
              ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
              unsigned Threshold, StringRef CallerModulePath,
+             const GlobalValueSummary *&TooLargeOrNoInlineSummary,
              FunctionImporter::ImportFailureReason &Reason) {
+  // Records the last summary with reason noinline or too-large.
+  TooLargeOrNoInlineSummary = nullptr;
   auto QualifiedCandidates =
       qualifyCalleeCandidates(Index, CalleeSummaryList, CallerModulePath);
   for (auto QualifiedValue : QualifiedCandidates) {
     Reason = QualifiedValue.first;
+    // Skip a summary if its import is not (proved to be) legal.
     if (Reason != FunctionImporter::ImportFailureReason::None)
       continue;
     auto *Summary =
         cast<FunctionSummary>(QualifiedValue.second->getBaseObject());
 
+    // Don't bother importing the definition if the chance of inlining it is
+    // not high enough (except under `--force-import-all`).
     if ((Summary->instCount() > Threshold) && !Summary->fflags().AlwaysInline &&
         !ForceImportAll) {
+      TooLargeOrNoInlineSummary = Summary;
       Reason = FunctionImporter::ImportFailureReason::TooLarge;
       continue;
     }
 
-    // Don't bother importing if we can't inline it anyway.
+    // Don't bother importing the definition if we can't inline it anyway.
     if (Summary->fflags().NoInline && !ForceImportAll) {
+      TooLargeOrNoInlineSummary = Summary;
       Reason = FunctionImporter::ImportFailureReason::NoInline;
       continue;
     }
@@ -358,17 +381,27 @@ class GlobalsImporter final {
         if (!GVS || !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true) ||
             LocalNotInModule(GVS))
           continue;
-        auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+
+        // If there isn't an entry for GUID, insert <GUID, Definition> pair.
+        // Otherwise, definition should take precedence over declaration.
+        auto [Iter, Inserted] =
+            ImportList[RefSummary->modulePath()].try_emplace(
+                VI.getGUID(), GlobalValueSummary::Definition);
         // Only update stat and exports if we haven't already imported this
         // variable.
-        if (!ILI.second)
+        if (!Inserted) {
+          // Set the value to 'std::min(existing-value, new-value)' to make
+          // sure a definition takes precedence over a declaration.
+          Iter->second = std::min(GlobalValueSummary::Definition, Iter->second);
           break;
+        }
         NumImportedGlobalVarsThinLink++;
         // Any references made by this variable will be marked exported
         // later, in ComputeCrossModuleImport, after import decisions are
         // complete, which is more efficient than adding them here.
         if (ExportLists)
-          (*ExportLists)[RefSummary->modulePath()].insert(VI);
+          (*ExportLists)[RefSummary->modulePath()][VI] =
+              GlobalValueSummary::Definition;
 
         // If variable is not writeonly we attempt to recursively analyze
         // its references in order to import referenced constants.
@@ -545,10 +578,11 @@ class WorkloadImportsManager : public ModuleImportsManager {
       LLVM_DEBUG(dbgs() << "[Workload][Including]" << VI.name() << " from "
                         << ExportingModule << " : "
                         << Function::getGUID(VI.name()) << "\n");
-      ImportList[ExportingModule].insert(VI.getGUID());
+      ImportList[ExportingModule][VI.getGUID()] =
+          GlobalValueSummary::Definition;
       GVI.onImportingSummary(*GVS);
       if (ExportLists)
-        (*ExportLists)[ExportingModule].insert(VI);
+        (*ExportLists)[ExportingModule][VI] = GlobalValueSummary::Definition;
     }
     LLVM_DEBUG(dbgs() << "[Workload] Done\n");
   }
@@ -769,9 +803,28 @@ static void computeImportForFunction(
       }
 
       FunctionImporter::ImportFailureReason Reason{};
-      CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
-                                   Summary.modulePath(), Reason);
+
+      // `SummaryForDeclImport` is an summary eligible for declaration import.
+      const GlobalValueSummary *SummaryForDeclImport = nullptr;
+      CalleeSummary =
+          selectCallee(Index, VI.getSummaryList(), NewThreshold,
+                       Summary.modulePath(), SummaryForDeclImport, Reason);
       if (!CalleeSummary) {
+        // There isn't a callee for definition import but one for declaration
+        // import.
+        if (ImportDeclaration && SummaryForDeclImport) {
+          StringRef DeclSourceModule = SummaryForDeclImport->modulePath();
+
+          // Since definition takes precedence over declaration for the same VI,
+          // try emplace <VI, declaration> pair without checking insert result.
+          // If insert doesn't happen, there must be an existing entry keyed by
+          // VI.
+          if (ExportLists)
+            (*ExportLists)[DeclSourceModule].try_emplace(
+                VI, GlobalValueSummary::Declaration);
+          ImportList[DeclSourceModule].try_emplace(
+              VI.getGUID(), GlobalValueSummary::Declaration);
+        }
         // Update with new larger threshold if this was a retry (otherwise
         // we would have already inserted with NewThreshold above). Also
         // update failure info if requested.
@@ -816,11 +869,15 @@ static void computeImportForFunction(
              "selectCallee() didn't honor the threshold");
 
       auto ExportModulePath = ResolvedCalleeSummary->modulePath();
-      auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
+
+      // Try emplace the definition entry, and update stats based on insertion
+      // status.
+      auto [Iter, Inserted] = ImportList[ExportModulePath].try_emplace(
+          VI.getGUID(), GlobalValueSummary::Definition);
+
       // We previously decided to import this GUID definition if it was already
       // inserted in the set of imports from the exporting module.
-      bool PreviouslyImported = !ILI.second;
-      if (!PreviouslyImported) {
+      if (Inserted || Iter->second == GlobalValueSummary::Declaration) {
         NumImportedFunctionsThinLink++;
         if (IsHotCallsite)
           NumImportedHotFunctionsThinLink++;
@@ -828,11 +885,14 @@ static void computeImportForFunction(
           NumImportedCriticalFunctionsThinLink++;
       }
 
+      if (Iter->second == GlobalValueSummary::Declaration)
+        Iter->second = GlobalValueSummary::Definition;
+
       // Any calls/references made by this function will be marked exported
       // later, in ComputeCrossModuleImport, after import decisions are
       // complete, which is more efficient than adding them here.
       if (ExportLists)
-        (*ExportLists)[ExportModulePath].insert(VI);
+        (*ExportLists)[ExportModulePath][VI] = GlobalValueSummary::Definition;
     }
 
     auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
@@ -939,12 +999,20 @@ static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
 }
 
 template <class T>
-static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
-                                      T &Cont) {
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, T &Cont,
+                                      unsigned &DefinedGVS,
+                                      unsigned &DefinedFS) {
   unsigned NumGVS = 0;
-  for (auto &V : Cont)
-    if (isGlobalVarSummary(Index, V))
+  DefinedGVS = 0;
+  DefinedFS = 0;
+  for (auto &[GUID, Type] : Cont) {
+    if (isGlobalVarSummary(Index, GUID)) {
+      if (Type == GlobalValueSummary::Definition)
+        ++DefinedGVS;
       ++NumGVS;
+    } else if (Type == GlobalValueSummary::Definition)
+      ++DefinedFS;
+  }
   return NumGVS;
 }
 #endif
@@ -954,13 +1022,12 @@ static bool checkVariableImport(
     const ModuleSummaryIndex &Index,
     DenseMap<StringRef, FunctionImporter::ImportMapTy> &ImportLists,
     DenseMap<StringRef, FunctionImporter::ExportSetTy> &ExportLists) {
-
   DenseSet<GlobalValue::GUID> FlattenedImports;
 
   for (auto &ImportPerModule : ImportLists)
     for (auto &ExportPerModule : ImportPerModule.second)
-      FlattenedImports.insert(ExportPerModule.second.begin(),
-                              ExportPerModule.second.end());
+      for (auto &[GUID, Type] : ExportPerModule.second)
+        FlattenedImports.insert(GUID);
 
   // Checks that all GUIDs of read/writeonly vars we see in export lists
   // are also in the import lists. Otherwise we my face linker undefs,
@@ -979,7 +1046,7 @@ static bool checkVariableImport(
   };
 
   for (auto &ExportPerModule : ExportLists)
-    for (auto &VI : ExportPerModule.second)
+    for (auto &[VI, Unused] : ExportPerModule.second)
       if (!FlattenedImports.count(VI.getGUID()) &&
           IsReadOrWriteOnlyVarNeedingImporting(ExportPerModule.first, VI))
         return false;
@@ -1015,7 +1082,11 @@ void llvm::ComputeCrossModuleImport(
     FunctionImporter::ExportSetTy NewExports;
     const auto &DefinedGVSummaries =
         ModuleToDefinedGVSummaries.lookup(ELI.first);
-    for (auto &EI : ELI.second) {
+    for (auto &[EI, Type] : ELI.second) {
+      // If a variable is exported as a declaration, its 'refs' and 'calls' are
+      // not further exported.
+      if (Type == GlobalValueSummary::Declaration)
+        continue;
       // Find the copy defined in the exporting module so that we can mark the
       // values it references in that specific definition as exported.
       // Below we will add all references and called values, without regard to
@@ -1034,22 +1105,31 @@ void llvm::ComputeCrossModuleImport(
         // we convert such variables initializers to "zeroinitializer".
         // See processGlobalForThinLTO.
         if (!Index.isWriteOnly(GVS))
-          for (const auto &VI : GVS->refs())
-            NewExports.insert(VI);
+          for (const auto &VI : GVS->refs()) {
+            // Try to emplace the declaration entry. If a definition entry
+            // already exists for key `VI`, this is a no-op.
+            NewExports.try_emplace(VI, GlobalValueSummary::Declaration);
+          }
       } else {
         auto *FS = cast<FunctionSummary>(S);
-        for (const auto &Edge : FS->calls())
-          NewExports.insert(Edge.first);
-        for (const auto &Ref : FS->refs())
-          NewExports.insert(Ref);
+        for (const auto &Edge : FS->calls()) {
+          // Try to emplace the declaration entry. If a definition entry
+          // already exists for key `VI`, this is a no-op.
+          NewExports.try_emplace(Edge.first, GlobalValueSummary::Declaration);
+        }
+        for (const auto &Ref : FS->refs()) {
+          // Try to emplace the declaration entry. If a definition entry
+          // already exists for key `VI`, this is a no-op.
+          NewExports.try_emplace(Ref, GlobalValueSummary::Declaration);
+        }
       }
     }
-    // Prune list computed above to only include values defined in the exporting
-    // module. We do this after the above insertion since we may hit the same
-    // ref/call target multiple times in above loop, and it is more efficient to
-    // avoid a set lookup each time.
+    // Prune list computed above to only include values defined in the
+    // exporting module. We do this after the above insertion since we may hit
+    // the same ref/call target multiple times in above loop, and it is more
+    // efficient to avoid a set lookup each time.
     for (auto EI = NewExports.begin(); EI != NewExports.end();) {
-      if (!DefinedGVSummaries.count(EI->getGUID()))
+      if (!DefinedGVSummaries.count(EI->first.getGUID()))
         NewExports.erase(EI++);
       else
         ++EI;
@@ -1064,18 +1144,29 @@ void llvm::ComputeCrossModuleImport(
   for (auto &ModuleImports : ImportLists) {
     auto ModName = ModuleImports.first;
     auto &Exports = ExportLists[ModName];
-    unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
-    LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
-                      << Exports.size() - NumGVS << " functions and " << NumGVS
-                      << " vars. Imports from " << ModuleImports.second.size()
-                      << " modules.\n");
+    unsigned DefinedGVS = 0, DefinedFS = 0;
+    unsigned NumGVS =
+        numGlobalVarSummaries(Index, Exports, DefinedGVS, DefinedFS);
+    LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports " << DefinedFS
+                      << " function as definitions, "
+                      << Exports.size() - NumGVS - DefinedFS
+                      << " functions as declarations, " << DefinedGVS
+                      << " var definitions and " << NumGVS - DefinedGVS
+                      << " var declarations. Imports from "
+                      << ModuleImports.second.size() << " modules.\n");
     for (auto &Src : ModuleImports.second) {
       auto SrcModName = Src.first;
-      unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
-      LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
-                        << " functions imported from " << SrcModName << "\n");
-      LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
-                        << " global vars imported from " << SrcModName << "\n");
+      unsigned DefinedGVS = 0, DefinedFS = 0;
+      unsigned NumGVSPerMod =
+          numGlobalVarSummaries(Index, Src.second, DefinedGVS, DefinedFS);
+      LLVM_DEBUG(dbgs() << " - " << DefinedFS << " function definitions and "
+                        << Src.second.size() - NumGVSPerMod - DefinedFS
+                        << " function declarations imported from " << SrcModName
+                        << "\n");
+      LLVM_DEBUG(dbgs() << " - " << DefinedGVS << " global vars definition and "
+                        << NumGVSPerMod - DefinedGVS
+                        << " global vars declaration imported from "
+                        << SrcModName << "\n");
     }
   }
 #endif
@@ -1089,11 +1180,17 @@ static void dumpImportListForModule(const ModuleSummaryIndex &Index,
                     << ImportList.size() << " modules.\n");
   for (auto &Src : ImportList) {
     auto SrcModName = Src.first;
-    unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
-    LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
-                      << " functions imported from " << SrcModName << "\n");
-    LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
-                      << SrcModName << "\n");
+    unsigned DefinedGVS = 0, DefinedFS = 0;
+    unsigned NumGVSPerMod =
+        numGlobalVarSummaries(Index, Src.second, DefinedGVS, DefinedFS);
+    LLVM_DEBUG(dbgs() << " - " << DefinedFS << " function definitions and "
+                      << Src.second.size() - DefinedFS - NumGVSPerMod
+                      << " function declarations imported from " << SrcModName
+                      << "\n");
+    LLVM_DEBUG(dbgs() << " - " << DefinedGVS << " var definitions and "
+                      << NumGVSPerMod - DefinedGVS
+                      << " var declarations imported from " << SrcModName
+                      << "\n");
   }
 }
 #endif
@@ -1149,7 +1246,13 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest(
     if (Summary->modulePath() == ModulePath)
       continue;
     // Add an entry to provoke importing by thinBackend.
-    ImportList[Summary->modulePath()].insert(GUID);
+    auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace(
+        GUID, Summary->importType());
+    if (!Inserted) {
+      // Use 'std::min' to make sure definition (with enum value 0) takes
+      // precedence over declaration (with enum value 1).
+      Iter->second = std::min(Iter->second, Summary->importType());
+    }
   }
 #ifndef NDEBUG
   dumpImportListForModule(Index, ModulePath, ImportList);
@@ -1339,13 +1442,17 @@ void llvm::gatherImportedSummariesForModule(
   // Include summaries for imports.
   for (const auto &ILI : ImportList) {
     auto &SummariesForIndex = ModuleToSummariesForIndex[std::string(ILI.first)];
+
     const auto &DefinedGVSummaries =
         ModuleToDefinedGVSummaries.lookup(ILI.first);
-    for (const auto &GI : ILI.second) {
-      const auto &DS = DefinedGVSummaries.find(GI);
+    for (const auto &[GUID, Type] : ILI.second) {
+      const auto &DS = DefinedGVSummaries.find(GUID);
       assert(DS != DefinedGVSummaries.end() &&
              "Expected a defined summary for imported global value");
-      SummariesForIndex[GI] = DS->second;
+      if (Type == GlobalValueSummary::Declaration)
+        continue;
+
+      SummariesForIndex[GUID] = DS->second;
     }
   }
 }
@@ -1617,6 +1724,16 @@ Expected<bool> FunctionImporter::importFunctions(
   for (const auto &FunctionsToImportPerModule : ImportList) {
     ModuleNameOrderedList.insert(FunctionsToImportPerModule.first);
   }
+
+  auto getImportType = [&](const FunctionsToImportTy &GUIDToImportType,
+                           GlobalValue::GUID GUID)
+      -> std::optional<GlobalValueSummary::ImportKind> {
+    auto Iter = GUIDToImportType.find(GUID);
+    if (Iter == GUIDToImportType.end())
+      return std::nullopt;
+    return Iter->second;
+  };
+
   for (const auto &Name : ModuleNameOrderedList) {
     // Get the module for the import
     const auto &FunctionsToImportPerModule = ImportList.find(Name);
@@ -1634,17 +1751,27 @@ Expected<bool> FunctionImporter::importFunctions(
       return std::move(Err);
 
     auto &ImportGUIDs = FunctionsToImportPerModule->second;
+
     // Find the globals to import
     SetVector<GlobalValue *> GlobalsToImport;
     for (Function &F : *SrcModule) {
       if (!F.hasName())
         continue;
       auto GUID = F.getGUID();
-      auto Import = ImportGUIDs.count(GUID);
-      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
+      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
+
+      bool ImportDefinition =
+          (MaybeImportType &&
+           (*MaybeImportType == GlobalValueSummary::Definition));
+
+      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                        << " importing function"
+                        << (ImportDefinition
+                                ? " definition "
+                                : (MaybeImportType ? " declaration " : " "))
                         << GUID << " " << F.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (Import) {
+      if (ImportDefinition) {
         if (Error Err = F.materialize())
           return std::move(Err);
         // MemProf should match function's definition and summary,
@@ -1670,11 +1797,20 @@ Expected<bool> FunctionImporter::importFunctions(
       if (!GV.hasName())
         continue;
       auto GUID = GV.getGUID();
-      auto Import = ImportGUIDs.count(GUID);
-      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
+      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
+
+      bool ImportDefinition =
+          (MaybeImportType &&
+           (*MaybeImportType == GlobalValueSummary::Definition));
+
+      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                        << " importing global"
+                        << (ImportDefinition
+                                ? " definition "
+                                : (MaybeImportType ? " declaration " : " "))
                         << GUID << " " << GV.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (Import) {
+      if (ImportDefinition) {
         if (Error Err = GV.materialize())
           return std::move(Err);
         ImportedGVCount += GlobalsToImport.insert(&GV);
@@ -1684,11 +1820,20 @@ Expected<bool> FunctionImporter::importFunctions(
       if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
         continue;
       auto GUID = GA.getGUID();
-      auto Import = ImportGUIDs.count(GUID);
-      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
+      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
+
+      bool ImportDefinition =
+          (MaybeImportType &&
+           (*MaybeImportType == GlobalValueSummary::Definition));
+
+      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
+                        << " importing alias"
+                        << (ImportDefinition
+                                ? " definition "
+                                : (MaybeImportType ? " declaration " : " "))
                         << GUID << " " << GA.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (Import) {
+      if (ImportDefinition) {
         if (Error Err = GA.materialize())
           return std::move(Err);
         // Import alias as a copy of its aliasee.
@@ -1754,6 +1899,7 @@ Expected<bool> FunctionImporter::importFunctions(
   NumImportedFunctions += (ImportedCount - ImportedGVCount);
   NumImportedGlobalVars += ImportedGVCount;
 
+  // TODO: Print counters for definitions and declarations in the debugging log.
   LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
                     << " functions for Module "
                     << DestModule.getModuleIdentifier() << "\n");
diff --git a/llvm/test/ThinLTO/X86/funcimport-stats.ll b/llvm/test/ThinLTO/X86/funcimport-stats.ll
index 913b13004c1c24..7fcd33855fe1ab 100644
--- a/llvm/test/ThinLTO/X86/funcimport-stats.ll
+++ b/llvm/test/ThinLTO/X86/funcimport-stats.ll
@@ -9,8 +9,8 @@
 ; RUN: cat %t4 | grep 'Is importing aliasee' | count 1
 ; RUN: cat %t4 | FileCheck %s
 
-; CHECK:      - [[NUM_FUNCS:[0-9]+]] functions imported from
-; CHECK-NEXT: - [[NUM_VARS:[0-9]+]] global vars imported from
+; CHECK:      - [[NUM_FUNCS:[0-9]+]] function definitions and 0 function declarations imported from
+; CHECK-NEXT: - [[NUM_VARS:[0-9]+]] global vars definition and 0 global vars declaration imported from
 
 ; CHECK:      [[NUM_FUNCS]] function-import - Number of functions imported in backend
 ; CHECK-NEXT: [[NUM_FUNCS]] function-import - Number of functions thin link decided to import
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
new file mode 100644
index 00000000000000..df8a9ce6f7109a
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
@@ -0,0 +1,180 @@
+; "-debug-only" requires asserts.
+; REQUIRES: asserts
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+; Generate per-module summaries.
+; RUN: opt -module-summary main.ll -o main.bc
+; RUN: opt -module-summary lib.ll -o lib.bc
+
+; Generate the combined summary and distributed indices.
+
+; - For function import, set 'import-instr-limit' to 7 and fall back to import
+;   function declarations.
+; - In main.ll, function 'main' calls 'small_func' and 'large_func'. Both callees
+;   are defined in lib.ll. 'small_func' has two indirect callees, one is smaller
+;   and the other one is larger. Both callees of 'small_func' are defined in lib.ll.
+; - Given the import limit, in main's combined summary, the import type of 'small_func'
+;   and 'small_indirect_callee' will be 'definition', and the import type of
+;   'large_func' and 'large_indirect_callee' will be 'declaration'.
+;
+; The test will disassemble combined summaries and check the import type is
+; correct. Right now postlink optimizer pipeline doesn't do anything (e.g.,
+; import the declaration or de-serialize summary attributes yet) so there is
+; nothing to test more than the summary content.
+;
+; RUN: llvm-lto2 run \
+; RUN:   -debug-only=function-import \
+; RUN:   -import-instr-limit=7 \
+; RUN:   -import-declaration \
+; RUN:   -thinlto-distributed-indexes \
+; RUN:   -r=main.bc,main,px \
+; RUN:   -r=main.bc,small_func, \
+; RUN:   -r=main.bc,large_func, \
+; RUN:   -r=lib.bc,callee,pl \
+; RUN:   -r=lib.bc,large_indirect_callee,px \
+; RUN:   -r=lib.bc,small_func,px \
+; RUN:   -r=lib.bc,large_func,px \
+; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
+; RUN:   -r=lib.bc,calleeAddrs,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+;
+; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7  -o combined.index.bc main.bc lib.bc
+; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; DUMP: - 2 function definitions and 3 function declarations imported from lib.bc
+
+; First disassemble per-module summary and find out the GUID for {large_func, large_indirect_callee}.
+;
+; RUN: llvm-dis lib.bc -o - | FileCheck %s --check-prefix=LIB-DIS
+; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
+; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
+; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
+;
+; Secondly disassemble main's combined summary and test that large callees are
+; not imported as declarations yet.
+;
+; RUN: llvm-dis main.bc.thinlto.bc -o - | FileCheck %s --check-prefix=MAIN-DIS
+;
+; MAIN-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
+; MAIN-DIS-NOT: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS-NOT: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS-NOT: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration)
+
+; Run in-process ThinLTO and tests that
+; 1. `callee` remains internalized even if the symbols of its callers
+; (large_func and large_indirect_callee) are exported as declarations and visible to main module.
+; 2. the debugging logs from `function-import` pass are expected.
+
+; RUN: llvm-lto2 run \
+; RUN:   -debug-only=function-import \
+; RUN:   -save-temps \
+; RUN:   -import-instr-limit=7 \
+; RUN:   -import-declaration \
+; RUN:   -r=main.bc,main,px \
+; RUN:   -r=main.bc,small_func, \
+; RUN:   -r=main.bc,large_func, \
+; RUN:   -r=lib.bc,callee,pl \
+; RUN:   -r=lib.bc,large_indirect_callee,px \
+; RUN:   -r=lib.bc,small_func,px \
+; RUN:   -r=lib.bc,large_func,px \
+; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
+; RUN:   -r=lib.bc,calleeAddrs,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
+
+; Test import status from debugging logs.
+; TODO: Serialize declaration bit and test declaration bits are correctly set,
+; and extend this test case to test IR once postlink optimizer makes use of
+; the import type for declarations.
+; IMPORTDUMP-DAG: Not importing function 11825436545918268459 callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function definition 13568239288960714650 small_indirect_callee from lib.cc
+; IMPORTDUMP-DAG: Is importing function definition 6976996067367342685 small_func from lib.cc
+; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc
+; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc
+; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
+
+; RUN: llvm-dis in-process.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
+
+; RUN: llvm-dis in-process.2.2.internalize.bc -o - | FileCheck %s --check-prefix=INTERNALIZE
+
+; IMPORT-DAG: define available_externally void @small_func
+; IMPORT-DAG: define available_externally hidden void @small_indirect_callee
+; IMPORT-DAG: declare void @large_func
+; IMPORT-NOT: large_indirect_callee
+; IMPORT-NOT: large_indirect_callee_alias
+
+; INTERNALIZE: define internal void @callee()
+
+;--- main.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() {
+  call void @small_func()
+  call void @large_func()
+  ret i32 0
+}
+
+declare void @small_func()
+
+; large_func without attributes
+declare void @large_func()
+
+;--- lib.ll
+source_filename = "lib.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@calleeAddrs = global [3 x ptr] [ptr @large_indirect_callee, ptr @small_indirect_callee, ptr @large_indirect_callee_alias]
+
+define void @callee() #1 {
+  ret void
+}
+
+define void @large_indirect_callee()#2 {
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  ret void
+}
+
+define internal void @small_indirect_callee() #0 {
+  ret void
+}
+
+@large_indirect_callee_alias = alias void(), ptr @large_indirect_callee
+
+define void @small_func() {
+entry:
+  %0 = load ptr, ptr @calleeAddrs
+  call void %0(), !prof !0
+  %1 = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @calleeAddrs, i64 0, i64 1)
+  call void %1(), !prof !1
+  %2 = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @calleeAddrs, i64 0, i64 2)
+  call void %2(), !prof !2
+  ret void
+}
+
+define void @large_func() #0 {
+entry:
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  call void @callee()
+  ret void
+}
+
+attributes #0 = { nounwind norecurse }
+
+attributes #1 = { noinline }
+
+attributes #2 = { norecurse }
+
+!0 = !{!"VP", i32 0, i64 1, i64 14343440786664691134, i64 1}
+!1 = !{!"VP", i32 0, i64 1, i64 13568239288960714650, i64 1}
+!2 = !{!"VP", i32 0, i64 1, i64 16730173943625350469, i64 1}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport.ll b/llvm/test/Transforms/FunctionImport/funcimport.ll
index a0968a67f5ce84..635750b33fff00 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport.ll
@@ -166,7 +166,8 @@ declare void @variadic_va_start(...)
 ; GUID-DAG: GUID {{.*}} is linkoncefunc
 
 ; DUMP:       Module [[M1:.*]] imports from 1 module
-; DUMP-NEXT:  15 functions imported from [[M2:.*]]
-; DUMP-NEXT:  4 vars imported from [[M2]]
+; DUMP-NEXT:  15 function definitions and 0 function declarations imported from [[M2:.*]]
+; DUMP-NEXT:  4 var definitions and 0 var declarations imported from [[M2]]
+
 ; DUMP:       Imported 15 functions for Module [[M1]]
 ; DUMP-NEXT:  Imported 4 global variables for Module [[M1]]
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 7794f2d81ed064..1b90fce76fbd14 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -377,9 +377,13 @@ static bool importFunctions(const char *argv0, Module &DestModule) {
     if (Verbose)
       errs() << "Importing " << FunctionName << " from " << FileName << "\n";
 
+    // `-import` specifies the `<filename,function-name>` pairs to import as
+    // definition, so make the import type definition directly.
+    // FIXME: A follow-up patch should add test coverage for import declaration
+    // in `llvm-link` CLI (e.g., by introducing a new command line option).
     auto &Entry =
         ImportList[FileNameStringCache.insert(FileName).first->getKey()];
-    Entry.insert(F->getGUID());
+    Entry[F->getGUID()] = GlobalValueSummary::Definition;
   }
   auto CachedModuleLoader = [&](StringRef Identifier) {
     return ModuleLoaderCache.takeModule(std::string(Identifier));

From d316a0bd48ceb4a0ee851d729291a2cdcc8818eb Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Mon, 20 May 2024 13:35:52 +0800
Subject: [PATCH 31/60] [NFC] Remove unused ASTWriter::getTypeID

As the title suggests, the `ASTWriter:getTypeID` method is not used.
This patch removes it.
---
 clang/include/clang/Serialization/ASTWriter.h |  3 --
 clang/lib/Serialization/ASTCommon.h           | 24 ------------
 clang/lib/Serialization/ASTWriter.cpp         | 38 ++++++++++++-------
 3 files changed, 25 insertions(+), 40 deletions(-)

diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index 6aa2796a41e0c7..88192e439a3f05 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -715,9 +715,6 @@ class ASTWriter : public ASTDeserializationListener,
   /// Force a type to be emitted and get its ID.
   serialization::TypeID GetOrCreateTypeID(QualType T);
 
-  /// Determine the type ID of an already-emitted type.
-  serialization::TypeID getTypeID(QualType T) const;
-
   /// Find the first local declaration of a given local redeclarable
   /// decl.
   const Decl *getFirstLocalDecl(const Decl *D);
diff --git a/clang/lib/Serialization/ASTCommon.h b/clang/lib/Serialization/ASTCommon.h
index 296642e3674a49..0230908d3e0528 100644
--- a/clang/lib/Serialization/ASTCommon.h
+++ b/clang/lib/Serialization/ASTCommon.h
@@ -46,30 +46,6 @@ enum DeclUpdateKind {
 
 TypeIdx TypeIdxFromBuiltin(const BuiltinType *BT);
 
-template <typename IdxForTypeTy>
-TypeID MakeTypeID(ASTContext &Context, QualType T, IdxForTypeTy IdxForType) {
-  if (T.isNull())
-    return PREDEF_TYPE_NULL_ID;
-
-  unsigned FastQuals = T.getLocalFastQualifiers();
-  T.removeLocalFastQualifiers();
-
-  if (T.hasLocalNonFastQualifiers())
-    return IdxForType(T).asTypeID(FastQuals);
-
-  assert(!T.hasLocalQualifiers());
-
-  if (const BuiltinType *BT = dyn_cast<BuiltinType>(T.getTypePtr()))
-    return TypeIdxFromBuiltin(BT).asTypeID(FastQuals);
-
-  if (T == Context.AutoDeductTy)
-    return TypeIdx(PREDEF_TYPE_AUTO_DEDUCT).asTypeID(FastQuals);
-  if (T == Context.AutoRRefDeductTy)
-    return TypeIdx(PREDEF_TYPE_AUTO_RREF_DEDUCT).asTypeID(FastQuals);
-
-  return IdxForType(T).asTypeID(FastQuals);
-}
-
 unsigned ComputeHash(Selector Sel);
 
 /// Retrieve the "definitive" declaration that provides all of the
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 2a107e4c56a3a0..1d6d96932ba2cf 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -6074,6 +6074,31 @@ void ASTWriter::AddTypeRef(QualType T, RecordDataImpl &Record) {
   Record.push_back(GetOrCreateTypeID(T));
 }
 
+template <typename IdxForTypeTy>
+static TypeID MakeTypeID(ASTContext &Context, QualType T,
+                         IdxForTypeTy IdxForType) {
+  if (T.isNull())
+    return PREDEF_TYPE_NULL_ID;
+
+  unsigned FastQuals = T.getLocalFastQualifiers();
+  T.removeLocalFastQualifiers();
+
+  if (T.hasLocalNonFastQualifiers())
+    return IdxForType(T).asTypeID(FastQuals);
+
+  assert(!T.hasLocalQualifiers());
+
+  if (const BuiltinType *BT = dyn_cast<BuiltinType>(T.getTypePtr()))
+    return TypeIdxFromBuiltin(BT).asTypeID(FastQuals);
+
+  if (T == Context.AutoDeductTy)
+    return TypeIdx(PREDEF_TYPE_AUTO_DEDUCT).asTypeID(FastQuals);
+  if (T == Context.AutoRRefDeductTy)
+    return TypeIdx(PREDEF_TYPE_AUTO_RREF_DEDUCT).asTypeID(FastQuals);
+
+  return IdxForType(T).asTypeID(FastQuals);
+}
+
 TypeID ASTWriter::GetOrCreateTypeID(QualType T) {
   assert(Context);
   return MakeTypeID(*Context, T, [&](QualType T) -> TypeIdx {
@@ -6097,19 +6122,6 @@ TypeID ASTWriter::GetOrCreateTypeID(QualType T) {
   });
 }
 
-TypeID ASTWriter::getTypeID(QualType T) const {
-  assert(Context);
-  return MakeTypeID(*Context, T, [&](QualType T) -> TypeIdx {
-    if (T.isNull())
-      return TypeIdx();
-    assert(!T.getLocalFastQualifiers());
-
-    TypeIdxMap::const_iterator I = TypeIdxs.find(T);
-    assert(I != TypeIdxs.end() && "Type not emitted!");
-    return I->second;
-  });
-}
-
 void ASTWriter::AddEmittedDeclRef(const Decl *D, RecordDataImpl &Record) {
   if (!wasDeclEmitted(D))
     return;

From b6e102e08cd35543175459494211a3a15f793302 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 20 May 2024 07:40:54 +0200
Subject: [PATCH 32/60] [SCEV] Don't use non-deterministic constant folding for
 trip counts (#90942)

When calculating the exit count exhaustively, if any of the involved
operations is non-deterministic, the exit count we compute at
compile-time and the exit count at run-time may differ. Using these
non-deterministic constant folding results is only correct if we
actually replace all uses of the instruction with the value. SCEV (or
its consumers) generally don't do this.

Handle this by adding a new AllowNonDeterministic flag to the constant
folding API, and disabling it in SCEV. If non-deterministic results are
not allowed, do not fold FP lib calls in general, and FP operations
returning NaNs in particular. This could be made more precise (some FP
libcalls like fabs are fully deterministic), but I don't think this that
precise handling here is worthwhile.

Fixes the interesting part of
https://github.com/llvm/llvm-project/issues/89885.
---
 llvm/include/llvm/Analysis/ConstantFolding.h  |  15 +-
 llvm/lib/Analysis/ConstantFolding.cpp         |  51 ++++--
 llvm/lib/Analysis/ScalarEvolution.cpp         |   6 +-
 .../ScalarEvolution/exhaustive-trip-counts.ll | 152 ++++++++++++++++++
 4 files changed, 208 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ConstantFolding.h b/llvm/include/llvm/Analysis/ConstantFolding.h
index c54b1e8f01d2b6..58b38fb8b03674 100644
--- a/llvm/include/llvm/Analysis/ConstantFolding.h
+++ b/llvm/include/llvm/Analysis/ConstantFolding.h
@@ -68,9 +68,16 @@ Constant *ConstantFoldConstant(const Constant *C, const DataLayout &DL,
 /// fold instructions like loads and stores, which have no constant expression
 /// form.
 ///
+/// In some cases, constant folding may return one value chosen from a set of
+/// multiple legal return values. For example, the exact bit pattern of NaN
+/// results is not guaranteed. Using such a result is usually only valid if
+/// all uses of the original operation are replaced by the constant-folded
+/// result. The \p AllowNonDeterministic parameter controls whether this is
+/// allowed.
 Constant *ConstantFoldInstOperands(Instruction *I, ArrayRef<Constant *> Ops,
                                    const DataLayout &DL,
-                                   const TargetLibraryInfo *TLI = nullptr);
+                                   const TargetLibraryInfo *TLI = nullptr,
+                                   bool AllowNonDeterministic = true);
 
 /// Attempt to constant fold a compare instruction (icmp/fcmp) with the
 /// specified operands. Returns null or a constant expression of the specified
@@ -95,7 +102,8 @@ Constant *ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS,
 /// Returns null or a constant expression of the specified operands on failure.
 Constant *ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS,
                                      Constant *RHS, const DataLayout &DL,
-                                     const Instruction *I);
+                                     const Instruction *I,
+                                     bool AllowNonDeterministic = true);
 
 /// Attempt to flush float point constant according to denormal mode set in the
 /// instruction's parent function attributes. If so, return a zero with the
@@ -190,7 +198,8 @@ bool canConstantFoldCallTo(const CallBase *Call, const Function *F);
 /// with the specified arguments, returning null if unsuccessful.
 Constant *ConstantFoldCall(const CallBase *Call, Function *F,
                            ArrayRef<Constant *> Operands,
-                           const TargetLibraryInfo *TLI = nullptr);
+                           const TargetLibraryInfo *TLI = nullptr,
+                           bool AllowNonDeterministic = true);
 
 Constant *ConstantFoldBinaryIntrinsic(Intrinsic::ID ID, Constant *LHS,
                                       Constant *RHS, Type *Ty,
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 046a7694538085..524e84f3f3ded2 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -992,7 +992,8 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
 Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
                                        ArrayRef<Constant *> Ops,
                                        const DataLayout &DL,
-                                       const TargetLibraryInfo *TLI) {
+                                       const TargetLibraryInfo *TLI,
+                                       bool AllowNonDeterministic) {
   Type *DestTy = InstOrCE->getType();
 
   if (Instruction::isUnaryOp(Opcode))
@@ -1011,7 +1012,8 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
       // TODO: If a constant expression is being folded rather than an
       // instruction, denormals will not be flushed/treated as zero
       if (const auto *I = dyn_cast<Instruction>(InstOrCE)) {
-        return ConstantFoldFPInstOperands(Opcode, Ops[0], Ops[1], DL, I);
+        return ConstantFoldFPInstOperands(Opcode, Ops[0], Ops[1], DL, I,
+                                          AllowNonDeterministic);
       }
     }
     return ConstantFoldBinaryOpOperands(Opcode, Ops[0], Ops[1], DL);
@@ -1053,7 +1055,8 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
     if (auto *F = dyn_cast<Function>(Ops.back())) {
       const auto *Call = cast<CallBase>(InstOrCE);
       if (canConstantFoldCallTo(Call, F))
-        return ConstantFoldCall(Call, F, Ops.slice(0, Ops.size() - 1), TLI);
+        return ConstantFoldCall(Call, F, Ops.slice(0, Ops.size() - 1), TLI,
+                                AllowNonDeterministic);
     }
     return nullptr;
   case Instruction::Select:
@@ -1114,8 +1117,8 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL,
   }
 
   if (auto *CE = dyn_cast<ConstantExpr>(C)) {
-    if (Constant *Res =
-            ConstantFoldInstOperandsImpl(CE, CE->getOpcode(), Ops, DL, TLI))
+    if (Constant *Res = ConstantFoldInstOperandsImpl(
+            CE, CE->getOpcode(), Ops, DL, TLI, /*AllowNonDeterministic=*/true))
       return Res;
     return const_cast<Constant *>(C);
   }
@@ -1183,8 +1186,10 @@ Constant *llvm::ConstantFoldConstant(const Constant *C, const DataLayout &DL,
 Constant *llvm::ConstantFoldInstOperands(Instruction *I,
                                          ArrayRef<Constant *> Ops,
                                          const DataLayout &DL,
-                                         const TargetLibraryInfo *TLI) {
-  return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI);
+                                         const TargetLibraryInfo *TLI,
+                                         bool AllowNonDeterministic) {
+  return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI,
+                                      AllowNonDeterministic);
 }
 
 Constant *llvm::ConstantFoldCompareInstOperands(
@@ -1357,7 +1362,8 @@ Constant *llvm::FlushFPConstant(Constant *Operand, const Instruction *I,
 
 Constant *llvm::ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS,
                                            Constant *RHS, const DataLayout &DL,
-                                           const Instruction *I) {
+                                           const Instruction *I,
+                                           bool AllowNonDeterministic) {
   if (Instruction::isBinaryOp(Opcode)) {
     // Flush denormal inputs if needed.
     Constant *Op0 = FlushFPConstant(LHS, I, /* IsOutput */ false);
@@ -1367,13 +1373,30 @@ Constant *llvm::ConstantFoldFPInstOperands(unsigned Opcode, Constant *LHS,
     if (!Op1)
       return nullptr;
 
+    // If nsz or an algebraic FMF flag is set, the result of the FP operation
+    // may change due to future optimization. Don't constant fold them if
+    // non-deterministic results are not allowed.
+    if (!AllowNonDeterministic)
+      if (auto *FP = dyn_cast_or_null<FPMathOperator>(I))
+        if (FP->hasNoSignedZeros() || FP->hasAllowReassoc() ||
+            FP->hasAllowContract() || FP->hasAllowReciprocal())
+          return nullptr;
+
     // Calculate constant result.
     Constant *C = ConstantFoldBinaryOpOperands(Opcode, Op0, Op1, DL);
     if (!C)
       return nullptr;
 
     // Flush denormal output if needed.
-    return FlushFPConstant(C, I, /* IsOutput */ true);
+    C = FlushFPConstant(C, I, /* IsOutput */ true);
+    if (!C)
+      return nullptr;
+
+    // The precise NaN value is non-deterministic.
+    if (!AllowNonDeterministic && C->isNaN())
+      return nullptr;
+
+    return C;
   }
   // If instruction lacks a parent/function and the denormal mode cannot be
   // determined, use the default (IEEE).
@@ -3401,7 +3424,8 @@ Constant *llvm::ConstantFoldBinaryIntrinsic(Intrinsic::ID ID, Constant *LHS,
 
 Constant *llvm::ConstantFoldCall(const CallBase *Call, Function *F,
                                  ArrayRef<Constant *> Operands,
-                                 const TargetLibraryInfo *TLI) {
+                                 const TargetLibraryInfo *TLI,
+                                 bool AllowNonDeterministic) {
   if (Call->isNoBuiltin())
     return nullptr;
   if (!F->hasName())
@@ -3417,8 +3441,13 @@ Constant *llvm::ConstantFoldCall(const CallBase *Call, Function *F,
       return nullptr;
   }
 
-  StringRef Name = F->getName();
+  // Conservatively assume that floating-point libcalls may be
+  // non-deterministic.
   Type *Ty = F->getReturnType();
+  if (!AllowNonDeterministic && Ty->isFPOrFPVectorTy())
+    return nullptr;
+
+  StringRef Name = F->getName();
   if (auto *FVTy = dyn_cast<FixedVectorType>(Ty))
     return ConstantFoldFixedVectorCall(
         Name, IID, FVTy, Operands, F->getParent()->getDataLayout(), TLI, Call);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 254d79183a1e95..704f92669a1174 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -9540,7 +9540,8 @@ static Constant *EvaluateExpression(Value *V, const Loop *L,
     Operands[i] = C;
   }
 
-  return ConstantFoldInstOperands(I, Operands, DL, TLI);
+  return ConstantFoldInstOperands(I, Operands, DL, TLI,
+                                  /*AllowNonDeterministic=*/false);
 }
 
 
@@ -10031,7 +10032,8 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
 
     Constant *C = nullptr;
     const DataLayout &DL = getDataLayout();
-    C = ConstantFoldInstOperands(I, Operands, DL, &TLI);
+    C = ConstantFoldInstOperands(I, Operands, DL, &TLI,
+                                 /*AllowNonDeterministic=*/false);
     if (!C)
       return V;
     return getSCEV(C);
diff --git a/llvm/test/Analysis/ScalarEvolution/exhaustive-trip-counts.ll b/llvm/test/Analysis/ScalarEvolution/exhaustive-trip-counts.ll
index 21237f4266933f..cc08fa5fc7d876 100644
--- a/llvm/test/Analysis/ScalarEvolution/exhaustive-trip-counts.ll
+++ b/llvm/test/Analysis/ScalarEvolution/exhaustive-trip-counts.ll
@@ -27,4 +27,156 @@ for.cond.cleanup:
   ret void
 }
 
+; Do not compute exhaustive trip count based on FP libcalls, as their exact
+; return value may not be specified.
+define i64 @test_fp_libcall() {
+; CHECK-LABEL: 'test_fp_libcall'
+; CHECK-NEXT:  Determining loop execution counts for: @test_fp_libcall
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fv = phi double [ 1.000000e+00, %entry ], [ %fv.next, %loop ]
+  call void @use(double %fv)
+  %fv.next = call double @llvm.sin.f64(double %fv)
+  %iv.next = add i64 %iv, 1
+  %fcmp = fcmp une double %fv, 0x3FC6BA15EE8460B0
+  br i1 %fcmp, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+; Do not compute exhaustive trip count based on FP constant folding resulting
+; in NaN values, as we don't specify which NaN exactly is returned.
+define i64 @test_nan_sign() {
+; CHECK-LABEL: 'test_nan_sign'
+; CHECK-NEXT:  Determining loop execution counts for: @test_nan_sign
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fv = phi double [ -1.000000e+00, %entry ], [ %fv.next, %loop ]
+  call void @use(double %fv)
+  %a = fsub double %fv, 0x7F86C16C16C16C16
+  %b = fadd double %a, %a
+  %fv.next = fsub double %b, %a
+  %iv.next = add i64 %iv, 1
+  %fv.bc = bitcast double %fv to i64
+  %icmp = icmp slt i64 %fv.bc, 0
+  br i1 %icmp, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+; Do not compute exhaustive trip count based on FP constant folding if the
+; involved operation has nsz or one of the algebraic FMF flags (reassoc, arcp,
+; contract) set. The examples in the following are dummies and don't illustrate
+; real cases where FMF transforms could cause issues.
+
+define i64 @test_fp_nsz() {
+; CHECK-LABEL: 'test_fp_nsz'
+; CHECK-NEXT:  Determining loop execution counts for: @test_fp_nsz
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fv = phi double [ 1.000000e+00, %entry ], [ %fv.next, %loop ]
+  call void @use(double %fv)
+  %fv.next = fadd nsz double %fv, 1.0
+  %iv.next = add i64 %iv, 1
+  %fcmp = fcmp une double %fv, 100.0
+  br i1 %fcmp, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+define i64 @test_fp_reassoc() {
+; CHECK-LABEL: 'test_fp_reassoc'
+; CHECK-NEXT:  Determining loop execution counts for: @test_fp_reassoc
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fv = phi double [ 1.000000e+00, %entry ], [ %fv.next, %loop ]
+  call void @use(double %fv)
+  %fv.next = fadd reassoc double %fv, 1.0
+  %iv.next = add i64 %iv, 1
+  %fcmp = fcmp une double %fv, 100.0
+  br i1 %fcmp, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+define i64 @test_fp_arcp() {
+; CHECK-LABEL: 'test_fp_arcp'
+; CHECK-NEXT:  Determining loop execution counts for: @test_fp_arcp
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fv = phi double [ 1.000000e+00, %entry ], [ %fv.next, %loop ]
+  call void @use(double %fv)
+  %fv.next = fadd arcp double %fv, 1.0
+  %iv.next = add i64 %iv, 1
+  %fcmp = fcmp une double %fv, 100.0
+  br i1 %fcmp, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+define i64 @test_fp_contract() {
+; CHECK-LABEL: 'test_fp_contract'
+; CHECK-NEXT:  Determining loop execution counts for: @test_fp_contract
+; CHECK-NEXT:  Loop %loop: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %fv = phi double [ 1.000000e+00, %entry ], [ %fv.next, %loop ]
+  call void @use(double %fv)
+  %fv.next = fadd contract double %fv, 1.0
+  %iv.next = add i64 %iv, 1
+  %fcmp = fcmp une double %fv, 100.0
+  br i1 %fcmp, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
 declare void @dummy()
+declare void @use(double %i)
+declare double @llvm.sin.f64(double)

From 6b0733e3a35350679ea9c6056ecd28652d99017f Mon Sep 17 00:00:00 2001
From: Mingming Liu <mingmingl@google.com>
Date: Sun, 19 May 2024 22:42:18 -0700
Subject: [PATCH 33/60] Revert "[ThinLTO] Populate declaration import status
 except for distributed ThinLTO under a default-off new option" (#92715)

Reverts llvm/llvm-project#88024

Build bot failures
(https://lab.llvm.org/buildbot/#/builders/259/builds/4727 and
https://lab.llvm.org/buildbot/#/builders/9/builds/43876)
---
 llvm/include/llvm/IR/ModuleSummaryIndex.h     |   7 -
 .../llvm/Transforms/IPO/FunctionImport.h      |  15 +-
 llvm/lib/LTO/LTO.cpp                          |  32 +--
 llvm/lib/LTO/LTOBackend.cpp                   |   9 +-
 llvm/lib/Transforms/IPO/FunctionImport.cpp    | 270 ++++--------------
 llvm/test/ThinLTO/X86/funcimport-stats.ll     |   4 +-
 .../ThinLTO/X86/import_callee_declaration.ll  | 180 ------------
 .../Transforms/FunctionImport/funcimport.ll   |   5 +-
 llvm/tools/llvm-link/llvm-link.cpp            |   6 +-
 9 files changed, 85 insertions(+), 443 deletions(-)
 delete mode 100644 llvm/test/ThinLTO/X86/import_callee_declaration.ll

diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index a6bb261af75226..5d137d4b3553cf 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -587,10 +587,6 @@ class GlobalValueSummary {
 
   void setImportKind(ImportKind IK) { Flags.ImportType = IK; }
 
-  GlobalValueSummary::ImportKind importType() const {
-    return static_cast<ImportKind>(Flags.ImportType);
-  }
-
   GlobalValue::VisibilityTypes getVisibility() const {
     return (GlobalValue::VisibilityTypes)Flags.Visibility;
   }
@@ -1276,9 +1272,6 @@ using ModulePathStringTableTy = StringMap<ModuleHash>;
 /// a particular module, and provide efficient access to their summary.
 using GVSummaryMapTy = DenseMap<GlobalValue::GUID, GlobalValueSummary *>;
 
-/// A set of global value summary pointers.
-using GVSummaryPtrSet = SmallPtrSet<GlobalValueSummary *, 4>;
-
 /// Map of a type GUID to type id string and summary (multimap used
 /// in case of GUID conflicts).
 using TypeIdSummaryMapTy =
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 024bba8105b897..c4d19e8641eca2 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -31,9 +31,9 @@ class Module;
 /// based on the provided summary informations.
 class FunctionImporter {
 public:
-  /// The functions to import from a source module and their import type.
-  using FunctionsToImportTy =
-      DenseMap<GlobalValue::GUID, GlobalValueSummary::ImportKind>;
+  /// Set of functions to import from a source module. Each entry is a set
+  /// containing all the GUIDs of all functions to import for a source module.
+  using FunctionsToImportTy = std::unordered_set<GlobalValue::GUID>;
 
   /// The different reasons selectCallee will chose not to import a
   /// candidate.
@@ -99,13 +99,8 @@ class FunctionImporter {
   /// index's module path string table).
   using ImportMapTy = DenseMap<StringRef, FunctionsToImportTy>;
 
-  /// The map contains an entry for every global value the module exports.
-  /// The key is ValueInfo, and the value indicates whether the definition
-  /// or declaration is visible to another module. If a function's definition is
-  /// visible to other modules, the global values this function referenced are
-  /// visible and shouldn't be internalized.
-  /// TODO: Rename to `ExportMapTy`.
-  using ExportSetTy = DenseMap<ValueInfo, GlobalValueSummary::ImportKind>;
+  /// The set contains an entry for every global value the module exports.
+  using ExportSetTy = DenseSet<ValueInfo>;
 
   /// A function of this type is used to load modules referenced by the index.
   using ModuleLoaderTy =
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e2754d74979e8c..5c603ac6ab4728 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -121,9 +121,6 @@ void llvm::computeLTOCacheKey(
     support::endian::write64le(Data, I);
     Hasher.update(Data);
   };
-  auto AddUint8 = [&](const uint8_t I) {
-    Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&I, 1));
-  };
   AddString(Conf.CPU);
   // FIXME: Hash more of Options. For now all clients initialize Options from
   // command-line flags (which is unsupported in production), but may set
@@ -159,18 +156,18 @@ void llvm::computeLTOCacheKey(
   auto ModHash = Index.getModuleHash(ModuleID);
   Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
 
-  std::vector<std::pair<uint64_t, uint8_t>> ExportsGUID;
+  std::vector<uint64_t> ExportsGUID;
   ExportsGUID.reserve(ExportList.size());
-  for (const auto &[VI, ExportType] : ExportList)
-    ExportsGUID.push_back(
-        std::make_pair(VI.getGUID(), static_cast<uint8_t>(ExportType)));
+  for (const auto &VI : ExportList) {
+    auto GUID = VI.getGUID();
+    ExportsGUID.push_back(GUID);
+  }
 
   // Sort the export list elements GUIDs.
   llvm::sort(ExportsGUID);
-  for (auto [GUID, ExportType] : ExportsGUID) {
+  for (uint64_t GUID : ExportsGUID) {
     // The export list can impact the internalization, be conservative here
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));
-    AddUint8(ExportType);
   }
 
   // Include the hash for every module we import functions from. The set of
@@ -202,7 +199,7 @@ void llvm::computeLTOCacheKey(
              [](const ImportModule &Lhs, const ImportModule &Rhs) -> bool {
                return Lhs.getHash() < Rhs.getHash();
              });
-  std::vector<std::pair<uint64_t, uint8_t>> ImportedGUIDs;
+  std::vector<uint64_t> ImportedGUIDs;
   for (const ImportModule &Entry : ImportModulesVector) {
     auto ModHash = Entry.getHash();
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
@@ -210,13 +207,11 @@ void llvm::computeLTOCacheKey(
     AddUint64(Entry.getFunctions().size());
 
     ImportedGUIDs.clear();
-    for (auto &[Fn, ImportType] : Entry.getFunctions())
-      ImportedGUIDs.push_back(std::make_pair(Fn, ImportType));
+    for (auto &Fn : Entry.getFunctions())
+      ImportedGUIDs.push_back(Fn);
     llvm::sort(ImportedGUIDs);
-    for (auto &[GUID, Type] : ImportedGUIDs) {
+    for (auto &GUID : ImportedGUIDs)
       AddUint64(GUID);
-      AddUint8(Type);
-    }
   }
 
   // Include the hash for the resolved ODR.
@@ -286,9 +281,9 @@ void llvm::computeLTOCacheKey(
   // Imported functions may introduce new uses of type identifier resolutions,
   // so we need to collect their used resolutions as well.
   for (const ImportModule &ImpM : ImportModulesVector)
-    for (auto &[GUID, UnusedImportType] : ImpM.getFunctions()) {
+    for (auto &ImpF : ImpM.getFunctions()) {
       GlobalValueSummary *S =
-          Index.findSummaryInModule(GUID, ImpM.getIdentifier());
+          Index.findSummaryInModule(ImpF, ImpM.getIdentifier());
       AddUsedThings(S);
       // If this is an alias, we also care about any types/etc. that the aliasee
       // may reference.
@@ -1400,7 +1395,6 @@ class lto::ThinBackendProc {
                   llvm::StringRef ModulePath,
                   const std::string &NewModulePath) {
     std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
-
     std::error_code EC;
     gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
                                      ImportList, ModuleToSummariesForIndex);
@@ -1409,8 +1403,6 @@ class lto::ThinBackendProc {
                       sys::fs::OpenFlags::OF_None);
     if (EC)
       return errorCodeToError(EC);
-
-    // TODO: Serialize declaration bits to bitcode.
     writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
 
     if (ShouldEmitImportsFiles) {
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 58434feec6f969..d4b89ede2d7134 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -721,14 +721,7 @@ bool lto::initImportList(const Module &M,
       if (Summary->modulePath() == M.getModuleIdentifier())
         continue;
       // Add an entry to provoke importing by thinBackend.
-      // Try emplace the entry first. If an entry with the same key already
-      // exists, set the value to 'std::min(existing-value, new-value)' to make
-      // sure a definition takes precedence over a declaration.
-      auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace(
-          GUID, Summary->importType());
-
-      if (!Inserted)
-        Iter->second = std::min(Iter->second, Summary->importType());
+      ImportList[Summary->modulePath()].insert(GUID);
     }
   }
   return true;
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index a116fd65353471..68f9799616ae6d 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -140,17 +140,6 @@ static cl::opt<bool>
     ImportAllIndex("import-all-index",
                    cl::desc("Import all external functions in index."));
 
-/// This is a test-only option.
-/// If this option is enabled, the ThinLTO indexing step will import each
-/// function declaration as a fallback. In a real build this may increase ram
-/// usage of the indexing step unnecessarily.
-/// TODO: Implement selective import (based on combined summary analysis) to
-/// ensure the imported function has a use case in the postlink pipeline.
-static cl::opt<bool> ImportDeclaration(
-    "import-declaration", cl::init(false), cl::Hidden,
-    cl::desc("If true, import function declaration as fallback if the function "
-             "definition is not imported."));
-
 /// Pass a workload description file - an example of workload would be the
 /// functions executed to satisfy a RPC request. A workload is defined by a root
 /// function and the list of functions that are (frequently) needed to satisfy
@@ -256,12 +245,8 @@ static auto qualifyCalleeCandidates(
 }
 
 /// Given a list of possible callee implementation for a call site, select one
-/// that fits the \p Threshold for function definition import. If none are
-/// found, the Reason will give the last reason for the failure (last, in the
-/// order of CalleeSummaryList entries). While looking for a callee definition,
-/// sets \p TooLargeOrNoInlineSummary to the last seen too-large or noinline
-/// candidate; other modules may want to know the function summary or
-/// declaration even if a definition is not needed.
+/// that fits the \p Threshold. If none are found, the Reason will give the last
+/// reason for the failure (last, in the order of CalleeSummaryList entries).
 ///
 /// FIXME: select "best" instead of first that fits. But what is "best"?
 /// - The smallest: more likely to be inlined.
@@ -274,32 +259,24 @@ static const GlobalValueSummary *
 selectCallee(const ModuleSummaryIndex &Index,
              ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
              unsigned Threshold, StringRef CallerModulePath,
-             const GlobalValueSummary *&TooLargeOrNoInlineSummary,
              FunctionImporter::ImportFailureReason &Reason) {
-  // Records the last summary with reason noinline or too-large.
-  TooLargeOrNoInlineSummary = nullptr;
   auto QualifiedCandidates =
       qualifyCalleeCandidates(Index, CalleeSummaryList, CallerModulePath);
   for (auto QualifiedValue : QualifiedCandidates) {
     Reason = QualifiedValue.first;
-    // Skip a summary if its import is not (proved to be) legal.
     if (Reason != FunctionImporter::ImportFailureReason::None)
       continue;
     auto *Summary =
         cast<FunctionSummary>(QualifiedValue.second->getBaseObject());
 
-    // Don't bother importing the definition if the chance of inlining it is
-    // not high enough (except under `--force-import-all`).
     if ((Summary->instCount() > Threshold) && !Summary->fflags().AlwaysInline &&
         !ForceImportAll) {
-      TooLargeOrNoInlineSummary = Summary;
       Reason = FunctionImporter::ImportFailureReason::TooLarge;
       continue;
     }
 
-    // Don't bother importing the definition if we can't inline it anyway.
+    // Don't bother importing if we can't inline it anyway.
     if (Summary->fflags().NoInline && !ForceImportAll) {
-      TooLargeOrNoInlineSummary = Summary;
       Reason = FunctionImporter::ImportFailureReason::NoInline;
       continue;
     }
@@ -381,27 +358,17 @@ class GlobalsImporter final {
         if (!GVS || !Index.canImportGlobalVar(GVS, /* AnalyzeRefs */ true) ||
             LocalNotInModule(GVS))
           continue;
-
-        // If there isn't an entry for GUID, insert <GUID, Definition> pair.
-        // Otherwise, definition should take precedence over declaration.
-        auto [Iter, Inserted] =
-            ImportList[RefSummary->modulePath()].try_emplace(
-                VI.getGUID(), GlobalValueSummary::Definition);
+        auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
         // Only update stat and exports if we haven't already imported this
         // variable.
-        if (!Inserted) {
-          // Set the value to 'std::min(existing-value, new-value)' to make
-          // sure a definition takes precedence over a declaration.
-          Iter->second = std::min(GlobalValueSummary::Definition, Iter->second);
+        if (!ILI.second)
           break;
-        }
         NumImportedGlobalVarsThinLink++;
         // Any references made by this variable will be marked exported
         // later, in ComputeCrossModuleImport, after import decisions are
         // complete, which is more efficient than adding them here.
         if (ExportLists)
-          (*ExportLists)[RefSummary->modulePath()][VI] =
-              GlobalValueSummary::Definition;
+          (*ExportLists)[RefSummary->modulePath()].insert(VI);
 
         // If variable is not writeonly we attempt to recursively analyze
         // its references in order to import referenced constants.
@@ -578,11 +545,10 @@ class WorkloadImportsManager : public ModuleImportsManager {
       LLVM_DEBUG(dbgs() << "[Workload][Including]" << VI.name() << " from "
                         << ExportingModule << " : "
                         << Function::getGUID(VI.name()) << "\n");
-      ImportList[ExportingModule][VI.getGUID()] =
-          GlobalValueSummary::Definition;
+      ImportList[ExportingModule].insert(VI.getGUID());
       GVI.onImportingSummary(*GVS);
       if (ExportLists)
-        (*ExportLists)[ExportingModule][VI] = GlobalValueSummary::Definition;
+        (*ExportLists)[ExportingModule].insert(VI);
     }
     LLVM_DEBUG(dbgs() << "[Workload] Done\n");
   }
@@ -803,28 +769,9 @@ static void computeImportForFunction(
       }
 
       FunctionImporter::ImportFailureReason Reason{};
-
-      // `SummaryForDeclImport` is an summary eligible for declaration import.
-      const GlobalValueSummary *SummaryForDeclImport = nullptr;
-      CalleeSummary =
-          selectCallee(Index, VI.getSummaryList(), NewThreshold,
-                       Summary.modulePath(), SummaryForDeclImport, Reason);
+      CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+                                   Summary.modulePath(), Reason);
       if (!CalleeSummary) {
-        // There isn't a callee for definition import but one for declaration
-        // import.
-        if (ImportDeclaration && SummaryForDeclImport) {
-          StringRef DeclSourceModule = SummaryForDeclImport->modulePath();
-
-          // Since definition takes precedence over declaration for the same VI,
-          // try emplace <VI, declaration> pair without checking insert result.
-          // If insert doesn't happen, there must be an existing entry keyed by
-          // VI.
-          if (ExportLists)
-            (*ExportLists)[DeclSourceModule].try_emplace(
-                VI, GlobalValueSummary::Declaration);
-          ImportList[DeclSourceModule].try_emplace(
-              VI.getGUID(), GlobalValueSummary::Declaration);
-        }
         // Update with new larger threshold if this was a retry (otherwise
         // we would have already inserted with NewThreshold above). Also
         // update failure info if requested.
@@ -869,15 +816,11 @@ static void computeImportForFunction(
              "selectCallee() didn't honor the threshold");
 
       auto ExportModulePath = ResolvedCalleeSummary->modulePath();
-
-      // Try emplace the definition entry, and update stats based on insertion
-      // status.
-      auto [Iter, Inserted] = ImportList[ExportModulePath].try_emplace(
-          VI.getGUID(), GlobalValueSummary::Definition);
-
+      auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
       // We previously decided to import this GUID definition if it was already
       // inserted in the set of imports from the exporting module.
-      if (Inserted || Iter->second == GlobalValueSummary::Declaration) {
+      bool PreviouslyImported = !ILI.second;
+      if (!PreviouslyImported) {
         NumImportedFunctionsThinLink++;
         if (IsHotCallsite)
           NumImportedHotFunctionsThinLink++;
@@ -885,14 +828,11 @@ static void computeImportForFunction(
           NumImportedCriticalFunctionsThinLink++;
       }
 
-      if (Iter->second == GlobalValueSummary::Declaration)
-        Iter->second = GlobalValueSummary::Definition;
-
       // Any calls/references made by this function will be marked exported
       // later, in ComputeCrossModuleImport, after import decisions are
       // complete, which is more efficient than adding them here.
       if (ExportLists)
-        (*ExportLists)[ExportModulePath][VI] = GlobalValueSummary::Definition;
+        (*ExportLists)[ExportModulePath].insert(VI);
     }
 
     auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
@@ -999,20 +939,12 @@ static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
 }
 
 template <class T>
-static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, T &Cont,
-                                      unsigned &DefinedGVS,
-                                      unsigned &DefinedFS) {
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
+                                      T &Cont) {
   unsigned NumGVS = 0;
-  DefinedGVS = 0;
-  DefinedFS = 0;
-  for (auto &[GUID, Type] : Cont) {
-    if (isGlobalVarSummary(Index, GUID)) {
-      if (Type == GlobalValueSummary::Definition)
-        ++DefinedGVS;
+  for (auto &V : Cont)
+    if (isGlobalVarSummary(Index, V))
       ++NumGVS;
-    } else if (Type == GlobalValueSummary::Definition)
-      ++DefinedFS;
-  }
   return NumGVS;
 }
 #endif
@@ -1022,12 +954,13 @@ static bool checkVariableImport(
     const ModuleSummaryIndex &Index,
     DenseMap<StringRef, FunctionImporter::ImportMapTy> &ImportLists,
     DenseMap<StringRef, FunctionImporter::ExportSetTy> &ExportLists) {
+
   DenseSet<GlobalValue::GUID> FlattenedImports;
 
   for (auto &ImportPerModule : ImportLists)
     for (auto &ExportPerModule : ImportPerModule.second)
-      for (auto &[GUID, Type] : ExportPerModule.second)
-        FlattenedImports.insert(GUID);
+      FlattenedImports.insert(ExportPerModule.second.begin(),
+                              ExportPerModule.second.end());
 
   // Checks that all GUIDs of read/writeonly vars we see in export lists
   // are also in the import lists. Otherwise we my face linker undefs,
@@ -1046,7 +979,7 @@ static bool checkVariableImport(
   };
 
   for (auto &ExportPerModule : ExportLists)
-    for (auto &[VI, Unused] : ExportPerModule.second)
+    for (auto &VI : ExportPerModule.second)
       if (!FlattenedImports.count(VI.getGUID()) &&
           IsReadOrWriteOnlyVarNeedingImporting(ExportPerModule.first, VI))
         return false;
@@ -1082,11 +1015,7 @@ void llvm::ComputeCrossModuleImport(
     FunctionImporter::ExportSetTy NewExports;
     const auto &DefinedGVSummaries =
         ModuleToDefinedGVSummaries.lookup(ELI.first);
-    for (auto &[EI, Type] : ELI.second) {
-      // If a variable is exported as a declaration, its 'refs' and 'calls' are
-      // not further exported.
-      if (Type == GlobalValueSummary::Declaration)
-        continue;
+    for (auto &EI : ELI.second) {
       // Find the copy defined in the exporting module so that we can mark the
       // values it references in that specific definition as exported.
       // Below we will add all references and called values, without regard to
@@ -1105,31 +1034,22 @@ void llvm::ComputeCrossModuleImport(
         // we convert such variables initializers to "zeroinitializer".
         // See processGlobalForThinLTO.
         if (!Index.isWriteOnly(GVS))
-          for (const auto &VI : GVS->refs()) {
-            // Try to emplace the declaration entry. If a definition entry
-            // already exists for key `VI`, this is a no-op.
-            NewExports.try_emplace(VI, GlobalValueSummary::Declaration);
-          }
+          for (const auto &VI : GVS->refs())
+            NewExports.insert(VI);
       } else {
         auto *FS = cast<FunctionSummary>(S);
-        for (const auto &Edge : FS->calls()) {
-          // Try to emplace the declaration entry. If a definition entry
-          // already exists for key `VI`, this is a no-op.
-          NewExports.try_emplace(Edge.first, GlobalValueSummary::Declaration);
-        }
-        for (const auto &Ref : FS->refs()) {
-          // Try to emplace the declaration entry. If a definition entry
-          // already exists for key `VI`, this is a no-op.
-          NewExports.try_emplace(Ref, GlobalValueSummary::Declaration);
-        }
+        for (const auto &Edge : FS->calls())
+          NewExports.insert(Edge.first);
+        for (const auto &Ref : FS->refs())
+          NewExports.insert(Ref);
       }
     }
-    // Prune list computed above to only include values defined in the
-    // exporting module. We do this after the above insertion since we may hit
-    // the same ref/call target multiple times in above loop, and it is more
-    // efficient to avoid a set lookup each time.
+    // Prune list computed above to only include values defined in the exporting
+    // module. We do this after the above insertion since we may hit the same
+    // ref/call target multiple times in above loop, and it is more efficient to
+    // avoid a set lookup each time.
     for (auto EI = NewExports.begin(); EI != NewExports.end();) {
-      if (!DefinedGVSummaries.count(EI->first.getGUID()))
+      if (!DefinedGVSummaries.count(EI->getGUID()))
         NewExports.erase(EI++);
       else
         ++EI;
@@ -1144,29 +1064,18 @@ void llvm::ComputeCrossModuleImport(
   for (auto &ModuleImports : ImportLists) {
     auto ModName = ModuleImports.first;
     auto &Exports = ExportLists[ModName];
-    unsigned DefinedGVS = 0, DefinedFS = 0;
-    unsigned NumGVS =
-        numGlobalVarSummaries(Index, Exports, DefinedGVS, DefinedFS);
-    LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports " << DefinedFS
-                      << " function as definitions, "
-                      << Exports.size() - NumGVS - DefinedFS
-                      << " functions as declarations, " << DefinedGVS
-                      << " var definitions and " << NumGVS - DefinedGVS
-                      << " var declarations. Imports from "
-                      << ModuleImports.second.size() << " modules.\n");
+    unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
+    LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
+                      << Exports.size() - NumGVS << " functions and " << NumGVS
+                      << " vars. Imports from " << ModuleImports.second.size()
+                      << " modules.\n");
     for (auto &Src : ModuleImports.second) {
       auto SrcModName = Src.first;
-      unsigned DefinedGVS = 0, DefinedFS = 0;
-      unsigned NumGVSPerMod =
-          numGlobalVarSummaries(Index, Src.second, DefinedGVS, DefinedFS);
-      LLVM_DEBUG(dbgs() << " - " << DefinedFS << " function definitions and "
-                        << Src.second.size() - NumGVSPerMod - DefinedFS
-                        << " function declarations imported from " << SrcModName
-                        << "\n");
-      LLVM_DEBUG(dbgs() << " - " << DefinedGVS << " global vars definition and "
-                        << NumGVSPerMod - DefinedGVS
-                        << " global vars declaration imported from "
-                        << SrcModName << "\n");
+      unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+      LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+                        << " functions imported from " << SrcModName << "\n");
+      LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
+                        << " global vars imported from " << SrcModName << "\n");
     }
   }
 #endif
@@ -1180,17 +1089,11 @@ static void dumpImportListForModule(const ModuleSummaryIndex &Index,
                     << ImportList.size() << " modules.\n");
   for (auto &Src : ImportList) {
     auto SrcModName = Src.first;
-    unsigned DefinedGVS = 0, DefinedFS = 0;
-    unsigned NumGVSPerMod =
-        numGlobalVarSummaries(Index, Src.second, DefinedGVS, DefinedFS);
-    LLVM_DEBUG(dbgs() << " - " << DefinedFS << " function definitions and "
-                      << Src.second.size() - DefinedFS - NumGVSPerMod
-                      << " function declarations imported from " << SrcModName
-                      << "\n");
-    LLVM_DEBUG(dbgs() << " - " << DefinedGVS << " var definitions and "
-                      << NumGVSPerMod - DefinedGVS
-                      << " var declarations imported from " << SrcModName
-                      << "\n");
+    unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+    LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+                      << " functions imported from " << SrcModName << "\n");
+    LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
+                      << SrcModName << "\n");
   }
 }
 #endif
@@ -1246,13 +1149,7 @@ static void ComputeCrossModuleImportForModuleFromIndexForTest(
     if (Summary->modulePath() == ModulePath)
       continue;
     // Add an entry to provoke importing by thinBackend.
-    auto [Iter, Inserted] = ImportList[Summary->modulePath()].try_emplace(
-        GUID, Summary->importType());
-    if (!Inserted) {
-      // Use 'std::min' to make sure definition (with enum value 0) takes
-      // precedence over declaration (with enum value 1).
-      Iter->second = std::min(Iter->second, Summary->importType());
-    }
+    ImportList[Summary->modulePath()].insert(GUID);
   }
 #ifndef NDEBUG
   dumpImportListForModule(Index, ModulePath, ImportList);
@@ -1442,17 +1339,13 @@ void llvm::gatherImportedSummariesForModule(
   // Include summaries for imports.
   for (const auto &ILI : ImportList) {
     auto &SummariesForIndex = ModuleToSummariesForIndex[std::string(ILI.first)];
-
     const auto &DefinedGVSummaries =
         ModuleToDefinedGVSummaries.lookup(ILI.first);
-    for (const auto &[GUID, Type] : ILI.second) {
-      const auto &DS = DefinedGVSummaries.find(GUID);
+    for (const auto &GI : ILI.second) {
+      const auto &DS = DefinedGVSummaries.find(GI);
       assert(DS != DefinedGVSummaries.end() &&
              "Expected a defined summary for imported global value");
-      if (Type == GlobalValueSummary::Declaration)
-        continue;
-
-      SummariesForIndex[GUID] = DS->second;
+      SummariesForIndex[GI] = DS->second;
     }
   }
 }
@@ -1724,16 +1617,6 @@ Expected<bool> FunctionImporter::importFunctions(
   for (const auto &FunctionsToImportPerModule : ImportList) {
     ModuleNameOrderedList.insert(FunctionsToImportPerModule.first);
   }
-
-  auto getImportType = [&](const FunctionsToImportTy &GUIDToImportType,
-                           GlobalValue::GUID GUID)
-      -> std::optional<GlobalValueSummary::ImportKind> {
-    auto Iter = GUIDToImportType.find(GUID);
-    if (Iter == GUIDToImportType.end())
-      return std::nullopt;
-    return Iter->second;
-  };
-
   for (const auto &Name : ModuleNameOrderedList) {
     // Get the module for the import
     const auto &FunctionsToImportPerModule = ImportList.find(Name);
@@ -1751,27 +1634,17 @@ Expected<bool> FunctionImporter::importFunctions(
       return std::move(Err);
 
     auto &ImportGUIDs = FunctionsToImportPerModule->second;
-
     // Find the globals to import
     SetVector<GlobalValue *> GlobalsToImport;
     for (Function &F : *SrcModule) {
       if (!F.hasName())
         continue;
       auto GUID = F.getGUID();
-      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
-
-      bool ImportDefinition =
-          (MaybeImportType &&
-           (*MaybeImportType == GlobalValueSummary::Definition));
-
-      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
-                        << " importing function"
-                        << (ImportDefinition
-                                ? " definition "
-                                : (MaybeImportType ? " declaration " : " "))
+      auto Import = ImportGUIDs.count(GUID);
+      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
                         << GUID << " " << F.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (ImportDefinition) {
+      if (Import) {
         if (Error Err = F.materialize())
           return std::move(Err);
         // MemProf should match function's definition and summary,
@@ -1797,20 +1670,11 @@ Expected<bool> FunctionImporter::importFunctions(
       if (!GV.hasName())
         continue;
       auto GUID = GV.getGUID();
-      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
-
-      bool ImportDefinition =
-          (MaybeImportType &&
-           (*MaybeImportType == GlobalValueSummary::Definition));
-
-      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
-                        << " importing global"
-                        << (ImportDefinition
-                                ? " definition "
-                                : (MaybeImportType ? " declaration " : " "))
+      auto Import = ImportGUIDs.count(GUID);
+      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
                         << GUID << " " << GV.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (ImportDefinition) {
+      if (Import) {
         if (Error Err = GV.materialize())
           return std::move(Err);
         ImportedGVCount += GlobalsToImport.insert(&GV);
@@ -1820,20 +1684,11 @@ Expected<bool> FunctionImporter::importFunctions(
       if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
         continue;
       auto GUID = GA.getGUID();
-      auto MaybeImportType = getImportType(ImportGUIDs, GUID);
-
-      bool ImportDefinition =
-          (MaybeImportType &&
-           (*MaybeImportType == GlobalValueSummary::Definition));
-
-      LLVM_DEBUG(dbgs() << (MaybeImportType ? "Is" : "Not")
-                        << " importing alias"
-                        << (ImportDefinition
-                                ? " definition "
-                                : (MaybeImportType ? " declaration " : " "))
+      auto Import = ImportGUIDs.count(GUID);
+      LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
                         << GUID << " " << GA.getName() << " from "
                         << SrcModule->getSourceFileName() << "\n");
-      if (ImportDefinition) {
+      if (Import) {
         if (Error Err = GA.materialize())
           return std::move(Err);
         // Import alias as a copy of its aliasee.
@@ -1899,7 +1754,6 @@ Expected<bool> FunctionImporter::importFunctions(
   NumImportedFunctions += (ImportedCount - ImportedGVCount);
   NumImportedGlobalVars += ImportedGVCount;
 
-  // TODO: Print counters for definitions and declarations in the debugging log.
   LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
                     << " functions for Module "
                     << DestModule.getModuleIdentifier() << "\n");
diff --git a/llvm/test/ThinLTO/X86/funcimport-stats.ll b/llvm/test/ThinLTO/X86/funcimport-stats.ll
index 7fcd33855fe1ab..913b13004c1c24 100644
--- a/llvm/test/ThinLTO/X86/funcimport-stats.ll
+++ b/llvm/test/ThinLTO/X86/funcimport-stats.ll
@@ -9,8 +9,8 @@
 ; RUN: cat %t4 | grep 'Is importing aliasee' | count 1
 ; RUN: cat %t4 | FileCheck %s
 
-; CHECK:      - [[NUM_FUNCS:[0-9]+]] function definitions and 0 function declarations imported from
-; CHECK-NEXT: - [[NUM_VARS:[0-9]+]] global vars definition and 0 global vars declaration imported from
+; CHECK:      - [[NUM_FUNCS:[0-9]+]] functions imported from
+; CHECK-NEXT: - [[NUM_VARS:[0-9]+]] global vars imported from
 
 ; CHECK:      [[NUM_FUNCS]] function-import - Number of functions imported in backend
 ; CHECK-NEXT: [[NUM_FUNCS]] function-import - Number of functions thin link decided to import
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
deleted file mode 100644
index df8a9ce6f7109a..00000000000000
--- a/llvm/test/ThinLTO/X86/import_callee_declaration.ll
+++ /dev/null
@@ -1,180 +0,0 @@
-; "-debug-only" requires asserts.
-; REQUIRES: asserts
-; RUN: rm -rf %t && split-file %s %t && cd %t
-
-; Generate per-module summaries.
-; RUN: opt -module-summary main.ll -o main.bc
-; RUN: opt -module-summary lib.ll -o lib.bc
-
-; Generate the combined summary and distributed indices.
-
-; - For function import, set 'import-instr-limit' to 7 and fall back to import
-;   function declarations.
-; - In main.ll, function 'main' calls 'small_func' and 'large_func'. Both callees
-;   are defined in lib.ll. 'small_func' has two indirect callees, one is smaller
-;   and the other one is larger. Both callees of 'small_func' are defined in lib.ll.
-; - Given the import limit, in main's combined summary, the import type of 'small_func'
-;   and 'small_indirect_callee' will be 'definition', and the import type of
-;   'large_func' and 'large_indirect_callee' will be 'declaration'.
-;
-; The test will disassemble combined summaries and check the import type is
-; correct. Right now postlink optimizer pipeline doesn't do anything (e.g.,
-; import the declaration or de-serialize summary attributes yet) so there is
-; nothing to test more than the summary content.
-;
-; RUN: llvm-lto2 run \
-; RUN:   -debug-only=function-import \
-; RUN:   -import-instr-limit=7 \
-; RUN:   -import-declaration \
-; RUN:   -thinlto-distributed-indexes \
-; RUN:   -r=main.bc,main,px \
-; RUN:   -r=main.bc,small_func, \
-; RUN:   -r=main.bc,large_func, \
-; RUN:   -r=lib.bc,callee,pl \
-; RUN:   -r=lib.bc,large_indirect_callee,px \
-; RUN:   -r=lib.bc,small_func,px \
-; RUN:   -r=lib.bc,large_func,px \
-; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
-; RUN:   -r=lib.bc,calleeAddrs,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
-;
-; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7  -o combined.index.bc main.bc lib.bc
-; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
-
-; DUMP: - 2 function definitions and 3 function declarations imported from lib.bc
-
-; First disassemble per-module summary and find out the GUID for {large_func, large_indirect_callee}.
-;
-; RUN: llvm-dis lib.bc -o - | FileCheck %s --check-prefix=LIB-DIS
-; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
-; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
-; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
-;
-; Secondly disassemble main's combined summary and test that large callees are
-; not imported as declarations yet.
-;
-; RUN: llvm-dis main.bc.thinlto.bc -o - | FileCheck %s --check-prefix=MAIN-DIS
-;
-; MAIN-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
-; MAIN-DIS-NOT: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration)
-
-; Run in-process ThinLTO and tests that
-; 1. `callee` remains internalized even if the symbols of its callers
-; (large_func and large_indirect_callee) are exported as declarations and visible to main module.
-; 2. the debugging logs from `function-import` pass are expected.
-
-; RUN: llvm-lto2 run \
-; RUN:   -debug-only=function-import \
-; RUN:   -save-temps \
-; RUN:   -import-instr-limit=7 \
-; RUN:   -import-declaration \
-; RUN:   -r=main.bc,main,px \
-; RUN:   -r=main.bc,small_func, \
-; RUN:   -r=main.bc,large_func, \
-; RUN:   -r=lib.bc,callee,pl \
-; RUN:   -r=lib.bc,large_indirect_callee,px \
-; RUN:   -r=lib.bc,small_func,px \
-; RUN:   -r=lib.bc,large_func,px \
-; RUN:   -r=lib.bc,large_indirect_callee_alias,px \
-; RUN:   -r=lib.bc,calleeAddrs,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
-
-; Test import status from debugging logs.
-; TODO: Serialize declaration bit and test declaration bits are correctly set,
-; and extend this test case to test IR once postlink optimizer makes use of
-; the import type for declarations.
-; IMPORTDUMP-DAG: Not importing function 11825436545918268459 callee from lib.cc
-; IMPORTDUMP-DAG: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
-; IMPORTDUMP-DAG: Is importing function definition 13568239288960714650 small_indirect_callee from lib.cc
-; IMPORTDUMP-DAG: Is importing function definition 6976996067367342685 small_func from lib.cc
-; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc
-; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc
-; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
-
-; RUN: llvm-dis in-process.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
-
-; RUN: llvm-dis in-process.2.2.internalize.bc -o - | FileCheck %s --check-prefix=INTERNALIZE
-
-; IMPORT-DAG: define available_externally void @small_func
-; IMPORT-DAG: define available_externally hidden void @small_indirect_callee
-; IMPORT-DAG: declare void @large_func
-; IMPORT-NOT: large_indirect_callee
-; IMPORT-NOT: large_indirect_callee_alias
-
-; INTERNALIZE: define internal void @callee()
-
-;--- main.ll
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define i32 @main() {
-  call void @small_func()
-  call void @large_func()
-  ret i32 0
-}
-
-declare void @small_func()
-
-; large_func without attributes
-declare void @large_func()
-
-;--- lib.ll
-source_filename = "lib.cc"
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@calleeAddrs = global [3 x ptr] [ptr @large_indirect_callee, ptr @small_indirect_callee, ptr @large_indirect_callee_alias]
-
-define void @callee() #1 {
-  ret void
-}
-
-define void @large_indirect_callee()#2 {
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  ret void
-}
-
-define internal void @small_indirect_callee() #0 {
-  ret void
-}
-
-@large_indirect_callee_alias = alias void(), ptr @large_indirect_callee
-
-define void @small_func() {
-entry:
-  %0 = load ptr, ptr @calleeAddrs
-  call void %0(), !prof !0
-  %1 = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @calleeAddrs, i64 0, i64 1)
-  call void %1(), !prof !1
-  %2 = load ptr, ptr getelementptr inbounds ([3 x ptr], ptr @calleeAddrs, i64 0, i64 2)
-  call void %2(), !prof !2
-  ret void
-}
-
-define void @large_func() #0 {
-entry:
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  call void @callee()
-  ret void
-}
-
-attributes #0 = { nounwind norecurse }
-
-attributes #1 = { noinline }
-
-attributes #2 = { norecurse }
-
-!0 = !{!"VP", i32 0, i64 1, i64 14343440786664691134, i64 1}
-!1 = !{!"VP", i32 0, i64 1, i64 13568239288960714650, i64 1}
-!2 = !{!"VP", i32 0, i64 1, i64 16730173943625350469, i64 1}
diff --git a/llvm/test/Transforms/FunctionImport/funcimport.ll b/llvm/test/Transforms/FunctionImport/funcimport.ll
index 635750b33fff00..a0968a67f5ce84 100644
--- a/llvm/test/Transforms/FunctionImport/funcimport.ll
+++ b/llvm/test/Transforms/FunctionImport/funcimport.ll
@@ -166,8 +166,7 @@ declare void @variadic_va_start(...)
 ; GUID-DAG: GUID {{.*}} is linkoncefunc
 
 ; DUMP:       Module [[M1:.*]] imports from 1 module
-; DUMP-NEXT:  15 function definitions and 0 function declarations imported from [[M2:.*]]
-; DUMP-NEXT:  4 var definitions and 0 var declarations imported from [[M2]]
-
+; DUMP-NEXT:  15 functions imported from [[M2:.*]]
+; DUMP-NEXT:  4 vars imported from [[M2]]
 ; DUMP:       Imported 15 functions for Module [[M1]]
 ; DUMP-NEXT:  Imported 4 global variables for Module [[M1]]
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 1b90fce76fbd14..7794f2d81ed064 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -377,13 +377,9 @@ static bool importFunctions(const char *argv0, Module &DestModule) {
     if (Verbose)
       errs() << "Importing " << FunctionName << " from " << FileName << "\n";
 
-    // `-import` specifies the `<filename,function-name>` pairs to import as
-    // definition, so make the import type definition directly.
-    // FIXME: A follow-up patch should add test coverage for import declaration
-    // in `llvm-link` CLI (e.g., by introducing a new command line option).
     auto &Entry =
         ImportList[FileNameStringCache.insert(FileName).first->getKey()];
-    Entry[F->getGUID()] = GlobalValueSummary::Definition;
+    Entry.insert(F->getGUID());
   }
   auto CachedModuleLoader = [&](StringRef Identifier) {
     return ModuleLoaderCache.takeModule(std::string(Identifier));

From 32ae9a28a54f59f2b4e2f32323f53fb107ea1f85 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 19 May 2024 22:48:06 -0700
Subject: [PATCH 34/60] [llvm] Use SmallString::str (NFC) (#92712)

---
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +-
 llvm/lib/CodeGen/ParallelCG.cpp           | 4 +---
 llvm/lib/LTO/LTOBackend.cpp               | 5 ++---
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index e64051cf538627..c9295344f8080f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2940,7 +2940,7 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
       if (!BB)
         return error("Invalid bbentry record");
 
-      BB->setName(StringRef(ValueName.data(), ValueName.size()));
+      BB->setName(ValueName.str());
       ValueName.clear();
       break;
     }
diff --git a/llvm/lib/CodeGen/ParallelCG.cpp b/llvm/lib/CodeGen/ParallelCG.cpp
index ceb64b2badab56..8ab64f8afe6ec2 100644
--- a/llvm/lib/CodeGen/ParallelCG.cpp
+++ b/llvm/lib/CodeGen/ParallelCG.cpp
@@ -79,9 +79,7 @@ void llvm::splitCodeGen(
               [TMFactory, FileType, ThreadOS](const SmallString<0> &BC) {
                 LLVMContext Ctx;
                 Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
-                    MemoryBufferRef(StringRef(BC.data(), BC.size()),
-                                    "<split-module>"),
-                    Ctx);
+                    MemoryBufferRef(BC.str(), "<split-module>"), Ctx);
                 if (!MOrErr)
                   report_fatal_error("Failed to read bitcode");
                 std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d4b89ede2d7134..21aed799d6fa37 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -452,9 +452,8 @@ static void splitCodeGen(const Config &C, TargetMachine *TM,
         CodegenThreadPool.async(
             [&](const SmallString<0> &BC, unsigned ThreadId) {
               LTOLLVMContext Ctx(C);
-              Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
-                  MemoryBufferRef(StringRef(BC.data(), BC.size()), "ld-temp.o"),
-                  Ctx);
+              Expected<std::unique_ptr<Module>> MOrErr =
+                  parseBitcodeFile(MemoryBufferRef(BC.str(), "ld-temp.o"), Ctx);
               if (!MOrErr)
                 report_fatal_error("Failed to read bitcode");
               std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());

From 7529fe2e92e79eef22a528a7168e4dd777d6e9bd Mon Sep 17 00:00:00 2001
From: Jessica Clarke <jrtc27@jrtc27.com>
Date: Mon, 20 May 2024 07:08:40 +0100
Subject: [PATCH 35/60] [AMDGPU] Only set Info.memVT when not later overridden
 (#92670)

For the amdgcn_*_buffer_load_lds intrinsics this field is later
overriden, so avoid pointlessly calling MVT::getVT in that case.
Importantly, this is also the only case I can find in tree where a
PointerType is passed to MVT::getVT, so this will allow us to forbid
doing so in future, keeping MVT::iPTR as originating solely from
TableGen as was claimed next to its definition in MachineValueType.h
(but lost in the autogeneration conversion).
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c7c4a8faa2fb05..d7b6941fcf81d5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1233,13 +1233,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       // Atomic
       Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
                                             ISD::INTRINSIC_W_CHAIN;
-      Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
       Info.flags |= MachineMemOperand::MOLoad |
                     MachineMemOperand::MOStore |
                     MachineMemOperand::MODereferenceable;
 
       switch (IntrID) {
       default:
+        Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
         // XXX - Should this be volatile without known ordering?
         Info.flags |= MachineMemOperand::MOVolatile;
         break;

From 9500a5d02e23f9b43294e5f662ac099f8989c0e4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 19 May 2024 23:35:15 -0700
Subject: [PATCH 36/60] [MC] Make UseAssemblerInfoForParsing mostly true

Commit 6c0665e22174d474050e85ca367424f6e02476be
(https://reviews.llvm.org/D45164) enabled certain constant expression
evaluation for `MCObjectStreamer` at parse time (e.g. `.if` directives,
see llvm/test/MC/AsmParser/assembler-expressions.s).

`getUseAssemblerInfoForParsing` was added to make `clang -c` handling
inline assembly similar to `MCAsmStreamer` (e.g. `llvm-mc -filetype=asm`),
where such expression folding (related to
`AttemptToFoldSymbolOffsetDifference`) is unavailable.

I believe this is overly conservative. We can make some parse-time
expression folding work for `clang -c` even if `clang -S` would still
report an error, a MCAsmStreamer issue (we cannot print `.if`
directives) that should not restrict the functionality of
MCObjectStreamer.

```
% cat b.cc
asm(R"(
.pushsection .text,"ax"
.globl _start; _start: ret
.if . -_start == 1
  ret
.endif
.popsection
)");
% gcc -S b.cc && gcc -c b.cc
% clang -S -fno-integrated-as b.cc     # succeeded

% clang -c b.cc     # succeeded with this patch
% clang -S b.cc     # still failed
<inline asm>:4:5: error: expected absolute expression
    4 | .if . -_start == 1
      |     ^
1 error generated.
```

However, removing `getUseAssemblerInfoForParsing` would make
MCDwarfFrameEmitter::Emit (for .eh_frame FDE) slow (~4% compile time
regression for sqlite3.c amalgamation) due to expensive
`AttemptToFoldSymbolOffsetDifference`. For now, make
`UseAssemblerInfoForParsing` false in MCDwarfFrameEmitter::Emit.

Close #62520
Link: https://discourse.llvm.org/t/rfc-clang-assembly-object-equivalence-for-files-with-inline-assembly/78841

Pull Request: https://github.com/llvm/llvm-project/pull/91082
---
 clang/tools/driver/cc1as_main.cpp                |  3 ---
 llvm/include/llvm/MC/MCStreamer.h                |  4 +++-
 .../CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp   |  3 ---
 llvm/lib/MC/MCDwarf.cpp                          |  6 ++++++
 llvm/lib/MC/MCObjectStreamer.cpp                 |  3 ---
 llvm/lib/MC/MCStreamer.cpp                       |  2 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp      |  7 ++-----
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp        |  3 ---
 .../AsmParser/assembler-expressions-inlineasm.ll | 16 ++++++++++------
 llvm/tools/llvm-mc/llvm-mc.cpp                   |  3 ---
 llvm/tools/llvm-ml/llvm-ml.cpp                   |  3 ---
 11 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp
index 86afe22fac24cc..4eb753a7297a92 100644
--- a/clang/tools/driver/cc1as_main.cpp
+++ b/clang/tools/driver/cc1as_main.cpp
@@ -576,9 +576,6 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts,
     Str.get()->emitZeros(1);
   }
 
-  // Assembly to object compilation should leverage assembly info.
-  Str->setUseAssemblerInfoForParsing(true);
-
   bool Failed = false;
 
   std::unique_ptr<MCAsmParser> Parser(
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 69867620e1bf8a..b7468cf70a6643 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -245,7 +245,7 @@ class MCStreamer {
   /// requires.
   unsigned NextWinCFIID = 0;
 
-  bool UseAssemblerInfoForParsing;
+  bool UseAssemblerInfoForParsing = true;
 
   /// Is the assembler allowed to insert padding automatically?  For
   /// correctness reasons, we sometimes need to ensure instructions aren't
@@ -296,6 +296,8 @@ class MCStreamer {
 
   MCContext &getContext() const { return Context; }
 
+  // MCObjectStreamer has an MCAssembler and allows more expression folding at
+  // parse time.
   virtual MCAssembler *getAssemblerPtr() { return nullptr; }
 
   void setUseAssemblerInfoForParsing(bool v) { UseAssemblerInfoForParsing = v; }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index d0ef3e5a19391c..08e3c208ba4d38 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -102,9 +102,6 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
   std::unique_ptr<MCAsmParser> Parser(
       createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
 
-  // Do not use assembler-level information for parsing inline assembly.
-  OutStreamer->setUseAssemblerInfoForParsing(false);
-
   // We create a new MCInstrInfo here since we might be at the module level
   // and not have a MachineFunction to initialize the TargetInstrInfo from and
   // we only need MCInstrInfo for asm parsing. We create one unconditionally
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 2ee0c3eb27b92e..aba4071e6b910e 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -1910,6 +1910,11 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
                     [](const MCDwarfFrameInfo &X, const MCDwarfFrameInfo &Y) {
                       return CIEKey(X) < CIEKey(Y);
                     });
+  // Disable AttemptToFoldSymbolOffsetDifference folding of fdeStart-cieStart
+  // for EmitFDE due to the the performance issue. The label differences will be
+  // evaluate at write time.
+  assert(Streamer.getUseAssemblerInfoForParsing());
+  Streamer.setUseAssemblerInfoForParsing(false);
   for (auto I = FrameArrayX.begin(), E = FrameArrayX.end(); I != E;) {
     const MCDwarfFrameInfo &Frame = *I;
     ++I;
@@ -1930,6 +1935,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
 
     Emitter.EmitFDE(*CIEStart, Frame, I == E, *SectionStart);
   }
+  Streamer.setUseAssemblerInfoForParsing(true);
 }
 
 void MCDwarfFrameEmitter::encodeAdvanceLoc(MCContext &Context,
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index d2da5d0d3f90f2..0ccade91677a41 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -40,9 +40,6 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context,
 
 MCObjectStreamer::~MCObjectStreamer() = default;
 
-// AssemblerPtr is used for evaluation of expressions and causes
-// difference between asm and object outputs. Return nullptr to in
-// inline asm mode to limit divergence to assembly inputs.
 MCAssembler *MCObjectStreamer::getAssemblerPtr() {
   if (getUseAssemblerInfoForParsing())
     return Assembler.get();
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 176d55aa890bed..199d865ea3496d 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -93,7 +93,7 @@ void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
 
 MCStreamer::MCStreamer(MCContext &Ctx)
     : Context(Ctx), CurrentWinFrameInfo(nullptr),
-      CurrentProcWinFrameInfoStartIndex(0), UseAssemblerInfoForParsing(false) {
+      CurrentProcWinFrameInfoStartIndex(0) {
   SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b7388ed9e85a85..bd48a5f80c8284 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -517,12 +517,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   DumpCodeInstEmitter = nullptr;
   if (STM.dumpCode()) {
-    // For -dumpcode, get the assembler out of the streamer, even if it does
-    // not really want to let us have it. This only works with -filetype=obj.
-    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
-    OutStreamer->setUseAssemblerInfoForParsing(true);
+    // For -dumpcode, get the assembler out of the streamer. This only works
+    // with -filetype=obj.
     MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
-    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
     if (Assembler)
       DumpCodeInstEmitter = Assembler->getEmitterPtr();
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 2ebe5bdc47715b..ad015808604487 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -114,12 +114,9 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) {
   // Bound is an approximation that accounts for the maximum used register
   // number and number of generated OpLabels
   unsigned Bound = 2 * (ST->getBound() + 1) + NLabels;
-  bool FlagToRestore = OutStreamer->getUseAssemblerInfoForParsing();
-  OutStreamer->setUseAssemblerInfoForParsing(true);
   if (MCAssembler *Asm = OutStreamer->getAssemblerPtr())
     Asm->setBuildVersion(static_cast<MachO::PlatformType>(0), Major, Minor,
                          Bound, VersionTuple(Major, Minor, 0, Bound));
-  OutStreamer->setUseAssemblerInfoForParsing(FlagToRestore);
 }
 
 void SPIRVAsmPrinter::emitFunctionHeader() {
diff --git a/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll b/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll
index 35f110f37e2fb6..9d9a38f5b5a54b 100644
--- a/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll
+++ b/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll
@@ -1,13 +1,17 @@
-; RUN: not llc -mtriple x86_64-unknown-linux-gnu -o %t.s -filetype=asm %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple x86_64-unknown-linux-gnu -o %t.o -filetype=obj %s 2>&1 | FileCheck %s
-
-; Assembler-aware expression evaluation should be disabled in inline
-; assembly to prevent differences in behavior between object and
-; assembly output.
+; RUN: not llc -mtriple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc -mtriple=x86_64 -no-integrated-as < %s | FileCheck %s --check-prefix=GAS
+; RUN: llc -mtriple=x86_64 -filetype=obj %s -o - | llvm-objdump -d - |  FileCheck %s --check-prefix=DISASM
 
+; GAS: nop;  .if . - foo==1; nop;.endif
 
 ; CHECK: <inline asm>:1:17: error: expected absolute expression
 
+; DISASM:      <main>:
+; DISASM-NEXT:   nop
+; DISASM-NEXT:   nop
+; DISASM-NEXT:   xorl %eax, %eax
+; DISASM-NEXT:   retq
+
 define i32 @main() local_unnamed_addr {
   tail call void asm sideeffect "foo: nop;  .if . - foo==1;  nop;.endif", "~{dirflag},~{fpsr},~{flags}"()
   ret i32 0
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index 807071a7b9a16a..506e4f22ef8f54 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -569,9 +569,6 @@ int main(int argc, char **argv) {
       Str->initSections(true, *STI);
   }
 
-  // Use Assembler information for parsing.
-  Str->setUseAssemblerInfoForParsing(true);
-
   int Res = 1;
   bool disassemble = false;
   switch (Action) {
diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp
index 1cac576f54e77f..f1f39af059aa49 100644
--- a/llvm/tools/llvm-ml/llvm-ml.cpp
+++ b/llvm/tools/llvm-ml/llvm-ml.cpp
@@ -428,9 +428,6 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) {
     Str->emitAssignment(Feat00Sym, MCConstantExpr::create(Feat00Flags, Ctx));
   }
 
-  // Use Assembler information for parsing.
-  Str->setUseAssemblerInfoForParsing(true);
-
   int Res = 1;
   if (InputArgs.hasArg(OPT_as_lex)) {
     // -as-lex; Lex only, and output a stream of tokens

From eac743d1b01fd44bc742e1ccc2be8360908bdbf8 Mon Sep 17 00:00:00 2001
From: YunQiang Su <syq@debian.org>
Date: Mon, 20 May 2024 14:46:47 +0800
Subject: [PATCH 37/60] MIPS: Support '%w' token in inline asm template for MSA
 (#91920)

MSA registers share the FPRs as its bottom half. So that we can use MSA
instructions to work with normal float/double:
   double a, b, c;
   asm volatile ("fmadd.d %w0, %w1, %w2" : "+f"(a) : "f"(b), "f"(c));

GCC has support it for quite long time.
---
 llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h |  9 +++++++++
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp          | 11 +++++++----
 llvm/test/CodeGen/Mips/msa/inline-asm.ll         | 16 ++++++++++++++++
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index 02ab5ede2c1a40..aa35e7db6bda44 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -135,6 +135,15 @@ namespace MipsII {
     OPERAND_LAST_MIPS_MEM_IMM = OPERAND_MEM_SIMM9
   };
 }
+
+inline static MCRegister getMSARegFromFReg(MCRegister Reg) {
+  if (Reg >= Mips::F0 && Reg <= Mips::F31)
+    return Reg - Mips::F0 + Mips::W0;
+  else if (Reg >= Mips::D0_64 && Reg <= Mips::D31_64)
+    return Reg - Mips::D0_64 + Mips::W0;
+  else
+    return Mips::NoRegister;
+}
 }
 
 #endif
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 66b2b0de8d52a3..dda33f9a180876 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -565,12 +565,15 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       }
       break;
     }
-    case 'w':
-      // Print MSA registers for the 'f' constraint
-      // In LLVM, the 'w' modifier doesn't need to do anything.
-      // We can just call printOperand as normal.
+    case 'w': {
+      MCRegister w = getMSARegFromFReg(MO.getReg());
+      if (w != Mips::NoRegister) {
+        O << '$' << MipsInstPrinter::getRegisterName(w);
+        return false;
+      }
       break;
     }
+    }
   }
 
   printOperand(MI, OpNum, O);
diff --git a/llvm/test/CodeGen/Mips/msa/inline-asm.ll b/llvm/test/CodeGen/Mips/msa/inline-asm.ll
index 57cd78a25647c1..f84b11e05387ed 100644
--- a/llvm/test/CodeGen/Mips/msa/inline-asm.ll
+++ b/llvm/test/CodeGen/Mips/msa/inline-asm.ll
@@ -32,3 +32,19 @@ entry:
   store <4 x i32> %1, ptr @v4i32_r
   ret void
 }
+
+define dso_local double @test4(double noundef %a, double noundef %b, double noundef %c) {
+entry:
+  ; CHECK-LABEL: test4:
+  %0 = tail call double asm sideeffect "fmadd.d ${0:w}, ${1:w}, ${2:w}", "=f,f,f,0,~{$1}"(double %b, double %c, double %a)
+  ; CHECK: fmadd.d $w{{([0-9]|[1-3][0-9])}}, $w{{([0-9]|[1-3][0-9])}}, $w{{([0-9]|[1-3][0-9])}}
+  ret double %0
+}
+
+define dso_local float @test5(float noundef %a, float noundef %b, float noundef %c) {
+entry:
+  ; CHECK-LABEL: test5:
+  %0 = tail call float asm sideeffect "fmadd.w ${0:w}, ${1:w}, ${2:w}", "=f,f,f,0,~{$1}"(float %b, float %c, float %a)
+  ; CHECK: fmadd.w $w{{([0-9]|[1-3][0-9])}}, $w{{([0-9]|[1-3][0-9])}}, $w{{([0-9]|[1-3][0-9])}}
+  ret float %0
+}

From d59bc6b5c75384aa0b1e78cc85e17e8acaccebaf Mon Sep 17 00:00:00 2001
From: YunQiang Su <syq@debian.org>
Date: Mon, 20 May 2024 14:48:34 +0800
Subject: [PATCH 38/60] Clang/MIPS: Add +fp64 if MSA and no explicit -mfp
 option (#91949)

MSA requires -mfp64. If FP64 is supported by CPU (mips32r2+), and no
-mfp32/-mfpxx is explicitly given, let's add +fp64. Otherwise some cmd
like
   clang --target=mips -mips32r5 -mmsa
will issue LLVM backend ICE.
---
 clang/lib/Driver/ToolChains/Arch/Mips.cpp | 10 ++++++++++
 clang/test/Driver/mips-as.c               |  2 +-
 clang/test/Driver/mips-features.c         |  6 ++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
index 74a8874a3ea2b7..79a00711e6f53c 100644
--- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp
@@ -369,6 +369,9 @@ void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   } else if (mips::isFP64ADefault(Triple, CPUName)) {
     Features.push_back("+fp64");
     Features.push_back("+nooddspreg");
+  } else if (Arg *A = Args.getLastArg(options::OPT_mmsa)) {
+    if (A->getOption().matches(options::OPT_mmsa))
+      Features.push_back("+fp64");
   }
 
   AddTargetFeature(Args, Features, options::OPT_mno_odd_spreg,
@@ -499,6 +502,13 @@ bool mips::shouldUseFPXX(const ArgList &Args, const llvm::Triple &Triple,
                                options::OPT_mdouble_float))
     if (A->getOption().matches(options::OPT_msingle_float))
       UseFPXX = false;
+  // FP64 should be used for MSA.
+  if (Arg *A = Args.getLastArg(options::OPT_mmsa))
+    if (A->getOption().matches(options::OPT_mmsa))
+      UseFPXX = llvm::StringSwitch<bool>(CPUName)
+                    .Cases("mips32r2", "mips32r3", "mips32r5", false)
+                    .Cases("mips64r2", "mips64r3", "mips64r5", false)
+                    .Default(UseFPXX);
 
   return UseFPXX;
 }
diff --git a/clang/test/Driver/mips-as.c b/clang/test/Driver/mips-as.c
index 14fbb18c93500b..a3399f1078fcd6 100644
--- a/clang/test/Driver/mips-as.c
+++ b/clang/test/Driver/mips-as.c
@@ -266,7 +266,7 @@
 // RUN: %clang -target mips-linux-gnu -mno-msa -mmsa -### \
 // RUN:   -no-integrated-as -fno-pic -c %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-MSA %s
-// MIPS-MSA: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "32" "-mno-shared" "-call_nonpic" "-EB" "-mfpxx" "-mmsa"
+// MIPS-MSA: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "32" "-mno-shared" "-call_nonpic" "-EB" "-mmsa"
 //
 // RUN: %clang -target mips-linux-gnu -mmsa -mno-msa -### \
 // RUN:   -no-integrated-as -fno-pic -c %s 2>&1 \
diff --git a/clang/test/Driver/mips-features.c b/clang/test/Driver/mips-features.c
index 5e92dccaa02abb..8b8db4c4a341bb 100644
--- a/clang/test/Driver/mips-features.c
+++ b/clang/test/Driver/mips-features.c
@@ -163,6 +163,12 @@
 // RUN:   | FileCheck --check-prefix=CHECK-NOMMSA %s
 // CHECK-NOMMSA: "-target-feature" "-msa"
 //
+// -mmsa
+// RUN: %clang -target mips-linux-gnu -### -c %s \
+// RUN:     -mmsa 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-MMSA-MFP64 %s
+// CHECK-MMSA-MFP64: "-target-feature" "+msa" "-target-feature" "+fp64"
+//
 // -mmt
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:     -mno-mt -mmt 2>&1 \

From 073488cb1f2ca131253efa3171bd56be34ba9fb3 Mon Sep 17 00:00:00 2001
From: YunQiang Su <syq@debian.org>
Date: Mon, 20 May 2024 14:50:26 +0800
Subject: [PATCH 39/60] MIPS/Clang: Use FP32 by default if CPU is mips1
 (#92122)

FP32 is the only supported FPMode of mips1.
FPXX requires MIPS2+ and FP64 requires MIPS32r2+.
---
 clang/lib/Basic/Targets/Mips.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index 730deb674aa579..f76c6ece8bf481 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -85,8 +85,13 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     return CPU == "mips32r6" || CPU == "mips64r6";
   }
 
-  bool isFP64Default() const {
-    return CPU == "mips32r6" || ABI == "n32" || ABI == "n64" || ABI == "64";
+  enum FPModeEnum getDefaultFPMode() const {
+    if (CPU == "mips32r6" || ABI == "n32" || ABI == "n64" || ABI == "64")
+      return FP64;
+    else if (CPU == "mips1")
+      return FP32;
+    else
+      return FPXX;
   }
 
   bool isNan2008() const override { return IsNan2008; }
@@ -315,8 +320,8 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     IsSingleFloat = false;
     FloatABI = HardFloat;
     DspRev = NoDSP;
-    FPMode = isFP64Default() ? FP64 : FPXX;
     NoOddSpreg = false;
+    FPMode = getDefaultFPMode();
     bool OddSpregGiven = false;
     bool StrictAlign = false;
 

From dd8cb3d4f120edcf5fc3939594ee086c44010274 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 20 May 2024 00:13:09 -0700
Subject: [PATCH 40/60] [ELF] Support high address DW_EH_sdata4 for ELFCLASS32

When the address pointer encoding in FDEs uses
DW_EH_PE_absptr|DW_EH_PE_sdata4, the address is sign-extended to 64-bit
by `readFdeAddr`. We should truncate the address to 32-bit for
ELFCLASS32. Otherwise, `isInt<32>(pc - va)` could be false, leading to a
spurious error in `getFdeData`.

In LLVM, this appears a MIPS-specific issue.
Fix #88852

Pull Request: https://github.com/llvm/llvm-project/pull/92438
---
 lld/ELF/SyntheticSections.cpp    | 2 +-
 lld/test/ELF/mips-eh_frame-pic.s | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 22bfed0852bcae..ad280289cebf9b 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -613,7 +613,7 @@ uint64_t EhFrameSection::getFdePc(uint8_t *buf, size_t fdeOff,
   size_t off = fdeOff + 8;
   uint64_t addr = readFdeAddr(buf + off, enc & 0xf);
   if ((enc & 0x70) == DW_EH_PE_absptr)
-    return addr;
+    return config->is64 ? addr : uint32_t(addr);
   if ((enc & 0x70) == DW_EH_PE_pcrel)
     return addr + getParent()->addr + off + outSecOff;
   fatal("unknown FDE size relative encoding");
diff --git a/lld/test/ELF/mips-eh_frame-pic.s b/lld/test/ELF/mips-eh_frame-pic.s
index 79076e74a7e3fc..fd8560bc0163ff 100644
--- a/lld/test/ELF/mips-eh_frame-pic.s
+++ b/lld/test/ELF/mips-eh_frame-pic.s
@@ -27,6 +27,11 @@
 ## relative addressing.
 # NOPIC32-ERR: ld.lld: error: relocation R_MIPS_32 cannot be used against local symbol
 
+## https://github.com/llvm/llvm-project/issues/88852: getFdePc should return a
+## 32-bit address.
+# RUN: ld.lld --eh-frame-hdr -Ttext=0x80000000 %t-nopic32.o -o %t-nopic32
+# RUN: llvm-readelf -x .eh_frame_hdr %t-nopic32 | FileCheck %s --check-prefix=NOPIC32-HDR
+
 ## For -fPIC, .eh_frame should contain DW_EH_PE_pcrel | DW_EH_PE_sdata4 values:
 # RUN: llvm-mc -filetype=obj -triple=mips-unknown-linux --position-independent %s -o %t-pic32.o
 # RUN: llvm-readobj -r %t-pic32.o | FileCheck %s --check-prefixes=RELOCS,PIC32-RELOCS
@@ -51,6 +56,10 @@
 ## Note: ld.bfd converts the R_MIPS_64 relocs to DW_EH_PE_pcrel | DW_EH_PE_sdata8
 ## for N64 ABI (and DW_EH_PE_pcrel | DW_EH_PE_sdata4 for MIPS32)
 
+# NOPIC32-HDR: Hex dump of section '.eh_frame_hdr':
+# NOPIC32-HDR: 0x80010038 011b033b 00000010 00000001 fffeffc8 .
+# NOPIC32-HDR: 0x80010048 00000028                            .
+
 .ent func
 .global func
 func:

From 2143b7cd7d184b3f3bc4a997ea925ab7574c93f9 Mon Sep 17 00:00:00 2001
From: Chen Zheng <czhengsz@cn.ibm.com>
Date: Mon, 20 May 2024 03:10:08 -0400
Subject: [PATCH 41/60] [PowerPC]perform bitcast lowering only at 64 bit

Perform bitcast lowering requires 64-bit to be native supported,
However this is not true on 32-bit targets. Explicitly require
64-bit target.

Fixes #92233
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  2 +-
 llvm/test/CodeGen/PowerPC/pr92233.ll        | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr92233.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index ad86c393ba7919..8450ce9e0e3b3b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9338,7 +9338,7 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
   if ((Op.getValueType() != MVT::f128) ||
       (Op0.getOpcode() != ISD::BUILD_PAIR) ||
       (Op0.getOperand(0).getValueType() != MVT::i64) ||
-      (Op0.getOperand(1).getValueType() != MVT::i64))
+      (Op0.getOperand(1).getValueType() != MVT::i64) || !Subtarget.isPPC64())
     return SDValue();
 
   return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
diff --git a/llvm/test/CodeGen/PowerPC/pr92233.ll b/llvm/test/CodeGen/PowerPC/pr92233.ll
new file mode 100644
index 00000000000000..858d665909fe86
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr92233.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mcpu=pwr9 -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s
+
+define internal fp128 @f(i128 %v) nounwind {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stwu 1, -32(1)
+; CHECK-NEXT:    stw 6, 28(1)
+; CHECK-NEXT:    stw 5, 24(1)
+; CHECK-NEXT:    stw 4, 20(1)
+; CHECK-NEXT:    stw 3, 16(1)
+; CHECK-NEXT:    lxv 34, 16(1)
+; CHECK-NEXT:    addi 1, 1, 32
+; CHECK-NEXT:    blr
+entry:
+  %cast = bitcast i128 %v to fp128
+  ret fp128 %cast
+}
+

From a027bea438e285380450f5b380be072f44ee0312 Mon Sep 17 00:00:00 2001
From: hev <wangrui@loongson.cn>
Date: Mon, 20 May 2024 15:24:52 +0800
Subject: [PATCH 42/60] [LoongArch] Select {DIV,MOD}.{W,WU} instruction to
 eliminate explicit sign extension (#92205)

---
 .../LoongArch/LoongArchISelLowering.cpp       | 13 +++
 .../Target/LoongArch/LoongArchISelLowering.h  |  4 +
 .../Target/LoongArch/LoongArchInstrInfo.td    |  6 ++
 .../ir-instruction/sdiv-udiv-srem-urem.ll     | 96 +++++++------------
 4 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index fe2c613b1b30f1..8a87c82a205bbc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -139,6 +139,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
     setOperationAction(ISD::BITREVERSE, MVT::i32, Custom);
     setOperationAction(ISD::BSWAP, MVT::i32, Custom);
+    setOperationAction({ISD::UDIV, ISD::UREM}, MVT::i32, Custom);
   }
 
   // Set operations for LA32 only.
@@ -1665,6 +1666,10 @@ static LoongArchISD::NodeType getLoongArchWOpcode(unsigned Opcode) {
   switch (Opcode) {
   default:
     llvm_unreachable("Unexpected opcode");
+  case ISD::UDIV:
+    return LoongArchISD::DIV_WU;
+  case ISD::UREM:
+    return LoongArchISD::MOD_WU;
   case ISD::SHL:
     return LoongArchISD::SLL_W;
   case ISD::SRA:
@@ -1841,6 +1846,12 @@ void LoongArchTargetLowering::ReplaceNodeResults(
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Don't know how to legalize this operation");
+  case ISD::UDIV:
+  case ISD::UREM:
+    assert(VT == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+    Results.push_back(customLegalizeToWOp(N, DAG, 2, ISD::SIGN_EXTEND));
+    break;
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -3445,6 +3456,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(BITREV_W)
     NODE_NAME_CASE(ROTR_W)
     NODE_NAME_CASE(ROTL_W)
+    NODE_NAME_CASE(DIV_WU)
+    NODE_NAME_CASE(MOD_WU)
     NODE_NAME_CASE(CLZ_W)
     NODE_NAME_CASE(CTZ_W)
     NODE_NAME_CASE(DBAR)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index de3f45172e25a6..f274b1971fd232 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -43,6 +43,10 @@ enum NodeType : unsigned {
   ROTL_W,
   ROTR_W,
 
+  // unsigned 32-bit integer division
+  DIV_WU,
+  MOD_WU,
+
   // FPR<->GPR transfer operations
   MOVGR2FR_W_LA64,
   MOVFR2GR_S_LA64,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index f56f8f7e1179c2..35ea9f07866d5a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -85,6 +85,8 @@ def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_rotr_w : SDNode<"LoongArchISD::ROTR_W", SDT_LoongArchIntBinOpW>;
+def loongarch_div_wu : SDNode<"LoongArchISD::DIV_WU", SDT_LoongArchIntBinOpW>;
+def loongarch_mod_wu : SDNode<"LoongArchISD::MOD_WU", SDT_LoongArchIntBinOpW>;
 def loongarch_crc_w_b_w
     : SDNode<"LoongArchISD::CRC_W_B_W", SDT_LoongArchIntBinOpW, [SDNPHasChain]>;
 def loongarch_crc_w_h_w
@@ -1110,9 +1112,13 @@ def : PatGprImm_32<add, ADDI_W, simm12>;
 def : PatGprGpr<sub, SUB_D>;
 def : PatGprGpr_32<sub, SUB_W>;
 def : PatGprGpr<sdiv, DIV_D>;
+def : PatGprGpr_32<sdiv, DIV_W>;
 def : PatGprGpr<udiv, DIV_DU>;
+def : PatGprGpr<loongarch_div_wu, DIV_WU>;
 def : PatGprGpr<srem, MOD_D>;
+def : PatGprGpr_32<srem, MOD_W>;
 def : PatGprGpr<urem, MOD_DU>;
+def : PatGprGpr<loongarch_mod_wu, MOD_WU>;
 def : PatGprGpr<rotr, ROTR_D>;
 def : PatGprGpr<loongarch_rotr_w, ROTR_W>;
 def : PatGprGpr_32<rotr, ROTR_W>;
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll
index 2064c398948fed..ab3eec240db3c1 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem.ll
@@ -191,8 +191,7 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) {
 ; LA64:       # %bb.0: # %entry
 ; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:    addi.w $a0, $a0, 0
-; LA64-NEXT:    div.d $a0, $a0, $a1
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    div.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: sdiv_si32_ui32_ui32:
@@ -208,12 +207,11 @@ define signext i32 @sdiv_si32_ui32_ui32(i32 %a, i32 %b) {
 ; LA64-TRAP:       # %bb.0: # %entry
 ; LA64-TRAP-NEXT:    addi.w $a1, $a1, 0
 ; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
-; LA64-TRAP-NEXT:    div.d $a0, $a0, $a1
+; LA64-TRAP-NEXT:    div.w $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB5_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
 ; LA64-TRAP-NEXT:  .LBB5_2: # %entry
-; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
 ; LA64-TRAP-NEXT:    ret
 entry:
   %r = sdiv i32 %a, %b
@@ -228,8 +226,7 @@ define signext i32 @sdiv_si32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-LABEL: sdiv_si32_si32_si32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    div.d $a0, $a0, $a1
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    div.w $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: sdiv_si32_si32_si32:
@@ -243,12 +240,11 @@ define signext i32 @sdiv_si32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-TRAP-LABEL: sdiv_si32_si32_si32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    div.d $a0, $a0, $a1
+; LA64-TRAP-NEXT:    div.w $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB6_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
 ; LA64-TRAP-NEXT:  .LBB6_2: # %entry
-; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
 ; LA64-TRAP-NEXT:    ret
 entry:
   %r = sdiv i32 %a, %b
@@ -407,9 +403,9 @@ define i32 @udiv_i32(i32 %a, i32 %b) {
 ;
 ; LA64-LABEL: udiv_i32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    div.du $a0, $a0, $a1
+; LA64-NEXT:    addi.w $a1, $a1, 0
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: udiv_i32:
@@ -423,9 +419,9 @@ define i32 @udiv_i32(i32 %a, i32 %b) {
 ;
 ; LA64-TRAP-LABEL: udiv_i32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    div.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    addi.w $a1, $a1, 0
+; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
+; LA64-TRAP-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB11_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
@@ -444,9 +440,7 @@ define i32 @udiv_ui32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-LABEL: udiv_ui32_si32_si32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    div.du $a0, $a0, $a1
+; LA64-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: udiv_ui32_si32_si32:
@@ -460,9 +454,7 @@ define i32 @udiv_ui32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-TRAP-LABEL: udiv_ui32_si32_si32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    div.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB12_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
@@ -481,10 +473,9 @@ define signext i32 @udiv_si32_ui32_ui32(i32 %a, i32 %b) {
 ;
 ; LA64-LABEL: udiv_si32_ui32_ui32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    div.du $a0, $a0, $a1
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: udiv_si32_ui32_ui32:
@@ -498,14 +489,13 @@ define signext i32 @udiv_si32_ui32_ui32(i32 %a, i32 %b) {
 ;
 ; LA64-TRAP-LABEL: udiv_si32_ui32_ui32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    div.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    addi.w $a1, $a1, 0
+; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
+; LA64-TRAP-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB13_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
 ; LA64-TRAP-NEXT:  .LBB13_2: # %entry
-; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
 ; LA64-TRAP-NEXT:    ret
 entry:
   %r = udiv i32 %a, %b
@@ -520,10 +510,7 @@ define signext i32 @udiv_si32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-LABEL: udiv_si32_si32_si32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    div.du $a0, $a0, $a1
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: udiv_si32_si32_si32:
@@ -537,14 +524,11 @@ define signext i32 @udiv_si32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-TRAP-LABEL: udiv_si32_si32_si32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    div.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    div.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB14_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
 ; LA64-TRAP-NEXT:  .LBB14_2: # %entry
-; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
 ; LA64-TRAP-NEXT:    ret
 entry:
   %r = udiv i32 %a, %b
@@ -995,9 +979,9 @@ define i32 @urem_i32(i32 %a, i32 %b) {
 ;
 ; LA64-LABEL: urem_i32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    mod.du $a0, $a0, $a1
+; LA64-NEXT:    addi.w $a1, $a1, 0
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: urem_i32:
@@ -1011,9 +995,9 @@ define i32 @urem_i32(i32 %a, i32 %b) {
 ;
 ; LA64-TRAP-LABEL: urem_i32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    mod.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    addi.w $a1, $a1, 0
+; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
+; LA64-TRAP-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB27_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
@@ -1032,9 +1016,7 @@ define i32 @urem_ui32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-LABEL: urem_ui32_si32_si32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    mod.du $a0, $a0, $a1
+; LA64-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: urem_ui32_si32_si32:
@@ -1048,9 +1030,7 @@ define i32 @urem_ui32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-TRAP-LABEL: urem_ui32_si32_si32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    mod.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB28_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
@@ -1069,10 +1049,9 @@ define signext i32 @urem_si32_ui32_ui32(i32 %a, i32 %b) {
 ;
 ; LA64-LABEL: urem_si32_ui32_ui32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    mod.du $a0, $a0, $a1
+; LA64-NEXT:    addi.w $a1, $a1, 0
 ; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: urem_si32_ui32_ui32:
@@ -1086,14 +1065,13 @@ define signext i32 @urem_si32_ui32_ui32(i32 %a, i32 %b) {
 ;
 ; LA64-TRAP-LABEL: urem_si32_ui32_ui32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    mod.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    addi.w $a1, $a1, 0
+; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
+; LA64-TRAP-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB29_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
 ; LA64-TRAP-NEXT:  .LBB29_2: # %entry
-; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
 ; LA64-TRAP-NEXT:    ret
 entry:
   %r = urem i32 %a, %b
@@ -1108,10 +1086,7 @@ define signext i32 @urem_si32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-LABEL: urem_si32_si32_si32:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-NEXT:    mod.du $a0, $a0, $a1
-; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-NEXT:    ret
 ;
 ; LA32-TRAP-LABEL: urem_si32_si32_si32:
@@ -1125,14 +1100,11 @@ define signext i32 @urem_si32_si32_si32(i32 signext %a, i32 signext %b) {
 ;
 ; LA64-TRAP-LABEL: urem_si32_si32_si32:
 ; LA64-TRAP:       # %bb.0: # %entry
-; LA64-TRAP-NEXT:    bstrpick.d $a1, $a1, 31, 0
-; LA64-TRAP-NEXT:    bstrpick.d $a0, $a0, 31, 0
-; LA64-TRAP-NEXT:    mod.du $a0, $a0, $a1
+; LA64-TRAP-NEXT:    mod.wu $a0, $a0, $a1
 ; LA64-TRAP-NEXT:    bnez $a1, .LBB30_2
 ; LA64-TRAP-NEXT:  # %bb.1: # %entry
 ; LA64-TRAP-NEXT:    break 7
 ; LA64-TRAP-NEXT:  .LBB30_2: # %entry
-; LA64-TRAP-NEXT:    addi.w $a0, $a0, 0
 ; LA64-TRAP-NEXT:    ret
 entry:
   %r = urem i32 %a, %b

From 6582efc263c0df921b88b03bbdcd28a89daaa641 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Mon, 20 May 2024 09:36:45 +0200
Subject: [PATCH 43/60] [Clang] Fix __is_array returning true for zero-sized
 arrays (#86652)

Fixes #54705
---
 clang/docs/ReleaseNotes.rst        | 3 +++
 clang/lib/Sema/SemaExprCXX.cpp     | 8 ++++++++
 clang/test/SemaCXX/type-traits.cpp | 4 ++++
 3 files changed, 15 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7af5869d21768d..5a123b0b86ddac 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -586,6 +586,9 @@ Bug Fixes in This Version
 - Clang now correctly disallows VLA type compound literals, e.g. ``(int[size]){}``,
   as the C standard mandates. (#GH89835)
 
+- ``__is_array`` and ``__is_bounded_array`` no longer return ``true`` for
+  zero-sized arrays. Fixes (#GH54705).
+
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index e4601f7d6c47d5..f543e006060d62 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -5217,10 +5217,18 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT,
   case UTT_IsFloatingPoint:
     return T->isFloatingType();
   case UTT_IsArray:
+    // Zero-sized arrays aren't considered arrays in partial specializations,
+    // so __is_array shouldn't consider them arrays either.
+    if (const auto *CAT = C.getAsConstantArrayType(T))
+      return CAT->getSize() != 0;
     return T->isArrayType();
   case UTT_IsBoundedArray:
     if (DiagnoseVLAInCXXTypeTrait(Self, TInfo, tok::kw___is_bounded_array))
       return false;
+    // Zero-sized arrays aren't considered arrays in partial specializations,
+    // so __is_bounded_array shouldn't consider them arrays either.
+    if (const auto *CAT = C.getAsConstantArrayType(T))
+      return CAT->getSize() != 0;
     return T->isArrayType() && !T->isIncompleteArrayType();
   case UTT_IsUnboundedArray:
     if (DiagnoseVLAInCXXTypeTrait(Self, TInfo, tok::kw___is_unbounded_array))
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index f2fd45762abf81..d40605f56f1edd 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -25,6 +25,7 @@ typedef Empty EmptyArMB[1][2];
 typedef int Int;
 typedef Int IntAr[10];
 typedef Int IntArNB[];
+typedef Int IntArZero[0];
 class Statics { static int priv; static NonPOD np; };
 union EmptyUnion {};
 union IncompleteUnion; // expected-note {{forward declaration of 'IncompleteUnion'}}
@@ -685,6 +686,7 @@ void is_array()
 {
   static_assert(__is_array(IntAr));
   static_assert(__is_array(IntArNB));
+  static_assert(!__is_array(IntArZero));
   static_assert(__is_array(UnionAr));
 
   static_assert(!__is_array(void));
@@ -714,6 +716,7 @@ void is_array()
 void is_bounded_array(int n) {
   static_assert(__is_bounded_array(IntAr));
   static_assert(!__is_bounded_array(IntArNB));
+  static_assert(!__is_bounded_array(IntArZero));
   static_assert(__is_bounded_array(UnionAr));
 
   static_assert(!__is_bounded_array(void));
@@ -746,6 +749,7 @@ void is_bounded_array(int n) {
 void is_unbounded_array(int n) {
   static_assert(!__is_unbounded_array(IntAr));
   static_assert(__is_unbounded_array(IntArNB));
+  static_assert(!__is_unbounded_array(IntArZero));
   static_assert(!__is_unbounded_array(UnionAr));
 
   static_assert(!__is_unbounded_array(void));

From da6a0b7af29a222b2e16a10155b49d4fafe967f3 Mon Sep 17 00:00:00 2001
From: Sven van Haastregt <sven.vanhaastregt@arm.com>
Date: Mon, 20 May 2024 09:37:53 +0200
Subject: [PATCH 44/60] [OpenCL] Add cl_khr_kernel_clock builtins (#91950)

---
 clang/lib/Headers/opencl-c-base.h |  4 ++++
 clang/lib/Headers/opencl-c.h      | 15 +++++++++++++++
 clang/lib/Sema/OpenCLBuiltins.td  | 14 ++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index 2494f6213fc569..786678b9d8a753 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -46,6 +46,10 @@
 #define __opencl_c_ext_fp32_global_atomic_min_max 1
 #define __opencl_c_ext_fp32_local_atomic_min_max 1
 #define __opencl_c_ext_image_raw10_raw12 1
+#define cl_khr_kernel_clock 1
+#define __opencl_c_kernel_clock_scope_device 1
+#define __opencl_c_kernel_clock_scope_work_group 1
+#define __opencl_c_kernel_clock_scope_sub_group 1
 
 #endif // defined(__SPIR__) || defined(__SPIRV__)
 #endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h
index 288bb18bc654eb..20719b74b6b8d1 100644
--- a/clang/lib/Headers/opencl-c.h
+++ b/clang/lib/Headers/opencl-c.h
@@ -17314,6 +17314,21 @@ half __ovld __conv sub_group_clustered_rotate(half, int, uint);
 #endif // cl_khr_fp16
 #endif // cl_khr_subgroup_rotate
 
+#if defined(cl_khr_kernel_clock)
+#if defined(__opencl_c_kernel_clock_scope_device)
+ulong __ovld clock_read_device();
+uint2 __ovld clock_read_hilo_device();
+#endif // __opencl_c_kernel_clock_scope_device
+#if defined(__opencl_c_kernel_clock_scope_work_group)
+ulong __ovld clock_read_work_group();
+uint2 __ovld clock_read_hilo_work_group();
+#endif // __opencl_c_kernel_clock_scope_work_group
+#if defined(__opencl_c_kernel_clock_scope_sub_group)
+ulong __ovld clock_read_sub_group();
+uint2 __ovld clock_read_hilo_sub_group();
+#endif // __opencl_c_kernel_clock_scope_sub_group
+#endif // cl_khr_kernel_clock
+
 #if defined(cl_intel_subgroups)
 // Intel-Specific Sub Group Functions
 float   __ovld __conv intel_sub_group_shuffle( float , uint );
diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td
index a7bdfe20b98280..4da61429fcce70 100644
--- a/clang/lib/Sema/OpenCLBuiltins.td
+++ b/clang/lib/Sema/OpenCLBuiltins.td
@@ -1852,6 +1852,20 @@ let Extension = FunctionExtension<"cl_khr_subgroup_rotate"> in {
   def : Builtin<"sub_group_clustered_rotate", [AGenType1, AGenType1, Int, UInt], Attr.Convergent>;
 }
 
+// cl_khr_kernel_clock
+let Extension = FunctionExtension<"cl_khr_kernel_clock __opencl_c_kernel_clock_scope_device"> in {
+  def : Builtin<"clock_read_device", [ULong]>;
+  def : Builtin<"clock_read_hilo_device", [VectorType<UInt, 2>]>;
+}
+let Extension = FunctionExtension<"cl_khr_kernel_clock __opencl_c_kernel_clock_scope_work_group"> in {
+  def : Builtin<"clock_read_work_group", [ULong]>;
+  def : Builtin<"clock_read_hilo_work_group", [VectorType<UInt, 2>]>;
+}
+let Extension = FunctionExtension<"cl_khr_kernel_clock __opencl_c_kernel_clock_scope_sub_group"> in {
+  def : Builtin<"clock_read_sub_group", [ULong]>;
+  def : Builtin<"clock_read_hilo_sub_group", [VectorType<UInt, 2>]>;
+}
+
 //--------------------------------------------------------------------
 // Arm extensions.
 let Extension = ArmIntegerDotProductInt8 in {

From 50b2bd4a25bb3a7a0790cb59e1240099efd7092e Mon Sep 17 00:00:00 2001
From: Daniel Grumberg <dgrumberg@apple.com>
Date: Mon, 20 May 2024 08:59:02 +0100
Subject: [PATCH 45/60] [clang][ExtractAPI] Remove symbols defined in
 categories to external types unless requested (#92522)

rdar://128259890
---
 .../Serialization/SymbolGraphSerializer.h      |  9 +++++++--
 .../Serialization/SymbolGraphSerializer.cpp    | 11 +++++++++--
 clang/test/ExtractAPI/objc_external_category.m | 18 +++++++++++++-----
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h b/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h
index 724b087f7aea98..27e9167ca1ad02 100644
--- a/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h
+++ b/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h
@@ -102,6 +102,8 @@ class SymbolGraphSerializer : public APISetVisitor<SymbolGraphSerializer> {
 
   const bool EmitSymbolLabelsForTesting = false;
 
+  const bool SkipSymbolsInCategoriesToExternalTypes = false;
+
   /// The object instantiated by the last call to serializeAPIRecord.
   Object *CurrentSymbol = nullptr;
 
@@ -271,10 +273,13 @@ class SymbolGraphSerializer : public APISetVisitor<SymbolGraphSerializer> {
 
   SymbolGraphSerializer(const APISet &API, const APIIgnoresList &IgnoresList,
                         bool EmitSymbolLabelsForTesting = false,
-                        bool ForceEmitToMainModule = false)
+                        bool ForceEmitToMainModule = false,
+                        bool SkipSymbolsInCategoriesToExternalTypes = false)
       : Base(API), ForceEmitToMainModule(ForceEmitToMainModule),
         IgnoresList(IgnoresList),
-        EmitSymbolLabelsForTesting(EmitSymbolLabelsForTesting) {}
+        EmitSymbolLabelsForTesting(EmitSymbolLabelsForTesting),
+        SkipSymbolsInCategoriesToExternalTypes(
+            SkipSymbolsInCategoriesToExternalTypes) {}
 };
 
 } // namespace extractapi
diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
index c16d4623f115d5..08e711cafae280 100644
--- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
+++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
@@ -925,6 +925,10 @@ bool SymbolGraphSerializer::visitObjCInterfaceRecord(
 
 bool SymbolGraphSerializer::traverseObjCCategoryRecord(
     const ObjCCategoryRecord *Record) {
+  if (SkipSymbolsInCategoriesToExternalTypes &&
+      !API.findRecordForUSR(Record->Interface.USR))
+    return true;
+
   auto *CurrentModule = ModuleForCurrentSymbol;
   if (Record->isExtendingExternalModule())
     ModuleForCurrentSymbol = &ExtendedModules[Record->Interface.Source];
@@ -1040,8 +1044,11 @@ void SymbolGraphSerializer::serializeGraphToStream(
 void SymbolGraphSerializer::serializeMainSymbolGraph(
     raw_ostream &OS, const APISet &API, const APIIgnoresList &IgnoresList,
     SymbolGraphSerializerOption Options) {
-  SymbolGraphSerializer Serializer(API, IgnoresList,
-                                   Options.EmitSymbolLabelsForTesting);
+  SymbolGraphSerializer Serializer(
+      API, IgnoresList, Options.EmitSymbolLabelsForTesting,
+      /*ForceEmitToMainModule=*/true,
+      /*SkipSymbolsInCategoriesToExternalTypes=*/true);
+
   Serializer.traverseAPISet();
   Serializer.serializeGraphToStream(OS, Options, API.ProductName,
                                     std::move(Serializer.MainModule));
diff --git a/clang/test/ExtractAPI/objc_external_category.m b/clang/test/ExtractAPI/objc_external_category.m
index 47e699cb91c0e4..8afc92489f28b6 100644
--- a/clang/test/ExtractAPI/objc_external_category.m
+++ b/clang/test/ExtractAPI/objc_external_category.m
@@ -4,6 +4,9 @@
 // RUN:   --emit-extension-symbol-graphs --symbol-graph-dir=%t/symbols \
 // RUN:   --product-name=Module -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/modules-cache \
 // RUN:   -triple arm64-apple-macosx -x objective-c-header %t/input.h -verify
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN:   --product-name=Module -o %t/ModuleNoExt.symbols.json -triple arm64-apple-macosx \
+// RUN:   -x objective-c-header %t/input.h
 
 //--- input.h
 #include "ExternalModule.h"
@@ -28,15 +31,20 @@ @interface ExtInterface
     header "ExternalModule.h"
 }
 
+// Main symbol graph from the build with extension SGFs
 // RUN: FileCheck %s --input-file  %t/symbols/Module.symbols.json --check-prefix MOD
+
 // MOD-NOT: "!testRelLabel": "memberOf $ c:objc(cs)ExtInterface(py)Property $ c:objc(cs)ExtInterface"
 // MOD-NOT: "!testRelLabel": "memberOf $ c:objc(cs)ExtInterface(im)InstanceMethod $ c:objc(cs)ExtInterface"
 // MOD-NOT: "!testRelLabel": "memberOf $ c:objc(cs)ExtInterface(cm)ClassMethod $ c:objc(cs)ExtInterface"
-// MOD-NOT: "!testLabel": "c:objc(cs)ExtInterface(py)Property"
-// MOD-NOT: "!testLabel": "c:objc(cs)ExtInterface(im)InstanceMethod"
-// MOD-NOT: "!testLabel": "c:objc(cs)ExtInterface(cm)ClassMethod"
-// MOD-NOT: "!testLabel": "c:objc(cs)ExtInterface"
-// MOD-DAG: "!testLabel": "c:objc(cs)ModInterface"
+// MOD-NOT: "c:objc(cs)ExtInterface(py)Property"
+// MOD-NOT: "c:objc(cs)ExtInterface(im)InstanceMethod"
+// MOD-NOT: "c:objc(cs)ExtInterface(cm)ClassMethod"
+// MOD-NOT: "c:objc(cs)ExtInterface"
+// MOD-DAG: "c:objc(cs)ModInterface"
+
+// Symbol graph from the build without extension SGFs should be identical to main symbol graph with extension SGFs
+// RUN: diff %t/symbols/Module.symbols.json %t/ModuleNoExt.symbols.json
 
 // RUN: FileCheck %s --input-file %t/symbols/ExternalModule@Module.symbols.json --check-prefix EXT
 // EXT-DAG: "!testRelLabel": "memberOf $ c:objc(cs)ExtInterface(py)Property $ c:objc(cs)ExtInterface"

From b60e62896e2665e1a0ac51fc9942c1c4d31c0f53 Mon Sep 17 00:00:00 2001
From: Elvis Wang <110374989+ElvisWang123@users.noreply.github.com>
Date: Mon, 20 May 2024 16:03:18 +0800
Subject: [PATCH 46/60] [RISCV][CostModel] Remove cost of icmp inst in
 icmp+select with SFB. (#91158)

With ShortFowrardBranchOpt(SFB) or ConditionalMoveFusion, scalar
ICmp and scalar Select instructions will lower to SELECT_CC
and lower to PseudoCCMOVGPR which will generate a conditional
branch instruction and a move instruction.
The cost of scalar (ICmp + Select) = (0 + Select instruction cost)
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  17 ++
 .../Analysis/CostModel/RISCV/cmp-select.ll    | 258 ++++++++++++++++++
 2 files changed, 275 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/RISCV/cmp-select.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index b73ed208ed74c6..ca8279672c096a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -14,9 +14,11 @@
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
 #include <cmath>
 #include <optional>
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "riscvtti"
 
@@ -1469,6 +1471,21 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     }
   }
 
+  // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
+  // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
+  // generate a conditional branch + mv. The cost of scalar (icmp + select) will
+  // be (0 + select instr cost).
+  if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
+      ValTy->isIntegerTy() && !I->user_empty()) {
+    if (all_of(I->users(), [&](const User *U) {
+          return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
+                 U->getType()->isIntegerTy() &&
+                 !isa<ConstantData>(U->getOperand(1)) &&
+                 !isa<ConstantData>(U->getOperand(2));
+        }))
+      return 0;
+  }
+
   // TODO: Add cost for scalar type.
 
   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp-select.ll b/llvm/test/Analysis/CostModel/RISCV/cmp-select.ll
new file mode 100644
index 00000000000000..dc0810b1286985
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/cmp-select.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+f,+short-forward-branch-opt -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=SFB64
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+f -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=RV64
+
+define i32 @icmp-iselect(i64 %ca, i64 %cb, i32 %a, i32 %b) {
+; SFB64-LABEL: 'icmp-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %select1
+;
+; RV64-LABEL: 'icmp-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %select1
+;
+  %cmp1 = icmp slt i64 %ca, %cb
+  %select1 = select i1 %cmp1, i32 %a, i32 %b
+  ret i32 %select1
+}
+
+define i32 @icmp-iselects(i64 %ca, i64 %cb, i32 %a, i32 %b, i32 %c) {
+; SFB64-LABEL: 'icmp-iselects'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select2 = select i1 %cmp1, i32 %a, i32 %c
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ret = add i32 %select1, %select2
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ret
+;
+; RV64-LABEL: 'icmp-iselects'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select2 = select i1 %cmp1, i32 %a, i32 %c
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ret = add i32 %select1, %select2
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ret
+;
+  %cmp1 = icmp slt i64 %ca, %cb
+  %select1 = select i1 %cmp1, i32 %a, i32 %b
+  %select2 = select i1 %cmp1, i32 %a, i32 %c
+  %ret = add i32 %select1, %select2
+  ret i32 %ret
+}
+
+define i32 @icmp-ifselects(i64 %ca, i64 %cb, i32 %a, i32 %b, float %c, float %d) {
+; SFB64-LABEL: 'icmp-ifselects'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select2 = select i1 %cmp1, float %c, float %d
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %selectint = fptosi float %select2 to i32
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ret = add i32 %select1, %selectint
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ret
+;
+; RV64-LABEL: 'icmp-ifselects'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select2 = select i1 %cmp1, float %c, float %d
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %selectint = fptosi float %select2 to i32
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %ret = add i32 %select1, %selectint
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ret
+;
+  %cmp1 = icmp slt i64 %ca, %cb
+  %select1 = select i1 %cmp1, i32 %a, i32 %b
+  %select2 = select i1 %cmp1, float %c, float %d
+  %selectint = fptosi float %select2 to i32
+  %ret = add i32 %select1, %selectint
+  ret i32 %ret
+}
+
+define i32 @constant-icmp-iselect(i64 %ca, i64 %cb, i32 %a) {
+; SFB64-LABEL: 'constant-icmp-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 7
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %select1
+;
+; RV64-LABEL: 'constant-icmp-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cmp1 = icmp slt i64 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %cmp1, i32 %a, i32 7
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %select1
+;
+  %cmp1 = icmp slt i64 %ca, %cb
+  %select1 = select i1 %cmp1, i32 %a, i32 7
+  ret i32 %select1
+}
+
+define i32 @fcmp-iselect(float %ca, float %cb, i32 %a, i32 %b) {
+; SFB64-LABEL: 'fcmp-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %fcmp1, i32 %a, i32 %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %select1
+;
+; RV64-LABEL: 'fcmp-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select i1 %fcmp1, i32 %a, i32 %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 %select1
+;
+  %fcmp1 = fcmp ogt float %ca, %cb
+  %select1 = select i1 %fcmp1, i32 %a, i32 %b
+  ret i32 %select1
+}
+
+define float @fcmp-fselect(float %ca, float %cb, float %a, float %b) {
+; SFB64-LABEL: 'fcmp-fselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fselect1 = select i1 %fcmp1, float %a, float %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %fselect1
+;
+; RV64-LABEL: 'fcmp-fselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fselect1 = select i1 %fcmp1, float %a, float %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %fselect1
+;
+  %fcmp1 = fcmp ogt float %ca, %cb
+  %fselect1 = select i1 %fcmp1, float %a, float %b
+  ret float %fselect1
+}
+
+define float @icmp-fselect(i64 %ca, i64 %cb, float %a, float %b) {
+; SFB64-LABEL: 'icmp-fselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt i64 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fselect1 = select i1 %icmp1, float %a, float %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %fselect1
+;
+; RV64-LABEL: 'icmp-fselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt i64 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fselect1 = select i1 %icmp1, float %a, float %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %fselect1
+;
+  %icmp1 = icmp slt i64 %ca, %cb
+  %fselect1 = select i1 %icmp1, float %a, float %b
+  ret float %fselect1
+}
+
+define <2 x i32> @vector-icmp-vector-iselect(<2 x i32> %ca, <2 x i32> %cb, <2 x i32> %a, <2 x i32> %b) {
+; SFB64-LABEL: 'vector-icmp-vector-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp = icmp slt <2 x i32> %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %icmp, <2 x i32> %a, <2 x i32> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+; RV64-LABEL: 'vector-icmp-vector-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp = icmp slt <2 x i32> %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %icmp, <2 x i32> %a, <2 x i32> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+  %icmp = icmp slt <2 x i32> %ca, %cb
+  %select1 = select <2 x i1> %icmp, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %select1
+}
+
+define <2 x i32> @vector-fcmp-vector-iselect(<2 x float> %ca, <2 x float> %cb, <2 x i32> %a, <2 x i32> %b) {
+; SFB64-LABEL: 'vector-fcmp-vector-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt <2 x float> %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %fcmp1, <2 x i32> %a, <2 x i32> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+; RV64-LABEL: 'vector-fcmp-vector-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt <2 x float> %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %fcmp1, <2 x i32> %a, <2 x i32> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+  %fcmp1 = fcmp ogt <2 x float> %ca, %cb
+  %select1 = select <2 x i1> %fcmp1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %select1
+}
+
+define <2 x float> @vector-fcmp-vector-fselect(<2 x float> %ca, <2 x float> %cb, <2 x float> %a, <2 x float> %b) {
+; SFB64-LABEL: 'vector-fcmp-vector-fselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt <2 x float> %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %fcmp1, <2 x float> %a, <2 x float> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+; RV64-LABEL: 'vector-fcmp-vector-fselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt <2 x float> %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %fcmp1, <2 x float> %a, <2 x float> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+  %fcmp1 = fcmp ogt <2 x float> %ca, %cb
+  %select1 = select <2 x i1> %fcmp1, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %select1
+}
+
+define <2 x float> @vector-icmp-vector-fselect(<2 x i32> %ca, <2 x i32> %cb, <2 x float> %a, <2 x float> %b) {
+; SFB64-LABEL: 'vector-icmp-vector-fselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt <2 x i32> %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %icmp1, <2 x float> %a, <2 x float> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+; RV64-LABEL: 'vector-icmp-vector-fselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt <2 x i32> %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %select1 = select <2 x i1> %icmp1, <2 x float> %a, <2 x float> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+  %icmp1 = icmp slt <2 x i32> %ca, %cb
+  %select1 = select <2 x i1> %icmp1, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %select1
+}
+
+define <2 x float> @icmp-vector-fselect(i1 %ca, i1 %cb, <2 x float> %a, <2 x float> %b) {
+; SFB64-LABEL: 'icmp-vector-fselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt i1 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %icmp1, <2 x float> %a, <2 x float> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+; RV64-LABEL: 'icmp-vector-fselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt i1 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %icmp1, <2 x float> %a, <2 x float> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+  %icmp1 = icmp slt i1 %ca, %cb
+  %select1 = select i1 %icmp1, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %select1
+}
+
+define <2 x i32> @icmp-vector-iselect(i1 %ca, i1 %cb, <2 x i32> %a, <2 x i32> %b) {
+; SFB64-LABEL: 'icmp-vector-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt i1 %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %icmp1, <2 x i32> %a, <2 x i32> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+; RV64-LABEL: 'icmp-vector-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %icmp1 = icmp slt i1 %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %icmp1, <2 x i32> %a, <2 x i32> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+  %icmp1 = icmp slt i1 %ca, %cb
+  %select1 = select i1 %icmp1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %select1
+}
+
+define <2 x float> @fcmp-vector-fselect(float %ca, float %cb, <2 x float> %a, <2 x float> %b) {
+; SFB64-LABEL: 'fcmp-vector-fselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %fcmp1, <2 x float> %a, <2 x float> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+; RV64-LABEL: 'fcmp-vector-fselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %fcmp1, <2 x float> %a, <2 x float> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %select1
+;
+  %fcmp1 = fcmp ogt float %ca, %cb
+  %select1 = select i1 %fcmp1, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %select1
+}
+
+define <2 x i32> @fcmp-vector-iselect(float %ca, float %cb, <2 x i32> %a, <2 x i32> %b) {
+; SFB64-LABEL: 'fcmp-vector-iselect'
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %fcmp1, <2 x i32> %a, <2 x i32> %b
+; SFB64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+; RV64-LABEL: 'fcmp-vector-iselect'
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %fcmp1 = fcmp ogt float %ca, %cb
+; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %select1 = select i1 %fcmp1, <2 x i32> %a, <2 x i32> %b
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %select1
+;
+  %fcmp1 = fcmp ogt float %ca, %cb
+  %select1 = select i1 %fcmp1, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %select1
+}

From f5211d79b9edf56c08143491ccde38d480b40ab8 Mon Sep 17 00:00:00 2001
From: Shan Huang <52285902006@stu.ecnu.edu.cn>
Date: Mon, 20 May 2024 16:39:48 +0800
Subject: [PATCH 47/60] [DebugInfo][GVNSink] Fix #77415: GVNSink fails to
 optimize LLVM IR with debug info (#77602)

This PR fixes issue #77415 and is revised from PR #77419 .

PR #77419 breaks the newly added test in the same PR on windows, because
GVNSink is non-deterministic when sorting `BasicBlock*` pointers. This
is reflected in the failure report.

```
# | C:\src\llvm-project\llvm\test\Transforms\GVNSink\sink-ignore-dbg-intrinsics.ll:28:10: error: CHECK: expected string not found in input
# | ; CHECK: %a.sink = phi i32 [ %a, %if.then ], [ %b, %if.else ]
# |          ^
# | <stdin>:24:8: note: scanning from here
# | if.end: ; preds = %if.else, %if.then
# |        ^
# | <stdin>:25:2: note: possible intended match here
# |  %b.sink = phi i32 [ %b, %if.else ], [ %a, %if.then ]
# |  ^
# |
# | Input file: <stdin>
# | Check file: C:\src\llvm-project\llvm\test\Transforms\GVNSink\sink-ignore-dbg-intrinsics.ll
```

According to the report, what the CheckFile wants to match is the
`%a.sink`, however there is `%b.sink`. But this mismatch does not mean
that this commit is wrong, since the occurrence of either `%a.sink` or
`%b.sink` is correct. The root cause of this test failure is the strict
check rule in the regression test committed.

So I refined the regression test with a more general check rule to only
detect whether there is an instruction with suffix `.sink` in the
`if.end` block. Hope this won't fail the test. If this PR still fails to
build, I will close this PR and try to find another right way to fix
this.
---
 llvm/lib/Transforms/Scalar/GVNSink.cpp        |  6 +-
 .../GVNSink/sink-ignore-dbg-intrinsics.ll     | 89 +++++++++++++++++++
 2 files changed, 92 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/GVNSink/sink-ignore-dbg-intrinsics.ll

diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 7a183e4d3aa8fc..b0f716cb17598c 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -132,7 +132,7 @@ class LockstepReverseIterator {
         ActiveBlocks.remove(BB);
         continue;
       }
-      Insts.push_back(BB->getTerminator()->getPrevNode());
+      Insts.push_back(BB->getTerminator()->getPrevNonDebugInstruction());
     }
     if (Insts.empty())
       Fail = true;
@@ -168,7 +168,7 @@ class LockstepReverseIterator {
       if (Inst == &Inst->getParent()->front())
         ActiveBlocks.remove(Inst->getParent());
       else
-        NewInsts.push_back(Inst->getPrevNode());
+        NewInsts.push_back(Inst->getPrevNonDebugInstruction());
     }
     if (NewInsts.empty()) {
       Fail = true;
@@ -883,7 +883,7 @@ void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
                                   BasicBlock *BBEnd) {
   SmallVector<Instruction *, 4> Insts;
   for (BasicBlock *BB : Blocks)
-    Insts.push_back(BB->getTerminator()->getPrevNode());
+    Insts.push_back(BB->getTerminator()->getPrevNonDebugInstruction());
   Instruction *I0 = Insts.front();
 
   SmallVector<Value *, 4> NewOperands;
diff --git a/llvm/test/Transforms/GVNSink/sink-ignore-dbg-intrinsics.ll b/llvm/test/Transforms/GVNSink/sink-ignore-dbg-intrinsics.ll
new file mode 100644
index 00000000000000..8aafd689e781d0
--- /dev/null
+++ b/llvm/test/Transforms/GVNSink/sink-ignore-dbg-intrinsics.ll
@@ -0,0 +1,89 @@
+; RUN: opt < %s -passes=gvn-sink -S | FileCheck %s
+
+; Test that GVNSink correctly performs the sink optimization in the presence of debug information
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local i32 @fun(i32 noundef %a, i32 noundef %b) #0 !dbg !10 {
+; CHECK-LABEL: define dso_local i32 @fun(
+; CHECK-SAME:    i32 noundef [[A:%.*]], i32 noundef [[B:%.*]])
+; CHECK:       if.end:
+; CHECK:         [[B_SINK:%.*]] = phi i32 [ [[B]], %if.else ], [ [[A]], %if.then ]
+; CHECK:         [[ADD1:%.*]] = add nsw i32 [[B_SINK]], 1
+; CHECK:         [[XOR2:%.*]] = xor i32 [[ADD1]], 1
+;
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %a, metadata !15, metadata !DIExpression()), !dbg !16
+  tail call void @llvm.dbg.value(metadata i32 %b, metadata !17, metadata !DIExpression()), !dbg !16
+  %cmp = icmp sgt i32 %b, 10, !dbg !18
+  br i1 %cmp, label %if.then, label %if.else, !dbg !20
+
+if.then:                                          ; preds = %entry
+  %add = add nsw i32 %a, 1, !dbg !21
+  tail call void @llvm.dbg.value(metadata i32 %add, metadata !23, metadata !DIExpression()), !dbg !24
+  %xor = xor i32 %add, 1, !dbg !25
+  tail call void @llvm.dbg.value(metadata i32 %xor, metadata !26, metadata !DIExpression()), !dbg !24
+  tail call void @llvm.dbg.value(metadata i32 %xor, metadata !27, metadata !DIExpression()), !dbg !16
+  br label %if.end, !dbg !28
+
+if.else:                                          ; preds = %entry
+  %add1 = add nsw i32 %b, 1, !dbg !29
+  tail call void @llvm.dbg.value(metadata i32 %add1, metadata !31, metadata !DIExpression()), !dbg !32
+  %xor2 = xor i32 %add1, 1, !dbg !33
+  tail call void @llvm.dbg.value(metadata i32 %xor2, metadata !34, metadata !DIExpression()), !dbg !32
+  tail call void @llvm.dbg.value(metadata i32 %xor2, metadata !27, metadata !DIExpression()), !dbg !16
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %ret.0 = phi i32 [ %xor, %if.then ], [ %xor2, %if.else ], !dbg !35
+  tail call void @llvm.dbg.value(metadata i32 %ret.0, metadata !27, metadata !DIExpression()), !dbg !16
+  ret i32 %ret.0, !dbg !36
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { noinline nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.0.0git (https://github.com/llvm/llvm-project.git 5dfcb3e5d1d16bb4f8fce52b3c089119ed977e7f)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "main.c", directory: "/")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!10 = distinct !DISubprogram(name: "fun", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !14)
+!11 = !DISubroutineType(types: !12)
+!12 = !{!13, !13, !13}
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!14 = !{}
+!15 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !1, line: 1, type: !13)
+!16 = !DILocation(line: 0, scope: !10)
+!17 = !DILocalVariable(name: "b", arg: 2, scope: !10, file: !1, line: 1, type: !13)
+!18 = !DILocation(line: 3, column: 11, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !10, file: !1, line: 3, column: 9)
+!20 = !DILocation(line: 3, column: 9, scope: !10)
+!21 = !DILocation(line: 4, column: 20, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 3, column: 17)
+!23 = !DILocalVariable(name: "a1", scope: !22, file: !1, line: 4, type: !13)
+!24 = !DILocation(line: 0, scope: !22)
+!25 = !DILocation(line: 5, column: 21, scope: !22)
+!26 = !DILocalVariable(name: "a2", scope: !22, file: !1, line: 5, type: !13)
+!27 = !DILocalVariable(name: "ret", scope: !10, file: !1, line: 2, type: !13)
+!28 = !DILocation(line: 7, column: 5, scope: !22)
+!29 = !DILocation(line: 8, column: 20, scope: !30)
+!30 = distinct !DILexicalBlock(scope: !19, file: !1, line: 7, column: 12)
+!31 = !DILocalVariable(name: "b1", scope: !30, file: !1, line: 8, type: !13)
+!32 = !DILocation(line: 0, scope: !30)
+!33 = !DILocation(line: 9, column: 21, scope: !30)
+!34 = !DILocalVariable(name: "b2", scope: !30, file: !1, line: 9, type: !13)
+!35 = !DILocation(line: 0, scope: !19)
+!36 = !DILocation(line: 12, column: 5, scope: !10)

From d3d6565c2453be2f580ff12b32cc5d0cb5c6c9d8 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111@gmail.com>
Date: Mon, 20 May 2024 09:41:51 +0100
Subject: [PATCH 48/60] [AArch64] Add PreTest for optimizing `MOV` to `ORR`

---
 .../CodeGen/AArch64/movimm-expand-ldst.ll     | 97 +++++++++++++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.mir    | 35 +++++++
 2 files changed, 132 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
 create mode 100644 llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
new file mode 100644
index 00000000000000..82bba7b5f85405
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+define i64 @test0x1234567812345678() {
+; CHECK-LABEL: test0x1234567812345678:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #22136 // =0x5678
+; CHECK-NEXT:    movk x0, #4660, lsl #16
+; CHECK-NEXT:    movk x0, #22136, lsl #32
+; CHECK-NEXT:    movk x0, #4660, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 u0x1234567812345678
+}
+
+define i64 @test0xff3456ffff3456ff() {
+; CHECK-LABEL: test0xff3456ffff3456ff:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #22271 // =0x56ff
+; CHECK-NEXT:    movk x0, #65332, lsl #16
+; CHECK-NEXT:    movk x0, #22271, lsl #32
+; CHECK-NEXT:    movk x0, #65332, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 u0xff3456ffff3456ff
+}
+
+define i64 @test0x00345600345600() {
+; CHECK-LABEL: test0x00345600345600:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #22016 // =0x5600
+; CHECK-NEXT:    movk x0, #52, lsl #16
+; CHECK-NEXT:    movk x0, #13398, lsl #32
+; CHECK-NEXT:    ret
+  ret i64 u0x00345600345600
+}
+
+define i64 @test0x5555555555555555() {
+; CHECK-LABEL: test0x5555555555555555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    ret
+  ret i64 u0x5555555555555555
+}
+
+define i64 @test0x5055555550555555() {
+; CHECK-LABEL: test0x5055555550555555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    and x0, x0, #0xf0fffffff0ffffff
+; CHECK-NEXT:    ret
+  ret i64 u0x5055555550555555
+}
+
+define i64 @test0x0000555555555555() {
+; CHECK-LABEL: test0x0000555555555555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    movk x0, #0, lsl #48
+; CHECK-NEXT:    ret
+  ret i64 u0x0000555555555555
+}
+
+define i64 @test0x0000555500005555() {
+; CHECK-LABEL: test0x0000555500005555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #21845 // =0x5555
+; CHECK-NEXT:    movk x0, #21845, lsl #32
+; CHECK-NEXT:    ret
+  ret i64 u0x0000555500005555
+}
+
+define i64 @testu0xffff5555ffff5555() {
+; CHECK-LABEL: testu0xffff5555ffff5555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #-43691 // =0xffffffffffff5555
+; CHECK-NEXT:    movk x0, #21845, lsl #32
+; CHECK-NEXT:    ret
+  ret i64 u0xffff5555ffff5555
+}
+
+define i64 @testuu0xfffff555f555f555() {
+; CHECK-LABEL: testuu0xfffff555f555f555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #-2731 // =0xfffffffffffff555
+; CHECK-NEXT:    movk x0, #62805, lsl #16
+; CHECK-NEXT:    movk x0, #62805, lsl #32
+; CHECK-NEXT:    ret
+  ret i64 u0xfffff555f555f555
+}
+
+define i64 @testuu0xf555f555f555f555() {
+; CHECK-LABEL: testuu0xf555f555f555f555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    orr x0, x0, #0xe001e001e001e001
+; CHECK-NEXT:    ret
+  ret i64 u0xf555f555f555f555
+}
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
new file mode 100644
index 00000000000000..de14437108c93f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -0,0 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt -debug-only=aarch64-ldst-opt %s -o - | FileCheck %s
+---
+name: test_fold_repeating_constant_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_load
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x0 = MOVZXi 49370, 0
+    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 16
+    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 32
+    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 48
+    ; CHECK-NEXT: RET undef $lr, implicit $x0
+    renamable $x0 = MOVi64imm 90284035103834330
+    RET_ReallyLR implicit $x0
+...
+---
+name: test_fold_repeating_constant_load_neg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_load_neg
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x0 = MOVZXi 320, 0
+    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 16
+    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 32
+    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 48
+    ; CHECK-NEXT: RET undef $lr, implicit $x0
+    renamable $x0 = MOVi64imm -4550323095879417536
+    RET_ReallyLR implicit $x0

From 384bf545a1a7d8dfd16afd20ef07eb845495bac4 Mon Sep 17 00:00:00 2001
From: bd1976bris <bd1976llvm@gmail.com>
Date: Mon, 20 May 2024 09:48:35 +0100
Subject: [PATCH 49/60] [Driver][PS5] Set visibility option defaults (#92091)

Adjust the PS5 driver defaults for the -fvisibility-from-dllstorageclass
sub-options so that only globals with dllimport/dllexport annotations
are adjusted. This allows globals without dllimport/export to retain
the visibility and pre-emptability assigned during IR-Gen. Set
-fvisibility=hidden on PS5 by default to compensate for no longer
overriding the visibility of definitions without dllexport. Note there
is no behavior change for PS4 (the behavior of overriding the
visibility for all globals is retained on PS4).
---
 clang/lib/Driver/ToolChains/PS4CPU.cpp        | 18 ++++++++-
 .../ps4-ps5-visibility-dllstorageclass.c      | 39 ++++++++++++-------
 clang/test/Driver/ps4-visibility.cl           | 32 +++++++++++++++
 clang/test/Driver/ps5-visibility.cl           | 33 ++++++++++++++++
 4 files changed, 107 insertions(+), 15 deletions(-)
 create mode 100644 clang/test/Driver/ps4-visibility.cl
 create mode 100644 clang/test/Driver/ps5-visibility.cl

diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp
index 7bf9aa79384c55..3fd62d97930937 100644
--- a/clang/lib/Driver/ToolChains/PS4CPU.cpp
+++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp
@@ -358,6 +358,12 @@ void toolchains::PS4PS5Base::addClangTargetOptions(
 
   CC1Args.push_back("-fno-use-init-array");
 
+  // Default to `hidden` visibility for PS5.
+  if (getTriple().isPS5() &&
+      !DriverArgs.hasArg(options::OPT_fvisibility_EQ,
+                         options::OPT_fvisibility_ms_compat))
+    CC1Args.push_back("-fvisibility=hidden");
+
   // Default to -fvisibility-global-new-delete=source for PS5.
   if (getTriple().isPS5() &&
       !DriverArgs.hasArg(options::OPT_fvisibility_global_new_delete_EQ,
@@ -376,11 +382,15 @@ void toolchains::PS4PS5Base::addClangTargetOptions(
     else
       CC1Args.push_back("-fvisibility-dllexport=protected");
 
+    // For PS4 we override the visibilty of globals definitions without
+    // dllimport or  dllexport annotations.
     if (DriverArgs.hasArg(options::OPT_fvisibility_nodllstorageclass_EQ))
       DriverArgs.AddLastArg(CC1Args,
                             options::OPT_fvisibility_nodllstorageclass_EQ);
-    else
+    else if (getTriple().isPS4())
       CC1Args.push_back("-fvisibility-nodllstorageclass=hidden");
+    else
+      CC1Args.push_back("-fvisibility-nodllstorageclass=keep");
 
     if (DriverArgs.hasArg(options::OPT_fvisibility_externs_dllimport_EQ))
       DriverArgs.AddLastArg(CC1Args,
@@ -388,12 +398,16 @@ void toolchains::PS4PS5Base::addClangTargetOptions(
     else
       CC1Args.push_back("-fvisibility-externs-dllimport=default");
 
+    // For PS4 we override the visibilty of external globals without
+    // dllimport or  dllexport annotations.
     if (DriverArgs.hasArg(
             options::OPT_fvisibility_externs_nodllstorageclass_EQ))
       DriverArgs.AddLastArg(
           CC1Args, options::OPT_fvisibility_externs_nodllstorageclass_EQ);
-    else
+    else if (getTriple().isPS4())
       CC1Args.push_back("-fvisibility-externs-nodllstorageclass=default");
+    else
+      CC1Args.push_back("-fvisibility-externs-nodllstorageclass=keep");
   }
 }
 
diff --git a/clang/test/Driver/ps4-ps5-visibility-dllstorageclass.c b/clang/test/Driver/ps4-ps5-visibility-dllstorageclass.c
index 430827805a8fda..71f8661679eb79 100644
--- a/clang/test/Driver/ps4-ps5-visibility-dllstorageclass.c
+++ b/clang/test/Driver/ps4-ps5-visibility-dllstorageclass.c
@@ -1,16 +1,19 @@
 // Check behaviour of -fvisibility-from-dllstorageclass options for PS4/PS5.
 
 // DEFINE: %{triple} =
+// DEFINE: %{prefix} =
 // DEFINE: %{run} = \
 // DEFINE: %clang -### -target %{triple} %s -Werror -o - 2>&1 | \
-// DEFINE:   FileCheck %s --check-prefix=DEFAULTS \
+// DEFINE:   FileCheck %s --check-prefixes=DEFAULTS,%{prefix} \
 // DEFINE:     --implicit-check-not=-fvisibility-from-dllstorageclass \
 // DEFINE:     --implicit-check-not=-fvisibility-dllexport \
 // DEFINE:     --implicit-check-not=-fvisibility-nodllstorageclass \
 // DEFINE:     --implicit-check-not=-fvisibility-externs-dllimport \
 // DEFINE:     --implicit-check-not=-fvisibility-externs-nodllstorageclass
+// REDEFINE: %{prefix} = DEFAULTS-PS4
 // REDEFINE: %{triple} = x86_64-scei-ps4
 // RUN: %{run}
+// REDEFINE: %{prefix} = DEFAULTS-PS5
 // REDEFINE: %{triple} = x86_64-sie-ps5
 // RUN: %{run}
 //
@@ -20,25 +23,29 @@
 // REDEFINE:     -fvisibility-from-dllstorageclass \
 // REDEFINE:     -Werror \
 // REDEFINE:     %s -o - 2>&1 | \
-// REDEFINE:   FileCheck %s --check-prefix=DEFAULTS \
+// REDEFINE:   FileCheck %s --check-prefixes=DEFAULTS,%{prefix} \
 // REDEFINE:     --implicit-check-not=-fvisibility-from-dllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-dllexport \
 // REDEFINE:     --implicit-check-not=-fvisibility-nodllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-dllimport \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-nodllstorageclass
+// REDEFINE: %{prefix} = DEFAULTS-PS4
 // REDEFINE: %{triple} = x86_64-scei-ps4
 // RUN: %{run}
+// REDEFINE: %{prefix} = DEFAULTS-PS5
 // REDEFINE: %{triple} = x86_64-sie-ps5
 // RUN: %{run}
 
 // DEFAULTS:      "-fvisibility-from-dllstorageclass"
 // DEFAULTS-SAME: "-fvisibility-dllexport=protected"
-// DEFAULTS-SAME: "-fvisibility-nodllstorageclass=hidden"
+// DEFAULTS-PS4-SAME: "-fvisibility-nodllstorageclass=hidden"
+// DEFAULTS-PS5-SAME: "-fvisibility-nodllstorageclass=keep"
 // DEFAULTS-SAME: "-fvisibility-externs-dllimport=default"
-// DEFAULTS-SAME: "-fvisibility-externs-nodllstorageclass=default"
+// DEFAULTS-PS4-SAME: "-fvisibility-externs-nodllstorageclass=default"
+// DEFAULTS-PS5-SAME: "-fvisibility-externs-nodllstorageclass=keep"
 
 // REDEFINE: %{run} = \
-// REDEFINE: %clang -### -target x86_64-scei-ps4 \
+// REDEFINE: %clang -### -target %{triple} \
 // REDEFINE:     -fvisibility-from-dllstorageclass \
 // REDEFINE:     -fvisibility-dllexport=hidden \
 // REDEFINE:     -fvisibility-nodllstorageclass=protected \
@@ -64,37 +71,41 @@
 // UNUSED-NEXT: warning: argument unused during compilation: '-fvisibility-externs-nodllstorageclass=protected'
 
 // REDEFINE: %{run} = \
-// REDEFINE: %clang -### -target x86_64-scei-ps4 \
+// REDEFINE: %clang -### -target %{triple} \
 // REDEFINE:     -fvisibility-nodllstorageclass=protected \
 // REDEFINE:     -fvisibility-externs-dllimport=hidden \
 // REDEFINE:     -Werror \
 // REDEFINE:     %s -o - 2>&1 | \
-// REDEFINE:   FileCheck %s -check-prefix=SOME \
+// REDEFINE:   FileCheck %s -check-prefixes=SOME,%{prefix} \
 // REDEFINE:     --implicit-check-not=-fvisibility-from-dllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-dllexport \
 // REDEFINE:     --implicit-check-not=-fvisibility-nodllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-dllimport \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-nodllstorageclass
+// REDEFINE: %{prefix} = SOME-PS4
 // REDEFINE: %{triple} = x86_64-scei-ps4
 // RUN: %{run}
+// REDEFINE: %{prefix} = SOME-PS5
 // REDEFINE: %{triple} = x86_64-sie-ps5
 // RUN: %{run}
 
 // REDEFINE: %{run} = \
-// REDEFINE: %clang -### -target x86_64-scei-ps4 \
+// REDEFINE: %clang -### -target %{triple} \
 // REDEFINE:     -fvisibility-from-dllstorageclass \
 // REDEFINE:     -fvisibility-nodllstorageclass=protected \
 // REDEFINE:     -fvisibility-externs-dllimport=hidden \
 // REDEFINE:     -Werror \
 // REDEFINE:     %s -o - 2>&1 | \
-// REDEFINE:   FileCheck %s -check-prefix=SOME \
+// REDEFINE:   FileCheck %s -check-prefixes=SOME,%{prefix} \
 // REDEFINE:     --implicit-check-not=-fvisibility-from-dllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-dllexport \
 // REDEFINE:     --implicit-check-not=-fvisibility-nodllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-dllimport \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-nodllstorageclass
+// REDEFINE: %{prefix} = SOME-PS4
 // REDEFINE: %{triple} = x86_64-scei-ps4
 // RUN: %{run}
+// REDEFINE: %{prefix} = SOME-PS5
 // REDEFINE: %{triple} = x86_64-sie-ps5
 // RUN: %{run}
 
@@ -102,10 +113,11 @@
 // SOME-SAME: "-fvisibility-dllexport=protected"
 // SOME-SAME: "-fvisibility-nodllstorageclass=protected"
 // SOME-SAME: "-fvisibility-externs-dllimport=hidden"
-// SOME-SAME: "-fvisibility-externs-nodllstorageclass=default"
+// SOME-PS4-SAME: "-fvisibility-externs-nodllstorageclass=default"
+// SOME-PS5-SAME: "-fvisibility-externs-nodllstorageclass=keep"
 
 // REDEFINE: %{run} = \
-// REDEFINE: %clang -### -target x86_64-scei-ps4 \
+// REDEFINE: %clang -### -target %{triple} \
 // REDEFINE:     -fvisibility-dllexport=default \
 // REDEFINE:     -fvisibility-dllexport=hidden \
 // REDEFINE:     -fvisibility-nodllstorageclass=default \
@@ -121,14 +133,15 @@
 // REDEFINE:     --implicit-check-not=-fvisibility-dllexport \
 // REDEFINE:     --implicit-check-not=-fvisibility-nodllstorageclass \
 // REDEFINE:     --implicit-check-not=-fvisibility-externs-dllimport \
-// REDEFINE:     --implicit-check-not=-fvisibility-externs-nodllstorageclass
+// REDEFINE:     --implicit-check-not=-fvisibility-externs-nodllstorageclass \
+// REDEFINE:     --implicit-check-not="warning: argument unused"
 // REDEFINE: %{triple} = x86_64-scei-ps4
 // RUN: %{run}
 // REDEFINE: %{triple} = x86_64-sie-ps5
 // RUN: %{run}
 
 // REDEFINE: %{run} = \
-// REDEFINE: %clang -### -target x86_64-scei-ps4 \
+// REDEFINE: %clang -### -target %{triple} \
 // REDEFINE:     -fvisibility-from-dllstorageclass \
 // REDEFINE:     -fvisibility-dllexport=default \
 // REDEFINE:     -fvisibility-dllexport=hidden \
diff --git a/clang/test/Driver/ps4-visibility.cl b/clang/test/Driver/ps4-visibility.cl
new file mode 100644
index 00000000000000..a0ed7c71f1f0e5
--- /dev/null
+++ b/clang/test/Driver/ps4-visibility.cl
@@ -0,0 +1,32 @@
+/// Check PS4 specific interactions between visibility options.
+/// Detailed testing of -fvisibility-from-dllstorageclass is covered elsewhere.
+
+/// Check defaults.
+// RUN: %clang -### -target x86_64-scei-ps4 -x cl -c -emit-llvm %s 2>&1 | \
+// RUN:   FileCheck -check-prefix=DEFAULT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// DEFAULT-DAG: "-fvisibility-from-dllstorageclass"
+// DEFAULT-DAG: "-fvisibility-dllexport=protected"
+// DEFAULT-DAG: "-fvisibility-nodllstorageclass=hidden"
+// DEFAULT-DAG: "-fvisibility-externs-dllimport=default"
+// DEFAULT-DAG: "-fvisibility-externs-nodllstorageclass=default"
+
+/// Check that -fvisibility-from-dllstorageclass is added in the presence of -fvisibility=.
+// RUN: %clang -### -target x86_64-scei-ps4 -x cl -c -emit-llvm -fvisibility=default  %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=DEFAULT,VISEQUALS %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// VISEQUALS-DAG: "-fvisibility=default"
+
+/// Check that -fvisibility-from-dllstorageclass is added in the presence of -fvisibility-ms-compat.
+// RUN: %clang -### -target x86_64-scei-ps4 -x cl -c -emit-llvm -fvisibility-ms-compat %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=DEFAULT,MSCOMPT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// MSCOMPT-DAG: "-fvisibility=hidden"
+// MSCOMPT-DAG: "-ftype-visibility=default"
+
+/// -fvisibility-from-dllstorageclass added explicitly.
+// RUN: %clang -### -target x86_64-scei-ps4 -x cl -c -emit-llvm -fvisibility-from-dllstorageclass %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=DEFAULT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+
+/// -fvisibility-from-dllstorageclass disabled explicitly.
+// RUN: %clang -### -target x86_64-scei-ps4 -x cl -c -emit-llvm -fno-visibility-from-dllstorageclass %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=NOVISFROM %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// NOVISFROM-NOT: "-fvisibility-from-dllstorageclass"
+
diff --git a/clang/test/Driver/ps5-visibility.cl b/clang/test/Driver/ps5-visibility.cl
new file mode 100644
index 00000000000000..ad144057be63ad
--- /dev/null
+++ b/clang/test/Driver/ps5-visibility.cl
@@ -0,0 +1,33 @@
+/// Check PS5 specific interactions between visibility options.
+/// Detailed testing of -fvisibility-from-dllstorageclass is covered elsewhere.
+
+/// Check defaults.
+// RUN: %clang -### -target x86_64-sie-ps5 -x cl -c -emit-llvm %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=VDEFAULT,VGND_DEFAULT,DEFAULT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// VDEFAULT-DAG:     "-fvisibility=hidden"
+// VGND_DEFAULT-DAG: "-fvisibility-global-new-delete=source"
+// DEFAULT-DAG:      "-fvisibility-from-dllstorageclass"
+// DEFAULT-DAG:      "-fvisibility-dllexport=protected"
+// DEFAULT-DAG:      "-fvisibility-nodllstorageclass=keep"
+// DEFAULT-DAG:      "-fvisibility-externs-dllimport=default"
+// DEFAULT-DAG:      "-fvisibility-externs-nodllstorageclass=keep"
+
+/// -fvisibility= specified explicitly.
+// RUN: %clang -### -target x86_64-sie-ps5 -x cl -c -emit-llvm -fvisibility=protected %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=VPROTECTED,VGND_DEFAULT,DEFAULT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// VPROTECTED-DAG: "-fvisibility=protected"
+
+/// -fvisibility-ms-compat added explicitly.
+// RUN: %clang -### -target x86_64-sie-ps5 -x cl -c -emit-llvm -fvisibility-ms-compat %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=MSCOMPT,VGND_DEFAULT,DEFAULT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// MSCOMPT-DAG: "-fvisibility=hidden"
+// MSCOMPT-DAG: "-ftype-visibility=default"
+
+/// -fvisibility-from-dllstorageclass added explicitly.
+// RUN: %clang -### -target x86_64-sie-ps5 -x cl -c -emit-llvm -fvisibility-from-dllstorageclass %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=VDEFAULT,VGND_DEFAULT,DEFAULT %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+
+/// -fvisibility-from-dllstorageclass disabled explicitly.
+// RUN: %clang -### -target x86_64-sie-ps5 -x cl -c -emit-llvm -fno-visibility-from-dllstorageclass %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=VDEFAULT,VGND_DEFAULT,NOVISFROM %s --implicit-check-not=fvisibility --implicit-check-not=ftype-visibility --implicit-check-not=dllstorageclass
+// NOVISFROM-NOT: "-fvisibility-from-dllstorageclass"

From ce1a0d8ad380d12dc7ea001cfab3749bb23d445d Mon Sep 17 00:00:00 2001
From: hanbeom <kese111@gmail.com>
Date: Mon, 20 May 2024 09:57:50 +0100
Subject: [PATCH 50/60] [AArch64] Optimize `MOV` to `ORR` when load symmetric
 constants (#86249)

This change looks for cases of symmetric constant loading.
`symmetric constant load` is when the upper 32 bits and lower 32 bits
of a 64-bit register load the same value.

When it finds this, it replaces it with an instruction that loads only
the lower 32 bits of the constant and stores it in the upper and lower
bits simultaneously.

For example:
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = MOVKXi $x8, 49370, 32
  renamable $x8 = MOVKXi $x8, 320, 48
becomes
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = ORRXrs $x8, $x8, 32
---
 llvm/lib/Target/AArch64/AArch64ExpandImm.cpp        |  8 ++++++++
 .../lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 13 +++++++++++++
 llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll     |  6 ++----
 llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir    |  6 ++----
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
index a7d72b59b1d5a6..98016271a9d00f 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -518,6 +518,14 @@ static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
     Insn.push_back({ Opc, Imm16,
                      AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
   }
+
+  // Now, we get 16-bit divided Imm. If high and low bits are same in
+  // 32-bit, there is an opportunity to reduce instruction.
+  if (Insn.size() > 2 && (Imm >> 32) == (Imm & 0xffffffffULL)) {
+    for (int Size = Insn.size(); Size > 2; Size--)
+      Insn.pop_back();
+    Insn.push_back({AArch64::ORRXrs, 0, 32});
+  }
 }
 
 /// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 03f0778bae59d5..36957bb0f5a059 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -168,6 +168,19 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
                 .addImm(I->Op2));
       }
       break;
+    case AArch64::ORRWrs:
+    case AArch64::ORRXrs: {
+      Register DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      MIBS.push_back(
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+              .addReg(DstReg, RegState::Define |
+                                  getDeadRegState(DstIsDead && LastItem) |
+                                  RenamableState)
+              .addReg(DstReg)
+              .addReg(DstReg)
+              .addImm(I->Op2));
+    } break;
     case AArch64::ANDXri:
     case AArch64::EORXri:
       if (I->Op1 == 0) {
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index 82bba7b5f85405..b25ac96f97c7d0 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -6,8 +6,7 @@ define i64 @test0x1234567812345678() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x0, #22136 // =0x5678
 ; CHECK-NEXT:    movk x0, #4660, lsl #16
-; CHECK-NEXT:    movk x0, #22136, lsl #32
-; CHECK-NEXT:    movk x0, #4660, lsl #48
+; CHECK-NEXT:    orr x0, x0, x0, lsl #32
 ; CHECK-NEXT:    ret
   ret i64 u0x1234567812345678
 }
@@ -17,8 +16,7 @@ define i64 @test0xff3456ffff3456ff() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x0, #22271 // =0x56ff
 ; CHECK-NEXT:    movk x0, #65332, lsl #16
-; CHECK-NEXT:    movk x0, #22271, lsl #32
-; CHECK-NEXT:    movk x0, #65332, lsl #48
+; CHECK-NEXT:    orr x0, x0, x0, lsl #32
 ; CHECK-NEXT:    ret
   ret i64 u0xff3456ffff3456ff
 }
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index de14437108c93f..1ec2a00f67690b 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -11,8 +11,7 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: renamable $x0 = MOVZXi 49370, 0
     ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 16
-    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 32
-    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 48
+    ; CHECK-NEXT: renamable $x0 = ORRXrs $x0, $x0, 32
     ; CHECK-NEXT: RET undef $lr, implicit $x0
     renamable $x0 = MOVi64imm 90284035103834330
     RET_ReallyLR implicit $x0
@@ -28,8 +27,7 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: renamable $x0 = MOVZXi 320, 0
     ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 16
-    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 320, 32
-    ; CHECK-NEXT: renamable $x0 = MOVKXi $x0, 49370, 48
+    ; CHECK-NEXT: renamable $x0 = ORRXrs $x0, $x0, 32
     ; CHECK-NEXT: RET undef $lr, implicit $x0
     renamable $x0 = MOVi64imm -4550323095879417536
     RET_ReallyLR implicit $x0

From 702a2b627ff4b2a5d330a7bd0d3f7cadaff0b4ed Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Mon, 20 May 2024 18:06:03 +0900
Subject: [PATCH 51/60] [Coverage] Rework !SystemHeadersCoverage (#91446)

- Introduce `LeafExprSet`,
  - Suppress traversing LAnd and LOr expr under system headers.
- Handle LAnd and LOr as instrumented leaves to override
`!isInstrumentedCondition(C)`.
- Replace Loc with FileLoc if it is expanded with system headers.

Fixes #78920
---
 clang/lib/CodeGen/CoverageMappingGen.cpp      | 48 +++++++++++++++---
 .../CoverageMapping/mcdc-system-headers.cpp   | 50 +++++++++++++++++++
 2 files changed, 92 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CoverageMapping/mcdc-system-headers.cpp

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index cc8ab7a5b4369e..f4de21bac4b466 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -17,6 +17,7 @@
 #include "clang/Basic/FileManager.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Lex/Lexer.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ProfileData/Coverage/CoverageMapping.h"
@@ -336,16 +337,26 @@ class CoverageMappingBuilder {
 
     llvm::SmallSet<FileID, 8> Visited;
     SmallVector<std::pair<SourceLocation, unsigned>, 8> FileLocs;
-    for (const auto &Region : SourceRegions) {
+    for (auto &Region : SourceRegions) {
       SourceLocation Loc = Region.getBeginLoc();
+
+      // Replace Loc with FileLoc if it is expanded with system headers.
+      if (!SystemHeadersCoverage && SM.isInSystemMacro(Loc)) {
+        auto BeginLoc = SM.getSpellingLoc(Loc);
+        auto EndLoc = SM.getSpellingLoc(Region.getEndLoc());
+        if (SM.isWrittenInSameFile(BeginLoc, EndLoc)) {
+          Loc = SM.getFileLoc(Loc);
+          Region.setStartLoc(Loc);
+          Region.setEndLoc(SM.getFileLoc(Region.getEndLoc()));
+        }
+      }
+
       FileID File = SM.getFileID(Loc);
       if (!Visited.insert(File).second)
         continue;
 
-      // Do not map FileID's associated with system headers unless collecting
-      // coverage from system headers is explicitly enabled.
-      if (!SystemHeadersCoverage && SM.isInSystemHeader(SM.getSpellingLoc(Loc)))
-        continue;
+      assert(SystemHeadersCoverage ||
+             !SM.isInSystemHeader(SM.getSpellingLoc(Loc)));
 
       unsigned Depth = 0;
       for (SourceLocation Parent = getIncludeOrExpansionLoc(Loc);
@@ -818,6 +829,10 @@ struct CounterCoverageMappingBuilder
   /// A stack of currently live regions.
   llvm::SmallVector<SourceMappingRegion> RegionStack;
 
+  /// Set if the Expr should be handled as a leaf even if it is kind of binary
+  /// logical ops (&&, ||).
+  llvm::DenseSet<const Stmt *> LeafExprSet;
+
   /// An object to manage MCDC regions.
   MCDCCoverageBuilder MCDCBuilder;
 
@@ -1040,7 +1055,10 @@ struct CounterCoverageMappingBuilder
     // region onto RegionStack but immediately pop it (which adds it to the
     // function's SourceRegions) because it doesn't apply to any other source
     // code other than the Condition.
-    if (CodeGenFunction::isInstrumentedCondition(C)) {
+    // With !SystemHeadersCoverage, binary logical ops in system headers may be
+    // treated as instrumentable conditions.
+    if (CodeGenFunction::isInstrumentedCondition(C) ||
+        LeafExprSet.count(CodeGenFunction::stripCond(C))) {
       mcdc::Parameters BranchParams;
       mcdc::ConditionID ID = MCDCBuilder.getCondID(C);
       if (ID >= 0)
@@ -2070,7 +2088,20 @@ struct CounterCoverageMappingBuilder
     createDecisionRegion(E, DecisionParams);
   }
 
+  /// Check if E belongs to system headers.
+  bool isExprInSystemHeader(const BinaryOperator *E) const {
+    return (!SystemHeadersCoverage &&
+            SM.isInSystemHeader(SM.getSpellingLoc(E->getOperatorLoc())) &&
+            SM.isInSystemHeader(SM.getSpellingLoc(E->getBeginLoc())) &&
+            SM.isInSystemHeader(SM.getSpellingLoc(E->getEndLoc())));
+  }
+
   void VisitBinLAnd(const BinaryOperator *E) {
+    if (isExprInSystemHeader(E)) {
+      LeafExprSet.insert(E);
+      return;
+    }
+
     bool IsRootNode = MCDCBuilder.isIdle();
 
     // Keep track of Binary Operator and assign MCDC condition IDs.
@@ -2125,6 +2156,11 @@ struct CounterCoverageMappingBuilder
   }
 
   void VisitBinLOr(const BinaryOperator *E) {
+    if (isExprInSystemHeader(E)) {
+      LeafExprSet.insert(E);
+      return;
+    }
+
     bool IsRootNode = MCDCBuilder.isIdle();
 
     // Keep track of Binary Operator and assign MCDC condition IDs.
diff --git a/clang/test/CoverageMapping/mcdc-system-headers.cpp b/clang/test/CoverageMapping/mcdc-system-headers.cpp
new file mode 100644
index 00000000000000..a8a3ddbb506fb4
--- /dev/null
+++ b/clang/test/CoverageMapping/mcdc-system-headers.cpp
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping  -fcoverage-mcdc -mllvm -system-headers-coverage -emit-llvm-only -o - %s | FileCheck %s --check-prefixes=CHECK,W_SYS
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -fcoverage-mcdc -emit-llvm-only -o - %s | FileCheck %s --check-prefixes=CHECK,X_SYS
+
+#ifdef IS_SYSHEADER
+
+#pragma clang system_header
+#define CONST 42
+#define EXPR1(x) (x)
+#define EXPR2(x) ((x) && (x))
+
+#else
+
+#define IS_SYSHEADER
+#include __FILE__
+
+// CHECK: _Z5func0i:
+int func0(int a) {
+  // CHECK: Decision,File 0, [[@LINE+3]]:11 -> [[@LINE+3]]:21 = M:0, C:2
+  // W_SYS: Expansion,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:16 = #0 (Expanded file = 1)
+  // X_SYS: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:11 = 0, 0 [1,2,0]
+  return (CONST && a);
+  // CHECK: Branch,File 0, [[@LINE-1]]:20 -> [[@LINE-1]]:21 = #2, (#1 - #2) [2,0,0]
+  // W_SYS: Branch,File 1, [[@LINE-16]]:15 -> [[@LINE-16]]:17 = 0, 0 [1,2,0]
+}
+
+// CHECK: _Z5func1ii:
+int func1(int a, int b) {
+  // CHECK: Decision,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:21 = M:0, C:2
+  // CHECK: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:12 = (#0 - #1), #1 [1,0,2]
+  return (a || EXPR1(b));
+  // W_SYS: Expansion,File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:21 = #1 (Expanded file = 1)
+  // W_SYS: Branch,File 1, [[@LINE-24]]:18 -> [[@LINE-24]]:21 = (#1 - #2), #2 [2,0,0]
+  // X_SYS: Branch,File 0, [[@LINE-3]]:16 -> [[@LINE-3]]:16 = (#1 - #2), #2 [2,0,0]
+}
+
+// CHECK: _Z5func2ii:
+int func2(int a, int b) {
+  // W_SYS: Decision,File 0, [[@LINE+5]]:11 -> [[@LINE+5]]:28 = M:0, C:3
+  // X_SYS: Decision,File 0, [[@LINE+4]]:11 -> [[@LINE+4]]:28 = M:0, C:2
+  // W_SYS: Expansion,File 0, [[@LINE+3]]:11 -> [[@LINE+3]]:16 = #0 (Expanded file = 1)
+  // W_SYS: Expansion,File 0, [[@LINE+2]]:23 -> [[@LINE+2]]:28 = #1 (Expanded file = 2)
+  // X_SYS: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:11 = #1, (#0 - #1) [1,2,0]
+  return (EXPR2(a) && EXPR1(a));
+  // W_SYS: Branch,File 1, [[@LINE-35]]:19 -> [[@LINE-35]]:22 = #3, (#0 - #3) [1,3,0]
+  // W_SYS: Branch,File 1, [[@LINE-36]]:26 -> [[@LINE-36]]:29 = #4, (#3 - #4) [3,2,0]
+  // W_SYS: Branch,File 2, [[@LINE-38]]:18 -> [[@LINE-38]]:21 = #2, (#1 - #2) [2,0,0]
+  // X_SYS: Branch,File 0, [[@LINE-4]]:23 -> [[@LINE-4]]:23 = #2, (#1 - #2) [2,0,0]
+}
+
+#endif

From 2217d1706a76d3f298899f824354ca9d96c45813 Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev <dvassiliev@accesssoftek.com>
Date: Mon, 20 May 2024 13:44:52 +0400
Subject: [PATCH 52/60] [lldb][Windows] Fixed
 LibcxxChronoTimePointSecondsSummaryProvider() (#92701)

This patch fixes #92574. It is a replacement for #92575.
---
 .../Plugins/Language/CPlusPlus/LibCxx.cpp     | 11 ++++
 .../chrono/TestDataFormatterLibcxxChrono.py   | 57 ++++++++++++++++---
 2 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index e160fd07639395..b0e6fb7d6f5af4 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -1098,6 +1098,7 @@ LibcxxChronoTimePointSecondsSummaryProvider(ValueObject &valobj, Stream &stream,
   if (!ptr_sp)
     return false;
 
+#ifndef _WIN32
   // The date time in the chrono library is valid in the range
   // [-32767-01-01T00:00:00Z, 32767-12-31T23:59:59Z]. A 64-bit time_t has a
   // larger range, the function strftime is not able to format the entire range
@@ -1107,6 +1108,11 @@ LibcxxChronoTimePointSecondsSummaryProvider(ValueObject &valobj, Stream &stream,
       -1'096'193'779'200; // -32767-01-01T00:00:00Z
   const std::time_t chrono_timestamp_max =
       971'890'963'199; // 32767-12-31T23:59:59Z
+#else
+  const std::time_t chrono_timestamp_min = -43'200; // 1969-12-31T12:00:00Z
+  const std::time_t chrono_timestamp_max =
+      32'536'850'399; // 3001-01-19T21:59:59
+#endif
 
   const std::time_t seconds = ptr_sp->GetValueAsSigned(0);
   if (seconds < chrono_timestamp_min || seconds > chrono_timestamp_max)
@@ -1148,12 +1154,17 @@ LibcxxChronoTimepointDaysSummaryProvider(ValueObject &valobj, Stream &stream,
   if (!ptr_sp)
     return false;
 
+#ifndef _WIN32
   // The date time in the chrono library is valid in the range
   // [-32767-01-01Z, 32767-12-31Z]. A 32-bit time_t has a larger range, the
   // function strftime is not able to format the entire range of time_t. The
   // exact point has not been investigated; it's limited to chrono's range.
   const int chrono_timestamp_min = -12'687'428; // -32767-01-01Z
   const int chrono_timestamp_max = 11'248'737;  // 32767-12-31Z
+#else
+  const int chrono_timestamp_min = 0;       // 1970-01-01Z
+  const int chrono_timestamp_max = 376'583; // 3001-01-19Z
+#endif
 
   const int days = ptr_sp->GetValueAsSigned(0);
   if (days < chrono_timestamp_min || days > chrono_timestamp_max)
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
index fb35481d555149..0737a5bc7e6ebf 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py
@@ -14,6 +14,7 @@ class LibcxxChronoDataFormatterTestCase(TestBase):
     @skipIf(compiler="clang", compiler_version=["<", "17.0"])
     def test_with_run_command(self):
         """Test that that file and class static variables display correctly."""
+        isNotWindowsHost = lldbplatformutil.getHostPlatform() != "windows"
         self.build()
         (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
             self, "break here", lldb.SBFileSpec("main.cpp", False)
@@ -57,7 +58,11 @@ def test_with_run_command(self):
         self.expect(
             "frame variable ss_neg_date_time",
             substrs=[
-                "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s"
+                (
+                    "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s"
+                    if isNotWindowsHost
+                    else "ss_neg_date_time = timestamp=-1096193779200 s"
+                )
             ],
         )
         self.expect(
@@ -68,7 +73,11 @@ def test_with_run_command(self):
         self.expect(
             "frame variable ss_pos_date_time",
             substrs=[
-                "ss_pos_date_time = date/time=32767-12-31T23:59:59Z timestamp=971890963199 s"
+                (
+                    "ss_pos_date_time = date/time=32767-12-31T23:59:59Z timestamp=971890963199 s"
+                    if isNotWindowsHost
+                    else "ss_pos_date_time = timestamp=971890963199 s"
+                )
             ],
         )
         self.expect(
@@ -103,7 +112,13 @@ def test_with_run_command(self):
         )
         self.expect(
             "frame variable sd_neg_date",
-            substrs=["sd_neg_date = date=-32767-01-01Z timestamp=-12687428 days"],
+            substrs=[
+                (
+                    "sd_neg_date = date=-32767-01-01Z timestamp=-12687428 days"
+                    if isNotWindowsHost
+                    else "sd_neg_date = timestamp=-12687428 days"
+                )
+            ],
         )
         self.expect(
             "frame variable sd_neg_days",
@@ -112,7 +127,13 @@ def test_with_run_command(self):
 
         self.expect(
             "frame variable sd_pos_date",
-            substrs=["sd_pos_date = date=32767-12-31Z timestamp=11248737 days"],
+            substrs=[
+                (
+                    "sd_pos_date = date=32767-12-31Z timestamp=11248737 days"
+                    if isNotWindowsHost
+                    else "sd_pos_date = timestamp=11248737 days"
+                )
+            ],
         )
         self.expect(
             "frame variable sd_pos_days",
@@ -157,7 +178,11 @@ def test_with_run_command(self):
         self.expect(
             "frame variable ls_neg_date_time",
             substrs=[
-                "ls_neg_date_time = date/time=-32767-01-01T00:00:00 timestamp=-1096193779200 s"
+                (
+                    "ls_neg_date_time = date/time=-32767-01-01T00:00:00 timestamp=-1096193779200 s"
+                    if isNotWindowsHost
+                    else "ls_neg_date_time = timestamp=-1096193779200 s"
+                )
             ],
         )
         self.expect(
@@ -168,7 +193,11 @@ def test_with_run_command(self):
         self.expect(
             "frame variable ls_pos_date_time",
             substrs=[
-                "ls_pos_date_time = date/time=32767-12-31T23:59:59 timestamp=971890963199 s"
+                (
+                    "ls_pos_date_time = date/time=32767-12-31T23:59:59 timestamp=971890963199 s"
+                    if isNotWindowsHost
+                    else "ls_pos_date_time = timestamp=971890963199 s"
+                )
             ],
         )
         self.expect(
@@ -207,7 +236,13 @@ def test_with_run_command(self):
         )
         self.expect(
             "frame variable ld_neg_date",
-            substrs=["ld_neg_date = date=-32767-01-01 timestamp=-12687428 days"],
+            substrs=[
+                (
+                    "ld_neg_date = date=-32767-01-01 timestamp=-12687428 days"
+                    if isNotWindowsHost
+                    else "ld_neg_date = timestamp=-12687428 days"
+                )
+            ],
         )
         self.expect(
             "frame variable ld_neg_days",
@@ -216,7 +251,13 @@ def test_with_run_command(self):
 
         self.expect(
             "frame variable ld_pos_date",
-            substrs=["ld_pos_date = date=32767-12-31 timestamp=11248737 days"],
+            substrs=[
+                (
+                    "ld_pos_date = date=32767-12-31 timestamp=11248737 days"
+                    if isNotWindowsHost
+                    else "ld_pos_date = timestamp=11248737 days"
+                )
+            ],
         )
         self.expect(
             "frame variable ld_pos_days",

From 8e8d2595dafa230a3da7f410200d89f05b6e8d87 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 20 May 2024 11:47:30 +0200
Subject: [PATCH 53/60] [ConstantFolding] Canonicalize constexpr GEPs to i8
 (#89872)

This patch canonicalizes constant expression GEPs to use i8 source
element type, aka ptradd. This is the ConstantFolding equivalent of the
InstCombine canonicalization introduced in #68882.

I believe all our optimizations working on constant expression GEPs
(like GlobalOpt etc) have already been switched to work on offsets, so I
don't expect any significant fallout from this change.

This is part of:
https://discourse.llvm.org/t/rfc-replacing-getelementptr-with-ptradd/68699
---
 clang/test/CodeGen/RISCV/riscv-inline-asm.c   |   6 +-
 clang/test/CodeGen/attr-counted-by.c          |  12 +-
 clang/test/CodeGenCXX/atomicinit.cpp          |   4 +-
 clang/test/CodeGenCXX/auto-var-init.cpp       |   4 +-
 .../test/Profile/c-unreachable-after-switch.c |   4 +-
 .../test/profile/Linux/counter_promo_for.c    |  20 +-
 .../test/profile/Linux/counter_promo_while.c  |  16 +-
 llvm/lib/Analysis/ConstantFolding.cpp         |  46 +----
 llvm/test/Other/constant-fold-gep.ll          |  14 +-
 llvm/test/Other/optimize-inrange-gep.ll       |   2 +-
 ...tion-specialization-constant-expression.ll |   6 +-
 llvm/test/Transforms/GVN/PRE/load-pre-licm.ll |   2 +-
 .../Transforms/GVN/PRE/phi-translate-2.ll     |   4 +-
 .../test/Transforms/IndVarSimplify/D108043.ll |   2 +-
 .../IndVarSimplify/eliminate-exit-no-dl.ll    |   2 +-
 .../IndVarSimplify/floating-point-small-iv.ll |   4 +-
 .../IndVarSimplify/lftr-dead-ivs.ll           |   6 +-
 llvm/test/Transforms/IndVarSimplify/lftr.ll   |   2 +-
 .../Transforms/InstCombine/addrspacecast.ll   |   2 +-
 .../test/Transforms/InstCombine/align-addr.ll |   2 +-
 .../binop-select-cast-of-select-cond.ll       |   2 +-
 .../constant-fold-address-space-pointer.ll    |   2 +-
 .../InstCombine/constant-fold-gep.ll          |  40 ++--
 llvm/test/Transforms/InstCombine/fmul.ll      |   2 +-
 .../InstCombine/force-opaque-ptr.ll           |   4 +-
 .../Transforms/InstCombine/fortify-folding.ll |   4 +-
 .../Transforms/InstCombine/gep-custom-dl.ll   |   6 +-
 .../Transforms/InstCombine/getelementptr.ll   |  28 +--
 ...hoist-xor-by-constant-from-xor-by-value.ll |   2 +-
 .../InstCombine/loadstore-alignment.ll        |   4 +-
 llvm/test/Transforms/InstCombine/memchr-2.ll  |  10 +-
 llvm/test/Transforms/InstCombine/memchr-4.ll  |   2 +-
 llvm/test/Transforms/InstCombine/memchr-6.ll  |   4 +-
 llvm/test/Transforms/InstCombine/memchr-7.ll  |   2 +-
 llvm/test/Transforms/InstCombine/memchr-8.ll  |   6 +-
 llvm/test/Transforms/InstCombine/memchr-9.ll  |  36 ++--
 llvm/test/Transforms/InstCombine/memchr.ll    |  12 +-
 llvm/test/Transforms/InstCombine/memcmp-8.ll  |   2 +-
 .../InstCombine/memcpy-from-global.ll         |   8 +-
 llvm/test/Transforms/InstCombine/memrchr-3.ll |  20 +-
 llvm/test/Transforms/InstCombine/memrchr-4.ll |   4 +-
 .../merging-multiple-stores-into-successor.ll |   6 +-
 llvm/test/Transforms/InstCombine/objsize.ll   |   4 +-
 llvm/test/Transforms/InstCombine/pr25342.ll   |  10 +-
 llvm/test/Transforms/InstCombine/pr33453.ll   |   2 +-
 .../InstCombine/pr38984-inseltpoison.ll       |   2 +-
 llvm/test/Transforms/InstCombine/pr38984.ll   |   2 +-
 llvm/test/Transforms/InstCombine/pr83947.ll   |   4 +-
 .../InstCombine/ptr-replace-alloca.ll         |   8 +-
 llvm/test/Transforms/InstCombine/rem.ll       |   4 +-
 .../Transforms/InstCombine/select-and-or.ll   |   4 +-
 .../InstCombine/simplify-libcalls-i16.ll      |  10 +-
 .../InstCombine/simplify-libcalls.ll          |  10 +-
 .../test/Transforms/InstCombine/snprintf-2.ll |  48 ++---
 .../test/Transforms/InstCombine/snprintf-3.ll |  48 ++---
 .../test/Transforms/InstCombine/snprintf-4.ll |  30 +--
 llvm/test/Transforms/InstCombine/stpcpy-1.ll  |   4 +-
 .../Transforms/InstCombine/stpcpy_chk-1.ll    |  10 +-
 llvm/test/Transforms/InstCombine/stpncpy-1.ll |  35 ++--
 llvm/test/Transforms/InstCombine/str-int-2.ll |   2 +-
 llvm/test/Transforms/InstCombine/str-int-3.ll |   4 +-
 llvm/test/Transforms/InstCombine/str-int-4.ll |  40 ++--
 llvm/test/Transforms/InstCombine/str-int-5.ll |  50 ++---
 llvm/test/Transforms/InstCombine/str-int.ll   |   2 +-
 .../Transforms/InstCombine/strcall-bad-sig.ll |  10 +-
 .../Transforms/InstCombine/strcall-no-nul.ll  |  20 +-
 llvm/test/Transforms/InstCombine/strchr-1.ll  |   6 +-
 llvm/test/Transforms/InstCombine/strchr-3.ll  |  12 +-
 llvm/test/Transforms/InstCombine/strcmp-4.ll  |   4 +-
 llvm/test/Transforms/InstCombine/strlcpy-1.ll |   4 +-
 llvm/test/Transforms/InstCombine/strlen-1.ll  |   2 +-
 llvm/test/Transforms/InstCombine/strlen-6.ll  |  18 +-
 llvm/test/Transforms/InstCombine/strpbrk-1.ll |   2 +-
 llvm/test/Transforms/InstCombine/strrchr-1.ll |   6 +-
 llvm/test/Transforms/InstCombine/strrchr-3.ll |   8 +-
 llvm/test/Transforms/InstCombine/strstr-1.ll  |   2 +-
 .../vec_demanded_elts-inseltpoison.ll         |   2 +-
 .../InstCombine/vec_demanded_elts.ll          |   2 +-
 llvm/test/Transforms/InstCombine/wcslen-1.ll  |   4 +-
 .../InstSimplify/ConstProp/gep-alias.ll       |   2 +-
 .../ConstProp/gep-constanfolding-error.ll     |   3 +-
 .../Transforms/InstSimplify/ConstProp/gep.ll  |   6 +-
 .../InstSimplify/ConstProp/icmp-global.ll     |  10 +-
 llvm/test/Transforms/InstSimplify/compare.ll  |   2 +-
 .../Transforms/InstSimplify/past-the-end.ll   |   4 +-
 .../2011-12-19-PostincQuadratic.ll            |   2 +-
 .../X86/2012-01-13-phielim.ll                 |  20 +-
 .../Transforms/LoopVectorize/X86/pr42674.ll   |   2 +-
 ...pr47343-expander-lcssa-after-cfg-update.ll |   4 +-
 .../LoopVersioning/add-phi-update-users.ll    |   6 +-
 .../bound-check-partially-known.ll            |  10 +-
 llvm/test/Transforms/NewGVN/loadforward.ll    |   2 +-
 .../PhaseOrdering/SystemZ/sub-xor.ll          |  48 ++---
 .../PhaseOrdering/X86/excessive-unrolling.ll  | 180 +++++++++---------
 .../Transforms/SCCP/2009-09-24-byval-ptr.ll   |   2 +-
 llvm/test/Transforms/SCCP/apint-bigint2.ll    |   6 +-
 .../SLPVectorizer/AArch64/gather-cost.ll      |   8 +-
 .../Transforms/SLPVectorizer/X86/pr47623.ll   |  28 +--
 98 files changed, 561 insertions(+), 585 deletions(-)

diff --git a/clang/test/CodeGen/RISCV/riscv-inline-asm.c b/clang/test/CodeGen/RISCV/riscv-inline-asm.c
index 3565705dea7130..ed97add95e7116 100644
--- a/clang/test/CodeGen/RISCV/riscv-inline-asm.c
+++ b/clang/test/CodeGen/RISCV/riscv-inline-asm.c
@@ -49,9 +49,9 @@ extern int var, arr[2][2];
 struct Pair { int a, b; } pair;
 
 // CHECK-LABEL: test_s(
-// CHECK:         call void asm sideeffect "// $0 $1 $2", "s,s,s"(ptr nonnull @var, ptr nonnull getelementptr inbounds ([2 x [2 x i32]], ptr @arr, {{.*}}), ptr nonnull @test_s)
-// CHECK:         call void asm sideeffect "// $0", "s"(ptr nonnull getelementptr inbounds (%struct.Pair, ptr @pair, {{.*}}))
-// CHECK:         call void asm sideeffect "// $0 $1 $2", "S,S,S"(ptr nonnull @var, ptr nonnull getelementptr inbounds ([2 x [2 x i32]], ptr @arr, {{.*}}), ptr nonnull @test_s)
+// CHECK:         call void asm sideeffect "// $0 $1 $2", "s,s,s"(ptr nonnull @var, ptr nonnull getelementptr inbounds (i8, ptr @arr, {{.*}}), ptr nonnull @test_s)
+// CHECK:         call void asm sideeffect "// $0", "s"(ptr nonnull getelementptr inbounds (i8, ptr @pair, {{.*}}))
+// CHECK:         call void asm sideeffect "// $0 $1 $2", "S,S,S"(ptr nonnull @var, ptr nonnull getelementptr inbounds (i8, ptr @arr, {{.*}}), ptr nonnull @test_s)
 void test_s(void) {
   asm("// %0 %1 %2" :: "s"(&var), "s"(&arr[1][1]), "s"(test_s));
   asm("// %0" :: "s"(&pair.b));
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
index de30a00138ac80..79922eb4159f1c 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -1098,7 +1098,7 @@ int test12_a, test12_b;
 // SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR:       handler.type_mismatch6:
-// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0) to i64)) #[[ATTR10]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT:    tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR10]], !nosanitize [[META2]]
 // SANITIZE-WITH-ATTR-NEXT:    unreachable, !nosanitize [[META2]]
 //
 // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
@@ -1111,7 +1111,7 @@ int test12_a, test12_b;
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i32], ptr [[BAZ]], i64 0, i64 [[IDXPROM]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[TMP0]], ptr @test12_b, align 4, !tbaa [[TBAA2]]
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0), align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    store i32 [[TMP1]], ptr @test12_a, align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    br label [[FOR_COND:%.*]]
 // NO-SANITIZE-WITH-ATTR:       for.cond:
@@ -1140,7 +1140,7 @@ int test12_a, test12_b;
 // SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 0) #[[ATTR8]], !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR-NEXT:    unreachable, !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR:       handler.type_mismatch6:
-// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0) to i64)) #[[ATTR8]], !nosanitize [[META9]]
+// SANITIZE-WITHOUT-ATTR-NEXT:    tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR8]], !nosanitize [[META9]]
 // SANITIZE-WITHOUT-ATTR-NEXT:    unreachable, !nosanitize [[META9]]
 //
 // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local noundef i32 @test12(
@@ -1153,7 +1153,7 @@ int test12_a, test12_b;
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i32], ptr [[BAZ]], i64 0, i64 [[IDXPROM]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 [[TMP0]], ptr @test12_b, align 4, !tbaa [[TBAA2]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_ANON_5:%.*]], ptr @test12_foo, i64 1, i32 0, i32 0, i32 0), align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @test12_foo, i64 4), align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    store i32 [[TMP1]], ptr @test12_a, align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    br label [[FOR_COND:%.*]]
 // NO-SANITIZE-WITHOUT-ATTR:       for.cond:
@@ -1315,7 +1315,7 @@ int test14(int idx) {
 // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] {
 // NO-SANITIZE-WITH-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr @__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds (i8, ptr @__const.test15.foo, i64 8), i64 0, i64 [[IDXPROM]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITH-ATTR-NEXT:    ret i32 [[TMP0]]
 //
@@ -1336,7 +1336,7 @@ int test14(int idx) {
 // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] {
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:  entry:
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr @__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds (i8, ptr @__const.test15.foo, i64 8), i64 0, i64 [[IDXPROM]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
 // NO-SANITIZE-WITHOUT-ATTR-NEXT:    ret i32 [[TMP0]]
 //
diff --git a/clang/test/CodeGenCXX/atomicinit.cpp b/clang/test/CodeGenCXX/atomicinit.cpp
index f2398b020621f0..a568f17b90d0c7 100644
--- a/clang/test/CodeGenCXX/atomicinit.cpp
+++ b/clang/test/CodeGenCXX/atomicinit.cpp
@@ -86,7 +86,7 @@ namespace PR18097 {
     };
     // CHECK-LABEL: define {{.*}} @__cxx_global_var_init
     // CHECK: call void @_ZN7PR180977dynamic1XC1Ei(ptr {{[^,]*}} @_ZN7PR180977dynamic1yE, i32 noundef 4)
-    // CHECK: store i32 5, ptr getelementptr inbounds ({{.*}}, ptr @_ZN7PR180977dynamic1yE, i32 0, i32 1)
+    // CHECK: store i32 5, ptr getelementptr inbounds (i8, ptr @_ZN7PR180977dynamic1yE, i32 4)
     Y y = { X(4), 5 };
   }
 
@@ -110,7 +110,7 @@ namespace PR18097 {
     // CHECK-LABEL: define {{.*}} @__cxx_global_var_init
     // CHECK: tail call void @llvm.memcpy.p0.p0.i32(ptr{{.*}} @_ZN7PR180978constant2y2E, ptr{{.*}} @_ZN7PR180978constantL1xE, i32 3, i1 false)
     // CHECK: %0 = load i32, ptr @_ZN7PR180978constant1zE
-    // CHECK: store i32 %0, ptr getelementptr inbounds (%"struct.PR18097::constant::Y", ptr @_ZN7PR180978constant2y2E, i32 0, i32 1)
+    // CHECK: store i32 %0, ptr getelementptr inbounds (i8, ptr @_ZN7PR180978constant2y2E, i32 4)
     int z;
     constexpr X x{1};
     Y y2 = { x, z };
diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp
index 7803ed5b633fed..e1568bee136e5e 100644
--- a/clang/test/CodeGenCXX/auto-var-init.cpp
+++ b/clang/test/CodeGenCXX/auto-var-init.cpp
@@ -1346,7 +1346,7 @@ TEST_UNINIT(base, base);
 // PATTERN-O0: call void @llvm.memcpy{{.*}} @__const.test_base_uninit.uninit{{.+}}), !annotation [[AUTO_INIT]]
 // ZERO-LABEL: @test_base_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,{{.+}}), !annotation [[AUTO_INIT]]
-// ZERO-O1: store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV4base, i64 0, i32 0, i64 2), {{.*}}, align 8
+// ZERO-O1: store ptr getelementptr inbounds inrange(-16, 16) (i8, ptr @_ZTV4base, i64 16), {{.*}}, align 8
 // ZERO-O1-NOT: !annotation
 
 TEST_BRACES(base, base);
@@ -1367,7 +1367,7 @@ TEST_UNINIT(derived, derived);
 // ZERO-LABEL: @test_derived_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0, {{.+}}), !annotation [[AUTO_INIT]]
 // ZERO-O1: store i64 0, {{.*}} align 8, !annotation [[AUTO_INIT]]
-// ZERO-O1: store ptr getelementptr inbounds inrange(-16, 16) ({ [4 x ptr] }, ptr @_ZTV7derived, i64 0, i32 0, i64 2), {{.*}} align 8
+// ZERO-O1: store ptr getelementptr inbounds inrange(-16, 16) (i8, ptr @_ZTV7derived, i64 16), {{.*}} align 8
 
 TEST_BRACES(derived, derived);
 // CHECK-LABEL: @test_derived_braces()
diff --git a/clang/test/Profile/c-unreachable-after-switch.c b/clang/test/Profile/c-unreachable-after-switch.c
index 34d2742f7a3b29..0ed2efa32e8350 100644
--- a/clang/test/Profile/c-unreachable-after-switch.c
+++ b/clang/test/Profile/c-unreachable-after-switch.c
@@ -5,11 +5,11 @@
 // CHECK-LABEL: @foo()
 // CHECK: store {{.*}} @[[C]]
 void foo(void) {
-  // CHECK: store {{.*}} @[[C]], i64 0, i64 2
+  // CHECK: store {{.*}} @[[C]], i64 16)
   switch (0) {
   default:
     return;
   }
   // We shouldn't emit the unreachable counter. This used to crash in GlobalDCE.
-  // CHECK-NOT: store {{.*}} @[[C]], i64 0, i64 1}
+  // CHECK-NOT: store {{.*}} @[[C]], i64 8)
 }
diff --git a/compiler-rt/test/profile/Linux/counter_promo_for.c b/compiler-rt/test/profile/Linux/counter_promo_for.c
index 1694e3812de40c..aa77e6084bf85d 100644
--- a/compiler-rt/test/profile/Linux/counter_promo_for.c
+++ b/compiler-rt/test/profile/Linux/counter_promo_for.c
@@ -19,29 +19,29 @@ __attribute__((noinline)) void bar(int i) { g += i; }
 
 __attribute__((noinline)) void foo(int n, int N) {
 // PROMO-LABEL: @foo
-// PROMO: load{{.*}}@__profc_foo{{.*}} 3){{.*}}
+// PROMO: load{{.*}}@__profc_foo{{.*}} 24){{.*}}
 // PROMO-NEXT: add
-// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 3){{.*}}
+// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 24){{.*}}
 // PROMO: load{{.*}}@__profc_foo, align
 // PROMO-NEXT: add
 // PROMO-NEXT: store{{.*}}@__profc_foo, align
-// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 1){{.*}}
+// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 8){{.*}}
 // PROMO-NEXT: add
-// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}}
-// PROMO: load{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 8){{.*}}
+// PROMO: load{{.*}}@__profc_foo{{.*}} 16){{.*}}
 // PROMO-NEXT: add
-// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 16){{.*}}
 //
 // NOPROMO-LABEL: @foo
 // NOPROMO: load{{.*}}@__profc_foo, align
 // NOPROMO-NEXT: add
 // NOPROMO-NEXT: store{{.*}}@__profc_foo, align
-// NOPROMO: load{{.*}}@__profc_foo{{.*}} 1){{.*}}
+// NOPROMO: load{{.*}}@__profc_foo{{.*}} 8){{.*}}
 // NOPROMO-NEXT: add
-// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}}
-// NOPROMO: load{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 8){{.*}}
+// NOPROMO: load{{.*}}@__profc_foo{{.*}} 16){{.*}}
 // NOPROMO-NEXT: add
-// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 16){{.*}}
   int i;
   for (i = 0; i < N; i++) {
     if (i < n + 1)
diff --git a/compiler-rt/test/profile/Linux/counter_promo_while.c b/compiler-rt/test/profile/Linux/counter_promo_while.c
index 71c4a90d29fa04..c6ea3a7282d426 100644
--- a/compiler-rt/test/profile/Linux/counter_promo_while.c
+++ b/compiler-rt/test/profile/Linux/counter_promo_while.c
@@ -20,23 +20,23 @@ __attribute__((noinline)) void foo(int n, int N) {
 // PROMO: load{{.*}}@__profc_foo, align
 // PROMO-NEXT: add
 // PROMO-NEXT: store{{.*}}@__profc_foo, align
-// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 1){{.*}}
+// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 8){{.*}}
 // PROMO-NEXT: add
-// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}}
-// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 8){{.*}}
+// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 16){{.*}}
 // PROMO-NEXT: add
-// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 16){{.*}}
 //
 // NOPROMO-LABEL: @foo
 // NOPROMO: load{{.*}}@__profc_foo, align
 // NOPROMO-NEXT: add
 // NOPROMO-NEXT: store{{.*}}@__profc_foo, align
-// NOPROMO: load{{.*}}@__profc_foo{{.*}} 1){{.*}}
+// NOPROMO: load{{.*}}@__profc_foo{{.*}} 8){{.*}}
 // NOPROMO-NEXT: add
-// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}}
-// NOPROMO: load{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 8){{.*}}
+// NOPROMO: load{{.*}}@__profc_foo{{.*}} 16){{.*}}
 // NOPROMO-NEXT: add
-// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}}
+// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 16){{.*}}
   int i = 0;
   while (i < N) {
     if (i < n + 1)
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 524e84f3f3ded2..31667ff3951f12 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -869,7 +869,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
   bool InBounds = GEP->isInBounds();
 
   Type *SrcElemTy = GEP->getSourceElementType();
-  Type *ResElemTy = GEP->getResultElementType();
   Type *ResTy = GEP->getType();
   if (!SrcElemTy->isSized() || isa<ScalableVectorType>(SrcElemTy))
     return nullptr;
@@ -944,43 +943,18 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
     return ConstantExpr::getIntToPtr(C, ResTy);
   }
 
-  // Otherwise form a regular getelementptr. Recompute the indices so that
-  // we eliminate over-indexing of the notional static type array bounds.
-  // This makes it easy to determine if the getelementptr is "inbounds".
-
-  // For GEPs of GlobalValues, use the value type, otherwise use an i8 GEP.
-  if (auto *GV = dyn_cast<GlobalValue>(Ptr))
-    SrcElemTy = GV->getValueType();
-  else
-    SrcElemTy = Type::getInt8Ty(Ptr->getContext());
-
-  if (!SrcElemTy->isSized())
-    return nullptr;
-
-  Type *ElemTy = SrcElemTy;
-  SmallVector<APInt> Indices = DL.getGEPIndicesForOffset(ElemTy, Offset);
-  if (Offset != 0)
-    return nullptr;
-
-  // Try to add additional zero indices to reach the desired result element
-  // type.
-  // TODO: Should we avoid extra zero indices if ResElemTy can't be reached and
-  // we'll have to insert a bitcast anyway?
-  while (ElemTy != ResElemTy) {
-    Type *NextTy = GetElementPtrInst::getTypeAtIndex(ElemTy, (uint64_t)0);
-    if (!NextTy)
-      break;
-
-    Indices.push_back(APInt::getZero(isa<StructType>(ElemTy) ? 32 : BitWidth));
-    ElemTy = NextTy;
+  // Try to infer inbounds for GEPs of globals.
+  if (!InBounds && Offset.isNonNegative()) {
+    bool CanBeNull, CanBeFreed;
+    uint64_t DerefBytes =
+        Ptr->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
+    InBounds = DerefBytes != 0 && !CanBeNull && Offset.sle(DerefBytes);
   }
 
-  SmallVector<Constant *, 32> NewIdxs;
-  for (const APInt &Index : Indices)
-    NewIdxs.push_back(ConstantInt::get(
-        Type::getIntNTy(Ptr->getContext(), Index.getBitWidth()), Index));
-
-  return ConstantExpr::getGetElementPtr(SrcElemTy, Ptr, NewIdxs, InBounds,
+  // Otherwise canonicalize this to a single ptradd.
+  LLVMContext &Ctx = Ptr->getContext();
+  return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), Ptr,
+                                        ConstantInt::get(Ctx, Offset), InBounds,
                                         InRange);
 }
 
diff --git a/llvm/test/Other/constant-fold-gep.ll b/llvm/test/Other/constant-fold-gep.ll
index 0c1ca129bdb382..9af300ac9907f0 100644
--- a/llvm/test/Other/constant-fold-gep.ll
+++ b/llvm/test/Other/constant-fold-gep.ll
@@ -106,10 +106,10 @@
 
 ; PLAIN: @Y = global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 2)
 ; PLAIN: @Z = global ptr getelementptr inbounds (i32, ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 0), i64 1)
-; OPT: @Y = local_unnamed_addr global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 2)
-; OPT: @Z = local_unnamed_addr global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 1)
-; TO: @Y = local_unnamed_addr global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 2)
-; TO: @Z = local_unnamed_addr global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 1)
+; OPT: @Y = local_unnamed_addr global ptr getelementptr inbounds (i8, ptr @ext, i64 48)
+; OPT: @Z = local_unnamed_addr global ptr getelementptr inbounds (i8, ptr @ext, i64 12)
+; TO: @Y = local_unnamed_addr global ptr getelementptr inbounds (i8, ptr @ext, i64 48)
+; TO: @Z = local_unnamed_addr global ptr getelementptr inbounds (i8, ptr @ext, i64 12)
 
 @ext = external global [3 x { i32, i32 }]
 @Y = global ptr getelementptr inbounds ([3 x { i32, i32 }], ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 1), i64 1)
@@ -433,10 +433,10 @@ define ptr @fO() nounwind {
 ; PLAIN:   ret ptr %t
 ; PLAIN: }
 ; OPT: define ptr @fZ() local_unnamed_addr #0 {
-; OPT:   ret ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 1)
+; OPT:   ret ptr getelementptr inbounds (i8, ptr @ext, i64 12)
 ; OPT: }
 ; TO: define ptr @fZ() local_unnamed_addr #0 {
-; TO:   ret ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 1)
+; TO:   ret ptr getelementptr inbounds (i8, ptr @ext, i64 12)
 ; TO: }
 ; SCEV: Classifying expressions for: @fZ
 ; SCEV:   %t = bitcast ptr getelementptr inbounds (i32, ptr getelementptr inbounds ([3 x { i32, i32 }], ptr @ext, i64 0, i64 1, i32 0), i64 1) to ptr
@@ -464,7 +464,7 @@ define ptr @same_addrspace() nounwind noinline {
 ; OPT: same_addrspace
   %p = getelementptr inbounds i8, ptr @p0, i32 2
   ret ptr %p
-; OPT: ret ptr getelementptr inbounds ([4 x i8], ptr @p0, i64 0, i64 2)
+; OPT: ret ptr getelementptr inbounds (i8, ptr @p0, i64 2)
 }
 
 @gv1 = internal global i32 1
diff --git a/llvm/test/Other/optimize-inrange-gep.ll b/llvm/test/Other/optimize-inrange-gep.ll
index 2eae34bdb09b1d..e7465fddd80f0c 100644
--- a/llvm/test/Other/optimize-inrange-gep.ll
+++ b/llvm/test/Other/optimize-inrange-gep.ll
@@ -20,7 +20,7 @@ define void @foo(ptr %p) {
 ;
 ; CHECK-LABEL: define void @foo(
 ; CHECK-SAME: ptr nocapture writeonly [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    store ptr getelementptr inbounds inrange(-24, 0) ({ [3 x ptr] }, ptr @vtable, i64 1, i32 0, i64 0), ptr [[P]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds inrange(-24, 0) (i8, ptr @vtable, i64 24), ptr [[P]], align 8
 ; CHECK-NEXT:    ret void
 ;
   store ptr getelementptr inrange(-24, 0) ({ [3 x ptr], [3 x ptr] }, ptr @vtable, i32 0, i32 0, i32 3), ptr %p
diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll
index c242816b91d43c..16a46851163129 100644
--- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll
+++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll
@@ -30,13 +30,13 @@ define internal i64 @zoo(i1 %flag) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]]
 ; CHECK:       plus:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @func2.specialized.2(ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i64 0, i32 3))
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @func2.specialized.2(ptr getelementptr inbounds (i8, ptr @Global, i64 8))
 ; CHECK-NEXT:    br label [[MERGE:%.*]]
 ; CHECK:       minus:
-; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @func2.specialized.1(ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 4))
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @func2.specialized.1(ptr getelementptr inbounds (i8, ptr @Global, i64 16))
 ; CHECK-NEXT:    br label [[MERGE]]
 ; CHECK:       merge:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ ptrtoint (ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 3) to i64), [[PLUS]] ], [ ptrtoint (ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i64 0, i32 4) to i64), [[MINUS]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ ptrtoint (ptr getelementptr inbounds (i8, ptr @Global, i64 8) to i64), [[PLUS]] ], [ ptrtoint (ptr getelementptr inbounds (i8, ptr @Global, i64 16) to i64), [[MINUS]] ]
 ; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
index c52f46b4f63ee9..6a05d5b17dde86 100644
--- a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
+++ b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
@@ -8,7 +8,7 @@ target triple = "i386-apple-darwin11.0.0"
 define void @Bubble() nounwind noinline {
 ; CHECK-LABEL: @Bubble(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP7_PRE:%.*]] = load i32, ptr getelementptr inbounds ([5001 x i32], ptr @sortlist, i32 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP7_PRE:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @sortlist, i32 4), align 4
 ; CHECK-NEXT:    br label [[WHILE_BODY5:%.*]]
 ; CHECK:       while.body5:
 ; CHECK-NEXT:    [[TMP7:%.*]] = phi i32 [ [[TMP7_PRE]], [[ENTRY:%.*]] ], [ [[TMP71:%.*]], [[IF_END:%.*]] ]
diff --git a/llvm/test/Transforms/GVN/PRE/phi-translate-2.ll b/llvm/test/Transforms/GVN/PRE/phi-translate-2.ll
index 46fde7a0a48c1a..bd54de4acd4f27 100644
--- a/llvm/test/Transforms/GVN/PRE/phi-translate-2.ll
+++ b/llvm/test/Transforms/GVN/PRE/phi-translate-2.ll
@@ -63,8 +63,8 @@ define void @test2(i64 %i) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i64 (...) @goo()
 ; CHECK-NEXT:    store i64 [[CALL]], ptr @g2, align 8
-; CHECK-NEXT:    [[T2_PRE:%.*]] = load i64, ptr getelementptr inbounds ([100 x i64], ptr @a, i64 0, i64 3), align 8
-; CHECK-NEXT:    [[T3_PRE:%.*]] = load i64, ptr getelementptr inbounds ([100 x i64], ptr @b, i64 0, i64 3), align 8
+; CHECK-NEXT:    [[T2_PRE:%.*]] = load i64, ptr getelementptr inbounds (i8, ptr @a, i64 24), align 8
+; CHECK-NEXT:    [[T3_PRE:%.*]] = load i64, ptr getelementptr inbounds (i8, ptr @b, i64 24), align 8
 ; CHECK-NEXT:    [[DOTPRE:%.*]] = mul nsw i64 [[T3_PRE]], [[T2_PRE]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
diff --git a/llvm/test/Transforms/IndVarSimplify/D108043.ll b/llvm/test/Transforms/IndVarSimplify/D108043.ll
index ab95f0bb90395c..cc553e205ad35d 100644
--- a/llvm/test/Transforms/IndVarSimplify/D108043.ll
+++ b/llvm/test/Transforms/IndVarSimplify/D108043.ll
@@ -9,7 +9,7 @@ define internal fastcc void @func_2() unnamed_addr {
 ; CHECK-NEXT:  lbl_2898.preheader:
 ; CHECK-NEXT:    br label [[LBL_2898:%.*]]
 ; CHECK:       lbl_2898.loopexit:
-; CHECK-NEXT:    store ptr getelementptr inbounds ([4 x [6 x i32]], ptr @g_2168, i64 0, i64 3, i64 1), ptr @g_1150, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @g_2168, i64 76), ptr @g_1150, align 8
 ; CHECK-NEXT:    br label [[LBL_2898]]
 ; CHECK:       lbl_2898:
 ; CHECK-NEXT:    br label [[FOR_COND884:%.*]]
diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-exit-no-dl.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-exit-no-dl.ll
index e605512cb23bf2..a3c4002626a705 100644
--- a/llvm/test/Transforms/IndVarSimplify/eliminate-exit-no-dl.ll
+++ b/llvm/test/Transforms/IndVarSimplify/eliminate-exit-no-dl.ll
@@ -14,7 +14,7 @@ define void @foo() {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr getelementptr inbounds ([0 x i8], ptr @global, i64 0, i64 1), align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @global, i64 1), align 1
 ; CHECK-NEXT:    br i1 false, label [[BB7:%.*]], label [[BB11:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
diff --git a/llvm/test/Transforms/IndVarSimplify/floating-point-small-iv.ll b/llvm/test/Transforms/IndVarSimplify/floating-point-small-iv.ll
index 599e69c814d9b3..bebd314f7375ab 100644
--- a/llvm/test/Transforms/IndVarSimplify/floating-point-small-iv.ll
+++ b/llvm/test/Transforms/IndVarSimplify/floating-point-small-iv.ll
@@ -357,7 +357,7 @@ define void @uitofp_fptoui_range_with_negative() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    store i32 100, ptr getelementptr inbounds ([16777219 x i32], ptr @array, i64 0, i64 100), align 4
+; CHECK-NEXT:    store i32 100, ptr getelementptr inbounds (i8, ptr @array, i64 400), align 4
 ; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    ret void
@@ -418,7 +418,7 @@ define void @uitofp_fptosi_range_with_negative () {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    store i32 100, ptr getelementptr inbounds ([16777219 x i32], ptr @array, i64 0, i64 100), align 4
+; CHECK-NEXT:    store i32 100, ptr getelementptr inbounds (i8, ptr @array, i64 400), align 4
 ; CHECK-NEXT:    br i1 false, label [[FOR_BODY]], label [[CLEANUP:%.*]]
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/IndVarSimplify/lftr-dead-ivs.ll b/llvm/test/Transforms/IndVarSimplify/lftr-dead-ivs.ll
index a628a5357f6d12..6c15eb4af4f132 100644
--- a/llvm/test/Transforms/IndVarSimplify/lftr-dead-ivs.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lftr-dead-ivs.ll
@@ -112,7 +112,7 @@ define void @dom_store_preinc() #0 {
 ; CHECK-NEXT:    [[P_0:%.*]] = phi ptr [ @data, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    store volatile i8 0, ptr [[P_0]], align 1
 ; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[P_0]], i64 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[P_0]], getelementptr ([240 x i8], ptr @data, i64 1, i64 5)
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[P_0]], getelementptr (i8, ptr @data, i64 245)
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -141,7 +141,7 @@ define void @dom_store_postinc() #0 {
 ; CHECK-NEXT:    [[P_0:%.*]] = phi ptr [ @data, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[P_0]], i64 1
 ; CHECK-NEXT:    store volatile i8 0, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[TMP3]], getelementptr ([240 x i8], ptr @data, i64 1, i64 6)
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[TMP3]], getelementptr (i8, ptr @data, i64 246)
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -170,7 +170,7 @@ define i8 @dom_load() #0 {
 ; CHECK-NEXT:    [[P_0:%.*]] = phi ptr [ @data, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[TMP3]] = getelementptr inbounds i8, ptr [[P_0]], i64 1
 ; CHECK-NEXT:    [[V:%.*]] = load i8, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[TMP3]], getelementptr ([240 x i8], ptr @data, i64 1, i64 6)
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[TMP3]], getelementptr (i8, ptr @data, i64 246)
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[V_LCSSA:%.*]] = phi i8 [ [[V]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/IndVarSimplify/lftr.ll b/llvm/test/Transforms/IndVarSimplify/lftr.ll
index 7f4820f093e55e..e37a34019ccd1b 100644
--- a/llvm/test/Transforms/IndVarSimplify/lftr.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lftr.ll
@@ -196,7 +196,7 @@ define void @test_zext(ptr %a) #0 {
 ; CHECK-NEXT:    [[T2:%.*]] = load i8, ptr [[DOT0]], align 1
 ; CHECK-NEXT:    [[T3]] = getelementptr inbounds i8, ptr [[P_0]], i64 1
 ; CHECK-NEXT:    store i8 [[T2]], ptr [[P_0]], align 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[P_0]], getelementptr inbounds ([240 x i8], ptr @data, i64 0, i64 239)
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne ptr [[P_0]], getelementptr inbounds (i8, ptr @data, i64 239)
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/addrspacecast.ll b/llvm/test/Transforms/InstCombine/addrspacecast.ll
index cbb88b9a09c93e..35a1066a6b31c2 100644
--- a/llvm/test/Transforms/InstCombine/addrspacecast.ll
+++ b/llvm/test/Transforms/InstCombine/addrspacecast.ll
@@ -141,7 +141,7 @@ define i32 @memcpy_addrspacecast() nounwind {
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[LOOP_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[SUM_INC:%.*]], [[LOOP_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[I]] to i16
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr addrspace(2) getelementptr inbounds ([60 x i8], ptr addrspace(2) @const_array, i16 0, i16 4), i16 [[TMP0]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr i8, ptr addrspace(2) getelementptr inbounds (i8, ptr addrspace(2) @const_array, i16 4), i16 [[TMP0]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(2) [[PTR]], align 1
 ; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[LOAD]] to i32
 ; CHECK-NEXT:    [[SUM_INC]] = add i32 [[SUM]], [[EXT]]
diff --git a/llvm/test/Transforms/InstCombine/align-addr.ll b/llvm/test/Transforms/InstCombine/align-addr.ll
index facb5df08a82f4..58647dc9595d82 100644
--- a/llvm/test/Transforms/InstCombine/align-addr.ll
+++ b/llvm/test/Transforms/InstCombine/align-addr.ll
@@ -81,7 +81,7 @@ define <16 x i8> @test1_as1(<2 x i64> %x) {
 
 define <16 x i8> @test1_as1_gep(<2 x i64> %x) {
 ; CHECK-LABEL: @test1_as1_gep(
-; CHECK-NEXT:    [[TMP:%.*]] = load <16 x i8>, ptr addrspace(1) getelementptr inbounds ([8 x i32], ptr addrspace(1) @GLOBAL_as1_gep, i32 0, i32 4), align 1
+; CHECK-NEXT:    [[TMP:%.*]] = load <16 x i8>, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @GLOBAL_as1_gep, i32 16), align 1
 ; CHECK-NEXT:    ret <16 x i8> [[TMP]]
 ;
   %tmp = load <16 x i8>, ptr addrspace(1) getelementptr ([8 x i32], ptr addrspace(1) @GLOBAL_as1_gep, i16 0, i16 4), align 1
diff --git a/llvm/test/Transforms/InstCombine/binop-select-cast-of-select-cond.ll b/llvm/test/Transforms/InstCombine/binop-select-cast-of-select-cond.ll
index 7dc2fe1cb88e1f..b0da6d80d05adf 100644
--- a/llvm/test/Transforms/InstCombine/binop-select-cast-of-select-cond.ll
+++ b/llvm/test/Transforms/InstCombine/binop-select-cast-of-select-cond.ll
@@ -232,7 +232,7 @@ define i64 @pr64669(i64 %a) {
 ; CHECK-LABEL: define i64 @pr64669
 ; CHECK-SAME: (i64 [[A:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[A]], 1
-; CHECK-NEXT:    [[ADD:%.*]] = select i1 icmp ne (ptr getelementptr inbounds ([72 x i32], ptr @b, i64 0, i64 25), ptr @c), i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[ADD:%.*]] = select i1 icmp ne (ptr getelementptr inbounds (i8, ptr @b, i64 100), ptr @c), i64 [[TMP1]], i64 0
 ; CHECK-NEXT:    ret i64 [[ADD]]
 ;
   %mul = select i1 icmp ne (ptr getelementptr inbounds ([72 x i32], ptr @b, i64 0, i64 25), ptr @c), i64 %a, i64 0
diff --git a/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll b/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
index 30d5cd66066bb7..857704f580281c 100644
--- a/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
+++ b/llvm/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
@@ -223,7 +223,7 @@ define i32 @test_cast_gep_large_indices_as() {
 
 define i32 @test_constant_cast_gep_struct_indices_as() {
 ; CHECK-LABEL: @test_constant_cast_gep_struct_indices_as(
-; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[STRUCT_FOO:%.*]], ptr addrspace(3) @constant_fold_global_ptr, i16 0, i32 2, i16 2), align 4
+; CHECK-NEXT:    [[Y:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @constant_fold_global_ptr, i16 16), align 4
 ; CHECK-NEXT:    ret i32 [[Y]]
 ;
   %x = getelementptr %struct.foo, ptr addrspace(3) @constant_fold_global_ptr, i18 0, i32 2, i12 2
diff --git a/llvm/test/Transforms/InstCombine/constant-fold-gep.ll b/llvm/test/Transforms/InstCombine/constant-fold-gep.ll
index 009c19dfa66cf9..54b7a6f66ecd2a 100644
--- a/llvm/test/Transforms/InstCombine/constant-fold-gep.ll
+++ b/llvm/test/Transforms/InstCombine/constant-fold-gep.ll
@@ -12,26 +12,26 @@ target datalayout = "E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-
 define void @frob() {
 ; CHECK-LABEL: @frob(
 ; CHECK-NEXT:    store i32 1, ptr @Y, align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 0, i64 1), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 0, i64 2), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 0), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 1), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 0, i32 1, i64 2), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 0), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 1), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 0, i64 2), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 0), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 1), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 1, i32 1, i64 2), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 0), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 1), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 0, i64 2), align 8
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 0), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 1), align 8
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 0, i64 2, i32 1, i64 2), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds ([3 x %struct.X], ptr @Y, i64 1, i64 0, i32 0, i64 0), align 8
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([3 x %struct.X], ptr @Y, i64 2, i64 0, i32 0, i64 0), align 8
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([3 x %struct.X], ptr @Y, i64 1, i64 0, i32 0, i64 1), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 4), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 8), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 12), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 16), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 20), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 24), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 28), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 32), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 36), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 40), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 44), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 48), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 52), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 56), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 60), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 64), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 68), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr inbounds (i8, ptr @Y, i64 72), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @Y, i64 144), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @Y, i64 76), align 8
 ; CHECK-NEXT:    ret void
 ;
   store i32 1, ptr @Y, align 4
diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll
index 1526956c5b241e..ae2df634b02009 100644
--- a/llvm/test/Transforms/InstCombine/fmul.ll
+++ b/llvm/test/Transforms/InstCombine/fmul.ll
@@ -1131,7 +1131,7 @@ for.body:
 
 define double @fmul_negated_constant_expression(double %x) {
 ; CHECK-LABEL: @fmul_negated_constant_expression(
-; CHECK-NEXT:    [[FSUB:%.*]] = fneg double bitcast (i64 ptrtoint (ptr getelementptr inbounds ({ [2 x ptr] }, ptr @g, i64 1, i32 0, i64 0) to i64) to double)
+; CHECK-NEXT:    [[FSUB:%.*]] = fneg double bitcast (i64 ptrtoint (ptr getelementptr inbounds (i8, ptr @g, i64 16) to i64) to double)
 ; CHECK-NEXT:    [[R:%.*]] = fmul double [[FSUB]], [[X:%.*]]
 ; CHECK-NEXT:    ret double [[R]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/force-opaque-ptr.ll b/llvm/test/Transforms/InstCombine/force-opaque-ptr.ll
index ccc34e9134deb6..3b799e2fb2d0af 100644
--- a/llvm/test/Transforms/InstCombine/force-opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/force-opaque-ptr.ll
@@ -5,14 +5,14 @@
 
 define ptr @gep_constexpr_gv_1() {
 ; CHECK-LABEL: @gep_constexpr_gv_1(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([16 x i16], ptr @g, i64 0, i64 10)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @g, i64 20)
 ;
   ret ptr getelementptr([16 x i16], ptr @g, i64 0, i64 10)
 }
 
 define ptr @gep_constexpr_gv_2() {
 ; CHECK-LABEL: @gep_constexpr_gv_2(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([16 x i16], ptr @g, i64 0, i64 12)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @g, i64 24)
 ;
   ret ptr getelementptr(i32, ptr getelementptr([16 x i16], ptr @g, i64 0, i64 10), i64 1)
 }
diff --git a/llvm/test/Transforms/InstCombine/fortify-folding.ll b/llvm/test/Transforms/InstCombine/fortify-folding.ll
index a6b5dc90c3640a..988726c99edb79 100644
--- a/llvm/test/Transforms/InstCombine/fortify-folding.ll
+++ b/llvm/test/Transforms/InstCombine/fortify-folding.ll
@@ -39,7 +39,7 @@ define ptr @test_memccpy_tail() {
 define ptr @test_mempcpy() {
 ; CHECK-LABEL: @test_mempcpy(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(15) @a, ptr noundef nonnull align 1 dereferenceable(15) @b, i64 15, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i64 0, i64 15)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i64 15)
 ;
   %ret = call ptr @__mempcpy_chk(ptr @a, ptr @b, i64 15, i64 -1)
   ret ptr %ret
@@ -57,7 +57,7 @@ define ptr @test_not_mempcpy() {
 define ptr @test_mempcpy_tail() {
 ; CHECK-LABEL: @test_mempcpy_tail(
 ; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(15) @a, ptr noundef nonnull align 1 dereferenceable(15) @b, i64 15, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i64 0, i64 15)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i64 15)
 ;
   %ret = tail call ptr @__mempcpy_chk(ptr @a, ptr @b, i64 15, i64 -1)
   ret ptr %ret
diff --git a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll
index d9449e05612c38..e8eaf4e24f7e42 100644
--- a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll
+++ b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll
@@ -34,7 +34,7 @@ define ptr @test2(ptr %I) {
 define void @test3(i8 %B) {
 ; This should be turned into a constexpr instead of being an instruction
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    store i8 [[B:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @Global, i32 0, i32 4), align 1
+; CHECK-NEXT:    store i8 [[B:%.*]], ptr getelementptr inbounds (i8, ptr @Global, i32 4), align 1
 ; CHECK-NEXT:    ret void
 ;
   %A = getelementptr [10 x i8], ptr @Global, i32 0, i32 4
@@ -62,7 +62,7 @@ define void @test_evaluate_gep_nested_as_ptrs(ptr addrspace(2) %B) {
 
 define void @test_evaluate_gep_as_ptrs_array(ptr addrspace(2) %B) {
 ; CHECK-LABEL: @test_evaluate_gep_as_ptrs_array(
-; CHECK-NEXT:    store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) getelementptr inbounds ([4 x ptr addrspace(2)], ptr addrspace(1) @arst, i32 0, i32 2), align 8
+; CHECK-NEXT:    store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @arst, i32 16), align 8
 ; CHECK-NEXT:    ret void
 ;
 
@@ -168,7 +168,7 @@ define i32 @test10() {
 define i16 @constant_fold_custom_dl() {
 ; CHECK-LABEL: @constant_fold_custom_dl(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret i16 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr inbounds ([1000 x i8], ptr addrspace(1) @X_as1, i32 1, i32 0), i16 sub (i16 0, i16 ptrtoint (ptr addrspace(1) @X_as1 to i16))) to i16)
+; CHECK-NEXT:    ret i16 ptrtoint (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @X_as1, i32 1000), i16 sub (i16 0, i16 ptrtoint (ptr addrspace(1) @X_as1 to i16))) to i16)
 ;
 
 entry:
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index 04b0c196ab5108..307ed8d2b02ba9 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -63,7 +63,7 @@ define ptr @test4(ptr %I) {
 define void @test5(i8 %B) {
         ; This should be turned into a constexpr instead of being an instruction
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    store i8 [[B:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @Global, i64 0, i64 4), align 1
+; CHECK-NEXT:    store i8 [[B:%.*]], ptr getelementptr inbounds (i8, ptr @Global, i64 4), align 1
 ; CHECK-NEXT:    ret void
 ;
   %A = getelementptr [10 x i8], ptr @Global, i64 0, i64 4
@@ -74,7 +74,7 @@ define void @test5(i8 %B) {
 define void @test5_as1(i8 %B) {
         ; This should be turned into a constexpr instead of being an instruction
 ; CHECK-LABEL: @test5_as1(
-; CHECK-NEXT:    store i8 [[B:%.*]], ptr addrspace(1) getelementptr inbounds ([10 x i8], ptr addrspace(1) @Global_as1, i16 0, i16 4), align 1
+; CHECK-NEXT:    store i8 [[B:%.*]], ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Global_as1, i16 4), align 1
 ; CHECK-NEXT:    ret void
 ;
   %A = getelementptr [10 x i8], ptr addrspace(1) @Global_as1, i16 0, i16 4
@@ -102,7 +102,7 @@ define void @test_evaluate_gep_nested_as_ptrs(ptr addrspace(2) %B) {
 
 define void @test_evaluate_gep_as_ptrs_array(ptr addrspace(2) %B) {
 ; CHECK-LABEL: @test_evaluate_gep_as_ptrs_array(
-; CHECK-NEXT:    store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) getelementptr inbounds ([4 x ptr addrspace(2)], ptr addrspace(1) @arst, i16 0, i16 2), align 4
+; CHECK-NEXT:    store ptr addrspace(2) [[B:%.*]], ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @arst, i16 8), align 4
 ; CHECK-NEXT:    ret void
 ;
 
@@ -114,7 +114,7 @@ define void @test_evaluate_gep_as_ptrs_array(ptr addrspace(2) %B) {
 ; This should be turned into a constexpr instead of being an instruction
 define void @test_overaligned_vec(i8 %B) {
 ; CHECK-LABEL: @test_overaligned_vec(
-; CHECK-NEXT:    store i8 [[B:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @Global, i64 0, i64 2), align 1
+; CHECK-NEXT:    store i8 [[B:%.*]], ptr getelementptr inbounds (i8, ptr @Global, i64 2), align 1
 ; CHECK-NEXT:    ret void
 ;
   %A = getelementptr <2 x half>, ptr @Global, i64 0, i64 1
@@ -537,7 +537,7 @@ define i32 @test21() {
 
 define i1 @test22() {
 ; CHECK-LABEL: @test22(
-; CHECK-NEXT:    ret i1 icmp ult (ptr getelementptr inbounds (i32, ptr @A, i64 1), ptr getelementptr (i32, ptr @B, i64 2))
+; CHECK-NEXT:    ret i1 icmp ult (ptr getelementptr inbounds (i8, ptr @A, i64 4), ptr getelementptr (i8, ptr @B, i64 8))
 ;
   %C = icmp ult ptr getelementptr (i32, ptr @A, i64 1),
   getelementptr (i32, ptr @B, i64 2)
@@ -828,7 +828,7 @@ entry:
 
 define i32 @test35() nounwind {
 ; CHECK-LABEL: @test35(
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @"\01LC8", ptr nonnull getelementptr inbounds ([[T0:%.*]], ptr @s, i64 0, i32 1, i64 0)) #[[ATTR0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @"\01LC8", ptr nonnull getelementptr inbounds (i8, ptr @s, i64 8)) #[[ATTR0]]
 ; CHECK-NEXT:    ret i32 0
 ;
   call i32 (ptr, ...) @printf(ptr @"\01LC8",
@@ -839,7 +839,7 @@ define i32 @test35() nounwind {
 ; Don't treat signed offsets as unsigned.
 define ptr @test36() nounwind {
 ; CHECK-LABEL: @test36(
-; CHECK-NEXT:    ret ptr getelementptr ([11 x i8], ptr @array, i64 -1, i64 10)
+; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @array, i64 -1)
 ;
   ret ptr getelementptr ([11 x i8], ptr @array, i32 0, i64 -1)
 }
@@ -1377,14 +1377,14 @@ define ptr @gep_of_gep_multiuse_var_and_var(ptr %p, i64 %idx, i64 %idx2) {
 
 define ptr @const_gep_global_di_i8_smaller() {
 ; CHECK-LABEL: @const_gep_global_di_i8_smaller(
-; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @g_i32_di, i64 3)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @g_i32_di, i64 3)
 ;
   ret ptr getelementptr (i8, ptr @g_i32_di, i64 3)
 }
 
 define ptr @const_gep_global_di_i8_exact() {
 ; CHECK-LABEL: @const_gep_global_di_i8_exact(
-; CHECK-NEXT:    ret ptr getelementptr inbounds (i32, ptr @g_i32_di, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @g_i32_di, i64 4)
 ;
   ret ptr getelementptr (i8, ptr @g_i32_di, i64 4)
 }
@@ -1398,21 +1398,21 @@ define ptr @const_gep_global_di_i8_larger() {
 
 define ptr @const_gep_global_di_i64_larger() {
 ; CHECK-LABEL: @const_gep_global_di_i64_larger(
-; CHECK-NEXT:    ret ptr getelementptr (i32, ptr @g_i32_di, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @g_i32_di, i64 8)
 ;
   ret ptr getelementptr (i64, ptr @g_i32_di, i64 1)
 }
 
 define ptr @const_gep_global_e_smaller() {
 ; CHECK-LABEL: @const_gep_global_e_smaller(
-; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @g_i32_e, i64 3)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @g_i32_e, i64 3)
 ;
   ret ptr getelementptr (i8, ptr @g_i32_e, i64 3)
 }
 
 define ptr @const_gep_global_e_exact() {
 ; CHECK-LABEL: @const_gep_global_e_exact(
-; CHECK-NEXT:    ret ptr getelementptr inbounds (i32, ptr @g_i32_e, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @g_i32_e, i64 4)
 ;
   ret ptr getelementptr (i8, ptr @g_i32_e, i64 4)
 }
@@ -1433,7 +1433,7 @@ define ptr @const_gep_global_ew_smaller() {
 
 define ptr @const_gep_global_ew_exact() {
 ; CHECK-LABEL: @const_gep_global_ew_exact(
-; CHECK-NEXT:    ret ptr getelementptr (i32, ptr @g_i32_ew, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @g_i32_ew, i64 4)
 ;
   ret ptr getelementptr (i8, ptr @g_i32_ew, i64 4)
 }
@@ -1447,7 +1447,7 @@ define ptr @const_gep_global_ew_larger() {
 
 define ptr @const_gep_0xi8_global() {
 ; CHECK-LABEL: @const_gep_0xi8_global(
-; CHECK-NEXT:    ret ptr getelementptr ([0 x i8], ptr @g_0xi8_e, i64 0, i64 10)
+; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @g_0xi8_e, i64 10)
 ;
   ret ptr getelementptr ([0 x i8], ptr @g_0xi8_e, i64 0, i64 10)
 }
diff --git a/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll b/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll
index db2c8e2f22f6ee..d75dbcf9c9b9c2 100644
--- a/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll
+++ b/llvm/test/Transforms/InstCombine/hoist-xor-by-constant-from-xor-by-value.ll
@@ -94,7 +94,7 @@ entry:
 
 define i16 @constantexpr2() {
 ; CHECK-LABEL: @constantexpr2(
-; CHECK-NEXT:    [[I1:%.*]] = zext i1 icmp ne (ptr getelementptr inbounds ([6 x [1 x i64]], ptr @global_constant3, i64 0, i64 5, i64 0), ptr @global_constant4) to i16
+; CHECK-NEXT:    [[I1:%.*]] = zext i1 icmp ne (ptr getelementptr inbounds (i8, ptr @global_constant3, i64 40), ptr @global_constant4) to i16
 ; CHECK-NEXT:    [[I2:%.*]] = load ptr, ptr @global_constant5, align 1
 ; CHECK-NEXT:    [[I3:%.*]] = load i16, ptr [[I2]], align 1
 ; CHECK-NEXT:    [[I4:%.*]] = xor i16 [[I3]], [[I1]]
diff --git a/llvm/test/Transforms/InstCombine/loadstore-alignment.ll b/llvm/test/Transforms/InstCombine/loadstore-alignment.ll
index 1027468d6715e8..098f2eee52df04 100644
--- a/llvm/test/Transforms/InstCombine/loadstore-alignment.ll
+++ b/llvm/test/Transforms/InstCombine/loadstore-alignment.ll
@@ -9,7 +9,7 @@ target datalayout = "E-p:64:64:64-p1:64:64:64-p2:32:32:32-a0:0:8-f32:32:32-f64:6
 
 define <2 x i64> @static_hem() {
 ; CHECK-LABEL: @static_hem(
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i64>, ptr getelementptr (<2 x i64>, ptr @x, i64 7), align 1
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @x, i64 112), align 1
 ; CHECK-NEXT:    ret <2 x i64> [[L]]
 ;
   %t = getelementptr <2 x i64>, ptr @x, i32 7
@@ -66,7 +66,7 @@ define <2 x i64> @bar() {
 
 define void @static_hem_store(<2 x i64> %y) {
 ; CHECK-LABEL: @static_hem_store(
-; CHECK-NEXT:    store <2 x i64> [[Y:%.*]], ptr getelementptr (<2 x i64>, ptr @x, i64 7), align 1
+; CHECK-NEXT:    store <2 x i64> [[Y:%.*]], ptr getelementptr (i8, ptr @x, i64 112), align 1
 ; CHECK-NEXT:    ret void
 ;
   %t = getelementptr <2 x i64>, ptr @x, i32 7
diff --git a/llvm/test/Transforms/InstCombine/memchr-2.ll b/llvm/test/Transforms/InstCombine/memchr-2.ll
index 22aae6edcf9203..2e85fe4ad1de02 100644
--- a/llvm/test/Transforms/InstCombine/memchr-2.ll
+++ b/llvm/test/Transforms/InstCombine/memchr-2.ll
@@ -51,7 +51,7 @@ define ptr @fold_memchr_a12345_4_3() {
 
 define ptr @fold_memchr_a12345_3_3() {
 ; CHECK-LABEL: @fold_memchr_a12345_3_3(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 2)
 ;
 
   %res = call ptr @memchr(ptr @a12345, i32 3, i64 3)
@@ -63,7 +63,7 @@ define ptr @fold_memchr_a12345_3_3() {
 
 define ptr @fold_memchr_a12345_3_9() {
 ; CHECK-LABEL: @fold_memchr_a12345_3_9(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 2)
 ;
 
   %res = call ptr @memchr(ptr @a12345, i32 3, i64 9)
@@ -76,7 +76,7 @@ define ptr @fold_memchr_a12345_3_9() {
 
 define ptr @fold_memchr_a123f45_500_9() {
 ; CHECK-LABEL: @fold_memchr_a123f45_500_9(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a123f45, i64 0, i64 3)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a123f45, i64 3)
 ;
 
   %res = call ptr @memchr(ptr @a123f45, i32 500, i64 9)
@@ -89,7 +89,7 @@ define ptr @fold_memchr_a123f45_500_9() {
 define ptr @fold_a12345_3_n(i64 %n) {
 ; CHECK-LABEL: @fold_a12345_3_n(
 ; CHECK-NEXT:    [[MEMCHR_CMP:%.*]] = icmp ult i64 [[N:%.*]], 3
-; CHECK-NEXT:    [[RES:%.*]] = select i1 [[MEMCHR_CMP]], ptr null, ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 2)
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[MEMCHR_CMP]], ptr null, ptr getelementptr inbounds (i8, ptr @a12345, i64 2)
 ; CHECK-NEXT:    ret ptr [[RES]]
 ;
 
@@ -104,7 +104,7 @@ define ptr @fold_a12345_3_n(i64 %n) {
 define ptr @fold_a12345_259_n(i64 %n) {
 ; CHECK-LABEL: @fold_a12345_259_n(
 ; CHECK-NEXT:    [[MEMCHR_CMP:%.*]] = icmp ult i64 [[N:%.*]], 3
-; CHECK-NEXT:    [[RES:%.*]] = select i1 [[MEMCHR_CMP]], ptr null, ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 2)
+; CHECK-NEXT:    [[RES:%.*]] = select i1 [[MEMCHR_CMP]], ptr null, ptr getelementptr inbounds (i8, ptr @a12345, i64 2)
 ; CHECK-NEXT:    ret ptr [[RES]]
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/memchr-4.ll b/llvm/test/Transforms/InstCombine/memchr-4.ll
index 93884c73af62bc..9aec0f1dfe57ca 100644
--- a/llvm/test/Transforms/InstCombine/memchr-4.ll
+++ b/llvm/test/Transforms/InstCombine/memchr-4.ll
@@ -44,7 +44,7 @@ define ptr @call_memchr_ax_2_uimax_p2() {
 
 define ptr @fold_memchr_a12345_3_uimax_p2() {
 ; CHECK-LABEL: @fold_memchr_a12345_3_uimax_p2(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 2)
 ;
 
   %res = call ptr @memchr(ptr @a12345, i32 3, i64 4294967297)
diff --git a/llvm/test/Transforms/InstCombine/memchr-6.ll b/llvm/test/Transforms/InstCombine/memchr-6.ll
index 6243c464c6d3c4..28364a92f54dfe 100644
--- a/llvm/test/Transforms/InstCombine/memchr-6.ll
+++ b/llvm/test/Transforms/InstCombine/memchr-6.ll
@@ -69,7 +69,7 @@ define ptr @fold_memchr_a111122_c_n(i32 %C, i64 %N) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[N:%.*]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = and i1 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP4]], ptr getelementptr inbounds ([6 x i8], ptr @a111122, i64 0, i64 4), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP4]], ptr getelementptr inbounds (i8, ptr @a111122, i64 4), ptr null
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[N]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = and i1 [[TMP6]], [[TMP5]]
@@ -103,7 +103,7 @@ define ptr @call_memchr_a1110111_c_4(i32 %C) {
 ; CHECK-LABEL: @call_memchr_a1110111_c_4(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([7 x i8], ptr @a1110111, i64 0, i64 3), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a1110111, i64 3), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1
 ; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr @a1110111, ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
diff --git a/llvm/test/Transforms/InstCombine/memchr-7.ll b/llvm/test/Transforms/InstCombine/memchr-7.ll
index 50072b5ca1484d..0b364cce656d77 100644
--- a/llvm/test/Transforms/InstCombine/memchr-7.ll
+++ b/llvm/test/Transforms/InstCombine/memchr-7.ll
@@ -76,7 +76,7 @@ define ptr @memchr_no_zero_cmp2(i32 %c) {
 ; CHECK-LABEL: @memchr_no_zero_cmp2(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 10
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([2 x i8], ptr @.str.1, i64 0, i64 1), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @.str.1, i64 1), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 13
 ; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr @.str.1, ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
diff --git a/llvm/test/Transforms/InstCombine/memchr-8.ll b/llvm/test/Transforms/InstCombine/memchr-8.ll
index 0e878b77e40d70..b2ac2e6eda9a76 100644
--- a/llvm/test/Transforms/InstCombine/memchr-8.ll
+++ b/llvm/test/Transforms/InstCombine/memchr-8.ll
@@ -15,7 +15,7 @@ declare ptr @memrchr(ptr, i32, i64)
 
 define ptr @call_a_pi32max_p1() {
 ; CHECK-LABEL: @call_a_pi32max_p1(
-; CHECK-NEXT:    [[CHR:%.*]] = tail call ptr @memrchr(ptr noundef nonnull dereferenceable(2147483647) getelementptr inbounds (<{ i8, [4294967295 x i8] }>, ptr @a, i64 0, i32 1, i64 2147483647), i32 0, i64 2147483647)
+; CHECK-NEXT:    [[CHR:%.*]] = tail call ptr @memrchr(ptr noundef nonnull dereferenceable(2147483647) getelementptr inbounds (i8, ptr @a, i64 2147483648), i32 0, i64 2147483647)
 ; CHECK-NEXT:    ret ptr [[CHR]]
 ;
   %ptr = getelementptr <{ i8, [4294967295 x i8] }>, ptr @a, i32 0, i32 1, i32 2147483647
@@ -28,7 +28,7 @@ define ptr @call_a_pi32max_p1() {
 
 define ptr @call_a_pi32max() {
 ; CHECK-LABEL: @call_a_pi32max(
-; CHECK-NEXT:    [[CHR:%.*]] = tail call ptr @memrchr(ptr noundef nonnull dereferenceable(2147483647) getelementptr inbounds (<{ i8, [4294967295 x i8] }>, ptr @a, i64 0, i32 1, i64 2147483648), i32 0, i64 2147483647)
+; CHECK-NEXT:    [[CHR:%.*]] = tail call ptr @memrchr(ptr noundef nonnull dereferenceable(2147483647) getelementptr inbounds (i8, ptr @a, i64 2147483649), i32 0, i64 2147483647)
 ; CHECK-NEXT:    ret ptr [[CHR]]
 ;
   %ptr = getelementptr <{ i8, [4294967295 x i8] }>, ptr @a, i32 0, i32 1, i64 2147483648
@@ -42,7 +42,7 @@ define ptr @call_a_pi32max() {
 
 define ptr @call_a_pui32max() {
 ; CHECK-LABEL: @call_a_pui32max(
-; CHECK-NEXT:    [[CHR:%.*]] = tail call ptr @memrchr(ptr noundef nonnull dereferenceable(4294967295) getelementptr inbounds (<{ i8, [4294967295 x i8] }>, ptr @a, i64 0, i32 1, i64 0), i32 0, i64 4294967295)
+; CHECK-NEXT:    [[CHR:%.*]] = tail call ptr @memrchr(ptr noundef nonnull dereferenceable(4294967295) getelementptr inbounds (i8, ptr @a, i64 1), i32 0, i64 4294967295)
 ; CHECK-NEXT:    ret ptr [[CHR]]
 ;
   %ptr = getelementptr <{ i8, [4294967295 x i8] }>, ptr @a, i32 0, i32 1, i32 0
diff --git a/llvm/test/Transforms/InstCombine/memchr-9.ll b/llvm/test/Transforms/InstCombine/memchr-9.ll
index fe80c282eed54b..7a5e6c3f863cfc 100644
--- a/llvm/test/Transforms/InstCombine/memchr-9.ll
+++ b/llvm/test/Transforms/InstCombine/memchr-9.ll
@@ -24,19 +24,19 @@ define void @fold_memchr_A_pIb_cst_cst(ptr %pchr) {
 ; CHECK-NEXT:    [[PST_0_4_4:%.*]] = getelementptr i8, ptr [[PCHR]], i64 16
 ; CHECK-NEXT:    store ptr null, ptr [[PST_0_4_4]], align 8
 ; CHECK-NEXT:    [[PST_1_0_1:%.*]] = getelementptr i8, ptr [[PCHR]], i64 24
-; CHECK-NEXT:    store ptr getelementptr (i8, ptr @a, i64 1), ptr [[PST_1_0_1]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @a, i64 1), ptr [[PST_1_0_1]], align 8
 ; CHECK-NEXT:    [[PST_1_0_3:%.*]] = getelementptr i8, ptr [[PCHR]], i64 32
-; CHECK-NEXT:    store ptr getelementptr (i8, ptr @a, i64 1), ptr [[PST_1_0_3]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @a, i64 1), ptr [[PST_1_0_3]], align 8
 ; CHECK-NEXT:    [[PST_1_1_1:%.*]] = getelementptr i8, ptr [[PCHR]], i64 40
 ; CHECK-NEXT:    store ptr null, ptr [[PST_1_1_1]], align 8
 ; CHECK-NEXT:    [[PST_1_1_2:%.*]] = getelementptr i8, ptr [[PCHR]], i64 48
-; CHECK-NEXT:    store ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1), ptr [[PST_1_1_2]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @a, i64 2), ptr [[PST_1_1_2]], align 8
 ; CHECK-NEXT:    [[PST_1_3_3:%.*]] = getelementptr i8, ptr [[PCHR]], i64 56
 ; CHECK-NEXT:    store ptr null, ptr [[PST_1_3_3]], align 8
 ; CHECK-NEXT:    [[PST_1_3_4:%.*]] = getelementptr i8, ptr [[PCHR]], i64 64
 ; CHECK-NEXT:    store ptr null, ptr [[PST_1_3_4]], align 8
 ; CHECK-NEXT:    [[PST_1_3_6:%.*]] = getelementptr i8, ptr [[PCHR]], i64 80
-; CHECK-NEXT:    store ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 1), ptr [[PST_1_3_6]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @a, i64 6), ptr [[PST_1_3_6]], align 8
 ; CHECK-NEXT:    ret void
 ;
 
@@ -110,25 +110,25 @@ define void @fold_memchr_A_pIb_cst_N(i64 %N, ptr %pchr) {
 ; CHECK-NEXT:    store ptr [[CHR_0_0_N]], ptr [[PCHR:%.*]], align 8
 ; CHECK-NEXT:    [[PST_0_1_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 8
 ; CHECK-NEXT:    [[MEMCHR_CMP1:%.*]] = icmp ult i64 [[N]], 3
-; CHECK-NEXT:    [[CHR_0_1_N:%.*]] = select i1 [[MEMCHR_CMP1]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1)
+; CHECK-NEXT:    [[CHR_0_1_N:%.*]] = select i1 [[MEMCHR_CMP1]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 2)
 ; CHECK-NEXT:    store ptr [[CHR_0_1_N]], ptr [[PST_0_1_N]], align 8
 ; CHECK-NEXT:    [[PST_0_4_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 16
 ; CHECK-NEXT:    store ptr null, ptr [[PST_0_4_N]], align 8
 ; CHECK-NEXT:    [[PST_1_0_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 24
 ; CHECK-NEXT:    [[MEMCHR_CMP2:%.*]] = icmp eq i64 [[N]], 0
-; CHECK-NEXT:    [[CHR_1_0_N:%.*]] = select i1 [[MEMCHR_CMP2]], ptr null, ptr getelementptr (i8, ptr @a, i64 1)
+; CHECK-NEXT:    [[CHR_1_0_N:%.*]] = select i1 [[MEMCHR_CMP2]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 1)
 ; CHECK-NEXT:    store ptr [[CHR_1_0_N]], ptr [[PST_1_0_N]], align 8
 ; CHECK-NEXT:    [[PST_1_1_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 32
 ; CHECK-NEXT:    [[MEMCHR_CMP3:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-NEXT:    [[CHR_1_1_N:%.*]] = select i1 [[MEMCHR_CMP3]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1)
+; CHECK-NEXT:    [[CHR_1_1_N:%.*]] = select i1 [[MEMCHR_CMP3]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 2)
 ; CHECK-NEXT:    store ptr [[CHR_1_1_N]], ptr [[PST_1_1_N]], align 8
 ; CHECK-NEXT:    [[PST_1_2_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 40
 ; CHECK-NEXT:    [[MEMCHR_CMP4:%.*]] = icmp ult i64 [[N]], 4
-; CHECK-NEXT:    [[CHR_1_2_N:%.*]] = select i1 [[MEMCHR_CMP4]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 0)
+; CHECK-NEXT:    [[CHR_1_2_N:%.*]] = select i1 [[MEMCHR_CMP4]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 4)
 ; CHECK-NEXT:    store ptr [[CHR_1_2_N]], ptr [[PST_1_2_N]], align 8
 ; CHECK-NEXT:    [[PST_1_3_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 48
 ; CHECK-NEXT:    [[MEMCHR_CMP5:%.*]] = icmp ult i64 [[N]], 6
-; CHECK-NEXT:    [[CHR_1_3_N:%.*]] = select i1 [[MEMCHR_CMP5]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 1)
+; CHECK-NEXT:    [[CHR_1_3_N:%.*]] = select i1 [[MEMCHR_CMP5]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 6)
 ; CHECK-NEXT:    store ptr [[CHR_1_3_N]], ptr [[PST_1_3_N]], align 8
 ; CHECK-NEXT:    [[PST_1_4_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 56
 ; CHECK-NEXT:    store ptr null, ptr [[PST_1_4_N]], align 8
@@ -136,15 +136,15 @@ define void @fold_memchr_A_pIb_cst_N(i64 %N, ptr %pchr) {
 ; CHECK-NEXT:    store ptr null, ptr [[PST_2_0_N]], align 8
 ; CHECK-NEXT:    [[PST_2_1_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 72
 ; CHECK-NEXT:    [[MEMCHR_CMP6:%.*]] = icmp eq i64 [[N]], 0
-; CHECK-NEXT:    [[CHR_2_1_N:%.*]] = select i1 [[MEMCHR_CMP6]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1)
+; CHECK-NEXT:    [[CHR_2_1_N:%.*]] = select i1 [[MEMCHR_CMP6]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 2)
 ; CHECK-NEXT:    store ptr [[CHR_2_1_N]], ptr [[PST_2_1_N]], align 8
 ; CHECK-NEXT:    [[PST_2_2_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 80
 ; CHECK-NEXT:    [[MEMCHR_CMP7:%.*]] = icmp ult i64 [[N]], 3
-; CHECK-NEXT:    [[CHR_2_2_N:%.*]] = select i1 [[MEMCHR_CMP7]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 0)
+; CHECK-NEXT:    [[CHR_2_2_N:%.*]] = select i1 [[MEMCHR_CMP7]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 4)
 ; CHECK-NEXT:    store ptr [[CHR_2_2_N]], ptr [[PST_2_2_N]], align 8
 ; CHECK-NEXT:    [[PST_2_3_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 88
 ; CHECK-NEXT:    [[MEMCHR_CMP8:%.*]] = icmp ult i64 [[N]], 5
-; CHECK-NEXT:    [[CHR_2_3_N:%.*]] = select i1 [[MEMCHR_CMP8]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 1)
+; CHECK-NEXT:    [[CHR_2_3_N:%.*]] = select i1 [[MEMCHR_CMP8]], ptr null, ptr getelementptr inbounds (i8, ptr @a, i64 6)
 ; CHECK-NEXT:    store ptr [[CHR_2_3_N]], ptr [[PST_2_3_N]], align 8
 ; CHECK-NEXT:    [[PST_2_4_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 96
 ; CHECK-NEXT:    store ptr null, ptr [[PST_2_4_N]], align 8
@@ -230,13 +230,13 @@ define void @fold_memchr_A_pIb_cst_N(i64 %N, ptr %pchr) {
 
 define void @call_memchr_A_pIb_xs_cst(ptr %pchr) {
 ; CHECK-LABEL: @call_memchr_A_pIb_xs_cst(
-; CHECK-NEXT:    [[CHR_1_0_0_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([1 x %struct.A], ptr @a, i64 1, i64 0), i32 0, i64 2)
+; CHECK-NEXT:    [[CHR_1_0_0_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds (i8, ptr @a, i64 8), i32 0, i64 2)
 ; CHECK-NEXT:    store ptr [[CHR_1_0_0_2]], ptr [[PCHR:%.*]], align 8
 ; CHECK-NEXT:    [[PST_1_0_1_2:%.*]] = getelementptr i8, ptr [[PCHR]], i64 8
-; CHECK-NEXT:    [[CHR_1_0_1_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([1 x %struct.A], ptr @a, i64 1, i64 0), i32 0, i64 2)
+; CHECK-NEXT:    [[CHR_1_0_1_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds (i8, ptr @a, i64 8), i32 0, i64 2)
 ; CHECK-NEXT:    store ptr [[CHR_1_0_1_2]], ptr [[PST_1_0_1_2]], align 8
 ; CHECK-NEXT:    [[PST_0_0_8_2:%.*]] = getelementptr i8, ptr [[PCHR]], i64 16
-; CHECK-NEXT:    [[CHR_0_0_8_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([1 x %struct.A], ptr @a, i64 1, i64 0, i32 0, i64 0), i32 0, i64 2)
+; CHECK-NEXT:    [[CHR_0_0_8_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds (i8, ptr @a, i64 8), i32 0, i64 2)
 ; CHECK-NEXT:    store ptr [[CHR_0_0_8_2]], ptr [[PST_0_0_8_2]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -276,7 +276,7 @@ define void @call_memchr_A_pIb_xs_cst(ptr %pchr) {
 
 define ptr @fold_memchr_gep_gep_gep() {
 ; CHECK-LABEL: @fold_memchr_gep_gep_gep(
-; CHECK-NEXT:    ret ptr getelementptr (i16, ptr getelementptr (i32, ptr getelementptr inbounds ([2 x i64], ptr @ai64, i64 0, i64 1), i64 1), i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @ai64, i64 14)
 ;
 
   %p8_1 = getelementptr [2 x i64], ptr @ai64, i64 0, i64 1
@@ -297,10 +297,10 @@ define ptr @fold_memchr_gep_gep_gep() {
 
 define ptr @fold_memchr_union_member() {
 ; BE-CHECK-LABEL: @fold_memchr_union_member(
-; BE-CHECK-NEXT:    ret ptr getelementptr (i8, ptr @u, i64 5)
+; BE-CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @u, i64 5)
 ;
 ; LE-CHECK-LABEL: @fold_memchr_union_member(
-; LE-CHECK-NEXT:    ret ptr getelementptr inbounds ([[UNION_U:%.*]], ptr @u, i64 0, i32 0, i64 1)
+; LE-CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @u, i64 4)
 ;
   %pi8u_p1 = getelementptr i8, ptr @u, i64 1
   %pc = call ptr @memchr(ptr %pi8u_p1, i32 34, i64 8)
diff --git a/llvm/test/Transforms/InstCombine/memchr.ll b/llvm/test/Transforms/InstCombine/memchr.ll
index 2074fd7ba4f722..08435a5e0388e5 100644
--- a/llvm/test/Transforms/InstCombine/memchr.ll
+++ b/llvm/test/Transforms/InstCombine/memchr.ll
@@ -17,7 +17,7 @@ declare ptr @memchr(ptr, i32, i32)
 
 define void @test1() {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 6), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 6), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
   %dst = call ptr @memchr(ptr @hello, i32 119, i32 14)
@@ -37,7 +37,7 @@ define void @test2() {
 
 define void @test3() {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 13), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 13), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
   %dst = call ptr @memchr(ptr @hello, i32 0, i32 14)
@@ -58,7 +58,7 @@ define void @test4(i32 %chr) {
 
 define void @test5() {
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 13), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 13), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
   %dst = call ptr @memchr(ptr @hello, i32 65280, i32 14)
@@ -68,7 +68,7 @@ define void @test5() {
 
 define void @test6() {
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 6), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 6), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 ; Overflow, but we still find the right thing.
@@ -90,7 +90,7 @@ define void @test7() {
 
 define void @test8() {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hellonull, i32 0, i32 6), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hellonull, i32 6), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
   %dst = call ptr @memchr(ptr @hellonull, i32 119, i32 14)
@@ -100,7 +100,7 @@ define void @test8() {
 
 define void @test9() {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hellonull, i32 0, i32 6), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hellonull, i32 6), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
   %str = getelementptr [14 x i8], ptr @hellonull, i32 0, i32 2
diff --git a/llvm/test/Transforms/InstCombine/memcmp-8.ll b/llvm/test/Transforms/InstCombine/memcmp-8.ll
index a3759914ad4f47..2bc1efad5c77a2 100644
--- a/llvm/test/Transforms/InstCombine/memcmp-8.ll
+++ b/llvm/test/Transforms/InstCombine/memcmp-8.ll
@@ -42,7 +42,7 @@ define i32 @fold_memcmp_a5pi_a5p5_n(i32 %i, i64 %n) {
 ; CHECK-LABEL: @fold_memcmp_a5pi_a5p5_n(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64
 ; CHECK-NEXT:    [[PA5_PI:%.*]] = getelementptr [5 x i8], ptr @a5, i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    [[CMP:%.*]] = call i32 @memcmp(ptr [[PA5_PI]], ptr nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), i64 [[N:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = call i32 @memcmp(ptr [[PA5_PI]], ptr nonnull getelementptr inbounds (i8, ptr @a5, i64 5), i64 [[N:%.*]])
 ; CHECK-NEXT:    ret i32 [[CMP]]
 ;
   %pa5_pi = getelementptr [5 x i8], ptr @a5, i32 0, i32 %i
diff --git a/llvm/test/Transforms/InstCombine/memcpy-from-global.ll b/llvm/test/Transforms/InstCombine/memcpy-from-global.ll
index e9ff34735f1cf1..34e6c601f494a5 100644
--- a/llvm/test/Transforms/InstCombine/memcpy-from-global.ll
+++ b/llvm/test/Transforms/InstCombine/memcpy-from-global.ll
@@ -220,7 +220,7 @@ define void @test7() {
 define void @test8() {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:    [[AL:%.*]] = alloca [[U:%.*]], align 16
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(20) [[AL]], ptr noundef nonnull align 4 dereferenceable(20) getelementptr inbounds ([2 x %U], ptr @H, i64 0, i64 1), i64 20, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(20) [[AL]], ptr noundef nonnull align 4 dereferenceable(20) getelementptr inbounds (i8, ptr @H, i64 20), i64 20, i1 false)
 ; CHECK-NEXT:    call void @bar(ptr nonnull [[AL]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
@@ -234,7 +234,7 @@ define void @test8() {
 define void @test8_addrspacecast() {
 ; CHECK-LABEL: @test8_addrspacecast(
 ; CHECK-NEXT:    [[AL:%.*]] = alloca [[U:%.*]], align 16
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p1.i64(ptr noundef nonnull align 16 dereferenceable(20) [[AL]], ptr addrspace(1) noundef align 4 dereferenceable(20) addrspacecast (ptr getelementptr inbounds ([2 x %U], ptr @H, i64 0, i64 1) to ptr addrspace(1)), i64 20, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p1.i64(ptr noundef nonnull align 16 dereferenceable(20) [[AL]], ptr addrspace(1) noundef align 4 dereferenceable(20) addrspacecast (ptr getelementptr inbounds (i8, ptr @H, i64 20) to ptr addrspace(1)), i64 20, i1 false)
 ; CHECK-NEXT:    call void @bar(ptr nonnull [[AL]]) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
@@ -246,7 +246,7 @@ define void @test8_addrspacecast() {
 
 define void @test9() {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    call void @bar(ptr nonnull getelementptr inbounds ([2 x %U], ptr @H, i64 0, i64 1)) #[[ATTR3]]
+; CHECK-NEXT:    call void @bar(ptr nonnull getelementptr inbounds (i8, ptr @H, i64 20)) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
   %A = alloca %U, align 4
@@ -257,7 +257,7 @@ define void @test9() {
 
 define void @test9_addrspacecast() {
 ; CHECK-LABEL: @test9_addrspacecast(
-; CHECK-NEXT:    call void @bar(ptr nonnull getelementptr inbounds ([2 x %U], ptr @H, i64 0, i64 1)) #[[ATTR3]]
+; CHECK-NEXT:    call void @bar(ptr nonnull getelementptr inbounds (i8, ptr @H, i64 20)) #[[ATTR3]]
 ; CHECK-NEXT:    ret void
 ;
   %A = alloca %U, align 4
diff --git a/llvm/test/Transforms/InstCombine/memrchr-3.ll b/llvm/test/Transforms/InstCombine/memrchr-3.ll
index ca122e5b7deab7..d3619432c0d834 100644
--- a/llvm/test/Transforms/InstCombine/memrchr-3.ll
+++ b/llvm/test/Transforms/InstCombine/memrchr-3.ll
@@ -98,7 +98,7 @@ define ptr @fold_memrchr_ax_c_1(i32 %C) {
 
 define ptr @fold_memrchr_a12345_5_5() {
 ; CHECK-LABEL: @fold_memrchr_a12345_5_5(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 4)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 4)
 ;
 
   %ret = call ptr @memrchr(ptr @a12345, i32 5, i64 5)
@@ -122,7 +122,7 @@ define ptr @fold_memrchr_a12345_5_4() {
 
 define ptr @fold_memrchr_a12345_4_5() {
 ; CHECK-LABEL: @fold_memrchr_a12345_4_5(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 3)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 3)
 ;
 
   %ret = call ptr @memrchr(ptr @a12345, i32 4, i64 5)
@@ -147,7 +147,7 @@ define ptr @fold_memrchr_a12345p1_1_4() {
 
 define ptr @fold_memrchr_a12345p1_2_4() {
 ; CHECK-LABEL: @fold_memrchr_a12345p1_2_4(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 1)
 ;
 
   %ptr = getelementptr [5 x i8], ptr @a12345, i32 0, i32 1
@@ -160,7 +160,7 @@ define ptr @fold_memrchr_a12345p1_2_4() {
 
 define ptr @fold_memrchr_a12345_2_5() {
 ; CHECK-LABEL: @fold_memrchr_a12345_2_5(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a12345, i64 1)
 ;
 
   %ret = call ptr @memrchr(ptr @a12345, i32 2, i64 5)
@@ -185,7 +185,7 @@ define ptr @fold_memrchr_a12345_0_n(i64 %N) {
 define ptr @fold_memrchr_a12345_3_n(i64 %n) {
 ; CHECK-LABEL: @fold_memrchr_a12345_3_n(
 ; CHECK-NEXT:    [[MEMRCHR_CMP:%.*]] = icmp ult i64 [[N:%.*]], 3
-; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[MEMRCHR_CMP]], ptr null, ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 2)
+; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[MEMRCHR_CMP]], ptr null, ptr getelementptr inbounds (i8, ptr @a12345, i64 2)
 ; CHECK-NEXT:    ret ptr [[MEMRCHR_SEL]]
 ;
 
@@ -199,7 +199,7 @@ define ptr @fold_memrchr_a12345_3_n(i64 %n) {
 define ptr @fold_memrchr_a12345_5_n(i64 %n) {
 ; CHECK-LABEL: @fold_memrchr_a12345_5_n(
 ; CHECK-NEXT:    [[MEMRCHR_CMP:%.*]] = icmp ult i64 [[N:%.*]], 5
-; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[MEMRCHR_CMP]], ptr null, ptr getelementptr inbounds ([5 x i8], ptr @a12345, i64 0, i64 4)
+; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[MEMRCHR_CMP]], ptr null, ptr getelementptr inbounds (i8, ptr @a12345, i64 4)
 ; CHECK-NEXT:    ret ptr [[MEMRCHR_SEL]]
 ;
 
@@ -212,7 +212,7 @@ define ptr @fold_memrchr_a12345_5_n(i64 %n) {
 
 define ptr @fold_memrchr_a123123_3_5() {
 ; CHECK-LABEL: @fold_memrchr_a123123_3_5(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([6 x i8], ptr @a123123, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a123123, i64 2)
 ;
 
   %ret = call ptr @memrchr(ptr @a123123, i32 3, i64 5)
@@ -224,7 +224,7 @@ define ptr @fold_memrchr_a123123_3_5() {
 
 define ptr @fold_memrchr_a123123_3_6() {
 ; CHECK-LABEL: @fold_memrchr_a123123_3_6(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([6 x i8], ptr @a123123, i64 0, i64 5)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a123123, i64 5)
 ;
 
   %ret = call ptr @memrchr(ptr @a123123, i32 3, i64 6)
@@ -235,7 +235,7 @@ define ptr @fold_memrchr_a123123_3_6() {
 
 define ptr @fold_memrchr_a123123_2_6() {
 ; CHECK-LABEL: @fold_memrchr_a123123_2_6(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([6 x i8], ptr @a123123, i64 0, i64 4)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a123123, i64 4)
 ;
 
   %ret = call ptr @memrchr(ptr @a123123, i32 2, i64 6)
@@ -246,7 +246,7 @@ define ptr @fold_memrchr_a123123_2_6() {
 
 define ptr @fold_memrchr_a123123_1_6() {
 ; CHECK-LABEL: @fold_memrchr_a123123_1_6(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([6 x i8], ptr @a123123, i64 0, i64 3)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a123123, i64 3)
 ;
 
   %ret = call ptr @memrchr(ptr @a123123, i32 1, i64 6)
diff --git a/llvm/test/Transforms/InstCombine/memrchr-4.ll b/llvm/test/Transforms/InstCombine/memrchr-4.ll
index 1e57a3b93595ed..708b4417a7df93 100644
--- a/llvm/test/Transforms/InstCombine/memrchr-4.ll
+++ b/llvm/test/Transforms/InstCombine/memrchr-4.ll
@@ -16,7 +16,7 @@ define ptr @fold_memrchr_a11111_c_5(i32 %C) {
 ; CHECK-LABEL: @fold_memrchr_a11111_c_5(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 1
-; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([5 x i8], ptr @a11111, i64 0, i64 4), ptr null
+; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a11111, i64 4), ptr null
 ; CHECK-NEXT:    ret ptr [[MEMRCHR_SEL]]
 ;
 
@@ -51,7 +51,7 @@ define ptr @fold_memrchr_a1110111_c_3(i32 %C) {
 ; CHECK-LABEL: @fold_memrchr_a1110111_c_3(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 1
-; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([7 x i8], ptr @a1110111, i64 0, i64 2), ptr null
+; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a1110111, i64 2), ptr null
 ; CHECK-NEXT:    ret ptr [[MEMRCHR_SEL]]
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll
index 9c5bf3cb5a41bf..fbf58d47a32d20 100644
--- a/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll
+++ b/llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll
@@ -34,9 +34,9 @@ define void @_Z4testv() {
 ; CHECK-NEXT:    store i16 [[I4]], ptr @arr_4, align 2
 ; CHECK-NEXT:    [[I8:%.*]] = sext i16 [[I4]] to i32
 ; CHECK-NEXT:    store i32 [[I8]], ptr @arr_3, align 4
-; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr getelementptr inbounds ([0 x i32], ptr @arr_2, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i16 [[I4]], ptr getelementptr inbounds ([0 x i16], ptr @arr_4, i64 0, i64 1), align 2
-; CHECK-NEXT:    store i32 [[I8]], ptr getelementptr inbounds ([8 x i32], ptr @arr_3, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i32 [[STOREMERGE]], ptr getelementptr inbounds (i8, ptr @arr_2, i64 4), align 4
+; CHECK-NEXT:    store i16 [[I4]], ptr getelementptr inbounds (i8, ptr @arr_4, i64 2), align 2
+; CHECK-NEXT:    store i32 [[I8]], ptr getelementptr inbounds (i8, ptr @arr_3, i64 4), align 4
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/InstCombine/objsize.ll b/llvm/test/Transforms/InstCombine/objsize.ll
index 33c14f44fc5fba..9a3391d91bab75 100644
--- a/llvm/test/Transforms/InstCombine/objsize.ll
+++ b/llvm/test/Transforms/InstCombine/objsize.ll
@@ -64,7 +64,7 @@ define i1 @baz() nounwind {
 define void @test1(ptr %q, i32 %x) nounwind noinline {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.objectsize.i32.p0(ptr getelementptr inbounds ([0 x i8], ptr @window, i32 0, i32 10), i1 false, i1 false, i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.objectsize.i32.p0(ptr getelementptr inbounds (i8, ptr @window, i32 10), i1 false, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[TMP1]], label %"47", label %"46"
 ; CHECK:       "46":
@@ -112,7 +112,7 @@ define void @test3(i1 %c1, ptr %ptr1, ptr %ptr2, ptr %ptr3) nounwind {
 ; CHECK:       bb11:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       bb12:
-; CHECK-NEXT:    [[TMP0:%.*]] = call ptr @__inline_memcpy_chk(ptr nonnull getelementptr inbounds ([480 x float], ptr @array, i32 0, i32 1), ptr [[PTR3:%.*]], i32 512) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call ptr @__inline_memcpy_chk(ptr nonnull getelementptr inbounds (i8, ptr @array, i32 4), ptr [[PTR3:%.*]], i32 512) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:
diff --git a/llvm/test/Transforms/InstCombine/pr25342.ll b/llvm/test/Transforms/InstCombine/pr25342.ll
index 2f85f99c4ce003..271d69b141ddd5 100644
--- a/llvm/test/Transforms/InstCombine/pr25342.ll
+++ b/llvm/test/Transforms/InstCombine/pr25342.ll
@@ -17,9 +17,9 @@ define void @_Z3fooi(i32 signext %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr @dd, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds (%"struct.std::complex", ptr @dd, i64 0, i32 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr getelementptr inbounds (i8, ptr @dd, i64 4), align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr @dd2, align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr getelementptr inbounds (%"struct.std::complex", ptr @dd2, i64 0, i32 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr getelementptr inbounds (i8, ptr @dd2, i64 4), align 4
 ; CHECK-NEXT:    [[MUL_I:%.*]] = fmul float [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[MUL4_I:%.*]] = fmul float [[TMP3]], [[TMP5]]
 ; CHECK-NEXT:    [[SUB_I:%.*]] = fsub float [[MUL_I]], [[MUL4_I]]
@@ -32,7 +32,7 @@ define void @_Z3fooi(i32 signext %n) {
 ; CHECK-NEXT:    br label [[FOR_COND]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    store float [[TMP0]], ptr @dd, align 4
-; CHECK-NEXT:    store float [[TMP1]], ptr getelementptr inbounds (%"struct.std::complex", ptr @dd, i64 0, i32 0, i32 1), align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr getelementptr inbounds (i8, ptr @dd, i64 4), align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -84,9 +84,9 @@ define void @multi_phi(i32 signext %n) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr @dd, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr getelementptr inbounds (%"struct.std::complex", ptr @dd, i64 0, i32 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr getelementptr inbounds (i8, ptr @dd, i64 4), align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr @dd2, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds (%"struct.std::complex", ptr @dd2, i64 0, i32 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr getelementptr inbounds (i8, ptr @dd2, i64 4), align 4
 ; CHECK-NEXT:    [[MUL_I:%.*]] = fmul float [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[MUL4_I:%.*]] = fmul float [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[SUB_I:%.*]] = fsub float [[MUL_I]], [[MUL4_I]]
diff --git a/llvm/test/Transforms/InstCombine/pr33453.ll b/llvm/test/Transforms/InstCombine/pr33453.ll
index 45f87b75300601..23a232dd0b9a48 100644
--- a/llvm/test/Transforms/InstCombine/pr33453.ll
+++ b/llvm/test/Transforms/InstCombine/pr33453.ll
@@ -6,7 +6,7 @@
 
 define float @patatino() {
 ; CHECK-LABEL: @patatino(
-; CHECK-NEXT:    [[FMUL:%.*]] = uitofp i1 mul (i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1), i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1)) to float
+; CHECK-NEXT:    [[FMUL:%.*]] = uitofp i1 mul (i1 icmp eq (ptr getelementptr inbounds (i8, ptr @g2, i64 2), ptr @g1), i1 icmp eq (ptr getelementptr inbounds (i8, ptr @g2, i64 2), ptr @g1)) to float
 ; CHECK-NEXT:    ret float [[FMUL]]
 ;
   %uitofp1 = uitofp i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1) to float
diff --git a/llvm/test/Transforms/InstCombine/pr38984-inseltpoison.ll b/llvm/test/Transforms/InstCombine/pr38984-inseltpoison.ll
index 6613514c77549c..92f55b211b6390 100644
--- a/llvm/test/Transforms/InstCombine/pr38984-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/pr38984-inseltpoison.ll
@@ -26,7 +26,7 @@ define <4 x i1> @PR38984_2() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @offsets, align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr getelementptr inbounds ([21 x i16], ptr @a, i16 1, i16 0), <4 x i16> [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr getelementptr inbounds (i8, ptr @a, i16 42), <4 x i16> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16, ptr null, <4 x i16> [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x ptr> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret <4 x i1> [[TMP4]]
diff --git a/llvm/test/Transforms/InstCombine/pr38984.ll b/llvm/test/Transforms/InstCombine/pr38984.ll
index c148765fce59f3..a7eddcfbe0845d 100644
--- a/llvm/test/Transforms/InstCombine/pr38984.ll
+++ b/llvm/test/Transforms/InstCombine/pr38984.ll
@@ -26,7 +26,7 @@ define <4 x i1> @PR38984_2() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @offsets, align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 poison>, i16 [[TMP0]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr getelementptr inbounds ([21 x i16], ptr @a, i16 1, i16 0), <4 x i16> [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr getelementptr inbounds (i8, ptr @a, i16 42), <4 x i16> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16, ptr null, <4 x i16> [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x ptr> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret <4 x i1> [[TMP4]]
diff --git a/llvm/test/Transforms/InstCombine/pr83947.ll b/llvm/test/Transforms/InstCombine/pr83947.ll
index c1d601ff637183..63a242abc9252a 100644
--- a/llvm/test/Transforms/InstCombine/pr83947.ll
+++ b/llvm/test/Transforms/InstCombine/pr83947.ll
@@ -6,7 +6,7 @@
 
 define void @masked_scatter1() {
 ; CHECK-LABEL: define void @masked_scatter1() {
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr @c, i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c), i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr @c, i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 icmp eq (ptr getelementptr inbounds (i8, ptr @b, i64 4), ptr @c), i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> splat (ptr @c), i32 4, <vscale x 4 x i1> splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c)))
@@ -59,7 +59,7 @@ define void @masked_scatter6() {
 
 define void @masked_scatter7() {
 ; CHECK-LABEL: define void @masked_scatter7() {
-; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> <ptr @c, ptr @c>, i32 4, <2 x i1> <i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c), i1 icmp eq (ptr getelementptr inbounds (i32, ptr @b, i64 1), ptr @c)>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> <ptr @c, ptr @c>, i32 4, <2 x i1> <i1 icmp eq (ptr getelementptr inbounds (i8, ptr @b, i64 4), ptr @c), i1 icmp eq (ptr getelementptr inbounds (i8, ptr @b, i64 4), ptr @c)>)
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 icmp eq (ptr getelementptr (i32, ptr @b, i64 1), ptr @c)))
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index c783b101251d54..7c65a93a00436e 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -15,7 +15,7 @@ define i8 @remove_alloca_use_arg(i1 %cond) {
 ; CHECK:       else:
 ; CHECK-NEXT:    br label [[SINK]]
 ; CHECK:       sink:
-; CHECK-NEXT:    [[PTR1:%.*]] = phi ptr [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 2), [[IF]] ], [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 1), [[ELSE]] ]
+; CHECK-NEXT:    [[PTR1:%.*]] = phi ptr [ getelementptr inbounds (i8, ptr @g1, i64 2), [[IF]] ], [ getelementptr inbounds (i8, ptr @g1, i64 1), [[ELSE]] ]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[PTR1]], align 1
 ; CHECK-NEXT:    ret i8 [[LOAD]]
 ;
@@ -114,7 +114,7 @@ define i8 @loop_phi_remove_alloca(i1 %cond) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[BB_0:%.*]]
 ; CHECK:       bb.0:
-; CHECK-NEXT:    [[PTR1:%.*]] = phi ptr [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 1), [[ENTRY:%.*]] ], [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 2), [[BB_1:%.*]] ]
+; CHECK-NEXT:    [[PTR1:%.*]] = phi ptr [ getelementptr inbounds (i8, ptr @g1, i64 1), [[ENTRY:%.*]] ], [ getelementptr inbounds (i8, ptr @g1, i64 2), [[BB_1:%.*]] ]
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB_1]], label [[EXIT:%.*]]
 ; CHECK:       bb.1:
 ; CHECK-NEXT:    br label [[BB_0]]
@@ -171,7 +171,7 @@ define i8 @loop_phi_late_memtransfer_remove_alloca(i1 %cond) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[BB_0:%.*]]
 ; CHECK:       bb.0:
-; CHECK-NEXT:    [[PTR1:%.*]] = phi ptr [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 1), [[ENTRY:%.*]] ], [ getelementptr inbounds ([32 x i8], ptr @g1, i64 0, i64 2), [[BB_1:%.*]] ]
+; CHECK-NEXT:    [[PTR1:%.*]] = phi ptr [ getelementptr inbounds (i8, ptr @g1, i64 1), [[ENTRY:%.*]] ], [ getelementptr inbounds (i8, ptr @g1, i64 2), [[BB_1:%.*]] ]
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB_1]], label [[EXIT:%.*]]
 ; CHECK:       bb.1:
 ; CHECK-NEXT:    br label [[BB_0]]
@@ -288,7 +288,7 @@ define i32 @addrspace_diff_remove_alloca(i1 %cond) {
 ; CHECK:       if:
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    [[PHI1:%.*]] = phi ptr addrspace(1) [ @g2, [[IF]] ], [ getelementptr inbounds ([32 x i8], ptr addrspace(1) @g2, i64 0, i64 2), [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[PHI1:%.*]] = phi ptr addrspace(1) [ @g2, [[IF]] ], [ getelementptr inbounds (i8, ptr addrspace(1) @g2, i64 2), [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr addrspace(1) [[PHI1]], align 4
 ; CHECK-NEXT:    ret i32 [[V]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/rem.ll b/llvm/test/Transforms/InstCombine/rem.ll
index ae390e72a4b730..a8fa72c37d32e2 100644
--- a/llvm/test/Transforms/InstCombine/rem.ll
+++ b/llvm/test/Transforms/InstCombine/rem.ll
@@ -522,7 +522,7 @@ define i32 @pr27968_0(i1 %c0, ptr %p) {
 ; CHECK-NEXT:    [[V:%.*]] = load volatile i32, ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    br i1 icmp eq (ptr getelementptr inbounds ([5 x i16], ptr @a, i64 0, i64 4), ptr @b), label [[REM_IS_SAFE:%.*]], label [[REM_IS_UNSAFE:%.*]]
+; CHECK-NEXT:    br i1 icmp eq (ptr getelementptr inbounds (i8, ptr @a, i64 8), ptr @b), label [[REM_IS_SAFE:%.*]], label [[REM_IS_UNSAFE:%.*]]
 ; CHECK:       rem.is.safe:
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       rem.is.unsafe:
@@ -591,7 +591,7 @@ define i32 @pr27968_2(i1 %c0, ptr %p) {
 ; CHECK-NEXT:    [[V:%.*]] = load volatile i32, ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    br i1 icmp eq (ptr getelementptr inbounds ([5 x i16], ptr @a, i64 0, i64 4), ptr @b), label [[REM_IS_SAFE:%.*]], label [[REM_IS_UNSAFE:%.*]]
+; CHECK-NEXT:    br i1 icmp eq (ptr getelementptr inbounds (i8, ptr @a, i64 8), ptr @b), label [[REM_IS_SAFE:%.*]], label [[REM_IS_UNSAFE:%.*]]
 ; CHECK:       rem.is.safe:
 ; CHECK-NEXT:    ret i32 0
 ; CHECK:       rem.is.unsafe:
diff --git a/llvm/test/Transforms/InstCombine/select-and-or.ll b/llvm/test/Transforms/InstCombine/select-and-or.ll
index 0f7acd4d56c064..0965e1c8348e75 100644
--- a/llvm/test/Transforms/InstCombine/select-and-or.ll
+++ b/llvm/test/Transforms/InstCombine/select-and-or.ll
@@ -431,7 +431,7 @@ define i1 @not_false_not_use3(i1 %x, i1 %y) {
 define i1 @demorgan_select_infloop1(i1 %L) {
 ; CHECK-LABEL: @demorgan_select_infloop1(
 ; CHECK-NEXT:    [[NOT_L:%.*]] = xor i1 [[L:%.*]], true
-; CHECK-NEXT:    [[C15:%.*]] = select i1 [[NOT_L]], i1 xor (i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1), i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1)), i1 false
+; CHECK-NEXT:    [[C15:%.*]] = select i1 [[NOT_L]], i1 xor (i1 icmp eq (ptr getelementptr inbounds (i8, ptr @g2, i64 2), ptr @g1), i1 icmp eq (ptr getelementptr inbounds (i8, ptr @g2, i64 2), ptr @g1)), i1 false
 ; CHECK-NEXT:    ret i1 [[C15]]
 ;
   %not.L = xor i1 %L, true
@@ -443,7 +443,7 @@ define i1 @demorgan_select_infloop1(i1 %L) {
 define i1 @demorgan_select_infloop2(i1 %L) {
 ; CHECK-LABEL: @demorgan_select_infloop2(
 ; CHECK-NEXT:    [[NOT_L:%.*]] = xor i1 [[L:%.*]], true
-; CHECK-NEXT:    [[C15:%.*]] = select i1 [[NOT_L]], i1 true, i1 xor (i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1), i1 icmp eq (ptr getelementptr inbounds (i16, ptr @g2, i64 1), ptr @g1))
+; CHECK-NEXT:    [[C15:%.*]] = select i1 [[NOT_L]], i1 true, i1 xor (i1 icmp eq (ptr getelementptr inbounds (i8, ptr @g2, i64 2), ptr @g1), i1 icmp eq (ptr getelementptr inbounds (i8, ptr @g2, i64 2), ptr @g1))
 ; CHECK-NEXT:    ret i1 [[C15]]
 ;
   %not.L = xor i1 %L, true
diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls-i16.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls-i16.ll
index 2ac9e2996c4f72..9a08b6b5cf9f70 100644
--- a/llvm/test/Transforms/InstCombine/simplify-libcalls-i16.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-libcalls-i16.ll
@@ -29,11 +29,11 @@ define void @foo(ptr %P, ptr %X) {
 
 define ptr @test1() {
 ; CHECK32-LABEL: @test1(
-; CHECK32-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds ([5 x i8], ptr @str, i32 0, i32 2), i16 103)
+; CHECK32-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds (i8, ptr @str, i32 2), i16 103)
 ; CHECK32-NEXT:    ret ptr [[TMP3]]
 ;
 ; CHECK16-LABEL: @test1(
-; CHECK16-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @str, i32 0, i32 3)
+; CHECK16-NEXT:    ret ptr getelementptr inbounds (i8, ptr @str, i32 3)
 ;
   %tmp3 = tail call ptr @strchr( ptr getelementptr ([5 x i8], ptr @str, i32 0, i16 2), i16 103 )              ; <ptr> [#uses=1]
   ret ptr %tmp3
@@ -45,11 +45,11 @@ declare ptr @strchr(ptr, i16)
 
 define ptr @test2() {
 ; CHECK32-LABEL: @test2(
-; CHECK32-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds ([8 x i8], ptr @str1, i32 0, i32 2), i16 0)
+; CHECK32-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds (i8, ptr @str1, i32 2), i16 0)
 ; CHECK32-NEXT:    ret ptr [[TMP3]]
 ;
 ; CHECK16-LABEL: @test2(
-; CHECK16-NEXT:    ret ptr getelementptr inbounds ([8 x i8], ptr @str1, i32 0, i32 7)
+; CHECK16-NEXT:    ret ptr getelementptr inbounds (i8, ptr @str1, i32 7)
 ;
   %tmp3 = tail call ptr @strchr( ptr getelementptr ([8 x i8], ptr @str1, i32 0, i32 2), i16 0 )               ; <ptr> [#uses=1]
   ret ptr %tmp3
@@ -58,7 +58,7 @@ define ptr @test2() {
 define ptr @test3() {
 ; CHECK32-LABEL: @test3(
 ; CHECK32-NEXT:  entry:
-; CHECK32-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds ([5 x i8], ptr @str2, i32 0, i32 1), i16 80)
+; CHECK32-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds (i8, ptr @str2, i32 1), i16 80)
 ; CHECK32-NEXT:    ret ptr [[TMP3]]
 ;
 ; CHECK16-LABEL: @test3(
diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls.ll
index 5ebb497ee765ff..bb2728a103ec6c 100644
--- a/llvm/test/Transforms/InstCombine/simplify-libcalls.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-libcalls.ll
@@ -29,10 +29,10 @@ define void @foo(ptr %P, ptr %X) {
 
 define ptr @test1() {
 ; CHECK32-LABEL: @test1(
-; CHECK32-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @str, i32 0, i32 3)
+; CHECK32-NEXT:    ret ptr getelementptr inbounds (i8, ptr @str, i32 3)
 ;
 ; CHECK16-LABEL: @test1(
-; CHECK16-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds ([5 x i8], ptr @str, i32 0, i32 2), i32 103)
+; CHECK16-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds (i8, ptr @str, i32 2), i32 103)
 ; CHECK16-NEXT:    ret ptr [[TMP3]]
 ;
   %tmp3 = tail call ptr @strchr( ptr getelementptr ([5 x i8], ptr @str, i32 0, i32 2), i32 103 )              ; <ptr> [#uses=1]
@@ -45,10 +45,10 @@ declare ptr @strchr(ptr, i32)
 
 define ptr @test2() {
 ; CHECK32-LABEL: @test2(
-; CHECK32-NEXT:    ret ptr getelementptr inbounds ([8 x i8], ptr @str1, i32 0, i32 7)
+; CHECK32-NEXT:    ret ptr getelementptr inbounds (i8, ptr @str1, i32 7)
 ;
 ; CHECK16-LABEL: @test2(
-; CHECK16-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds ([8 x i8], ptr @str1, i32 0, i32 2), i32 0)
+; CHECK16-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds (i8, ptr @str1, i32 2), i32 0)
 ; CHECK16-NEXT:    ret ptr [[TMP3]]
 ;
   %tmp3 = tail call ptr @strchr( ptr getelementptr ([8 x i8], ptr @str1, i32 0, i32 2), i32 0 )               ; <ptr> [#uses=1]
@@ -62,7 +62,7 @@ define ptr @test3() {
 ;
 ; CHECK16-LABEL: @test3(
 ; CHECK16-NEXT:  entry:
-; CHECK16-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds ([5 x i8], ptr @str2, i32 0, i32 1), i32 80)
+; CHECK16-NEXT:    [[TMP3:%.*]] = tail call ptr @strchr(ptr nonnull getelementptr inbounds (i8, ptr @str2, i32 1), i32 80)
 ; CHECK16-NEXT:    ret ptr [[TMP3]]
 ;
 entry:
diff --git a/llvm/test/Transforms/InstCombine/snprintf-2.ll b/llvm/test/Transforms/InstCombine/snprintf-2.ll
index 46694e0764a02f..0465457aacec75 100644
--- a/llvm/test/Transforms/InstCombine/snprintf-2.ll
+++ b/llvm/test/Transforms/InstCombine/snprintf-2.ll
@@ -21,54 +21,54 @@ declare i32 @snprintf(ptr, i64, ptr, ...)
 
 define void @fold_snprintf_fmt() {
 ; BE-LABEL: @fold_snprintf_fmt(
-; BE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2147483647), align 8
+; BE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 17179869176), align 8
 ; BE-NEXT:    store i32 825373440, ptr [[PDIMAX]], align 1
 ; BE-NEXT:    store i32 3, ptr @asiz, align 4
-; BE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 5), align 8
+; BE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 40), align 8
 ; BE-NEXT:    store i32 825373440, ptr [[PD5]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 5), align 4
-; BE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 4), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 20), align 4
+; BE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 32), align 8
 ; BE-NEXT:    store i32 825373440, ptr [[PD4]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 4), align 4
-; BE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 3), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 16), align 4
+; BE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 24), align 8
 ; BE-NEXT:    store i16 12594, ptr [[PD3]], align 1
 ; BE-NEXT:    [[ENDPTR:%.*]] = getelementptr inbounds i8, ptr [[PD3]], i64 2
 ; BE-NEXT:    store i8 0, ptr [[ENDPTR]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 3), align 4
-; BE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 12), align 4
+; BE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 16), align 8
 ; BE-NEXT:    store i8 49, ptr [[PD2]], align 1
 ; BE-NEXT:    [[ENDPTR1:%.*]] = getelementptr inbounds i8, ptr [[PD2]], i64 1
 ; BE-NEXT:    store i8 0, ptr [[ENDPTR1]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 2), align 4
-; BE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 8), align 4
+; BE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; BE-NEXT:    store i8 0, ptr [[PD1]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 4), align 4
 ; BE-NEXT:    store i32 3, ptr @asiz, align 4
 ; BE-NEXT:    ret void
 ;
 ; LE-LABEL: @fold_snprintf_fmt(
-; LE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2147483647), align 8
+; LE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 17179869176), align 8
 ; LE-NEXT:    store i32 3355185, ptr [[PDIMAX]], align 1
 ; LE-NEXT:    store i32 3, ptr @asiz, align 4
-; LE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 5), align 8
+; LE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 40), align 8
 ; LE-NEXT:    store i32 3355185, ptr [[PD5]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 5), align 4
-; LE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 4), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 20), align 4
+; LE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 32), align 8
 ; LE-NEXT:    store i32 3355185, ptr [[PD4]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 4), align 4
-; LE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 3), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 16), align 4
+; LE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 24), align 8
 ; LE-NEXT:    store i16 12849, ptr [[PD3]], align 1
 ; LE-NEXT:    [[ENDPTR:%.*]] = getelementptr inbounds i8, ptr [[PD3]], i64 2
 ; LE-NEXT:    store i8 0, ptr [[ENDPTR]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 3), align 4
-; LE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 12), align 4
+; LE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 16), align 8
 ; LE-NEXT:    store i8 49, ptr [[PD2]], align 1
 ; LE-NEXT:    [[ENDPTR1:%.*]] = getelementptr inbounds i8, ptr [[PD2]], i64 1
 ; LE-NEXT:    store i8 0, ptr [[ENDPTR1]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 2), align 4
-; LE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 8), align 4
+; LE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; LE-NEXT:    store i8 0, ptr [[PD1]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 4), align 4
 ; LE-NEXT:    store i32 3, ptr @asiz, align 4
 ; LE-NEXT:    ret void
 ;
@@ -111,9 +111,9 @@ define void @fold_snprintf_fmt() {
 
 define void @call_snprintf_fmt_ximax() {
 ; ANY-LABEL: @call_snprintf_fmt_ximax(
-; ANY-NEXT:    [[PDM1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; ANY-NEXT:    [[PDM1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; ANY-NEXT:    [[NM1:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, ptr nonnull @s)
-; ANY-NEXT:    store i32 [[NM1]], ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
+; ANY-NEXT:    store i32 [[NM1]], ptr getelementptr (i8, ptr @asiz, i64 4), align 4
 ; ANY-NEXT:    [[PDIMAXP1:%.*]] = load ptr, ptr @adst, align 8
 ; ANY-NEXT:    [[NIMAXP1:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, ptr nonnull @s)
 ; ANY-NEXT:    store i32 [[NIMAXP1]], ptr @asiz, align 4
diff --git a/llvm/test/Transforms/InstCombine/snprintf-3.ll b/llvm/test/Transforms/InstCombine/snprintf-3.ll
index 0332aa71ad6481..7c93580b4ea546 100644
--- a/llvm/test/Transforms/InstCombine/snprintf-3.ll
+++ b/llvm/test/Transforms/InstCombine/snprintf-3.ll
@@ -22,54 +22,54 @@ declare i32 @snprintf(ptr, i64, ptr, ...)
 
 define void @fold_snprintf_pcnt_s() {
 ; BE-LABEL: @fold_snprintf_pcnt_s(
-; BE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2147483647), align 8
+; BE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 17179869176), align 8
 ; BE-NEXT:    store i32 825373440, ptr [[PDIMAX]], align 1
 ; BE-NEXT:    store i32 3, ptr @asiz, align 4
-; BE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 5), align 8
+; BE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 40), align 8
 ; BE-NEXT:    store i32 825373440, ptr [[PD5]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 5), align 4
-; BE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 4), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 20), align 4
+; BE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 32), align 8
 ; BE-NEXT:    store i32 825373440, ptr [[PD4]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 4), align 4
-; BE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 3), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 16), align 4
+; BE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 24), align 8
 ; BE-NEXT:    store i16 12594, ptr [[PD3]], align 1
 ; BE-NEXT:    [[ENDPTR:%.*]] = getelementptr inbounds i8, ptr [[PD3]], i64 2
 ; BE-NEXT:    store i8 0, ptr [[ENDPTR]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 3), align 4
-; BE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 12), align 4
+; BE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 16), align 8
 ; BE-NEXT:    store i8 49, ptr [[PD2]], align 1
 ; BE-NEXT:    [[ENDPTR1:%.*]] = getelementptr inbounds i8, ptr [[PD2]], i64 1
 ; BE-NEXT:    store i8 0, ptr [[ENDPTR1]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 2), align 4
-; BE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 8), align 4
+; BE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; BE-NEXT:    store i8 0, ptr [[PD1]], align 1
-; BE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
+; BE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 4), align 4
 ; BE-NEXT:    store i32 3, ptr @asiz, align 4
 ; BE-NEXT:    ret void
 ;
 ; LE-LABEL: @fold_snprintf_pcnt_s(
-; LE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2147483647), align 8
+; LE-NEXT:    [[PDIMAX:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 17179869176), align 8
 ; LE-NEXT:    store i32 3355185, ptr [[PDIMAX]], align 1
 ; LE-NEXT:    store i32 3, ptr @asiz, align 4
-; LE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 5), align 8
+; LE-NEXT:    [[PD5:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 40), align 8
 ; LE-NEXT:    store i32 3355185, ptr [[PD5]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 5), align 4
-; LE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 4), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 20), align 4
+; LE-NEXT:    [[PD4:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 32), align 8
 ; LE-NEXT:    store i32 3355185, ptr [[PD4]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 4), align 4
-; LE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 3), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 16), align 4
+; LE-NEXT:    [[PD3:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 24), align 8
 ; LE-NEXT:    store i16 12849, ptr [[PD3]], align 1
 ; LE-NEXT:    [[ENDPTR:%.*]] = getelementptr inbounds i8, ptr [[PD3]], i64 2
 ; LE-NEXT:    store i8 0, ptr [[ENDPTR]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 3), align 4
-; LE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 12), align 4
+; LE-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 16), align 8
 ; LE-NEXT:    store i8 49, ptr [[PD2]], align 1
 ; LE-NEXT:    [[ENDPTR1:%.*]] = getelementptr inbounds i8, ptr [[PD2]], i64 1
 ; LE-NEXT:    store i8 0, ptr [[ENDPTR1]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 2), align 4
-; LE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 8), align 4
+; LE-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; LE-NEXT:    store i8 0, ptr [[PD1]], align 1
-; LE-NEXT:    store i32 3, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
+; LE-NEXT:    store i32 3, ptr getelementptr (i8, ptr @asiz, i64 4), align 4
 ; LE-NEXT:    store i32 3, ptr @asiz, align 4
 ; LE-NEXT:    ret void
 ;
@@ -112,9 +112,9 @@ define void @fold_snprintf_pcnt_s() {
 
 define void @call_snprintf_pcnt_s_ximax() {
 ; ANY-LABEL: @call_snprintf_pcnt_s_ximax(
-; ANY-NEXT:    [[PDM1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; ANY-NEXT:    [[PDM1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; ANY-NEXT:    [[NM1:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, ptr nonnull @pcnt_s, ptr nonnull @s)
-; ANY-NEXT:    store i32 [[NM1]], ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
+; ANY-NEXT:    store i32 [[NM1]], ptr getelementptr (i8, ptr @asiz, i64 4), align 4
 ; ANY-NEXT:    [[PDIMAXP1:%.*]] = load ptr, ptr @adst, align 8
 ; ANY-NEXT:    [[NIMAXP1:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, ptr nonnull @pcnt_s, ptr nonnull @s)
 ; ANY-NEXT:    store i32 [[NIMAXP1]], ptr @asiz, align 4
diff --git a/llvm/test/Transforms/InstCombine/snprintf-4.ll b/llvm/test/Transforms/InstCombine/snprintf-4.ll
index 4536a6d8817ee6..7006838ae9b58a 100644
--- a/llvm/test/Transforms/InstCombine/snprintf-4.ll
+++ b/llvm/test/Transforms/InstCombine/snprintf-4.ll
@@ -24,29 +24,29 @@ define void @fold_snprintf_pcnt_c(i32 %c) {
 ; CHECK-NEXT:    [[NUL:%.*]] = getelementptr inbounds i8, ptr [[PDIMAX]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[NUL]], align 1
 ; CHECK-NEXT:    store i32 1, ptr @asiz, align 4
-; CHECK-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; CHECK-NEXT:    [[PD2:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; CHECK-NEXT:    store i8 2, ptr [[PD2]], align 1
 ; CHECK-NEXT:    [[NUL1:%.*]] = getelementptr inbounds i8, ptr [[PD2]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[NUL1]], align 1
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[PD2_0:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @asiz, i64 4), align 4
+; CHECK-NEXT:    [[PD2_0:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 16), align 8
 ; CHECK-NEXT:    store i8 0, ptr [[PD2_0]], align 1
 ; CHECK-NEXT:    [[NUL2:%.*]] = getelementptr inbounds i8, ptr [[PD2_0]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[NUL2]], align 1
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 2), align 4
-; CHECK-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 3), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @asiz, i64 8), align 4
+; CHECK-NEXT:    [[PD1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 24), align 8
 ; CHECK-NEXT:    store i8 0, ptr [[PD1]], align 1
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 3), align 4
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 4), align 4
-; CHECK-NEXT:    [[PD2_C:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 4), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @asiz, i64 12), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @asiz, i64 16), align 4
+; CHECK-NEXT:    [[PD2_C:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 32), align 8
 ; CHECK-NEXT:    [[CHAR:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    store i8 [[CHAR]], ptr [[PD2_C]], align 1
 ; CHECK-NEXT:    [[NUL3:%.*]] = getelementptr inbounds i8, ptr [[PD2_C]], i64 1
 ; CHECK-NEXT:    store i8 0, ptr [[NUL3]], align 1
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 4), align 4
-; CHECK-NEXT:    [[PD1_C:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 5), align 8
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @asiz, i64 16), align 4
+; CHECK-NEXT:    [[PD1_C:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 40), align 8
 ; CHECK-NEXT:    store i8 0, ptr [[PD1_C]], align 1
-; CHECK-NEXT:    store i32 1, ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 5), align 4
+; CHECK-NEXT:    store i32 1, ptr getelementptr (i8, ptr @asiz, i64 20), align 4
 ; CHECK-NEXT:    ret void
 ;
 
@@ -100,12 +100,12 @@ define void @call_snprintf_pcnt_c_ximax(i32 %c) {
 ; CHECK-NEXT:    [[PDM1:%.*]] = load ptr, ptr @adst, align 8
 ; CHECK-NEXT:    [[NM1:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDM1]], i64 -1, ptr nonnull @pcnt_c, i8 0)
 ; CHECK-NEXT:    store i32 [[NM1]], ptr @asiz, align 4
-; CHECK-NEXT:    [[PDIMAXP1:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 1), align 8
+; CHECK-NEXT:    [[PDIMAXP1:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 8), align 8
 ; CHECK-NEXT:    [[NIMAXP1:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDIMAXP1]], i64 2147483648, ptr nonnull @pcnt_c, i8 1)
-; CHECK-NEXT:    store i32 [[NIMAXP1]], ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 1), align 4
-; CHECK-NEXT:    [[PDM1SL32:%.*]] = load ptr, ptr getelementptr ([0 x ptr], ptr @adst, i64 0, i64 2), align 8
+; CHECK-NEXT:    store i32 [[NIMAXP1]], ptr getelementptr (i8, ptr @asiz, i64 4), align 4
+; CHECK-NEXT:    [[PDM1SL32:%.*]] = load ptr, ptr getelementptr (i8, ptr @adst, i64 16), align 8
 ; CHECK-NEXT:    [[NM1SL32:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr noundef nonnull dereferenceable(1) [[PDM1SL32]], i64 -4294967296, ptr nonnull @pcnt_c, i8 1)
-; CHECK-NEXT:    store i32 [[NM1SL32]], ptr getelementptr ([0 x i32], ptr @asiz, i64 0, i64 2), align 4
+; CHECK-NEXT:    store i32 [[NM1SL32]], ptr getelementptr (i8, ptr @asiz, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/stpcpy-1.ll b/llvm/test/Transforms/InstCombine/stpcpy-1.ll
index 86691a08a79803..2ddacb20974426 100644
--- a/llvm/test/Transforms/InstCombine/stpcpy-1.ll
+++ b/llvm/test/Transforms/InstCombine/stpcpy-1.ll
@@ -16,7 +16,7 @@ declare ptr @stpcpy(ptr, ptr)
 define ptr @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(6) @a, ptr noundef nonnull align 1 dereferenceable(6) @hello, i32 6, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([32 x i8], ptr @a, i32 0, i32 5)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 5)
 ;
   %ret = call ptr @stpcpy(ptr @a, ptr @hello)
   ret ptr %ret
@@ -62,7 +62,7 @@ define ptr @test_no_simplify2(ptr %dst, ptr %src) {
 define ptr @test_no_incompatible_attr() {
 ; CHECK-LABEL: @test_no_incompatible_attr(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(6) @a, ptr noundef nonnull align 1 dereferenceable(6) @hello, i32 6, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([32 x i8], ptr @a, i32 0, i32 5)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 5)
 ;
   %ret = call dereferenceable(1) ptr @stpcpy(ptr @a, ptr @hello)
   ret ptr %ret
diff --git a/llvm/test/Transforms/InstCombine/stpcpy_chk-1.ll b/llvm/test/Transforms/InstCombine/stpcpy_chk-1.ll
index 5ebd9fae76201d..2d775f35c8bda4 100644
--- a/llvm/test/Transforms/InstCombine/stpcpy_chk-1.ll
+++ b/llvm/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -15,7 +15,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 define ptr @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(12) @a, ptr noundef nonnull align 1 dereferenceable(12) @.str, i32 12, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i32 0, i32 11)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 11)
 ;
 
   %ret = call ptr @__stpcpy_chk(ptr @a, ptr @.str, i32 60)
@@ -25,7 +25,7 @@ define ptr @test_simplify1() {
 define ptr @test_simplify2() {
 ; CHECK-LABEL: @test_simplify2(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(12) @a, ptr noundef nonnull align 1 dereferenceable(12) @.str, i32 12, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i32 0, i32 11)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 11)
 ;
 
   %ret = call ptr @__stpcpy_chk(ptr @a, ptr @.str, i32 12)
@@ -35,7 +35,7 @@ define ptr @test_simplify2() {
 define ptr @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(12) @a, ptr noundef nonnull align 1 dereferenceable(12) @.str, i32 12, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i32 0, i32 11)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 11)
 ;
 
   %ret = call ptr @__stpcpy_chk(ptr @a, ptr @.str, i32 -1)
@@ -45,7 +45,7 @@ define ptr @test_simplify3() {
 define ptr @test_simplify1_tail() {
 ; CHECK-LABEL: @test_simplify1_tail(
 ; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(12) @a, ptr noundef nonnull align 1 dereferenceable(12) @.str, i32 12, i1 false)
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i32 0, i32 11)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 11)
 ;
 
   %ret = tail call ptr @__stpcpy_chk(ptr @a, ptr @.str, i32 60)
@@ -80,7 +80,7 @@ define ptr @test_simplify5() {
 ; CHECK-LABEL: @test_simplify5(
 ; CHECK-NEXT:    [[LEN:%.*]] = call i32 @llvm.objectsize.i32.p0(ptr @a, i1 false, i1 false, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @__memcpy_chk(ptr nonnull @a, ptr nonnull @.str, i32 12, i32 [[LEN]])
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([60 x i8], ptr @a, i32 0, i32 11)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a, i32 11)
 ;
 
   %len = call i32 @llvm.objectsize.i32.p0(ptr @a, i1 false, i1 false, i1 false)
diff --git a/llvm/test/Transforms/InstCombine/stpncpy-1.ll b/llvm/test/Transforms/InstCombine/stpncpy-1.ll
index 15eee6c1019353..0a4caa2c05f934 100644
--- a/llvm/test/Transforms/InstCombine/stpncpy-1.ll
+++ b/llvm/test/Transforms/InstCombine/stpncpy-1.ll
@@ -28,18 +28,18 @@ declare void @sink(ptr, ptr)
 ; to D + strnlen(D, N) or, equivalently, D + (*D != '\0'), when N < 2.
 
 ;.
-; ANY: @[[A4:[a-zA-Z0-9_$"\\.-]+]] = constant [4 x i8] c"1234"
-; ANY: @[[S4:[a-zA-Z0-9_$"\\.-]+]] = constant [5 x i8] c"1234\00"
-; ANY: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private constant [4 x i8] c"4\00\00\00"
-; ANY: @[[STR_1:[a-zA-Z0-9_$"\\.-]+]] = private constant [10 x i8] c"4\00\00\00\00\00\00\00\00\00"
-; ANY: @[[STR_2:[a-zA-Z0-9_$"\\.-]+]] = private constant [10 x i8] c"1234\00\00\00\00\00\00"
-; ANY: @[[STR_3:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [4 x i8] c"4\00\00\00", align 1
-; ANY: @[[STR_4:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [10 x i8] c"4\00\00\00\00\00\00\00\00\00", align 1
-; ANY: @[[STR_5:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [10 x i8] c"1234\00\00\00\00\00\00", align 1
-; ANY: @[[STR_6:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [4 x i8] c"4\00\00\00", align 1
-; ANY: @[[STR_7:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [10 x i8] c"4\00\00\00\00\00\00\00\00\00", align 1
-; ANY: @[[STR_8:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [10 x i8] c"1234\00\00\00\00\00\00", align 1
-; ANY: @[[STR_9:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr constant [10 x i8] c"1234\00\00\00\00\00\00", align 1
+; ANY: @a4 = constant [4 x i8] c"1234"
+; ANY: @s4 = constant [5 x i8] c"1234\00"
+; ANY: @str = private constant [4 x i8] c"4\00\00\00"
+; ANY: @str.1 = private constant [10 x i8] c"4\00\00\00\00\00\00\00\00\00"
+; ANY: @str.2 = private constant [10 x i8] c"1234\00\00\00\00\00\00"
+; ANY: @str.3 = private unnamed_addr constant [4 x i8] c"4\00\00\00", align 1
+; ANY: @str.4 = private unnamed_addr constant [10 x i8] c"4\00\00\00\00\00\00\00\00\00", align 1
+; ANY: @str.5 = private unnamed_addr constant [10 x i8] c"1234\00\00\00\00\00\00", align 1
+; ANY: @str.6 = private unnamed_addr constant [4 x i8] c"4\00\00\00", align 1
+; ANY: @str.7 = private unnamed_addr constant [10 x i8] c"4\00\00\00\00\00\00\00\00\00", align 1
+; ANY: @str.8 = private unnamed_addr constant [10 x i8] c"1234\00\00\00\00\00\00", align 1
+; ANY: @str.9 = private unnamed_addr constant [10 x i8] c"1234\00\00\00\00\00\00", align 1
 ;.
 define void @fold_stpncpy_overlap(ptr %dst, i64 %n) {
 ; ANY-LABEL: @fold_stpncpy_overlap(
@@ -273,11 +273,11 @@ define void @fold_stpncpy_s4(ptr %dst, i64 %n) {
 
 define void @call_stpncpy_xx_n(ptr %dst, i64 %n) {
 ; ANY-LABEL: @call_stpncpy_xx_n(
-; ANY-NEXT:    [[EA1_N:%.*]] = call ptr @stpncpy(ptr [[DST:%.*]], ptr nonnull dereferenceable(2) getelementptr inbounds ([4 x i8], ptr @a4, i64 0, i64 3), i64 [[N:%.*]])
+; ANY-NEXT:    [[EA1_N:%.*]] = call ptr @stpncpy(ptr [[DST:%.*]], ptr nonnull dereferenceable(2) getelementptr inbounds (i8, ptr @a4, i64 3), i64 [[N:%.*]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], ptr [[EA1_N]])
 ; ANY-NEXT:    [[EA4_N:%.*]] = call ptr @stpncpy(ptr [[DST]], ptr nonnull dereferenceable(5) @a4, i64 [[N]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], ptr [[EA4_N]])
-; ANY-NEXT:    [[ES1_N:%.*]] = call ptr @stpncpy(ptr [[DST]], ptr nonnull dereferenceable(2) getelementptr inbounds ([5 x i8], ptr @s4, i64 0, i64 3), i64 [[N]])
+; ANY-NEXT:    [[ES1_N:%.*]] = call ptr @stpncpy(ptr [[DST]], ptr nonnull dereferenceable(2) getelementptr inbounds (i8, ptr @s4, i64 3), i64 [[N]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], ptr [[ES1_N]])
 ; ANY-NEXT:    [[ES4_N:%.*]] = call ptr @stpncpy(ptr [[DST]], ptr nonnull dereferenceable(5) @s4, i64 [[N]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], ptr [[ES4_N]])
@@ -448,6 +448,9 @@ define void @call_stpncpy_s(ptr %dst, ptr %src, i64 %n) {
   ret void
 }
 ;.
-; ANY: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
-; ANY: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+; BE: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; BE: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+;.
+; LE: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+; LE: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
 ;.
diff --git a/llvm/test/Transforms/InstCombine/str-int-2.ll b/llvm/test/Transforms/InstCombine/str-int-2.ll
index a34714365e2181..ae67422d120787 100644
--- a/llvm/test/Transforms/InstCombine/str-int-2.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-2.ll
@@ -44,7 +44,7 @@ define i64 @strtol_hex() #0 {
 
 define i64 @strtol_endptr_not_null(ptr nonnull %pend) {
 ; CHECK-LABEL: @strtol_endptr_not_null(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x i8], ptr @.str, i64 0, i64 2), ptr [[PEND:%.*]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @.str, i64 2), ptr [[PEND:%.*]], align 8
 ; CHECK-NEXT:    ret i64 12
 ;
   %call = call i64 @strtol(ptr @.str, ptr %pend, i32 10)
diff --git a/llvm/test/Transforms/InstCombine/str-int-3.ll b/llvm/test/Transforms/InstCombine/str-int-3.ll
index f319a16d211f9a..100f1a95b13595 100644
--- a/llvm/test/Transforms/InstCombine/str-int-3.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-3.ll
@@ -66,9 +66,9 @@ define void @fold_atoi_member(ptr %pi) {
 
 define void @fold_atoi_offset_out_of_bounds(ptr %pi) {
 ; CHECK-LABEL: @fold_atoi_offset_out_of_bounds(
-; CHECK-NEXT:    [[IA_0_0_32:%.*]] = call i32 @atoi(ptr nocapture nonnull getelementptr inbounds ([2 x %struct.A], ptr @a, i64 1, i64 0, i32 0, i64 0))
+; CHECK-NEXT:    [[IA_0_0_32:%.*]] = call i32 @atoi(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a, i64 32))
 ; CHECK-NEXT:    store i32 [[IA_0_0_32]], ptr [[PI:%.*]], align 4
-; CHECK-NEXT:    [[IA_0_0_33:%.*]] = call i32 @atoi(ptr nocapture getelementptr ([2 x %struct.A], ptr @a, i64 1, i64 0, i32 0, i64 1))
+; CHECK-NEXT:    [[IA_0_0_33:%.*]] = call i32 @atoi(ptr nocapture getelementptr (i8, ptr @a, i64 33))
 ; CHECK-NEXT:    store i32 [[IA_0_0_33]], ptr [[PI]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/str-int-4.ll b/llvm/test/Transforms/InstCombine/str-int-4.ll
index 6efc5fb4ed1fbb..9173e122f8dd8b 100644
--- a/llvm/test/Transforms/InstCombine/str-int-4.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-4.ll
@@ -42,39 +42,39 @@ declare i64 @strtoll(ptr, ptr, i32)
 
 define void @fold_strtol(ptr %ps) {
 ; CHECK-LABEL: @fold_strtol(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_im123, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    store i32 -123, ptr [[PS:%.*]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_ip234, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 4
 ; CHECK-NEXT:    store i32 234, ptr [[PS1]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x i8], ptr @i0, i64 0, i64 2), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i0, i64 2), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i32 0, ptr [[PS2]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x i8], ptr @i9, i64 0, i64 2), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i9, i64 2), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT:    store i32 9, ptr [[PS3]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x i8], ptr @ia, i64 0, i64 2), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ia, i64 2), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i32 10, ptr [[PS4]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([7 x i8], ptr @i19azAZ, i64 0, i64 6), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i19azAZ, i64 6), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 20
 ; CHECK-NEXT:    store i32 76095035, ptr [[PS5]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @i32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS6]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([15 x i8], ptr @mo32min, i64 0, i64 14), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @mo32min, i64 14), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 28
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS7]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @mx32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @mx32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS8]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @mx32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @mx32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 36
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS9]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([12 x i8], ptr @i32max, i64 0, i64 11), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32max, i64 11), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT:    store i32 2147483647, ptr [[PS10]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([12 x i8], ptr @x32max, i64 0, i64 11), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @x32max, i64 11), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS11:%.*]] = getelementptr i8, ptr [[PS]], i64 44
 ; CHECK-NEXT:    store i32 2147483647, ptr [[PS11]], align 4
 ; CHECK-NEXT:    ret void
@@ -181,7 +181,7 @@ define void @call_strtol(ptr %ps) {
 ; CHECK-NEXT:    [[NWS:%.*]] = call i32 @strtol(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS11:%.*]] = getelementptr i8, ptr [[PS]], i64 44
 ; CHECK-NEXT:    store i32 [[NWS]], ptr [[PS11]], align 4
-; CHECK-NEXT:    [[NWSP6:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
+; CHECK-NEXT:    [[NWSP6:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds (i8, ptr @ws, i64 6), ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS12:%.*]] = getelementptr i8, ptr [[PS]], i64 48
 ; CHECK-NEXT:    store i32 [[NWSP6]], ptr [[PS12]], align 4
 ; CHECK-NEXT:    [[I0B1:%.*]] = call i32 @strtol(ptr nonnull @i0, ptr nonnull @endptr, i32 1)
@@ -287,15 +287,15 @@ define void @call_strtol(ptr %ps) {
 
 define void @fold_strtoll(ptr %ps) {
 ; CHECK-LABEL: @fold_strtoll(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_im123, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    store i64 -123, ptr [[PS:%.*]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_ip234, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i64 234, ptr [[PS1]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([22 x i8], ptr @i64min, i64 0, i64 21), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i64min, i64 21), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i64 -9223372036854775808, ptr [[PS2]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([21 x i8], ptr @i64max, i64 0, i64 20), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i64max, i64 20), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i64 9223372036854775807, ptr [[PS3]], align 4
 ; CHECK-NEXT:    ret void
@@ -335,7 +335,7 @@ define void @call_strtoll(ptr %ps) {
 ; CHECK-NEXT:    [[NWS:%.*]] = call i64 @strtoll(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i64 [[NWS]], ptr [[PS2]], align 4
-; CHECK-NEXT:    [[NWSP6:%.*]] = call i64 @strtoll(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
+; CHECK-NEXT:    [[NWSP6:%.*]] = call i64 @strtoll(ptr nonnull getelementptr inbounds (i8, ptr @ws, i64 6), ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i64 [[NWSP6]], ptr [[PS3]], align 4
 ; CHECK-NEXT:    ret void
@@ -375,10 +375,10 @@ define void @call_strtol_trailing_space(ptr %ps) {
 ; CHECK-NEXT:    [[N1:%.*]] = call i32 @strtol(ptr nonnull @i_1_2_3_, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS1:%.*]] = getelementptr i8, ptr [[PS:%.*]], i64 4
 ; CHECK-NEXT:    store i32 [[N1]], ptr [[PS1]], align 4
-; CHECK-NEXT:    [[N2:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds ([9 x i8], ptr @i_1_2_3_, i64 0, i64 2), ptr nonnull @endptr, i32 10)
+; CHECK-NEXT:    [[N2:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds (i8, ptr @i_1_2_3_, i64 2), ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i32 [[N2]], ptr [[PS2]], align 4
-; CHECK-NEXT:    [[N3:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds ([9 x i8], ptr @i_1_2_3_, i64 0, i64 4), ptr nonnull @endptr, i32 10)
+; CHECK-NEXT:    [[N3:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds (i8, ptr @i_1_2_3_, i64 4), ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT:    store i32 [[N3]], ptr [[PS3]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/str-int-5.ll b/llvm/test/Transforms/InstCombine/str-int-5.ll
index ff4f2bffd9776a..4ccf7ea6407c24 100644
--- a/llvm/test/Transforms/InstCombine/str-int-5.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-5.ll
@@ -46,39 +46,39 @@ declare i64 @strtoull(ptr, ptr, i32)
 
 define void @fold_strtoul(ptr %ps) {
 ; CHECK-LABEL: @fold_strtoul(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_im123, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    store i32 -123, ptr [[PS:%.*]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_ip234, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 4
 ; CHECK-NEXT:    store i32 234, ptr [[PS1]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @i32min_m1, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32min_m1, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i32 2147483647, ptr [[PS2]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @i32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS3]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([15 x i8], ptr @o32min, i64 0, i64 14), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @o32min, i64 14), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS4]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([15 x i8], ptr @mo32min, i64 0, i64 14), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @mo32min, i64 14), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 20
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS5]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @x32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @x32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS6]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @mx32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @mx32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 28
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS7]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([12 x i8], ptr @i32max, i64 0, i64 11), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32max, i64 11), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT:    store i32 2147483647, ptr [[PS8]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([6 x i8], ptr @mX01, i64 0, i64 5), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @mX01, i64 5), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 36
 ; CHECK-NEXT:    store i32 -1, ptr [[PS9]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([12 x i8], ptr @i32max_p1, i64 0, i64 11), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32max_p1, i64 11), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT:    store i32 -2147483648, ptr [[PS10]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([12 x i8], ptr @ui32max, i64 0, i64 11), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ui32max, i64 11), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS11:%.*]] = getelementptr i8, ptr [[PS]], i64 44
 ; CHECK-NEXT:    store i32 -1, ptr [[PS11]], align 4
 ; CHECK-NEXT:    ret void
@@ -159,7 +159,7 @@ define void @call_strtoul(ptr %ps) {
 ; CHECK-NEXT:    [[NWS:%.*]] = call i32 @strtoul(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i32 [[NWS]], ptr [[PS2]], align 4
-; CHECK-NEXT:    [[NWSP6:%.*]] = call i32 @strtoul(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
+; CHECK-NEXT:    [[NWSP6:%.*]] = call i32 @strtoul(ptr nonnull getelementptr inbounds (i8, ptr @ws, i64 6), ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT:    store i32 [[NWSP6]], ptr [[PS3]], align 4
 ; CHECK-NEXT:    ret void
@@ -195,36 +195,36 @@ define void @call_strtoul(ptr %ps) {
 
 define void @fold_strtoull(ptr %ps) {
 ; CHECK-LABEL: @fold_strtoull(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_im123, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    store i64 -123, ptr [[PS:%.*]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ws_ip234, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i64 234, ptr [[PS1]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([22 x i8], ptr @i64min_m1, i64 0, i64 21), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i64min_m1, i64 21), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i64 9223372036854775807, ptr [[PS2]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @i32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i64 -2147483648, ptr [[PS3]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([15 x i8], ptr @o32min, i64 0, i64 14), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @o32min, i64 14), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT:    store i64 2147483648, ptr [[PS4]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([13 x i8], ptr @x32min, i64 0, i64 12), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @x32min, i64 12), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT:    store i64 2147483648, ptr [[PS5]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([22 x i8], ptr @i64min, i64 0, i64 21), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i64min, i64 21), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 48
 ; CHECK-NEXT:    store i64 -9223372036854775808, ptr [[PS6]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([21 x i8], ptr @i64max, i64 0, i64 20), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i64max, i64 20), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 56
 ; CHECK-NEXT:    store i64 9223372036854775807, ptr [[PS7]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([21 x i8], ptr @i64max_p1, i64 0, i64 20), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @i64max_p1, i64 20), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 64
 ; CHECK-NEXT:    store i64 -9223372036854775808, ptr [[PS8]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([22 x i8], ptr @ui64max, i64 0, i64 21), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @ui64max, i64 21), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 72
 ; CHECK-NEXT:    store i64 -1, ptr [[PS9]], align 4
-; CHECK-NEXT:    store ptr getelementptr inbounds ([20 x i8], ptr @x64max, i64 0, i64 19), ptr @endptr, align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @x64max, i64 19), ptr @endptr, align 8
 ; CHECK-NEXT:    [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 80
 ; CHECK-NEXT:    store i64 -1, ptr [[PS10]], align 4
 ; CHECK-NEXT:    ret void
@@ -298,7 +298,7 @@ define void @call_strtoull(ptr %ps) {
 ; CHECK-NEXT:    [[NWS:%.*]] = call i64 @strtoull(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i64 [[NWS]], ptr [[PS2]], align 4
-; CHECK-NEXT:    [[NWSP6:%.*]] = call i64 @strtoull(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
+; CHECK-NEXT:    [[NWSP6:%.*]] = call i64 @strtoull(ptr nonnull getelementptr inbounds (i8, ptr @ws, i64 6), ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT:    [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i64 [[NWSP6]], ptr [[PS3]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/str-int.ll b/llvm/test/Transforms/InstCombine/str-int.ll
index 718bfe4133332a..ee8d04d2f0e2a5 100644
--- a/llvm/test/Transforms/InstCombine/str-int.ll
+++ b/llvm/test/Transforms/InstCombine/str-int.ll
@@ -46,7 +46,7 @@ define i32 @strtol_hex() #0 {
 define i32 @strtol_endptr_not_null(ptr %pend) {
 ; CHECK-LABEL: @strtol_endptr_not_null(
 ; CHECK-NEXT:    [[ENDP1:%.*]] = getelementptr inbounds i8, ptr [[PEND:%.*]], i64 8
-; CHECK-NEXT:    store ptr getelementptr inbounds ([3 x i8], ptr @.str, i64 0, i64 2), ptr [[ENDP1]], align 8
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @.str, i64 2), ptr [[ENDP1]], align 8
 ; CHECK-NEXT:    ret i32 12
 ;
   %endp1 = getelementptr inbounds ptr, ptr %pend, i32 1
diff --git a/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll b/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll
index 5e59db5ef88aa5..7d3633a5d2271e 100644
--- a/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll
+++ b/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll
@@ -42,7 +42,7 @@ declare ptr @strncasecmp(ptr, ptr)
 
 define ptr @call_bad_strncasecmp() {
 ; CHECK-LABEL: @call_bad_strncasecmp(
-; CHECK-NEXT:    [[CMP:%.*]] = call ptr @strncasecmp(ptr nonnull @a, ptr nonnull getelementptr inbounds ([2 x i8], ptr @a, i64 0, i64 1))
+; CHECK-NEXT:    [[CMP:%.*]] = call ptr @strncasecmp(ptr nonnull @a, ptr nonnull getelementptr inbounds (i8, ptr @a, i64 1))
 ; CHECK-NEXT:    ret ptr [[CMP]]
 ;
   %p1 = getelementptr [2 x i8], ptr @a, i32 0, i32 1
@@ -55,7 +55,7 @@ declare i1 @strcoll(ptr, ptr, ptr)
 
 define i1 @call_bad_strcoll() {
 ; CHECK-LABEL: @call_bad_strcoll(
-; CHECK-NEXT:    [[I:%.*]] = call i1 @strcoll(ptr nonnull @a, ptr nonnull getelementptr inbounds ([2 x i8], ptr @a, i64 0, i64 1), ptr nonnull @a)
+; CHECK-NEXT:    [[I:%.*]] = call i1 @strcoll(ptr nonnull @a, ptr nonnull getelementptr inbounds (i8, ptr @a, i64 1), ptr nonnull @a)
 ; CHECK-NEXT:    ret i1 [[I]]
 ;
   %p1 = getelementptr [2 x i8], ptr @a, i32 0, i32 1
@@ -80,7 +80,7 @@ declare i1 @strtok(ptr, ptr, i1)
 
 define i1 @call_bad_strtok() {
 ; CHECK-LABEL: @call_bad_strtok(
-; CHECK-NEXT:    [[RET:%.*]] = call i1 @strtok(ptr nonnull @a, ptr nonnull getelementptr inbounds ([2 x i8], ptr @a, i64 0, i64 1), i1 false)
+; CHECK-NEXT:    [[RET:%.*]] = call i1 @strtok(ptr nonnull @a, ptr nonnull getelementptr inbounds (i8, ptr @a, i64 1), i1 false)
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %p1 = getelementptr [2 x i8], ptr @a, i32 0, i32 1
@@ -94,7 +94,7 @@ declare i1 @strtok_r(ptr, ptr)
 
 define i1 @call_bad_strtok_r() {
 ; CHECK-LABEL: @call_bad_strtok_r(
-; CHECK-NEXT:    [[RET:%.*]] = call i1 @strtok_r(ptr nonnull @a, ptr nonnull getelementptr inbounds ([2 x i8], ptr @a, i64 0, i64 1))
+; CHECK-NEXT:    [[RET:%.*]] = call i1 @strtok_r(ptr nonnull @a, ptr nonnull getelementptr inbounds (i8, ptr @a, i64 1))
 ; CHECK-NEXT:    ret i1 [[RET]]
 ;
   %p1 = getelementptr [2 x i8], ptr @a, i32 0, i32 1
@@ -146,7 +146,7 @@ declare ptr @strxfrm(ptr, ptr)
 
 define ptr @call_bad_strxfrm() {
 ; CHECK-LABEL: @call_bad_strxfrm(
-; CHECK-NEXT:    [[RET:%.*]] = call ptr @strxfrm(ptr nonnull @a, ptr nonnull getelementptr inbounds ([2 x i8], ptr @a, i64 0, i64 1))
+; CHECK-NEXT:    [[RET:%.*]] = call ptr @strxfrm(ptr nonnull @a, ptr nonnull getelementptr inbounds (i8, ptr @a, i64 1))
 ; CHECK-NEXT:    ret ptr [[RET]]
 ;
   %p1 = getelementptr [2 x i8], ptr @a, i32 0, i32 1
diff --git a/llvm/test/Transforms/InstCombine/strcall-no-nul.ll b/llvm/test/Transforms/InstCombine/strcall-no-nul.ll
index 30221ad5b0962a..96905a273319b6 100644
--- a/llvm/test/Transforms/InstCombine/strcall-no-nul.ll
+++ b/llvm/test/Transforms/InstCombine/strcall-no-nul.ll
@@ -50,7 +50,7 @@ declare i32 @snprintf(ptr, i64, ptr, ...)
 
 define ptr @fold_strchr_past_end() {
 ; CHECK-LABEL: @fold_strchr_past_end(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @a5, i64 5)
 ;
   %p = getelementptr [5 x i8], ptr @a5, i32 0, i32 5
   %q = call ptr @strchr(ptr %p, i32 0)
@@ -268,7 +268,7 @@ define void @fold_strcspn_past_end(ptr %poff) {
 
 define i32 @fold_atoi_past_end() {
 ; CHECK-LABEL: @fold_atoi_past_end(
-; CHECK-NEXT:    [[I:%.*]] = call i32 @atoi(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
+; CHECK-NEXT:    [[I:%.*]] = call i32 @atoi(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5))
 ; CHECK-NEXT:    ret i32 [[I]]
 ;
   %p5 = getelementptr [5 x i8], ptr @a5, i32 0, i32 5
@@ -282,21 +282,21 @@ define i32 @fold_atoi_past_end() {
 
 define void @fold_atol_strtol_past_end(ptr %ps) {
 ; CHECK-LABEL: @fold_atol_strtol_past_end(
-; CHECK-NEXT:    [[I0:%.*]] = call i64 @atol(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
+; CHECK-NEXT:    [[I0:%.*]] = call i64 @atol(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5))
 ; CHECK-NEXT:    store i64 [[I0]], ptr [[PS:%.*]], align 4
-; CHECK-NEXT:    [[I1:%.*]] = call i64 @atoll(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
+; CHECK-NEXT:    [[I1:%.*]] = call i64 @atoll(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5))
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT:    store i64 [[I1]], ptr [[P1]], align 4
-; CHECK-NEXT:    [[I2:%.*]] = call i64 @strtol(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call i64 @strtol(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5), ptr null, i32 0)
 ; CHECK-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT:    store i64 [[I2]], ptr [[P2]], align 4
-; CHECK-NEXT:    [[I3:%.*]] = call i64 @strtoul(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 8)
+; CHECK-NEXT:    [[I3:%.*]] = call i64 @strtoul(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5), ptr null, i32 8)
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT:    store i64 [[I3]], ptr [[P3]], align 4
-; CHECK-NEXT:    [[I4:%.*]] = call i64 @strtoll(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 10)
+; CHECK-NEXT:    [[I4:%.*]] = call i64 @strtoll(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5), ptr null, i32 10)
 ; CHECK-NEXT:    [[P4:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT:    store i64 [[I4]], ptr [[P4]], align 4
-; CHECK-NEXT:    [[I5:%.*]] = call i64 @strtoul(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 16)
+; CHECK-NEXT:    [[I5:%.*]] = call i64 @strtoul(ptr nocapture nonnull getelementptr inbounds (i8, ptr @a5, i64 5), ptr null, i32 16)
 ; CHECK-NEXT:    [[P5:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT:    store i64 [[I5]], ptr [[P5]], align 4
 ; CHECK-NEXT:    ret void
@@ -358,9 +358,9 @@ define void @fold_sprintf_past_end(ptr %pcnt, ptr %dst) {
 
 define void @fold_snprintf_past_end(ptr %pcnt, ptr %dst, i64 %n) {
 ; CHECK-LABEL: @fold_snprintf_past_end(
-; CHECK-NEXT:    [[N5_:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr [[DST:%.*]], i64 [[N:%.*]], ptr nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
+; CHECK-NEXT:    [[N5_:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr [[DST:%.*]], i64 [[N:%.*]], ptr nonnull getelementptr inbounds (i8, ptr @a5, i64 5))
 ; CHECK-NEXT:    store i32 [[N5_]], ptr [[PCNT:%.*]], align 4
-; CHECK-NEXT:    [[N05:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr [[DST]], i64 [[N]], ptr nonnull @a5, ptr nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
+; CHECK-NEXT:    [[N05:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr [[DST]], i64 [[N]], ptr nonnull @a5, ptr nonnull getelementptr inbounds (i8, ptr @a5, i64 5))
 ; CHECK-NEXT:    [[PN05:%.*]] = getelementptr i8, ptr [[PCNT]], i64 4
 ; CHECK-NEXT:    store i32 [[N05]], ptr [[PN05]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/InstCombine/strchr-1.ll b/llvm/test/Transforms/InstCombine/strchr-1.ll
index 191e0a18fced1b..0cedc3ad518137 100644
--- a/llvm/test/Transforms/InstCombine/strchr-1.ll
+++ b/llvm/test/Transforms/InstCombine/strchr-1.ll
@@ -13,7 +13,7 @@ declare ptr @strchr(ptr, i32)
 
 define void @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 6), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 6), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 
@@ -35,7 +35,7 @@ define void @test_simplify2() {
 
 define void @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 13), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 13), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 
@@ -58,7 +58,7 @@ define void @test_simplify4(i32 %chr) {
 
 define void @test_simplify5() {
 ; CHECK-LABEL: @test_simplify5(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 13), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 13), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/strchr-3.ll b/llvm/test/Transforms/InstCombine/strchr-3.ll
index 55fb4456392058..7cbbdf8c69bc4c 100644
--- a/llvm/test/Transforms/InstCombine/strchr-3.ll
+++ b/llvm/test/Transforms/InstCombine/strchr-3.ll
@@ -20,7 +20,7 @@ define ptr @fold_strchr_s1_C(i32 %C) {
 ; CHECK-LABEL: @fold_strchr_s1_C(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([2 x i8], ptr @s1, i64 0, i64 1), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @s1, i64 1), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1
 ; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr @s1, ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
@@ -36,7 +36,7 @@ define ptr @fold_strchr_s11_C(i32 %C) {
 ; CHECK-LABEL: @fold_strchr_s11_C(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([3 x i8], ptr @s11, i64 0, i64 2), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @s11, i64 2), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1
 ; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr @s11, ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
@@ -52,7 +52,7 @@ define ptr @fold_strchr_s111_C(i32 %C) {
 ; CHECK-LABEL: @fold_strchr_s111_C(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([4 x i8], ptr @s111, i64 0, i64 3), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @s111, i64 3), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1
 ; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr @s111, ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
@@ -96,9 +96,9 @@ define ptr @fold_strchr_s21111p1_C(i32 %C) {
 ; CHECK-LABEL: @fold_strchr_s21111p1_C(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([6 x i8], ptr @s21111, i64 0, i64 5), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @s21111, i64 5), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1
-; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr getelementptr inbounds ([6 x i8], ptr @s21111, i64 0, i64 1), ptr [[MEMCHR_SEL1]]
+; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr getelementptr inbounds (i8, ptr @s21111, i64 1), ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
 ;
   %ptr = getelementptr inbounds [6 x i8], ptr @s21111, i64 0, i64 1
@@ -113,7 +113,7 @@ define ptr @fold_strchr_s11102_C(i32 %C) {
 ; CHECK-LABEL: @fold_strchr_s11102_C(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds ([6 x i8], ptr @s11102, i64 0, i64 3), ptr null
+; CHECK-NEXT:    [[MEMCHR_SEL1:%.*]] = select i1 [[TMP2]], ptr getelementptr inbounds (i8, ptr @s11102, i64 3), ptr null
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1
 ; CHECK-NEXT:    [[MEMCHR_SEL2:%.*]] = select i1 [[TMP3]], ptr @s11102, ptr [[MEMCHR_SEL1]]
 ; CHECK-NEXT:    ret ptr [[MEMCHR_SEL2]]
diff --git a/llvm/test/Transforms/InstCombine/strcmp-4.ll b/llvm/test/Transforms/InstCombine/strcmp-4.ll
index bdd521ddb9094c..e96c28b780b28d 100644
--- a/llvm/test/Transforms/InstCombine/strcmp-4.ll
+++ b/llvm/test/Transforms/InstCombine/strcmp-4.ll
@@ -11,8 +11,8 @@ declare i32 @strcmp(ptr, ptr)
 
 define i32 @fold_strcmp_s3_x_s4_s3(i1 %C) {
 ; CHECK-LABEL: @fold_strcmp_s3_x_s4_s3(
-; CHECK-NEXT:    [[PTR:%.*]] = select i1 [[C:%.*]], ptr getelementptr inbounds ([10 x i8], ptr @s9, i64 0, i64 6), ptr getelementptr inbounds ([10 x i8], ptr @s9, i64 0, i64 5)
-; CHECK-NEXT:    [[CMP:%.*]] = call i32 @strcmp(ptr noundef nonnull dereferenceable(1) [[PTR]], ptr noundef nonnull dereferenceable(4) getelementptr inbounds ([10 x i8], ptr @s9, i64 0, i64 6))
+; CHECK-NEXT:    [[PTR:%.*]] = select i1 [[C:%.*]], ptr getelementptr inbounds (i8, ptr @s9, i64 6), ptr getelementptr inbounds (i8, ptr @s9, i64 5)
+; CHECK-NEXT:    [[CMP:%.*]] = call i32 @strcmp(ptr noundef nonnull dereferenceable(1) [[PTR]], ptr noundef nonnull dereferenceable(4) getelementptr inbounds (i8, ptr @s9, i64 6))
 ; CHECK-NEXT:    ret i32 [[CMP]]
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/strlcpy-1.ll b/llvm/test/Transforms/InstCombine/strlcpy-1.ll
index bfa4fc11d31076..7ca6c1599f191f 100644
--- a/llvm/test/Transforms/InstCombine/strlcpy-1.ll
+++ b/llvm/test/Transforms/InstCombine/strlcpy-1.ll
@@ -235,9 +235,9 @@ define void @call_strlcpy_s0_n(ptr %dst, ptr %s, i64 %n) {
 ; ANY-NEXT:    [[NZ:%.*]] = or i64 [[N]], 1
 ; ANY-NEXT:    [[NS_NZ:%.*]] = call i64 @strlcpy(ptr noundef nonnull dereferenceable(1) [[DST]], ptr noundef nonnull dereferenceable(1) [[S]], i64 [[NZ]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], i64 [[NS_NZ]])
-; ANY-NEXT:    [[NS0_N:%.*]] = call i64 @strlcpy(ptr [[DST]], ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], ptr @s4, i64 0, i64 4), i64 [[N]])
+; ANY-NEXT:    [[NS0_N:%.*]] = call i64 @strlcpy(ptr [[DST]], ptr noundef nonnull dereferenceable(1) getelementptr inbounds (i8, ptr @s4, i64 4), i64 [[N]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], i64 [[NS0_N]])
-; ANY-NEXT:    [[NS1_N:%.*]] = call i64 @strlcpy(ptr [[DST]], ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([5 x i8], ptr @s4, i64 0, i64 3), i64 [[N]])
+; ANY-NEXT:    [[NS1_N:%.*]] = call i64 @strlcpy(ptr [[DST]], ptr noundef nonnull dereferenceable(1) getelementptr inbounds (i8, ptr @s4, i64 3), i64 [[N]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], i64 [[NS1_N]])
 ; ANY-NEXT:    [[NS4_N:%.*]] = call i64 @strlcpy(ptr [[DST]], ptr noundef nonnull dereferenceable(1) @s4, i64 [[N]])
 ; ANY-NEXT:    call void @sink(ptr [[DST]], i64 [[NS4_N]])
diff --git a/llvm/test/Transforms/InstCombine/strlen-1.ll b/llvm/test/Transforms/InstCombine/strlen-1.ll
index bd4c4a2ce47e92..8def4dd9747f9c 100644
--- a/llvm/test/Transforms/InstCombine/strlen-1.ll
+++ b/llvm/test/Transforms/InstCombine/strlen-1.ll
@@ -235,7 +235,7 @@ define i1 @strlen0_after_write_to_first_byte_global() {
 
 define i1 @strlen0_after_write_to_second_byte_global() {
 ; CHECK-LABEL: @strlen0_after_write_to_second_byte_global(
-; CHECK-NEXT:    store i8 49, ptr getelementptr inbounds ([32 x i8], ptr @a, i32 0, i32 1), align 16
+; CHECK-NEXT:    store i8 49, ptr getelementptr inbounds (i8, ptr @a, i32 1), align 16
 ; CHECK-NEXT:    [[CHAR0:%.*]] = load i8, ptr @a, align 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[CHAR0]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
diff --git a/llvm/test/Transforms/InstCombine/strlen-6.ll b/llvm/test/Transforms/InstCombine/strlen-6.ll
index f1fe715d3893b0..25e653362db819 100644
--- a/llvm/test/Transforms/InstCombine/strlen-6.ll
+++ b/llvm/test/Transforms/InstCombine/strlen-6.ll
@@ -103,7 +103,7 @@ define i64 @fold_strlen_a_S3_p2_s4_to_1() {
 define void @fold_strlen_a_s3_S4_to_4() {
 ; CHECK-LABEL: @fold_strlen_a_s3_S4_to_4(
 ; CHECK-NEXT:    store i64 4, ptr @ax, align 4
-; CHECK-NEXT:    store i64 4, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 4, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_a5, ptr @a_s3_s4, i32 0, i32 0, i32 4
@@ -125,7 +125,7 @@ define void @fold_strlen_a_s3_S4_to_4() {
 define void @fold_strlen_a_s3_S4_p1_to_3() {
 ; CHECK-LABEL: @fold_strlen_a_s3_S4_p1_to_3(
 ; CHECK-NEXT:    store i64 3, ptr @ax, align 4
-; CHECK-NEXT:    store i64 3, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 3, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_a5, ptr @a_s3_s4, i32 0, i32 0, i32 5
@@ -147,7 +147,7 @@ define void @fold_strlen_a_s3_S4_p1_to_3() {
 define void @fold_strlen_a_s3_i32_S4_to_4() {
 ; CHECK-LABEL: @fold_strlen_a_s3_i32_S4_to_4(
 ; CHECK-NEXT:    store i64 4, ptr @ax, align 4
-; CHECK-NEXT:    store i64 4, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 4, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_i32_a5, ptr @a_s3_i32_s4, i32 0, i32 0, i32 8
@@ -169,7 +169,7 @@ define void @fold_strlen_a_s3_i32_S4_to_4() {
 define void @fold_strlen_a_s3_i32_S4_p1_to_3() {
 ; CHECK-LABEL: @fold_strlen_a_s3_i32_S4_p1_to_3(
 ; CHECK-NEXT:    store i64 3, ptr @ax, align 4
-; CHECK-NEXT:    store i64 3, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 3, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_i32_a5, ptr @a_s3_i32_s4, i32 0, i32 0, i32 9
@@ -191,7 +191,7 @@ define void @fold_strlen_a_s3_i32_S4_p1_to_3() {
 define void @fold_strlen_a_s3_i32_S4_p2_to_2() {
 ; CHECK-LABEL: @fold_strlen_a_s3_i32_S4_p2_to_2(
 ; CHECK-NEXT:    store i64 2, ptr @ax, align 4
-; CHECK-NEXT:    store i64 2, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 2, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_i32_a5, ptr @a_s3_i32_s4, i32 0, i32 0, i32 10
@@ -213,7 +213,7 @@ define void @fold_strlen_a_s3_i32_S4_p2_to_2() {
 define void @fold_strlen_a_s3_i32_S4_p3_to_1() {
 ; CHECK-LABEL: @fold_strlen_a_s3_i32_S4_p3_to_1(
 ; CHECK-NEXT:    store i64 1, ptr @ax, align 4
-; CHECK-NEXT:    store i64 1, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 1, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_i32_a5, ptr @a_s3_i32_s4, i32 0, i32 0, i32 11
@@ -235,7 +235,7 @@ define void @fold_strlen_a_s3_i32_S4_p3_to_1() {
 define void @fold_strlen_a_s3_i32_S4_p4_to_0() {
 ; CHECK-LABEL: @fold_strlen_a_s3_i32_S4_p4_to_0(
 ; CHECK-NEXT:    store i64 0, ptr @ax, align 4
-; CHECK-NEXT:    store i64 0, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
+; CHECK-NEXT:    store i64 0, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
 ; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr %struct.A_a4_i32_a5, ptr @a_s3_i32_s4, i32 0, i32 0, i32 12
@@ -257,8 +257,8 @@ define void @fold_strlen_a_s3_i32_S4_p4_to_0() {
 define void @fold_strlen_ax_s() {
 ; CHECK-LABEL: @fold_strlen_ax_s(
 ; CHECK-NEXT:    store i64 3, ptr @ax, align 4
-; CHECK-NEXT:    store i64 5, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 1), align 4
-; CHECK-NEXT:    store i64 7, ptr getelementptr inbounds ([0 x i64], ptr @ax, i64 0, i64 2), align 4
+; CHECK-NEXT:    store i64 5, ptr getelementptr inbounds (i8, ptr @ax, i64 8), align 4
+; CHECK-NEXT:    store i64 7, ptr getelementptr inbounds (i8, ptr @ax, i64 16), align 4
 ; CHECK-NEXT:    ret void
 ;
   %pax_s3 = getelementptr { i8, [4 x i8] }, ptr @ax_s3, i64 0, i32 1, i64 0
diff --git a/llvm/test/Transforms/InstCombine/strpbrk-1.ll b/llvm/test/Transforms/InstCombine/strpbrk-1.ll
index 411bd8d627eca7..b51071df25d20e 100644
--- a/llvm/test/Transforms/InstCombine/strpbrk-1.ll
+++ b/llvm/test/Transforms/InstCombine/strpbrk-1.ll
@@ -37,7 +37,7 @@ define ptr @test_simplify2(ptr %pat) {
 
 define ptr @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([12 x i8], ptr @hello, i32 0, i32 6)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @hello, i32 6)
 ;
 
   %ret = call ptr @strpbrk(ptr @hello, ptr @w)
diff --git a/llvm/test/Transforms/InstCombine/strrchr-1.ll b/llvm/test/Transforms/InstCombine/strrchr-1.ll
index 661e040f8042ba..0c876b9d2a9829 100644
--- a/llvm/test/Transforms/InstCombine/strrchr-1.ll
+++ b/llvm/test/Transforms/InstCombine/strrchr-1.ll
@@ -12,7 +12,7 @@ declare ptr @strrchr(ptr, i32)
 
 define void @test_simplify1() {
 ; CHECK-LABEL: @test_simplify1(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 6), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 6), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 
@@ -34,7 +34,7 @@ define void @test_simplify2() {
 
 define void @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 13), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 13), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 
@@ -45,7 +45,7 @@ define void @test_simplify3() {
 
 define void @test_simplify4() {
 ; CHECK-LABEL: @test_simplify4(
-; CHECK-NEXT:    store ptr getelementptr inbounds ([14 x i8], ptr @hello, i32 0, i32 13), ptr @chp, align 4
+; CHECK-NEXT:    store ptr getelementptr inbounds (i8, ptr @hello, i32 13), ptr @chp, align 4
 ; CHECK-NEXT:    ret void
 ;
 
diff --git a/llvm/test/Transforms/InstCombine/strrchr-3.ll b/llvm/test/Transforms/InstCombine/strrchr-3.ll
index 1dadb048787171..f25504a8db2b5f 100644
--- a/llvm/test/Transforms/InstCombine/strrchr-3.ll
+++ b/llvm/test/Transforms/InstCombine/strrchr-3.ll
@@ -13,7 +13,7 @@ define ptr @fold_strrchr_sp10_x(i32 %c) {
 ; CHECK-LABEL: @fold_strrchr_sp10_x(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i8
 ; CHECK-NEXT:    [[MEMRCHR_CHAR0CMP:%.*]] = icmp eq i8 [[TMP1]], 0
-; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[MEMRCHR_CHAR0CMP]], ptr getelementptr inbounds ([11 x i8], ptr @s10, i64 0, i64 10), ptr null
+; CHECK-NEXT:    [[MEMRCHR_SEL:%.*]] = select i1 [[MEMRCHR_CHAR0CMP]], ptr getelementptr inbounds (i8, ptr @s10, i64 10), ptr null
 ; CHECK-NEXT:    ret ptr [[MEMRCHR_SEL]]
 ;
   %psp10 = getelementptr [11 x i8], ptr @s10, i32 0, i32 10
@@ -26,7 +26,7 @@ define ptr @fold_strrchr_sp10_x(i32 %c) {
 
 define ptr @call_strrchr_sp9_x(i32 %c) {
 ; CHECK-LABEL: @call_strrchr_sp9_x(
-; CHECK-NEXT:    [[MEMRCHR:%.*]] = call ptr @memrchr(ptr noundef nonnull dereferenceable(2) getelementptr inbounds ([11 x i8], ptr @s10, i64 0, i64 9), i32 [[C:%.*]], i64 2)
+; CHECK-NEXT:    [[MEMRCHR:%.*]] = call ptr @memrchr(ptr noundef nonnull dereferenceable(2) getelementptr inbounds (i8, ptr @s10, i64 9), i32 [[C:%.*]], i64 2)
 ; CHECK-NEXT:    ret ptr [[MEMRCHR]]
 ;
   %psp9 = getelementptr [11 x i8], ptr @s10, i32 0, i32 9
@@ -40,7 +40,7 @@ define ptr @call_strrchr_sp9_x(i32 %c) {
 
 define ptr @call_strrchr_sp2_x(i32 %c) {
 ; CHECK-LABEL: @call_strrchr_sp2_x(
-; CHECK-NEXT:    [[MEMRCHR:%.*]] = call ptr @memrchr(ptr noundef nonnull dereferenceable(9) getelementptr inbounds ([11 x i8], ptr @s10, i64 0, i64 2), i32 [[C:%.*]], i64 9)
+; CHECK-NEXT:    [[MEMRCHR:%.*]] = call ptr @memrchr(ptr noundef nonnull dereferenceable(9) getelementptr inbounds (i8, ptr @s10, i64 2), i32 [[C:%.*]], i64 9)
 ; CHECK-NEXT:    ret ptr [[MEMRCHR]]
 ;
   %psp2 = getelementptr [11 x i8], ptr @s10, i32 0, i32 2
@@ -53,7 +53,7 @@ define ptr @call_strrchr_sp2_x(i32 %c) {
 
 define ptr @call_strrchr_sp1_x(i32 %c) {
 ; CHECK-LABEL: @call_strrchr_sp1_x(
-; CHECK-NEXT:    [[MEMRCHR:%.*]] = call ptr @memrchr(ptr noundef nonnull dereferenceable(10) getelementptr inbounds ([11 x i8], ptr @s10, i64 0, i64 1), i32 [[C:%.*]], i64 10)
+; CHECK-NEXT:    [[MEMRCHR:%.*]] = call ptr @memrchr(ptr noundef nonnull dereferenceable(10) getelementptr inbounds (i8, ptr @s10, i64 1), i32 [[C:%.*]], i64 10)
 ; CHECK-NEXT:    ret ptr [[MEMRCHR]]
 ;
   %psp1 = getelementptr [11 x i8], ptr @s10, i32 0, i32 1
diff --git a/llvm/test/Transforms/InstCombine/strstr-1.ll b/llvm/test/Transforms/InstCombine/strstr-1.ll
index 50edbfffb9f8d4..b5f4a2ce288d55 100644
--- a/llvm/test/Transforms/InstCombine/strstr-1.ll
+++ b/llvm/test/Transforms/InstCombine/strstr-1.ll
@@ -37,7 +37,7 @@ define ptr @test_simplify2(ptr %str) {
 
 define ptr @test_simplify3() {
 ; CHECK-LABEL: @test_simplify3(
-; CHECK-NEXT:    ret ptr getelementptr inbounds ([6 x i8], ptr @.str2, i64 0, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @.str2, i64 1)
 ;
   %ret = call ptr @strstr(ptr @.str2, ptr @.str3)
   ret ptr %ret
diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
index 738ef1bc1ad2f6..74465cde86ade1 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll
@@ -566,7 +566,7 @@ define ptr @gep_cvbase_w_s_idx(<2 x ptr> %base, i64 %raw_addr) {
 
 define ptr @gep_cvbase_w_cv_idx(<2 x ptr> %base, i64 %raw_addr) {
 ; CHECK-LABEL: @gep_cvbase_w_cv_idx(
-; CHECK-NEXT:    ret ptr getelementptr inbounds (i32, ptr @GLOBAL, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @GLOBAL, i64 4)
 ;
   %gep = getelementptr i32, <2 x ptr> <ptr @GLOBAL, ptr @GLOBAL>, <2 x i64> <i64 0, i64 1>
   %ee = extractelement <2 x ptr> %gep, i32 1
diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
index fd55a236e0d751..d8a3b87f78ee9c 100644
--- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -569,7 +569,7 @@ define ptr @gep_cvbase_w_s_idx(<2 x ptr> %base, i64 %raw_addr) {
 
 define ptr @gep_cvbase_w_cv_idx(<2 x ptr> %base, i64 %raw_addr) {
 ; CHECK-LABEL: @gep_cvbase_w_cv_idx(
-; CHECK-NEXT:    ret ptr getelementptr inbounds (i32, ptr @GLOBAL, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @GLOBAL, i64 4)
 ;
   %gep = getelementptr i32, <2 x ptr> <ptr @GLOBAL, ptr @GLOBAL>, <2 x i64> <i64 0, i64 1>
   %ee = extractelement <2 x ptr> %gep, i32 1
diff --git a/llvm/test/Transforms/InstCombine/wcslen-1.ll b/llvm/test/Transforms/InstCombine/wcslen-1.ll
index 138b3ff585c54d..8833754a536784 100644
--- a/llvm/test/Transforms/InstCombine/wcslen-1.ll
+++ b/llvm/test/Transforms/InstCombine/wcslen-1.ll
@@ -231,7 +231,7 @@ define i64 @fold_wcslen_1() {
 ; with an offset that isn't a multiple of the element size).
 define i64 @no_fold_wcslen_1() {
 ; CHECK-LABEL: @no_fold_wcslen_1(
-; CHECK-NEXT:    [[LEN:%.*]] = tail call i64 @wcslen(ptr getelementptr ([15 x i8], ptr @ws, i64 0, i64 3))
+; CHECK-NEXT:    [[LEN:%.*]] = tail call i64 @wcslen(ptr nonnull getelementptr inbounds (i8, ptr @ws, i64 3))
 ; CHECK-NEXT:    ret i64 [[LEN]]
 ;
   %p = getelementptr [15 x i8], ptr @ws, i64 0, i64 3
@@ -246,7 +246,7 @@ define i64 @no_fold_wcslen_1() {
 ; with an offset that isn't a multiple of the element size).
 define i64 @no_fold_wcslen_2() {
 ; CHECK-LABEL: @no_fold_wcslen_2(
-; CHECK-NEXT:    [[LEN:%.*]] = tail call i64 @wcslen(ptr nonnull getelementptr inbounds ([10 x i8], ptr @s8, i64 0, i64 3))
+; CHECK-NEXT:    [[LEN:%.*]] = tail call i64 @wcslen(ptr nonnull getelementptr inbounds (i8, ptr @s8, i64 3))
 ; CHECK-NEXT:    ret i64 [[LEN]]
 ;
   %p = getelementptr [10 x i8], ptr @s8, i64 0, i64 3
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-alias.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-alias.ll
index f77a49e90be7bc..097ccfe78e97cd 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-alias.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-alias.ll
@@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define ptr @f() {
 ; CHECK-LABEL: define ptr @f() {
-; CHECK-NEXT:    ret ptr getelementptr ([3 x ptr], ptr @b, i64 0, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @b, i64 8)
 ;
   ret ptr getelementptr ([3 x ptr], ptr @b, i64 0, i64 1)
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll
index bcba5ce3aa7e4e..e5287a45da4b6d 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep-constanfolding-error.ll
@@ -44,8 +44,7 @@ entry:
   %9 = add i32 %f.promoted, %smax
   %10 = add i32 %9, 2
   call void @llvm.memset.p0.i32(ptr %scevgep, i8 %conv6, i32 %10, i1 false)
-; CHECK:  call void @llvm.memset.p0.i32(ptr getelementptr inbounds ([6 x [6 x [7 x i8]]], ptr @j, i32 0, i{{32|64}} 5, i{{32|64}} 4, i32 1), i8 %conv6, i32 1, i1 false)
-; CHECK-NOT: call void @llvm.memset.p0.i32(ptr getelementptr ([6 x [6 x [7 x i8]]], ptr @j, i64 1, i64 4, i64 4, i32 1)
+; CHECK: call void @llvm.memset.p0.i32(ptr getelementptr inbounds (i8, ptr @j, i32 239), i8 %conv6, i32 1, i1 false)
   ret i32 0
 }
 ; Function Attrs: argmemonly nounwind
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll b/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
index d91349a570b715..b3fe7f36ff97bd 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/gep.ll
@@ -11,21 +11,21 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define ptr @f0() {
 ; CHECK-LABEL: @f0(
-; CHECK-NEXT:    ret ptr getelementptr inbounds inrange(-16, 8) ([3 x ptr], ptr @vt, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds inrange(-16, 8) (i8, ptr @vt, i64 16)
 ;
   ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(-8, 16) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
 }
 
 define ptr @f1() {
 ; CHECK-LABEL: @f1(
-; CHECK-NEXT:    ret ptr getelementptr inbounds inrange(-8, 0) ([3 x ptr], ptr @vt, i64 0, i64 2)
+; CHECK-NEXT:    ret ptr getelementptr inbounds inrange(-8, 0) (i8, ptr @vt, i64 16)
 ;
   ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 1)
 }
 
 define ptr @f2() {
 ; CHECK-LABEL: @f2(
-; CHECK-NEXT:    ret ptr getelementptr inrange(-24, -16) ([3 x ptr], ptr @vt, i64 1, i64 1)
+; CHECK-NEXT:    ret ptr getelementptr inrange(-24, -16) (i8, ptr @vt, i64 32)
 ;
   ret ptr getelementptr (ptr, ptr getelementptr inbounds inrange(0, 8) ([3 x ptr], ptr @vt, i64 0, i64 1), i64 3)
 }
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
index b4afb7bd4a2b0e..1d7ed23d3e82d7 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/icmp-global.ll
@@ -121,7 +121,7 @@ define i1 @global_gep_ugt_null() {
 
 define i1 @global_gep_sgt_null() {
 ; CHECK-LABEL: @global_gep_sgt_null(
-; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr inbounds ([2 x i32], ptr @g, i64 1), ptr null)
+; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr inbounds (i8, ptr @g, i64 8), ptr null)
 ;
   %gep = getelementptr inbounds [2 x i32], ptr @g, i64 1
   %cmp = icmp sgt ptr %gep, null
@@ -222,7 +222,7 @@ define i1 @global_gep_ugt_global() {
 
 define i1 @global_gep_sgt_global() {
 ; CHECK-LABEL: @global_gep_sgt_global(
-; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr inbounds ([2 x i32], ptr @g, i64 1), ptr @g)
+; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr inbounds (i8, ptr @g, i64 8), ptr @g)
 ;
   %gep = getelementptr inbounds [2 x i32], ptr @g, i64 1
   %cmp = icmp sgt ptr %gep, @g
@@ -232,7 +232,7 @@ define i1 @global_gep_sgt_global() {
 ; This should not fold to true, as the offset is negative.
 define i1 @global_gep_ugt_global_neg_offset() {
 ; CHECK-LABEL: @global_gep_ugt_global_neg_offset(
-; CHECK-NEXT:    ret i1 icmp ugt (ptr getelementptr ([2 x i32], ptr @g, i64 -1), ptr @g)
+; CHECK-NEXT:    ret i1 icmp ugt (ptr getelementptr (i8, ptr @g, i64 -8), ptr @g)
 ;
   %gep = getelementptr [2 x i32], ptr @g, i64 -1
   %cmp = icmp ugt ptr %gep, @g
@@ -241,7 +241,7 @@ define i1 @global_gep_ugt_global_neg_offset() {
 
 define i1 @global_gep_sgt_global_neg_offset() {
 ; CHECK-LABEL: @global_gep_sgt_global_neg_offset(
-; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr ([2 x i32], ptr @g, i64 -1), ptr @g)
+; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr (i8, ptr @g, i64 -8), ptr @g)
 ;
   %gep = getelementptr [2 x i32], ptr @g, i64 -1
   %cmp = icmp sgt ptr %gep, @g
@@ -260,7 +260,7 @@ define i1 @global_gep_ugt_global_gep() {
 ; Should not fold due to signed comparison.
 define i1 @global_gep_sgt_global_gep() {
 ; CHECK-LABEL: @global_gep_sgt_global_gep(
-; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr inbounds ([2 x i32], ptr @g, i64 0, i64 1), ptr @g)
+; CHECK-NEXT:    ret i1 icmp sgt (ptr getelementptr inbounds (i8, ptr @g, i64 4), ptr @g)
 ;
   %gep2 = getelementptr inbounds [2 x i32], ptr @g, i64 0, i64 1
   %cmp = icmp sgt ptr %gep2, @g
diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll
index 724912d90bd861..0f72cd813f2f93 100644
--- a/llvm/test/Transforms/InstSimplify/compare.ll
+++ b/llvm/test/Transforms/InstSimplify/compare.ll
@@ -3078,7 +3078,7 @@ define i1 @globals_inequal() {
 ; TODO: Never equal
 define i1 @globals_offset_inequal() {
 ; CHECK-LABEL: @globals_offset_inequal(
-; CHECK-NEXT:    ret i1 icmp ne (ptr getelementptr (i8, ptr @A, i32 1), ptr getelementptr (i8, ptr @B, i32 1))
+; CHECK-NEXT:    ret i1 icmp ne (ptr getelementptr inbounds (i8, ptr @A, i32 1), ptr getelementptr inbounds (i8, ptr @B, i32 1))
 ;
   %a.off = getelementptr i8, ptr @A, i32 1
   %b.off = getelementptr i8, ptr @B, i32 1
diff --git a/llvm/test/Transforms/InstSimplify/past-the-end.ll b/llvm/test/Transforms/InstSimplify/past-the-end.ll
index 96339c1cdcf21f..1e146d18327a14 100644
--- a/llvm/test/Transforms/InstSimplify/past-the-end.ll
+++ b/llvm/test/Transforms/InstSimplify/past-the-end.ll
@@ -21,7 +21,7 @@ define zeroext i1 @no_offsets() {
 
 define zeroext i1 @both_past_the_end() {
 ; CHECK-LABEL: @both_past_the_end(
-; CHECK-NEXT:    ret i1 icmp eq (ptr getelementptr inbounds (i32, ptr @opte_a, i32 1), ptr getelementptr inbounds (i32, ptr @opte_b, i32 1))
+; CHECK-NEXT:    ret i1 icmp eq (ptr getelementptr inbounds (i8, ptr @opte_a, i32 4), ptr getelementptr inbounds (i8, ptr @opte_b, i32 4))
 ;
   %x = getelementptr i32, ptr @opte_a, i32 1
   %y = getelementptr i32, ptr @opte_b, i32 1
@@ -35,7 +35,7 @@ define zeroext i1 @both_past_the_end() {
 
 define zeroext i1 @just_one_past_the_end() {
 ; CHECK-LABEL: @just_one_past_the_end(
-; CHECK-NEXT:    ret i1 icmp eq (ptr getelementptr inbounds (i32, ptr @opte_a, i32 1), ptr @opte_b)
+; CHECK-NEXT:    ret i1 icmp eq (ptr getelementptr inbounds (i8, ptr @opte_a, i32 4), ptr @opte_b)
 ;
   %x = getelementptr i32, ptr @opte_a, i32 1
   %t = icmp eq ptr %x, @opte_b
diff --git a/llvm/test/Transforms/LoopStrengthReduce/2011-12-19-PostincQuadratic.ll b/llvm/test/Transforms/LoopStrengthReduce/2011-12-19-PostincQuadratic.ll
index 552cd880373214..616e3ae1b03684 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/2011-12-19-PostincQuadratic.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/2011-12-19-PostincQuadratic.ll
@@ -16,7 +16,7 @@ define void @vb() nounwind {
 ; CHECK-NEXT:  for.cond.preheader:
 ; CHECK-NEXT:    br label [[FOR_BODY7:%.*]]
 ; CHECK:       for.body7:
-; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY7]] ], [ getelementptr inbounds ([121 x i32], ptr @b, i32 0, i32 1), [[FOR_COND_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY7]] ], [ getelementptr inbounds (i8, ptr @b, i32 4), [[FOR_COND_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY7]] ], [ 8, [[FOR_COND_PREHEADER]] ]
 ; CHECK-NEXT:    [[INDVARS_IV77:%.*]] = phi i32 [ [[INDVARS_IV_NEXT78:%.*]], [[FOR_BODY7]] ], [ 1, [[FOR_COND_PREHEADER]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT78]] = add i32 [[INDVARS_IV77]], 1
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
index 7fef404eaf1479..c4aa6c7725d411 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
@@ -10,23 +10,23 @@ define i32 @test(ptr %base) nounwind uwtable ssp {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[WHILE_BODY_LR_PH_I:%.*]]
 ; CHECK:       while.body.lr.ph.i:
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16
 ; CHECK-NEXT:    br label [[WHILE_BODY_I:%.*]]
 ; CHECK:       while.body.i:
 ; CHECK-NEXT:    [[INDVARS_IV7_I:%.*]] = phi i64 [ 16, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV_NEXT8_I:%.*]], [[COND_TRUE29_I:%.*]] ]
 ; CHECK-NEXT:    [[I_05_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV7_I]], [[COND_TRUE29_I]] ]
 ; CHECK-NEXT:    [[LSR4:%.*]] = trunc i64 [[I_05_I]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[LSR4]] to i64
-; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 [[TMP0]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[SEXT_I:%.*]] = shl i64 [[I_05_I]], 32
 ; CHECK-NEXT:    [[IDX_EXT_I:%.*]] = ashr exact i64 [[SEXT_I]], 32
 ; CHECK-NEXT:    [[ADD_PTR_SUM_I:%.*]] = add i64 [[IDX_EXT_I]], 16
 ; CHECK-NEXT:    br label [[FOR_BODY_I:%.*]]
 ; CHECK:       for.body.i:
-; CHECK-NEXT:    [[LSR_IV2:%.*]] = phi ptr [ [[UGLYGEP3:%.*]], [[FOR_BODY_I]] ], [ [[UGLYGEP1]], [[WHILE_BODY_I]] ]
+; CHECK-NEXT:    [[LSR_IV2:%.*]] = phi ptr [ [[SCEVGEP3:%.*]], [[FOR_BODY_I]] ], [ [[SCEVGEP1]], [[WHILE_BODY_I]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[LSR_IV2]], align 1
 ; CHECK-NEXT:    [[CMP:%.*]] = call i1 @check() #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[UGLYGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 1
+; CHECK-NEXT:    [[SCEVGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_END_I:%.*]], label [[FOR_BODY_I]]
 ; CHECK:       for.end.i:
 ; CHECK-NEXT:    [[ADD_PTR_I144:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD_PTR_SUM_I]]
@@ -96,18 +96,18 @@ define void @test2(i32 %n) nounwind uwtable {
 ; CHECK-NEXT:    br label [[FOR_COND468:%.*]]
 ; CHECK:       for.cond468:
 ; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi i32 [ 1, [[FOR_COND468_PREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[IF_THEN477:%.*]] ]
-; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ getelementptr inbounds ([5000 x %struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 0, i32 2), [[FOR_COND468_PREHEADER]] ], [ [[UGLYGEP:%.*]], [[IF_THEN477]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ getelementptr inbounds (i8, ptr @tags, i64 8), [[FOR_COND468_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[IF_THEN477]] ]
 ; CHECK-NEXT:    [[K_0:%.*]] = load i32, ptr [[LSR_IV]], align 4
 ; CHECK-NEXT:    [[CMP469:%.*]] = icmp slt i32 [[LSR_IV1]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP469]], label [[FOR_BODY471:%.*]], label [[FOR_INC498_PREHEADER:%.*]]
 ; CHECK:       for.body471:
-; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 8
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[UGLYGEP2]], align 4
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SCEVGEP2]], align 4
 ; CHECK-NEXT:    br i1 false, label [[IF_THEN477]], label [[FOR_INC498_PREHEADER]]
 ; CHECK:       for.inc498.preheader:
 ; CHECK-NEXT:    br label [[FOR_INC498:%.*]]
 ; CHECK:       if.then477:
-; CHECK-NEXT:    [[UGLYGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 12
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 12
 ; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV1]], 1
 ; CHECK-NEXT:    br label [[FOR_COND468]]
 ; CHECK:       for.inc498:
@@ -162,8 +162,8 @@ define fastcc void @test3(ptr nocapture %u) nounwind uwtable ssp {
 ; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[TMP0]] to i32
 ; CHECK-NEXT:    [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV_I_SV_PHI]], 3
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[U:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[UGLYGEP]], align 8
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[U:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[SCEVGEP]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[FOR_INC8_US_I:%.*]], label [[MESHBB]]
 ; CHECK:       for.body3.lr.ph.us.i.loopexit:
 ; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
index 97bb4a2b4db536..1c64359dea2491 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -9,7 +9,7 @@
 define zeroext i8 @sum() {
 ; CHECK-LABEL: @sum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, ptr getelementptr inbounds ([128 x i8], ptr @bytes, i64 0, i64 64), align 1
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, ptr getelementptr inbounds (i8, ptr @bytes, i64 64), align 1
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <64 x i8>, ptr @bytes, align 1
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <64 x i8> [[WIDE_LOAD2]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
index b3b6d3ee55097e..aebe47c1287973 100644
--- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll
@@ -39,14 +39,14 @@ define void @f() {
 ; CHECK:       vector.memcheck:
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr @f.e, [[SCEVGEP]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TMP1]], getelementptr inbounds (i32, ptr @f.e, i64 1)
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TMP1]], getelementptr inbounds (i8, ptr @f.e, i64 4)
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    store i32 0, ptr @f.e, align 1, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    store i32 0, ptr @f.e, align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]]
 ; CHECK-NEXT:    store i8 10, ptr [[TMP0]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500
diff --git a/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll b/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll
index d9050700001a80..e326064175d18e 100644
--- a/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll
+++ b/llvm/test/Transforms/LoopVersioning/add-phi-update-users.ll
@@ -27,7 +27,7 @@ define void @f1() {
 ; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[T0]], i64 2
 ; CHECK-NEXT:    br label [[FOR_BODY_LVER_CHECK:%.*]]
 ; CHECK:       for.body.lver.check:
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[T0]], getelementptr inbounds (i16, ptr @b, i64 1)
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[T0]], getelementptr inbounds (i8, ptr @b, i64 2)
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr @b, [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
@@ -44,8 +44,8 @@ define void @f1() {
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ 0, [[FOR_BODY_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T2:%.*]] = load i16, ptr @b, align 1, !tbaa [[TBAA2]], !alias.scope !6
-; CHECK-NEXT:    store i16 [[T2]], ptr [[T0]], align 1, !tbaa [[TBAA2]], !alias.scope !9, !noalias !6
+; CHECK-NEXT:    [[T2:%.*]] = load i16, ptr @b, align 1, !tbaa [[TBAA2]], !alias.scope [[META6:![0-9]+]]
+; CHECK-NEXT:    store i16 [[T2]], ptr [[T0]], align 1, !tbaa [[TBAA2]], !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[T1]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC]], 3
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT1:%.*]]
diff --git a/llvm/test/Transforms/LoopVersioning/bound-check-partially-known.ll b/llvm/test/Transforms/LoopVersioning/bound-check-partially-known.ll
index 70c12a2d62ec33..2fb58f5980ec88 100644
--- a/llvm/test/Transforms/LoopVersioning/bound-check-partially-known.ll
+++ b/llvm/test/Transforms/LoopVersioning/bound-check-partially-known.ll
@@ -18,14 +18,14 @@ define void @bound_check_partially_known_1(i32 %N) {
 ; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr @global, i64 [[TMP2]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr @global, [[SCEVGEP1]]
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]]
-; CHECK-NEXT:    [[BOUND13:%.*]] = icmp ult ptr getelementptr inbounds ([[STRUCT_FOO:%.*]], ptr @global, i64 0, i32 1, i64 0), [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND13:%.*]] = icmp ult ptr getelementptr inbounds (i8, ptr @global, i64 256000), [[SCEVGEP1]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND13]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[LOOP_PH_LVER_ORIG:%.*]], label [[LOOP_PH:%.*]]
 ; CHECK:       loop.ph.lver.orig:
 ; CHECK-NEXT:    br label [[LOOP_LVER_ORIG:%.*]]
 ; CHECK:       loop.lver.orig:
 ; CHECK-NEXT:    [[IV_LVER_ORIG:%.*]] = phi i64 [ 0, [[LOOP_PH_LVER_ORIG]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], [[LOOP_LVER_ORIG]] ]
-; CHECK-NEXT:    [[GEP_0_IV_LVER_ORIG:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @global, i64 0, i32 0, i64 [[IV_LVER_ORIG]]
+; CHECK-NEXT:    [[GEP_0_IV_LVER_ORIG:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr @global, i64 0, i32 0, i64 [[IV_LVER_ORIG]]
 ; CHECK-NEXT:    [[L_0_LVER_ORIG:%.*]] = load double, ptr [[GEP_0_IV_LVER_ORIG]], align 8
 ; CHECK-NEXT:    [[GEP_1_IV_LVER_ORIG:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @global, i64 0, i32 1, i64 [[IV_LVER_ORIG]]
 ; CHECK-NEXT:    [[L_1_LVER_ORIG:%.*]] = load double, ptr [[GEP_1_IV_LVER_ORIG]], align 8
@@ -41,13 +41,13 @@ define void @bound_check_partially_known_1(i32 %N) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_0_IV:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @global, i64 0, i32 0, i64 [[IV]]
-; CHECK-NEXT:    [[L_0:%.*]] = load double, ptr [[GEP_0_IV]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[L_0:%.*]] = load double, ptr [[GEP_0_IV]], align 8, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[GEP_1_IV:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @global, i64 0, i32 1, i64 [[IV]]
-; CHECK-NEXT:    [[L_1:%.*]] = load double, ptr [[GEP_1_IV]], align 8, !alias.scope !3
+; CHECK-NEXT:    [[L_1:%.*]] = load double, ptr [[GEP_1_IV]], align 8, !alias.scope [[META3:![0-9]+]]
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[L_0]], [[L_1]]
 ; CHECK-NEXT:    [[IV_N:%.*]] = add nuw nsw i64 [[IV]], [[N_EXT]]
 ; CHECK-NEXT:    [[GEP_0_IV_N:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @global, i64 0, i32 0, i64 [[IV_N]]
-; CHECK-NEXT:    store double [[ADD]], ptr [[GEP_0_IV_N]], align 8, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    store double [[ADD]], ptr [[GEP_0_IV_N]], align 8, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_EXT]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT4:%.*]], label [[LOOP]]
diff --git a/llvm/test/Transforms/NewGVN/loadforward.ll b/llvm/test/Transforms/NewGVN/loadforward.ll
index 85ceafd433f45a..a44a6e92b8adf5 100644
--- a/llvm/test/Transforms/NewGVN/loadforward.ll
+++ b/llvm/test/Transforms/NewGVN/loadforward.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ;; Test that we forward the first store to the second load
 define i16 @bazinga() {
 ; CHECK-LABEL: @bazinga(
-; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, ptr getelementptr inbounds ([[REC11:%.*]], ptr @str, i64 0, i32 1), align 2
+; CHECK-NEXT:    [[_TMP10:%.*]] = load i16, ptr getelementptr inbounds (i8, ptr @str, i64 2), align 2
 ; CHECK-NEXT:    store i16 [[_TMP10]], ptr @str, align 2
 ; CHECK-NEXT:    [[_TMP15:%.*]] = icmp eq i16 [[_TMP10]], 3
 ; CHECK-NEXT:    [[_TMP16:%.*]] = select i1 [[_TMP15]], i16 1, i16 0
diff --git a/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll b/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll
index 5fe267d62f9333..43fd8bd59b8d34 100644
--- a/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll
+++ b/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll
@@ -20,35 +20,35 @@ define dso_local zeroext i32 @foo(ptr noundef %a) #0 {
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY4]] ]
 ; CHECK-NEXT:    [[SUM_11:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_7:%.*]], [[FOR_BODY4]] ]
 ; CHECK-NEXT:    [[IDX_NEG:%.*]] = sub nsw i64 0, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[IDX_NEG]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[IDX_NEG]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA3:![0-9]+]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[SUM_11]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_NEG:%.*]] = xor i64 [[INDVARS_IV]], -1
-; CHECK-NEXT:    [[ADD_PTR_110:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_110:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_NEG]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_110]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_111:%.*]] = add i32 [[TMP1]], [[ADD]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_112_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR_217:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_112_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_217:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_112_NEG]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ADD_PTR_217]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_218:%.*]] = add i32 [[TMP2]], [[ADD_111]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_219_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_219_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_219_NEG]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ADD_PTR_3]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[TMP3]], [[ADD_218]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3_NEG:%.*]] = sub nuw nsw i64 -4, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_3_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_3_NEG]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ADD_PTR_4]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[TMP4]], [[ADD_3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_4_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_4_NEG]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ADD_PTR_5]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[TMP5]], [[ADD_4]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_5_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_5_NEG]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_6]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[TMP6]], [[ADD_5]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV]]
-; CHECK-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_6_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_6_NEG]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ADD_PTR_7]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[ADD_7]] = add i32 [[TMP7]], [[ADD_6]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
@@ -58,34 +58,34 @@ define dso_local zeroext i32 @foo(ptr noundef %a) #0 {
 ; CHECK-NEXT:    [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1_7:%.*]], [[FOR_BODY4_1]] ], [ 0, [[FOR_BODY4]] ]
 ; CHECK-NEXT:    [[SUM_11_1:%.*]] = phi i32 [ [[ADD_1_7:%.*]], [[FOR_BODY4_1]] ], [ [[ADD_7]], [[FOR_BODY4]] ]
 ; CHECK-NEXT:    [[IDX_NEG_1:%.*]] = sub nsw i64 0, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[IDX_NEG_1]]
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[IDX_NEG_1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ADD_PTR_1]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_NEG:%.*]] = xor i64 [[INDVARS_IV_1]], -1
-; CHECK-NEXT:    [[ADD_PTR_1_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_NEG]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ADD_PTR_1_1]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_1_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_1_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_1_NEG]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ADD_PTR_1_2]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_2_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_2_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_2_NEG]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ADD_PTR_1_3]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_3_NEG:%.*]] = sub nuw nsw i64 -4, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_3_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_3_NEG]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ADD_PTR_1_4]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_4_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_4_NEG]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ADD_PTR_1_5]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_5_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_5_NEG]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ADD_PTR_1_6]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP18]], [[TMP19]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV_1]]
-; CHECK-NEXT:    [[ADD_PTR_1_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_1_6_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_1_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_6_NEG]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ADD_PTR_1_7]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 1
@@ -97,42 +97,42 @@ define dso_local zeroext i32 @foo(ptr noundef %a) #0 {
 ; CHECK-NEXT:    [[INDVARS_IV_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_2_7:%.*]], [[FOR_BODY4_2]] ], [ 0, [[FOR_BODY4_1]] ]
 ; CHECK-NEXT:    [[SUM_11_2:%.*]] = phi i32 [ [[ADD_2_7:%.*]], [[FOR_BODY4_2]] ], [ [[ADD_1_7]], [[FOR_BODY4_1]] ]
 ; CHECK-NEXT:    [[IDX_NEG_2:%.*]] = sub nsw i64 0, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[IDX_NEG_2]]
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[IDX_NEG_2]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ADD_PTR_2]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP24]], 3
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[MUL_2]], [[SUM_11_2]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_NEG:%.*]] = xor i64 [[INDVARS_IV_2]], -1
-; CHECK-NEXT:    [[ADD_PTR_2_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_NEG]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ADD_PTR_2_1]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_1:%.*]] = mul i32 [[TMP25]], 3
 ; CHECK-NEXT:    [[ADD_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD_2]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_1_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_1_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_1_NEG]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ADD_PTR_2_2]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_2:%.*]] = mul i32 [[TMP26]], 3
 ; CHECK-NEXT:    [[ADD_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD_2_1]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_2_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_2_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_2_NEG]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ADD_PTR_2_3]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_3:%.*]] = mul i32 [[TMP27]], 3
 ; CHECK-NEXT:    [[ADD_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD_2_2]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_3_NEG:%.*]] = sub nuw nsw i64 -4, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_3_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_4:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_3_NEG]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ADD_PTR_2_4]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_4:%.*]] = mul i32 [[TMP28]], 3
 ; CHECK-NEXT:    [[ADD_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD_2_3]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_4_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_4_NEG]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[ADD_PTR_2_5]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_5:%.*]] = mul i32 [[TMP29]], 3
 ; CHECK-NEXT:    [[ADD_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD_2_4]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_5_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_5_NEG]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[ADD_PTR_2_6]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_6:%.*]] = mul i32 [[TMP30]], 3
 ; CHECK-NEXT:    [[ADD_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD_2_5]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV_2]]
-; CHECK-NEXT:    [[ADD_PTR_2_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds ([100 x i32], ptr @ARR, i64 0, i64 99), i64 [[INDVARS_IV_NEXT_2_6_NEG]]
+; CHECK-NEXT:    [[ADD_PTR_2_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_2_6_NEG]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ADD_PTR_2_7]], align 4, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    [[MUL_2_7:%.*]] = mul i32 [[TMP31]], 3
 ; CHECK-NEXT:    [[ADD_2_7]] = add i32 [[MUL_2_7]], [[ADD_2_6]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
index 741e3ad4f7b980..ed25734c8448fb 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll
@@ -13,129 +13,129 @@ define void @test_known_trip_count() {
 ; CHECK-LABEL: @test_known_trip_count(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr @b, align 16
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 2), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 16), align 16
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr @c, align 16
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 2), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 16), align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD4]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> [[WIDE_LOAD3]], [[WIDE_LOAD5]]
 ; CHECK-NEXT:    store <2 x double> [[TMP0]], ptr @a, align 16
-; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 2), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 6), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 4), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 6), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr getelementptr inbounds (i8, ptr @a, i64 16), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 32), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_1:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 48), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_1:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 32), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_1:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 48), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD_1]], [[WIDE_LOAD4_1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[WIDE_LOAD3_1]], [[WIDE_LOAD5_1]]
-; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 4), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 6), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 10), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 8), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 10), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr getelementptr inbounds (i8, ptr @a, i64 32), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr getelementptr inbounds (i8, ptr @a, i64 48), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 64), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_2:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 80), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_2:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 64), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_2:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 80), align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD_2]], [[WIDE_LOAD4_2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD3_2]], [[WIDE_LOAD5_2]]
-; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 8), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 10), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 12), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 14), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 12), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 14), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr getelementptr inbounds (i8, ptr @a, i64 64), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP5]], ptr getelementptr inbounds (i8, ptr @a, i64 80), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 96), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_3:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 112), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_3:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 96), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_3:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 112), align 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[WIDE_LOAD_3]], [[WIDE_LOAD4_3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD3_3]], [[WIDE_LOAD5_3]]
-; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 12), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 14), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 16), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 18), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 16), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 18), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr getelementptr inbounds (i8, ptr @a, i64 96), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP7]], ptr getelementptr inbounds (i8, ptr @a, i64 112), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_4:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 128), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_4:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 144), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_4:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 128), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_4:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 144), align 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x double> [[WIDE_LOAD_4]], [[WIDE_LOAD4_4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[WIDE_LOAD3_4]], [[WIDE_LOAD5_4]]
-; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 16), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 18), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 20), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 22), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 20), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 22), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP8]], ptr getelementptr inbounds (i8, ptr @a, i64 128), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr getelementptr inbounds (i8, ptr @a, i64 144), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_5:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 160), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_5:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 176), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_5:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 160), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_5:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 176), align 16
 ; CHECK-NEXT:    [[TMP10:%.*]] = fadd <2 x double> [[WIDE_LOAD_5]], [[WIDE_LOAD4_5]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[WIDE_LOAD3_5]], [[WIDE_LOAD5_5]]
-; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 20), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP11]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 22), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 24), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 26), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 24), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 26), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr getelementptr inbounds (i8, ptr @a, i64 160), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP11]], ptr getelementptr inbounds (i8, ptr @a, i64 176), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_6:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 192), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_6:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 208), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_6:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 192), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_6:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 208), align 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[WIDE_LOAD_6]], [[WIDE_LOAD4_6]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[WIDE_LOAD3_6]], [[WIDE_LOAD5_6]]
-; CHECK-NEXT:    store <2 x double> [[TMP12]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 24), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 26), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 28), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 30), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 28), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 30), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP12]], ptr getelementptr inbounds (i8, ptr @a, i64 192), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP13]], ptr getelementptr inbounds (i8, ptr @a, i64 208), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_7:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 224), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_7:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 240), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_7:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 224), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_7:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 240), align 16
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <2 x double> [[WIDE_LOAD_7]], [[WIDE_LOAD4_7]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[WIDE_LOAD3_7]], [[WIDE_LOAD5_7]]
-; CHECK-NEXT:    store <2 x double> [[TMP14]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 28), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP15]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 30), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 32), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 34), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 32), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 34), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP14]], ptr getelementptr inbounds (i8, ptr @a, i64 224), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP15]], ptr getelementptr inbounds (i8, ptr @a, i64 240), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_8:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 256), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_8:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 272), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_8:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 256), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_8:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 272), align 16
 ; CHECK-NEXT:    [[TMP16:%.*]] = fadd <2 x double> [[WIDE_LOAD_8]], [[WIDE_LOAD4_8]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> [[WIDE_LOAD3_8]], [[WIDE_LOAD5_8]]
-; CHECK-NEXT:    store <2 x double> [[TMP16]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 32), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP17]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 34), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 36), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 38), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 36), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 38), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP16]], ptr getelementptr inbounds (i8, ptr @a, i64 256), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP17]], ptr getelementptr inbounds (i8, ptr @a, i64 272), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_9:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 288), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_9:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 304), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_9:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 288), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_9:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 304), align 16
 ; CHECK-NEXT:    [[TMP18:%.*]] = fadd <2 x double> [[WIDE_LOAD_9]], [[WIDE_LOAD4_9]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = fadd <2 x double> [[WIDE_LOAD3_9]], [[WIDE_LOAD5_9]]
-; CHECK-NEXT:    store <2 x double> [[TMP18]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 36), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP19]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 38), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 40), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 42), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 40), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 42), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP18]], ptr getelementptr inbounds (i8, ptr @a, i64 288), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP19]], ptr getelementptr inbounds (i8, ptr @a, i64 304), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_10:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 320), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_10:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 336), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_10:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 320), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_10:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 336), align 16
 ; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x double> [[WIDE_LOAD_10]], [[WIDE_LOAD4_10]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[WIDE_LOAD3_10]], [[WIDE_LOAD5_10]]
-; CHECK-NEXT:    store <2 x double> [[TMP20]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 40), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP21]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 42), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 44), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 46), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 44), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 46), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP20]], ptr getelementptr inbounds (i8, ptr @a, i64 320), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP21]], ptr getelementptr inbounds (i8, ptr @a, i64 336), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_11:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 352), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_11:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 368), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_11:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 352), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_11:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 368), align 16
 ; CHECK-NEXT:    [[TMP22:%.*]] = fadd <2 x double> [[WIDE_LOAD_11]], [[WIDE_LOAD4_11]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = fadd <2 x double> [[WIDE_LOAD3_11]], [[WIDE_LOAD5_11]]
-; CHECK-NEXT:    store <2 x double> [[TMP22]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 44), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP23]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 46), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 48), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 50), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 48), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 50), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP22]], ptr getelementptr inbounds (i8, ptr @a, i64 352), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP23]], ptr getelementptr inbounds (i8, ptr @a, i64 368), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_12:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 384), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_12:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 400), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_12:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 384), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_12:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 400), align 16
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <2 x double> [[WIDE_LOAD_12]], [[WIDE_LOAD4_12]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <2 x double> [[WIDE_LOAD3_12]], [[WIDE_LOAD5_12]]
-; CHECK-NEXT:    store <2 x double> [[TMP24]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 48), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 50), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 52), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 54), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 52), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 54), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP24]], ptr getelementptr inbounds (i8, ptr @a, i64 384), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP25]], ptr getelementptr inbounds (i8, ptr @a, i64 400), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_13:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 416), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_13:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 432), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_13:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 416), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_13:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 432), align 16
 ; CHECK-NEXT:    [[TMP26:%.*]] = fadd <2 x double> [[WIDE_LOAD_13]], [[WIDE_LOAD4_13]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd <2 x double> [[WIDE_LOAD3_13]], [[WIDE_LOAD5_13]]
-; CHECK-NEXT:    store <2 x double> [[TMP26]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 52), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP27]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 54), align 16
-; CHECK-NEXT:    [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 56), align 16
-; CHECK-NEXT:    [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 0), align 16
-; CHECK-NEXT:    [[WIDE_LOAD4_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 56), align 16
-; CHECK-NEXT:    [[WIDE_LOAD5_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 0), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP26]], ptr getelementptr inbounds (i8, ptr @a, i64 416), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP27]], ptr getelementptr inbounds (i8, ptr @a, i64 432), align 16
+; CHECK-NEXT:    [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 448), align 16
+; CHECK-NEXT:    [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @b, i64 464), align 16
+; CHECK-NEXT:    [[WIDE_LOAD4_14:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 448), align 16
+; CHECK-NEXT:    [[WIDE_LOAD5_14:%.*]] = load <2 x double>, ptr getelementptr inbounds (i8, ptr @c, i64 464), align 16
 ; CHECK-NEXT:    [[TMP28:%.*]] = fadd <2 x double> [[WIDE_LOAD_14]], [[WIDE_LOAD4_14]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = fadd <2 x double> [[WIDE_LOAD3_14]], [[WIDE_LOAD5_14]]
-; CHECK-NEXT:    store <2 x double> [[TMP28]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 56), align 16
-; CHECK-NEXT:    store <2 x double> [[TMP29]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 1, i64 0), align 16
-; CHECK-NEXT:    [[TMP30:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 2), align 16
-; CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 2), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP28]], ptr getelementptr inbounds (i8, ptr @a, i64 448), align 16
+; CHECK-NEXT:    store <2 x double> [[TMP29]], ptr getelementptr inbounds (i8, ptr @a, i64 464), align 16
+; CHECK-NEXT:    [[TMP30:%.*]] = load double, ptr getelementptr inbounds (i8, ptr @b, i64 480), align 16
+; CHECK-NEXT:    [[TMP31:%.*]] = load double, ptr getelementptr inbounds (i8, ptr @c, i64 480), align 16
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP30]], [[TMP31]]
-; CHECK-NEXT:    store double [[ADD]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 1, i64 2), align 16
+; CHECK-NEXT:    store double [[ADD]], ptr getelementptr inbounds (i8, ptr @a, i64 480), align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SCCP/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/SCCP/2009-09-24-byval-ptr.ll
index 34ef4349c786c0..ac2e945b125baf 100644
--- a/llvm/test/Transforms/SCCP/2009-09-24-byval-ptr.ll
+++ b/llvm/test/Transforms/SCCP/2009-09-24-byval-ptr.ll
@@ -31,7 +31,7 @@ return:                                           ; preds = %entry
 define internal i32 @vfu2(ptr byval(%struct.MYstr) align 4 %u) nounwind readonly {
 ; CHECK-LABEL: @vfu2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr getelementptr inbounds ([[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @mystr, i64 4), align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr @mystr, align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], [[TMP0]]
diff --git a/llvm/test/Transforms/SCCP/apint-bigint2.ll b/llvm/test/Transforms/SCCP/apint-bigint2.ll
index 6092c092bea5cb..695d6a4cf056f8 100644
--- a/llvm/test/Transforms/SCCP/apint-bigint2.ll
+++ b/llvm/test/Transforms/SCCP/apint-bigint2.ll
@@ -23,7 +23,7 @@ define i101 @large_aggregate() {
 ; CHECK-LABEL: @large_aggregate(
 ; CHECK-NEXT:    [[D:%.*]] = and i101 undef, 1
 ; CHECK-NEXT:    [[DD:%.*]] = or i101 [[D]], 1
-; CHECK-NEXT:    [[G:%.*]] = getelementptr i101, ptr getelementptr inbounds ([6 x i101], ptr @Y, i64 0, i64 5), i101 [[DD]]
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i101, ptr getelementptr inbounds (i8, ptr @Y, i64 80), i101 [[DD]]
 ; CHECK-NEXT:    [[L3:%.*]] = load i101, ptr [[G]], align 4
 ; CHECK-NEXT:    ret i101 [[L3]]
 ;
@@ -40,7 +40,7 @@ define i101 @large_aggregate_2() {
 ; CHECK-LABEL: @large_aggregate_2(
 ; CHECK-NEXT:    [[D:%.*]] = and i101 undef, 1
 ; CHECK-NEXT:    [[DD:%.*]] = or i101 [[D]], 1
-; CHECK-NEXT:    [[G:%.*]] = getelementptr i101, ptr getelementptr inbounds ([6 x i101], ptr @Y, i64 0, i64 5), i101 [[DD]]
+; CHECK-NEXT:    [[G:%.*]] = getelementptr i101, ptr getelementptr inbounds (i8, ptr @Y, i64 80), i101 [[DD]]
 ; CHECK-NEXT:    [[L3:%.*]] = load i101, ptr [[G]], align 4
 ; CHECK-NEXT:    ret i101 [[L3]]
 ;
@@ -54,7 +54,7 @@ define i101 @large_aggregate_2() {
 
 define void @index_too_large() {
 ; CHECK-LABEL: @index_too_large(
-; CHECK-NEXT:    store ptr getelementptr ([6 x i101], ptr @Y, i64 187649984473770, i64 2), ptr undef, align 8
+; CHECK-NEXT:    store ptr getelementptr (i8, ptr @Y, i64 18014398509481952), ptr undef, align 8
 ; CHECK-NEXT:    ret void
 ;
   %ptr1 = getelementptr [6 x i101], ptr @Y, i32 0, i32 -1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 2ea47216925037..45030a0965e006 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -61,16 +61,16 @@ define void @gather_load(ptr noalias %ptr) {
 ; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 6
 ; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
-; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0), align 1
+; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @data, i64 258), align 1
 ; CHECK-NEXT:    [[CONV150:%.*]] = zext i8 [[L0]] to i16
 ; CHECK-NEXT:    [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 2, i64 1), align 1
+; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @data, i64 517), align 1
 ; CHECK-NEXT:    [[CONV156:%.*]] = zext i8 [[L1]] to i16
 ; CHECK-NEXT:    [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 3, i64 2), align 1
+; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @data, i64 776), align 1
 ; CHECK-NEXT:    [[CONV162:%.*]] = zext i8 [[L2]] to i16
 ; CHECK-NEXT:    [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 4, i64 3), align 1
+; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr @data, i64 1035), align 1
 ; CHECK-NEXT:    [[CONV168:%.*]] = zext i8 [[L3]] to i16
 ; CHECK-NEXT:    [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
 ; CHECK-NEXT:    store i16 [[ADD152]], ptr [[ARRAYIDX182]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
index c46a5aa758fb37..892a2b6cee3be9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -13,32 +13,32 @@ define void @foo() {
 ; SSE-LABEL: @foo(
 ; SSE-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
 ; SSE-NEXT:    store i32 [[TMP1]], ptr @a, align 16
-; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @b, i64 8), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a, i64 4), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds (i8, ptr @a, i64 8), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a, i64 12), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds (i8, ptr @a, i64 16), align 16
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a, i64 20), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds (i8, ptr @a, i64 24), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds (i8, ptr @a, i64 28), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @foo(
 ; AVX-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
-; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
+; AVX-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @b, i64 8), align 8
 ; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
 ; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; AVX-NEXT:    store <8 x i32> [[SHUFFLE]], ptr @a, align 16
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX-NEXT:    store <8 x i32> [[TMP5]], ptr @a, align 16
 ; AVX-NEXT:    ret void
 ;
 ; AVX512-LABEL: @foo(
 ; AVX512-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
-; AVX512-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds (i8, ptr @b, i64 8), align 8
 ; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0
 ; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1
-; AVX512-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
-; AVX512-NEXT:    store <8 x i32> [[SHUFFLE]], ptr @a, align 16
+; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; AVX512-NEXT:    store <8 x i32> [[TMP5]], ptr @a, align 16
 ; AVX512-NEXT:    ret void
 ;
   %1 = load i32, ptr @b, align 16

From 21419071e197ca2f813a983c319a2213844dc72d Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <r@artagnon.com>
Date: Mon, 20 May 2024 10:47:51 +0100
Subject: [PATCH 54/60] InstSimplify: increase shufflevector test coverage
 (#92407)

Add examples of patterns that can be simplified, but are currently not.
This patch serves as a pre-commit test.
---
 .../Transforms/InstSimplify/shufflevector.ll  | 236 +++++++++++++++++-
 1 file changed, 230 insertions(+), 6 deletions(-)

diff --git a/llvm/test/Transforms/InstSimplify/shufflevector.ll b/llvm/test/Transforms/InstSimplify/shufflevector.ll
index 460e90aa31d913..64087194b0d17f 100644
--- a/llvm/test/Transforms/InstSimplify/shufflevector.ll
+++ b/llvm/test/Transforms/InstSimplify/shufflevector.ll
@@ -249,13 +249,13 @@ define <8 x i64> @PR30630(<8 x i64> %x) {
 ;         ret <2 x float> zeroinitializer
 define <2 x float> @PR32872(<2 x float> %x) {
 ; CHECK-LABEL: @PR32872(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> zeroinitializer, <4 x float> [[TMP1]], <2 x i32> <i32 4, i32 5>
-; CHECK-NEXT:    ret <2 x float> [[TMP4]]
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+; CHECK-NEXT:    [[SHUF2:%.*]] = shufflevector <4 x float> zeroinitializer, <4 x float> [[SHUF]], <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    ret <2 x float> [[SHUF2]]
 ;
-  %tmp1 = shufflevector <2 x float> %x, <2 x float> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
-  %tmp4 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <2 x i32> <i32 4, i32 5>
-  ret <2 x float> %tmp4
+  %shuf = shufflevector <2 x float> %x, <2 x float> zeroinitializer, <4 x i32> <i32 2, i32 2, i32 0, i32 1>
+  %shuf2 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuf, <2 x i32> <i32 4, i32 5>
+  ret <2 x float> %shuf2
 }
 
 define <5 x i8> @splat_inserted_constant(<4 x i8> %x) {
@@ -284,3 +284,227 @@ define <2 x i8> @splat_inserted_constant_not_canonical(<3 x i8> %x, <3 x i8> %y)
   %splat2 = shufflevector <3 x i8> %y, <3 x i8> %ins2, <2 x i32> <i32 undef, i32 5>
   ret <2 x i8> %splat2
 }
+
+define <4 x i32> @fold_identity(<4 x i32> %x) {
+; CHECK-LABEL: @fold_identity(
+; CHECK-NEXT:    ret <4 x i32> [[X:%.*]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %revshuf = shufflevector <4 x i32> %shuf, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i32> @fold_identity2(<4 x i32> %x) {
+; CHECK-LABEL: @fold_identity2(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHL]]
+;
+  %shl = shl <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+  %shuf = shufflevector <4 x i32> %shl, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %revshuf = shufflevector <4 x i32> %shuf, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i32> @fold_identity3(<4 x i32> %x) {
+; CHECK-LABEL: @fold_identity3(
+; CHECK-NEXT:    [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[X]]
+; CHECK-NEXT:    ret <4 x i32> [[SHL]]
+;
+  %shl = shl <4 x i32> %x, %x
+  %shuf = shufflevector <4 x i32> %shl, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %revshuf = shufflevector <4 x i32> %shuf, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i32> @not_fold_identity(<4 x i32> %x) {
+; CHECK-LABEL: @not_fold_identity(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[SHUF]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %revshuf = shufflevector <4 x i32> %shuf, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i32> @not_fold_identity2(<4 x i32> %x) {
+; CHECK-LABEL: @not_fold_identity2(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[SHUF]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+  %revshuf = shufflevector <4 x i32> %shuf, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i64> @fold_lookthrough_cast(<4 x i32> %x) {
+; CHECK-LABEL: @fold_lookthrough_cast(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[SHUF]] to <4 x i64>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %zext = zext <4 x i32> %shuf to <4 x i64>
+  %revshuf = shufflevector <4 x i64> %zext, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %revshuf
+}
+
+define <4 x i64> @not_fold_lookthrough_cast(<4 x i32> %x) {
+; CHECK-LABEL: @not_fold_lookthrough_cast(
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
+;
+  %zext = zext <4 x i32> %x to <4 x i64>
+  %revshuf = shufflevector <4 x i64> %zext, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %revshuf
+}
+
+define <4 x i64> @not_fold_lookthrough_cast2(<4 x i32> %x) {
+; CHECK-LABEL: @not_fold_lookthrough_cast2(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[SHUF]] to <4 x i64>
+; CHECK-NEXT:    ret <4 x i64> [[ZEXT]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %zext = zext <4 x i32> %shuf to <4 x i64>
+  ret <4 x i64> %zext
+}
+
+define i32 @not_fold_lookthrough_bitcast(<4 x i8> %x) {
+; CHECK-LABEL: @not_fold_lookthrough_bitcast(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <4 x i8> [[SHUF]] to i32
+; CHECK-NEXT:    ret i32 [[BITCAST]]
+;
+  %shuf = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bitcast = bitcast <4 x i8> %shuf to i32
+  ret i32 %bitcast
+}
+
+define <8 x i16> @not_fold_lookthrough_bitcast2(<4 x i32> %x, <8 x i16> %y) {
+; CHECK-LABEL: @not_fold_lookthrough_bitcast2(
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[OUT:%.*]] = shufflevector <8 x i16> [[Y:%.*]], <8 x i16> [[CAST]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+; CHECK-NEXT:    ret <8 x i16> [[OUT]]
+;
+  %cast = bitcast <4 x i32> %x to <8 x i16>
+  %out = shufflevector <8 x i16> %y, <8 x i16> %cast, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x i16> %out
+}
+
+define <4 x i32> @fold_lookthrough_binop_same_operands(<4 x i32> %x) {
+; CHECK-LABEL: @fold_lookthrough_binop_same_operands(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %add = add <4 x i32> %shuf, %shuf
+  %revshuf = shufflevector <4 x i32> %add, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i32> @fold_lookthrough_binop_different_operands(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @fold_lookthrough_binop_different_operands(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[Y:%.*]]
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %add = add <4 x i32> %shuf, %y
+  %revshuf = shufflevector <4 x i32> %add, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i32> @fold_lookthrough_binop_multiuse(<4 x i32> %x) {
+; CHECK-LABEL: @fold_lookthrough_binop_multiuse(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ADD2:%.*]] = add <4 x i32> [[SHUF]], [[REVSHUF]]
+; CHECK-NEXT:    ret <4 x i32> [[ADD2]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %add = add <4 x i32> %shuf, %shuf
+  %revshuf = shufflevector <4 x i32> %add, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %add2 = add <4 x i32> %shuf, %revshuf
+  ret <4 x i32> %add2
+}
+
+define <4 x i64> @fold_lookthrough_cast_chain(<4 x i16> %x) {
+; CHECK-LABEL: @fold_lookthrough_cast_chain(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i16> [[X:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i16> [[SHUF]] to <4 x i32>
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i32> [[ZEXT]] to <4 x i64>
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[SEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %zext = zext <4 x i16> %shuf to <4 x i32>
+  %sext = sext <4 x i32> %zext to <4 x i64>
+  %revshuf = shufflevector <4 x i64> %sext, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %revshuf
+}
+
+define <4 x i32> @fold_lookthrough_binop_chain(<4 x i32> %x) {
+; CHECK-LABEL: @fold_lookthrough_binop_chain(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[SHUF]], [[SHUF]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add <4 x i32> [[ADD]], [[ADD]]
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[ADD2]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %add = add <4 x i32> %shuf, %shuf
+  %add2 = add <4 x i32> %add, %add
+  %revshuf = shufflevector <4 x i32> %add2, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i32> %revshuf
+}
+
+define <4 x i64> @fold_lookthrough_cast_binop_chain(<4 x i32> %x) {
+; CHECK-LABEL: @fold_lookthrough_cast_binop_chain(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[SHUF]] to <4 x i64>
+; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i64> [[ZEXT]], [[ZEXT]]
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ADD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x i64> [[REVSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %zext = zext <4 x i32> %shuf to <4 x i64>
+  %add = add <4 x i64> %zext, %zext
+  %revshuf = shufflevector <4 x i64> %add, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %revshuf
+}
+
+define <4 x i64> @not_fold_cast_mismatched_types(<4 x i32> %x) {
+; CHECK-LABEL: @not_fold_cast_mismatched_types(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <2 x i32> [[SHUF]] to <2 x i64>
+; CHECK-NEXT:    [[EXTSHUF:%.*]] = shufflevector <2 x i64> [[ZEXT]], <2 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    ret <4 x i64> [[EXTSHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+  %zext = zext <2 x i32> %shuf to <2 x i64>
+  %extshuf = shufflevector <2 x i64> %zext, <2 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x i64> %extshuf
+}
+
+define <4 x float> @not_fold_binop_mismatched_types(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @not_fold_binop_mismatched_types(
+; CHECK-NEXT:    [[SHUF_X:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SHUF_Y:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[FADD:%.*]] = fadd fast <2 x float> [[SHUF_X]], [[SHUF_Y]]
+; CHECK-NEXT:    [[EXTSHUF:%.*]] = shufflevector <2 x float> [[FADD]], <2 x float> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[EXTSHUF]]
+;
+  %shuf.x = shufflevector <4 x float> %x, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %shuf.y = shufflevector <4 x float> %y, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %fadd = fadd fast <2 x float> %shuf.x, %shuf.y
+  %extshuf = shufflevector <2 x float> %fadd, <2 x float> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %extshuf
+}

From 605ae4e93be8976095c7eedf5c08bfdb9ff71257 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Mon, 20 May 2024 10:58:18 +0100
Subject: [PATCH 55/60] [flang][HLFIR] Adapt SimplifyHLFIRIntrinsics to run on
 all top level ops (#92573)

This means that this pass will also run on hlfir intrinsics which are
not inside of functions.

See RFC:

https://discourse.llvm.org/t/rfc-add-an-interface-for-top-level-container-operations

Some of the changes are from moving the declaration and definition of
the constructor into tablegen (as requested during code review of
another pass).
---
 flang/include/flang/Optimizer/HLFIR/Passes.h          |  1 -
 flang/include/flang/Optimizer/HLFIR/Passes.td         |  3 +--
 flang/include/flang/Tools/CLOptions.inc               |  3 ++-
 .../HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp      | 11 +++--------
 flang/test/Driver/mlir-pass-pipeline.f90              |  9 ++++++++-
 flang/test/Fir/basic-program.fir                      |  7 +++++++
 6 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h
index 3314e0b887f6eb..ef47c94b67a891 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.h
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.h
@@ -25,7 +25,6 @@ namespace hlfir {
 std::unique_ptr<mlir::Pass> createConvertHLFIRtoFIRPass();
 std::unique_ptr<mlir::Pass> createBufferizeHLFIRPass();
 std::unique_ptr<mlir::Pass> createLowerHLFIRIntrinsicsPass();
-std::unique_ptr<mlir::Pass> createSimplifyHLFIRIntrinsicsPass();
 std::unique_ptr<mlir::Pass> createInlineElementalsPass();
 std::unique_ptr<mlir::Pass> createLowerHLFIROrderedAssignmentsPass();
 std::unique_ptr<mlir::Pass> createOptimizedBufferizationPass();
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index dae96b3f767ea1..806d1f202975bf 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -46,9 +46,8 @@ def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::ml
   ];
 }
 
-def SimplifyHLFIRIntrinsics : Pass<"simplify-hlfir-intrinsics", "::mlir::func::FuncOp"> {
+def SimplifyHLFIRIntrinsics : Pass<"simplify-hlfir-intrinsics"> {
   let summary = "Simplify HLFIR intrinsic operations that don't need to result in runtime calls";
-  let constructor = "hlfir::createSimplifyHLFIRIntrinsicsPass()";
 }
 
 def InlineElementals : Pass<"inline-elementals", "::mlir::func::FuncOp"> {
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 61e591f2086df4..e0ab9d5f0429cc 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -317,7 +317,8 @@ inline void createHLFIRToFIRPassPipeline(
     mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) {
   if (optLevel.isOptimizingForSpeed()) {
     addCanonicalizerPassWithoutRegionSimplification(pm);
-    pm.addPass(hlfir::createSimplifyHLFIRIntrinsicsPass());
+    addNestedPassToAllTopLevelOperations(
+        pm, hlfir::createSimplifyHLFIRIntrinsics);
   }
   pm.addPass(hlfir::createInlineElementalsPass());
   if (optLevel.isOptimizingForSpeed()) {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index b761563eba0f88..6153c82fa7347a 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -94,7 +94,6 @@ class SimplifyHLFIRIntrinsics
     : public hlfir::impl::SimplifyHLFIRIntrinsicsBase<SimplifyHLFIRIntrinsics> {
 public:
   void runOnOperation() override {
-    mlir::func::FuncOp func = this->getOperation();
     mlir::MLIRContext *context = &getContext();
     mlir::RewritePatternSet patterns(context);
     patterns.insert<TransposeAsElementalConversion>(context);
@@ -108,16 +107,12 @@ class SimplifyHLFIRIntrinsics
         });
     target.markUnknownOpDynamicallyLegal(
         [](mlir::Operation *) { return true; });
-    if (mlir::failed(
-            mlir::applyFullConversion(func, target, std::move(patterns)))) {
-      mlir::emitError(func->getLoc(),
+    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
+                                               std::move(patterns)))) {
+      mlir::emitError(getOperation()->getLoc(),
                       "failure in HLFIR intrinsic simplification");
       signalPassFailure();
     }
   }
 };
 } // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createSimplifyHLFIRIntrinsicsPass() {
-  return std::make_unique<SimplifyHLFIRIntrinsics>();
-}
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 4ebac7c3fb65c1..7130024e43b9be 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -13,9 +13,16 @@
 
 ! ALL: Fortran::lower::VerifierPass
 ! O2-NEXT: Canonicalizer
-! O2-NEXT: 'func.func' Pipeline
+! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! O2-NEXT: 'fir.global' Pipeline
+! O2-NEXT:   SimplifyHLFIRIntrinsics
+! ALL:     'func.func' Pipeline
 ! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL:       InlineElementals
+! O2-NEXT: 'omp.declare_reduction' Pipeline
+! O2-NEXT:   SimplifyHLFIRIntrinsics
+! O2-NEXT: 'omp.private' Pipeline
+! O2-NEXT:   SimplifyHLFIRIntrinsics
 ! ALL: LowerHLFIROrderedAssignments
 ! ALL-NEXT: LowerHLFIRIntrinsics
 ! ALL-NEXT: BufferizeHLFIR
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 02fb84ed8c873d..9e3d3c18337d97 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -17,9 +17,16 @@ func.func @_QQmain() {
 // PASSES: Pass statistics report
 
 // PASSES:        Canonicalizer
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT: 'fir.global' Pipeline
+// PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT: 'omp.declare_reduction' Pipeline
+// PASSES-NEXT:   SimplifyHLFIRIntrinsics
+// PASSES-NEXT: 'omp.private' Pipeline
+// PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   Canonicalizer
 // PASSES-NEXT:   CSE
 // PASSES-NEXT:    (S) 0 num-cse'd - Number of operations CSE'd

From 1ef081b05c562936fc025dde39b444066d9d470f Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Mon, 20 May 2024 18:54:48 +0900
Subject: [PATCH 56/60] movimm-expand-ldst.mir (d3d6565c2453) requires asserts

---
 llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index 1ec2a00f67690b..72529807d5d54a 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt -debug-only=aarch64-ldst-opt %s -o - | FileCheck %s
+# REQUIRES: asserts
 ---
 name: test_fold_repeating_constant_load
 tracksRegLiveness: true

From 9f449c34278191193f2f2cbc96c333548ad20238 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen@sifive.com>
Date: Mon, 20 May 2024 18:46:30 +0800
Subject: [PATCH 57/60] [SLP] NFC. Use TreeEntry::getOperand if
 setOperandsInOrder is called (#92727)

already.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 72 +++++--------------
 1 file changed, 17 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d21b5e1cc04178..140a1b1ffbafed 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6916,15 +6916,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                    std::nullopt, CurrentOrder);
       LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
 
-      constexpr int NumOps = 2;
-      ValueList VectorOperands[NumOps];
-      for (int I = 0; I < NumOps; ++I) {
-        for (Value *V : VL)
-          VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
-
-        TE->setOperand(I, VectorOperands[I]);
-      }
-      buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
+      TE->setOperandsInOrder();
+      buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
       return;
     }
     case Instruction::Load: {
@@ -7024,14 +7017,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
       TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(I));
-
-        buildTree_rec(Operands, Depth + 1, {TE, I});
-      }
+      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
     case Instruction::ICmp:
@@ -7116,14 +7103,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       }
 
       TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(I));
-
-        buildTree_rec(Operands, Depth + 1, {TE, I});
-      }
+      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
     case Instruction::GetElementPtr: {
@@ -7182,30 +7163,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       return;
     }
     case Instruction::Store: {
-      // Check if the stores are consecutive or if we need to swizzle them.
-      ValueList Operands(VL.size());
-      auto *OIter = Operands.begin();
-      for (Value *V : VL) {
-        auto *SI = cast<StoreInst>(V);
-        *OIter = SI->getValueOperand();
-        ++OIter;
-      }
-      // Check that the sorted pointer operands are consecutive.
-      if (CurrentOrder.empty()) {
-        // Original stores are consecutive and does not require reordering.
-        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                     ReuseShuffleIndices);
-        TE->setOperandsInOrder();
-        buildTree_rec(Operands, Depth + 1, {TE, 0});
-        LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
-      } else {
+      bool Consecutive = CurrentOrder.empty();
+      if (!Consecutive)
         fixupOrderingIndices(CurrentOrder);
-        TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                     ReuseShuffleIndices, CurrentOrder);
-        TE->setOperandsInOrder();
-        buildTree_rec(Operands, Depth + 1, {TE, 0});
+      TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+                                   ReuseShuffleIndices, CurrentOrder);
+      TE->setOperandsInOrder();
+      buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
+      if (Consecutive)
+        LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+      else
         LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
-      }
       return;
     }
     case Instruction::Call: {
@@ -7305,14 +7273,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       }
 
       TE->setOperandsInOrder();
-      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (Value *V : VL)
-          Operands.push_back(cast<Instruction>(V)->getOperand(I));
-
-        buildTree_rec(Operands, Depth + 1, {TE, I});
-      }
+      for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
+        buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
       return;
     }
     default:

From 6733a505a1afb8eb4db2f6d85426d79ff0dc5eee Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof@amd.com>
Date: Mon, 20 May 2024 12:13:36 +0100
Subject: [PATCH 58/60] [MLIR][OpenMP] NFC: Split OpenMP dialect definitions
 (#91741)

This patch splits definitions for the OpenMP dialect into multiple files
to simplify the addition of new features, reduce merge conflicts, make
it easier to understand, etc. The split is based on the structure of the
more mature LLVMIR dialect. More specifically:

- The OpenMP dialect definition is located in OpenMPDialect.td.
- Base classes for OpenMP operations and types, as well as generic
OpenMP types are moved to OpenMPOpBase.td.
- OpenMP enumeration attributes, their case attributes and shared base
classes for these are placed in OpenMPEnums.td.
- Other OpenMP attributes are separated into OpenMPAttrDefs.td.
- OpenMPOps.td only contains operation definitions.

Even though this change should be useful on its own, it is intended as a
precursor to a follow-up PR in which operation arguments and attributes
are split into clause-specific classes which are then shared by all
operations to which they apply to. Without this prior change, that
approach would make the OpenMPOps.td file harder to navigate.
---
 .../mlir/Dialect/OpenMP/OpenMPAttrDefs.td     |  79 ++++++
 .../mlir/Dialect/OpenMP/OpenMPDialect.td      |  22 ++
 .../mlir/Dialect/OpenMP/OpenMPEnums.td        | 211 +++++++++++++++
 .../mlir/Dialect/OpenMP/OpenMPOpBase.td       |  48 ++++
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 245 +-----------------
 .../Dialect/OpenMP/OpenMPOpsInterfaces.td     |   6 +-
 6 files changed, 371 insertions(+), 240 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/OpenMPAttrDefs.td
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.td
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td

diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPAttrDefs.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPAttrDefs.td
new file mode 100644
index 00000000000000..704d0b2220e8a3
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPAttrDefs.td
@@ -0,0 +1,79 @@
+//=== OpenMPAttrDefs.td - OpenMP Attributes definition -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_ATTR_DEFS
+#define OPENMP_ATTR_DEFS
+
+include "mlir/Dialect/OpenMP/OpenMPDialect.td"
+include "mlir/Dialect/OpenMP/OpenMPEnums.td"
+include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td"
+include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/CommonAttrConstraints.td"
+
+class OpenMP_Attr<string name, string attrMnemonic, list<Trait> traits = [],
+                  string baseCppClass = "::mlir::Attribute">
+    : AttrDef<OpenMP_Dialect, name, traits, baseCppClass> {
+  let mnemonic = attrMnemonic;
+}
+
+//===----------------------------------------------------------------------===//
+// DeclareTargetAttr
+//===----------------------------------------------------------------------===//
+
+def DeclareTargetAttr : OpenMP_Attr<"DeclareTarget", "declaretarget"> {
+  let parameters = (ins
+    OptionalParameter<"DeclareTargetDeviceTypeAttr">:$device_type,
+    OptionalParameter<"DeclareTargetCaptureClauseAttr">:$capture_clause
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// FlagsAttr
+//===----------------------------------------------------------------------===//
+
+// Runtime library flags attribute that holds information for lowering to LLVM.
+def FlagsAttr : OpenMP_Attr<"Flags", "flags"> {
+  let parameters = (ins
+    DefaultValuedParameter<"uint32_t", "0">:$debug_kind,
+    DefaultValuedParameter<"bool", "false">:$assume_teams_oversubscription,
+    DefaultValuedParameter<"bool", "false">:$assume_threads_oversubscription,
+    DefaultValuedParameter<"bool", "false">:$assume_no_thread_state,
+    DefaultValuedParameter<"bool", "false">:$assume_no_nested_parallelism,
+    DefaultValuedParameter<"bool", "false">:$no_gpu_lib,
+    DefaultValuedParameter<"uint32_t", "50">:$openmp_device_version
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// TaskDependArrayAttr
+//===----------------------------------------------------------------------===//
+
+def TaskDependArrayAttr
+    : TypedArrayAttrBase<ClauseTaskDependAttr,
+                         ClauseTaskDependAttr.summary # " array"> {
+  let constBuilderCall = ?;
+}
+
+//===----------------------------------------------------------------------===//
+// VersionAttr
+//===----------------------------------------------------------------------===//
+
+def VersionAttr : OpenMP_Attr<"Version", "version"> {
+  let parameters = (ins
+    "uint32_t":$version
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+#endif // OPENMP_ATTR_DEFS
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.td
new file mode 100644
index 00000000000000..459cc78435809c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.td
@@ -0,0 +1,22 @@
+//===- OpenMPDialect.td - OpenMP dialect definition --------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_DIALECT
+#define OPENMP_DIALECT
+
+include "mlir/IR/DialectBase.td"
+
+def OpenMP_Dialect : Dialect {
+  let name = "omp";
+  let cppNamespace = "::mlir::omp";
+  let dependentDialects = ["::mlir::LLVM::LLVMDialect, ::mlir::func::FuncDialect"];
+  let useDefaultAttributePrinterParser = 1;
+  let useDefaultTypePrinterParser = 1;
+}
+
+#endif  // OPENMP_DIALECT
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
new file mode 100644
index 00000000000000..bf3d33819e9a9c
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td
@@ -0,0 +1,211 @@
+//===-- OpenMPEnums.td - OpenMP dialect enum file ----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_ENUMS
+#define OPENMP_ENUMS
+
+include "mlir/Dialect/OpenMP/OpenMPDialect.td"
+include "mlir/IR/EnumAttr.td"
+
+include "mlir/Dialect/OpenMP/OmpCommon.td"
+
+//===----------------------------------------------------------------------===//
+// Base classes for OpenMP enum attributes.
+//===----------------------------------------------------------------------===//
+
+class OpenMP_I32EnumAttr<string name, string summary,
+                         list<I32EnumAttrCase> cases>
+    : I32EnumAttr<name, summary, cases> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::omp";
+}
+
+class OpenMP_BitEnumAttr<string name, string summary,
+                         list<BitEnumAttrCaseBase> cases>
+    : I32BitEnumAttr<name, summary, cases> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::omp";
+}
+
+class OpenMP_EnumAttr<EnumAttrInfo enumInfo, string name>
+    : EnumAttr<OpenMP_Dialect, enumInfo, name>;
+
+
+//===----------------------------------------------------------------------===//
+// capture_clause enum.
+//===----------------------------------------------------------------------===//
+
+def CaptureClauseTo : I32EnumAttrCase<"to", 0>;
+def CaptureClauseLink : I32EnumAttrCase<"link", 1>;
+def CaptureClauseEnter : I32EnumAttrCase<"enter", 2>;
+
+def DeclareTargetCaptureClause : OpenMP_I32EnumAttr<
+    "DeclareTargetCaptureClause",
+    "capture clause", [
+      CaptureClauseTo,
+      CaptureClauseLink,
+      CaptureClauseEnter
+    ]>;
+
+def DeclareTargetCaptureClauseAttr : OpenMP_EnumAttr<DeclareTargetCaptureClause,
+                                                     "capture_clause"> {
+  let assemblyFormat = "`(` $value `)`";
+}
+
+//===----------------------------------------------------------------------===//
+// clause_depend enum.
+//===----------------------------------------------------------------------===//
+
+def ClauseDependSource : I32EnumAttrCase<"dependsource", 0>;
+def ClauseDependSink : I32EnumAttrCase<"dependsink", 1>;
+
+def ClauseDepend : OpenMP_I32EnumAttr<
+    "ClauseDepend",
+    "depend clause", [
+      ClauseDependSource,
+      ClauseDependSink
+    ]>;
+
+def ClauseDependAttr : OpenMP_EnumAttr<ClauseDepend, "clause_depend"> {
+  let assemblyFormat = "`(` $value `)`";
+}
+
+//===----------------------------------------------------------------------===//
+// clause_requires enum.
+//===----------------------------------------------------------------------===//
+
+// atomic_default_mem_order clause values not defined here because they can be
+// represented by the OMPC_MemoryOrder enumeration instead.
+def ClauseRequiresNone : I32BitEnumAttrCaseNone<"none">;
+def ClauseRequiresReverseOffload : I32BitEnumAttrCaseBit<"reverse_offload", 0>;
+def ClauseRequiresUnifiedAddress : I32BitEnumAttrCaseBit<"unified_address", 1>;
+def ClauseRequiresUnifiedSharedMemory
+    : I32BitEnumAttrCaseBit<"unified_shared_memory", 2>;
+def ClauseRequiresDynamicAllocators
+    : I32BitEnumAttrCaseBit<"dynamic_allocators", 3>;
+
+def ClauseRequires : OpenMP_BitEnumAttr<
+    "ClauseRequires",
+    "requires clauses", [
+      ClauseRequiresNone,
+      ClauseRequiresReverseOffload,
+      ClauseRequiresUnifiedAddress,
+      ClauseRequiresUnifiedSharedMemory,
+      ClauseRequiresDynamicAllocators
+    ]>;
+
+def ClauseRequiresAttr : OpenMP_EnumAttr<ClauseRequires, "clause_requires">;
+
+//===----------------------------------------------------------------------===//
+// clause_task_depend enum.
+//===----------------------------------------------------------------------===//
+
+def ClauseTaskDependIn : I32EnumAttrCase<"taskdependin", 0>;
+def ClauseTaskDependOut : I32EnumAttrCase<"taskdependout", 1>;
+def ClauseTaskDependInOut : I32EnumAttrCase<"taskdependinout", 2>;
+
+def ClauseTaskDepend : OpenMP_I32EnumAttr<
+    "ClauseTaskDepend",
+    "depend clause in a target or task construct", [
+      ClauseTaskDependIn,
+      ClauseTaskDependOut,
+      ClauseTaskDependInOut
+    ]>;
+
+def ClauseTaskDependAttr : OpenMP_EnumAttr<ClauseTaskDepend,
+                                           "clause_task_depend"> {
+  let assemblyFormat = "`(` $value `)`";
+}
+
+//===----------------------------------------------------------------------===//
+// data_sharing_type enum.
+//===----------------------------------------------------------------------===//
+
+def DataSharingTypePrivate : I32EnumAttrCase<"Private", 0, "private">;
+def DataSharingTypeFirstPrivate
+    : I32EnumAttrCase<"FirstPrivate", 1, "firstprivate">;
+
+def DataSharingClauseType : OpenMP_I32EnumAttr<
+    "DataSharingClauseType",
+    "Type of a data-sharing clause", [
+      DataSharingTypePrivate,
+      DataSharingTypeFirstPrivate
+    ]>;
+
+def DataSharingClauseTypeAttr : OpenMP_EnumAttr<DataSharingClauseType,
+                                                "data_sharing_type"> {
+  let assemblyFormat = "`{` `type` `=` $value `}`";
+}
+
+//===----------------------------------------------------------------------===//
+// device_type enum.
+//===----------------------------------------------------------------------===//
+
+def DeviceTypeAny : I32EnumAttrCase<"any", 0>;
+def DeviceTypeHost : I32EnumAttrCase<"host", 1>;
+def DeviceTypeNoHost : I32EnumAttrCase<"nohost", 2>;
+
+def DeclareTargetDeviceType : OpenMP_I32EnumAttr<
+    "DeclareTargetDeviceType",
+    "device_type clause", [
+      DeviceTypeAny,
+      DeviceTypeHost,
+      DeviceTypeNoHost
+    ]>;
+
+def DeclareTargetDeviceTypeAttr : OpenMP_EnumAttr<DeclareTargetDeviceType,
+                                                  "device_type"> {
+  let assemblyFormat = "`(` $value `)`";
+}
+
+//===----------------------------------------------------------------------===//
+// sched_mod enum.
+//===----------------------------------------------------------------------===//
+
+def OpenMP_ScheduleModNone : I32EnumAttrCase<"none", 0>;
+def OpenMP_ScheduleModMonotonic : I32EnumAttrCase<"monotonic", 1>;
+def OpenMP_ScheduleModNonmonotonic : I32EnumAttrCase<"nonmonotonic", 2>;
+// FIXME: remove this value for the modifier because this is handled using a
+// separate attribute
+def OpenMP_ScheduleModSimd : I32EnumAttrCase<"simd", 3>;
+
+def ScheduleModifier : OpenMP_I32EnumAttr<
+    "ScheduleModifier",
+    "OpenMP Schedule Modifier", [
+      OpenMP_ScheduleModNone,
+      OpenMP_ScheduleModMonotonic,
+      OpenMP_ScheduleModNonmonotonic,
+      OpenMP_ScheduleModSimd
+    ]>;
+
+def ScheduleModifierAttr : OpenMP_EnumAttr<ScheduleModifier, "sched_mod">;
+
+//===----------------------------------------------------------------------===//
+// variable_capture_kind enum.
+//===----------------------------------------------------------------------===//
+
+def CaptureThis : I32EnumAttrCase<"This", 0>;
+def CaptureByRef : I32EnumAttrCase<"ByRef", 1>;
+def CaptureByCopy : I32EnumAttrCase<"ByCopy", 2>;
+def CaptureVLAType : I32EnumAttrCase<"VLAType", 3>;
+
+def VariableCaptureKind : OpenMP_I32EnumAttr<
+    "VariableCaptureKind",
+    "variable capture kind", [
+      CaptureThis,
+      CaptureByRef,
+      CaptureByCopy,
+      CaptureVLAType
+    ]>;
+
+def VariableCaptureKindAttr : OpenMP_EnumAttr<VariableCaptureKind,
+                                              "variable_capture_kind"> {
+  let assemblyFormat = "`(` $value `)`";
+}
+
+#endif // OPENMP_ENUMS
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td
new file mode 100644
index 00000000000000..b98d87aa74a6f6
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpBase.td
@@ -0,0 +1,48 @@
+//===- OpenMPOpBase.td - OpenMP dialect shared definitions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains shared definitions for the OpenMP dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_OP_BASE
+#define OPENMP_OP_BASE
+
+include "mlir/Dialect/OpenMP/OpenMPAttrDefs.td"
+include "mlir/Dialect/OpenMP/OpenMPDialect.td"
+include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td"
+include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+//===----------------------------------------------------------------------===//
+// OpenMP dialect type constraints.
+//===----------------------------------------------------------------------===//
+
+class OpenMP_Type<string name, string typeMnemonic> :
+      TypeDef<OpenMP_Dialect, name> {
+  let mnemonic = typeMnemonic;
+}
+
+// Type which can be constraint accepting standard integers and indices.
+def IntLikeType : AnyTypeOf<[AnyInteger, Index]>;
+
+def OpenMP_PointerLikeType : TypeAlias<OpenMP_PointerLikeTypeInterface,
+	"OpenMP-compatible variable type">;
+
+def OpenMP_MapBoundsType : OpenMP_Type<"MapBounds", "map_bounds_ty"> {
+  let summary = "Type for representing omp map clause bounds information";
+}
+
+//===----------------------------------------------------------------------===//
+// Base classes for OpenMP dialect operations.
+//===----------------------------------------------------------------------===//
+
+class OpenMP_Op<string mnemonic, list<Trait> traits = []> :
+      Op<OpenMP_Dialect, mnemonic, traits>;
+
+#endif  // OPENMP_OP_BASE
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 29c287cad06e9b..122abbe7cc975a 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -14,145 +14,20 @@
 #ifndef OPENMP_OPS
 #define OPENMP_OPS
 
+include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
+include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td"
+include "mlir/Dialect/OpenMP/OpenMPAttrDefs.td"
+include "mlir/Dialect/OpenMP/OpenMPOpBase.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/EnumAttr.td"
 include "mlir/IR/OpBase.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
-include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/IR/SymbolInterfaces.td"
-include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
-include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td"
-include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td"
-include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.td"
-
-def OpenMP_Dialect : Dialect {
-  let name = "omp";
-  let cppNamespace = "::mlir::omp";
-  let dependentDialects = ["::mlir::LLVM::LLVMDialect, ::mlir::func::FuncDialect"];
-  let useDefaultAttributePrinterParser = 1;
-  let useDefaultTypePrinterParser = 1;
-}
-
-// OmpCommon requires definition of OpenACC_Dialect.
-include "mlir/Dialect/OpenMP/OmpCommon.td"
-
-//===----------------------------------------------------------------------===//
-//  OpenMP Attributes
-//===----------------------------------------------------------------------===//
-
-class OpenMP_Attr<string name, string attrMnemonic,
-                list<Trait> traits = [],
-                string baseCppClass = "::mlir::Attribute">
-    : AttrDef<OpenMP_Dialect, name, traits, baseCppClass> {
-  let mnemonic = attrMnemonic;
-}
-
-def VersionAttr : OpenMP_Attr<"Version", "version"> {
-  let parameters = (ins
-    "uint32_t":$version
-  );
-
-  let assemblyFormat = "`<` struct(params) `>`";
-}
-
-//===----------------------------------------------------------------------===//
-// Runtime library flag's attribute that holds information for lowering to LLVM
-//===----------------------------------------------------------------------===//
-
-def FlagsAttr : OpenMP_Attr<"Flags", "flags"> {
-  let parameters = (ins
-    DefaultValuedParameter<"uint32_t", "0">:$debug_kind,
-    DefaultValuedParameter<"bool", "false">:$assume_teams_oversubscription,
-    DefaultValuedParameter<"bool", "false">:$assume_threads_oversubscription,
-    DefaultValuedParameter<"bool", "false">:$assume_no_thread_state,
-    DefaultValuedParameter<"bool", "false">:$assume_no_nested_parallelism,
-    DefaultValuedParameter<"bool", "false">:$no_gpu_lib,
-    DefaultValuedParameter<"uint32_t", "50">:$openmp_device_version
-  );
-
-  let assemblyFormat = "`<` struct(params) `>`";
-}
-
-
-class OpenMP_Op<string mnemonic, list<Trait> traits = []> :
-      Op<OpenMP_Dialect, mnemonic, traits>;
-
-// Type which can be constraint accepting standard integers and indices.
-def IntLikeType : AnyTypeOf<[AnyInteger, Index]>;
-
-def OpenMP_PointerLikeType : TypeAlias<OpenMP_PointerLikeTypeInterface,
-	"OpenMP-compatible variable type">;
-
-class OpenMP_Type<string name, string typeMnemonic> : TypeDef<OpenMP_Dialect, name> {
-  let mnemonic = typeMnemonic;
-}
-
-//===----------------------------------------------------------------------===//
-//  2.12.7 Declare Target Directive
-//===----------------------------------------------------------------------===//
-
-def DeviceTypeAny : I32EnumAttrCase<"any", 0>;
-def DeviceTypeHost   : I32EnumAttrCase<"host", 1>;
-def DeviceTypeNoHost   : I32EnumAttrCase<"nohost", 2>;
-
-def DeclareTargetDeviceType : I32EnumAttr<
-    "DeclareTargetDeviceType",
-    "device_type clause",
-    [DeviceTypeAny, DeviceTypeHost, DeviceTypeNoHost]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-
-def DeclareTargetDeviceTypeAttr : EnumAttr<OpenMP_Dialect, DeclareTargetDeviceType,
-                                    "device_type"> {
-  let assemblyFormat = "`(` $value `)`";
-}
-
-def CaptureClauseTo : I32EnumAttrCase<"to", 0>;
-def CaptureClauseLink   : I32EnumAttrCase<"link", 1>;
-def CaptureClauseEnter   : I32EnumAttrCase<"enter", 2>;
-
-def DeclareTargetCaptureClause : I32EnumAttr<
-    "DeclareTargetCaptureClause",
-    "capture clause",
-    [CaptureClauseTo, CaptureClauseLink, CaptureClauseEnter]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-
-def DeclareTargetCaptureClauseAttr : EnumAttr<OpenMP_Dialect, DeclareTargetCaptureClause,
-                                    "capture_clause"> {
-  let assemblyFormat = "`(` $value `)`";
-}
-
-def DeclareTargetAttr : OpenMP_Attr<"DeclareTarget", "declaretarget"> {
-  let parameters = (ins
-    OptionalParameter<"DeclareTargetDeviceTypeAttr">:$device_type,
-    OptionalParameter<"DeclareTargetCaptureClauseAttr">:$capture_clause
-  );
-
-  let assemblyFormat = "`<` struct(params) `>`";
-}
 
 //===----------------------------------------------------------------------===//
 // 2.19.4 Data-Sharing Attribute Clauses
 //===----------------------------------------------------------------------===//
 
-def DataSharingTypePrivate      : I32EnumAttrCase<"Private", 0, "private">;
-def DataSharingTypeFirstPrivate : I32EnumAttrCase<"FirstPrivate", 1, "firstprivate">;
-
-def DataSharingClauseType : I32EnumAttr<
-    "DataSharingClauseType",
-    "Type of a data-sharing clause",
-    [DataSharingTypePrivate, DataSharingTypeFirstPrivate]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-
-def DataSharingClauseTypeAttr : EnumAttr<
-    OpenMP_Dialect, DataSharingClauseType, "data_sharing_type"> {
-  let assemblyFormat = "`{` `type` `=` $value `}`";
-}
-
 def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove]> {
   let summary = "Provides declaration of [first]private logic.";
   let description = [{
@@ -403,23 +278,6 @@ def TeamsOp : OpenMP_Op<"teams", [
   let hasVerifier = 1;
 }
 
-def OMP_ScheduleModNone         : I32EnumAttrCase<"none", 0>;
-def OMP_ScheduleModMonotonic    : I32EnumAttrCase<"monotonic", 1>;
-def OMP_ScheduleModNonmonotonic : I32EnumAttrCase<"nonmonotonic", 2>;
-// FIXME: remove this value for the modifier because this is handled using a
-// separate attribute
-def OMP_ScheduleModSIMD         : I32EnumAttrCase<"simd", 3>;
-
-def ScheduleModifier
-    : I32EnumAttr<"ScheduleModifier", "OpenMP Schedule Modifier",
-                  [OMP_ScheduleModNone, OMP_ScheduleModMonotonic,
-                   OMP_ScheduleModNonmonotonic, OMP_ScheduleModSIMD]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-def ScheduleModifierAttr : EnumAttr<OpenMP_Dialect, ScheduleModifier,
-                                    "sched_mod">;
-
 //===----------------------------------------------------------------------===//
 // 2.8.1 Sections Construct
 //===----------------------------------------------------------------------===//
@@ -904,26 +762,6 @@ def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments,
 // 2.10.1 task Construct
 //===----------------------------------------------------------------------===//
 
-def ClauseTaskDependIn    : I32EnumAttrCase<"taskdependin",    0>;
-def ClauseTaskDependOut   : I32EnumAttrCase<"taskdependout",   1>;
-def ClauseTaskDependInOut : I32EnumAttrCase<"taskdependinout", 2>;
-
-def ClauseTaskDepend : I32EnumAttr<
-    "ClauseTaskDepend",
-    "depend clause in a target or task construct",
-    [ClauseTaskDependIn, ClauseTaskDependOut, ClauseTaskDependInOut]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-def ClauseTaskDependAttr :
-  EnumAttr<OpenMP_Dialect, ClauseTaskDepend, "clause_task_depend"> {
-  let assemblyFormat = "`(` $value `)`";
-}
-def TaskDependArrayAttr :
-  TypedArrayAttrBase<ClauseTaskDependAttr, "clause_task_depend array attr"> {
-    let constBuilderCall = ?;
-  }
-
 def TaskOp : OpenMP_Op<"task", [AttrSizedOperandSegments,
                        OutlineableOpenMPOpInterface, AutomaticAllocationScope,
                        ReductionClauseInterface]> {
@@ -1283,28 +1121,6 @@ def FlushOp : OpenMP_Op<"flush"> {
 // Map related constructs
 //===----------------------------------------------------------------------===//
 
-def CaptureThis : I32EnumAttrCase<"This", 0>;
-def CaptureByRef : I32EnumAttrCase<"ByRef", 1>;
-def CaptureByCopy : I32EnumAttrCase<"ByCopy", 2>;
-def CaptureVLAType : I32EnumAttrCase<"VLAType", 3>;
-
-def VariableCaptureKind : I32EnumAttr<
-    "VariableCaptureKind",
-    "variable capture kind",
-    [CaptureThis, CaptureByRef, CaptureByCopy, CaptureVLAType]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-
-def VariableCaptureKindAttr : EnumAttr<OpenMP_Dialect, VariableCaptureKind,
-                                       "variable_capture_kind"> {
-  let assemblyFormat = "`(` $value `)`";
-}
-
-def MapBoundsType : OpenMP_Type<"MapBounds", "map_bounds_ty"> {
-  let summary = "Type for representing omp map clause bounds information";
-}
-
 def MapBoundsOp : OpenMP_Op<"map.bounds",
     [AttrSizedOperandSegments, NoMemoryEffect]> {
   let summary = "Represents normalized bounds information for map clauses.";
@@ -1386,7 +1202,7 @@ def MapBoundsOp : OpenMP_Op<"map.bounds",
                        Optional<IntLikeType>:$stride,
                        DefaultValuedAttr<BoolAttr, "false">:$stride_in_bytes,
                        Optional<IntLikeType>:$start_idx);
-  let results = (outs MapBoundsType:$result);
+  let results = (outs OpenMP_MapBoundsType:$result);
 
   let assemblyFormat = [{
     oilist(
@@ -1419,7 +1235,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
                        Optional<OpenMP_PointerLikeType>:$var_ptr_ptr,
                        Variadic<OpenMP_PointerLikeType>:$members,
                        OptionalAttr<AnyIntElementsAttr>:$members_index,
-                       Variadic<MapBoundsType>:$bounds, /* rank-0 to rank-{n-1} */
+                       Variadic<OpenMP_MapBoundsType>:$bounds, /* rank-0 to rank-{n-1} */
                        OptionalAttr<UI64Attr>:$map_type,
                        OptionalAttr<VariableCaptureKindAttr>:$map_capture_type,
                        OptionalAttr<StrAttr>:$name,
@@ -1894,20 +1710,6 @@ def BarrierOp : OpenMP_Op<"barrier"> {
 // [5.1] 2.19.9 ordered Construct
 //===----------------------------------------------------------------------===//
 
-def ClauseDependSource : I32EnumAttrCase<"dependsource", 0>;
-def ClauseDependSink   : I32EnumAttrCase<"dependsink",   1>;
-
-def ClauseDepend : I32EnumAttr<
-    "ClauseDepend",
-    "depend clause",
-    [ClauseDependSource, ClauseDependSink]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-def ClauseDependAttr : EnumAttr<OpenMP_Dialect, ClauseDepend, "clause_depend"> {
-  let assemblyFormat = "`(` $value `)`";
-}
-
 def OrderedOp : OpenMP_Op<"ordered"> {
   let summary = "ordered construct without region";
   let description = [{
@@ -2377,35 +2179,4 @@ def ReductionOp : OpenMP_Op<"reduction"> {
   let hasVerifier = 1;
 }
 
-//===----------------------------------------------------------------------===//
-// 8.2 requires directive
-//===----------------------------------------------------------------------===//
-
-// atomic_default_mem_order clause values not defined here because they can be
-// represented by the OMPC_MemoryOrder enumeration instead.
-def ClauseRequiresNone : I32BitEnumAttrCaseNone<"none">;
-def ClauseRequiresReverseOffload : I32BitEnumAttrCaseBit<"reverse_offload", 0>;
-def ClauseRequiresUnifiedAddress : I32BitEnumAttrCaseBit<"unified_address", 1>;
-def ClauseRequiresUnifiedSharedMemory
-    : I32BitEnumAttrCaseBit<"unified_shared_memory", 2>;
-def ClauseRequiresDynamicAllocators
-    : I32BitEnumAttrCaseBit<"dynamic_allocators", 3>;
-
-def ClauseRequires : I32BitEnumAttr<
-    "ClauseRequires",
-    "requires clauses",
-    [
-      ClauseRequiresNone,
-      ClauseRequiresReverseOffload,
-      ClauseRequiresUnifiedAddress,
-      ClauseRequiresUnifiedSharedMemory,
-      ClauseRequiresDynamicAllocators
-    ]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::omp";
-}
-def ClauseRequiresAttr :
-  EnumAttr<OpenMP_Dialect, ClauseRequires, "clause_requires"> {
-}
-
 #endif // OPENMP_OPS
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index d9569d9d294d84..31a306072d0ec3 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef OpenMP_OPS_INTERFACES
-#define OpenMP_OPS_INTERFACES
+#ifndef OPENMP_OPS_INTERFACES
+#define OPENMP_OPS_INTERFACES
 
 include "mlir/IR/OpBase.td"
 
@@ -349,4 +349,4 @@ def OffloadModuleInterface : OpInterface<"OffloadModuleInterface"> {
   ];
 }
 
-#endif // OpenMP_OPS_INTERFACES
+#endif // OPENMP_OPS_INTERFACES

From f6ae8e6381a02b10091589d7742fab389f847ea9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Degioanni?=
 <theo.degioanni.llvm.deluge062@simplelogin.fr>
Date: Mon, 20 May 2024 12:52:07 +0100
Subject: [PATCH 59/60] [mlir][irdl] Fix missing verifier in irdl.parametric
 (#92700)

The parametric op was not checking the symbol it points to is a type or
attribute. This PR also fixes a small bug where an invalid IRDL file
would not end processing in mlir-opt. I also improved the error messages
for the already handled irdl.base invalid symbols.
---
 mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td |  3 +-
 mlir/lib/Dialect/IRDL/IR/IRDL.cpp            | 31 +++++++++++++++-----
 mlir/lib/Tools/mlir-opt/MlirOptMain.cpp      |  2 ++
 mlir/test/Dialect/IRDL/invalid.irdl.mlir     | 17 ++++++++++-
 4 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td
index aa6a8e93c0288e..d2765dec420ac2 100644
--- a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td
+++ b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td
@@ -503,7 +503,8 @@ def IRDL_BaseOp : IRDL_ConstraintOp<"base",
 }
 
 def IRDL_ParametricOp : IRDL_ConstraintOp<"parametric",
-    [ParentOneOf<["TypeOp", "AttributeOp", "OperationOp"]>, Pure]> {
+    [ParentOneOf<["TypeOp", "AttributeOp", "OperationOp"]>,
+     DeclareOpInterfaceMethods<SymbolUserOpInterface>, Pure]> {
   let summary = "Constraints an attribute/type base and its parameters";
   let description = [{
     `irdl.parametric` defines a constraint that accepts only a single type
diff --git a/mlir/lib/Dialect/IRDL/IR/IRDL.cpp b/mlir/lib/Dialect/IRDL/IR/IRDL.cpp
index 4eae2b03024c24..e4728f55b49d74 100644
--- a/mlir/lib/Dialect/IRDL/IR/IRDL.cpp
+++ b/mlir/lib/Dialect/IRDL/IR/IRDL.cpp
@@ -132,22 +132,37 @@ LogicalResult BaseOp::verify() {
   return success();
 }
 
+static LogicalResult
+checkSymbolIsTypeOrAttribute(SymbolTableCollection &symbolTable,
+                             Operation *source, SymbolRefAttr symbol) {
+  Operation *targetOp = symbolTable.lookupNearestSymbolFrom(source, symbol);
+  if (!targetOp)
+    return source->emitOpError() << "symbol '" << symbol << "' not found";
+
+  if (!isa<TypeOp, AttributeOp>(targetOp))
+    return source->emitOpError() << "symbol '" << symbol
+                                 << "' does not refer to a type or attribute "
+                                    "definition (refers to '"
+                                 << targetOp->getName() << "')";
+
+  return success();
+}
+
 LogicalResult BaseOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   std::optional<SymbolRefAttr> baseRef = getBaseRef();
   if (!baseRef)
     return success();
 
-  TypeOp typeOp = symbolTable.lookupNearestSymbolFrom<TypeOp>(*this, *baseRef);
-  if (typeOp)
-    return success();
+  return checkSymbolIsTypeOrAttribute(symbolTable, *this, *baseRef);
+}
 
-  AttributeOp attrOp =
-      symbolTable.lookupNearestSymbolFrom<AttributeOp>(*this, *baseRef);
-  if (attrOp)
+LogicalResult
+ParametricOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  std::optional<SymbolRefAttr> baseRef = getBaseType();
+  if (!baseRef)
     return success();
 
-  return emitOpError() << "'" << *baseRef
-                       << "' does not refer to a type or attribute definition";
+  return checkSymbolIsTypeOrAttribute(symbolTable, *this, *baseRef);
 }
 
 /// Parse a value with its variadicity first. By default, the variadicity is
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 44c5e9826f3b79..a1b2893a973b19 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -266,6 +266,8 @@ LogicalResult loadIRDLDialects(StringRef irdlFile, MLIRContext &ctx) {
 
   // Parse the input file.
   OwningOpRef<ModuleOp> module(parseSourceFile<ModuleOp>(sourceMgr, &ctx));
+  if (!module)
+    return failure();
 
   // Load IRDL dialects.
   return irdl::loadDialects(module.get());
diff --git a/mlir/test/Dialect/IRDL/invalid.irdl.mlir b/mlir/test/Dialect/IRDL/invalid.irdl.mlir
index d62bb498a7ad98..f207d31cf158b7 100644
--- a/mlir/test/Dialect/IRDL/invalid.irdl.mlir
+++ b/mlir/test/Dialect/IRDL/invalid.irdl.mlir
@@ -6,7 +6,7 @@ func.func private @foo()
 
 irdl.dialect @testd {
   irdl.type @type {
-    // expected-error@+1 {{'@foo' does not refer to a type or attribute definition}}
+    // expected-error@+1 {{symbol '@foo' not found}}
     %0 = irdl.base @foo
     irdl.parameters(%0)
   }
@@ -41,3 +41,18 @@ irdl.dialect @testd {
     irdl.parameters(%0)
   }
 }
+
+// -----
+
+irdl.dialect @invalid_parametric {
+  irdl.operation @foo {
+    // expected-error@+1 {{symbol '@not_a_type_or_attr' does not refer to a type or attribute definition}}
+    %param = irdl.parametric @not_a_type_or_attr<>
+    irdl.results(%param)
+  }
+
+  irdl.operation @not_a_type_or_attr {
+    %param = irdl.is i1
+    irdl.results(%param)
+  }
+}

From 82c5d350d200ccc5365d40eac187b9ec967af727 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 20 May 2024 13:03:48 +0100
Subject: [PATCH 60/60] [VPlan] Add commutative binary OR matcher, use in
 transform. (#92539)

Split off from https://github.com/llvm/llvm-project/pull/89386, this
extends the binary matcher to support matching commuative operations.
This is used for a new m_c_BinaryOr matcher, used in simplifyRecipe.

PR: https://github.com/llvm/llvm-project/pull/92539
---
 .../Transforms/Vectorize/VPlanPatternMatch.h  | 43 +++++++++++++------
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  4 +-
 .../LoopVectorize/AArch64/masked-call.ll      |  9 ++--
 .../LoopVectorize/AArch64/sve-tail-folding.ll |  3 +-
 4 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 56cbaa42012978..05874688074351 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -157,7 +157,7 @@ using AllUnaryRecipe_match =
     UnaryRecipe_match<Op0_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
                       VPWidenCastRecipe, VPInstruction>;
 
-template <typename Op0_t, typename Op1_t, unsigned Opcode,
+template <typename Op0_t, typename Op1_t, unsigned Opcode, bool Commutative,
           typename... RecipeTys>
 struct BinaryRecipe_match {
   Op0_t Op0;
@@ -179,18 +179,23 @@ struct BinaryRecipe_match {
       return false;
     assert(R->getNumOperands() == 2 &&
            "recipe with matched opcode does not have 2 operands");
-    return Op0.match(R->getOperand(0)) && Op1.match(R->getOperand(1));
+    if (Op0.match(R->getOperand(0)) && Op1.match(R->getOperand(1)))
+      return true;
+    return Commutative && Op0.match(R->getOperand(1)) &&
+           Op1.match(R->getOperand(0));
   }
 };
 
 template <typename Op0_t, typename Op1_t, unsigned Opcode>
 using BinaryVPInstruction_match =
-    BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPInstruction>;
+    BinaryRecipe_match<Op0_t, Op1_t, Opcode, /*Commutative*/ false,
+                       VPInstruction>;
 
-template <typename Op0_t, typename Op1_t, unsigned Opcode>
+template <typename Op0_t, typename Op1_t, unsigned Opcode,
+          bool Commutative = false>
 using AllBinaryRecipe_match =
-    BinaryRecipe_match<Op0_t, Op1_t, Opcode, VPWidenRecipe, VPReplicateRecipe,
-                       VPWidenCastRecipe, VPInstruction>;
+    BinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative, VPWidenRecipe,
+                       VPReplicateRecipe, VPWidenCastRecipe, VPInstruction>;
 
 template <unsigned Opcode, typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Opcode>
@@ -256,10 +261,11 @@ m_ZExtOrSExt(const Op0_t &Op0) {
   return m_CombineOr(m_ZExt(Op0), m_SExt(Op0));
 }
 
-template <unsigned Opcode, typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode> m_Binary(const Op0_t &Op0,
-                                                            const Op1_t &Op1) {
-  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode>(Op0, Op1);
+template <unsigned Opcode, typename Op0_t, typename Op1_t,
+          bool Commutative = false>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>
+m_Binary(const Op0_t &Op0, const Op1_t &Op1) {
+  return AllBinaryRecipe_match<Op0_t, Op1_t, Opcode, Commutative>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
@@ -268,10 +274,21 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) {
   return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1);
 }
 
-template <typename Op0_t, typename Op1_t>
-inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or>
+/// Match a binary OR operation. Note that while conceptually the operands can
+/// be matched commutatively, \p Commutative defaults to false in line with the
+/// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative
+/// version of the matcher.
+template <typename Op0_t, typename Op1_t, bool Commutative = false>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or, Commutative>
 m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_Binary<Instruction::Or, Op0_t, Op1_t>(Op0, Op1);
+  return m_Binary<Instruction::Or, Op0_t, Op1_t, Commutative>(Op0, Op1);
+}
+
+template <typename Op0_t, typename Op1_t>
+inline AllBinaryRecipe_match<Op0_t, Op1_t, Instruction::Or,
+                             /*Commutative*/ true>
+m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1) {
+  return m_BinaryOr<Op0_t, Op1_t, /*Commutative*/ true>(Op0, Op1);
 }
 
 template <typename Op0_t, typename Op1_t>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4c968c2834b10d..7ff8d8e0ea15d1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -941,8 +941,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   // recipes to be visited during simplification.
   VPValue *X, *Y, *X1, *Y1;
   if (match(&R,
-            m_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
-                       m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
+            m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
+                         m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) &&
       X == X1 && Y == Y1) {
     R.getVPSingleValue()->replaceAllUsesWith(X);
     return;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index d335ac4b697099..200c2adcf0e666 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -402,10 +402,9 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP10]])
 ; TFCOMMON-NEXT:    [[TMP12:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i1> zeroinitializer
 ; TFCOMMON-NEXT:    [[TMP13:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP12]])
-; TFCOMMON-NEXT:    [[TMP14:%.*]] = or <vscale x 2 x i1> [[TMP10]], [[TMP12]]
 ; TFCOMMON-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP13]]
 ; TFCOMMON-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[TMP14]])
+; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT:    [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
@@ -453,16 +452,14 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP22:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i1> [[TMP14]], <vscale x 2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP23:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP21]])
 ; TFA_INTERLEAVE-NEXT:    [[TMP24:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[TMP22]])
-; TFA_INTERLEAVE-NEXT:    [[TMP25:%.*]] = or <vscale x 2 x i1> [[TMP17]], [[TMP21]]
-; TFA_INTERLEAVE-NEXT:    [[TMP26:%.*]] = or <vscale x 2 x i1> [[TMP18]], [[TMP22]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i64> [[TMP19]], <vscale x 2 x i64> [[TMP23]]
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP18]], <vscale x 2 x i64> [[TMP20]], <vscale x 2 x i64> [[TMP24]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 2
 ; TFA_INTERLEAVE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i64 [[TMP29]]
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[TMP25]])
-; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[TMP26]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP27]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFA_INTERLEAVE-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]])
 ; TFA_INTERLEAVE-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
 ; TFA_INTERLEAVE-NEXT:    [[TMP32:%.*]] = mul i64 [[TMP31]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 2b2742ca7ccbc3..63ad98b2d8ab2a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -480,11 +480,10 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
 ; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
 ; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP22:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)