From 0e377e253c16d82a60e73ae21ca6b902e7a78775 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 15 Jul 2020 12:12:18 +0200 Subject: [PATCH 001/363] First commit on the release/11.x branch. --- llvm/docs/ReleaseNotes.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 8d8da954ece3..2f93afd8374c 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -199,7 +199,6 @@ External Open Source Projects Using LLVM 11 * A project... - Additional Information ====================== From d256b8ad5f2898cd05faa7319e00ea4a86b0cb47 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 15 Jul 2020 11:33:07 -0700 Subject: [PATCH 002/363] Fix "unused variable" warning from recent GCC. (cherry picked from commit 268025e2636c023fc39eed80cc4589f7ce9db786) --- clang/lib/AST/ExprConstant.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index d20c2382b6ac..41a4ae4b91c8 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9930,8 +9930,7 @@ namespace { const ConstantArrayType *CAT = Info.Ctx.getAsConstantArrayType(E->getType()); if (!CAT) { - if (const IncompleteArrayType *IAT = - Info.Ctx.getAsIncompleteArrayType(E->getType())) { + if (E->getType()->isIncompleteArrayType()) { // We can be asked to zero-initialize a flexible array member; this // is represented as an ImplicitValueInitExpr of incomplete array // type. In this case, the array has zero elements. From 06a6a2fd6dddf3d1cc3404bb094c092690eb3ffa Mon Sep 17 00:00:00 2001 From: Eugene Zelenko Date: Thu, 16 Jul 2020 15:52:00 +0200 Subject: [PATCH 003/363] Alphabetical order of Clang-tidy entries. Format improvements. 
--- clang-tools-extra/docs/ReleaseNotes.rst | 48 +++++++++++++------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c08fd45c2f96..0238ef5149b0 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -79,13 +79,8 @@ New checks - New :doc:`abseil-string-find-str-contains ` check. - Finds ``s.find(...) == string::npos`` comparisons (for various string-like types) - and suggests replacing with ``absl::StrContains()``. - -- New :doc:`cppcoreguidelines-avoid-non-const-global-variables - ` check. - Finds non-const global variables as described in check I.2 of C++ Core - Guidelines. + Finds ``s.find(...) == string::npos`` comparisons (for various string-like + types) and suggests replacing with ``absl::StrContains()``. - New :doc:`bugprone-misplaced-pointer-arithmetic-in-alloc ` check. @@ -100,6 +95,11 @@ New checks Finds pointers with the ``noescape`` attribute that are captured by an asynchronously-executed block. +- New :doc:`bugprone-reserved-identifier + ` check. + + Checks for usages of identifiers reserved for use by the implementation. + - New :doc:`bugprone-spuriously-wake-up-functions ` check. @@ -108,11 +108,6 @@ New checks that checks whether a condition predicate holds or the function has a condition parameter. -- New :doc:`bugprone-reserved-identifier - ` check. - - Checks for usages of identifiers reserved for use by the implementation. - - New :doc:`bugprone-suspicious-include ` check. @@ -126,6 +121,12 @@ New checks Flags use of the `C` standard library functions ``memset``, ``memcpy`` and ``memcmp`` and similar derivatives on non-trivial types. +- New :doc:`cppcoreguidelines-avoid-non-const-global-variables + ` check. + + Finds non-const global variables as described in check I.2 of C++ Core + Guidelines. + - New :doc:`llvmlibc-callee-namespace ` check. 
@@ -142,6 +143,11 @@ New checks Finds includes of system libc headers not provided by the compiler within llvm-libc implementations. +- New :doc:`misc-no-recursion + ` check. + + Finds recursive functions and diagnoses them. + - New :doc:`modernize-replace-disallow-copy-and-assign-macro ` check. @@ -153,11 +159,6 @@ New checks Finds implementations of -dealloc in Objective-C categories. -- New :doc:`misc-no-recursion - ` check. - - Finds recursive functions and diagnoses them. - - New :doc:`objc-nsinvocation-argument-lifetime ` check. @@ -222,11 +223,6 @@ Changes in existing checks Now able to rename member references in class template definitions with explicit access. -- Improved :doc:`readability-qualified-auto - ` check now supports a - `AddConstToQualified` to enable adding ``const`` qualifiers to variables - typed with ``auto *`` and ``auto &``. - - Improved :doc:`readability-redundant-string-init ` check now supports a `StringNames` option enabling its application to custom string classes. The @@ -238,6 +234,11 @@ Changes in existing checks release. This simplifies how one specifies the options on the command line: ``--config="{CheckOptions: [{ key: HeaderFileExtensions, value: h;;hpp;hxx }]}"`` +- Improved :doc:`readability-qualified-auto + ` check now supports a + `AddConstToQualified` to enable adding ``const`` qualifiers to variables + typed with ``auto *`` and ``auto &``. + Renamed checks ^^^^^^^^^^^^^^ @@ -247,7 +248,8 @@ Renamed checks Other improvements ^^^^^^^^^^^^^^^^^^ -- For 'run-clang-tidy.py' add option to use alpha checkers from clang-analyzer. +- For `run-clang-tidy.py` add option to use alpha checkers from + `clang-analyzer`. Improvements to include-fixer ----------------------------- From ccaad06b84c0f53f04f3acb97316a8ddef75b12f Mon Sep 17 00:00:00 2001 From: David Truby Date: Tue, 14 Jul 2020 15:04:38 +0100 Subject: [PATCH 004/363] [flang] Fix shared library builds for lib/Lower. 
Summary: This adds missing definitions for functions in the Lower directory that were causing failures in shared library builds. The definitions for these are taken from the fir-dev branch on github. Reviewers: sscalpone, schweitz, jeanPerier, klausler Reviewed By: schweitz Subscribers: mgorny, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D83771 --- flang/lib/Lower/CMakeLists.txt | 1 + flang/lib/Lower/ConvertExpr.cpp | 95 ++++++++++++++++++++++++++ flang/lib/Optimizer/Dialect/FIROps.cpp | 21 ++++-- 3 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 flang/lib/Lower/ConvertExpr.cpp diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt index 3cd71c007a00..975065c9ed7d 100644 --- a/flang/lib/Lower/CMakeLists.txt +++ b/flang/lib/Lower/CMakeLists.txt @@ -7,6 +7,7 @@ add_flang_library(FortranLower Coarray.cpp ComplexExpr.cpp ConvertType.cpp + ConvertExpr.cpp DoLoopHelper.cpp FIRBuilder.cpp IntrinsicCall.cpp diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp new file mode 100644 index 000000000000..1bac6884a5f7 --- /dev/null +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -0,0 +1,95 @@ +//===-- ConvertExpr.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Common/idioms.h" +#include "flang/Lower/IntrinsicCall.h" +#include "flang/Lower/Support/BoxValue.h" + +mlir::Value fir::getBase(const fir::ExtendedValue &ex) { + return std::visit(Fortran::common::visitors{ + [](const fir::UnboxedValue &x) { return x; }, + [](const auto &x) { return x.getAddr(); }, + }, + ex.box); +} + +llvm::raw_ostream &fir::operator<<(llvm::raw_ostream &os, + const fir::CharBoxValue &box) { + os << "boxchar { addr: " << box.getAddr() << ", len: " << box.getLen() + << " }"; + return os; +} + +llvm::raw_ostream &fir::operator<<(llvm::raw_ostream &os, + const fir::ArrayBoxValue &box) { + os << "boxarray { addr: " << box.getAddr(); + if (box.getLBounds().size()) { + os << ", lbounds: ["; + llvm::interleaveComma(box.getLBounds(), os); + os << "]"; + } else { + os << ", lbounds: all-ones"; + } + os << ", shape: ["; + llvm::interleaveComma(box.getExtents(), os); + os << "]}"; + return os; +} + +llvm::raw_ostream &fir::operator<<(llvm::raw_ostream &os, + const fir::CharArrayBoxValue &box) { + os << "boxchararray { addr: " << box.getAddr() << ", len : " << box.getLen(); + if (box.getLBounds().size()) { + os << ", lbounds: ["; + llvm::interleaveComma(box.getLBounds(), os); + os << "]"; + } else { + os << " lbounds: all-ones"; + } + os << ", shape: ["; + llvm::interleaveComma(box.getExtents(), os); + os << "]}"; + return os; +} + +llvm::raw_ostream &fir::operator<<(llvm::raw_ostream &os, + const fir::BoxValue &box) { + os << "box { addr: " << box.getAddr(); + if (box.getLen()) + os << ", size: " << box.getLen(); + if (box.params.size()) { + os << ", type params: ["; + llvm::interleaveComma(box.params, os); + os << "]"; + } + if (box.getLBounds().size()) { + os << ", lbounds: ["; + llvm::interleaveComma(box.getLBounds(), os); + os << "]"; + } + if (box.getExtents().size()) { + os << ", shape: 
["; + llvm::interleaveComma(box.getExtents(), os); + os << "]"; + } + os << "}"; + return os; +} + +llvm::raw_ostream &fir::operator<<(llvm::raw_ostream &os, + const fir::ProcBoxValue &box) { + os << "boxproc: { addr: " << box.getAddr() << ", context: " << box.hostContext + << "}"; + return os; +} + +llvm::raw_ostream &fir::operator<<(llvm::raw_ostream &os, + const fir::ExtendedValue &ex) { + std::visit([&](const auto &value) { os << value; }, ex.box); + return os; +} diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 30cd365f139b..44310d6e0691 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -1395,15 +1395,28 @@ mlir::OpFoldResult fir::SubfOp::fold(llvm::ArrayRef opnds) { //===----------------------------------------------------------------------===// // WhereOp //===----------------------------------------------------------------------===// - void fir::WhereOp::build(mlir::OpBuilder &builder, OperationState &result, mlir::Value cond, bool withElseRegion) { + build(builder, result, llvm::None, cond, withElseRegion); +} + +void fir::WhereOp::build(mlir::OpBuilder &builder, OperationState &result, + mlir::TypeRange resultTypes, mlir::Value cond, + bool withElseRegion) { result.addOperands(cond); + result.addTypes(resultTypes); + mlir::Region *thenRegion = result.addRegion(); + thenRegion->push_back(new mlir::Block()); + if (resultTypes.empty()) + WhereOp::ensureTerminator(*thenRegion, builder, result.location); + mlir::Region *elseRegion = result.addRegion(); - WhereOp::ensureTerminator(*thenRegion, builder, result.location); - if (withElseRegion) - WhereOp::ensureTerminator(*elseRegion, builder, result.location); + if (withElseRegion) { + elseRegion->push_back(new mlir::Block()); + if (resultTypes.empty()) + WhereOp::ensureTerminator(*elseRegion, builder, result.location); + } } static mlir::ParseResult parseWhereOp(OpAsmParser &parser, From 
15a07e41f01a64f1f4a4ffc3ca89a0f0e5431e54 Mon Sep 17 00:00:00 2001 From: David Truby Date: Thu, 16 Jul 2020 11:27:03 +0100 Subject: [PATCH 005/363] [flang] Add missing link dependencies to FrontendOpenACC. Summary: These link dependencies are required for shared library builds to work correctly. Reviewers: clementval Reviewed By: clementval Subscribers: mgorny, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D83938 --- flang/lib/Parser/CMakeLists.txt | 1 + flang/lib/Semantics/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/flang/lib/Parser/CMakeLists.txt b/flang/lib/Parser/CMakeLists.txt index e1e77ac6e92d..9ee416803177 100644 --- a/flang/lib/Parser/CMakeLists.txt +++ b/flang/lib/Parser/CMakeLists.txt @@ -30,6 +30,7 @@ add_flang_library(FortranParser LINK_COMPONENTS Support + FrontendOpenACC DEPENDS omp_gen diff --git a/flang/lib/Semantics/CMakeLists.txt b/flang/lib/Semantics/CMakeLists.txt index a869d831109b..2bdc5f958281 100644 --- a/flang/lib/Semantics/CMakeLists.txt +++ b/flang/lib/Semantics/CMakeLists.txt @@ -52,4 +52,5 @@ add_flang_library(FortranSemantics LINK_COMPONENTS Support FrontendOpenMP + FrontendOpenACC ) From 529f2e03592c072bb0f768db5b5e3731cccbebf3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 15 Jul 2020 20:15:30 -0700 Subject: [PATCH 006/363] Revert "[InstSimplify] Remove select ?, undef, X -> X and select ?, X, undef -> X transforms" and subsequent patches This reverts most of the following patches due to reports of miscompiles. I've left the added test cases with comments updated to be FIXMEs. 1cf6f210a2e [IR] Disable select ? C : undef -> C fold in ConstantFoldSelectInstruction unless we know C isn't poison. 
469da663f2d [InstSimplify] Re-enable select ?, undef, X -> X transform when X is provably not poison 122b0640fc9 [InstSimplify] Don't fold vectors of partial undef in SimplifySelectInst if the non-undef element value might produce poison ac0af12ed2f [InstSimplify] Add test cases for opportunities to fold select ?, X, undef -> X when we can prove X isn't poison 9b1e95329af [InstSimplify] Remove select ?, undef, X -> X and select ?, X, undef -> X transforms (cherry picked from commit 00f3579aea6e3d4a4b7464c3db47294f71cef9e4) --- clang/test/CodeGen/arm-mve-intrinsics/dup.c | 24 +++++-------- llvm/lib/Analysis/InstructionSimplify.cpp | 16 +++------ llvm/lib/IR/ConstantFold.cpp | 24 ++----------- .../InferAddressSpaces/AMDGPU/select.ll | 2 +- llvm/test/Transforms/InstCombine/select.ll | 14 +++----- llvm/test/Transforms/InstSimplify/select.ll | 35 +++++++------------ 6 files changed, 33 insertions(+), 82 deletions(-) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/dup.c b/clang/test/CodeGen/arm-mve-intrinsics/dup.c index b443917cb258..283c08257005 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/dup.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/dup.c @@ -242,8 +242,7 @@ uint32x4_t test_vdupq_m_n_u32(uint32x4_t inactive, uint32_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[DOTSPLAT]], <8 x half> undef -// CHECK-NEXT: ret <8 x half> [[TMP2]] +// CHECK-NEXT: ret <8 x half> [[DOTSPLAT]] // float16x8_t test_vdupq_x_n_f16(float16_t a, mve_pred16_t p) { @@ -256,8 +255,7 @@ float16x8_t test_vdupq_x_n_f16(float16_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[DOTSPLAT]], <4 x float> undef -// CHECK-NEXT: ret <4 x float> [[TMP2]] +// CHECK-NEXT: ret <4 x float> [[DOTSPLAT]] // float32x4_t test_vdupq_x_n_f32(float32_t a, mve_pred16_t p) { @@ -270,8 +268,7 @@ float32x4_t test_vdupq_x_n_f32(float32_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[DOTSPLAT]], <16 x i8> undef -// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// CHECK-NEXT: ret <16 x i8> [[DOTSPLAT]] // int8x16_t test_vdupq_x_n_s8(int8_t a, mve_pred16_t p) { @@ -284,8 +281,7 @@ int8x16_t test_vdupq_x_n_s8(int8_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[DOTSPLAT]], <8 x i16> undef -// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// CHECK-NEXT: ret <8 x i16> [[DOTSPLAT]] // int16x8_t test_vdupq_x_n_s16(int16_t a, mve_pred16_t p) { @@ -298,8 +294,7 @@ int16x8_t test_vdupq_x_n_s16(int16_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x 
i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[DOTSPLAT]], <4 x i32> undef -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[DOTSPLAT]] // int32x4_t test_vdupq_x_n_s32(int32_t a, mve_pred16_t p) { @@ -312,8 +307,7 @@ int32x4_t test_vdupq_x_n_s32(int32_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[DOTSPLAT]], <16 x i8> undef -// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// CHECK-NEXT: ret <16 x i8> [[DOTSPLAT]] // uint8x16_t test_vdupq_x_n_u8(uint8_t a, mve_pred16_t p) { @@ -326,8 +320,7 @@ uint8x16_t test_vdupq_x_n_u8(uint8_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[DOTSPLAT]], <8 x i16> undef -// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// CHECK-NEXT: ret <8 x i16> [[DOTSPLAT]] // uint16x8_t test_vdupq_x_n_u16(uint16_t a, mve_pred16_t p) { @@ -340,8 +333,7 @@ uint16x8_t test_vdupq_x_n_u16(uint16_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> 
[[DOTSPLAT]], <4 x i32> undef -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[DOTSPLAT]] // uint32x4_t test_vdupq_x_n_u32(uint32_t a, mve_pred16_t p) { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 0975a65d183e..d3bdf9d6aafd 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4118,15 +4118,9 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, if (TrueVal == FalseVal) return TrueVal; - // If the true or false value is undef, we can fold to the other value as - // long as the other value isn't poison. - // select ?, undef, X -> X - if (isa(TrueVal) && - isGuaranteedNotToBeUndefOrPoison(FalseVal, Q.CxtI, Q.DT)) + if (isa(TrueVal)) // select ?, undef, X -> X return FalseVal; - // select ?, X, undef -> X - if (isa(FalseVal) && - isGuaranteedNotToBeUndefOrPoison(TrueVal, Q.CxtI, Q.DT)) + if (isa(FalseVal)) // select ?, X, undef -> X return TrueVal; // Deal with partial undef vector constants: select ?, VecC, VecC' --> VecC'' @@ -4146,11 +4140,9 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, // one element is undef, choose the defined element as the safe result. 
if (TEltC == FEltC) NewC.push_back(TEltC); - else if (isa(TEltC) && - isGuaranteedNotToBeUndefOrPoison(FEltC)) + else if (isa(TEltC)) NewC.push_back(FEltC); - else if (isa(FEltC) && - isGuaranteedNotToBeUndefOrPoison(TEltC)) + else if (isa(FEltC)) NewC.push_back(TEltC); else break; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index f02246cda7fc..f3c3e9ad9f69 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -779,30 +779,10 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond, if (isa(V1)) return V1; return V2; } - + if (isa(V1)) return V2; + if (isa(V2)) return V1; if (V1 == V2) return V1; - // If the true or false value is undef, we can fold to the other value as - // long as the other value isn't poison. - auto NotPoison = [](Constant *C) { - // TODO: We can analyze ConstExpr by opcode to determine if there is any - // possibility of poison. - if (isa(C)) - return false; - - if (isa(C) || isa(C) || isa(C) || - isa(C) || isa(C)) - return true; - - if (C->getType()->isVectorTy()) - return !C->containsUndefElement() && !C->containsConstantExpression(); - - // TODO: Recursively analyze aggregates or other constants. 
- return false; - }; - if (isa(V1) && NotPoison(V2)) return V2; - if (isa(V2) && NotPoison(V1)) return V1; - if (ConstantExpr *TrueVal = dyn_cast(V1)) { if (TrueVal->getOpcode() == Instruction::Select) if (TrueVal->getOperand(0) == Cond) diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll index 3acd21c73958..1fa4bdc1964e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll @@ -221,7 +221,7 @@ define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null } ; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr( -; CHECK: store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* undef), align 4 +; CHECK: store i32 7, i32 addrspace(3)* null define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { store i32 7, i32* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32* addrspacecast (i32 addrspace(3)* null to i32*), i32* addrspacecast (i32 addrspace(1)* undef to i32*)), align 4 ret void diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 08e547a6ea0a..8cd0e35139a8 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2437,14 +2437,13 @@ exit: ret i32 %sel } -; Negative tests to ensure we don't remove selects with undef true/false values. +; FIXME: We shouldn't remove selects with undef true/false values. 
; See https://bugs.llvm.org/show_bug.cgi?id=31633 ; https://lists.llvm.org/pipermail/llvm-dev/2016-October/106182.html ; https://reviews.llvm.org/D83360 define i32 @false_undef(i1 %cond, i32 %x) { ; CHECK-LABEL: @false_undef( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i32 [[X:%.*]], i32 undef -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; %s = select i1 %cond, i32 %x, i32 undef ret i32 %s @@ -2452,8 +2451,7 @@ define i32 @false_undef(i1 %cond, i32 %x) { define i32 @true_undef(i1 %cond, i32 %x) { ; CHECK-LABEL: @true_undef( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i32 undef, i32 [[X:%.*]] -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; %s = select i1 %cond, i32 undef, i32 %x ret i32 %s @@ -2461,8 +2459,7 @@ define i32 @true_undef(i1 %cond, i32 %x) { define <2 x i32> @false_undef_vec(i1 %cond, <2 x i32> %x) { ; CHECK-LABEL: @false_undef_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], <2 x i32> [[X:%.*]], <2 x i32> undef -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; %s = select i1 %cond, <2 x i32> %x, <2 x i32> undef ret <2 x i32> %s @@ -2470,8 +2467,7 @@ define <2 x i32> @false_undef_vec(i1 %cond, <2 x i32> %x) { define <2 x i32> @true_undef_vec(i1 %cond, <2 x i32> %x) { ; CHECK-LABEL: @true_undef_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], <2 x i32> undef, <2 x i32> [[X:%.*]] -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; %s = select i1 %cond, <2 x i32> undef, <2 x i32> %x ret <2 x i32> %s diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll index 353f2e6a6753..b1264138a15e 100644 --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -751,14 +751,13 @@ define i1 @y_might_be_poison(float %x, float %y) { ret i1 %c3 } -; Negative tests to ensure we don't remove selects with undef true/false values. 
+; FIXME: We shouldn't remove selects with undef true/false values. ; See https://bugs.llvm.org/show_bug.cgi?id=31633 ; https://lists.llvm.org/pipermail/llvm-dev/2016-October/106182.html ; https://reviews.llvm.org/D83360 define i32 @false_undef(i1 %cond, i32 %x) { ; CHECK-LABEL: @false_undef( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i32 [[X:%.*]], i32 undef -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; %s = select i1 %cond, i32 %x, i32 undef ret i32 %s @@ -766,8 +765,7 @@ define i32 @false_undef(i1 %cond, i32 %x) { define i32 @true_undef(i1 %cond, i32 %x) { ; CHECK-LABEL: @true_undef( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i32 undef, i32 [[X:%.*]] -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; %s = select i1 %cond, i32 undef, i32 %x ret i32 %s @@ -775,8 +773,7 @@ define i32 @true_undef(i1 %cond, i32 %x) { define <2 x i32> @false_undef_vec(i1 %cond, <2 x i32> %x) { ; CHECK-LABEL: @false_undef_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], <2 x i32> [[X:%.*]], <2 x i32> undef -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; %s = select i1 %cond, <2 x i32> %x, <2 x i32> undef ret <2 x i32> %s @@ -784,8 +781,7 @@ define <2 x i32> @false_undef_vec(i1 %cond, <2 x i32> %x) { define <2 x i32> @true_undef_vec(i1 %cond, <2 x i32> %x) { ; CHECK-LABEL: @true_undef_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], <2 x i32> undef, <2 x i32> [[X:%.*]] -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; %s = select i1 %cond, <2 x i32> undef, <2 x i32> %x ret <2 x i32> %s @@ -847,13 +843,12 @@ define i32 @false_undef_false_freeze(i1 %cond, i32 %x) { @g = external global i32, align 1 -; Make sure we don't fold partial undef vectors when constexprs are involved. +; FIXME: We shouldn't fold partial undef vectors when constexprs are involved. ; We would need to prove the constexpr doesn't result in poison which we aren't ; equiped to do yet. 
define <2 x i32> @false_undef_true_constextpr_vec(i1 %cond) { ; CHECK-LABEL: @false_undef_true_constextpr_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], <2 x i32> , <2 x i32> -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> ; %s = select i1 %cond, <2 x i32> , <2 x i32> ret <2 x i32> %s @@ -891,11 +886,10 @@ define <2 x float> @all_constant_false_undef_vec() { ret <2 x float> %s } -; Negative tests. Don't fold if the non-undef operand is a constexpr. +; FIXME: We shouldn't fold if the non-undef operand is a constexpr. define i32 @all_constant_false_undef_true_constexpr() { ; CHECK-LABEL: @all_constant_false_undef_true_constexpr( -; CHECK-NEXT: [[S:%.*]] = select i1 ptrtoint (i32 ()* @all_constant_false_undef_true_constexpr to i1), i32 ptrtoint (i32 ()* @all_constant_false_undef_true_constexpr to i32), i32 undef -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: ret i32 ptrtoint (i32 ()* @all_constant_false_undef_true_constexpr to i32) ; %s = select i1 ptrtoint (i32 ()* @all_constant_false_undef_true_constexpr to i1), i32 ptrtoint (i32 ()* @all_constant_false_undef_true_constexpr to i32), i32 undef ret i32 %s @@ -903,18 +897,16 @@ define i32 @all_constant_false_undef_true_constexpr() { define i32 @all_constant_true_undef_false_constexpr() { ; CHECK-LABEL: @all_constant_true_undef_false_constexpr( -; CHECK-NEXT: [[S:%.*]] = select i1 ptrtoint (i32 ()* @all_constant_true_undef_false_constexpr to i1), i32 undef, i32 ptrtoint (i32 ()* @all_constant_true_undef_false_constexpr to i32) -; CHECK-NEXT: ret i32 [[S]] +; CHECK-NEXT: ret i32 ptrtoint (i32 ()* @all_constant_true_undef_false_constexpr to i32) ; %s = select i1 ptrtoint (i32 ()* @all_constant_true_undef_false_constexpr to i1), i32 undef, i32 ptrtoint (i32 ()* @all_constant_true_undef_false_constexpr to i32) ret i32 %s } -; Negative tests. Don't fold if the non-undef operand is a vector containing a constexpr. +; FIXME: We shouldn't fold if the non-undef operand is a vector containing a constexpr. 
define <2 x i32> @all_constant_false_undef_true_constexpr_vec() { ; CHECK-LABEL: @all_constant_false_undef_true_constexpr_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 ptrtoint (<2 x i32> ()* @all_constant_false_undef_true_constexpr_vec to i1), <2 x i32> ()* @all_constant_false_undef_true_constexpr_vec to i32), i32 -1>, <2 x i32> undef -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> ()* @all_constant_false_undef_true_constexpr_vec to i32), i32 -1> ; %s = select i1 ptrtoint (<2 x i32> ()* @all_constant_false_undef_true_constexpr_vec to i1), <2 x i32> ()* @all_constant_false_undef_true_constexpr_vec to i32), i32 -1>, <2 x i32> undef ret <2 x i32> %s @@ -922,8 +914,7 @@ define <2 x i32> @all_constant_false_undef_true_constexpr_vec() { define <2 x i32> @all_constant_true_undef_false_constexpr_vec() { ; CHECK-LABEL: @all_constant_true_undef_false_constexpr_vec( -; CHECK-NEXT: [[S:%.*]] = select i1 ptrtoint (<2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i1), <2 x i32> undef, <2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i32)> -; CHECK-NEXT: ret <2 x i32> [[S]] +; CHECK-NEXT: ret <2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i32)> ; %s = select i1 ptrtoint (<2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i1), <2 x i32> undef, <2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i32)> ret <2 x i32> %s From 3388ca490dc61365a6607b3217bfe446de3eabe4 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 7 Jul 2020 14:52:45 -0400 Subject: [PATCH 007/363] [OPENMP]Fix PR46593: Reduction initializer missing construnctor call. Summary: If user-defined reductions with the initializer are used with classes, the compiler misses the constructor call when trying to create a private copy of the reduction variable. 
Reviewers: jdoerfert Subscribers: cfe-commits, yaxunl, guansong, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D83334 (cherry picked from commit 41d0af00740ac5140f11c7f37157fc6e6dd1b016) --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 3 +++ clang/lib/Sema/SemaOpenMP.cpp | 18 ++++++++++++------ .../test/OpenMP/for_reduction_codegen_UDR.cpp | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 43cbe9c720ea..a7e1fe8560b6 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -886,8 +886,11 @@ void ReductionCodeGen::emitInitialization( SharedType, SharedAddresses[N].first.getBaseInfo(), CGF.CGM.getTBAAInfoForSubobject(SharedAddresses[N].first, SharedType)); if (CGF.getContext().getAsArrayType(PrivateVD->getType())) { + if (DRD && DRD->getInitializer()) + (void)DefaultInit(CGF); emitAggregateInitialization(CGF, N, PrivateAddr, SharedLVal, DRD); } else if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) { + (void)DefaultInit(CGF); emitInitWithReductionInitializer(CGF, DRD, ClausesData[N].ReductionOp, PrivateAddr, SharedLVal.getAddress(CGF), SharedLVal.getType()); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 920463da4027..8bf605e5e76b 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -15153,6 +15153,7 @@ static bool actOnOMPReductionKindClause( auto *DRDRef = DeclareReductionRef.getAs(); auto *DRD = cast(DRDRef->getDecl()); if (DRD->getInitializer()) { + S.ActOnUninitializedDecl(PrivateVD); Init = DRDRef; RHSVD->setInit(DRDRef); RHSVD->setInitStyle(VarDecl::CallInit); @@ -15259,10 +15260,19 @@ static bool actOnOMPReductionKindClause( llvm_unreachable("Unexpected reduction operation"); } } - if (Init && DeclareReductionRef.isUnset()) + if (Init && DeclareReductionRef.isUnset()) { S.AddInitializerToDecl(RHSVD, Init, 
/*DirectInit=*/false); - else if (!Init) + // Store initializer for single element in private copy. Will be used + // during codegen. + PrivateVD->setInit(RHSVD->getInit()); + PrivateVD->setInitStyle(RHSVD->getInitStyle()); + } else if (!Init) { S.ActOnUninitializedDecl(RHSVD); + // Store initializer for single element in private copy. Will be used + // during codegen. + PrivateVD->setInit(RHSVD->getInit()); + PrivateVD->setInitStyle(RHSVD->getInitStyle()); + } if (RHSVD->isInvalidDecl()) continue; if (!RHSVD->hasInit() && @@ -15276,10 +15286,6 @@ static bool actOnOMPReductionKindClause( << D; continue; } - // Store initializer for single element in private copy. Will be used during - // codegen. - PrivateVD->setInit(RHSVD->getInit()); - PrivateVD->setInitStyle(RHSVD->getInitStyle()); DeclRefExpr *PrivateDRE = buildDeclRefExpr(S, PrivateVD, PrivateTy, ELoc); ExprResult ReductionOp; if (DeclareReductionRef.isUsable()) { diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp index 45962b3ed2b1..31168bc325e3 100644 --- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp +++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp @@ -203,9 +203,11 @@ int main() { // For + reduction operation initial value of private variable is -1. // CHECK: call void [[RED_INIT1:@.+]](float* %{{.+}}, float* %{{.+}}) +// CHECK: call void @_ZN1SIfEC1Ev([[S_FLOAT_TY]]* [[VAR_PRIV]] // For & reduction operation initial value of private variable is defined by call of 'init()' function. // CHECK: call void [[RED_INIT2:@.+]]( +// CHECK: call void @_ZN1SIfEC1Ev([[S_FLOAT_TY]]* [[VAR1_PRIV]] // For && reduction operation initial value of private variable is 1.0. // CHECK: call void [[RED_INIT3:@.+]]( @@ -598,6 +600,17 @@ int main() { // CHECK: br i1 [[DONE]], // Check initialization of private copy. 
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0 +// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40 +// CHECK: br label %[[CTOR:[^,]+]] +// CHECK: [[CTOR]]: +// CHECK: [[CUR:%.+]] = phi [[S_FLOAT_TY]]* [ [[BEGIN]], %{{.+}} ], [ [[NEXT:%.+]], %[[CTOR]] ] +// CHECK: call void @_ZN1SIfEC1Ev([[S_FLOAT_TY]]* [[CUR]]) +// CHECK: [[NEXT:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[CUR]], i64 1 +// CHECK: [[IS_DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* [[NEXT]], [[END]] +// CHECK: br i1 [[IS_DONE]], label %[[DONE:[^,]+]], label %[[CTOR]] +// CHECK: [[DONE]]: + // CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0 // CHECK: [[LHS_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* %{{.+}} to [[S_FLOAT_TY]]* // CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40 @@ -901,9 +914,11 @@ int main() { // For + reduction operation initial value of private variable is 0. // CHECK: call void [[RED_INIT6:@.+]]( +// CHECK: call void @_ZN1SIiEC1Ev([[S_INT_TY]]* [[VAR_PRIV]] // For & reduction operation initial value of private variable is ones in all bits. // CHECK: call void [[RED_INIT2:@.+]]( +// CHECK: call void @_ZN1SIiEC1Ev([[S_INT_TY]]* [[VAR1_PRIV]] // For && reduction operation initial value of private variable is 1.0. // CHECK: call void [[RED_INIT7:@.+]]( From 73e8ca7bbad561170a874de6246863a0b9fc24f9 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 13 Jul 2020 12:56:18 -0400 Subject: [PATCH 008/363] [OPENMP]Fix PR46688: cast the type of the allocated variable to the initial one. Summary: If the original variable is marked for allocation in the different address space using #pragma omp allocate, need to cast the allocated variable to its original type with the original address space. 
Otherwise, the compiler may crash trying to bitcast the type of the new allocated variable to the original type in some cases, like passing this variable as an argument in function calls. Reviewers: jdoerfert Subscribers: jholewinski, cfe-commits, yaxunl, guansong, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D83696 (cherry picked from commit 9dc327d1b74637dac6dc432fb66f88711af16a55) --- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 57 ++++++++------------ clang/test/OpenMP/nvptx_allocate_codegen.cpp | 2 +- 2 files changed, 24 insertions(+), 35 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index cbd443134e7a..ac6ec742335c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -4770,6 +4770,7 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) { if (VD && VD->hasAttr()) { const auto *A = VD->getAttr(); + auto AS = LangAS::Default; switch (A->getAllocatorType()) { // Use the default allocator here as by default local vars are // threadlocal. @@ -4783,42 +4784,30 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: // TODO: implement aupport for user-defined allocators. 
return Address::invalid(); - case OMPAllocateDeclAttr::OMPConstMemAlloc: { - llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), VarTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(VarTy), VD->getName(), - /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant)); - CharUnits Align = CGM.getContext().getDeclAlign(VD); - GV->setAlignment(Align.getAsAlign()); - return Address(GV, Align); - } - case OMPAllocateDeclAttr::OMPPTeamMemAlloc: { - llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), VarTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(VarTy), VD->getName(), - /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - CharUnits Align = CGM.getContext().getDeclAlign(VD); - GV->setAlignment(Align.getAsAlign()); - return Address(GV, Align); - } + case OMPAllocateDeclAttr::OMPConstMemAlloc: + AS = LangAS::cuda_constant; + break; + case OMPAllocateDeclAttr::OMPPTeamMemAlloc: + AS = LangAS::cuda_shared; + break; case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: - case OMPAllocateDeclAttr::OMPCGroupMemAlloc: { - llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), VarTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(VarTy), VD->getName()); - CharUnits Align = CGM.getContext().getDeclAlign(VD); - GV->setAlignment(Align.getAsAlign()); - return Address(GV, Align); - } + case OMPAllocateDeclAttr::OMPCGroupMemAlloc: + break; } + llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), VarTy, /*isConstant=*/false, + llvm::GlobalValue::InternalLinkage, 
llvm::Constant::getNullValue(VarTy), + VD->getName(), + /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(AS)); + CharUnits Align = CGM.getContext().getDeclAlign(VD); + GV->setAlignment(Align.getAsAlign()); + return Address( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace( + VD->getType().getAddressSpace()))), + Align); } if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic) diff --git a/clang/test/OpenMP/nvptx_allocate_codegen.cpp b/clang/test/OpenMP/nvptx_allocate_codegen.cpp index 46565443354e..01542ca4044a 100644 --- a/clang/test/OpenMP/nvptx_allocate_codegen.cpp +++ b/clang/test/OpenMP/nvptx_allocate_codegen.cpp @@ -101,7 +101,7 @@ void bar() { // CHECK: alloca float, // CHECK-NOT: alloca double, // CHECK: load float, float* % -// CHECK: store double {{.+}}, double addrspace(3)* @bar_b, +// CHECK: store double {{.+}}, double* addrspacecast (double addrspace(3)* @bar_b to double*), } #pragma omp end declare target From 59521a06026e3912319db118340d48430838db09 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 15 Jul 2020 13:32:59 -0400 Subject: [PATCH 009/363] [InstCombine] update datalayout in test file; NFC We need to specify legal integer widths to trigger PR46712, so add those here. This doesn't appear to affect any existing tests, and it's not clear why a datalayout would not include any legal integer widths. While here, change some variable names that include 'tmp' to avoid warnings from the auto-generating script for CHECK lines. 
(cherry picked from commit efc30e591bb5a6e869fd8e084bd310ae516b0fae) --- llvm/test/Transforms/InstCombine/or.ll | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 8037af681465..b747c5d97810 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n32:64" define i32 @test12(i32 %A) { ; Should be eliminated @@ -107,17 +107,17 @@ define i32 @test20(i32 %x) { ret i32 %z } -define i32 @test21(i32 %tmp.1) { +define i32 @test21(i32 %t1) { ; CHECK-LABEL: @test21( -; CHECK-NEXT: [[TMP_1_MASK1:%.*]] = add i32 [[TMP_1:%.*]], 2 -; CHECK-NEXT: ret i32 [[TMP_1_MASK1]] +; CHECK-NEXT: [[T1_MASK1:%.*]] = add i32 [[T1:%.*]], 2 +; CHECK-NEXT: ret i32 [[T1_MASK1]] ; - %tmp.1.mask1 = add i32 %tmp.1, 2 - %tmp.3 = and i32 %tmp.1.mask1, -2 - %tmp.5 = and i32 %tmp.1, 1 + %t1.mask1 = add i32 %t1, 2 + %t3 = and i32 %t1.mask1, -2 + %t5 = and i32 %t1, 1 ;; add tmp.1, 2 - %tmp.6 = or i32 %tmp.5, %tmp.3 - ret i32 %tmp.6 + %t6 = or i32 %t5, %t3 + ret i32 %t6 } define i32 @test22(i32 %B) { From 12aa43e621ff3b60b515eaf33bb25ba439094140 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 15 Jul 2020 14:09:46 -0400 Subject: [PATCH 010/363] [InstCombine] prevent infinite looping in or-icmp fold (PR46712) I'm not sure if the test is truly minimal, but we need to induce a situation where a value becomes a constant but is not immediately folded before getting to the 'or' transform. 
(cherry picked from commit d8b268680d0858aaf30cb1a278b64b11361bc780) --- .../InstCombine/InstCombineAndOrXor.cpp | 3 +- llvm/test/Transforms/InstCombine/or.ll | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index d3c718a919c0..1304d46fdef4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1148,11 +1148,12 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1, assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op"); // Match an equality compare with a non-poison constant as Cmp0. + // Also, give up if the compare can be constant-folded to avoid looping. ICmpInst::Predicate Pred0; Value *X; Constant *C; if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) || - !isGuaranteedNotToBeUndefOrPoison(C)) + !isGuaranteedNotToBeUndefOrPoison(C) || isa(X)) return nullptr; if ((IsAnd && Pred0 != ICmpInst::ICMP_EQ) || (!IsAnd && Pred0 != ICmpInst::ICMP_NE)) diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index b747c5d97810..48496948c190 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -841,3 +841,38 @@ define <16 x i1> @test51(<16 x i1> %arg, <16 x i1> %arg1) { %tmp3 = or <16 x i1> %tmp, %tmp2 ret <16 x i1> %tmp3 } + +; This would infinite loop because it reaches a transform +; that was not expecting a constant-foldable value. 
+ +define i32 @PR46712(i1 %x, i1 %y, i1 %b, i64 %z) { +; CHECK-LABEL: @PR46712( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[B:%.*]], label [[TRUE:%.*]], label [[END:%.*]] +; CHECK: true: +; CHECK-NEXT: [[BOOL5:%.*]] = icmp eq i64 [[Z:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = zext i1 [[BOOL5]] to i32 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[T5:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SEL]], [[TRUE]] ] +; CHECK-NEXT: ret i32 [[T5]] +; +entry: + %t2 = or i1 %x, %y + %conv = sext i1 %t2 to i32 + %cmp = icmp sge i32 %conv, 1 + %conv2 = zext i1 %cmp to i64 + br i1 %b, label %true, label %end + +true: + %bool4 = icmp eq i64 %conv2, 0 + %bool5 = icmp ne i64 %z, 0 + %and = and i1 %bool4, %bool5 + %sel = select i1 %and, i1 false, i1 true + br label %end + +end: + %t5 = phi i1 [ 0, %entry ], [ %sel, %true ] + %conv8 = zext i1 %t5 to i32 + ret i32 %conv8 +} From 6e3fe0813c6a0f58f973badbef56af784ef5937b Mon Sep 17 00:00:00 2001 From: Eric Christopher Date: Thu, 16 Jul 2020 11:40:43 -0700 Subject: [PATCH 011/363] Temporarily Revert "[AssumeBundles] Use operand bundles to encode alignment assumptions" due to the performance bugs filed in https://bugs.llvm.org/show_bug.cgi?id=46753. An SROA change soon may obviate some of these problems. This reverts commit 8d09f20798ac180b1749276bff364682ce0196ab. 
(cherry picked from commit 7bfaa40086359ed7e41c862ab0a65e0bb1be0aeb) --- clang/lib/CodeGen/CodeGenFunction.cpp | 36 +----- clang/test/CodeGen/align_value.cpp | 30 ++++- clang/test/CodeGen/alloc-align-attr.c | 44 +++++-- ...ssume-aligned-and-alloc-align-attributes.c | 8 +- clang/test/CodeGen/builtin-align-array.c | 32 +++-- clang/test/CodeGen/builtin-align.c | 24 +++- clang/test/CodeGen/builtin-assume-aligned.c | 32 ++++- ...mption-attribute-align_value-on-lvalue.cpp | 8 +- ...tion-attribute-align_value-on-paramvar.cpp | 2 +- ...ibute-alloc_align-on-function-variable.cpp | 10 +- ...tion-attribute-alloc_align-on-function.cpp | 2 +- ...-assume_aligned-on-function-two-params.cpp | 10 +- ...n-attribute-assume_aligned-on-function.cpp | 2 +- ...n_assume_aligned-three-params-variable.cpp | 10 +- ...on-builtin_assume_aligned-three-params.cpp | 10 +- ...tion-builtin_assume_aligned-two-params.cpp | 8 +- .../catch-alignment-assumption-openmp.cpp | 8 +- .../non-power-of-2-alignment-assumptions.c | 13 +- clang/test/OpenMP/simd_codegen.cpp | 16 +++ clang/test/OpenMP/simd_metadata.c | 117 ++++++++++------- ...s_distribute_parallel_for_simd_codegen.cpp | 5 +- llvm/include/llvm/IR/IRBuilder.h | 28 ++-- .../Scalar/AlignmentFromAssumptions.h | 6 +- llvm/lib/Analysis/AssumeBundleQueries.cpp | 13 +- llvm/lib/IR/IRBuilder.cpp | 77 +++++++---- llvm/lib/IR/Verifier.cpp | 23 +--- .../InstCombine/InstCombineCalls.cpp | 15 +-- .../Scalar/AlignmentFromAssumptions.cpp | 121 +++++++++++++----- .../AlignmentFromAssumptions/simple.ll | 75 +++++++---- .../AlignmentFromAssumptions/simple32.ll | 114 +++++++++++++---- llvm/test/Transforms/Inline/align.ll | 15 ++- llvm/test/Transforms/InstCombine/assume.ll | 1 - .../inlining-alignment-assumptions.ll | 27 +++- llvm/test/Verifier/assume-bundles.ll | 16 +-- .../Analysis/AssumeBundleQueriesTest.cpp | 38 ------ 35 files changed, 627 insertions(+), 369 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp 
index 4a7c84562dee..8ce488f35dd3 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2154,39 +2154,13 @@ void CodeGenFunction::emitAlignmentAssumption(llvm::Value *PtrValue, SourceLocation AssumptionLoc, llvm::Value *Alignment, llvm::Value *OffsetValue) { - if (Alignment->getType() != IntPtrTy) - Alignment = - Builder.CreateIntCast(Alignment, IntPtrTy, false, "casted.align"); - if (OffsetValue && OffsetValue->getType() != IntPtrTy) - OffsetValue = - Builder.CreateIntCast(OffsetValue, IntPtrTy, true, "casted.offset"); - llvm::Value *TheCheck = nullptr; + llvm::Value *TheCheck; + llvm::Instruction *Assumption = Builder.CreateAlignmentAssumption( + CGM.getDataLayout(), PtrValue, Alignment, OffsetValue, &TheCheck); if (SanOpts.has(SanitizerKind::Alignment)) { - llvm::Value *PtrIntValue = - Builder.CreatePtrToInt(PtrValue, IntPtrTy, "ptrint"); - - if (OffsetValue) { - bool IsOffsetZero = false; - if (const auto *CI = dyn_cast(OffsetValue)) - IsOffsetZero = CI->isZero(); - - if (!IsOffsetZero) - PtrIntValue = Builder.CreateSub(PtrIntValue, OffsetValue, "offsetptr"); - } - - llvm::Value *Zero = llvm::ConstantInt::get(IntPtrTy, 0); - llvm::Value *Mask = - Builder.CreateSub(Alignment, llvm::ConstantInt::get(IntPtrTy, 1)); - llvm::Value *MaskedPtr = Builder.CreateAnd(PtrIntValue, Mask, "maskedptr"); - TheCheck = Builder.CreateICmpEQ(MaskedPtr, Zero, "maskcond"); + emitAlignmentAssumptionCheck(PtrValue, Ty, Loc, AssumptionLoc, Alignment, + OffsetValue, TheCheck, Assumption); } - llvm::Instruction *Assumption = Builder.CreateAlignmentAssumption( - CGM.getDataLayout(), PtrValue, Alignment, OffsetValue); - - if (!SanOpts.has(SanitizerKind::Alignment)) - return; - emitAlignmentAssumptionCheck(PtrValue, Ty, Loc, AssumptionLoc, Alignment, - OffsetValue, TheCheck, Assumption); } void CodeGenFunction::emitAlignmentAssumption(llvm::Value *PtrValue, diff --git a/clang/test/CodeGen/align_value.cpp b/clang/test/CodeGen/align_value.cpp index 
a18cb651fe4c..acbfbaf2ba5c 100644 --- a/clang/test/CodeGen/align_value.cpp +++ b/clang/test/CodeGen/align_value.cpp @@ -29,7 +29,10 @@ struct ad_struct { // CHECK-NEXT: [[TMP0:%.*]] = load %struct.ad_struct*, %struct.ad_struct** [[X_ADDR]], align 8 // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_AD_STRUCT:%.*]], %struct.ad_struct* [[TMP0]], i32 0, i32 0 // CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[A]], align 8 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret double* [[TMP1]] // double *foo(ad_struct& x) { @@ -45,7 +48,10 @@ double *foo(ad_struct& x) { // CHECK-NEXT: [[TMP0:%.*]] = load %struct.ad_struct*, %struct.ad_struct** [[X_ADDR]], align 8 // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_AD_STRUCT:%.*]], %struct.ad_struct* [[TMP0]], i32 0, i32 0 // CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[A]], align 8 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret double* [[TMP1]] // double *goo(ad_struct *x) { @@ -60,7 +66,10 @@ double *goo(ad_struct *x) { // CHECK-NEXT: store double** [[X]], double*** [[X_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load double**, double*** [[X_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[TMP0]], align 8 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64 +// CHECK-NEXT: 
[[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret double* [[TMP1]] // double *bar(aligned_double *x) { @@ -75,7 +84,10 @@ double *bar(aligned_double *x) { // CHECK-NEXT: store double** [[X]], double*** [[X_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load double**, double*** [[X_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[TMP0]], align 8 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret double* [[TMP1]] // double *car(aligned_double &x) { @@ -91,7 +103,10 @@ double *car(aligned_double &x) { // CHECK-NEXT: [[TMP0:%.*]] = load double**, double*** [[X_ADDR]], align 8 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double*, double** [[TMP0]], i64 5 // CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[ARRAYIDX]], align 8 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[TMP1]], i64 64) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint double* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret double* [[TMP1]] // double *dar(aligned_double *x) { @@ -103,7 +118,10 @@ aligned_double eep(); // CHECK-LABEL: define {{[^@]+}}@_Z3retv() #0 // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call double* @_Z3eepv() -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[CALL]], i64 64) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint double* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 63 +// CHECK-NEXT: 
[[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret double* [[CALL]] // double *ret() { diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c index 44a57291b47c..9517c50dbb1d 100644 --- a/clang/test/CodeGen/alloc-align-attr.c +++ b/clang/test/CodeGen/alloc-align-attr.c @@ -11,8 +11,12 @@ __INT32_TYPE__*m1(__INT32_TYPE__ i) __attribute__((alloc_align(1))); // CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK-NEXT: [[CALL:%.*]] = call i32* @m1(i32 [[TMP0]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ] +// CHECK-NEXT: [[ALIGNMENTCAST:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -28,8 +32,12 @@ __INT32_TYPE__ test1(__INT32_TYPE__ a) { // CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8 // CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32 // CHECK-NEXT: [[CALL:%.*]] = call i32* @m1(i32 [[CONV]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = zext i32 [[CONV]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ] +// CHECK-NEXT: [[ALIGNMENTCAST:%.*]] = zext i32 [[CONV]] to i64 +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// 
CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -47,7 +55,11 @@ __INT32_TYPE__ *m2(__SIZE_TYPE__ i) __attribute__((alloc_align(1))); // CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 // CHECK-NEXT: [[CALL:%.*]] = call i32* @m2(i64 [[CONV]]) -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CONV]]) ] +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[CONV]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -63,7 +75,11 @@ __INT32_TYPE__ test3(__INT32_TYPE__ a) { // CHECK-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[A_ADDR]], align 8 // CHECK-NEXT: [[CALL:%.*]] = call i32* @m2(i64 [[TMP0]]) -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[TMP0]]) ] +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[TMP0]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP1]] // @@ -99,8 +115,12 @@ __INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2))) // CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[TMP4]], i32 0, i32 1 // CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8 // CHECK-NEXT: [[CALL:%.*]] = call i32* @m3(i64 [[TMP6]], i64 [[TMP8]]) -// CHECK-NEXT: 
[[CASTED_ALIGN:%.*]] = trunc i128 [[TMP3]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ] +// CHECK-NEXT: [[ALIGNMENTCAST:%.*]] = trunc i128 [[TMP3]] to i64 +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP9]] // @@ -137,8 +157,12 @@ __INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align( // CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds { i64, i64 }, { i64, i64 }* [[TMP9]], i32 0, i32 1 // CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 // CHECK-NEXT: [[CALL:%.*]] = call i32* @m4(i64 [[TMP6]], i64 [[TMP8]], i64 [[TMP11]], i64 [[TMP13]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP3]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 [[CASTED_ALIGN]]) ] +// CHECK-NEXT: [[ALIGNMENTCAST:%.*]] = trunc i128 [[TMP3]] to i64 +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP14]] // diff --git a/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c b/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c index cd8a6f19b4f4..fa4ee8db12e7 100644 --- a/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c +++ b/clang/test/CodeGen/assume-aligned-and-alloc-align-attributes.c @@ -36,8 +36,12 @@ void *t2_immediate2() { // 
CHECK-NEXT: store i32 [[ALIGNMENT:%.*]], i32* [[ALIGNMENT_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ALIGNMENT_ADDR]], align 4 // CHECK-NEXT: [[CALL:%.*]] = call align 32 i8* @my_aligned_alloc(i32 320, i32 [[TMP0]]) -// CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[CALL]], i64 [[TMP1]]) ] +// CHECK-NEXT: [[ALIGNMENTCAST:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret i8* [[CALL]] // void *t3_variable(int alignment) { diff --git a/clang/test/CodeGen/builtin-align-array.c b/clang/test/CodeGen/builtin-align-array.c index 31f7b42b5617..97235c33b7fb 100644 --- a/clang/test/CodeGen/builtin-align-array.c +++ b/clang/test/CodeGen/builtin-align-array.c @@ -4,7 +4,7 @@ extern int func(char *c); -// CHECK-LABEL: @test_array( +// CHECK-LABEL: define {{[^@]+}}@test_array() #0 // CHECK-NEXT: entry: // CHECK-NEXT: [[BUF:%.*]] = alloca [1024 x i8], align 16 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 44 @@ -12,7 +12,10 @@ extern int func(char *c); // CHECK-NEXT: [[ALIGNED_INTPTR:%.*]] = and i64 [[INTPTR]], -16 // CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]] // CHECK-NEXT: [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX]], i64 [[DIFF]] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 16) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 15 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: 
[[CALL:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT]]) // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 22 // CHECK-NEXT: [[INTPTR2:%.*]] = ptrtoint i8* [[ARRAYIDX1]] to i64 @@ -20,10 +23,13 @@ extern int func(char *c); // CHECK-NEXT: [[ALIGNED_INTPTR4:%.*]] = and i64 [[OVER_BOUNDARY]], -32 // CHECK-NEXT: [[DIFF5:%.*]] = sub i64 [[ALIGNED_INTPTR4]], [[INTPTR2]] // CHECK-NEXT: [[ALIGNED_RESULT6:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX1]], i64 [[DIFF5]] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT6]], i64 32) ] -// CHECK-NEXT: [[CALL7:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]]) -// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 16 -// CHECK-NEXT: [[SRC_ADDR:%.*]] = ptrtoint i8* [[ARRAYIDX8]] to i64 +// CHECK-NEXT: [[PTRINT7:%.*]] = ptrtoint i8* [[ALIGNED_RESULT6]] to i64 +// CHECK-NEXT: [[MASKEDPTR8:%.*]] = and i64 [[PTRINT7]], 31 +// CHECK-NEXT: [[MASKCOND9:%.*]] = icmp eq i64 [[MASKEDPTR8]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND9]]) +// CHECK-NEXT: [[CALL10:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]]) +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 16 +// CHECK-NEXT: [[SRC_ADDR:%.*]] = ptrtoint i8* [[ARRAYIDX11]] to i64 // CHECK-NEXT: [[SET_BITS:%.*]] = and i64 [[SRC_ADDR]], 63 // CHECK-NEXT: [[IS_ALIGNED:%.*]] = icmp eq i64 [[SET_BITS]], 0 // CHECK-NEXT: [[CONV:%.*]] = zext i1 [[IS_ALIGNED]] to i32 @@ -36,7 +42,7 @@ int test_array(void) { return __builtin_is_aligned(&buf[16], 64); } -// CHECK-LABEL: @test_array_should_not_mask( +// CHECK-LABEL: define {{[^@]+}}@test_array_should_not_mask() #0 // CHECK-NEXT: entry: // CHECK-NEXT: [[BUF:%.*]] = alloca [1024 x i8], align 32 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 64 @@ -44,7 +50,10 @@ int test_array(void) { // CHECK-NEXT: 
[[ALIGNED_INTPTR:%.*]] = and i64 [[INTPTR]], -16 // CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]] // CHECK-NEXT: [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX]], i64 [[DIFF]] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 16) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 15 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[CALL:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT]]) // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[BUF]], i64 0, i64 32 // CHECK-NEXT: [[INTPTR2:%.*]] = ptrtoint i8* [[ARRAYIDX1]] to i64 @@ -52,8 +61,11 @@ int test_array(void) { // CHECK-NEXT: [[ALIGNED_INTPTR4:%.*]] = and i64 [[OVER_BOUNDARY]], -32 // CHECK-NEXT: [[DIFF5:%.*]] = sub i64 [[ALIGNED_INTPTR4]], [[INTPTR2]] // CHECK-NEXT: [[ALIGNED_RESULT6:%.*]] = getelementptr inbounds i8, i8* [[ARRAYIDX1]], i64 [[DIFF5]] -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT6]], i64 32) ] -// CHECK-NEXT: [[CALL7:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]]) +// CHECK-NEXT: [[PTRINT7:%.*]] = ptrtoint i8* [[ALIGNED_RESULT6]] to i64 +// CHECK-NEXT: [[MASKEDPTR8:%.*]] = and i64 [[PTRINT7]], 31 +// CHECK-NEXT: [[MASKCOND9:%.*]] = icmp eq i64 [[MASKEDPTR8]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND9]]) +// CHECK-NEXT: [[CALL10:%.*]] = call i32 @func(i8* [[ALIGNED_RESULT6]]) // CHECK-NEXT: ret i32 1 // int test_array_should_not_mask(void) { diff --git a/clang/test/CodeGen/builtin-align.c b/clang/test/CodeGen/builtin-align.c index 60f7fc99c1d4..7e66e2b5c0b9 100644 --- a/clang/test/CodeGen/builtin-align.c +++ b/clang/test/CodeGen/builtin-align.c @@ -122,7 +122,11 @@ _Bool is_aligned(TYPE ptr, unsigned align) { // CHECK-VOID_PTR-NEXT: [[ALIGNED_INTPTR:%.*]] = and i64 
[[OVER_BOUNDARY]], [[INVERTED_MASK]] // CHECK-VOID_PTR-NEXT: [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]] // CHECK-VOID_PTR-NEXT: [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[DIFF]] -// CHECK-VOID_PTR-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 [[ALIGNMENT]]) ] +// CHECK-VOID_PTR-NEXT: [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1 +// CHECK-VOID_PTR-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64 +// CHECK-VOID_PTR-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]] +// CHECK-VOID_PTR-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-VOID_PTR-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-VOID_PTR-NEXT: ret i8* [[ALIGNED_RESULT]] // // CHECK-FLOAT_PTR-LABEL: define {{[^@]+}}@align_up @@ -138,7 +142,11 @@ _Bool is_aligned(TYPE ptr, unsigned align) { // CHECK-FLOAT_PTR-NEXT: [[TMP0:%.*]] = bitcast float* [[PTR]] to i8* // CHECK-FLOAT_PTR-NEXT: [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 [[DIFF]] // CHECK-FLOAT_PTR-NEXT: [[TMP1:%.*]] = bitcast i8* [[ALIGNED_RESULT]] to float* -// CHECK-FLOAT_PTR-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[TMP1]], i64 [[ALIGNMENT]]) ] +// CHECK-FLOAT_PTR-NEXT: [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1 +// CHECK-FLOAT_PTR-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[TMP1]] to i64 +// CHECK-FLOAT_PTR-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]] +// CHECK-FLOAT_PTR-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-FLOAT_PTR-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-FLOAT_PTR-NEXT: ret float* [[TMP1]] // // CHECK-LONG-LABEL: define {{[^@]+}}@align_up @@ -176,7 +184,11 @@ TYPE align_up(TYPE ptr, unsigned align) { // CHECK-VOID_PTR-NEXT: [[ALIGNED_INTPTR:%.*]] = and i64 [[INTPTR]], [[INVERTED_MASK]] // CHECK-VOID_PTR-NEXT: [[DIFF:%.*]] = sub i64 [[ALIGNED_INTPTR]], [[INTPTR]] // CHECK-VOID_PTR-NEXT: [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 
[[DIFF]] -// CHECK-VOID_PTR-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[ALIGNED_RESULT]], i64 [[ALIGNMENT]]) ] +// CHECK-VOID_PTR-NEXT: [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1 +// CHECK-VOID_PTR-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[ALIGNED_RESULT]] to i64 +// CHECK-VOID_PTR-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]] +// CHECK-VOID_PTR-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-VOID_PTR-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-VOID_PTR-NEXT: ret i8* [[ALIGNED_RESULT]] // // CHECK-FLOAT_PTR-LABEL: define {{[^@]+}}@align_down @@ -191,7 +203,11 @@ TYPE align_up(TYPE ptr, unsigned align) { // CHECK-FLOAT_PTR-NEXT: [[TMP0:%.*]] = bitcast float* [[PTR]] to i8* // CHECK-FLOAT_PTR-NEXT: [[ALIGNED_RESULT:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 [[DIFF]] // CHECK-FLOAT_PTR-NEXT: [[TMP1:%.*]] = bitcast i8* [[ALIGNED_RESULT]] to float* -// CHECK-FLOAT_PTR-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[TMP1]], i64 [[ALIGNMENT]]) ] +// CHECK-FLOAT_PTR-NEXT: [[MASK1:%.*]] = sub i64 [[ALIGNMENT]], 1 +// CHECK-FLOAT_PTR-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[TMP1]] to i64 +// CHECK-FLOAT_PTR-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK1]] +// CHECK-FLOAT_PTR-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-FLOAT_PTR-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-FLOAT_PTR-NEXT: ret float* [[TMP1]] // // CHECK-LONG-LABEL: define {{[^@]+}}@align_down diff --git a/clang/test/CodeGen/builtin-assume-aligned.c b/clang/test/CodeGen/builtin-assume-aligned.c index b9f1ebfbdcf5..90693cc21520 100644 --- a/clang/test/CodeGen/builtin-assume-aligned.c +++ b/clang/test/CodeGen/builtin-assume-aligned.c @@ -8,7 +8,10 @@ // CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32, i64 0) 
] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* // CHECK-NEXT: store i32* [[TMP2]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 @@ -28,7 +31,10 @@ int test1(int *a) { // CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32, i64 0) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* // CHECK-NEXT: store i32* [[TMP2]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 @@ -48,7 +54,10 @@ int test2(int *a) { // CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* // CHECK-NEXT: store i32* [[TMP2]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 @@ -72,7 +81,11 @@ int test3(int *a) { // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] 
to i8* // CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[B_ADDR]], align 4 // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP2]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 32, i64 [[CONV]]) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64 +// CHECK-NEXT: [[OFFSETPTR:%.*]] = sub i64 [[PTRINT]], [[CONV]] +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 31 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to i32* // CHECK-NEXT: store i32* [[TMP3]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[A_ADDR]], align 8 @@ -102,7 +115,11 @@ int *m2() __attribute__((assume_aligned(64, 12))); // CHECK-LABEL: define {{[^@]+}}@test6() #0 // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32* (...) @m2() -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[CALL]], i64 64, i64 12) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[CALL]] to i64 +// CHECK-NEXT: [[OFFSETPTR:%.*]] = sub i64 [[PTRINT]], 12 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 63 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[CALL]], align 4 // CHECK-NEXT: ret i32 [[TMP0]] // @@ -117,7 +134,10 @@ int test6() { // CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[TMP1]], i64 536870912) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[TMP1]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 536870911 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: [[TMP2:%.*]] = bitcast 
i8* [[TMP1]] to i32* // CHECK-NEXT: store i32* [[TMP2]], i32** [[A_ADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[A_ADDR]], align 8 diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp index fb2b1a76116e..96d264190bec 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-lvalue.cpp @@ -21,9 +21,9 @@ char **load_from_ac_struct(struct ac_struct *x) { // CHECK-NEXT: %[[X_RELOADED:.*]] = load %[[STRUCT_AC_STRUCT]]*, %[[STRUCT_AC_STRUCT]]** %[[STRUCT_AC_STRUCT_ADDR]], align 8 // CHECK: %[[A_ADDR:.*]] = getelementptr inbounds %[[STRUCT_AC_STRUCT]], %[[STRUCT_AC_STRUCT]]* %[[X_RELOADED]], i32 0, i32 0 // CHECK: %[[A:.*]] = load i8**, i8*** %[[A_ADDR]], align 8 - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8** %[[A]] to i64 - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 2147483647 - // CHECK-SANITIZE-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8** %[[A]] to i64 + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 2147483647 + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: %[[PTRINT_DUP:.*]] = ptrtoint i8** %[[A]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -32,7 +32,7 @@ char **load_from_ac_struct(struct ac_struct *x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8** %[[A]], i64 2147483648) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // 
CHECK-NEXT: ret i8** %[[A]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp index 46f7d09ae2aa..0e3fa750c66c 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-align_value-on-paramvar.cpp @@ -24,7 +24,7 @@ char **passthrough(__attribute__((align_value(0x80000000))) char **x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-SANITIZE-NEXT: call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RELOADED]], i64 2147483648) ] + // CHECK-SANITIZE-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8** %[[X_RELOADED]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp index 40abbc387199..591eaa0e1313 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function-variable.cpp @@ -30,10 +30,10 @@ char **caller(char **x, unsigned long alignment) { // CHECK-NEXT: %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[ALIGNMENT_RELOADED:.*]] = load i64, i64* %[[ALIGNMENT_ADDR]], align 8 // CHECK-NEXT: %[[X_RETURNED:.*]] = call i8** @[[PASSTHROUGH]](i8** %[[X_RELOADED]], i64 %[[ALIGNMENT_RELOADED]]) - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64 - // CHECK-SANITIZE-NEXT: %[[MASK:.*]] = sub i64 %[[ALIGNMENT_RELOADED]], 1 - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], %[[MASK]] - // CHECK-SANITIZE-NEXT: 
%[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[MASK:.*]] = sub i64 %[[ALIGNMENT_RELOADED]], 1 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64 + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], %[[MASK]] + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: %[[PTRINT_DUP:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -42,7 +42,7 @@ char **caller(char **x, unsigned long alignment) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 %1) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8** %[[X_RETURNED]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp index 87d903c69716..a41357933f91 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-alloc_align-on-function.cpp @@ -39,7 +39,7 @@ char **caller(char **x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-SANITIZE-NEXT: call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 128) ] + // CHECK-SANITIZE-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8** %[[X_RETURNED]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp 
b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp index ecc96bcf6a53..e78667ce16e0 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function-two-params.cpp @@ -24,10 +24,10 @@ char **caller(char **x) { // CHECK-NEXT: store i8** %[[X]], i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[X_RETURNED:.*]] = call i8** @[[PASSTHROUGH]](i8** %[[X_RELOADED]]) - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64 - // CHECK-SANITIZE-NEXT: %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42 - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 2147483647 - // CHECK-SANITIZE-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64 + // CHECK-NEXT: %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42 + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 2147483647 + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: %[[PTRINT_DUP:.*]] = ptrtoint i8** %[[X_RETURNED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -36,7 +36,7 @@ char **caller(char **x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 2147483648, i64 42) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8** %[[X_RETURNED]] // CHECK-NEXT: } #line 100 diff --git 
a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp index 5bbc5843b89f..f750bbd77d42 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-attribute-assume_aligned-on-function.cpp @@ -36,7 +36,7 @@ char **caller(char **x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-SANITIZE-NEXT: call void @llvm.assume(i1 true) [ "align"(i8** %[[X_RETURNED]], i64 128) ] + // CHECK-SANITIZE-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8** %[[X_RETURNED]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp index 9c8944ba280b..4306e322f5fb 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params-variable.cpp @@ -16,10 +16,10 @@ void *caller(char **x, unsigned long offset) { // CHECK-NEXT: %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[BITCAST:.*]] = bitcast i8** %[[X_RELOADED]] to i8* // CHECK-NEXT: %[[OFFSET_RELOADED:.*]] = load i64, i64* %[[OFFSET_ADDR]], align 8 - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64 - // CHECK-SANITIZE-NEXT: %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], %[[OFFSET_RELOADED]] - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911 - // CHECK-SANITIZE-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64 + // CHECK-NEXT: 
%[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], %[[OFFSET_RELOADED]] + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911 + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[BITCAST]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -28,7 +28,7 @@ void *caller(char **x, unsigned long offset) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* %[[BITCAST]], i64 536870912, i64 %[[OFFSET_RELOADED]]) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8* %[[BITCAST]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp index 9f61e08106a0..27f53e92bed8 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-three-params.cpp @@ -13,10 +13,10 @@ void *caller(char **x) { // CHECK-NEXT: store i8** %[[X]], i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[BITCAST:.*]] = bitcast i8** %[[X_RELOADED]] to i8* - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64 - // CHECK-SANITIZE-NEXT: %[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42 - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911 - // CHECK-SANITIZE-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64 + // CHECK-NEXT: 
%[[OFFSETPTR:.*]] = sub i64 %[[PTRINT]], 42 + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[OFFSETPTR]], 536870911 + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[BITCAST]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -25,7 +25,7 @@ void *caller(char **x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* %[[BITCAST]], i64 536870912, i64 42) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8* %[[BITCAST]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp index 20bed646ff95..5412270f3761 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-builtin_assume_aligned-two-params.cpp @@ -13,9 +13,9 @@ void *caller(char **x) { // CHECK-NEXT: store i8** %[[X]], i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[X_RELOADED:.*]] = load i8**, i8*** %[[X_ADDR]], align 8 // CHECK-NEXT: %[[BITCAST:.*]] = bitcast i8** %[[X_RELOADED]] to i8* - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64 - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 536870911 - // CHECK-SANITIZE-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[BITCAST]] to i64 + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 536870911 + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: 
%[[PTRINT_DUP:.*]] = ptrtoint i8* %[[BITCAST]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -24,7 +24,7 @@ void *caller(char **x) { // CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* %[[BITCAST]], i64 536870912) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) // CHECK-NEXT: ret i8* %[[BITCAST]] // CHECK-NEXT: } #line 100 diff --git a/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp b/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp index 353f2fd7f17b..6d75ee0858da 100644 --- a/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp +++ b/clang/test/CodeGen/catch-alignment-assumption-openmp.cpp @@ -12,9 +12,9 @@ void func(char *data) { // CHECK-NEXT: %[[DATA_ADDR:.*]] = alloca i8*, align 8 // CHECK: store i8* %[[DATA]], i8** %[[DATA_ADDR]], align 8 // CHECK: %[[DATA_RELOADED:.*]] = load i8*, i8** %[[DATA_ADDR]], align 8 - // CHECK-SANITIZE-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[DATA_RELOADED]] to i64 - // CHECK-SANITIZE-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 1073741823 - // CHECK-SANITIZE-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 + // CHECK-NEXT: %[[PTRINT:.*]] = ptrtoint i8* %[[DATA_RELOADED]] to i64 + // CHECK-NEXT: %[[MASKEDPTR:.*]] = and i64 %[[PTRINT]], 1073741823 + // CHECK-NEXT: %[[MASKCOND:.*]] = icmp eq i64 %[[MASKEDPTR]], 0 // CHECK-SANITIZE-NEXT: %[[PTRINT_DUP:.*]] = ptrtoint i8* %[[DATA_RELOADED]] to i64, !nosanitize // CHECK-SANITIZE-NEXT: br i1 %[[MASKCOND]], label %[[CONT:.*]], label %[[HANDLER_ALIGNMENT_ASSUMPTION:[^,]+]],{{.*}} !nosanitize // CHECK-SANITIZE: [[HANDLER_ALIGNMENT_ASSUMPTION]]: @@ -23,7 +23,7 @@ void func(char *data) { // CHECK-SANITIZE-TRAP-NEXT: call void 
@llvm.trap(){{.*}}, !nosanitize // CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize // CHECK-SANITIZE: [[CONT]]: - // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* %[[DATA_RELOADED]], i64 1073741824) ] + // CHECK-NEXT: call void @llvm.assume(i1 %[[MASKCOND]]) #line 100 #pragma omp for simd aligned(data : 0x40000000) diff --git a/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c b/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c index b8ce1699f7ed..9467f6228dfc 100644 --- a/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c +++ b/clang/test/CodeGen/non-power-of-2-alignment-assumptions.c @@ -9,8 +9,12 @@ void *__attribute__((alloc_align(1))) alloc(int align); // CHECK-NEXT: store i32 [[ALIGN:%.*]], i32* [[ALIGN_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ALIGN_ADDR]], align 4 // CHECK-NEXT: [[CALL:%.*]] = call i8* @alloc(i32 [[TMP0]]) -// CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[CALL]], i64 [[TMP1]]) ] +// CHECK-NEXT: [[ALIGNMENTCAST:%.*]] = zext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[MASK:%.*]] = sub i64 [[ALIGNMENTCAST]], 1 +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], [[MASK]] +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) // CHECK-NEXT: ret void // void t0(int align) { @@ -21,7 +25,10 @@ void t0(int align) { // CHECK-NEXT: [[ALIGN_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[ALIGN:%.*]], i32* [[ALIGN_ADDR]], align 4 // CHECK-NEXT: [[CALL:%.*]] = call i8* @alloc(i32 7) -// CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i8* [[CALL]], i64 7) ] +// CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i8* [[CALL]] to i64 +// CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 6 +// CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 
[[MASKCOND]]) // CHECK-NEXT: ret void // void t1(int align) { diff --git a/clang/test/OpenMP/simd_codegen.cpp b/clang/test/OpenMP/simd_codegen.cpp index 3440225673c4..cb53bb1aa38b 100644 --- a/clang/test/OpenMP/simd_codegen.cpp +++ b/clang/test/OpenMP/simd_codegen.cpp @@ -817,9 +817,25 @@ void parallel_simd(float *a) { // TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]], // CHECK-LABEL: S8 +// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64 +// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64 +// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64 +// CHECK-DAG: ptrtoint [[SS_TY]]* %{{.+}} to i64 + +// CHECK-DAG: and i64 %{{.+}}, 15 +// CHECK-DAG: icmp eq i64 %{{.+}}, 0 // CHECK-DAG: call void @llvm.assume(i1 + +// CHECK-DAG: and i64 %{{.+}}, 7 +// CHECK-DAG: icmp eq i64 %{{.+}}, 0 // CHECK-DAG: call void @llvm.assume(i1 + +// CHECK-DAG: and i64 %{{.+}}, 15 +// CHECK-DAG: icmp eq i64 %{{.+}}, 0 // CHECK-DAG: call void @llvm.assume(i1 + +// CHECK-DAG: and i64 %{{.+}}, 3 +// CHECK-DAG: icmp eq i64 %{{.+}}, 0 // CHECK-DAG: call void @llvm.assume(i1 struct SS { SS(): a(0) {} diff --git a/clang/test/OpenMP/simd_metadata.c b/clang/test/OpenMP/simd_metadata.c index 18133e3b6c2e..f0ae0200dd08 100644 --- a/clang/test/OpenMP/simd_metadata.c +++ b/clang/test/OpenMP/simd_metadata.c @@ -21,21 +21,30 @@ void h1(float *c, float *a, double b[], int size) // CHECK-LABEL: define void @h1 int t = 0; #pragma omp simd safelen(16) linear(t) aligned(c:32) aligned(a,b) - // CHECK: call void @llvm.assume(i1 true) [ "align"(float* [[PTR4:%.*]], {{i64|i32}} 32) ] - // CHECK-NEXT: load - - // X86-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // X86-AVX-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 32) ] - // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 64) ] - // PPC-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // PPC-QPX-NEXT: 
call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // CHECK-NEXT: load - - // X86-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ] - // X86-AVX-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ] - // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 64) ] - // PPC-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ] - // PPC-QPX-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ] +// CHECK: [[C_PTRINT:%.+]] = ptrtoint +// CHECK-NEXT: [[C_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[C_PTRINT]], 31 +// CHECK-NEXT: [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[C_MASKCOND]]) +// CHECK: [[A_PTRINT:%.+]] = ptrtoint + +// X86-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 +// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31 +// X86-AVX512-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 63 +// PPC-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 +// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 + +// CHECK-NEXT: [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[A_MASKCOND]]) +// CHECK: [[B_PTRINT:%.+]] = ptrtoint + +// X86-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15 +// X86-AVX-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31 +// X86-AVX512-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 63 +// PPC-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15 +// PPC-QPX-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31 + +// CHECK-NEXT: [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[B_MASKCOND]]) for (int i = 0; i < size; ++i) { c[i] = a[i] * a[i] + b[i] * b[t]; ++t; @@ 
-43,21 +52,30 @@ void h1(float *c, float *a, double b[], int size) // do not emit llvm.access.group metadata due to usage of safelen clause. // CHECK-NOT: store float {{.+}}, float* {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} #pragma omp simd safelen(16) linear(t) aligned(c:32) aligned(a,b) simdlen(8) - // CHECK: call void @llvm.assume(i1 true) [ "align"(float* [[PTR4:%.*]], {{i64|i32}} 32) ] - // CHECK-NEXT: load - - // X86-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // X86-AVX-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 32) ] - // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 64) ] - // PPC-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // PPC-QPX-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // CHECK-NEXT: load - - // X86-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ] - // X86-AVX-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ] - // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 64) ] - // PPC-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ] - // PPC-QPX-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ] +// CHECK: [[C_PTRINT:%.+]] = ptrtoint +// CHECK-NEXT: [[C_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[C_PTRINT]], 31 +// CHECK-NEXT: [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[C_MASKCOND]]) +// CHECK: [[A_PTRINT:%.+]] = ptrtoint + +// X86-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 +// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31 +// X86-AVX512-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 63 +// PPC-NEXT: [[A_MASKEDPTR:%.+]] = and 
i{{[0-9]+}} [[A_PTRINT]], 15 +// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 + +// CHECK-NEXT: [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[A_MASKCOND]]) +// CHECK: [[B_PTRINT:%.+]] = ptrtoint + +// X86-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15 +// X86-AVX-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31 +// X86-AVX512-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 63 +// PPC-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15 +// PPC-QPX-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31 + +// CHECK-NEXT: [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[B_MASKCOND]]) for (int i = 0; i < size; ++i) { c[i] = a[i] * a[i] + b[i] * b[t]; ++t; @@ -65,21 +83,30 @@ void h1(float *c, float *a, double b[], int size) // do not emit llvm.access.group metadata due to usage of safelen clause. // CHECK-NOT: store float {{.+}}, float* {{.+}}, align {{.+}}, !llvm.access.group {{![0-9]+}} #pragma omp simd linear(t) aligned(c:32) aligned(a,b) simdlen(8) - // CHECK: call void @llvm.assume(i1 true) [ "align"(float* [[PTR4:%.*]], {{i64|i32}} 32) ] - // CHECK-NEXT: load - - // X86-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // X86-AVX-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 32) ] - // X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 64) ] - // PPC-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // PPC-QPX-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[PTR5:%.*]], {{i64|i32}} 16) ] - // CHECK-NEXT: load - - // X86-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ] - // X86-AVX-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ] - // 
X86-AVX512-NEXT:call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 64) ] - // PPC-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 16) ] - // PPC-QPX-NEXT: call void @llvm.assume(i1 true) [ "align"(double* [[PTR6:%.*]], {{i64|i32}} 32) ] +// CHECK: [[C_PTRINT:%.+]] = ptrtoint +// CHECK-NEXT: [[C_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[C_PTRINT]], 31 +// CHECK-NEXT: [[C_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[C_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[C_MASKCOND]]) +// CHECK: [[A_PTRINT:%.+]] = ptrtoint + +// X86-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 +// X86-AVX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 31 +// X86-AVX512-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 63 +// PPC-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 +// PPC-QPX-NEXT: [[A_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[A_PTRINT]], 15 + +// CHECK-NEXT: [[A_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[A_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[A_MASKCOND]]) +// CHECK: [[B_PTRINT:%.+]] = ptrtoint + +// X86-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15 +// X86-AVX-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31 +// X86-AVX512-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 63 +// PPC-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 15 +// PPC-QPX-NEXT: [[B_MASKEDPTR:%.+]] = and i{{[0-9]+}} [[B_PTRINT]], 31 + +// CHECK-NEXT: [[B_MASKCOND:%.+]] = icmp eq i{{[0-9]+}} [[B_MASKEDPTR]], 0 +// CHECK-NEXT: call void @llvm.assume(i1 [[B_MASKCOND]]) for (int i = 0; i < size; ++i) { c[i] = a[i] * a[i] + b[i] * b[t]; ++t; diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp index 7192ef454d0a..2fc166ed0b87 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp +++ 
b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp @@ -101,7 +101,10 @@ int target_teams_fun(int *g){ // CK1: define internal void @[[OUTL1]]({{.+}}) // CK1: [[ARRDECAY:%.+]] = getelementptr inbounds [1000 x i32], [1000 x i32]* %{{.+}}, i{{32|64}} 0, i{{32|64}} 0 - // CK1: call void @llvm.assume(i1 true) [ "align"(i32* [[ARRDECAY]], {{i64|i32}} 8) ] + // CK1: [[ARR_CAST:%.+]] = ptrtoint i32* [[ARRDECAY]] to i{{32|64}} + // CK1: [[MASKED_PTR:%.+]] = and i{{32|64}} [[ARR_CAST]], 7 + // CK1: [[COND:%.+]] = icmp eq i{{32|64}} [[MASKED_PTR]], 0 + // CK1: call void @llvm.assume(i1 [[COND]]) // CK1: call void @__kmpc_for_static_init_4( // CK1: call void {{.+}} @__kmpc_fork_call( // CK1: call void @__kmpc_for_static_fini( diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 4552ca016bd7..ffec4ff64ca6 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -782,11 +782,7 @@ class IRBuilderBase { /// Create an assume intrinsic call that allows the optimizer to /// assume that the provided condition will be true. - /// - /// The optional argument \p OpBundles specifies operand bundles that are - /// added to the call instruction. - CallInst *CreateAssumption(Value *Cond, - ArrayRef OpBundles = llvm::None); + CallInst *CreateAssumption(Value *Cond); /// Create a call to the experimental.gc.statepoint intrinsic to /// start a new statepoint sequence. @@ -2506,11 +2502,13 @@ class IRBuilderBase { private: /// Helper function that creates an assume intrinsic call that - /// represents an alignment assumption on the provided pointer \p PtrValue - /// with offset \p OffsetValue and alignment value \p AlignValue. + /// represents an alignment assumption on the provided Ptr, Mask, Type + /// and Offset. It may be sometimes useful to do some other logic + /// based on this alignment check, thus it can be stored into 'TheCheck'. 
CallInst *CreateAlignmentAssumptionHelper(const DataLayout &DL, - Value *PtrValue, Value *AlignValue, - Value *OffsetValue); + Value *PtrValue, Value *Mask, + Type *IntPtrTy, Value *OffsetValue, + Value **TheCheck); public: /// Create an assume intrinsic call that represents an alignment @@ -2519,9 +2517,13 @@ class IRBuilderBase { /// An optional offset can be provided, and if it is provided, the offset /// must be subtracted from the provided pointer to get the pointer with the /// specified alignment. + /// + /// It may be sometimes useful to do some other logic + /// based on this alignment check, thus it can be stored into 'TheCheck'. CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, - Value *OffsetValue = nullptr); + Value *OffsetValue = nullptr, + Value **TheCheck = nullptr); /// Create an assume intrinsic call that represents an alignment /// assumption on the provided pointer. @@ -2530,11 +2532,15 @@ class IRBuilderBase { /// must be subtracted from the provided pointer to get the pointer with the /// specified alignment. /// + /// It may be sometimes useful to do some other logic + /// based on this alignment check, thus it can be stored into 'TheCheck'. + /// /// This overload handles the condition where the Alignment is dependent /// on an existing value rather than a static value. 
CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, Value *Alignment, - Value *OffsetValue = nullptr); + Value *OffsetValue = nullptr, + Value **TheCheck = nullptr); }; /// This provides a uniform API for creating instructions and inserting diff --git a/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h b/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h index 10b6e1c6a21b..be119b8ab855 100644 --- a/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h +++ b/llvm/include/llvm/Transforms/Scalar/AlignmentFromAssumptions.h @@ -37,9 +37,9 @@ struct AlignmentFromAssumptionsPass ScalarEvolution *SE = nullptr; DominatorTree *DT = nullptr; - bool extractAlignmentInfo(CallInst *I, unsigned Idx, Value *&AAPtr, - const SCEV *&AlignSCEV, const SCEV *&OffSCEV); - bool processAssumption(CallInst *I, unsigned Idx); + bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV, + const SCEV *&OffSCEV); + bool processAssumption(CallInst *I); }; } diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp index 05fe05a0bd85..972d0d3ea7f2 100644 --- a/llvm/lib/Analysis/AssumeBundleQueries.cpp +++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp @@ -108,17 +108,10 @@ llvm::getKnowledgeFromBundle(CallInst &Assume, Result.AttrKind = Attribute::getAttrKindFromName(BOI.Tag->getKey()); if (bundleHasArgument(BOI, ABA_WasOn)) Result.WasOn = getValueFromBundleOpInfo(Assume, BOI, ABA_WasOn); - auto GetArgOr1 = [&](unsigned Idx) -> unsigned { - if (auto *ConstInt = dyn_cast( - getValueFromBundleOpInfo(Assume, BOI, ABA_Argument + Idx))) - return ConstInt->getZExtValue(); - return 1; - }; if (BOI.End - BOI.Begin > ABA_Argument) - Result.ArgValue = GetArgOr1(0); - if (Result.AttrKind == Attribute::Alignment) - if (BOI.End - BOI.Begin > ABA_Argument + 1) - Result.ArgValue = MinAlign(Result.ArgValue, GetArgOr1(1)); + Result.ArgValue = + cast(getValueFromBundleOpInfo(Assume, BOI, 
ABA_Argument)) + ->getZExtValue(); return Result; } diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index b87dfe1c8df6..1fffce015f70 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -71,9 +71,8 @@ Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) { static CallInst *createCallHelper(Function *Callee, ArrayRef Ops, IRBuilderBase *Builder, const Twine &Name = "", - Instruction *FMFSource = nullptr, - ArrayRef OpBundles = {}) { - CallInst *CI = Builder->CreateCall(Callee, Ops, OpBundles, Name); + Instruction *FMFSource = nullptr) { + CallInst *CI = Builder->CreateCall(Callee, Ops, Name); if (FMFSource) CI->copyFastMathFlags(FMFSource); return CI; @@ -450,16 +449,14 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { return createCallHelper(TheFn, Ops, this); } -CallInst * -IRBuilderBase::CreateAssumption(Value *Cond, - ArrayRef OpBundles) { +CallInst *IRBuilderBase::CreateAssumption(Value *Cond) { assert(Cond->getType() == getInt1Ty() && "an assumption condition must be of type i1"); Value *Ops[] = { Cond }; Module *M = BB->getParent()->getParent(); Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); - return createCallHelper(FnAssume, Ops, this, "", nullptr, OpBundles); + return createCallHelper(FnAssume, Ops, this); } /// Create a call to a Masked Load intrinsic. 
@@ -1110,37 +1107,63 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( return Fn; } -CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL, - Value *PtrValue, - Value *AlignValue, - Value *OffsetValue) { - SmallVector Vals({PtrValue, AlignValue}); - if (OffsetValue) - Vals.push_back(OffsetValue); - OperandBundleDefT AlignOpB("align", Vals); - return CreateAssumption(ConstantInt::getTrue(getContext()), {AlignOpB}); +CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper( + const DataLayout &DL, Value *PtrValue, Value *Mask, Type *IntPtrTy, + Value *OffsetValue, Value **TheCheck) { + Value *PtrIntValue = CreatePtrToInt(PtrValue, IntPtrTy, "ptrint"); + + if (OffsetValue) { + bool IsOffsetZero = false; + if (const auto *CI = dyn_cast(OffsetValue)) + IsOffsetZero = CI->isZero(); + + if (!IsOffsetZero) { + if (OffsetValue->getType() != IntPtrTy) + OffsetValue = CreateIntCast(OffsetValue, IntPtrTy, /*isSigned*/ true, + "offsetcast"); + PtrIntValue = CreateSub(PtrIntValue, OffsetValue, "offsetptr"); + } + } + + Value *Zero = ConstantInt::get(IntPtrTy, 0); + Value *MaskedPtr = CreateAnd(PtrIntValue, Mask, "maskedptr"); + Value *InvCond = CreateICmpEQ(MaskedPtr, Zero, "maskcond"); + if (TheCheck) + *TheCheck = InvCond; + + return CreateAssumption(InvCond); } -CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, - Value *PtrValue, - unsigned Alignment, - Value *OffsetValue) { +CallInst *IRBuilderBase::CreateAlignmentAssumption( + const DataLayout &DL, Value *PtrValue, unsigned Alignment, + Value *OffsetValue, Value **TheCheck) { assert(isa(PtrValue->getType()) && "trying to create an alignment assumption on a non-pointer?"); assert(Alignment != 0 && "Invalid Alignment"); auto *PtrTy = cast(PtrValue->getType()); Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace()); - Value *AlignValue = ConstantInt::get(IntPtrTy, Alignment); - return CreateAlignmentAssumptionHelper(DL, PtrValue, AlignValue, OffsetValue); + + Value 
*Mask = ConstantInt::get(IntPtrTy, Alignment - 1); + return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy, + OffsetValue, TheCheck); } -CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL, - Value *PtrValue, - Value *Alignment, - Value *OffsetValue) { +CallInst *IRBuilderBase::CreateAlignmentAssumption( + const DataLayout &DL, Value *PtrValue, Value *Alignment, + Value *OffsetValue, Value **TheCheck) { assert(isa(PtrValue->getType()) && "trying to create an alignment assumption on a non-pointer?"); - return CreateAlignmentAssumptionHelper(DL, PtrValue, Alignment, OffsetValue); + auto *PtrTy = cast(PtrValue->getType()); + Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace()); + + if (Alignment->getType() != IntPtrTy) + Alignment = CreateIntCast(Alignment, IntPtrTy, /*isSigned*/ false, + "alignmentcast"); + + Value *Mask = CreateSub(Alignment, ConstantInt::get(IntPtrTy, 1), "mask"); + + return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy, + OffsetValue, TheCheck); } IRBuilderDefaultInserter::~IRBuilderDefaultInserter() {} diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 6df1072925f9..c518ae87ea9b 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4449,32 +4449,21 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { Assert(Elem.Tag->getKey() == "ignore" || Attribute::isExistingAttribute(Elem.Tag->getKey()), "tags must be valid attribute names"); + Assert(Elem.End - Elem.Begin <= 2, "to many arguments"); Attribute::AttrKind Kind = Attribute::getAttrKindFromName(Elem.Tag->getKey()); - unsigned ArgCount = Elem.End - Elem.Begin; - if (Kind == Attribute::Alignment) { - Assert(ArgCount <= 3 && ArgCount >= 2, - "alignment assumptions should have 2 or 3 arguments"); - Assert(Call.getOperand(Elem.Begin)->getType()->isPointerTy(), - "first argument should be a pointer"); - Assert(Call.getOperand(Elem.Begin + 1)->getType()->isIntegerTy(), - "second argument 
should be an integer"); - if (ArgCount == 3) - Assert(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(), - "third argument should be an integer if present"); - return; - } - Assert(ArgCount <= 2, "to many arguments"); if (Kind == Attribute::None) break; if (Attribute::doesAttrKindHaveArgument(Kind)) { - Assert(ArgCount == 2, "this attribute should have 2 arguments"); + Assert(Elem.End - Elem.Begin == 2, + "this attribute should have 2 arguments"); Assert(isa(Call.getOperand(Elem.Begin + 1)), "the second argument should be a constant integral value"); } else if (isFuncOnlyAttr(Kind)) { - Assert((ArgCount) == 0, "this attribute has no argument"); + Assert((Elem.End - Elem.Begin) == 0, "this attribute has no argument"); } else if (!isFuncOrArgAttr(Kind)) { - Assert((ArgCount) == 1, "this attribute should have one argument"); + Assert((Elem.End - Elem.Begin) == 1, + "this attribute should have one argument"); } } break; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c734c9a68fb2..836af6234ad5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4220,16 +4220,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); - SmallVector OpBundles; - II->getOperandBundlesAsDefs(OpBundles); - bool HasOpBundles = !OpBundles.empty(); // Remove an assume if it is followed by an identical assume. // TODO: Do we need this? Unless there are conflicting assumptions, the // computeKnownBits(IIOperand) below here eliminates redundant assumes. 
Instruction *Next = II->getNextNonDebugInstruction(); - if (HasOpBundles && - match(Next, m_Intrinsic(m_Specific(IIOperand))) && - !cast(Next)->hasOperandBundles()) + if (match(Next, m_Intrinsic(m_Specific(IIOperand)))) return eraseInstFromFunction(CI); // Canonicalize assume(a && b) -> assume(a); assume(b); @@ -4239,15 +4234,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *AssumeIntrinsic = II->getCalledOperand(); Value *A, *B; if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) { - Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, OpBundles, - II->getName()); + Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName()); Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName()); return eraseInstFromFunction(*II); } // assume(!(a || b)) -> assume(!a); assume(!b); if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) { Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, - Builder.CreateNot(A), OpBundles, II->getName()); + Builder.CreateNot(A), II->getName()); Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, Builder.CreateNot(B), II->getName()); return eraseInstFromFunction(*II); @@ -4263,8 +4257,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { isValidAssumeForContext(II, LHS, &DT)) { MDNode *MD = MDNode::get(II->getContext(), None); LHS->setMetadata(LLVMContext::MD_nonnull, MD); - if (!HasOpBundles) - return eraseInstFromFunction(*II); + return eraseInstFromFunction(*II); // TODO: apply nonnull return attributes to calls and invokes // TODO: apply range metadata for range check patterns? 
diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index bccf94fc217f..5c008585869c 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME @@ -204,33 +203,103 @@ static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV, } bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I, - unsigned Idx, Value *&AAPtr, const SCEV *&AlignSCEV, const SCEV *&OffSCEV) { - Type *Int64Ty = Type::getInt64Ty(I->getContext()); - OperandBundleUse AlignOB = I->getOperandBundleAt(Idx); - if (AlignOB.getTagName() != "align") + // An alignment assume must be a statement about the least-significant + // bits of the pointer being zero, possibly with some offset. + ICmpInst *ICI = dyn_cast(I->getArgOperand(0)); + if (!ICI) return false; - assert(AlignOB.Inputs.size() >= 2); - AAPtr = AlignOB.Inputs[0].get(); - // TODO: Consider accumulating the offset to the base. - AAPtr = AAPtr->stripPointerCastsSameRepresentation(); - AlignSCEV = SE->getSCEV(AlignOB.Inputs[1].get()); - AlignSCEV = SE->getTruncateOrZeroExtend(AlignSCEV, Int64Ty); - if (AlignOB.Inputs.size() == 3) - OffSCEV = SE->getSCEV(AlignOB.Inputs[2].get()); - else + + // This must be an expression of the form: x & m == 0. + if (ICI->getPredicate() != ICmpInst::ICMP_EQ) + return false; + + // Swap things around so that the RHS is 0. 
+ Value *CmpLHS = ICI->getOperand(0); + Value *CmpRHS = ICI->getOperand(1); + const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS); + const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS); + if (CmpLHSSCEV->isZero()) + std::swap(CmpLHS, CmpRHS); + else if (!CmpRHSSCEV->isZero()) + return false; + + BinaryOperator *CmpBO = dyn_cast(CmpLHS); + if (!CmpBO || CmpBO->getOpcode() != Instruction::And) + return false; + + // Swap things around so that the right operand of the and is a constant + // (the mask); we cannot deal with variable masks. + Value *AndLHS = CmpBO->getOperand(0); + Value *AndRHS = CmpBO->getOperand(1); + const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS); + const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS); + if (isa(AndLHSSCEV)) { + std::swap(AndLHS, AndRHS); + std::swap(AndLHSSCEV, AndRHSSCEV); + } + + const SCEVConstant *MaskSCEV = dyn_cast(AndRHSSCEV); + if (!MaskSCEV) + return false; + + // The mask must have some trailing ones (otherwise the condition is + // trivial and tells us nothing about the alignment of the left operand). + unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes(); + if (!TrailingOnes) + return false; + + // Cap the alignment at the maximum with which LLVM can deal (and make sure + // we don't overflow the shift). + uint64_t Alignment; + TrailingOnes = std::min(TrailingOnes, + unsigned(sizeof(unsigned) * CHAR_BIT - 1)); + Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment); + + Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext()); + AlignSCEV = SE->getConstant(Int64Ty, Alignment); + + // The LHS might be a ptrtoint instruction, or it might be the pointer + // with an offset. 
+ AAPtr = nullptr; + OffSCEV = nullptr; + if (PtrToIntInst *PToI = dyn_cast(AndLHS)) { + AAPtr = PToI->getPointerOperand(); OffSCEV = SE->getZero(Int64Ty); - OffSCEV = SE->getTruncateOrZeroExtend(OffSCEV, Int64Ty); + } else if (const SCEVAddExpr* AndLHSAddSCEV = + dyn_cast(AndLHSSCEV)) { + // Try to find the ptrtoint; subtract it and the rest is the offset. + for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(), + JE = AndLHSAddSCEV->op_end(); J != JE; ++J) + if (const SCEVUnknown *OpUnk = dyn_cast(*J)) + if (PtrToIntInst *PToI = dyn_cast(OpUnk->getValue())) { + AAPtr = PToI->getPointerOperand(); + OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J); + break; + } + } + + if (!AAPtr) + return false; + + // Sign extend the offset to 64 bits (so that it is like all of the other + // expressions). + unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits(); + if (OffSCEVBits < 64) + OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty); + else if (OffSCEVBits > 64) + return false; + + AAPtr = AAPtr->stripPointerCasts(); return true; } -bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall, - unsigned Idx) { +bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { Value *AAPtr; const SCEV *AlignSCEV, *OffSCEV; - if (!extractAlignmentInfo(ACall, Idx, AAPtr, AlignSCEV, OffSCEV)) + if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV)) return false; // Skip ConstantPointerNull and UndefValue. 
Assumptions on these shouldn't @@ -248,14 +317,13 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall, continue; if (Instruction *K = dyn_cast(J)) + if (isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } while (!WorkList.empty()) { Instruction *J = WorkList.pop_back_val(); if (LoadInst *LI = dyn_cast(J)) { - if (!isValidAssumeForContext(ACall, J, DT)) - continue; Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, LI->getPointerOperand(), SE); if (NewAlignment > LI->getAlign()) { @@ -263,8 +331,6 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall, ++NumLoadAlignChanged; } } else if (StoreInst *SI = dyn_cast(J)) { - if (!isValidAssumeForContext(ACall, J, DT)) - continue; Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlign()) { @@ -272,8 +338,6 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall, ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast(J)) { - if (!isValidAssumeForContext(ACall, J, DT)) - continue; Align NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE); @@ -305,7 +369,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall, Visited.insert(J); for (User *UJ : J->users()) { Instruction *K = cast(UJ); - if (!Visited.count(K)) + if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT)) WorkList.push_back(K); } } @@ -332,11 +396,8 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC, bool Changed = false; for (auto &AssumeVH : AC.assumptions()) - if (AssumeVH) { - CallInst *Call = cast(AssumeVH); - for (unsigned Idx = 0; Idx < Call->getNumOperandBundles(); Idx++) - Changed |= processAssumption(Call, Idx); - } + if (AssumeVH) + Changed |= processAssumption(cast(AssumeVH)); return Changed; } diff --git a/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll 
b/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll index 610fd448c3b9..14e764f042c7 100644 --- a/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll +++ b/llvm/test/Transforms/AlignmentFromAssumptions/simple.ll @@ -4,7 +4,10 @@ target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" define i32 @foo(i32* nocapture %a) nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -15,7 +18,11 @@ entry: define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32, i32 24)] + %ptrint = ptrtoint i32* %a to i64 + %offsetptr = add i64 %ptrint, 24 + %maskedptr = and i64 %offsetptr, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %arrayidx = getelementptr inbounds i32, i32* %a, i64 2 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -27,7 +34,11 @@ entry: define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32, i32 28)] + %ptrint = ptrtoint i32* %a to i64 + %offsetptr = add i64 %ptrint, 28 + %maskedptr = and i64 %offsetptr, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %arrayidx = getelementptr inbounds i32, i32* %a, i64 -1 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -39,7 +50,10 @@ entry: define i32 @goo(i32* nocapture %a) nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i32 32, i32 0)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -50,7 +64,10 @@ entry: define i32 @hoo(i32* nocapture %a) 
nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32, i32 0)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -81,7 +98,10 @@ for.end: ; preds = %for.body ; load(a, i0+i1+i2+32) define void @hoo2(i32* nocapture %a, i64 %id, i64 %num) nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i8 32, i64 0)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %id.mul = shl nsw i64 %id, 6 %num.mul = shl nsw i64 %num, 6 br label %for0.body @@ -127,7 +147,10 @@ return: define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i8 32, i8 0)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -152,13 +175,16 @@ for.end: ; preds = %for.body define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { entry: + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i8 32, i8 0)] %0 = load i32, i32* %arrayidx, align 4 %add = add nsw i32 %0, %r.06 %indvars.iv.next = add i64 %indvars.iv, 4 @@ -177,7 +203,10 @@ for.end: ; preds = %for.body define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { entry: - tail 
call void @llvm.assume(i1 true) ["align"(i32* %a, i128 32, i128 0)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -202,7 +231,10 @@ for.end: ; preds = %for.body define i32 @moo(i32* nocapture %a) nounwind uwtable { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %a, i16 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %0 = bitcast i32* %a to i8* tail call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 0, i64 64, i1 false) ret i32 undef @@ -214,9 +246,15 @@ entry: define i32 @moo2(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { entry: - tail call void @llvm.assume(i1 true) ["align"(i32* %b, i32 128)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + %ptrint1 = ptrtoint i32* %b to i64 + %maskedptr3 = and i64 %ptrint1, 127 + %maskcond4 = icmp eq i64 %maskedptr3, 0 + tail call void @llvm.assume(i1 %maskcond4) %0 = bitcast i32* %a to i8* - tail call void @llvm.assume(i1 true) ["align"(i8* %0, i16 32)] %1 = bitcast i32* %b to i8* tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 64, i1 false) ret i32 undef @@ -226,19 +264,6 @@ entry: ; CHECK: ret i32 undef } -define i32 @moo3(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { -entry: - %0 = bitcast i32* %a to i8* - tail call void @llvm.assume(i1 true) ["align"(i8* %0, i16 32), "align"(i32* %b, i32 128)] - %1 = bitcast i32* %b to i8* - tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 64, i1 false) - ret i32 undef - -; CHECK-LABEL: @moo3 -; CHECK: @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 %0, i8* align 128 %1, i64 64, i1 false) -; CHECK: ret i32 undef -} - declare 
void @llvm.assume(i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind diff --git a/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll b/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll index 453899c15c4f..3f0819e3641b 100644 --- a/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll +++ b/llvm/test/Transforms/AlignmentFromAssumptions/simple32.ll @@ -7,12 +7,18 @@ define i32 @foo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@foo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -22,13 +28,21 @@ define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@foo2 ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32, i64 24) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[OFFSETPTR:%.*]] = add i64 [[PTRINT]], 24 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 16 ; CHECK-NEXT: ret 
i32 [[TMP0]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32, i64 24)] + %ptrint = ptrtoint i32* %a to i64 + %offsetptr = add i64 %ptrint, 24 + %maskedptr = and i64 %offsetptr, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %arrayidx = getelementptr inbounds i32, i32* %a, i64 2 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -39,13 +53,21 @@ define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@foo2a ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32, i64 28) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[OFFSETPTR:%.*]] = add i64 [[PTRINT]], 28 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[OFFSETPTR]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32, i64 28)] + %ptrint = ptrtoint i32* %a to i64 + %offsetptr = add i64 %ptrint, 28 + %maskedptr = and i64 %offsetptr, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %arrayidx = getelementptr inbounds i32, i32* %a, i64 -1 %0 = load i32, i32* %arrayidx, align 4 ret i32 %0 @@ -56,12 +78,18 @@ define i32 @goo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@goo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void 
@llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %0 = load i32, i32* %a, align 4 ret i32 %0 @@ -71,7 +99,10 @@ define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@hoo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -88,7 +119,10 @@ define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -112,7 +146,10 @@ define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@joo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -129,7 +166,10 @@ define i32 @joo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -153,7 +193,10 @@ define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@koo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -170,7 +213,10 @@ define i32 @koo(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -194,7 +240,10 @@ define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-LABEL: define {{[^@]+}}@koo2 ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 
+; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ -4, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -211,7 +260,10 @@ define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly { ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) br label %for.body for.body: ; preds = %entry, %for.body @@ -235,13 +287,19 @@ define i32 @moo(i32* nocapture %a) nounwind uwtable { ; CHECK-LABEL: define {{[^@]+}}@moo ; CHECK-SAME: (i32* nocapture [[A:%.*]]) #1 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[A]], i64 32) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* align 32 [[TMP0]], i8 0, i64 64, i1 false) ; CHECK-NEXT: ret i32 undef ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) %0 = bitcast i32* %a to i8* tail call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 0, i64 64, i1 false) ret i32 undef @@ -252,16 +310,28 @@ define i32 @moo2(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { ; CHECK-LABEL: define {{[^@]+}}@moo2 ; CHECK-SAME: (i32* nocapture [[A:%.*]], i32* nocapture [[B:%.*]]) #1 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* 
[[A]], i64 32) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(i32* [[B]], i64 128) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint i32* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: [[PTRINT1:%.*]] = ptrtoint i32* [[B]] to i64 +; CHECK-NEXT: [[MASKEDPTR3:%.*]] = and i64 [[PTRINT1]], 127 +; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]]) ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B]] to i8* ; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 32 [[TMP0]], i8* align 128 [[TMP1]], i64 64, i1 false) ; CHECK-NEXT: ret i32 undef ; entry: - call void @llvm.assume(i1 true) ["align"(i32* %a, i64 32)] - call void @llvm.assume(i1 true) ["align"(i32* %b, i64 128)] + %ptrint = ptrtoint i32* %a to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + tail call void @llvm.assume(i1 %maskcond) + %ptrint1 = ptrtoint i32* %b to i64 + %maskedptr3 = and i64 %ptrint1, 127 + %maskcond4 = icmp eq i64 %maskedptr3, 0 + tail call void @llvm.assume(i1 %maskcond4) %0 = bitcast i32* %a to i8* %1 = bitcast i32* %b to i8* tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 64, i1 false) diff --git a/llvm/test/Transforms/Inline/align.ll b/llvm/test/Transforms/Inline/align.ll index f3a518456485..ede6c3fa7bcf 100644 --- a/llvm/test/Transforms/Inline/align.ll +++ b/llvm/test/Transforms/Inline/align.ll @@ -23,7 +23,10 @@ define void @foo(float* nocapture %a, float* nocapture readonly %c) #0 { ; CHECK-LABEL: define {{[^@]+}}@foo ; CHECK-SAME: (float* nocapture [[A:%.*]], float* nocapture readonly [[C:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[A]], i64 128) ] +; CHECK-NEXT: [[PTRINT:%.*]] 
= ptrtoint float* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 127 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 ; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I]], align 4 @@ -84,8 +87,14 @@ define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture rea ; CHECK-LABEL: define {{[^@]+}}@foo2 ; CHECK-SAME: (float* nocapture [[A:%.*]], float* nocapture [[B:%.*]], float* nocapture readonly [[C:%.*]]) #0 ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[A]], i64 128) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(float* [[B]], i64 128) ] +; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[A]] to i64 +; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 127 +; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) +; CHECK-NEXT: [[PTRINT1:%.*]] = ptrtoint float* [[B]] to i64 +; CHECK-NEXT: [[MASKEDPTR2:%.*]] = and i64 [[PTRINT1]], 127 +; CHECK-NEXT: [[MASKCOND3:%.*]] = icmp eq i64 [[MASKEDPTR2]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND3]]) ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 ; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I]], align 4 diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index b372f52a2cdf..6f33e83ee336 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -377,7 +377,6 @@ define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) { define void @debug_interference(i8 %x) { ; CHECK-LABEL: @debug_interference( ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i8 
[[X:%.*]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 false) ; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 5, metadata !7, metadata !DIExpression()), !dbg !9 ; CHECK-NEXT: tail call void @llvm.assume(i1 false) ; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 5, metadata !7, metadata !DIExpression()), !dbg !9 diff --git a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll index 2605701d231d..61287e35005f 100644 --- a/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll +++ b/llvm/test/Transforms/PhaseOrdering/inlining-alignment-assumptions.ll @@ -41,7 +41,10 @@ define void @caller1(i1 %c, i64* align 1 %ptr) { ; ASSUMPTIONS-ON-NEXT: br i1 [[C:%.*]], label [[TRUE2_CRITEDGE:%.*]], label [[FALSE1:%.*]] ; ASSUMPTIONS-ON: false1: ; ASSUMPTIONS-ON-NEXT: store volatile i64 1, i64* [[PTR:%.*]], align 8 -; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 true) [ "align"(i64* [[PTR]], i64 8) ] +; ASSUMPTIONS-ON-NEXT: [[PTRINT:%.*]] = ptrtoint i64* [[PTR]] to i64 +; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7 +; ASSUMPTIONS-ON-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; ASSUMPTIONS-ON-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]]) ; ASSUMPTIONS-ON-NEXT: store volatile i64 0, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 @@ -51,7 +54,10 @@ define void @caller1(i1 %c, i64* align 1 %ptr) { ; ASSUMPTIONS-ON-NEXT: store volatile i64 3, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: ret void ; ASSUMPTIONS-ON: true2.critedge: -; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 true) [ "align"(i64* [[PTR]], i64 8) ] +; ASSUMPTIONS-ON-NEXT: [[PTRINT_C:%.*]] = ptrtoint i64* [[PTR]] to i64 +; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR_C:%.*]] = and i64 [[PTRINT_C]], 7 +; ASSUMPTIONS-ON-NEXT: [[MASKCOND_C:%.*]] = icmp eq i64 
[[MASKEDPTR_C]], 0 +; ASSUMPTIONS-ON-NEXT: tail call void @llvm.assume(i1 [[MASKCOND_C]]) ; ASSUMPTIONS-ON-NEXT: store volatile i64 0, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 ; ASSUMPTIONS-ON-NEXT: store volatile i64 -1, i64* [[PTR]], align 8 @@ -88,17 +94,26 @@ false2: ; This test checks that alignment assumptions do not prevent SROA. ; See PR45763. -define internal void @callee2(i64* noalias sret align 32 %arg) { +define internal void @callee2(i64* noalias sret align 8 %arg) { store i64 0, i64* %arg, align 8 ret void } define amdgpu_kernel void @caller2() { -; CHECK-LABEL: @caller2( -; CHECK-NEXT: ret void +; ASSUMPTIONS-OFF-LABEL: @caller2( +; ASSUMPTIONS-OFF-NEXT: ret void +; +; ASSUMPTIONS-ON-LABEL: @caller2( +; ASSUMPTIONS-ON-NEXT: [[ALLOCA:%.*]] = alloca i64, align 8, addrspace(5) +; ASSUMPTIONS-ON-NEXT: [[CAST:%.*]] = addrspacecast i64 addrspace(5)* [[ALLOCA]] to i64* +; ASSUMPTIONS-ON-NEXT: [[PTRINT:%.*]] = ptrtoint i64* [[CAST]] to i64 +; ASSUMPTIONS-ON-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7 +; ASSUMPTIONS-ON-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0 +; ASSUMPTIONS-ON-NEXT: call void @llvm.assume(i1 [[MASKCOND]]) +; ASSUMPTIONS-ON-NEXT: ret void ; %alloca = alloca i64, align 8, addrspace(5) %cast = addrspacecast i64 addrspace(5)* %alloca to i64* - call void @callee2(i64* sret align 32 %cast) + call void @callee2(i64* sret align 8 %cast) ret void } diff --git a/llvm/test/Verifier/assume-bundles.ll b/llvm/test/Verifier/assume-bundles.ll index 6e260f25129e..302421715c79 100644 --- a/llvm/test/Verifier/assume-bundles.ll +++ b/llvm/test/Verifier/assume-bundles.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: not opt -verify < %s 2>&1 | FileCheck %s declare void @llvm.assume(i1) @@ -7,21 +6,14 @@ define void @func(i32* %P, i32 %P1, i32* %P2, i32* %P3) { ; CHECK: tags must be valid attribute names call void @llvm.assume(i1 true) 
["adazdazd"()] ; CHECK: the second argument should be a constant integral value - call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P, i32 %P1)] + call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1)] ; CHECK: to many arguments - call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P, i32 8, i32 8)] + call void @llvm.assume(i1 true) ["align"(i32* %P, i32 8, i32 8)] ; CHECK: this attribute should have 2 arguments - call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P)] + call void @llvm.assume(i1 true) ["align"(i32* %P)] ; CHECK: this attribute has no argument - call void @llvm.assume(i1 true) ["dereferenceable"(i32* %P, i32 4), "cold"(i32* %P)] + call void @llvm.assume(i1 true) ["align"(i32* %P, i32 4), "cold"(i32* %P)] ; CHECK: this attribute should have one argument call void @llvm.assume(i1 true) ["noalias"()] - call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1, i32 4)] -; CHECK: alignment assumptions should have 2 or 3 arguments - call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1, i32 4, i32 4)] -; CHECK: second argument should be an integer - call void @llvm.assume(i1 true) ["align"(i32* %P, i32* %P2)] -; CHECK: third argument should be an integer if present - call void @llvm.assume(i1 true) ["align"(i32* %P, i32 %P1, i32* %P2)] ret void } diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index 946368e1cb94..d35a77fa379b 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -546,41 +546,3 @@ TEST(AssumeQueryAPI, AssumptionCache) { ASSERT_EQ(AR[0].Index, 1u); ASSERT_EQ(AR[0].Assume, &*First); } - -TEST(AssumeQueryAPI, Alignment) { - LLVMContext C; - SMDiagnostic Err; - std::unique_ptr Mod = parseAssemblyString( - "declare void @llvm.assume(i1)\n" - "define void @test(i32* %P, i32* %P1, i32* %P2, i32 %I3, i1 %B) {\n" - "call void @llvm.assume(i1 true) [\"align\"(i32* %P, i32 8, i32 
%I3)]\n" - "call void @llvm.assume(i1 true) [\"align\"(i32* %P1, i32 %I3, i32 " - "%I3)]\n" - "call void @llvm.assume(i1 true) [\"align\"(i32* %P2, i32 16, i32 8)]\n" - "ret void\n}\n", - Err, C); - if (!Mod) - Err.print("AssumeQueryAPI", errs()); - - Function *F = Mod->getFunction("test"); - BasicBlock::iterator Start = F->begin()->begin(); - IntrinsicInst *II; - RetainedKnowledge RK; - II = cast(&*Start); - RK = getKnowledgeFromBundle(*II, II->bundle_op_info_begin()[0]); - ASSERT_EQ(RK.AttrKind, Attribute::Alignment); - ASSERT_EQ(RK.WasOn, F->getArg(0)); - ASSERT_EQ(RK.ArgValue, 1u); - Start++; - II = cast(&*Start); - RK = getKnowledgeFromBundle(*II, II->bundle_op_info_begin()[0]); - ASSERT_EQ(RK.AttrKind, Attribute::Alignment); - ASSERT_EQ(RK.WasOn, F->getArg(1)); - ASSERT_EQ(RK.ArgValue, 1u); - Start++; - II = cast(&*Start); - RK = getKnowledgeFromBundle(*II, II->bundle_op_info_begin()[0]); - ASSERT_EQ(RK.AttrKind, Attribute::Alignment); - ASSERT_EQ(RK.WasOn, F->getArg(2)); - ASSERT_EQ(RK.ArgValue, 8u); -} From f7587ec858da3a2f69f2b744f47bbcaddb103b16 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Thu, 16 Jul 2020 11:27:31 +0200 Subject: [PATCH 012/363] [clangd] Always retrieve ProjectInfo from Base in OverlayCDB Summary: Clangd is returning current working directory for overriden commands. This can cause inconsistencies between: - header and the main files, as OverlayCDB only contains entries for the main files it direct any queries for the headers to the base, creating a discrepancy between the two. - different clangd instances, as the results will be different depending on the timing of execution of the query and override of the command. hence clangd might see two different project infos for the same file between different invocations. 
- editors and the way user has invoked it, as current working directory of clangd will depend on those, hence even when there's no underlying base CWD might change depending on the editor, or the directory user has started the editor in. This patch gets rid of that discrepency by always directing queries to base or returning llvm::None in absence of it. For a sample bug see https://reviews.llvm.org/D83099#2154185. Reviewers: sammccall Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D83934 (cherry picked from commit 46c921003c2ce5f1cdc4de9ef613eb001980780c) --- .../clangd/GlobalCompilationDatabase.cpp | 10 +++------- .../clangd/GlobalCompilationDatabase.h | 3 ++- .../unittests/GlobalCompilationDatabaseTests.cpp | 15 ++++++++++++++- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp index 5e75864ec8d4..23e8c9fe716d 100644 --- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp +++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp @@ -298,15 +298,11 @@ void OverlayCDB::setCompileCommand( } llvm::Optional OverlayCDB::getProjectInfo(PathRef File) const { - { - std::lock_guard Lock(Mutex); - auto It = Commands.find(removeDots(File)); - if (It != Commands.end()) - return ProjectInfo{}; - } + // It wouldn't make much sense to treat files with overridden commands + // specially when we can't do the same for the (unknown) local headers they + // include or changing behavior mid-air after receiving an override. 
if (Base) return Base->getProjectInfo(File); - return llvm::None; } } // namespace clangd diff --git a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h index e9a5417d9d69..95677f9f8c19 100644 --- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h +++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h @@ -119,7 +119,6 @@ std::unique_ptr getQueryDriverDatabase(llvm::ArrayRef QueryDriverGlobs, std::unique_ptr Base); - /// Wraps another compilation database, and supports overriding the commands /// using an in-memory mapping. class OverlayCDB : public GlobalCompilationDatabase { @@ -134,6 +133,8 @@ class OverlayCDB : public GlobalCompilationDatabase { llvm::Optional getCompileCommand(PathRef File) const override; tooling::CompileCommand getFallbackCommand(PathRef File) const override; + /// Project info is gathered purely from the inner compilation database to + /// ensure consistency. llvm::Optional getProjectInfo(PathRef File) const override; /// Sets or clears the compilation command for a particular file. 
diff --git a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp index e68b8d727172..ef9a299483f6 100644 --- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp +++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp @@ -313,9 +313,22 @@ TEST(GlobalCompilationDatabaseTest, NonCanonicalFilenames) { llvm::sys::path::append(File, "blabla", "..", "a.cc"); EXPECT_TRUE(DB.getCompileCommand(File)); - EXPECT_TRUE(DB.getProjectInfo(File)); + EXPECT_FALSE(DB.getProjectInfo(File)); } +TEST_F(OverlayCDBTest, GetProjectInfo) { + OverlayCDB DB(Base.get()); + Path File = testPath("foo.cc"); + Path Header = testPath("foo.h"); + + EXPECT_EQ(DB.getProjectInfo(File)->SourceRoot, testRoot()); + EXPECT_EQ(DB.getProjectInfo(Header)->SourceRoot, testRoot()); + + // Shouldn't change after an override. + DB.setCompileCommand(File, tooling::CompileCommand()); + EXPECT_EQ(DB.getProjectInfo(File)->SourceRoot, testRoot()); + EXPECT_EQ(DB.getProjectInfo(Header)->SourceRoot, testRoot()); +} } // namespace } // namespace clangd } // namespace clang From f2eef8ccade595a97322a245484acf95de18be52 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 16 Jul 2020 10:33:20 -0700 Subject: [PATCH 013/363] [X86] Add test case for PR46455. 
(cherry picked from commit 9adf7461f721170419058684a8d3f9228d641d59) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ llvm/test/CodeGen/X86/pr46455.ll | 38 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr46455.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 450927aaf5cc..8dfe7396699f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44523,6 +44523,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll new file mode 100644 index 000000000000..7f608fbfdf6d --- /dev/null +++ b/llvm/test/CodeGen/X86/pr46455.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512vl,avx512bw,avx512dq | FileCheck %s + +define void @EntryModule(i8** %buffer_table) { +; CHECK-LABEL: EntryModule: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq 24(%rdi), %rcx +; CHECK-NEXT: vcmpneqps (%rax), %ymm0, %ymm0 +; CHECK-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] +; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, (%rcx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %i = bitcast i8** %buffer_table to <8 x float>** + %i1 = load <8 x float>*, <8 x float>** %i, align 8 + %i6 = load <8 x float>, <8 x float>* %i1, align 16 + %i7 = fcmp une <8 x float> %i6, 
zeroinitializer + %i8 = zext <8 x i1> %i7 to <8 x i32> + %i18 = getelementptr inbounds i8*, i8** %buffer_table, i64 3 + %i19 = load i8*, i8** %i18, align 8 + %shift = shufflevector <8 x i32> %i8, <8 x i32> undef, <8 x i32> + %i20 = add nuw nsw <8 x i32> %shift, %i8 + %shift13 = shufflevector <8 x i32> %i8, <8 x i32> undef, <8 x i32> + %i21 = add nuw nsw <8 x i32> %i20, %shift13 + %shift14 = shufflevector <8 x i32> %i8, <8 x i32> undef, <8 x i32> + %i22 = add nuw nsw <8 x i32> %i21, %shift14 + %i23 = extractelement <8 x i32> %i22, i32 0 + %i24 = bitcast i8* %i19 to i32* + store i32 %i23, i32* %i24, align 8 + ret void +} From fc2d3ffeb890d6e26d55c50b4a343d1e1e4896d5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 16 Jul 2020 12:52:02 -0700 Subject: [PATCH 014/363] [X86] Move integer hadd/hsub formation into a helper function shared by combineAdd and combineSub. There was a lot of duplicate code here for checking the VT and subtarget. Moving it into a helper avoids that. It also fixes a bug that combineAdd reused Op0/Op1 after a call to isHorizontalBinOp may have changed it. The new helper function has its own local version of Op0/Op1 that aren't shared by other code. Fixes PR46455. 
Reviewed By: spatel, bkramer Differential Revision: https://reviews.llvm.org/D83971 (cherry picked from commit 5408024fa87e0b23b169fec07913bd4357acdbc4) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 51 ++++++++++++++----------- llvm/test/CodeGen/X86/pr46455.ll | 13 ++++--- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8dfe7396699f..ea4b4734225d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -47606,6 +47606,30 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, PMADDBuilder); } +static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + bool IsAdd = N->getOpcode() == ISD::ADD; + assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); + + if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || + VT == MVT::v8i32) && + Subtarget.hasSSSE3() && + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { + auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef Ops) { + return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, + DL, Ops[0].getValueType(), Ops); + }; + return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, + HOpBuilder); + } + + return SDValue(); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -47619,17 +47643,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, return MAdd; // Try to synthesize horizontal adds from adds of shuffles. 
- if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || - VT == MVT::v8i32) && - Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) { - auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); - }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HADDBuilder); - } + if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget)) + return V; // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into // (sub Y, (sext (vXi1 X))). @@ -47802,18 +47817,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, } // Try to synthesize horizontal subs from subs of shuffles. - EVT VT = N->getValueType(0); - if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || - VT == MVT::v8i32) && - Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) { - auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef Ops) { - return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); - }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HSUBBuilder); - } + if (SDValue V = combineAddOrSubToHADDorHSUB(N, DAG, Subtarget)) + return V; // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll index 7f608fbfdf6d..e5ed94aa5493 100644 --- a/llvm/test/CodeGen/X86/pr46455.ll +++ b/llvm/test/CodeGen/X86/pr46455.ll @@ -8,12 +8,13 @@ define void @EntryModule(i8** %buffer_table) { ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: movq 24(%rdi), %rcx ; CHECK-NEXT: vcmpneqps (%rax), %ymm0, %ymm0 -; CHECK-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,0,1] -; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: 
vpaddd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpsrld $31, %xmm0, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, (%rcx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq From 884357e301a9318003a9c229ae73aa06caea0592 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 16 Jul 2020 17:28:17 +0200 Subject: [PATCH 015/363] [lldb/DWARF] Don't get confused by line sequences with tombstone values Summary: With D81784, lld has started debug info resolving relocations to garbage-collected symbols as -1 (instead of relocation addend). For an unaware consumer this generated sequences which seemingly wrap the address space -- their first entry was 0xfffff, but all other entries were low numbers. Lldb stores line sequences concatenated into one large vector, sorted by the first entry, and searched with std::lower_bound. This resulted in the low-value entries being placed at the end of the vector, which utterly confused the lower_bound algorithm, and caused it to not find a match. (Previously, these sequences would be at the start of the vector, and normally would contain addresses that are far smaller than any real address we want to look up, so std::lower_bound was fine.) This patch makes lldb ignore these kinds of sequences completely. It does that by changing the construction algorithm from iterating over the rows (as parsed by llvm), to iterating over the sequences. This is important because the llvm parsed performs validity checks when constructing the sequence array, whereas the row array contains raw data. 
Reviewers: JDevlieghere, MaskRay Differential Revision: https://reviews.llvm.org/D83957 (cherry picked from commit f3fab392f57421a5bdabfb7e40820257d8f637b2) --- .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 22 ++-- .../SymbolFile/DWARF/debug_line-tombstone.s | 106 ++++++++++++++++++ 2 files changed, 118 insertions(+), 10 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/debug_line-tombstone.s diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 9f64e5255fd5..0b7e31ae2d1d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -1036,18 +1036,20 @@ bool SymbolFileDWARF::ParseLineTable(CompileUnit &comp_unit) { // FIXME: Rather than parsing the whole line table and then copying it over // into LLDB, we should explore using a callback to populate the line table // while we parse to reduce memory usage. - std::unique_ptr sequence = - LineTable::CreateLineSequenceContainer(); std::vector> sequences; - for (auto &row : line_table->Rows) { - LineTable::AppendLineEntryToSequence( - sequence.get(), row.Address.Address, row.Line, row.Column, row.File, - row.IsStmt, row.BasicBlock, row.PrologueEnd, row.EpilogueBegin, - row.EndSequence); - if (row.EndSequence) { - sequences.push_back(std::move(sequence)); - sequence = LineTable::CreateLineSequenceContainer(); + // The Sequences view contains only valid line sequences. Don't iterate over + // the Rows directly. 
+ for (const llvm::DWARFDebugLine::Sequence &seq : line_table->Sequences) { + std::unique_ptr sequence = + LineTable::CreateLineSequenceContainer(); + for (unsigned idx = seq.FirstRowIndex; idx < seq.LastRowIndex; ++idx) { + const llvm::DWARFDebugLine::Row &row = line_table->Rows[idx]; + LineTable::AppendLineEntryToSequence( + sequence.get(), row.Address.Address, row.Line, row.Column, row.File, + row.IsStmt, row.BasicBlock, row.PrologueEnd, row.EpilogueBegin, + row.EndSequence); } + sequences.push_back(std::move(sequence)); } std::unique_ptr line_table_up = diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug_line-tombstone.s b/lldb/test/Shell/SymbolFile/DWARF/debug_line-tombstone.s new file mode 100644 index 000000000000..53600ac5f4b1 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/debug_line-tombstone.s @@ -0,0 +1,106 @@ +# This test that we don't get confused by line tables containing a tombstone +# (-1) value, as produced by recent lld's. Line sequences with the tombstone +# value should be completely ignored. The tombstone sequence is deliberately +# longer so that any attempt at an address binary search will likely land inside +# the sequence. 
+ +# RUN: llvm-mc --filetype=obj --triple=x86_64-pc-linux %s -o %t +# RUN: %lldb -o "image lookup -n main -v" -o "image dump line-table main.cpp" \ +# RUN: -o exit %t | FileCheck %s + +# CHECK-LABEL: image lookup -n main -v +# CHECK: LineEntry: [0x0000000000001000-0x0000000000001001): main.cpp:1 +# CHECK-LABEL: image dump line-table main.cpp +# CHECK-NEXT: Line table for main.cpp +# CHECK-NEXT: 0x0000000000001000: main.cpp:1 +# CHECK-NEXT: 0x0000000000001001: main.cpp:1 +# CHECK-EMPTY: +# CHECK-NEXT: exit + + .text +.space 0x1000 +main: + nop +.Lmain_end: + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0xc4 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .asciz "main.cpp" # DW_AT_name + .long 0 # DW_AT_stmt_list + .quad main-.text # DW_AT_low_pc + .long .Lmain_end-main # DW_AT_high_pc +.Ldebug_info_end0: + +.section .debug_line,"",@progbits + .long .Llt1_end - .Llt1_start # Length of Unit (DWARF-32 format) +.Llt1_start: + .short 4 # DWARF version number + .long .Lprologue1_end-.Lprologue1_start # Length of Prologue +.Lprologue1_start: + .byte 1 # Minimum Instruction Length + .byte 1 # Maximum Operations per Instruction + .byte 1 # Default is_stmt + .byte -5 # Line Base + .byte 14 # Line Range + .byte 13 # Opcode Base + .byte 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 # Standard Opcode Lengths + .byte 0 + .asciz "main.cpp" # File table + .byte 0, 0, 0 + .byte 0 +.Lprologue1_end: + .byte 0, 9, 2 # DW_LNE_set_address + .quad -1 + .byte 1 # DW_LNS_copy + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 33 # address += 1, line += 1 + .byte 2 # DW_LNS_advance_pc + .uleb128 1 + .byte 0, 1, 1 # DW_LNE_end_sequence + + .byte 0, 9, 2 # DW_LNE_set_address + .quad main-.text + .byte 18 # address += 0, line += 0 + .byte 2 # DW_LNS_advance_pc + .uleb128 1 + .byte 0, 1, 1 # DW_LNE_end_sequence +.Llt1_end: + From b35b720f5c95a0e8725a0a62337f87a452a93fe3 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 17 Jul 2020 01:45:14 -0700 Subject: [PATCH 016/363] [msan] Fix strxfrm test Revert D83719 and explicitly set locate to "C". 
(cherry picked from commit 650baf22e69fff99bbfbea65edcd8e202b05fdff) --- compiler-rt/test/msan/strxfrm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/msan/strxfrm.cpp b/compiler-rt/test/msan/strxfrm.cpp index d40b56b234cd..b4fee6f55c4c 100644 --- a/compiler-rt/test/msan/strxfrm.cpp +++ b/compiler-rt/test/msan/strxfrm.cpp @@ -7,12 +7,12 @@ #include int main(void) { - char q[30]; + char q[10]; size_t n = strxfrm(q, "abcdef", sizeof(q)); assert(n < sizeof(q)); __msan_check_mem_is_initialized(q, n + 1); - locale_t loc = newlocale(LC_ALL_MASK, "", (locale_t)0); + locale_t loc = newlocale(LC_ALL_MASK, "C", (locale_t)0); __msan_poison(&q, sizeof(q)); n = strxfrm_l(q, "qwerty", sizeof(q), loc); From 1dc182b1e6d1b53286d051ea204505c9c4a5003c Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 17 Jul 2020 13:22:17 +0200 Subject: [PATCH 017/363] Add -flang flag to the test-release.sh script The flag is off by default. (cherry picked from commit 033ef8420cec57187fffac1f06322f73aa945c4c) --- llvm/utils/release/test-release.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh index 5a8e5c244ecf..5351ae1064f9 100755 --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -41,6 +41,7 @@ do_lld="yes" do_lldb="no" do_polly="yes" do_mlir="yes" +do_flang="no" BuildDir="`pwd`" ExtraConfigureFlags="" ExportBranch="" @@ -172,6 +173,9 @@ while [ $# -gt 0 ]; do -no-mlir ) do_mlir="no" ;; + -flang ) + do_flang="yes" + ;; -help | --help | -h | --h | -\? 
) usage exit 0 @@ -261,6 +265,9 @@ fi if [ $do_mlir = "yes" ]; then projects="$projects mlir" fi +if [ $do_flang = "yes" ]; then + projects="$projects flang" +fi # Go to the build directory (may be different from CWD) BuildDir=$BuildDir/$RC From 297be788a797c0ab98d9677f50e3dc57faab363b Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Fri, 17 Jul 2020 15:48:27 +0000 Subject: [PATCH 018/363] [docs] Add Deprecated section to ReleaseNotes This is brought up in https://reviews.llvm.org/D83915. We would like to remove some feature in PowerPC. We did send RFC before, but we think it might be a better idea that we indicate planned removal in the Release Notes for version 11 and actual removal in those for version 12.. Reviewed By: hubert.reinterpretcast Differential Revision: https://reviews.llvm.org/D83968 --- llvm/docs/ReleaseNotes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 2f93afd8374c..ed1718a95054 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -31,6 +31,10 @@ LLVM web page, this document applies to the *next* release, not the current one. To see the release notes for a specific release, please see the `releases page `_. +Deprecated and Removed Features/APIs +================================================= +* BG/Q support, including QPX, will be removed in the 12.0.0 release. + Non-comprehensive list of changes in this release ================================================= .. NOTE From 8a438096ffa48dadeb73b78844c53a7428aaec20 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Thu, 16 Jul 2020 10:01:52 -0400 Subject: [PATCH 019/363] Remove TwoAddressInstructionPass::sink3AddrInstruction. This function has a bug which will incorrectly reschedule instructions after an INLINEASM_BR (which can branch). (The bug may also allow scheduling past a throwing-CALL, I'm not certain.) 
I could fix that bug, but, as the removed FIXME notes, it's better to attempt rescheduling before converting to 3-addr form, as that may remove the need to convert in the first place. In fact, the code to do such reordering was added to this pass only a few months later, in 2011, via the addition of the function rescheduleMIBelowKill. That code does not contain the same bug. The removal of the sink3AddrInstruction function is not a no-op: in some cases it would move an instruction post-conversion, when rescheduleMIBelowKill would not move the instruction pre-converison. However, this does not appear to be important: the machine instruction scheduler can reorder the after-conversion instructions, in any case. This patch fixes a kernel panic 4.4 LTS x86_64 Linux kernels, when built with clang after 4b0aa5724feaa89a9538dcab97e018110b0e4bc3. Link: https://github.com/ClangBuiltLinux/linux/issues/1085 Differential Revision: https://reviews.llvm.org/D83708 (cherry picked from commit 60433c63acb71935111304d71e41b7ee982398f8) --- .../lib/CodeGen/TwoAddressInstructionPass.cpp | 161 +----------------- llvm/test/CodeGen/X86/callbr-asm-sink.ll | 35 ++++ llvm/test/CodeGen/X86/masked-iv-unsafe.ll | 14 +- llvm/test/CodeGen/X86/reverse_branches.ll | 18 +- llvm/test/CodeGen/X86/rotate-extract.ll | 6 +- llvm/test/CodeGen/X86/twoaddr-lea.ll | 5 +- llvm/test/CodeGen/X86/twoaddr-pass-sink.ll | 30 ---- 7 files changed, 62 insertions(+), 207 deletions(-) create mode 100644 llvm/test/CodeGen/X86/callbr-asm-sink.ll delete mode 100644 llvm/test/CodeGen/X86/twoaddr-pass-sink.ll diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index de336abe607a..615ff4b8789c 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -70,7 +70,6 @@ STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions"); STATISTIC(NumCommuted , "Number of instructions commuted to coalesce"); 
STATISTIC(NumAggrCommuted , "Number of instructions aggressively commuted"); STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address"); -STATISTIC(Num3AddrSunk, "Number of 3-address instructions sunk"); STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up"); STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down"); @@ -109,10 +108,6 @@ class TwoAddressInstructionPass : public MachineFunctionPass { // Set of already processed instructions in the current block. SmallPtrSet Processed; - // Set of instructions converted to three-address by target and then sunk - // down current basic block. - SmallPtrSet SunkInstrs; - // A map from virtual registers to physical registers which are likely targets // to be coalesced to due to copies from physical registers to virtual // registers. e.g. v1024 = move r0. @@ -123,9 +118,6 @@ class TwoAddressInstructionPass : public MachineFunctionPass { // registers. e.g. r1 = move v1024. DenseMap DstRegMap; - bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg, - MachineBasicBlock::iterator OldPos); - bool isRevCopyChain(unsigned FromReg, unsigned ToReg, int Maxlen); bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef); @@ -209,136 +201,6 @@ INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS); -/// A two-address instruction has been converted to a three-address instruction -/// to avoid clobbering a register. Try to sink it past the instruction that -/// would kill the above mentioned register to reduce register pressure. -bool TwoAddressInstructionPass:: -sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, - MachineBasicBlock::iterator OldPos) { - // FIXME: Shouldn't we be trying to do this before we three-addressify the - // instruction? After this transformation is done, we no longer need - // the instruction to be in three-address form. 
- - // Check if it's safe to move this instruction. - bool SeenStore = true; // Be conservative. - if (!MI->isSafeToMove(AA, SeenStore)) - return false; - - unsigned DefReg = 0; - SmallSet UseRegs; - - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg()) - continue; - Register MOReg = MO.getReg(); - if (!MOReg) - continue; - if (MO.isUse() && MOReg != SavedReg) - UseRegs.insert(MO.getReg()); - if (!MO.isDef()) - continue; - if (MO.isImplicit()) - // Don't try to move it if it implicitly defines a register. - return false; - if (DefReg) - // For now, don't move any instructions that define multiple registers. - return false; - DefReg = MO.getReg(); - } - - // Find the instruction that kills SavedReg. - MachineInstr *KillMI = nullptr; - if (LIS) { - LiveInterval &LI = LIS->getInterval(SavedReg); - assert(LI.end() != LI.begin() && - "Reg should not have empty live interval."); - - SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot(); - LiveInterval::const_iterator I = LI.find(MBBEndIdx); - if (I != LI.end() && I->start < MBBEndIdx) - return false; - - --I; - KillMI = LIS->getInstructionFromIndex(I->end); - } - if (!KillMI) { - for (MachineOperand &UseMO : MRI->use_nodbg_operands(SavedReg)) { - if (!UseMO.isKill()) - continue; - KillMI = UseMO.getParent(); - break; - } - } - - // If we find the instruction that kills SavedReg, and it is in an - // appropriate location, we can try to sink the current instruction - // past it. - if (!KillMI || KillMI->getParent() != MBB || KillMI == MI || - MachineBasicBlock::iterator(KillMI) == OldPos || KillMI->isTerminator()) - return false; - - // If any of the definitions are used by another instruction between the - // position and the kill use, then it's not safe to sink it. - // - // FIXME: This can be sped up if there is an easy way to query whether an - // instruction is before or after another instruction. Then we can use - // MachineRegisterInfo def / use instead. 
- MachineOperand *KillMO = nullptr; - MachineBasicBlock::iterator KillPos = KillMI; - ++KillPos; - - unsigned NumVisited = 0; - for (MachineInstr &OtherMI : make_range(std::next(OldPos), KillPos)) { - // Debug instructions cannot be counted against the limit. - if (OtherMI.isDebugInstr()) - continue; - if (NumVisited > 30) // FIXME: Arbitrary limit to reduce compile time cost. - return false; - ++NumVisited; - for (unsigned i = 0, e = OtherMI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = OtherMI.getOperand(i); - if (!MO.isReg()) - continue; - Register MOReg = MO.getReg(); - if (!MOReg) - continue; - if (DefReg == MOReg) - return false; - - if (MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS))) { - if (&OtherMI == KillMI && MOReg == SavedReg) - // Save the operand that kills the register. We want to unset the kill - // marker if we can sink MI past it. - KillMO = &MO; - else if (UseRegs.count(MOReg)) - // One of the uses is killed before the destination. - return false; - } - } - } - assert(KillMO && "Didn't find kill"); - - if (!LIS) { - // Update kill and LV information. - KillMO->setIsKill(false); - KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI); - KillMO->setIsKill(true); - - if (LV) - LV->replaceKillInstruction(SavedReg, *KillMI, *MI); - } - - // Move instruction to its destination. - MBB->remove(MI); - MBB->insert(KillPos, MI); - - if (LIS) - LIS->handleMove(*MI); - - ++Num3AddrSunk; - return true; -} - /// Return the MachineInstr* if it is the single def of the Reg in current BB. 
static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB, const MachineRegisterInfo *MRI) { @@ -740,26 +602,15 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi); LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); - bool Sunk = false; if (LIS) LIS->ReplaceMachineInstrInMaps(*mi, *NewMI); - if (NewMI->findRegisterUseOperand(RegB, false, TRI)) - // FIXME: Temporary workaround. If the new instruction doesn't - // uses RegB, convertToThreeAddress must have created more - // then one instruction. - Sunk = sink3AddrInstruction(NewMI, RegB, mi); - MBB->erase(mi); // Nuke the old inst. - if (!Sunk) { - DistanceMap.insert(std::make_pair(NewMI, Dist)); - mi = NewMI; - nmi = std::next(mi); - } - else - SunkInstrs.insert(NewMI); + DistanceMap.insert(std::make_pair(NewMI, Dist)); + mi = NewMI; + nmi = std::next(mi); // Update source and destination register maps. SrcRegMap.erase(RegA); @@ -1700,13 +1551,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { SrcRegMap.clear(); DstRegMap.clear(); Processed.clear(); - SunkInstrs.clear(); for (MachineBasicBlock::iterator mi = MBB->begin(), me = MBB->end(); mi != me; ) { MachineBasicBlock::iterator nmi = std::next(mi); - // Don't revisit an instruction previously converted by target. It may - // contain undef register operands (%noreg), which are not handled. - if (mi->isDebugInstr() || SunkInstrs.count(&*mi)) { + // Skip debug instructions. 
+ if (mi->isDebugInstr()) { mi = nmi; continue; } diff --git a/llvm/test/CodeGen/X86/callbr-asm-sink.ll b/llvm/test/CodeGen/X86/callbr-asm-sink.ll new file mode 100644 index 000000000000..758ac37f8ba4 --- /dev/null +++ b/llvm/test/CodeGen/X86/callbr-asm-sink.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +;; Verify that the machine instructions generated from the first +;; getelementptr don't get sunk below the callbr. (Reduced from a bug +;; report.) + +%struct1 = type { i8*, i32 } + +define void @klist_dec_and_del(%struct1*) { +; CHECK-LABEL: klist_dec_and_del: +; CHECK: # %bb.0: +; CHECK-NEXT: leaq 8(%rdi), %rax +; CHECK-NEXT: #APP +; CHECK-NEXT: # 8(%rdi) .Ltmp0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: retq +; CHECK-NEXT: .Ltmp0: # Block address taken +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movq $0, -8(%rax) +; CHECK-NEXT: retq + %2 = getelementptr inbounds %struct1, %struct1* %0, i64 0, i32 1 + callbr void asm sideeffect "# $0 $1", "*m,X,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %2, i8* blockaddress(@klist_dec_and_del, %3)) + to label %6 [label %3] + +3: + %4 = getelementptr i32, i32* %2, i64 -2 + %5 = bitcast i32* %4 to i8** + store i8* null, i8** %5, align 8 + br label %6 + +6: + ret void +} diff --git a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll index 76f2ad22b44a..e4c82faa90d8 100644 --- a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll @@ -402,9 +402,9 @@ return: define void @another_count_down_signed(double* %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down_signed: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: shlq $24, %rax -; CHECK-NEXT: leaq -10(%rsi), %rcx +; CHECK-NEXT: leaq -10(%rsi), %rax +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: shlq $24, %rcx ; CHECK-NEXT: shlq $8, %rsi ; CHECK-NEXT: 
movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero @@ -417,17 +417,17 @@ define void @another_count_down_signed(double* %d, i64 %n) nounwind { ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rdx,8) -; CHECK-NEXT: movq %rax, %rdx +; CHECK-NEXT: movq %rcx, %rdx ; CHECK-NEXT: sarq $24, %rdx ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm1, %xmm3 ; CHECK-NEXT: movsd %xmm3, (%rdi,%rdx,8) ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; CHECK-NEXT: mulsd %xmm2, %xmm3 -; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rcx,8) -; CHECK-NEXT: addq $-16777216, %rax # imm = 0xFF000000 +; CHECK-NEXT: movsd %xmm3, 80(%rdi,%rax,8) +; CHECK-NEXT: addq $-16777216, %rcx # imm = 0xFF000000 ; CHECK-NEXT: addq $-256, %rsi -; CHECK-NEXT: decq %rcx +; CHECK-NEXT: decq %rax ; CHECK-NEXT: jne .LBB7_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/reverse_branches.ll b/llvm/test/CodeGen/X86/reverse_branches.ll index 170fc6a76280..7a9ff8452d1d 100644 --- a/llvm/test/CodeGen/X86/reverse_branches.ll +++ b/llvm/test/CodeGen/X86/reverse_branches.ll @@ -48,25 +48,25 @@ define i32 @test_branches_order() uwtable ssp { ; CHECK-NEXT: jg LBB0_7 ; CHECK-NEXT: ## %bb.2: ## %for.cond1.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: movl $-1, %r13d -; CHECK-NEXT: movq %r15, %rbx -; CHECK-NEXT: movq %r14, %rbp +; CHECK-NEXT: movl $-1, %ebp +; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: movq %r14, %rbx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_3: ## %for.cond1 ; CHECK-NEXT: ## Parent Loop BB0_1 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: incl %r13d -; CHECK-NEXT: cmpl $999, %r13d ## imm = 0x3E7 +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: cmpl $999, %ebp ## imm = 0x3E7 ; CHECK-NEXT: jg LBB0_6 ; CHECK-NEXT: ## %bb.4: ## %for.body3 ; CHECK-NEXT: ## in Loop: Header=BB0_3 Depth=2 -; CHECK-NEXT: addq $1002, 
%rbp ## imm = 0x3EA -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: addq $1001, %rbx ## imm = 0x3E9 +; CHECK-NEXT: addq $1002, %rbx ## imm = 0x3EA +; CHECK-NEXT: leaq 1001(%rdi), %r13 ; CHECK-NEXT: movl $1000, %edx ## imm = 0x3E8 ; CHECK-NEXT: movl $120, %esi ; CHECK-NEXT: callq _memchr -; CHECK-NEXT: cmpq %rax, %rbp +; CHECK-NEXT: cmpq %rax, %rbx +; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: je LBB0_3 ; CHECK-NEXT: jmp LBB0_5 ; CHECK-NEXT: LBB0_7: ## %for.end11 diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 9ef29c7883d4..41003c9d335d 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -306,9 +306,9 @@ define i32 @extract_add_1_comut(i32 %i) nounwind { define i32 @no_extract_add_1(i32 %i) nounwind { ; X86-LABEL: no_extract_add_1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $27, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: shrl $27, %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index 077cf805bcb1..716d20d63c44 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -68,8 +68,9 @@ bb2: br label %bb6 bb3: -; CHECK: subl %e[[REG0:[a-z0-9]+]], -; CHECK: addq $4, %r[[REG0]] +; CHECK: LBB3_3: +; CHECK: addq $4, %r +; CHECK: subl %e %tmp14 = phi i64 [ %tmp15, %bb5 ], [ 0, %bb1 ] %tmp15 = add nuw i64 %tmp14, 4 %tmp16 = trunc i64 %tmp14 to i32 diff --git a/llvm/test/CodeGen/X86/twoaddr-pass-sink.ll b/llvm/test/CodeGen/X86/twoaddr-pass-sink.ll deleted file mode 100644 index a06eaec894ca..000000000000 --- a/llvm/test/CodeGen/X86/twoaddr-pass-sink.ll +++ /dev/null @@ -1,30 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -stats 2>&1 | grep "Number of 3-address instructions sunk" - -define void @t2(<2 x 
i64>* %vDct, <2 x i64>* %vYp, i8* %skiplist, <2 x i64> %a1) nounwind { -entry: - %tmp25 = bitcast <2 x i64> %a1 to <8 x i16> ; <<8 x i16>> [#uses=1] - br label %bb -bb: ; preds = %bb, %entry - %skiplist_addr.0.rec = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; [#uses=3] - %vYp_addr.0.rec = shl i32 %skiplist_addr.0.rec, 3 ; [#uses=3] - %vDct_addr.0 = getelementptr <2 x i64>, <2 x i64>* %vDct, i32 %vYp_addr.0.rec ; <<2 x i64>*> [#uses=1] - %vYp_addr.0 = getelementptr <2 x i64>, <2 x i64>* %vYp, i32 %vYp_addr.0.rec ; <<2 x i64>*> [#uses=1] - %skiplist_addr.0 = getelementptr i8, i8* %skiplist, i32 %skiplist_addr.0.rec ; [#uses=1] - %vDct_addr.0.sum43 = or i32 %vYp_addr.0.rec, 1 ; [#uses=1] - %tmp7 = getelementptr <2 x i64>, <2 x i64>* %vDct, i32 %vDct_addr.0.sum43 ; <<2 x i64>*> [#uses=1] - %tmp8 = load <2 x i64>, <2 x i64>* %tmp7, align 16 ; <<2 x i64>> [#uses=1] - %tmp11 = load <2 x i64>, <2 x i64>* %vDct_addr.0, align 16 ; <<2 x i64>> [#uses=1] - %tmp13 = bitcast <2 x i64> %tmp8 to <8 x i16> ; <<8 x i16>> [#uses=1] - %tmp15 = bitcast <2 x i64> %tmp11 to <8 x i16> ; <<8 x i16>> [#uses=1] - %tmp16 = shufflevector <8 x i16> %tmp15, <8 x i16> %tmp13, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1] - %tmp26 = mul <8 x i16> %tmp25, %tmp16 ; <<8 x i16>> [#uses=1] - %tmp27 = bitcast <8 x i16> %tmp26 to <2 x i64> ; <<2 x i64>> [#uses=1] - store <2 x i64> %tmp27, <2 x i64>* %vYp_addr.0, align 16 - %tmp37 = load i8, i8* %skiplist_addr.0, align 1 ; [#uses=1] - %tmp38 = icmp eq i8 %tmp37, 0 ; [#uses=1] - %indvar.next = add i32 %skiplist_addr.0.rec, 1 ; [#uses=1] - br i1 %tmp38, label %return, label %bb -return: ; preds = %bb - ret void -} From 094d6386ec0478076218303a756edda8a63d7c55 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 17 Jul 2020 23:29:50 -0700 Subject: [PATCH 020/363] [RelocationResolver] Support R_PPC_REL32 & R_PPC64_REL{32,64} This suppresses `failed to compute relocation: R_PPC_REL32, Invalid data was 
encountered while parsing the file` and its 64-bit variants when running llvm-dwarfdump on a PowerPC object file with .eh_frame Unfortunately it is difficult to test the computation: DWARFDataExtractor::getEncodedPointer does not use the relocated value and even if it does, we need to teach llvm-dwarfdump --eh-frame to do some linker job to report a reasonable address. (cherry picked from commit b922004ea29d54534c4f09b9cfa655bf5f3360f0) --- llvm/lib/Object/RelocationResolver.cpp | 20 +++++++++++-- llvm/test/DebugInfo/PowerPC/eh-frame.ll | 39 +++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 llvm/test/DebugInfo/PowerPC/eh-frame.ll diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index 3f3f79b0f4ff..0058d12dcc87 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -152,6 +152,8 @@ static bool supportsPPC64(uint64_t Type) { switch (Type) { case ELF::R_PPC64_ADDR32: case ELF::R_PPC64_ADDR64: + case ELF::R_PPC64_REL32: + case ELF::R_PPC64_REL64: return true; default: return false; @@ -164,6 +166,10 @@ static uint64_t resolvePPC64(RelocationRef R, uint64_t S, uint64_t A) { return (S + getELFAddend(R)) & 0xFFFFFFFF; case ELF::R_PPC64_ADDR64: return S + getELFAddend(R); + case ELF::R_PPC64_REL32: + return (S + getELFAddend(R) - R.getOffset()) & 0xFFFFFFFF; + case ELF::R_PPC64_REL64: + return S + getELFAddend(R) - R.getOffset(); default: llvm_unreachable("Invalid relocation type"); } @@ -259,12 +265,22 @@ static uint64_t resolveX86(RelocationRef R, uint64_t S, uint64_t A) { } static bool supportsPPC32(uint64_t Type) { - return Type == ELF::R_PPC_ADDR32; + switch (Type) { + case ELF::R_PPC_ADDR32: + case ELF::R_PPC_REL32: + return true; + default: + return false; + } } static uint64_t resolvePPC32(RelocationRef R, uint64_t S, uint64_t A) { - if (R.getType() == ELF::R_PPC_ADDR32) + switch (R.getType()) { + case ELF::R_PPC_ADDR32: return (S + 
getELFAddend(R)) & 0xFFFFFFFF; + case ELF::R_PPC_REL32: + return (S + getELFAddend(R) - R.getOffset()) & 0xFFFFFFFF; + } llvm_unreachable("Invalid relocation type"); } diff --git a/llvm/test/DebugInfo/PowerPC/eh-frame.ll b/llvm/test/DebugInfo/PowerPC/eh-frame.ll new file mode 100644 index 000000000000..3a8f7df6b61a --- /dev/null +++ b/llvm/test/DebugInfo/PowerPC/eh-frame.ll @@ -0,0 +1,39 @@ +; RUN: llc -filetype=obj -mtriple=powerpc %s -o %t32.o +; RUN: llvm-readobj -r %t32.o | FileCheck %s --check-prefix=PPC_REL +; RUN: llvm-dwarfdump --eh-frame %t32.o 2>&1 | FileCheck %s --check-prefix=PPC + +; PPC_REL: R_PPC_REL32 .text 0x0 +; PPC_REL-NEXT: R_PPC_REL32 .text 0x4 + +; PPC-NOT: warning: +; PPC: FDE cie=00000000 pc=00000000...00000004 +;; TODO Take relocation into consideration +; PPC: FDE cie=00000000 pc=00000000...00000004 + +; RUN: llc -filetype=obj -mtriple=ppc64 %s -o %t64.o +; RUN: llvm-readobj -r %t64.o | FileCheck %s --check-prefix=PPC64_REL +; RUN: llvm-dwarfdump --eh-frame %t64.o 2>&1 | FileCheck %s --check-prefix=PPC64 + +; PPC64_REL: R_PPC64_REL32 .text 0x0 +; PPC64_REL-NEXT: R_PPC64_REL32 .text 0x10 + +; PPC64-NOT: warning: +; PPC64: FDE cie=00000000 pc=00000000...00000010 +; PPC64: FDE cie=00000000 pc=00000000...00000010 + +; RUN: llc -filetype=obj -mtriple=ppc64le -code-model=large %s -o %t64l.o +; RUN: llvm-readobj -r %t64l.o | FileCheck %s --check-prefix=PPC64L_REL +; RUN: llvm-dwarfdump --eh-frame %t64l.o 2>&1 | FileCheck %s --check-prefix=PPC64 + +; PPC64L_REL: R_PPC64_REL64 .text 0x0 +; PPC64L_REL-NEXT: R_PPC64_REL64 .text 0x10 + +define void @foo() { +entry: + ret void +} + +define void @bar() { +entry: + ret void +} From a1f2fd11bde3efe2dd145163be1a7df15d0800d1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 17 Jul 2020 23:49:15 -0700 Subject: [PATCH 021/363] [RelocationResolver] Support R_AARCH64_PREL32 Code from D83800 by Yichao Yu (cherry picked from commit 3073a3aa1ef1ce8c9cac9b97a8e5905dd8779e16) --- 
llvm/lib/Object/RelocationResolver.cpp | 6 ++++++ llvm/test/DebugInfo/AArch64/eh-frame.ll | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 llvm/test/DebugInfo/AArch64/eh-frame.ll diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index 0058d12dcc87..ad7a50d13bb7 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -62,6 +62,8 @@ static bool supportsAArch64(uint64_t Type) { switch (Type) { case ELF::R_AARCH64_ABS32: case ELF::R_AARCH64_ABS64: + case ELF::R_AARCH64_PREL32: + case ELF::R_AARCH64_PREL64: return true; default: return false; @@ -74,6 +76,10 @@ static uint64_t resolveAArch64(RelocationRef R, uint64_t S, uint64_t A) { return (S + getELFAddend(R)) & 0xFFFFFFFF; case ELF::R_AARCH64_ABS64: return S + getELFAddend(R); + case ELF::R_AARCH64_PREL32: + return (S + getELFAddend(R) - R.getOffset()) & 0xFFFFFFFF; + case ELF::R_AARCH64_PREL64: + return S + getELFAddend(R) - R.getOffset(); default: llvm_unreachable("Invalid relocation type"); } diff --git a/llvm/test/DebugInfo/AArch64/eh-frame.ll b/llvm/test/DebugInfo/AArch64/eh-frame.ll new file mode 100644 index 000000000000..9651159271e5 --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/eh-frame.ll @@ -0,0 +1,21 @@ +; RUN: llc -filetype=obj -mtriple=aarch64 %s -o %t.o +; RUN: llvm-readobj -r %t.o | FileCheck %s --check-prefix=REL32 +; RUN: llvm-dwarfdump --eh-frame %t.o 2>&1 | FileCheck %s + +; REL32: R_AARCH64_PREL32 .text 0x0 +; REL32-NEXT: R_AARCH64_PREL32 .text 0x4 + +; CHECK-NOT: warning: +; CHECK: FDE cie=00000000 pc=00000000...00000004 +;; TODO Take relocation into consideration +; CHECK: FDE cie=00000000 pc=00000000...00000004 + +define void @foo() { +entry: + ret void +} + +define void @bar() { +entry: + ret void +} From 96313d2de45ace49d40606dda71f03396f13ddef Mon Sep 17 00:00:00 2001 From: Joachim Protze Date: Thu, 16 Jul 2020 16:15:21 +0200 Subject: [PATCH 022/363] [TSan] Optimize 
handling of racy address This patch splits the handling of racy address and racy stack into separate functions. If a race was already reported for the address, we can avoid the cost for collecting the involved stacks. This patch also removes the race condition in storing the racy address / racy stack. This race condition allowed all threads to report the race. This patch changes the transitive suppression of reports. Previously suppression could transitively chain memory location and racy stacks. Now racy memory and racy stack are separate suppressions. Commit again, now with fixed tests. Reviewed by: dvyukov Differential Revision: https://reviews.llvm.org/D83625 (cherry picked from commit 7358a1104a02d5f5e645ebff0530787453ae98da) --- compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp | 103 +++++++++--------- .../tsan/tests/rtl/tsan_test_util_posix.cpp | 51 ++++++--- 2 files changed, 87 insertions(+), 67 deletions(-) diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp index 949beac1c551..3354546c2a10 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp @@ -439,65 +439,61 @@ void RestoreStack(int tid, const u64 epoch, VarSizeStackTrace *stk, ExtractTagFromStack(stk, tag); } -static bool HandleRacyStacks(ThreadState *thr, VarSizeStackTrace traces[2], - uptr addr_min, uptr addr_max) { - bool equal_stack = false; - RacyStacks hash; - bool equal_address = false; - RacyAddress ra0 = {addr_min, addr_max}; - { - ReadLock lock(&ctx->racy_mtx); - if (flags()->suppress_equal_stacks) { - hash.hash[0] = md5_hash(traces[0].trace, traces[0].size * sizeof(uptr)); - hash.hash[1] = md5_hash(traces[1].trace, traces[1].size * sizeof(uptr)); - for (uptr i = 0; i < ctx->racy_stacks.Size(); i++) { - if (hash == ctx->racy_stacks[i]) { - VPrintf(2, - "ThreadSanitizer: suppressing report as doubled (stack)\n"); - equal_stack = true; - break; - } - } - } - if 
(flags()->suppress_equal_addresses) { - for (uptr i = 0; i < ctx->racy_addresses.Size(); i++) { - RacyAddress ra2 = ctx->racy_addresses[i]; - uptr maxbeg = max(ra0.addr_min, ra2.addr_min); - uptr minend = min(ra0.addr_max, ra2.addr_max); - if (maxbeg < minend) { - VPrintf(2, "ThreadSanitizer: suppressing report as doubled (addr)\n"); - equal_address = true; - break; - } - } +static bool FindRacyStacks(const RacyStacks &hash) { + for (uptr i = 0; i < ctx->racy_stacks.Size(); i++) { + if (hash == ctx->racy_stacks[i]) { + VPrintf(2, "ThreadSanitizer: suppressing report as doubled (stack)\n"); + return true; } } - if (!equal_stack && !equal_address) + return false; +} + +static bool HandleRacyStacks(ThreadState *thr, VarSizeStackTrace traces[2]) { + if (!flags()->suppress_equal_stacks) return false; - if (!equal_stack) { - Lock lock(&ctx->racy_mtx); - ctx->racy_stacks.PushBack(hash); - } - if (!equal_address) { - Lock lock(&ctx->racy_mtx); - ctx->racy_addresses.PushBack(ra0); + RacyStacks hash; + hash.hash[0] = md5_hash(traces[0].trace, traces[0].size * sizeof(uptr)); + hash.hash[1] = md5_hash(traces[1].trace, traces[1].size * sizeof(uptr)); + { + ReadLock lock(&ctx->racy_mtx); + if (FindRacyStacks(hash)) + return true; } - return true; + Lock lock(&ctx->racy_mtx); + if (FindRacyStacks(hash)) + return true; + ctx->racy_stacks.PushBack(hash); + return false; } -static void AddRacyStacks(ThreadState *thr, VarSizeStackTrace traces[2], - uptr addr_min, uptr addr_max) { - Lock lock(&ctx->racy_mtx); - if (flags()->suppress_equal_stacks) { - RacyStacks hash; - hash.hash[0] = md5_hash(traces[0].trace, traces[0].size * sizeof(uptr)); - hash.hash[1] = md5_hash(traces[1].trace, traces[1].size * sizeof(uptr)); - ctx->racy_stacks.PushBack(hash); +static bool FindRacyAddress(const RacyAddress &ra0) { + for (uptr i = 0; i < ctx->racy_addresses.Size(); i++) { + RacyAddress ra2 = ctx->racy_addresses[i]; + uptr maxbeg = max(ra0.addr_min, ra2.addr_min); + uptr minend = min(ra0.addr_max, 
ra2.addr_max); + if (maxbeg < minend) { + VPrintf(2, "ThreadSanitizer: suppressing report as doubled (addr)\n"); + return true; + } } - if (flags()->suppress_equal_addresses) { - RacyAddress ra0 = {addr_min, addr_max}; - ctx->racy_addresses.PushBack(ra0); + return false; +} + +static bool HandleRacyAddress(ThreadState *thr, uptr addr_min, uptr addr_max) { + if (!flags()->suppress_equal_addresses) + return false; + RacyAddress ra0 = {addr_min, addr_max}; + { + ReadLock lock(&ctx->racy_mtx); + if (FindRacyAddress(ra0)) + return true; } + Lock lock(&ctx->racy_mtx); + if (FindRacyAddress(ra0)) + return true; + ctx->racy_addresses.PushBack(ra0); + return false; } bool OutputReport(ThreadState *thr, const ScopedReport &srep) { @@ -618,6 +614,8 @@ void ReportRace(ThreadState *thr) { if (IsExpectedReport(addr_min, addr_max - addr_min)) return; } + if (HandleRacyAddress(thr, addr_min, addr_max)) + return; ReportType typ = ReportTypeRace; if (thr->is_vptr_access && freed) @@ -668,7 +666,7 @@ void ReportRace(ThreadState *thr) { if (IsFiredSuppression(ctx, typ, traces[1])) return; - if (HandleRacyStacks(thr, traces, addr_min, addr_max)) + if (HandleRacyStacks(thr, traces)) return; // If any of the accesses has a tag, treat this as an "external" race. 
@@ -711,7 +709,6 @@ void ReportRace(ThreadState *thr) { if (!OutputReport(thr, rep)) return; - AddRacyStacks(thr, traces, addr_min, addr_max); } void PrintCurrentStack(ThreadState *thr, uptr pc) { diff --git a/compiler-rt/lib/tsan/tests/rtl/tsan_test_util_posix.cpp b/compiler-rt/lib/tsan/tests/rtl/tsan_test_util_posix.cpp index a24d04f47007..733e5d282a37 100644 --- a/compiler-rt/lib/tsan/tests/rtl/tsan_test_util_posix.cpp +++ b/compiler-rt/lib/tsan/tests/rtl/tsan_test_util_posix.cpp @@ -27,6 +27,8 @@ #include #include +#define CALLERPC (__builtin_return_address(0)) + using namespace __tsan; static __thread bool expect_report; @@ -249,22 +251,42 @@ void ScopedThread::Impl::HandleEvent(Event *ev) { switch (ev->type) { case Event::READ: case Event::WRITE: { - void (*tsan_mop)(void *addr) = 0; + void (*tsan_mop)(void *addr, void *pc) = 0; if (ev->type == Event::READ) { switch (ev->arg /*size*/) { - case 1: tsan_mop = __tsan_read1; break; - case 2: tsan_mop = __tsan_read2; break; - case 4: tsan_mop = __tsan_read4; break; - case 8: tsan_mop = __tsan_read8; break; - case 16: tsan_mop = __tsan_read16; break; + case 1: + tsan_mop = __tsan_read1_pc; + break; + case 2: + tsan_mop = __tsan_read2_pc; + break; + case 4: + tsan_mop = __tsan_read4_pc; + break; + case 8: + tsan_mop = __tsan_read8_pc; + break; + case 16: + tsan_mop = __tsan_read16_pc; + break; } } else { switch (ev->arg /*size*/) { - case 1: tsan_mop = __tsan_write1; break; - case 2: tsan_mop = __tsan_write2; break; - case 4: tsan_mop = __tsan_write4; break; - case 8: tsan_mop = __tsan_write8; break; - case 16: tsan_mop = __tsan_write16; break; + case 1: + tsan_mop = __tsan_write1_pc; + break; + case 2: + tsan_mop = __tsan_write2_pc; + break; + case 4: + tsan_mop = __tsan_write4_pc; + break; + case 8: + tsan_mop = __tsan_write8_pc; + break; + case 16: + tsan_mop = __tsan_write16_pc; + break; } } CHECK_NE(tsan_mop, 0); @@ -274,7 +296,7 @@ void ScopedThread::Impl::HandleEvent(Event *ev) { const int ErrCode = ECHRNG; 
#endif errno = ErrCode; - tsan_mop(ev->ptr); + tsan_mop(ev->ptr, (void *)ev->arg2); CHECK_EQ(ErrCode, errno); // In no case must errno be changed. break; } @@ -327,7 +349,7 @@ void ScopedThread::Impl::HandleEvent(Event *ev) { } void *ScopedThread::Impl::ScopedThreadCallback(void *arg) { - __tsan_func_entry(__builtin_return_address(0)); + __tsan_func_entry(CALLERPC); Impl *impl = (Impl*)arg; for (;;) { Event* ev = (Event*)atomic_load(&impl->event, memory_order_acquire); @@ -392,7 +414,8 @@ void ScopedThread::Detach() { void ScopedThread::Access(void *addr, bool is_write, int size, bool expect_race) { - Event event(is_write ? Event::WRITE : Event::READ, addr, size); + Event event(is_write ? Event::WRITE : Event::READ, addr, size, + (uptr)CALLERPC); if (expect_race) event.ExpectReport(ReportTypeRace); impl_->send(&event); From 2a7f1931d7272051592155619f21b338b5735734 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 17 Jul 2020 14:01:59 +0700 Subject: [PATCH 023/363] [InstCombine] Fix replace select with Phis when branch has the same labels ``` define i32 @test(i1 %cond) { entry: br i1 %cond, label %exit, label %exit exit: %result = select i1 %cond, i32 123, i32 456 ret i32 %result } ``` In this test, after applying transformation of replacing select with Phis, the result will be: ``` define i32 @test(i1 %cond) { entry: br i1 %cond, label %exit, label %exit exit: %result = i32 phi [123, %exit], [123, %exit] ret i32 %result } ``` That is, select is transformed into an invalid Phi, which will then be reduced to 123 and the second value will be lost. But it is worth noting that this problem will arise only if select is in the InstCombine worklist will be before the branch. Otherwise, InstCombine will replace the branch condition with false and transformation will not be applied. The fix is to check the target labels in the branch condition for equality. 
Patch By: Kirill Polushin Differential Revision: https://reviews.llvm.org/D84003 Reviewed By: mkazantsev (cherry picked from commit c98988107868db41c12b9d782fae25dea2a81c87) --- llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 17124f717af7..db27711f29b1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2469,6 +2469,10 @@ static Instruction *foldSelectToPhiImpl(SelectInst &Sel, BasicBlock *BB, } else return nullptr; + // Make sure the branches are actually different. + if (TrueSucc == FalseSucc) + return nullptr; + // We want to replace select %cond, %a, %b with a phi that takes value %a // for all incoming edges that are dominated by condition `%cond == true`, // and value %b for edges dominated by condition `%cond == false`. If %a From 7421cbd7a5a74b48a173f60763b9b78ecd3aec09 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 17 Jul 2020 16:59:19 +0700 Subject: [PATCH 024/363] [InstCombine][Test] Test for fix of replacing select with Phis when branch has the same labels An additional test that allows to check the correctness of handling the case of the same branch labels in the dominator when trying to replace select with phi-node. 
Patch By: Kirill Polushin Differential Revision: https://reviews.llvm.org/D84006 Reviewed By: mkazantsev (cherry picked from commit df6e185e8f895686510117301e568e5043909b66) --- llvm/test/Transforms/InstCombine/select.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 8cd0e35139a8..185ff838b819 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2000,6 +2000,21 @@ merge: ret i32 %s } +define i32 @select_dominating_cond_same_labels(i1 %cond) { +; CHECK-LABEL: @select_dominating_cond_same_labels( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RESULT:%.*]] = select i1 [[COND:%.*]], i32 123, i32 456 +; CHECK-NEXT: ret i32 [[RESULT]] +; +entry: + %result = select i1 %cond, i32 123, i32 456 + br i1 %cond, label %exit, label %exit +exit: + ret i32 %result +} + define i32 @select_phi_same_condition(i1 %cond, i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: @select_phi_same_condition( ; CHECK-NEXT: entry: From 76c22b392f18e533262de8db2fec5e6e25e0bd8b Mon Sep 17 00:00:00 2001 From: Zakk Chen Date: Thu, 16 Jul 2020 10:32:01 -0700 Subject: [PATCH 025/363] [RISCV] Add support for -mcpu option. Summary: 1. gcc uses `-march` and `-mtune` flag to chose arch and pipeline model, but clang does not have `-mtune` flag, we uses `-mcpu` to chose both infos. 2. Add SiFive e31 and u54 cpu which have default march and pipeline model. 3. Specific `-mcpu` with rocket-rv[32|64] would select pipeline model only, and use the driver's arch choosing logic to get default arch. 
Reviewers: lenary, asb, evandro, HsiangKai Reviewed By: lenary, asb, evandro Tags: #llvm, #clang Differential Revision: https://reviews.llvm.org/D71124 (cherry picked from commit 294d1eae75bf8867821a4491f0d67445227f8470) --- clang/lib/Basic/Targets/RISCV.cpp | 21 +++++ clang/lib/Basic/Targets/RISCV.h | 15 ++- clang/lib/Driver/ToolChains/Arch/RISCV.cpp | 93 +++++++++++++------ clang/lib/Driver/ToolChains/CommonArgs.cpp | 5 + clang/test/Driver/riscv-arch.c | 4 +- clang/test/Driver/riscv-cpus.c | 38 ++++++++ clang/test/Misc/target-invalid-cpu-note.c | 7 ++ .../llvm/Support/RISCVTargetParser.def | 13 +++ llvm/include/llvm/Support/TargetParser.h | 26 ++++++ llvm/lib/Support/TargetParser.cpp | 64 ++++++++++++- llvm/lib/Target/RISCV/RISCV.td | 10 ++ 11 files changed, 262 insertions(+), 34 deletions(-) create mode 100644 clang/test/Driver/riscv-cpus.c create mode 100644 llvm/include/llvm/Support/RISCVTargetParser.def diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index 522776437cd2..4ba703c8dd1a 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -13,6 +13,7 @@ #include "RISCV.h" #include "clang/Basic/MacroBuilder.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/TargetParser.h" using namespace clang; using namespace clang::targets; @@ -166,3 +167,23 @@ bool RISCVTargetInfo::handleTargetFeatures(std::vector &Features, return true; } + +bool RISCV32TargetInfo::isValidCPUName(StringRef Name) const { + return llvm::RISCV::checkCPUKind(llvm::RISCV::parseCPUKind(Name), + /*Is64Bit=*/false); +} + +void RISCV32TargetInfo::fillValidCPUList( + SmallVectorImpl &Values) const { + llvm::RISCV::fillValidCPUArchList(Values, false); +} + +bool RISCV64TargetInfo::isValidCPUName(StringRef Name) const { + return llvm::RISCV::checkCPUKind(llvm::RISCV::parseCPUKind(Name), + /*Is64Bit=*/true); +} + +void RISCV64TargetInfo::fillValidCPUList( + SmallVectorImpl &Values) const { + 
llvm::RISCV::fillValidCPUArchList(Values, true); +} diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h index 73652b409e9c..6db526da4c59 100644 --- a/clang/lib/Basic/Targets/RISCV.h +++ b/clang/lib/Basic/Targets/RISCV.h @@ -24,7 +24,7 @@ namespace targets { // RISC-V Target class RISCVTargetInfo : public TargetInfo { protected: - std::string ABI; + std::string ABI, CPU; bool HasM; bool HasA; bool HasF; @@ -44,6 +44,13 @@ class RISCVTargetInfo : public TargetInfo { WIntType = UnsignedInt; } + bool setCPU(const std::string &Name) override { + if (!isValidCPUName(Name)) + return false; + CPU = Name; + return true; + } + StringRef getABI() const override { return ABI; } void getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const override; @@ -97,6 +104,9 @@ class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo { return false; } + bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl &Values) const override; + void setMaxAtomicWidth() override { MaxAtomicPromoteWidth = 128; @@ -121,6 +131,9 @@ class LLVM_LIBRARY_VISIBILITY RISCV64TargetInfo : public RISCVTargetInfo { return false; } + bool isValidCPUName(StringRef Name) const override; + void fillValidCPUList(SmallVectorImpl &Values) const override; + void setMaxAtomicWidth() override { MaxAtomicPromoteWidth = 128; diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index 80d12e5aa8da..be3f0a07b576 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -446,6 +446,19 @@ static bool getArchFeatures(const Driver &D, StringRef MArch, return true; } +// Get features except standard extension feature +void getRISCFeaturesFromMcpu(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args, + const llvm::opt::Arg *A, StringRef Mcpu, + std::vector &Features) { + bool Is64Bit = (Triple.getArch() == 
llvm::Triple::riscv64); + llvm::RISCV::CPUKind CPUKind = llvm::RISCV::parseCPUKind(Mcpu); + if (!llvm::RISCV::checkCPUKind(CPUKind, Is64Bit) || + !llvm::RISCV::getCPUFeaturesExceptStdExt(CPUKind, Features)) { + D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); + } +} + void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, const ArgList &Args, std::vector &Features) { @@ -454,6 +467,11 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, if (!getArchFeatures(D, MArch, Features, Args)) return; + // If users give march and mcpu, get std extension feature from MArch + // and other features (ex. mirco architecture feature) from mcpu + if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) + getRISCFeaturesFromMcpu(D, Triple, Args, A, A->getValue(), Features); + // Handle features corresponding to "-ffixed-X" options if (Args.hasArg(options::OPT_ffixed_x1)) Features.push_back("+reserve-x1"); @@ -543,11 +561,9 @@ StringRef riscv::getRISCVABI(const ArgList &Args, const llvm::Triple &Triple) { // GCC's logic around choosing a default `-mabi=` is complex. If GCC is not // configured using `--with-abi=`, then the logic for the default choice is - // defined in config.gcc. This function is based on the logic in GCC 9.2.0. We - // deviate from GCC's default only on baremetal targets (UnknownOS) where - // neither `-march` nor `-mabi` is specified. + // defined in config.gcc. This function is based on the logic in GCC 9.2.0. // - // The logic uses the following, in order: + // The logic used in GCC 9.2.0 is the following, in order: // 1. Explicit choices using `--with-abi=` // 2. A default based on `--with-arch=`, if provided // 3. A default based on the target triple's arch @@ -556,38 +572,40 @@ StringRef riscv::getRISCVABI(const ArgList &Args, const llvm::Triple &Triple) { // // Clang does not have `--with-arch=` or `--with-abi=`, so we use `-march=` // and `-mabi=` respectively instead. 
+ // + // In order to make chosing logic more clear, Clang uses the following logic, + // in order: + // 1. Explicit choices using `-mabi=` + // 2. A default based on the architecture as determined by getRISCVArch + // 3. Choose a default based on the triple // 1. If `-mabi=` is specified, use it. if (const Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) return A->getValue(); - // 2. Choose a default based on `-march=` + // 2. Choose a default based on the target architecture. // // rv32g | rv32*d -> ilp32d // rv32e -> ilp32e // rv32* -> ilp32 // rv64g | rv64*d -> lp64d // rv64* -> lp64 - if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) { - StringRef MArch = A->getValue(); - - if (MArch.startswith_lower("rv32")) { - // FIXME: parse `March` to find `D` extension properly - if (MArch.substr(4).contains_lower("d") || - MArch.startswith_lower("rv32g")) - return "ilp32d"; - else if (MArch.startswith_lower("rv32e")) - return "ilp32e"; - else - return "ilp32"; - } else if (MArch.startswith_lower("rv64")) { - // FIXME: parse `March` to find `D` extension properly - if (MArch.substr(4).contains_lower("d") || - MArch.startswith_lower("rv64g")) - return "lp64d"; - else - return "lp64"; - } + StringRef MArch = getRISCVArch(Args, Triple); + + if (MArch.startswith_lower("rv32")) { + // FIXME: parse `March` to find `D` extension properly + if (MArch.substr(4).contains_lower("d") || MArch.startswith_lower("rv32g")) + return "ilp32d"; + else if (MArch.startswith_lower("rv32e")) + return "ilp32e"; + else + return "ilp32"; + } else if (MArch.startswith_lower("rv64")) { + // FIXME: parse `March` to find `D` extension properly + if (MArch.substr(4).contains_lower("d") || MArch.startswith_lower("rv64g")) + return "lp64d"; + else + return "lp64"; } // 3. Choose a default based on the triple @@ -617,10 +635,11 @@ StringRef riscv::getRISCVArch(const llvm::opt::ArgList &Args, // GCC's logic around choosing a default `-march=` is complex. 
If GCC is not // configured using `--with-arch=`, then the logic for the default choice is // defined in config.gcc. This function is based on the logic in GCC 9.2.0. We - // deviate from GCC's default only on baremetal targets (UnknownOS) where - // neither `-march` nor `-mabi` is specified. + // deviate from GCC's default on additional `-mcpu` option (GCC does not + // support `-mcpu`) and baremetal targets (UnknownOS) where neither `-march` + // nor `-mabi` is specified. // - // The logic uses the following, in order: + // The logic used in GCC 9.2.0 is the following, in order: // 1. Explicit choices using `--with-arch=` // 2. A default based on `--with-abi=`, if provided // 3. A default based on the target triple's arch @@ -630,6 +649,12 @@ StringRef riscv::getRISCVArch(const llvm::opt::ArgList &Args, // Clang does not have `--with-arch=` or `--with-abi=`, so we use `-march=` // and `-mabi=` respectively instead. // + // Clang uses the following logic, in order: + // 1. Explicit choices using `-march=` + // 2. Based on `-mcpu` if the target CPU has a default ISA string + // 3. A default based on `-mabi`, if provided + // 4. A default based on the target triple's arch + // // Clang does not yet support MULTILIB_REUSE, so we use `rv{XLEN}imafdc` // instead of `rv{XLEN}gc` though they are (currently) equivalent. @@ -637,7 +662,15 @@ StringRef riscv::getRISCVArch(const llvm::opt::ArgList &Args, if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) return A->getValue(); - // 2. Choose a default based on `-mabi=` + // 2. Get march (isa string) based on `-mcpu=` + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { + StringRef MArch = llvm::RISCV::getMArchFromMcpu(A->getValue()); + // Bypass if target cpu's default march is empty. + if (MArch != "") + return MArch; + } + + // 3. 
Choose a default based on `-mabi=` // // ilp32e -> rv32e // ilp32 | ilp32f | ilp32d -> rv32imafdc @@ -653,7 +686,7 @@ StringRef riscv::getRISCVArch(const llvm::opt::ArgList &Args, return "rv64imafdc"; } - // 3. Choose a default based on the triple + // 4. Choose a default based on the triple // // We deviate from GCC's defaults here: // - On `riscv{XLEN}-unknown-elf` we default to `rv{XLEN}imac` diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 1cac5a0822a4..6b6e276b8ce7 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -333,6 +333,11 @@ std::string tools::getCPUName(const ArgList &Args, const llvm::Triple &T, return TargetCPUName; } + case llvm::Triple::riscv32: + case llvm::Triple::riscv64: + if (const Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) + return A->getValue(); + return ""; case llvm::Triple::bpfel: case llvm::Triple::bpfeb: diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c index 13d0748a967a..725201a77ba7 100644 --- a/clang/test/Driver/riscv-arch.c +++ b/clang/test/Driver/riscv-arch.c @@ -156,9 +156,9 @@ // RV32-LOWER: error: invalid arch name 'rv32imC', // RV32-LOWER: string must be lowercase -// RUN: %clang -target riscv32-unknown-elf -march=rv32 -### %s \ +// RUN: %clang -target riscv32-unknown-elf -march=unknown -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-STR %s -// RV32-STR: error: invalid arch name 'rv32', +// RV32-STR: error: invalid arch name 'unknown', // RV32-STR: string must begin with rv32{i,e,g} or rv64{i,g} // RUN: %clang -target riscv32-unknown-elf -march=rv32q -### %s \ diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c new file mode 100644 index 000000000000..c6281a0b6433 --- /dev/null +++ b/clang/test/Driver/riscv-cpus.c @@ -0,0 +1,38 @@ +// Check target CPUs are correctly passed. 
+ +// RUN: %clang -target riscv32 -### -c %s 2>&1 -mcpu=rocket-rv32 | FileCheck -check-prefix=MCPU-ROCKETCHIP32 %s +// MCPU-ROCKETCHIP32: "-nostdsysteminc" "-target-cpu" "rocket-rv32" + +// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=rocket-rv64 | FileCheck -check-prefix=MCPU-ROCKETCHIP64 %s +// MCPU-ROCKETCHIP64: "-nostdsysteminc" "-target-cpu" "rocket-rv64" +// MCPU-ROCKETCHIP64: "-target-feature" "+64bit" + +// mcpu with default march +// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-u54 | FileCheck -check-prefix=MCPU-SIFIVE-U54 %s +// MCPU-SIFIVE-U54: "-nostdsysteminc" "-target-cpu" "sifive-u54" +// MCPU-SIFIVE-U54: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" +// MCPU-SIFIVE-U54: "-target-feature" "+c" "-target-feature" "+64bit" +// MCPU-SIFIVE-U54: "-target-abi" "lp64d" + +// mcpu with mabi option +// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-u54 -mabi=lp64 | FileCheck -check-prefix=MCPU-ABI-SIFIVE-U54 %s +// MCPU-ABI-SIFIVE-U54: "-nostdsysteminc" "-target-cpu" "sifive-u54" +// MCPU-ABI-SIFIVE-U54: "-target-feature" "+m" "-target-feature" "+a" "-target-feature" "+f" "-target-feature" "+d" +// MCPU-ABI-SIFIVE-U54: "-target-feature" "+c" "-target-feature" "+64bit" +// MCPU-ABI-SIFIVE-U54: "-target-abi" "lp64" + +// march overwirte mcpu's default march +// RUN: %clang -target riscv32 -### -c %s 2>&1 -mcpu=sifive-e31 -march=rv32imc | FileCheck -check-prefix=MCPU-MARCH %s +// MCPU-MARCH: "-nostdsysteminc" "-target-cpu" "sifive-e31" "-target-feature" "+m" "-target-feature" "+c" +// MCPU-MARCH: "-target-abi" "ilp32" + +// Check failed cases + +// RUN: %clang -target riscv32 -### -c %s 2>&1 -mcpu=generic-rv321 | FileCheck -check-prefix=FAIL-MCPU-NAME %s +// FAIL-MCPU-NAME: error: the clang compiler does not support '-mcpu=generic-rv321' + +// RUN: %clang -target riscv32 -### -c %s 2>&1 -mcpu=generic-rv32 -march=rv64i | FileCheck -check-prefix=MISMATCH-ARCH %s +// MISMATCH-ARCH: error: the 
clang compiler does not support '-mcpu=generic-rv32' + +// RUN: %clang -target riscv32 -### -c %s 2>&1 -mcpu=generic-rv64 | FileCheck -check-prefix=MISMATCH-MCPU %s +// MISMATCH-MCPU: error: the clang compiler does not support '-mcpu=generic-rv64' diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index 5c571fb458ec..3a376a7caab4 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -156,3 +156,10 @@ // AVR-SAME: ttiny4, attiny5, attiny9, attiny10, attiny20, attiny40, attiny102, // AVR-SAME: attiny104 +// RUN: not %clang_cc1 -triple riscv32 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV32 +// RISCV32: error: unknown target CPU 'not-a-cpu' +// RISCV32: note: valid target CPU values are: generic-rv32, rocket-rv32, sifive-e31 + +// RUN: not %clang_cc1 -triple riscv64 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV64 +// RISCV64: error: unknown target CPU 'not-a-cpu' +// RISCV64: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-u54 diff --git a/llvm/include/llvm/Support/RISCVTargetParser.def b/llvm/include/llvm/Support/RISCVTargetParser.def new file mode 100644 index 000000000000..28de6cd40132 --- /dev/null +++ b/llvm/include/llvm/Support/RISCVTargetParser.def @@ -0,0 +1,13 @@ +#ifndef PROC +#define PROC(ENUM, NAME, FEATURES, DEFAULT_MARCH) +#endif + +PROC(INVALID, {"invalid"}, FK_INVALID, {""}) +PROC(GENERIC_RV32, {"generic-rv32"}, FK_NONE, {""}) +PROC(GENERIC_RV64, {"generic-rv64"}, FK_64BIT, {""}) +PROC(ROCKET_RV32, {"rocket-rv32"}, FK_NONE, {""}) +PROC(ROCKET_RV64, {"rocket-rv64"}, FK_64BIT, {""}) +PROC(SIFIVE_E31, {"sifive-e31"}, FK_NONE, {"rv32imac"}) +PROC(SIFIVE_U54, {"sifive-u54"}, FK_64BIT, {"rv64gc"}) + +#undef PROC diff --git a/llvm/include/llvm/Support/TargetParser.h b/llvm/include/llvm/Support/TargetParser.h index a0bd88c153b6..f521d8f836b4 100644 --- 
a/llvm/include/llvm/Support/TargetParser.h +++ b/llvm/include/llvm/Support/TargetParser.h @@ -130,6 +130,32 @@ IsaVersion getIsaVersion(StringRef GPU); } // namespace AMDGPU +namespace RISCV { + +enum CPUKind : unsigned { +#define PROC(ENUM, NAME, FEATURES, DEFAULT_MARCH) CK_##ENUM, +#include "RISCVTargetParser.def" +}; + +enum FeatureKind : unsigned { + FK_INVALID = 0, + FK_NONE = 1, + FK_STDEXTM = 1 << 2, + FK_STDEXTA = 1 << 3, + FK_STDEXTF = 1 << 4, + FK_STDEXTD = 1 << 5, + FK_STDEXTC = 1 << 6, + FK_64BIT = 1 << 7, +}; + +bool checkCPUKind(CPUKind Kind, bool IsRV64); +CPUKind parseCPUKind(StringRef CPU); +StringRef getMArchFromMcpu(StringRef CPU); +void fillValidCPUArchList(SmallVectorImpl &Values, bool IsRV64); +bool getCPUFeaturesExceptStdExt(CPUKind Kind, std::vector &Features); + +} // namespace RISCV + } // namespace llvm #endif diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp index be9b541237c7..031384ebaa91 100644 --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -11,11 +11,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Support/ARMBuildAttributes.h" #include "llvm/Support/TargetParser.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/ARMBuildAttributes.h" using namespace llvm; using namespace AMDGPU; @@ -208,3 +209,64 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { default: return {0, 0, 0}; } } + +namespace llvm { +namespace RISCV { + +struct CPUInfo { + StringLiteral Name; + CPUKind Kind; + unsigned Features; + StringLiteral DefaultMarch; + bool is64Bit() const { return (Features & FK_64BIT); } +}; + +constexpr CPUInfo RISCVCPUInfo[] = { +#define PROC(ENUM, NAME, FEATURES, DEFAULT_MARCH) \ + {NAME, CK_##ENUM, FEATURES, DEFAULT_MARCH}, +#include "llvm/Support/RISCVTargetParser.def" +}; + +bool checkCPUKind(CPUKind 
Kind, bool IsRV64) { + if (Kind == CK_INVALID) + return false; + return RISCVCPUInfo[static_cast(Kind)].is64Bit() == IsRV64; +} + +CPUKind parseCPUKind(StringRef CPU) { + return llvm::StringSwitch(CPU) +#define PROC(ENUM, NAME, FEATURES, DEFAULT_MARCH) .Case(NAME, CK_##ENUM) +#include "llvm/Support/RISCVTargetParser.def" + .Default(CK_INVALID); +} + +StringRef getMArchFromMcpu(StringRef CPU) { + CPUKind Kind = parseCPUKind(CPU); + return RISCVCPUInfo[static_cast(Kind)].DefaultMarch; +} + +void fillValidCPUArchList(SmallVectorImpl &Values, bool IsRV64) { + for (const auto &C : RISCVCPUInfo) { + if (C.Kind != CK_INVALID && IsRV64 == C.is64Bit()) + Values.emplace_back(C.Name); + } +} + +// Get all features except standard extension feature +bool getCPUFeaturesExceptStdExt(CPUKind Kind, + std::vector &Features) { + unsigned CPUFeatures = RISCVCPUInfo[static_cast(Kind)].Features; + + if (CPUFeatures == FK_INVALID) + return false; + + if (CPUFeatures & FK_64BIT) + Features.push_back("+64bit"); + else + Features.push_back("-64bit"); + + return true; +} + +} // namespace RISCV +} // namespace llvm diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index f0583f691936..57e7c41c4271 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -215,6 +215,16 @@ def : ProcessorModel<"rocket-rv32", Rocket32Model, []>; def : ProcessorModel<"rocket-rv64", Rocket64Model, [Feature64Bit]>; +def : ProcessorModel<"sifive-e31", Rocket32Model, [FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtC]>; + +def : ProcessorModel<"sifive-u54", Rocket64Model, [Feature64Bit, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC]>; //===----------------------------------------------------------------------===// // Define the RISC-V target. 
From 2d499d7912ac83ff7a943b9061a557c759176fe6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 15 Jul 2020 15:14:46 -0700 Subject: [PATCH 026/363] [X86] Teach assembler parser to accept lsl and lar with a 64 or 32 source register when the destination is a 64 register. Previously we only accepted a 32-bit source with a 64-bit dest. Accepting 64-bit as well is more consistent with gas behavior. I think maybe we should accept 16 bit register as well, but I'm not sure. (cherry picked from commit 3c2a56a857227b6bc39285747269f02cd7a9dbe5) --- llvm/lib/Target/X86/X86InstrSystem.td | 4 ++-- llvm/test/MC/X86/I286-64.s | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index c23bc7ebbf70..d5f10646d80a 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -223,7 +223,7 @@ def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), let mayLoad = 1 in def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; -def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32orGR64:$src), "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; // i16mem operand in LSL32rm and GR32 operand in LSL32rr is not a typo. 
@@ -245,7 +245,7 @@ def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), let mayLoad = 1 in def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; -def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR32orGR64:$src), "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; diff --git a/llvm/test/MC/X86/I286-64.s b/llvm/test/MC/X86/I286-64.s index 73376de97887..1bab0a64f3e2 100644 --- a/llvm/test/MC/X86/I286-64.s +++ b/llvm/test/MC/X86/I286-64.s @@ -32,6 +32,10 @@ larl %r13d, %r13d // CHECK: encoding: [0x44,0x0f,0x02,0x2a] larl (%rdx), %r13d +// CHECK: larq %eax, %rax +// CHECK: encoding: [0x48,0x0f,0x02,0xc0] +lar %rax, %rax + // CHECK: lgdtq 485498096 // CHECK: encoding: [0x0f,0x01,0x14,0x25,0xf0,0x1c,0xf0,0x1c] lgdtq 485498096 @@ -164,6 +168,10 @@ lsll %r13d, %r13d // CHECK: encoding: [0x44,0x0f,0x03,0x2a] lsll (%rdx), %r13d +// CHECK: lslq %eax, %rax +// CHECK: encoding: [0x48,0x0f,0x03,0xc0] +lsl %rax, %rax + // CHECK: ltrw 485498096 // CHECK: encoding: [0x0f,0x00,0x1c,0x25,0xf0,0x1c,0xf0,0x1c] ltrw 485498096 From 40e2065d979b4417641105cff4a5ee9bbf67aebd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 15 Jul 2020 23:50:29 -0700 Subject: [PATCH 027/363] [X86] Allow lsl/lar to be parsed with a GR16, GR32, or GR64 as source register. This matches GNU assembler behavior. Operand size is determined only from the destination register. 
(cherry picked from commit 71b49aa438b22b02230fff30e8874ff756336e6d) --- llvm/lib/Target/X86/AsmParser/X86Operand.h | 18 +++++++++++++++++- llvm/lib/Target/X86/X86InstrInfo.td | 9 ++++++++- llvm/lib/Target/X86/X86InstrSystem.td | 16 ++++++---------- llvm/test/MC/X86/I286-32.s | 4 ++-- llvm/test/MC/X86/I286-64.s | 16 ++++++++++++---- llvm/utils/TableGen/X86RecognizableInstr.cpp | 3 +++ 6 files changed, 48 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 5cf4516ede97..e32335331879 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -463,7 +463,14 @@ struct X86Operand final : public MCParsedAsmOperand { bool isGR32orGR64() const { return Kind == Register && (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) || - X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); + X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); + } + + bool isGR16orGR32orGR64() const { + return Kind == Register && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); } bool isVectorReg() const { @@ -520,6 +527,15 @@ struct X86Operand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(RegNo)); } + void addGR16orGR32orGR64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + MCRegister RegNo = getReg(); + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) + RegNo = getX86SubSuperRegister(RegNo, 16); + Inst.addOperand(MCOperand::createReg(RegNo)); + } + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); diff --git a/llvm/lib/Target/X86/X86InstrInfo.td 
b/llvm/lib/Target/X86/X86InstrInfo.td index 23841c3d7e50..3ea0ae8a8840 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -640,10 +640,17 @@ class ImmSExtAsmOperandClass : AsmOperandClass { def X86GR32orGR64AsmOperand : AsmOperandClass { let Name = "GR32orGR64"; } - def GR32orGR64 : RegisterOperand { let ParserMatchClass = X86GR32orGR64AsmOperand; } + +def X86GR16orGR32orGR64AsmOperand : AsmOperandClass { + let Name = "GR16orGR32orGR64"; +} +def GR16orGR32orGR64 : RegisterOperand { + let ParserMatchClass = X86GR16orGR32orGR64AsmOperand; +} + def AVX512RCOperand : AsmOperandClass { let Name = "AVX512RC"; } diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index d5f10646d80a..13659b5c456e 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -207,45 +207,41 @@ let mayLoad = 1 in def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, NotMemoryFoldable; -def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), +def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16orGR32orGR64:$src), "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, NotMemoryFoldable; -// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. let mayLoad = 1 in def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "lar{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, NotMemoryFoldable; -def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), +def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR16orGR32orGR64:$src), "lar{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, NotMemoryFoldable; -// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo. 
let mayLoad = 1 in def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; -def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32orGR64:$src), +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR16orGR32orGR64:$src), "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; -// i16mem operand in LSL32rm and GR32 operand in LSL32rr is not a typo. let mayLoad = 1 in def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, NotMemoryFoldable; -def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), +def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16orGR32orGR64:$src), "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, NotMemoryFoldable; -// i16mem operand in LSL64rm and GR32 operand in LSL64rr is not a typo. let mayLoad = 1 in def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, NotMemoryFoldable; -def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), +def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR16orGR32orGR64:$src), "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, NotMemoryFoldable; let mayLoad = 1 in def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; -def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR32orGR64:$src), +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR16orGR32orGR64:$src), "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; diff --git a/llvm/test/MC/X86/I286-32.s b/llvm/test/MC/X86/I286-32.s index 0d463669f34a..648de019127f 100644 --- a/llvm/test/MC/X86/I286-32.s +++ b/llvm/test/MC/X86/I286-32.s @@ -24,7 +24,7 @@ larl 485498096(%edx), %eax // CHECK: encoding: 
[0x0f,0x02,0x44,0x02,0x40] larl 64(%edx,%eax), %eax -// CHECK: larl %eax, %eax +// CHECK: larl %ax, %eax // CHECK: encoding: [0x0f,0x02,0xc0] larl %eax, %eax @@ -100,7 +100,7 @@ lsll 485498096(%edx), %eax // CHECK: encoding: [0x0f,0x03,0x44,0x02,0x40] lsll 64(%edx,%eax), %eax -// CHECK: lsll %eax, %eax +// CHECK: lsll %ax, %eax // CHECK: encoding: [0x0f,0x03,0xc0] lsll %eax, %eax diff --git a/llvm/test/MC/X86/I286-64.s b/llvm/test/MC/X86/I286-64.s index 1bab0a64f3e2..7707d7ba4d58 100644 --- a/llvm/test/MC/X86/I286-64.s +++ b/llvm/test/MC/X86/I286-64.s @@ -24,7 +24,7 @@ larl -64(%rdx,%rax,4), %r13d // CHECK: encoding: [0x44,0x0f,0x02,0x6c,0x02,0x40] larl 64(%rdx,%rax), %r13d -// CHECK: larl %r13d, %r13d +// CHECK: larl %r13w, %r13d // CHECK: encoding: [0x45,0x0f,0x02,0xed] larl %r13d, %r13d @@ -32,7 +32,11 @@ larl %r13d, %r13d // CHECK: encoding: [0x44,0x0f,0x02,0x2a] larl (%rdx), %r13d -// CHECK: larq %eax, %rax +// CHECK: larq %ax, %rax +// CHECK: encoding: [0x48,0x0f,0x02,0xc0] +lar %ax, %rax + +// CHECK: larq %ax, %rax // CHECK: encoding: [0x48,0x0f,0x02,0xc0] lar %rax, %rax @@ -160,7 +164,7 @@ lsll -64(%rdx,%rax,4), %r13d // CHECK: encoding: [0x44,0x0f,0x03,0x6c,0x02,0x40] lsll 64(%rdx,%rax), %r13d -// CHECK: lsll %r13d, %r13d +// CHECK: lsll %r13w, %r13d // CHECK: encoding: [0x45,0x0f,0x03,0xed] lsll %r13d, %r13d @@ -168,7 +172,11 @@ lsll %r13d, %r13d // CHECK: encoding: [0x44,0x0f,0x03,0x2a] lsll (%rdx), %r13d -// CHECK: lslq %eax, %rax +// CHECK: lslq %ax, %rax +// CHECK: encoding: [0x48,0x0f,0x03,0xc0] +lsl %ax, %rax + +// CHECK: lslq %ax, %rax // CHECK: encoding: [0x48,0x0f,0x03,0xc0] lsl %rax, %rax diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp index 84f6d5210d74..6a245b5eb425 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -874,6 +874,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s, TYPE("i16imm", TYPE_IMM) 
TYPE("i16i8imm", TYPE_IMM) TYPE("GR16", TYPE_R16) + TYPE("GR16orGR32orGR64", TYPE_R16) TYPE("i32mem", TYPE_M) TYPE("i32imm", TYPE_IMM) TYPE("i32i8imm", TYPE_IMM) @@ -1035,6 +1036,7 @@ RecognizableInstr::rmRegisterEncodingFromString(const std::string &s, ENCODING("RST", ENCODING_FP) ENCODING("RSTi", ENCODING_FP) ENCODING("GR16", ENCODING_RM) + ENCODING("GR16orGR32orGR64",ENCODING_RM) ENCODING("GR32", ENCODING_RM) ENCODING("GR32orGR64", ENCODING_RM) ENCODING("GR64", ENCODING_RM) @@ -1072,6 +1074,7 @@ OperandEncoding RecognizableInstr::roRegisterEncodingFromString(const std::string &s, uint8_t OpSize) { ENCODING("GR16", ENCODING_REG) + ENCODING("GR16orGR32orGR64",ENCODING_REG) ENCODING("GR32", ENCODING_REG) ENCODING("GR32orGR64", ENCODING_REG) ENCODING("GR64", ENCODING_REG) From c467be74df68bc8c6db0b70ee64d0fc4616821d5 Mon Sep 17 00:00:00 2001 From: Eric Astor Date: Wed, 15 Jul 2020 15:00:34 -0400 Subject: [PATCH 028/363] [ms] [llvm-ml] Remove unused function Summary: Remove unused function Reviewed By: lbenes Differential Revision: https://reviews.llvm.org/D83898 (cherry picked from commit 47a3b85a97136fca4a388646cbaec10b71414b60) --- llvm/lib/MC/MCParser/MasmParser.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 58c22b2ccef2..e2aaeaae03b0 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -812,9 +812,6 @@ class MasmParser : public MCAsmParser { const StructInitializer &Initializer); // User-defined types (structs, unions): - bool emitStructValue(const StructInfo &Structure, - const StructInitializer &Initializer, - size_t InitialOffset = 0, size_t InitialField = 0); bool emitStructValues(const StructInfo &Structure); bool addStructField(StringRef Name, const StructInfo &Structure); bool parseDirectiveStructValue(const StructInfo &Structure, From e05c7e400f3afb9b6ec47d23a46bc68ef906ee77 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: 
Mon, 20 Jul 2020 17:48:00 +0200 Subject: [PATCH 029/363] Require shell for lld/test/ELF/arm-exidx-range.s The test fails in 32-bit Windows builds for unclear reasons: ld.lld: error: failed to open C:\src\llvm_package_1100-rc1\build32_stage0\tools\lld\test\ELF\Output\arm-exidx-range.s.tmp: The parameter is incorrect. (cherry picked from commit 8a197e0b16f2a0f560633f70886f4cdf3b7e20b4) --- lld/test/ELF/arm-exidx-range.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/ELF/arm-exidx-range.s b/lld/test/ELF/arm-exidx-range.s index 69763705276c..623c6a81c97b 100644 --- a/lld/test/ELF/arm-exidx-range.s +++ b/lld/test/ELF/arm-exidx-range.s @@ -1,4 +1,4 @@ -// REQUIRES: arm +// REQUIRES: arm, shell // RUN: llvm-mc --arm-add-build-attributes --triple=armv7a-linux-gnueabihf -filetype=obj %s -o %t.o // RUN: echo "SECTIONS { \ // RUN: . = 0x80000000; \ From bf2f2bffee244318a86fea90a0dfaf9be8a71b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 17 Jul 2020 23:35:45 +0300 Subject: [PATCH 030/363] [LLDB] [COFF] Fix handling of symbols with more than one aux symbol Differential Revision: https://reviews.llvm.org/D84070 (cherry picked from commit f07ddbc9c4b66e91aa7a106042512ee903b6b3ba) --- .../Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 2 +- lldb/test/Shell/ObjectFile/PECOFF/symbol.yaml | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index d606b49130c4..dac2c496423f 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -698,7 +698,7 @@ Symtab *ObjectFilePECOFF::GetSymtab() { if (symbol.naux > 0) { i += symbol.naux; - offset += symbol_size; + offset += symbol.naux * symbol_size; } } } diff --git a/lldb/test/Shell/ObjectFile/PECOFF/symbol.yaml b/lldb/test/Shell/ObjectFile/PECOFF/symbol.yaml 
index 1d79e702333b..9dbc93b9b918 100644 --- a/lldb/test/Shell/ObjectFile/PECOFF/symbol.yaml +++ b/lldb/test/Shell/ObjectFile/PECOFF/symbol.yaml @@ -1,6 +1,9 @@ # RUN: yaml2obj %s > %t # RUN: lldb-test symbols %t | FileCheck %s +## The .file symbol isn't checked, but is included to test that the symbol +## table iteration handles cases with a symbol with more than one aux symbol. + # CHECK: Type File Address/Value {{.*}} Size Flags Name # CHECK: Code 0x0000000040001000 0x{{[0-9a-f]+}} 0x{{[0-9a-f]+}} entry # CHECK: 0x0000000040002000 0x{{[0-9a-f]+}} 0x{{[0-9a-f]+}} variable @@ -101,6 +104,13 @@ symbols: SimpleType: IMAGE_SYM_TYPE_NULL ComplexType: IMAGE_SYM_DTYPE_NULL StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: .file + Value: 0 + SectionNumber: -2 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_FILE + File: longfilenameusingtwoauxsymbols - Name: entry Value: 0 SectionNumber: 1 From cebd637c88624dfd44520848cb1f43dc8a02ba80 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 15 Jul 2020 09:17:42 -0400 Subject: [PATCH 031/363] [ConstantFolding] check applicability of AllOnes constant creation first The getAllOnesValue can only handle things that are bitcast from a ConstantInt, while here we bitcast through a pointer, so we may see more complex objects (like Array or Struct). 
Differential Revision: https://reviews.llvm.org/D83870 (cherry picked from commit 8b354cc8db413f596c95b4f3240fabaa3e2c931e) --- llvm/lib/Analysis/ConstantFolding.cpp | 8 +++- llvm/test/Analysis/ConstantFolding/allones.ll | 46 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Analysis/ConstantFolding/allones.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 8c66decaaf58..6feffcbb98e1 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -342,8 +342,12 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy, // pointers legally). if (C->isNullValue() && !DestTy->isX86_MMXTy()) return Constant::getNullValue(DestTy); - if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() && - !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types! + if (C->isAllOnesValue() && + (DestTy->isIntegerTy() || DestTy->isFloatingPointTy() || + DestTy->isVectorTy()) && + !DestTy->isX86_MMXTy() && !DestTy->isPtrOrPtrVectorTy()) + // Get ones when the input is trivial, but + // only for supported types inside getAllOnesValue. 
return Constant::getAllOnesValue(DestTy); // If the type sizes are the same and a cast is legal, just directly diff --git a/llvm/test/Analysis/ConstantFolding/allones.ll b/llvm/test/Analysis/ConstantFolding/allones.ll new file mode 100644 index 000000000000..1315b3628475 --- /dev/null +++ b/llvm/test/Analysis/ConstantFolding/allones.ll @@ -0,0 +1,46 @@ +; RUN: opt -early-cse -S -o - %s | FileCheck %s +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64-ni:2" +target triple = "armv7-unknown-linux-gnueabi" + +%struct.anon = type { i32 } + +@onesstruct = private constant %struct.anon { i32 -1 }, align 4 + +define i32 @allones_struct() { +; CHECK-LABEL: @allones_struct() +; CHECK-NEXT: %1 = load [1 x i32], [1 x i32]* bitcast (%struct.anon* @onesstruct to [1 x i32]*), align 4 +; CHECK-NEXT: %2 = extractvalue [1 x i32] %1, 0 +; CHECK-NEXT: ret i32 %2 + %1 = load [1 x i32], [1 x i32]* bitcast (%struct.anon* @onesstruct to [1 x i32]*), align 4 + %2 = extractvalue [1 x i32] %1, 0 + ret i32 %2 +} + +define i32 @allones_int() { +; CHECK-LABEL: @allones_int() +; CHECK-NEXT: ret i32 -1 + %1 = load i32, i32* bitcast (%struct.anon* @onesstruct to i32*), align 4 + ret i32 %1 +} + +define i32* @allones_ptr() { +; CHECK-LABEL: @allones_ptr() +; CHECK-NEXT: ret i32* inttoptr (i32 -1 to i32*) + %1 = load i32*, i32** bitcast (%struct.anon* @onesstruct to i32**), align 4 + ret i32* %1 +} + +define i32 addrspace(1)* @allones_ptr1() { +; CHECK-LABEL: @allones_ptr1() +; CHECK-NEXT: ret i32 addrspace(1)* inttoptr (i32 -1 to i32 addrspace(1)*) + %1 = load i32 addrspace(1)*, i32 addrspace(1)** bitcast (%struct.anon* @onesstruct to i32 addrspace(1)**), align 4 + ret i32 addrspace(1)* %1 +} + +define i32 addrspace(2)* @allones_ptr2() { +; CHECK-LABEL: @allones_ptr2() +; CHECK-NEXT: %1 = load i32 addrspace(2)*, i32 addrspace(2)** bitcast (%struct.anon* @onesstruct to i32 addrspace(2)**), align 4 +; CHECK-NEXT: ret i32 addrspace(2)* %1 + %1 = load i32 addrspace(2)*, i32 
addrspace(2)** bitcast (%struct.anon* @onesstruct to i32 addrspace(2)**), align 4 + ret i32 addrspace(2)* %1 +} From 021056563632ca625dab6e8270cec2e2fae7a8cb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 20 Jul 2020 09:58:31 -0700 Subject: [PATCH 032/363] [LLVMgold.so] -plugin-opt=save-temps: save combined module to .lto.o instead of .o This matches LLD and fixes https://sourceware.org/bugzilla/show_bug.cgi?id=26262#c1 .o is a bad choice for save-temps output because it is easy to override the bitcode file (*.o) ``` # Use bfd for the example, -fuse-ld=gold is similar. clang -flto -c a.c # generate bitcode file a.o clang -fuse-ld=bfd -flto a.o -o a -Wl,-plugin-opt=save-temps # override a.o # The user repeats the command but get surprised, because a.o is now a combined module. clang -fuse-ld=bfd -flto a.o -o a -Wl,-plugin-opt=save-temps ``` Reviewed By: tejohnson Differential Revision: https://reviews.llvm.org/D84132 (cherry picked from commit 55fa315b0352b63454206600d6803fafacb42d5e) --- llvm/test/tools/gold/X86/parallel.ll | 6 +++--- llvm/test/tools/gold/X86/relocation-model-pic.ll | 16 ++++++++-------- llvm/test/tools/gold/X86/thinlto.ll | 2 +- llvm/tools/gold/gold-plugin.cpp | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/test/tools/gold/X86/parallel.ll b/llvm/test/tools/gold/X86/parallel.ll index dabb5c469319..6972efc652a3 100644 --- a/llvm/test/tools/gold/X86/parallel.ll +++ b/llvm/test/tools/gold/X86/parallel.ll @@ -1,10 +1,10 @@ ; RUN: llvm-as -o %t.bc %s -; RUN: rm -f %t.0.5.precodegen.bc %t.1.5.precodegen.bc %t.o %t.o1 +; RUN: rm -f %t.0.5.precodegen.bc %t.1.5.precodegen.bc %t.lto.o %t.lto.o1 ; RUN: env LD_PRELOAD=%llvmshlibdir/LLVMgold%shlibext %gold -plugin %llvmshlibdir/LLVMgold%shlibext -u foo -u bar -plugin-opt lto-partitions=2 -plugin-opt save-temps -m elf_x86_64 -o %t %t.bc ; RUN: llvm-dis %t.0.5.precodegen.bc -o - | FileCheck --check-prefix=CHECK-BC0 %s ; RUN: llvm-dis %t.1.5.precodegen.bc -o - | FileCheck 
--check-prefix=CHECK-BC1 %s -; RUN: llvm-nm %t.o | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-nm %t.o1 | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-nm %t.lto.o | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-nm %t.lto.o1 | FileCheck --check-prefix=CHECK1 %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/tools/gold/X86/relocation-model-pic.ll b/llvm/test/tools/gold/X86/relocation-model-pic.ll index ad7d2981e9ef..98034e49b0a0 100644 --- a/llvm/test/tools/gold/X86/relocation-model-pic.ll +++ b/llvm/test/tools/gold/X86/relocation-model-pic.ll @@ -10,44 +10,44 @@ ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --shared \ ; RUN: --plugin-opt=save-temps %t.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=PIC ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --export-dynamic --noinhibit-exec -pie \ ; RUN: --plugin-opt=save-temps %t.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=PIC ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --export-dynamic --noinhibit-exec \ ; RUN: --plugin-opt=save-temps %t.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=STATIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=STATIC ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: -r \ ; RUN: --plugin-opt=save-temps %t.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=STATIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=STATIC ;; PIC source. 
; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --shared \ ; RUN: --plugin-opt=save-temps %t.pic.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=PIC ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --export-dynamic --noinhibit-exec -pie \ ; RUN: --plugin-opt=save-temps %t.pic.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=PIC ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --export-dynamic --noinhibit-exec \ ; RUN: --plugin-opt=save-temps %t.pic.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=STATIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=STATIC ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: -r \ ; RUN: --plugin-opt=save-temps %t.pic.o -o %t-out -; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC +; RUN: llvm-readobj -r %t-out.lto.o | FileCheck %s --check-prefix=PIC ; PIC: R_X86_64_GOTPCREL foo diff --git a/llvm/test/tools/gold/X86/thinlto.ll b/llvm/test/tools/gold/X86/thinlto.ll index 6857778b55d5..3ed07ffd5eed 100644 --- a/llvm/test/tools/gold/X86/thinlto.ll +++ b/llvm/test/tools/gold/X86/thinlto.ll @@ -42,7 +42,7 @@ ; RUN: llvm-bcanalyzer -dump %t4.index.bc | FileCheck %s --check-prefix=COMBINED ; RUN: llvm-nm %t4 | FileCheck %s --check-prefix=NM ; Ensure ld does not emit empty combined module in default. 
-; RUN: ls %t4.o* | count 2 +; RUN: ls %t4.lto.o* | count 2 ; Check with --no-map-whole-files ; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index 7654d44ba008..0124e0a93e8a 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -1050,7 +1050,7 @@ static std::vector, bool>> runLTO() { if (!options::obj_path.empty()) Filename = options::obj_path; else if (options::TheOutputType == options::OT_SAVE_TEMPS) - Filename = output_name + ".o"; + Filename = output_name + ".lto.o"; else if (options::TheOutputType == options::OT_ASM_ONLY) Filename = output_name; bool SaveTemps = !Filename.empty(); From 8f8ec9927623eb54f003993b3d8411fc0130ca90 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 21 Jul 2020 10:18:10 -0700 Subject: [PATCH 033/363] [LLVMgold.so][test] Fix tests after D84132/55fa315b0352 (cherry picked from commit aa830e9768303ff8d27c015759294c4ce704d50c) --- llvm/test/tools/gold/X86/cache.ll | 3 +-- llvm/test/tools/gold/X86/emit-llvm.ll | 2 +- llvm/test/tools/gold/X86/relax-relocs.ll | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/test/tools/gold/X86/cache.ll b/llvm/test/tools/gold/X86/cache.ll index 4446aa6d8878..5ab556302558 100644 --- a/llvm/test/tools/gold/X86/cache.ll +++ b/llvm/test/tools/gold/X86/cache.ll @@ -67,8 +67,7 @@ ; With save-temps we can confirm that the cached files were copied into temp ; files to avoid a race condition with the cached files being pruned, since the ; gold plugin-api only accepts native objects passed back as files. 
-; RUN: ls %t4.o.o1 -; RUN: ls %t4.o.o2 +; RUN: ls %t4.o.lto.o1 %t4.o.lto.o2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/tools/gold/X86/emit-llvm.ll b/llvm/test/tools/gold/X86/emit-llvm.ll index 8bab561889ae..f1f6b22a9c3c 100644 --- a/llvm/test/tools/gold/X86/emit-llvm.ll +++ b/llvm/test/tools/gold/X86/emit-llvm.ll @@ -12,7 +12,7 @@ ; RUN: llvm-dis %t3.o.0.2.internalize.bc -o - | FileCheck %s ; RUN: llvm-dis %t3.o.0.4.opt.bc -o - | FileCheck --check-prefix=OPT %s ; RUN: llvm-dis %t3.o.0.4.opt.bc -o - | FileCheck --check-prefix=OPT2 %s -; RUN: llvm-nm %t3.o.o | FileCheck --check-prefix=NM %s +; RUN: llvm-nm %t3.o.lto.o | FileCheck --check-prefix=NM %s ; RUN: rm -f %t4.o ; RUN: %gold -plugin %llvmshlibdir/LLVMgold%shlibext \ diff --git a/llvm/test/tools/gold/X86/relax-relocs.ll b/llvm/test/tools/gold/X86/relax-relocs.ll index f62125c48d1f..3ad79f73cb84 100644 --- a/llvm/test/tools/gold/X86/relax-relocs.ll +++ b/llvm/test/tools/gold/X86/relax-relocs.ll @@ -2,7 +2,7 @@ ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=save-temps \ ; RUN: -shared %t.o -o %t.so -; RUN: llvm-readobj -r %t.so.o | FileCheck %s +; RUN: llvm-readobj -r %t.so.lto.o | FileCheck %s ; Test that we produce R_X86_64_GOTPCREL instead of R_X86_64_GOTPCRELX ; CHECK: R_X86_64_GOTPCREL foo From 764e28231e4b3b04748ac1c85576402bb76919de Mon Sep 17 00:00:00 2001 From: Sylvain Audi Date: Thu, 9 Jul 2020 17:12:02 -0400 Subject: [PATCH 034/363] [LLD][COFF] Skip computation of the undefined symbols references that are not shown The "undefined symbol" error message from lld-link displays up to 3 references to that symbol, and the number of extra references not shown. This patch removes the computation of the strings for those extra references. It fixes a freeze of lld-link we accidentally encountered when activating asan on a large project, without linking with the asan library. 
In that case, __asan_report_load8 was referenced more than 2 million times, causing the computation of that many display strings, of which only 3 were used. Differential Revision: https://reviews.llvm.org/D83510 (cherry picked from commit 3a108ab256dba7b5a7304f0e83818673d334405f) --- lld/COFF/SymbolTable.cpp | 60 +++++++++++++------ .../COFF/Inputs/undefined-symbol-multi-lto.ll | 23 +++++++ lld/test/COFF/undefined-symbol-multi.s | 7 ++- 3 files changed, 69 insertions(+), 21 deletions(-) create mode 100644 lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index d4d2a159a639..173e32f628ef 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -136,12 +136,16 @@ getFileLine(const SectionChunk *c, uint32_t addr) { // of all references to that symbol from that file. If no debug information is // available, returns just the name of the file, else one string per actual // reference as described in the debug info. -std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex) { +// Returns up to maxStrings string descriptions, along with the total number of +// locations found. 
+static std::pair, size_t> +getSymbolLocations(ObjFile *file, uint32_t symIndex, size_t maxStrings) { struct Location { Symbol *sym; std::pair fileLine; }; std::vector locations; + size_t numLocations = 0; for (Chunk *c : file->getChunks()) { auto *sc = dyn_cast(c); @@ -150,6 +154,10 @@ std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex) { for (const coff_relocation &r : sc->getRelocs()) { if (r.SymbolTableIndex != symIndex) continue; + numLocations++; + if (locations.size() >= maxStrings) + continue; + Optional> fileLine = getFileLine(sc, r.VirtualAddress); Symbol *sym = getSymbol(sc, r.VirtualAddress); @@ -160,8 +168,12 @@ std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex) { } } - if (locations.empty()) - return std::vector({"\n>>> referenced by " + toString(file)}); + if (maxStrings == 0) + return std::make_pair(std::vector(), numLocations); + + if (numLocations == 0) + return std::make_pair( + std::vector{"\n>>> referenced by " + toString(file)}, 1); std::vector symbolLocations(locations.size()); size_t i = 0; @@ -175,17 +187,26 @@ std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex) { if (loc.sym) os << ":(" << toString(*loc.sym) << ')'; } - return symbolLocations; + return std::make_pair(symbolLocations, numLocations); +} + +std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex) { + return getSymbolLocations(file, symIndex, SIZE_MAX).first; } -std::vector getSymbolLocations(InputFile *file, - uint32_t symIndex) { +static std::pair, size_t> +getSymbolLocations(InputFile *file, uint32_t symIndex, size_t maxStrings) { if (auto *o = dyn_cast(file)) - return getSymbolLocations(o, symIndex); - if (auto *b = dyn_cast(file)) - return getSymbolLocations(b); + return getSymbolLocations(o, symIndex, maxStrings); + if (auto *b = dyn_cast(file)) { + std::vector symbolLocations = getSymbolLocations(b); + size_t numLocations = symbolLocations.size(); + if (symbolLocations.size() > maxStrings) + 
symbolLocations.resize(maxStrings); + return std::make_pair(symbolLocations, numLocations); + } llvm_unreachable("unsupported file type passed to getSymbolLocations"); - return {}; + return std::make_pair(std::vector(), (size_t)0); } // For an undefined symbol, stores all files referencing it and the index of @@ -205,20 +226,21 @@ static void reportUndefinedSymbol(const UndefinedDiag &undefDiag) { os << "undefined symbol: " << toString(*undefDiag.sym); const size_t maxUndefReferences = 3; - size_t i = 0, numRefs = 0; + size_t numDisplayedRefs = 0, numRefs = 0; for (const UndefinedDiag::File &ref : undefDiag.files) { - std::vector symbolLocations = - getSymbolLocations(ref.file, ref.symIndex); - numRefs += symbolLocations.size(); + std::vector symbolLocations; + size_t totalLocations = 0; + std::tie(symbolLocations, totalLocations) = getSymbolLocations( + ref.file, ref.symIndex, maxUndefReferences - numDisplayedRefs); + + numRefs += totalLocations; + numDisplayedRefs += symbolLocations.size(); for (const std::string &s : symbolLocations) { - if (i >= maxUndefReferences) - break; os << s; - i++; } } - if (i < numRefs) - os << "\n>>> referenced " << numRefs - i << " more times"; + if (numDisplayedRefs < numRefs) + os << "\n>>> referenced " << numRefs - numDisplayedRefs << " more times"; errorOrWarn(os.str()); } diff --git a/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll b/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll new file mode 100644 index 000000000000..5f6730272e61 --- /dev/null +++ b/lld/test/COFF/Inputs/undefined-symbol-multi-lto.ll @@ -0,0 +1,23 @@ +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-windows-msvc" + +define dso_local i32 @"?baz@@YAHXZ"() #0 { + %1 = call i32 @"?foo@@YAHXZ"() + %2 = call i32 @"?foo@@YAHXZ"() + %3 = call i32 @"?bar@@YAHXZ"() + %4 = call i32 @"?bar@@YAHXZ"() + ret i32 0 +} + +declare dso_local i32 @"?foo@@YAHXZ"() #1 + +declare dso_local i32 
@"?bar@@YAHXZ"() #1 + +attributes #0 = { noinline optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 2} +!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git bed3e1a99b41f5a9525bc0edf12ecbcf63aab0cf)"} diff --git a/lld/test/COFF/undefined-symbol-multi.s b/lld/test/COFF/undefined-symbol-multi.s index 2d3d8285338c..086d12828b2d 100644 --- a/lld/test/COFF/undefined-symbol-multi.s +++ b/lld/test/COFF/undefined-symbol-multi.s @@ -16,17 +16,20 @@ # RUN: echo ' call "?foo@@YAHXZ"' >> %t.moreref.s # RUN: echo ' call "?foo@@YAHXZ"' >> %t.moreref.s # RUN: llvm-mc -triple=x86_64-windows-msvc -filetype=obj -o %t2.obj %t.moreref.s -# RUN: not lld-link /out:/dev/null %t.obj %t2.obj 2>&1 | FileCheck %s +# RUN: llvm-as %S/Inputs/undefined-symbol-multi-lto.ll -o %t3.obj +# RUN: not lld-link /out:/dev/null %t.obj %t2.obj %t3.obj 2>&1 | FileCheck %s # CHECK: error: undefined symbol: int __cdecl foo(void) # CHECK-NEXT: >>> referenced by {{.*}}tmp.obj:(main) # CHECK-NEXT: >>> referenced by {{.*}}tmp.obj:(main) # CHECK-NEXT: >>> referenced by 
{{.*}}tmp2.obj:(bar) -# CHECK-NEXT: >>> referenced 9 more times +# CHECK-NEXT: >>> referenced 10 more times # CHECK-EMPTY: # CHECK-NEXT: error: undefined symbol: int __cdecl bar(void) # CHECK-NEXT: >>> referenced by {{.*}}.obj:(main) # CHECK-NEXT: >>> referenced by {{.*}}.obj:(f1) +# CHECK-NEXT: >>> referenced by {{.*}}undefined-symbol-multi-lto.ll +# CHECK-NEXT: >>> {{.*}}tmp3.obj .section .text,"xr",one_only,main .globl main From ba5bbd4bd00f8aacf379cdcb738b149a1f63166a Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Fri, 17 Jul 2020 08:36:30 +0000 Subject: [PATCH 035/363] [PowerPC] Precommit test case for PR46759. NFC. (cherry picked from commit 817767abeec8343b20de83f8b1b2c8c20bbbe00a) --- llvm/test/CodeGen/PowerPC/pr46759.ll | 58 ++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/pr46759.ll diff --git a/llvm/test/CodeGen/PowerPC/pr46759.ll b/llvm/test/CodeGen/PowerPC/pr46759.ll new file mode 100644 index 000000000000..2c0af8950099 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr46759.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs \ +; RUN: -mtriple=powerpc64le-linux-gnu < %s | FileCheck \ +; RUN: -check-prefix=CHECK-LE %s + +define void @foo(i32 %vla_size) #0 { +; CHECK-LE-LABEL: foo: +; CHECK-LE: # %bb.0: # %entry +; CHECK-LE-NEXT: std r31, -8(r1) +; CHECK-LE-NEXT: std r30, -16(r1) +; CHECK-LE-NEXT: mr r30, r1 +; CHECK-LE-NEXT: mr r12, r1 +; CHECK-LE-NEXT: .cfi_def_cfa r12, 0 +; CHECK-LE-NEXT: clrldi r0, r12, 53 +; CHECK-LE-NEXT: stdux r12, r1, r0 +; CHECK-LE-NEXT: stdu r12, -2048(r1) +; CHECK-LE-NEXT: stdu r12, -4096(r1) +; CHECK-LE-NEXT: .cfi_def_cfa_register r1 +; CHECK-LE-NEXT: .cfi_def_cfa_register r30 +; CHECK-LE-NEXT: .cfi_offset r31, -8 +; CHECK-LE-NEXT: .cfi_offset r30, -16 +; CHECK-LE-NEXT: clrldi r3, r3, 32 +; CHECK-LE-NEXT: li r6, -4096 +; CHECK-LE-NEXT: ld r4, 0(r1) +; CHECK-LE-NEXT: mr 
r31, r1 +; CHECK-LE-NEXT: addi r3, r3, 15 +; CHECK-LE-NEXT: rldicl r3, r3, 60, 4 +; CHECK-LE-NEXT: rldicl r3, r3, 4, 31 +; CHECK-LE-NEXT: neg r5, r3 +; CHECK-LE-NEXT: li r3, -2048 +; CHECK-LE-NEXT: divd r7, r5, r6 +; CHECK-LE-NEXT: and r3, r5, r3 +; CHECK-LE-NEXT: add r3, r1, r3 +; CHECK-LE-NEXT: mulld r6, r7, r6 +; CHECK-LE-NEXT: sub r5, r5, r6 +; CHECK-LE-NEXT: stdux r4, r1, r5 +; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: beq cr0, .LBB0_2 +; CHECK-LE-NEXT: .LBB0_1: # %entry +; CHECK-LE-NEXT: # +; CHECK-LE-NEXT: stdu r4, -4096(r1) +; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: bne cr0, .LBB0_1 +; CHECK-LE-NEXT: .LBB0_2: # %entry +; CHECK-LE-NEXT: addi r3, r1, 2048 +; CHECK-LE-NEXT: lbz r3, 0(r3) +; CHECK-LE-NEXT: ld r1, 0(r1) +; CHECK-LE-NEXT: ld r31, -8(r1) +; CHECK-LE-NEXT: ld r30, -16(r1) +; CHECK-LE-NEXT: blr +entry: + %0 = zext i32 %vla_size to i64 + %vla = alloca i8, i64 %0, align 2048 + %1 = load volatile i8, i8* %vla, align 2048 + ret void +} + +attributes #0 = { "probe-stack"="inline-asm" } From e95e071b6b68929527570cb830e5f3bc8b992e04 Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Wed, 22 Jul 2020 04:13:18 +0000 Subject: [PATCH 036/363] [PowerPC] Fix wrong codegen when stack pointer has to realign in prologue Current powerpc backend generates wrong code sequence if stack pointer has to realign if -fstack-clash-protection enabled. When probing in prologue, backend should generate a subtraction instruction rather than a `stux` instruction to realign the stack pointer. This patch is part of fix of https://bugs.llvm.org/show_bug.cgi?id=46759. 
Differential Revision: https://reviews.llvm.org/D84218 (cherry picked from commit 8912252252c87d8ef6623ecf9fdde444560ee4b9) --- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 7 +++---- llvm/test/CodeGen/PowerPC/pr46759.ll | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index bd9174c1973d..2ee394e9259d 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1466,11 +1466,10 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, .addImm(0) .addImm(32 - Log2(MaxAlign)) .addImm(31); - BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), + BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::SUBFC8 : PPC::SUBFC), SPReg) - .addReg(FPReg) - .addReg(SPReg) - .addReg(ScratchReg); + .addReg(ScratchReg) + .addReg(SPReg); } // Probe residual part. if (NegResidualSize) { diff --git a/llvm/test/CodeGen/PowerPC/pr46759.ll b/llvm/test/CodeGen/PowerPC/pr46759.ll index 2c0af8950099..4d3e8cadc21e 100644 --- a/llvm/test/CodeGen/PowerPC/pr46759.ll +++ b/llvm/test/CodeGen/PowerPC/pr46759.ll @@ -12,7 +12,7 @@ define void @foo(i32 %vla_size) #0 { ; CHECK-LE-NEXT: mr r12, r1 ; CHECK-LE-NEXT: .cfi_def_cfa r12, 0 ; CHECK-LE-NEXT: clrldi r0, r12, 53 -; CHECK-LE-NEXT: stdux r12, r1, r0 +; CHECK-LE-NEXT: subc r1, r1, r0 ; CHECK-LE-NEXT: stdu r12, -2048(r1) ; CHECK-LE-NEXT: stdu r12, -4096(r1) ; CHECK-LE-NEXT: .cfi_def_cfa_register r1 From c522fd02da1b0dcadeae041d12fe35e52ce0973f Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Wed, 22 Jul 2020 04:16:20 +0000 Subject: [PATCH 037/363] [PowerPC] Fix wrong codegen when stack pointer has to realign performing dynalloc Current powerpc backend generates wrong code sequence if stack pointer has to realign if `-fstack-clash-protection` enabled. 
When probing dynamic stack allocation, current `PREPARE_PROBED_ALLOCA` takes `NegSizeReg` as input and returns `FinalStackPtr`. `FinalStackPtr=StackPtr+ActualNegSize` is calculated correctly, however code following `PREPARE_PROBED_ALLOCA` still uses value of `NegSizeReg`, which does not contain `ActualNegSize` if `MaxAlign > TargetAlign`, to calculate loop trip count and residual number of bytes. This patch is part of fix of https://bugs.llvm.org/show_bug.cgi?id=46759. Differential Revision: https://reviews.llvm.org/D84152 (cherry picked from commit c3f9697f1f227296818fbaf1a770a29842ea454c) --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 36 ++- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 9 +- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 9 +- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 37 ++- llvm/test/CodeGen/PowerPC/pr46759.ll | 27 +- .../PowerPC/stack-clash-dynamic-alloca.ll | 240 +++++++++--------- 6 files changed, 198 insertions(+), 160 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ddfbd04e1ebc..11454841cab7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11950,18 +11950,34 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, Register SPReg = isPPC64 ? PPC::X1 : PPC::R1; Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); - - // Get the canonical FinalStackPtr like what - // PPCRegisterInfo::lowerDynamicAlloc does. - BuildMI(*MBB, {MI}, DL, - TII->get(isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 - : PPC::PREPARE_PROBED_ALLOCA_32), - FramePointer) - .addDef(FinalStackPtr) + Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + + // Since value of NegSizeReg might be realigned in prologepilog, insert a + // PREPARE_PROBED_ALLOCA pseudo instruction to get actual FramePointer and + // NegSize. 
+ unsigned ProbeOpc; + if (!MRI.hasOneNonDBGUse(NegSizeReg)) + ProbeOpc = + isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32; + else + // By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg + // and NegSizeReg will be allocated in the same phyreg to avoid + // redundant copy when NegSizeReg has only one use which is current MI and + // will be replaced by PREPARE_PROBED_ALLOCA then. + ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64 + : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32; + BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer) + .addDef(ActualNegSizeReg) .addReg(NegSizeReg) .add(MI.getOperand(2)) .add(MI.getOperand(3)); + // Calculate final stack pointer, which equals to SP + ActualNegSize. + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), + FinalStackPtr) + .addReg(SPReg) + .addReg(ActualNegSizeReg); + // Materialize a scratch register for update. int64_t NegProbeSize = -(int64_t)ProbeSize; assert(isInt<32>(NegProbeSize) && "Unhandled probe size!"); @@ -11982,7 +11998,7 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, // Probing leading residual part. Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div) - .addReg(NegSizeReg) + .addReg(ActualNegSizeReg) .addReg(ScratchReg); Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul) @@ -11991,7 +12007,7 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod) .addReg(Mul) - .addReg(NegSizeReg); + .addReg(ActualNegSizeReg); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? 
PPC::STDUX : PPC::STWUX), SPReg) .addReg(FramePointer) .addReg(SPReg) diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 1c457d4170d5..6956c40a70be 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -431,9 +431,14 @@ def PROBED_ALLOCA_64 : PPCCustomInserterPseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_64", [(set i64:$result, (PPCprobedalloca i64:$negsize, iaddr:$fpsi))]>; -def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs g8rc:$fp, - g8rc:$sp), +def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs + g8rc:$fp, g8rc:$actual_negsize), (ins g8rc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_64", []>; +def PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64 : PPCEmitTimePseudo<(outs + g8rc:$fp, g8rc:$actual_negsize), + (ins g8rc:$negsize, memri:$fpsi), + "#PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64", []>, + RegConstraint<"$actual_negsize = $negsize">; def PROBED_STACKALLOC_64 : PPCEmitTimePseudo<(outs g8rc:$scratch, g8rc:$temp), (ins i64imm:$stacksize), "#PROBED_STACKALLOC_64", []>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 673ab63039cf..fedbf592af39 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1406,9 +1406,14 @@ def PROBED_ALLOCA_32 : PPCCustomInserterPseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_32", [(set i32:$result, (PPCprobedalloca i32:$negsize, iaddr:$fpsi))]>; -def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs gprc:$fp, - gprc:$sp), +def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs + gprc:$fp, gprc:$actual_negsize), (ins gprc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_32", []>; +def PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32 : PPCEmitTimePseudo<(outs + gprc:$fp, gprc:$actual_negsize), + (ins gprc:$negsize, memri:$fpsi), + "#PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32", 
[]>, + RegConstraint<"$actual_negsize = $negsize">; def PROBED_STACKALLOC_32 : PPCEmitTimePseudo<(outs gprc:$scratch, gprc:$temp), (ins i64imm:$stacksize), "#PROBED_STACKALLOC_32", []>; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 35f5e1fbebcd..ed8948a63972 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -624,21 +624,30 @@ void PPCRegisterInfo::lowerPrepareProbedAlloca( bool LP64 = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); Register FramePointer = MI.getOperand(0).getReg(); - Register FinalStackPtr = MI.getOperand(1).getReg(); + const Register ActualNegSizeReg = MI.getOperand(1).getReg(); bool KillNegSizeReg = MI.getOperand(2).isKill(); Register NegSizeReg = MI.getOperand(2).getReg(); - prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, FramePointer); - if (LP64) { - BuildMI(MBB, II, dl, TII.get(PPC::ADD8), FinalStackPtr) - .addReg(PPC::X1) - .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); - - } else { - BuildMI(MBB, II, dl, TII.get(PPC::ADD4), FinalStackPtr) - .addReg(PPC::R1) - .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + const MCInstrDesc &CopyInst = TII.get(LP64 ? PPC::OR8 : PPC::OR); + // RegAllocator might allocate FramePointer and NegSizeReg in the same phyreg. + if (FramePointer == NegSizeReg) { + assert(KillNegSizeReg && "FramePointer is a def and NegSizeReg is an use, " + "NegSizeReg should be killed"); + // FramePointer is clobbered earlier than the use of NegSizeReg in + // prepareDynamicAlloca, save NegSizeReg in ActualNegSizeReg to avoid + // misuse. + BuildMI(MBB, II, dl, CopyInst, ActualNegSizeReg) + .addReg(NegSizeReg) + .addReg(NegSizeReg); + NegSizeReg = ActualNegSizeReg; + KillNegSizeReg = false; } - + prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, FramePointer); + // NegSizeReg might be updated in prepareDynamicAlloca if MaxAlign > + // TargetAlign. 
+ if (NegSizeReg != ActualNegSizeReg) + BuildMI(MBB, II, dl, CopyInst, ActualNegSizeReg) + .addReg(NegSizeReg) + .addReg(NegSizeReg); MBB.erase(II); } @@ -1084,7 +1093,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (FPSI && FrameIndex == FPSI && (OpC == PPC::PREPARE_PROBED_ALLOCA_64 || - OpC == PPC::PREPARE_PROBED_ALLOCA_32)) { + OpC == PPC::PREPARE_PROBED_ALLOCA_32 || + OpC == PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64 || + OpC == PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32)) { lowerPrepareProbedAlloca(II); return; } diff --git a/llvm/test/CodeGen/PowerPC/pr46759.ll b/llvm/test/CodeGen/PowerPC/pr46759.ll index 4d3e8cadc21e..d1d68a5db7e3 100644 --- a/llvm/test/CodeGen/PowerPC/pr46759.ll +++ b/llvm/test/CodeGen/PowerPC/pr46759.ll @@ -20,26 +20,27 @@ define void @foo(i32 %vla_size) #0 { ; CHECK-LE-NEXT: .cfi_offset r31, -8 ; CHECK-LE-NEXT: .cfi_offset r30, -16 ; CHECK-LE-NEXT: clrldi r3, r3, 32 -; CHECK-LE-NEXT: li r6, -4096 -; CHECK-LE-NEXT: ld r4, 0(r1) +; CHECK-LE-NEXT: li r5, -2048 ; CHECK-LE-NEXT: mr r31, r1 ; CHECK-LE-NEXT: addi r3, r3, 15 ; CHECK-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-LE-NEXT: rldicl r3, r3, 4, 31 -; CHECK-LE-NEXT: neg r5, r3 -; CHECK-LE-NEXT: li r3, -2048 -; CHECK-LE-NEXT: divd r7, r5, r6 -; CHECK-LE-NEXT: and r3, r5, r3 -; CHECK-LE-NEXT: add r3, r1, r3 -; CHECK-LE-NEXT: mulld r6, r7, r6 -; CHECK-LE-NEXT: sub r5, r5, r6 -; CHECK-LE-NEXT: stdux r4, r1, r5 -; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: neg r4, r3 +; CHECK-LE-NEXT: ld r3, 0(r1) +; CHECK-LE-NEXT: and r5, r4, r5 +; CHECK-LE-NEXT: mr r4, r5 +; CHECK-LE-NEXT: li r5, -4096 +; CHECK-LE-NEXT: divd r6, r4, r5 +; CHECK-LE-NEXT: mulld r5, r6, r5 +; CHECK-LE-NEXT: sub r5, r4, r5 +; CHECK-LE-NEXT: add r4, r1, r4 +; CHECK-LE-NEXT: stdux r3, r1, r5 +; CHECK-LE-NEXT: cmpd r1, r4 ; CHECK-LE-NEXT: beq cr0, .LBB0_2 ; CHECK-LE-NEXT: .LBB0_1: # %entry ; CHECK-LE-NEXT: # -; CHECK-LE-NEXT: stdu r4, -4096(r1) -; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: stdu r3, 
-4096(r1) +; CHECK-LE-NEXT: cmpd r1, r4 ; CHECK-LE-NEXT: bne cr0, .LBB0_1 ; CHECK-LE-NEXT: .LBB0_2: # %entry ; CHECK-LE-NEXT: addi r3, r1, 2048 diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll index 6c136e9a541c..9af43017d8db 100644 --- a/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/PowerPC/stack-clash-dynamic-alloca.ll @@ -18,23 +18,23 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwin ; CHECK-LE-NEXT: std r31, -8(r1) ; CHECK-LE-NEXT: stdu r1, -48(r1) ; CHECK-LE-NEXT: rldic r3, r3, 2, 30 -; CHECK-LE-NEXT: li r6, -32768 +; CHECK-LE-NEXT: li r5, -32768 ; CHECK-LE-NEXT: mr r31, r1 ; CHECK-LE-NEXT: addi r3, r3, 15 -; CHECK-LE-NEXT: addi r4, r31, 48 ; CHECK-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-LE-NEXT: rldicl r3, r3, 4, 29 -; CHECK-LE-NEXT: neg r5, r3 -; CHECK-LE-NEXT: divd r7, r5, r6 -; CHECK-LE-NEXT: add r3, r1, r5 -; CHECK-LE-NEXT: mulld r6, r7, r6 -; CHECK-LE-NEXT: sub r5, r5, r6 -; CHECK-LE-NEXT: stdux r4, r1, r5 -; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: neg r4, r3 +; CHECK-LE-NEXT: addi r3, r31, 48 +; CHECK-LE-NEXT: divd r6, r4, r5 +; CHECK-LE-NEXT: mulld r5, r6, r5 +; CHECK-LE-NEXT: sub r5, r4, r5 +; CHECK-LE-NEXT: add r4, r1, r4 +; CHECK-LE-NEXT: stdux r3, r1, r5 +; CHECK-LE-NEXT: cmpd r1, r4 ; CHECK-LE-NEXT: beq cr0, .LBB0_2 ; CHECK-LE-NEXT: .LBB0_1: -; CHECK-LE-NEXT: stdu r4, -32768(r1) -; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: stdu r3, -32768(r1) +; CHECK-LE-NEXT: cmpd r1, r4 ; CHECK-LE-NEXT: bne cr0, .LBB0_1 ; CHECK-LE-NEXT: .LBB0_2: ; CHECK-LE-NEXT: li r4, 1 @@ -58,15 +58,15 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwin ; CHECK-P9-LE-NEXT: divd r7, r5, r6 ; CHECK-P9-LE-NEXT: mulld r6, r7, r6 ; CHECK-P9-LE-NEXT: mr r31, r1 -; CHECK-P9-LE-NEXT: addi r4, r31, 48 -; CHECK-P9-LE-NEXT: add r3, r1, r5 +; CHECK-P9-LE-NEXT: addi r3, r31, 48 +; CHECK-P9-LE-NEXT: add r4, 
r1, r5 ; CHECK-P9-LE-NEXT: sub r5, r5, r6 -; CHECK-P9-LE-NEXT: stdux r4, r1, r5 -; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: stdux r3, r1, r5 +; CHECK-P9-LE-NEXT: cmpd r1, r4 ; CHECK-P9-LE-NEXT: beq cr0, .LBB0_2 ; CHECK-P9-LE-NEXT: .LBB0_1: -; CHECK-P9-LE-NEXT: stdu r4, -32768(r1) -; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: stdu r3, -32768(r1) +; CHECK-P9-LE-NEXT: cmpd r1, r4 ; CHECK-P9-LE-NEXT: bne cr0, .LBB0_1 ; CHECK-P9-LE-NEXT: .LBB0_2: ; CHECK-P9-LE-NEXT: addi r3, r1, 32 @@ -82,23 +82,23 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwin ; CHECK-BE-NEXT: std r31, -8(r1) ; CHECK-BE-NEXT: stdu r1, -64(r1) ; CHECK-BE-NEXT: rldic r3, r3, 2, 30 -; CHECK-BE-NEXT: li r6, -32768 +; CHECK-BE-NEXT: li r5, -32768 ; CHECK-BE-NEXT: addi r3, r3, 15 ; CHECK-BE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-BE-NEXT: mr r31, r1 ; CHECK-BE-NEXT: rldicl r3, r3, 4, 29 -; CHECK-BE-NEXT: addi r4, r31, 64 -; CHECK-BE-NEXT: neg r5, r3 -; CHECK-BE-NEXT: divd r7, r5, r6 -; CHECK-BE-NEXT: add r3, r1, r5 -; CHECK-BE-NEXT: mulld r6, r7, r6 -; CHECK-BE-NEXT: sub r5, r5, r6 -; CHECK-BE-NEXT: stdux r4, r1, r5 -; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: neg r4, r3 +; CHECK-BE-NEXT: divd r6, r4, r5 +; CHECK-BE-NEXT: addi r3, r31, 64 +; CHECK-BE-NEXT: mulld r5, r6, r5 +; CHECK-BE-NEXT: sub r5, r4, r5 +; CHECK-BE-NEXT: add r4, r1, r4 +; CHECK-BE-NEXT: stdux r3, r1, r5 +; CHECK-BE-NEXT: cmpd r1, r4 ; CHECK-BE-NEXT: beq cr0, .LBB0_2 ; CHECK-BE-NEXT: .LBB0_1: -; CHECK-BE-NEXT: stdu r4, -32768(r1) -; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: stdu r3, -32768(r1) +; CHECK-BE-NEXT: cmpd r1, r4 ; CHECK-BE-NEXT: bne cr0, .LBB0_1 ; CHECK-BE-NEXT: .LBB0_2: ; CHECK-BE-NEXT: li r4, 1 @@ -115,21 +115,21 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwin ; CHECK-32-NEXT: slwi r3, r3, 2 ; CHECK-32-NEXT: addi r3, r3, 15 ; CHECK-32-NEXT: rlwinm r3, r3, 0, 0, 27 -; CHECK-32-NEXT: neg r5, r3 -; CHECK-32-NEXT: li r6, -32768 -; 
CHECK-32-NEXT: divw r7, r5, r6 +; CHECK-32-NEXT: neg r4, r3 +; CHECK-32-NEXT: li r5, -32768 +; CHECK-32-NEXT: divw r6, r4, r5 ; CHECK-32-NEXT: stw r31, 28(r1) ; CHECK-32-NEXT: mr r31, r1 -; CHECK-32-NEXT: addi r4, r31, 32 -; CHECK-32-NEXT: add r3, r1, r5 -; CHECK-32-NEXT: mullw r6, r7, r6 -; CHECK-32-NEXT: sub r5, r5, r6 -; CHECK-32-NEXT: stwux r4, r1, r5 -; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: addi r3, r31, 32 +; CHECK-32-NEXT: mullw r5, r6, r5 +; CHECK-32-NEXT: sub r5, r4, r5 +; CHECK-32-NEXT: add r4, r1, r4 +; CHECK-32-NEXT: stwux r3, r1, r5 +; CHECK-32-NEXT: cmpw r1, r4 ; CHECK-32-NEXT: beq cr0, .LBB0_2 ; CHECK-32-NEXT: .LBB0_1: -; CHECK-32-NEXT: stwu r4, -32768(r1) -; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: stwu r3, -32768(r1) +; CHECK-32-NEXT: cmpw r1, r4 ; CHECK-32-NEXT: bne cr0, .LBB0_1 ; CHECK-32-NEXT: .LBB0_2: ; CHECK-32-NEXT: li r4, 1 @@ -154,23 +154,23 @@ define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind { ; CHECK-LE-NEXT: std r31, -8(r1) ; CHECK-LE-NEXT: stdu r1, -48(r1) ; CHECK-LE-NEXT: rldic r4, r3, 2, 30 -; CHECK-LE-NEXT: li r7, -4096 +; CHECK-LE-NEXT: li r6, -4096 ; CHECK-LE-NEXT: mr r31, r1 ; CHECK-LE-NEXT: addi r4, r4, 15 -; CHECK-LE-NEXT: addi r5, r31, 48 ; CHECK-LE-NEXT: rldicl r4, r4, 60, 4 ; CHECK-LE-NEXT: rldicl r4, r4, 4, 29 -; CHECK-LE-NEXT: neg r6, r4 -; CHECK-LE-NEXT: divd r8, r6, r7 -; CHECK-LE-NEXT: add r4, r1, r6 -; CHECK-LE-NEXT: mulld r7, r8, r7 -; CHECK-LE-NEXT: sub r6, r6, r7 -; CHECK-LE-NEXT: stdux r5, r1, r6 -; CHECK-LE-NEXT: cmpd r1, r4 +; CHECK-LE-NEXT: neg r5, r4 +; CHECK-LE-NEXT: addi r4, r31, 48 +; CHECK-LE-NEXT: divd r7, r5, r6 +; CHECK-LE-NEXT: mulld r6, r7, r6 +; CHECK-LE-NEXT: sub r6, r5, r6 +; CHECK-LE-NEXT: add r5, r1, r5 +; CHECK-LE-NEXT: stdux r4, r1, r6 +; CHECK-LE-NEXT: cmpd r1, r5 ; CHECK-LE-NEXT: beq cr0, .LBB1_2 ; CHECK-LE-NEXT: .LBB1_1: -; CHECK-LE-NEXT: stdu r5, -4096(r1) -; CHECK-LE-NEXT: cmpd r1, r4 +; CHECK-LE-NEXT: stdu r4, -4096(r1) +; CHECK-LE-NEXT: cmpd r1, r5 ; CHECK-LE-NEXT: bne 
cr0, .LBB1_1 ; CHECK-LE-NEXT: .LBB1_2: ; CHECK-LE-NEXT: extsw r3, r3 @@ -197,15 +197,15 @@ define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind { ; CHECK-P9-LE-NEXT: divd r8, r6, r7 ; CHECK-P9-LE-NEXT: mulld r7, r8, r7 ; CHECK-P9-LE-NEXT: mr r31, r1 -; CHECK-P9-LE-NEXT: addi r5, r31, 48 -; CHECK-P9-LE-NEXT: add r4, r1, r6 +; CHECK-P9-LE-NEXT: addi r4, r31, 48 +; CHECK-P9-LE-NEXT: add r5, r1, r6 ; CHECK-P9-LE-NEXT: sub r6, r6, r7 -; CHECK-P9-LE-NEXT: stdux r5, r1, r6 -; CHECK-P9-LE-NEXT: cmpd r1, r4 +; CHECK-P9-LE-NEXT: stdux r4, r1, r6 +; CHECK-P9-LE-NEXT: cmpd r1, r5 ; CHECK-P9-LE-NEXT: beq cr0, .LBB1_2 ; CHECK-P9-LE-NEXT: .LBB1_1: -; CHECK-P9-LE-NEXT: stdu r5, -4096(r1) -; CHECK-P9-LE-NEXT: cmpd r1, r4 +; CHECK-P9-LE-NEXT: stdu r4, -4096(r1) +; CHECK-P9-LE-NEXT: cmpd r1, r5 ; CHECK-P9-LE-NEXT: bne cr0, .LBB1_1 ; CHECK-P9-LE-NEXT: .LBB1_2: ; CHECK-P9-LE-NEXT: addi r4, r1, 32 @@ -223,23 +223,23 @@ define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind { ; CHECK-BE-NEXT: std r31, -8(r1) ; CHECK-BE-NEXT: stdu r1, -64(r1) ; CHECK-BE-NEXT: rldic r4, r3, 2, 30 -; CHECK-BE-NEXT: li r7, -4096 +; CHECK-BE-NEXT: li r6, -4096 ; CHECK-BE-NEXT: addi r4, r4, 15 ; CHECK-BE-NEXT: rldicl r4, r4, 60, 4 ; CHECK-BE-NEXT: mr r31, r1 ; CHECK-BE-NEXT: rldicl r4, r4, 4, 29 -; CHECK-BE-NEXT: addi r5, r31, 64 -; CHECK-BE-NEXT: neg r6, r4 -; CHECK-BE-NEXT: divd r8, r6, r7 -; CHECK-BE-NEXT: add r4, r1, r6 -; CHECK-BE-NEXT: mulld r7, r8, r7 -; CHECK-BE-NEXT: sub r6, r6, r7 -; CHECK-BE-NEXT: stdux r5, r1, r6 -; CHECK-BE-NEXT: cmpd r1, r4 +; CHECK-BE-NEXT: neg r5, r4 +; CHECK-BE-NEXT: divd r7, r5, r6 +; CHECK-BE-NEXT: addi r4, r31, 64 +; CHECK-BE-NEXT: mulld r6, r7, r6 +; CHECK-BE-NEXT: sub r6, r5, r6 +; CHECK-BE-NEXT: add r5, r1, r5 +; CHECK-BE-NEXT: stdux r4, r1, r6 +; CHECK-BE-NEXT: cmpd r1, r5 ; CHECK-BE-NEXT: beq cr0, .LBB1_2 ; CHECK-BE-NEXT: .LBB1_1: -; CHECK-BE-NEXT: stdu r5, -4096(r1) -; CHECK-BE-NEXT: cmpd r1, r4 +; CHECK-BE-NEXT: stdu r4, -4096(r1) +; CHECK-BE-NEXT: cmpd r1, r5 ; 
CHECK-BE-NEXT: bne cr0, .LBB1_1 ; CHECK-BE-NEXT: .LBB1_2: ; CHECK-BE-NEXT: extsw r3, r3 @@ -259,21 +259,21 @@ define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind { ; CHECK-32-NEXT: slwi r3, r3, 2 ; CHECK-32-NEXT: addi r4, r3, 15 ; CHECK-32-NEXT: rlwinm r4, r4, 0, 0, 27 -; CHECK-32-NEXT: neg r6, r4 -; CHECK-32-NEXT: li r7, -4096 -; CHECK-32-NEXT: divw r8, r6, r7 +; CHECK-32-NEXT: neg r5, r4 +; CHECK-32-NEXT: li r6, -4096 +; CHECK-32-NEXT: divw r7, r5, r6 ; CHECK-32-NEXT: stw r31, 28(r1) ; CHECK-32-NEXT: mr r31, r1 -; CHECK-32-NEXT: addi r5, r31, 32 -; CHECK-32-NEXT: add r4, r1, r6 -; CHECK-32-NEXT: mullw r7, r8, r7 -; CHECK-32-NEXT: sub r6, r6, r7 -; CHECK-32-NEXT: stwux r5, r1, r6 -; CHECK-32-NEXT: cmpw r1, r4 +; CHECK-32-NEXT: addi r4, r31, 32 +; CHECK-32-NEXT: mullw r6, r7, r6 +; CHECK-32-NEXT: sub r6, r5, r6 +; CHECK-32-NEXT: add r5, r1, r5 +; CHECK-32-NEXT: stwux r4, r1, r6 +; CHECK-32-NEXT: cmpw r1, r5 ; CHECK-32-NEXT: beq cr0, .LBB1_2 ; CHECK-32-NEXT: .LBB1_1: -; CHECK-32-NEXT: stwu r5, -4096(r1) -; CHECK-32-NEXT: cmpw r1, r4 +; CHECK-32-NEXT: stwu r4, -4096(r1) +; CHECK-32-NEXT: cmpw r1, r5 ; CHECK-32-NEXT: bne cr0, .LBB1_1 ; CHECK-32-NEXT: .LBB1_2: ; CHECK-32-NEXT: addi r4, r1, 16 @@ -300,24 +300,24 @@ define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind ; CHECK-LE-NEXT: std r31, -8(r1) ; CHECK-LE-NEXT: stdu r1, -48(r1) ; CHECK-LE-NEXT: rldic r3, r3, 2, 30 -; CHECK-LE-NEXT: lis r5, -1 +; CHECK-LE-NEXT: lis r4, -1 ; CHECK-LE-NEXT: mr r31, r1 ; CHECK-LE-NEXT: addi r3, r3, 15 -; CHECK-LE-NEXT: ori r5, r5, 0 -; CHECK-LE-NEXT: addi r4, r31, 48 +; CHECK-LE-NEXT: ori r4, r4, 0 ; CHECK-LE-NEXT: rldicl r3, r3, 60, 4 ; CHECK-LE-NEXT: rldicl r3, r3, 4, 29 -; CHECK-LE-NEXT: neg r6, r3 -; CHECK-LE-NEXT: divd r7, r6, r5 -; CHECK-LE-NEXT: add r3, r1, r6 -; CHECK-LE-NEXT: mulld r7, r7, r5 -; CHECK-LE-NEXT: sub r6, r6, r7 -; CHECK-LE-NEXT: stdux r4, r1, r6 -; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: neg r5, r3 +; CHECK-LE-NEXT: addi r3, 
r31, 48 +; CHECK-LE-NEXT: divd r6, r5, r4 +; CHECK-LE-NEXT: mulld r6, r6, r4 +; CHECK-LE-NEXT: sub r6, r5, r6 +; CHECK-LE-NEXT: add r5, r1, r5 +; CHECK-LE-NEXT: stdux r3, r1, r6 +; CHECK-LE-NEXT: cmpd r1, r5 ; CHECK-LE-NEXT: beq cr0, .LBB2_2 ; CHECK-LE-NEXT: .LBB2_1: -; CHECK-LE-NEXT: stdux r4, r1, r5 -; CHECK-LE-NEXT: cmpd r1, r3 +; CHECK-LE-NEXT: stdux r3, r1, r4 +; CHECK-LE-NEXT: cmpd r1, r5 ; CHECK-LE-NEXT: bne cr0, .LBB2_1 ; CHECK-LE-NEXT: .LBB2_2: ; CHECK-LE-NEXT: li r4, 1 @@ -342,15 +342,15 @@ define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind ; CHECK-P9-LE-NEXT: divd r7, r6, r5 ; CHECK-P9-LE-NEXT: mulld r7, r7, r5 ; CHECK-P9-LE-NEXT: mr r31, r1 -; CHECK-P9-LE-NEXT: addi r4, r31, 48 -; CHECK-P9-LE-NEXT: add r3, r1, r6 +; CHECK-P9-LE-NEXT: addi r3, r31, 48 +; CHECK-P9-LE-NEXT: add r4, r1, r6 ; CHECK-P9-LE-NEXT: sub r6, r6, r7 -; CHECK-P9-LE-NEXT: stdux r4, r1, r6 -; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: stdux r3, r1, r6 +; CHECK-P9-LE-NEXT: cmpd r1, r4 ; CHECK-P9-LE-NEXT: beq cr0, .LBB2_2 ; CHECK-P9-LE-NEXT: .LBB2_1: -; CHECK-P9-LE-NEXT: stdux r4, r1, r5 -; CHECK-P9-LE-NEXT: cmpd r1, r3 +; CHECK-P9-LE-NEXT: stdux r3, r1, r5 +; CHECK-P9-LE-NEXT: cmpd r1, r4 ; CHECK-P9-LE-NEXT: bne cr0, .LBB2_1 ; CHECK-P9-LE-NEXT: .LBB2_2: ; CHECK-P9-LE-NEXT: addi r3, r1, 32 @@ -366,24 +366,24 @@ define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind ; CHECK-BE-NEXT: std r31, -8(r1) ; CHECK-BE-NEXT: stdu r1, -64(r1) ; CHECK-BE-NEXT: rldic r3, r3, 2, 30 -; CHECK-BE-NEXT: lis r5, -1 +; CHECK-BE-NEXT: lis r4, -1 ; CHECK-BE-NEXT: addi r3, r3, 15 ; CHECK-BE-NEXT: rldicl r3, r3, 60, 4 -; CHECK-BE-NEXT: ori r5, r5, 0 +; CHECK-BE-NEXT: ori r4, r4, 0 ; CHECK-BE-NEXT: rldicl r3, r3, 4, 29 ; CHECK-BE-NEXT: mr r31, r1 -; CHECK-BE-NEXT: neg r6, r3 -; CHECK-BE-NEXT: divd r7, r6, r5 -; CHECK-BE-NEXT: addi r4, r31, 64 -; CHECK-BE-NEXT: mulld r7, r7, r5 -; CHECK-BE-NEXT: add r3, r1, r6 -; CHECK-BE-NEXT: sub r6, r6, r7 -; 
CHECK-BE-NEXT: stdux r4, r1, r6 -; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: neg r5, r3 +; CHECK-BE-NEXT: divd r6, r5, r4 +; CHECK-BE-NEXT: addi r3, r31, 64 +; CHECK-BE-NEXT: mulld r6, r6, r4 +; CHECK-BE-NEXT: sub r6, r5, r6 +; CHECK-BE-NEXT: add r5, r1, r5 +; CHECK-BE-NEXT: stdux r3, r1, r6 +; CHECK-BE-NEXT: cmpd r1, r5 ; CHECK-BE-NEXT: beq cr0, .LBB2_2 ; CHECK-BE-NEXT: .LBB2_1: -; CHECK-BE-NEXT: stdux r4, r1, r5 -; CHECK-BE-NEXT: cmpd r1, r3 +; CHECK-BE-NEXT: stdux r3, r1, r4 +; CHECK-BE-NEXT: cmpd r1, r5 ; CHECK-BE-NEXT: bne cr0, .LBB2_1 ; CHECK-BE-NEXT: .LBB2_2: ; CHECK-BE-NEXT: li r4, 1 @@ -400,22 +400,22 @@ define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind ; CHECK-32-NEXT: slwi r3, r3, 2 ; CHECK-32-NEXT: addi r3, r3, 15 ; CHECK-32-NEXT: rlwinm r3, r3, 0, 0, 27 -; CHECK-32-NEXT: lis r5, -1 -; CHECK-32-NEXT: neg r6, r3 -; CHECK-32-NEXT: ori r5, r5, 0 -; CHECK-32-NEXT: divw r7, r6, r5 +; CHECK-32-NEXT: lis r4, -1 +; CHECK-32-NEXT: neg r5, r3 +; CHECK-32-NEXT: ori r4, r4, 0 +; CHECK-32-NEXT: divw r6, r5, r4 ; CHECK-32-NEXT: stw r31, 28(r1) ; CHECK-32-NEXT: mr r31, r1 -; CHECK-32-NEXT: addi r4, r31, 32 -; CHECK-32-NEXT: add r3, r1, r6 -; CHECK-32-NEXT: mullw r7, r7, r5 -; CHECK-32-NEXT: sub r6, r6, r7 -; CHECK-32-NEXT: stwux r4, r1, r6 -; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: addi r3, r31, 32 +; CHECK-32-NEXT: mullw r6, r6, r4 +; CHECK-32-NEXT: sub r6, r5, r6 +; CHECK-32-NEXT: add r5, r1, r5 +; CHECK-32-NEXT: stwux r3, r1, r6 +; CHECK-32-NEXT: cmpw r1, r5 ; CHECK-32-NEXT: beq cr0, .LBB2_2 ; CHECK-32-NEXT: .LBB2_1: -; CHECK-32-NEXT: stwux r4, r1, r5 -; CHECK-32-NEXT: cmpw r1, r3 +; CHECK-32-NEXT: stwux r3, r1, r4 +; CHECK-32-NEXT: cmpw r1, r5 ; CHECK-32-NEXT: bne cr0, .LBB2_1 ; CHECK-32-NEXT: .LBB2_2: ; CHECK-32-NEXT: li r4, 1 From 3be0d8669f9a7e43cf909cdb29dc2cf087a4292d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= Date: Mon, 13 Jul 2020 22:24:44 +0200 Subject: [PATCH 038/363] accept 'clang++ -c 
a.pch -o a.o' to create PCH's object file This way should be the same like with a.pcm for modules. An alternative way is 'clang++ -c empty.cpp -include-pch a.pch -o a.o -Xclang -building-pch-with-obj', which is what clang-cl's /Yc does internally. Differential Revision: https://reviews.llvm.org/D83716 (cherry picked from commit 3895466e2c336c0797710ae35150ba1ce6bc0b96) --- clang/lib/Driver/Types.cpp | 2 +- clang/lib/Frontend/CompilerInvocation.cpp | 7 +++-- clang/test/Driver/pch-codegen.cpp | 38 +++++++++++++++++++++++ clang/test/PCH/codegen.cpp | 8 ++--- 4 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 clang/test/Driver/pch-codegen.cpp diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp index 399e26d8d64a..2050dffa6fa0 100644 --- a/clang/lib/Driver/Types.cpp +++ b/clang/lib/Driver/Types.cpp @@ -141,7 +141,7 @@ bool types::isAcceptedByClang(ID Id) { case TY_CXXHeader: case TY_PP_CXXHeader: case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader: case TY_CXXModule: case TY_PP_CXXModule: - case TY_AST: case TY_ModuleFile: + case TY_AST: case TY_ModuleFile: case TY_PCH: case TY_LLVM_IR: case TY_LLVM_BC: return true; } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 75d7cf5d26d3..73114c6d76cb 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2022,8 +2022,9 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, // FIXME: Supporting '-header-cpp-output' would be useful. bool Preprocessed = XValue.consume_back("-cpp-output"); bool ModuleMap = XValue.consume_back("-module-map"); - IsHeaderFile = - !Preprocessed && !ModuleMap && XValue.consume_back("-header"); + IsHeaderFile = !Preprocessed && !ModuleMap && + XValue != "precompiled-header" && + XValue.consume_back("-header"); // Principal languages. 
DashX = llvm::StringSwitch(XValue) @@ -2050,7 +2051,7 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, DashX = llvm::StringSwitch(XValue) .Case("cpp-output", InputKind(Language::C).getPreprocessed()) .Case("assembler-with-cpp", Language::Asm) - .Cases("ast", "pcm", + .Cases("ast", "pcm", "precompiled-header", InputKind(Language::Unknown, InputKind::Precompiled)) .Case("ir", Language::LLVM_IR) .Default(Language::Unknown); diff --git a/clang/test/Driver/pch-codegen.cpp b/clang/test/Driver/pch-codegen.cpp new file mode 100644 index 000000000000..1b125107fb28 --- /dev/null +++ b/clang/test/Driver/pch-codegen.cpp @@ -0,0 +1,38 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t + +// Create PCH without codegen. +// RUN: %clang -x c++-header %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-cg.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-CREATE +// CHECK-PCH-CREATE: -emit-pch +// CHECK-PCH-CREATE-NOT: -fmodules-codegen +// CHECK-PCH-CREATE-NOT: -fmodules-debuginfo + +// Create PCH with -fmodules-codegen. +// RUN: %clang -x c++-header -Xclang -fmodules-codegen %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-cg.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-CODEGEN-CREATE +// CHECK-PCH-CODEGEN-CREATE: -emit-pch +// CHECK-PCH-CODEGEN-CREATE: -fmodules-codegen +// CHECK-PCH-CODEGEN-CREATE: "-x" "c++-header" +// CHECK-PCH-CODEGEN-CREATE-NOT: -fmodules-debuginfo + +// Create PCH with -fmodules-debuginfo. +// RUN: %clang -x c++-header -Xclang -fmodules-debuginfo %S/../Modules/Inputs/codegen-flags/foo.h -g -o %t/foo-di.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-DEBUGINFO-CREATE +// CHECK-PCH-DEBUGINFO-CREATE: -emit-pch +// CHECK-PCH-DEBUGINFO-CREATE: -fmodules-debuginfo +// CHECK-PCH-DEBUGINFO-CREATE: "-x" "c++-header" +// CHECK-PCH-DEBUGINFO-CREATE-NOT: -fmodules-codegen + +// Create PCH's object file for -fmodules-codegen. 
+// RUN: touch %t/foo-cg.pch +// RUN: %clang -c %t/foo-cg.pch -o %t/foo-cg.o -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-CODEGEN-OBJ +// CHECK-PCH-CODEGEN-OBJ: -emit-obj +// CHECK-PCH-CODEGEN-OBJ: "-main-file-name" "foo-cg.pch" +// CHECK-PCH-CODEGEN-OBJ: "-o" "{{.*}}foo-cg.o" +// CHECK-PCH-CODEGEN-OBJ: "-x" "precompiled-header" + +// Create PCH's object file for -fmodules-debuginfo. +// RUN: touch %t/foo-di.pch +// RUN: %clang -c %t/foo-di.pch -g -o %t/foo-di.o -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-DEBUGINFO-OBJ +// CHECK-PCH-DEBUGINFO-OBJ: -emit-obj +// CHECK-PCH-DEBUGINFO-OBJ: "-main-file-name" "foo-di.pch" +// CHECK-PCH-DEBUGINFO-OBJ: "-o" "{{.*}}foo-di.o" +// CHECK-PCH-DEBUGINFO-OBJ: "-x" "precompiled-header" diff --git a/clang/test/PCH/codegen.cpp b/clang/test/PCH/codegen.cpp index 49ed30aeaf05..9d817316bc8a 100644 --- a/clang/test/PCH/codegen.cpp +++ b/clang/test/PCH/codegen.cpp @@ -9,8 +9,8 @@ // RUN: %clang_cc1 -triple=x86_64-linux-gnu -fmodules-codegen -x c++-header -building-pch-with-obj -emit-pch %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-cg.pch // RUN: %clang_cc1 -triple=x86_64-linux-gnu -fmodules-debuginfo -x c++-header -building-pch-with-obj -emit-pch %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-di.pch -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - %s -include-pch %t/foo-cg.pch -building-pch-with-obj -fmodules-codegen | FileCheck --check-prefix=CG %s -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - %s -include-pch %t/foo-di.pch -building-pch-with-obj -fmodules-debuginfo | FileCheck --check-prefix=DI %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -x precompiled-header %t/foo-cg.pch | FileCheck --check-prefix=CG %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -x precompiled-header %t/foo-di.pch | FileCheck --check-prefix=DI %s // RUN: %clang_cc1 -triple 
x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -include-pch %t/foo-cg.pch %S/../Modules/Inputs/codegen-flags/use.cpp | FileCheck --check-prefix=CG-USE %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -include-pch %t/foo-di.pch %S/../Modules/Inputs/codegen-flags/use.cpp | FileCheck --check-prefix=DI-USE %s @@ -20,8 +20,8 @@ // RUN: %clang_cc1 -triple=x86_64-linux-gnu -fmodules-codegen -x c++-header -building-pch-with-obj -emit-pch -fpch-instantiate-templates %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-cg.pch // RUN: %clang_cc1 -triple=x86_64-linux-gnu -fmodules-debuginfo -x c++-header -building-pch-with-obj -emit-pch -fpch-instantiate-templates %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-di.pch -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - %s -include-pch %t/foo-cg.pch -building-pch-with-obj -fmodules-codegen | FileCheck --check-prefix=CG %s -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - %s -include-pch %t/foo-di.pch -building-pch-with-obj -fmodules-debuginfo | FileCheck --check-prefix=DI %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -x precompiled-header %t/foo-cg.pch | FileCheck --check-prefix=CG %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -x precompiled-header %t/foo-di.pch | FileCheck --check-prefix=DI %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -include-pch %t/foo-cg.pch %S/../Modules/Inputs/codegen-flags/use.cpp | FileCheck --check-prefix=CG-USE %s // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -debug-info-kind=limited -o - -include-pch %t/foo-di.pch %S/../Modules/Inputs/codegen-flags/use.cpp | FileCheck --check-prefix=DI-USE %s From 2f1aff8325387b8ca1c9a1a14e2065827e9b1c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= Date: Sat, 11 Jul 2020 14:53:50 +0200 
Subject: [PATCH 039/363] add -fpch-codegen/debuginfo mapping to -fmodules-codegen/debuginfo Using -fmodules-* options for PCHs is a bit confusing, so add -fpch-* variants. Having extra options also makes it simple to do a configure check for the feature. Also document the options in the release notes. Differential Revision: https://reviews.llvm.org/D83623 (cherry picked from commit 54eea6127c4d77db03787b7c55765632fb9a6f1c) --- clang/docs/ReleaseNotes.rst | 26 ++++++++++++++++++++++++++ clang/include/clang/Driver/Options.td | 4 ++++ clang/lib/Driver/ToolChains/Clang.cpp | 6 ++++++ clang/test/Driver/pch-codegen.cpp | 12 ++++++------ 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8b27e663d9f8..3264846506c6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -115,6 +115,32 @@ New Compiler Flags if the source header file is not self-contained. This option is enabled by default for clang-cl. +- -fpch-codegen and -fpch-debuginfo generate shared code and/or debuginfo + for contents of a precompiled header in a separate object file. This object + file needs to be linked in, but its contents do not need to be generated + for other objects using the precompiled header. This should usually save + compile time. If not using clang-cl, the separate object file needs to + be created explicitly from the precompiled header. + Example of use: + + .. code-block:: console + + $ clang++ -x c++-header header.h -o header.pch -fpch-codegen -fpch-debuginfo + $ clang++ -c header.pch -o shared.o + $ clang++ -c source.cpp -o source.o -include-pch header.pch + $ clang++ -o binary source.o shared.o + + - Using -fpch-instantiate-templates when generating the precompiled header + usually increases the amount of code/debuginfo that can be shared. 
+ - In some cases, especially when building with optimizations enabled, using + -fpch-codegen may generate so much code in the shared object that compiling + it may be a net loss in build time. + - Since headers may bring in private symbols of other libraries, it may be + sometimes necessary to discard unused symbols (such as by adding + -Wl,--gc-sections on ELF platforms to the linking command, and possibly + adding -fdata-sections -ffunction-sections to the command generating + the shared object). + Deprecated Compiler Flags ------------------------- diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f4556c15d744..b20b8a288221 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1440,6 +1440,10 @@ def fpch_instantiate_templates: def fno_pch_instantiate_templates: Flag <["-"], "fno-pch-instantiate-templates">, Group, Flags<[CC1Option]>; +defm pch_codegen: OptInFFlag<"pch-codegen", "Generate ", "Do not generate ", + "code for uses of this PCH that assumes an explicit object file will be built for the PCH">; +defm pch_debuginfo: OptInFFlag<"pch-debuginfo", "Generate ", "Do not generate ", + "debug info for types in an object file built from this PCH and do not generate them elsewhere">; def fmodules : Flag <["-"], "fmodules">, Group, Flags<[DriverOption, CC1Option]>, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9d6333bb5f1d..25fc837e803b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5627,6 +5627,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Args.hasFlag(options::OPT_fpch_instantiate_templates, options::OPT_fno_pch_instantiate_templates, false)) CmdArgs.push_back("-fpch-instantiate-templates"); + if (Args.hasFlag(options::OPT_fpch_codegen, options::OPT_fno_pch_codegen, + false)) + CmdArgs.push_back("-fmodules-codegen"); + if 
(Args.hasFlag(options::OPT_fpch_debuginfo, options::OPT_fno_pch_debuginfo, + false)) + CmdArgs.push_back("-fmodules-debuginfo"); Args.AddLastArg(CmdArgs, options::OPT_fexperimental_new_pass_manager, options::OPT_fno_experimental_new_pass_manager); diff --git a/clang/test/Driver/pch-codegen.cpp b/clang/test/Driver/pch-codegen.cpp index 1b125107fb28..c6b6d9217e42 100644 --- a/clang/test/Driver/pch-codegen.cpp +++ b/clang/test/Driver/pch-codegen.cpp @@ -7,21 +7,21 @@ // CHECK-PCH-CREATE-NOT: -fmodules-codegen // CHECK-PCH-CREATE-NOT: -fmodules-debuginfo -// Create PCH with -fmodules-codegen. -// RUN: %clang -x c++-header -Xclang -fmodules-codegen %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-cg.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-CODEGEN-CREATE +// Create PCH with -fpch-codegen. +// RUN: %clang -x c++-header -fpch-codegen %S/../Modules/Inputs/codegen-flags/foo.h -o %t/foo-cg.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-CODEGEN-CREATE // CHECK-PCH-CODEGEN-CREATE: -emit-pch // CHECK-PCH-CODEGEN-CREATE: -fmodules-codegen // CHECK-PCH-CODEGEN-CREATE: "-x" "c++-header" // CHECK-PCH-CODEGEN-CREATE-NOT: -fmodules-debuginfo -// Create PCH with -fmodules-debuginfo. -// RUN: %clang -x c++-header -Xclang -fmodules-debuginfo %S/../Modules/Inputs/codegen-flags/foo.h -g -o %t/foo-di.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-DEBUGINFO-CREATE +// Create PCH with -fpch-debuginfo. +// RUN: %clang -x c++-header -fpch-debuginfo %S/../Modules/Inputs/codegen-flags/foo.h -g -o %t/foo-di.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-DEBUGINFO-CREATE // CHECK-PCH-DEBUGINFO-CREATE: -emit-pch // CHECK-PCH-DEBUGINFO-CREATE: -fmodules-debuginfo // CHECK-PCH-DEBUGINFO-CREATE: "-x" "c++-header" // CHECK-PCH-DEBUGINFO-CREATE-NOT: -fmodules-codegen -// Create PCH's object file for -fmodules-codegen. +// Create PCH's object file for -fpch-codegen. 
// RUN: touch %t/foo-cg.pch // RUN: %clang -c %t/foo-cg.pch -o %t/foo-cg.o -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-CODEGEN-OBJ // CHECK-PCH-CODEGEN-OBJ: -emit-obj @@ -29,7 +29,7 @@ // CHECK-PCH-CODEGEN-OBJ: "-o" "{{.*}}foo-cg.o" // CHECK-PCH-CODEGEN-OBJ: "-x" "precompiled-header" -// Create PCH's object file for -fmodules-debuginfo. +// Create PCH's object file for -fpch-debuginfo. // RUN: touch %t/foo-di.pch // RUN: %clang -c %t/foo-di.pch -g -o %t/foo-di.o -### 2>&1 | FileCheck %s -check-prefix=CHECK-PCH-DEBUGINFO-OBJ // CHECK-PCH-DEBUGINFO-OBJ: -emit-obj From 9c48156c25f96cbd3b3405a53f681fee521514fa Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Wed, 22 Jul 2020 15:32:13 +0700 Subject: [PATCH 040/363] [SCEV] Remove premature assert. PR46786 This assert was added to verify assumption that GEP's SCEV will be of pointer type, basing on fact that it should be a SCEVAddExpr with (at least) last operand being pointer. Two notes: - GEP's SCEV does not have to be a SCEVAddExpr after all simplifications; - In current state, GEP's SCEV does not have to have at least one pointer operands (all of them can become int during the transforms). However, we might want to be at a point where it is true. We are currently removing this assert and will try to enumerate the cases where "is pointer" notion might be lost during the transforms. When all of them are fixed, we can return it. 
Differential Revision: https://reviews.llvm.org/D84294 Reviewed By: lebedev.ri (cherry picked from commit b96114c1e1fc4448ea966bce013706359aee3fa9) --- llvm/lib/Analysis/ScalarEvolution.cpp | 5 +-- llvm/test/Analysis/ScalarEvolution/pr46786.ll | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Analysis/ScalarEvolution/pr46786.ll diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 48c686b73260..3c96b3f20461 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -3317,10 +3317,7 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP, } // Add the total offset from all the GEP indices to the base. - auto *GEPExpr = getAddExpr(BaseExpr, TotalOffset, Wrap); - assert(BaseExpr->getType() == GEPExpr->getType() && - "GEP should not change type mid-flight."); - return GEPExpr; + return getAddExpr(BaseExpr, TotalOffset, Wrap); } std::tuple diff --git a/llvm/test/Analysis/ScalarEvolution/pr46786.ll b/llvm/test/Analysis/ScalarEvolution/pr46786.ll new file mode 100644 index 000000000000..21a65702b3a3 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/pr46786.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution | FileCheck %s +; RUN: opt < %s -disable-output "-passes=print" 2>&1 | FileCheck %s + +source_filename = "input.cpp" +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" + +; Function Attrs: nofree +define i8* @FSE_decompress_usingDTable(i8* %arg, i32 %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 { +; CHECK-LABEL: 'FSE_decompress_usingDTable' +; CHECK-NEXT: Classifying expressions for: @FSE_decompress_usingDTable +; CHECK-NEXT: %i = getelementptr inbounds i8, i8* %arg, i32 %arg2 +; CHECK-NEXT: --> (%arg2 + %arg) U: full-set S: full-set +; CHECK-NEXT: %i4 = sub nsw i32 0, %arg1 +; CHECK-NEXT: --> (-1 * %arg1) U: full-set 
S: full-set +; CHECK-NEXT: %i5 = getelementptr inbounds i8, i8* %i, i32 %i4 +; CHECK-NEXT: --> ((-1 * %arg1) + %arg2 + %arg) U: full-set S: full-set +; CHECK-NEXT: %i7 = select i1 %i6, i32 %arg2, i32 %arg1 +; CHECK-NEXT: --> ((-1 * %arg) + (((-1 * %arg1) + %arg2 + %arg) umin %arg) + %arg1) U: full-set S: full-set +; CHECK-NEXT: %i8 = sub i32 %arg3, %i7 +; CHECK-NEXT: --> ((-1 * (((-1 * %arg1) + %arg2 + %arg) umin %arg)) + (-1 * %arg1) + %arg3 + %arg) U: full-set S: full-set +; CHECK-NEXT: %i9 = getelementptr inbounds i8, i8* %arg, i32 %i8 +; CHECK-NEXT: --> ((2 * %arg) + (-1 * (((-1 * %arg1) + %arg2 + %arg) umin %arg)) + (-1 * %arg1) + %arg3) U: full-set S: full-set +; CHECK-NEXT: Determining loop execution counts for: @FSE_decompress_usingDTable +; +bb: + %i = getelementptr inbounds i8, i8* %arg, i32 %arg2 + %i4 = sub nsw i32 0, %arg1 + %i5 = getelementptr inbounds i8, i8* %i, i32 %i4 + %i6 = icmp ult i8* %i5, %arg + %i7 = select i1 %i6, i32 %arg2, i32 %arg1 + %i8 = sub i32 %arg3, %i7 + %i9 = getelementptr inbounds i8, i8* %arg, i32 %i8 + ret i8* %i9 +} + +attributes #0 = { nofree } From e9d37a2ee97f820bc65e2badf5142414495580e5 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 23 Jul 2020 15:11:38 +0200 Subject: [PATCH 041/363] Drop the npm run line from llvm/test/Analysis/ScalarEvolution/pr46786.ll since it's failing. 
--- llvm/test/Analysis/ScalarEvolution/pr46786.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/Analysis/ScalarEvolution/pr46786.ll b/llvm/test/Analysis/ScalarEvolution/pr46786.ll index 21a65702b3a3..17110679c88e 100644 --- a/llvm/test/Analysis/ScalarEvolution/pr46786.ll +++ b/llvm/test/Analysis/ScalarEvolution/pr46786.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution | FileCheck %s -; RUN: opt < %s -disable-output "-passes=print" 2>&1 | FileCheck %s source_filename = "input.cpp" target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" From 826f730f3f1e2722059fe9d7f271a27a0d980a0f Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 18 Jul 2020 23:36:42 +0200 Subject: [PATCH 042/363] [InstCombine] Add test for PR46680 (NFC) (cherry picked from commit 13ae440de4a408cf9d1a448def09769ecbecfdf7) --- llvm/test/Transforms/InstCombine/pr46680.ll | 92 +++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/pr46680.ll diff --git a/llvm/test/Transforms/InstCombine/pr46680.ll b/llvm/test/Transforms/InstCombine/pr46680.ll new file mode 100644 index 000000000000..90ea2e110afe --- /dev/null +++ b/llvm/test/Transforms/InstCombine/pr46680.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine -instcombine-infinite-loop-threshold=3 < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +@a = dso_local local_unnamed_addr global i64 0, align 8 +@d = dso_local local_unnamed_addr global i64 0, align 8 +@c = external dso_local local_unnamed_addr global i8, align 1 + +define void @test(i16* nocapture readonly %arg) local_unnamed_addr { +; CHECK-LABEL: @test( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[I:%.*]] = load i64, i64* @d, align 8 +; CHECK-NEXT: 
[[I1:%.*]] = icmp eq i64 [[I]], 0 +; CHECK-NEXT: [[I2:%.*]] = load i64, i64* @a, align 8 +; CHECK-NEXT: [[I3:%.*]] = icmp ne i64 [[I2]], 0 +; CHECK-NEXT: br i1 [[I1]], label [[BB13:%.*]], label [[BB4:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[I5:%.*]] = load i16, i16* [[ARG:%.*]], align 2 +; CHECK-NEXT: [[I6:%.*]] = trunc i16 [[I5]] to i8 +; CHECK-NEXT: store i8 [[I6]], i8* @c, align 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[I3]]) +; CHECK-NEXT: br label [[BB22:%.*]] +; CHECK: bb13: +; CHECK-NEXT: [[I14:%.*]] = load i16, i16* [[ARG]], align 2 +; CHECK-NEXT: [[I15:%.*]] = trunc i16 [[I14]] to i8 +; CHECK-NEXT: store i8 [[I15]], i8* @c, align 1 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb22: +; CHECK-NEXT: [[STOREMERGE2_IN:%.*]] = load i16, i16* [[ARG]], align 2 +; CHECK-NEXT: [[STOREMERGE2:%.*]] = trunc i16 [[STOREMERGE2_IN]] to i8 +; CHECK-NEXT: store i8 [[STOREMERGE2]], i8* @c, align 1 +; CHECK-NEXT: [[STOREMERGE1_IN:%.*]] = load i16, i16* [[ARG]], align 2 +; CHECK-NEXT: [[STOREMERGE1:%.*]] = trunc i16 [[STOREMERGE1_IN]] to i8 +; CHECK-NEXT: store i8 [[STOREMERGE1]], i8* @c, align 1 +; CHECK-NEXT: [[STOREMERGE_IN:%.*]] = load i16, i16* [[ARG]], align 2 +; CHECK-NEXT: [[STOREMERGE:%.*]] = trunc i16 [[STOREMERGE_IN]] to i8 +; CHECK-NEXT: store i8 [[STOREMERGE]], i8* @c, align 1 +; CHECK-NEXT: br label [[BB23:%.*]] +; CHECK: bb23: +; CHECK-NEXT: br label [[BB23]] +; +bb: + %i = load i64, i64* @d, align 8 + %i1 = icmp eq i64 %i, 0 + %i2 = load i64, i64* @a, align 8 + %i3 = icmp ne i64 %i2, 0 + br i1 %i1, label %bb13, label %bb4 + +bb4: ; preds = %bb + %i5 = load i16, i16* %arg, align 2 + %i6 = trunc i16 %i5 to i8 + store i8 %i6, i8* @c, align 1 + tail call void @llvm.assume(i1 %i3) + %i7 = load i16, i16* %arg, align 2 + %i8 = trunc i16 %i7 to i8 + store i8 %i8, i8* @c, align 1 + %i9 = load i16, i16* %arg, align 2 + %i10 = trunc i16 %i9 to i8 + store i8 %i10, i8* @c, align 1 + %i11 = load i16, i16* %arg, align 2 + %i12 = trunc i16 %i11 to i8 + store i8 %i12, i8* 
@c, align 1 + br label %bb22 + +bb13: ; preds = %bb + %i14 = load i16, i16* %arg, align 2 + %i15 = trunc i16 %i14 to i8 + store i8 %i15, i8* @c, align 1 + %i16 = load i16, i16* %arg, align 2 + %i17 = trunc i16 %i16 to i8 + store i8 %i17, i8* @c, align 1 + %i18 = load i16, i16* %arg, align 2 + %i19 = trunc i16 %i18 to i8 + store i8 %i19, i8* @c, align 1 + %i20 = load i16, i16* %arg, align 2 + %i21 = trunc i16 %i20 to i8 + store i8 %i21, i8* @c, align 1 + br label %bb22 + +bb22: ; preds = %bb13, %bb4 + br label %bb23 + +bb23: ; preds = %bb23, %bb22 + br label %bb23 +} + +; Function Attrs: nounwind willreturn +declare void @llvm.assume(i1) #0 + +attributes #0 = { nounwind willreturn } From eb3c5db40a1450d50c387f3a42f4c095001220cb Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 18 Jul 2020 22:22:41 +0200 Subject: [PATCH 043/363] [InstCombine] Fix store merge worklist management (PR46680) Fixes https://bugs.llvm.org/show_bug.cgi?id=46680. Just like insertions through IRBuilder, InsertNewInstBefore() should be using the deferred worklist mechanism, so that processing of newly added instructions is prioritized. There's one side-effect of the worklist order change which could be classified as a regression. An add op gets pushed through a select that at the time is not a umax. We could add a reverse transform that tries to push adds in the reverse direction to restore a min/max, but that seems like a sure way of getting infinite loops... Seems like something that should best wait on min/max intrinsics. 
Differential Revision: https://reviews.llvm.org/D84109 (cherry picked from commit d12ec0f752e7f2c7f7252539da2d124264ec33f7) --- .../InstCombine/InstCombineInternal.h | 2 +- .../Transforms/InstCombine/minmax-fold.ll | 20 +++++++++---------- llvm/test/Transforms/InstCombine/pr46680.ll | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index f918dc7198ca..ca51f37af4d9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -653,7 +653,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner "New instruction already inserted into a basic block!"); BasicBlock *BB = Old.getParent(); BB->getInstList().insert(Old.getIterator(), New); // Insert inst - Worklist.push(New); + Worklist.add(New); return New; } diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 5ee38978ed78..dcf060c09613 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -953,8 +953,8 @@ define i32 @add_umin(i32 %x) { define i32 @add_umin_constant_limit(i32 %x) { ; CHECK-LABEL: @add_umin_constant_limit( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[X:%.*]], 0 -; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP1]], i32 41, i32 42 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[R:%.*]] = select i1 [[DOTNOT]], i32 41, i32 42 ; CHECK-NEXT: ret i32 [[R]] ; %a = add nuw i32 %x, 41 @@ -1165,8 +1165,8 @@ define <2 x i33> @add_umax_vec(<2 x i33> %x) { define i8 @PR14613_umin(i8 %x) { ; CHECK-LABEL: @PR14613_umin( -; CHECK-NEXT: [[U7:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[X:%.*]], i8 15) -; CHECK-NEXT: ret i8 [[U7]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.uadd.sat.i8(i8 [[X:%.*]], i8 15) +; CHECK-NEXT: ret i8 [[TMP1]] ; %u4 = zext i8 %x to i32 %u5 = add nuw nsw i32 %u4, 15 @@ 
-1179,8 +1179,8 @@ define i8 @PR14613_umin(i8 %x) { define i8 @PR14613_umax(i8 %x) { ; CHECK-LABEL: @PR14613_umax( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i8 [[X:%.*]], -16 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i8 [[X]], i8 -16 -; CHECK-NEXT: [[U7:%.*]] = add nsw i8 [[TMP2]], 15 +; CHECK-NEXT: [[X_OP:%.*]] = add i8 [[X]], 15 +; CHECK-NEXT: [[U7:%.*]] = select i1 [[TMP1]], i8 [[X_OP]], i8 -1 ; CHECK-NEXT: ret i8 [[U7]] ; %u4 = zext i8 %x to i32 @@ -1422,8 +1422,8 @@ define <2 x i33> @add_smax_vec(<2 x i33> %x) { define i8 @PR14613_smin(i8 %x) { ; CHECK-LABEL: @PR14613_smin( ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i8 [[X:%.*]], 40 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i8 [[X]], i8 40 -; CHECK-NEXT: [[U7:%.*]] = add nsw i8 [[TMP2]], 15 +; CHECK-NEXT: [[X_OP:%.*]] = add i8 [[X]], 15 +; CHECK-NEXT: [[U7:%.*]] = select i1 [[TMP1]], i8 [[X_OP]], i8 55 ; CHECK-NEXT: ret i8 [[U7]] ; %u4 = sext i8 %x to i32 @@ -1437,8 +1437,8 @@ define i8 @PR14613_smin(i8 %x) { define i8 @PR14613_smax(i8 %x) { ; CHECK-LABEL: @PR14613_smax( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i8 [[X:%.*]], 40 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i8 [[X]], i8 40 -; CHECK-NEXT: [[U7:%.*]] = add nuw i8 [[TMP2]], 15 +; CHECK-NEXT: [[X_OP:%.*]] = add i8 [[X]], 15 +; CHECK-NEXT: [[U7:%.*]] = select i1 [[TMP1]], i8 [[X_OP]], i8 55 ; CHECK-NEXT: ret i8 [[U7]] ; %u4 = sext i8 %x to i32 diff --git a/llvm/test/Transforms/InstCombine/pr46680.ll b/llvm/test/Transforms/InstCombine/pr46680.ll index 90ea2e110afe..59d449d5dc23 100644 --- a/llvm/test/Transforms/InstCombine/pr46680.ll +++ b/llvm/test/Transforms/InstCombine/pr46680.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -instcombine-infinite-loop-threshold=3 < %s | FileCheck %s +; RUN: opt -S -instcombine -instcombine-infinite-loop-threshold=2 < %s | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target 
triple = "x86_64-pc-linux-gnu" From 8a2bc9431193026454745d538cf7e5a5a6b6d5be Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 21 Jul 2020 21:50:38 +0100 Subject: [PATCH 044/363] [X86][AVX] getTargetShuffleMask - don't decode VBROADCAST(EXTRACT_SUBVECTOR(X,0)) patterns. getTargetShuffleMask is used by the various "SimplifyDemanded" folds so we can't assume that the bypassed extract_subvector can be safely simplified - getFauxShuffleMask performs a more general decode that allows us to more safely catch many of these cases so the impact is minimal. (cherry picked from commit 5b5dc2442ac7a574a3b7d17c15ebeeb9eb3bec26) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +-- llvm/test/CodeGen/X86/vector-fshl-256.ll | 38 +++--- llvm/test/CodeGen/X86/vector-fshl-512.ll | 20 ++-- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 92 ++++++++------- llvm/test/CodeGen/X86/vector-fshr-256.ll | 68 +++++------ llvm/test/CodeGen/X86/vector-fshr-512.ll | 116 +++++++++---------- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 52 +++++---- llvm/test/CodeGen/X86/vector-rotate-512.ll | 98 +++++++--------- 8 files changed, 244 insertions(+), 259 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ea4b4734225d..f8b6b7eb3aff 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6916,25 +6916,16 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeZeroMoveLowMask(NumElems, Mask); IsUnary = true; break; - case X86ISD::VBROADCAST: { - SDValue N0 = N->getOperand(0); - // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, - // add the pre-extracted value to the Ops vector. 
- if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && - N0.getOperand(0).getValueType() == VT && - N0.getConstantOperandVal(1) == 0) - Ops.push_back(N0.getOperand(0)); - - // We only decode broadcasts of same-sized vectors, unless the broadcast - // came from an extract from the original width. If we found one, we - // pushed it the Ops vector above. - if (N0.getValueType() == VT || !Ops.empty()) { + case X86ISD::VBROADCAST: + // We only decode broadcasts of same-sized vectors, peeking through to + // extracted subvectors is likely to cause hasOneUse issues with + // SimplifyDemandedBits etc. + if (N->getOperand(0).getValueType() == VT) { DecodeVectorBroadcast(NumElems, Mask); IsUnary = true; break; } return false; - } case X86ISD::VPERMILPV: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 12feea765898..0688107ed5c0 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1092,12 +1092,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 @@ -1110,12 +1110,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; 
AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 @@ -1126,12 +1126,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 @@ -1144,12 +1144,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 @@ 
-1162,12 +1162,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 @@ -1178,12 +1178,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 @@ -1224,12 +1224,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = 
xmm4[0],zero,xmm4[1],zero ; XOPAVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; XOPAVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 @@ -1271,12 +1271,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -1287,12 +1287,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -1303,12 +1303,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; 
AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -1320,12 +1320,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 @@ -1337,12 +1337,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; 
AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 @@ -1353,12 +1353,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 @@ -1399,12 +1399,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -1458,6 +1458,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: 
vpsllw %xmm3, %xmm5, %xmm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1467,7 +1468,6 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -1482,6 +1482,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1491,7 +1492,6 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -1506,6 +1506,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: 
vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1514,9 +1515,8 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm3, %ymm4 -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 6e0cb76398df..088a590a2e07 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -643,12 +643,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -659,12 +659,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpslld %xmm3, %zmm0, 
%zmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -676,12 +676,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -698,12 +698,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, 
%zmm0 @@ -771,12 +771,12 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -793,12 +793,12 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -890,6 +890,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpsubb %xmm2, 
%xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -898,7 +899,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512BW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -913,6 +913,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -921,7 +922,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512VBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -936,6 +936,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -944,7 +945,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512VLBW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -959,6 +959,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 ; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -967,7 +968,6 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %zmm1 ; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 49f229bf1d67..ff177678bed6 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -366,13 +366,14 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; 
AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm3 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -380,22 +381,23 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm1, %zmm3, %zmm1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; 
AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm3 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -403,11 +405,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm3, %zmm1 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -447,16 +449,17 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubb %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm5 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 @@ -477,31 +480,32 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 ; 
AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpsllw %xmm1, %xmm4, %xmm4 +; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm5 -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5 +; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm5 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, 
%ymm2, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm3 ; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 @@ -522,16 +526,16 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpsllw %xmm1, %xmm4, %xmm4 +; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index bbeaed5cc725..e9cb0a0586f0 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1095,12 +1095,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = 
xmm2[0],zero,xmm2[1],zero ; AVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 @@ -1113,12 +1113,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1129,12 +1129,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq 
@@ -1146,12 +1146,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1164,12 +1164,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1180,12 +1180,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; 
AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq @@ -1226,12 +1226,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; XOPAVX2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; XOPAVX2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; XOPAVX2-NEXT: vpslld %xmm4, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastd %xmm2, %ymm2 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; XOPAVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 @@ -1273,12 +1273,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1289,12 +1289,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxwq 
{{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1305,12 +1305,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1322,12 +1322,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; 
AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1339,12 +1339,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 @@ -1355,12 +1355,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 ; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq @@ -1401,12 +1401,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; 
XOPAVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; XOPAVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; XOPAVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpbroadcastw %xmm2, %ymm2 ; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1461,6 +1461,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1469,7 +1470,6 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1485,6 +1485,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero @@ -1493,7 +1494,6 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1502,24 +1502,24 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-LABEL: splatvar_funnnel_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpsllw %xmm3, %xmm4, %xmm3 -; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm5 -; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 -; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm0, %ymm3 -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm0 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm3, %xmm4, %xmm4 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm2 +; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $236, %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index c89782bc359c..fa70e840081f 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -633,12 +633,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq @@ -648,12 +648,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VL-NEXT: 
vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0 -; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VL-NEXT: retq @@ -664,12 +664,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq @@ -686,12 +686,12 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] ; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 ; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 
%zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq @@ -759,12 +759,12 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq @@ -781,12 +781,12 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 ; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq @@ -872,88 +872,88 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512BW-NEXT: vpsllw %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm3, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512BW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsrlw %xmm0, %xmm5, %xmm0 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 -; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 +; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: 
vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm3, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm2 +; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm5, %xmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0 +; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm3, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm4, %xmm3 -; 
AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm3, %xmm4, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm2 +; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm5, %xmm0 ; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 -; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 +; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0 +; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm3, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLVBMI2-NEXT: vpandq %zmm3, 
%zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm3, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm2 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm5, %xmm0 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm4, %zmm0 +; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm3, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index d642e513c49b..fda0dacedf85 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -366,13 +366,14 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; 
AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm3 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -380,22 +381,23 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsllvd %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsllvd %zmm1, %zmm3, %zmm1 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq 
%zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm3 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 @@ -403,11 +405,11 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsllvd %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsllvd %zmm1, %zmm3, %zmm1 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -447,13 +449,14 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubb %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %ymm1, %ymm5, %ymm5 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3 @@ -490,13 +493,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm5 -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpsubb %ymm5, %ymm6, %ymm5 +; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsubb %ymm1, %ymm5, %ymm5 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 05d989ebaa30..831c03f03825 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -348,38 +348,34 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind ; AVX512F-LABEL: splatvar_rotate_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm4 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsrlw %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; 
AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_rotate_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm4 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm4 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vpsrlw %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; @@ -416,64 +412,54 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-LABEL: splatvar_rotate_v64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm4 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = 
[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsllw %xmm2, %xmm6, %xmm7 -; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpsllw %xmm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vpsllw %xmm2, %xmm5, %xmm6 +; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm1, %xmm6, %xmm1 -; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsrlw %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw %xmm4, %xmm6, %xmm3 -; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm4 +; AVX512F-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_rotate_v64i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: 
vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm4 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm5 -; AVX512VL-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpsllw %xmm2, %xmm6, %xmm7 -; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm4 +; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsllw %xmm2, %xmm5, %xmm6 +; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm2 -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm6, %xmm1 -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vpsrlw %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm4, %xmm6, %xmm3 -; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm4 +; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4 +; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: 
vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; From 833f8c958601bb640ba6a25d627c1dc58dad14d2 Mon Sep 17 00:00:00 2001 From: Tobias Hieta Date: Fri, 24 Jul 2020 00:10:22 +0300 Subject: [PATCH 045/363] [clang] Fix libdl linking for libclang in standalone mode Differential Revision: https://reviews.llvm.org/D81385 (cherry picked from commit a41af6e41e6fcf3e7030feaf24057cbe8291b748) --- clang/tools/libclang/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt index 9b34682cc49b..a4077140acee 100644 --- a/clang/tools/libclang/CMakeLists.txt +++ b/clang/tools/libclang/CMakeLists.txt @@ -68,7 +68,12 @@ endif () if (HAVE_LIBDL) list(APPEND LIBS ${CMAKE_DL_LIBS}) -endif() +elseif (CLANG_BUILT_STANDALONE) + find_library(DL_LIBRARY_PATH dl) + if (DL_LIBRARY_PATH) + list(APPEND LIBS dl) + endif () +endif () option(LIBCLANG_BUILD_STATIC "Build libclang as a static library (in addition to a shared one)" OFF) From 3c1fca803bc14617b67ba2125e1b4b77190e9f86 Mon Sep 17 00:00:00 2001 From: David Goldman Date: Fri, 17 Jul 2020 15:06:28 -0400 Subject: [PATCH 046/363] Fix issue in typo handling which could lead clang to hang Summary: We need to detect when certain TypoExprs are not being transformed due to invalid trees, otherwise we risk endlessly trying to fix it. 
Reviewers: rsmith Subscribers: cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D84067 (cherry picked from commit dde98c82c0ad02410229e7e5c9efcbb0ab42a995) --- clang/include/clang/Sema/SemaInternal.h | 5 +++ clang/lib/Sema/SemaExprCXX.cpp | 25 +++++++----- clang/test/Sema/typo-correction-no-hang.cpp | 40 +++++++++++++++++++ clang/test/Sema/typo-correction-recursive.cpp | 12 ++++++ 4 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 clang/test/Sema/typo-correction-no-hang.cpp diff --git a/clang/include/clang/Sema/SemaInternal.h b/clang/include/clang/Sema/SemaInternal.h index cdaf7b70a92f..842eec099540 100644 --- a/clang/include/clang/Sema/SemaInternal.h +++ b/clang/include/clang/Sema/SemaInternal.h @@ -168,6 +168,11 @@ class TypoCorrectionConsumer : public VisibleDeclConsumer { return TC; } + /// In the case of deeply invalid expressions, `getNextCorrection()` will + /// never be called since the transform never makes progress. If we don't + /// detect this we risk trying to correct typos forever. + bool hasMadeAnyCorrectionProgress() const { return CurrentTCIndex != 0; } + /// Reset the consumer's position in the stream of viable corrections /// (i.e. getNextCorrection() will return each of the previously returned /// corrections in order before returning any new corrections). diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index d885920b6c14..77bd1ab360b2 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -7977,19 +7977,26 @@ class TransformTypos : public TreeTransform { } } - /// If corrections for the first TypoExpr have been exhausted for a - /// given combination of the other TypoExprs, retry those corrections against - /// the next combination of substitutions for the other TypoExprs by advancing - /// to the next potential correction of the second TypoExpr. 
For the second - /// and subsequent TypoExprs, if its stream of corrections has been exhausted, - /// the stream is reset and the next TypoExpr's stream is advanced by one (a - /// TypoExpr's correction stream is advanced by removing the TypoExpr from the - /// TransformCache). Returns true if there is still any untried combinations - /// of corrections. + /// Try to advance the typo correction state of the first unfinished TypoExpr. + /// We allow advancement of the correction stream by removing it from the + /// TransformCache which allows `TransformTypoExpr` to advance during the + /// next transformation attempt. + /// + /// Any substitution attempts for the previous TypoExprs (which must have been + /// finished) will need to be retried since it's possible that they will now + /// be invalid given the latest advancement. + /// + /// We need to be sure that we're making progress - it's possible that the + /// tree is so malformed that the transform never makes it to the + /// `TransformTypoExpr`. + /// + /// Returns true if there are any untried correction combinations. bool CheckAndAdvanceTypoExprCorrectionStreams() { for (auto TE : TypoExprs) { auto &State = SemaRef.getTypoExprState(TE); TransformCache.erase(TE); + if (!State.Consumer->hasMadeAnyCorrectionProgress()) + return false; if (!State.Consumer->finished()) return true; State.Consumer->resetCorrectionStream(); diff --git a/clang/test/Sema/typo-correction-no-hang.cpp b/clang/test/Sema/typo-correction-no-hang.cpp new file mode 100644 index 000000000000..3c591645be25 --- /dev/null +++ b/clang/test/Sema/typo-correction-no-hang.cpp @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// From `test/Sema/typo-correction.c` but for C++ since the behavior varies +// between the two languages. 
+struct rdar38642201 { + int fieldName; +}; + +void rdar38642201_callee(int x, int y); +void rdar38642201_caller() { + struct rdar38642201 structVar; + rdar38642201_callee( + structVar1.fieldName1.member1, //expected-error{{use of undeclared identifier 'structVar1'}} + structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}} +} + +// Similar reproducer. +class A { +public: + int minut() const = delete; + int hour() const = delete; + + int longit() const; //expected-note{{'longit' declared here}} + int latit() const; +}; + +class B { +public: + A depar() const { return A(); } +}; + +int Foo(const B &b) { + return b.deparT().hours() * 60 + //expected-error{{no member named 'deparT' in 'B'}} + b.deparT().minutes(); //expected-error{{no member named 'deparT' in 'B'}} +} + +int Bar(const B &b) { + return b.depar().longitude() + //expected-error{{no member named 'longitude' in 'A'; did you mean 'longit'?}} + b.depar().latitude(); //expected-error{{no member named 'latitude' in 'A'}} +} diff --git a/clang/test/Sema/typo-correction-recursive.cpp b/clang/test/Sema/typo-correction-recursive.cpp index 48bd3b80c599..b39beb5493f6 100644 --- a/clang/test/Sema/typo-correction-recursive.cpp +++ b/clang/test/Sema/typo-correction-recursive.cpp @@ -118,3 +118,15 @@ int testDeepAmbiguity() { asDeepASItGet(). 
functionE(); } + +struct Dog { + int age; //expected-note{{'age' declared here}} + int size; //expected-note{{'size' declared here}} +}; + +int from_dog_years(int DogYears, int DogSize); +int get_dog_years() { + struct Dog doggo; + return from_dog_years(doggo.agee, //expected-error{{no member named 'agee' in 'Dog'; did you mean 'age'}} + doggo.sizee); //expected-error{{no member named 'sizee' in 'Dog'; did you mean 'size'}} +} From f749d92f7a32f71598e8c1e1f37d7eb261a40ec5 Mon Sep 17 00:00:00 2001 From: lewis-revill Date: Wed, 15 Jul 2020 11:50:03 +0100 Subject: [PATCH 047/363] [RISCV] Add matching of codegen patterns to RISCV Bit Manipulation Zbb asm instructions This patch provides optimization of bit manipulation operations by enabling the +experimental-b target feature. It adds matching of single block patterns of instructions to specific bit-manip instructions from the base subset (zbb subextension) of the experimental B extension of RISC-V. It adds also the correspondent codegen tests. 
This patch is based on Claire Wolf's proposal for the bit manipulation extension of RISCV: https://github.com/riscv/riscv-bitmanip/blob/master/bitmanip-0.92.pdf Differential Revision: https://reviews.llvm.org/D79870 (cherry picked from commit e2692f0ee7f338fea4fc918669643315cefc7678) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 190 +++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 6 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- llvm/lib/Target/RISCV/RISCVInstrInfoB.td | 76 ++ llvm/test/CodeGen/RISCV/rv32Zbb.ll | 1218 +++++++++++++++++++ llvm/test/CodeGen/RISCV/rv64Zbb.ll | 1149 +++++++++++++++++ 6 files changed, 2645 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rv32Zbb.ll create mode 100644 llvm/test/CodeGen/RISCV/rv64Zbb.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index a0ae05081adc..99e5135b424f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -184,6 +184,196 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) { return false; } +// Check that it is a SLOI (Shift Left Ones Immediate). 
We first check that +// it is the right node tree: +// +// (OR (SHL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +// VC1 == maskTrailingOnes(VC2) + +bool RISCVDAGToDAGISel::SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt) { + MVT XLenVT = Subtarget->getXLenVT(); + if (N.getOpcode() == ISD::OR) { + SDValue Or = N; + if (Or.getOperand(0).getOpcode() == ISD::SHL) { + SDValue Shl = Or.getOperand(0); + if (isa(Shl.getOperand(1)) && + isa(Or.getOperand(1))) { + if (XLenVT == MVT::i64) { + uint64_t VC1 = Or.getConstantOperandVal(1); + uint64_t VC2 = Shl.getConstantOperandVal(1); + if (VC1 == maskTrailingOnes(VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Shl.getOperand(1).getValueType()); + return true; + } + } + if (XLenVT == MVT::i32) { + uint32_t VC1 = Or.getConstantOperandVal(1); + uint32_t VC2 = Shl.getConstantOperandVal(1); + if (VC1 == maskTrailingOnes(VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Shl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + return false; +} + +// Check that it is a SROI (Shift Right Ones Immediate). 
We first check that +// it is the right node tree: +// +// (OR (SRL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +// VC1 == maskLeadingOnes(VC2) + +bool RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) { + MVT XLenVT = Subtarget->getXLenVT(); + if (N.getOpcode() == ISD::OR) { + SDValue Or = N; + if (Or.getOperand(0).getOpcode() == ISD::SRL) { + SDValue Srl = Or.getOperand(0); + if (isa(Srl.getOperand(1)) && + isa(Or.getOperand(1))) { + if (XLenVT == MVT::i64) { + uint64_t VC1 = Or.getConstantOperandVal(1); + uint64_t VC2 = Srl.getConstantOperandVal(1); + if (VC1 == maskLeadingOnes(VC2)) { + RS1 = Srl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + if (XLenVT == MVT::i32) { + uint32_t VC1 = Or.getConstantOperandVal(1); + uint32_t VC2 = Srl.getConstantOperandVal(1); + if (VC1 == maskLeadingOnes(VC2)) { + RS1 = Srl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + return false; +} + +// Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32 +// on RV64). +// SLLIUW is the same as SLLI except for the fact that it clears the bits +// XLEN-1:32 of the input RS1 before shifting. +// We first check that it is the right node tree: +// +// (AND (SHL RS1, VC2), VC1) +// +// We check that VC2, the shamt is less than 32, otherwise the pattern is +// exactly the same as SLLI and we give priority to that. 
+// Eventually we check that that VC1, the mask used to clear the upper 32 bits +// of RS1, is correct: +// +// VC1 == (0xFFFFFFFF << VC2) + +bool RISCVDAGToDAGISel::SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt) { + if (N.getOpcode() == ISD::AND && Subtarget->getXLenVT() == MVT::i64) { + SDValue And = N; + if (And.getOperand(0).getOpcode() == ISD::SHL) { + SDValue Shl = And.getOperand(0); + if (isa(Shl.getOperand(1)) && + isa(And.getOperand(1))) { + uint64_t VC1 = And.getConstantOperandVal(1); + uint64_t VC2 = Shl.getConstantOperandVal(1); + if (VC2 < 32 && VC1 == ((uint64_t)0xFFFFFFFF << VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Shl.getOperand(1).getValueType()); + return true; + } + } + } + } + return false; +} + +// Check that it is a SLOIW (Shift Left Ones Immediate i32 on RV64). +// We first check that it is the right node tree: +// +// (SIGN_EXTEND_INREG (OR (SHL RS1, VC2), VC1)) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +// VC1 == maskTrailingOnes(VC2) + +bool RISCVDAGToDAGISel::SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt) { + if (Subtarget->getXLenVT() == MVT::i64 && + N.getOpcode() == ISD::SIGN_EXTEND_INREG && + cast(N.getOperand(1))->getVT() == MVT::i32) { + if (N.getOperand(0).getOpcode() == ISD::OR) { + SDValue Or = N.getOperand(0); + if (Or.getOperand(0).getOpcode() == ISD::SHL) { + SDValue Shl = Or.getOperand(0); + if (isa(Shl.getOperand(1)) && + isa(Or.getOperand(1))) { + uint32_t VC1 = Or.getConstantOperandVal(1); + uint32_t VC2 = Shl.getConstantOperandVal(1); + if (VC1 == maskTrailingOnes(VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Shl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + return false; +} + +// Check that it is a SROIW (Shift Right Ones Immediate i32 on RV64). 
+// We first check that it is the right node tree: +// +// (OR (SHL RS1, VC2), VC1) +// +// and then we check that VC1, the mask used to fill with ones, is compatible +// with VC2, the shamt: +// +// VC1 == maskLeadingOnes(VC2) + +bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) { + if (N.getOpcode() == ISD::OR && Subtarget->getXLenVT() == MVT::i64) { + SDValue Or = N; + if (Or.getOperand(0).getOpcode() == ISD::SRL) { + SDValue Srl = Or.getOperand(0); + if (isa(Srl.getOperand(1)) && + isa(Or.getOperand(1))) { + uint32_t VC1 = Or.getConstantOperandVal(1); + uint32_t VC2 = Srl.getConstantOperandVal(1); + if (VC1 == maskLeadingOnes(VC2)) { + RS1 = Srl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC2, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + } + } + return false; +} + // Merge an ADDI into the offset of a load/store instruction where possible. // (load (addi base, off1), off2) -> (load base, off1+off2) // (store val, (addi base, off1), off2) -> (store val, base, off1+off2) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index dcf733ec3675..4e382ee58500 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -45,6 +45,12 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { bool SelectAddrFI(SDValue Addr, SDValue &Base); + bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt); + // Include the pieces autogenerated from the target description. 
#include "RISCVGenDAGISel.inc" diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 91fc69b5bc10..fb44f826eb6c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -152,9 +152,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTL, XLenVT, Expand); setOperationAction(ISD::ROTR, XLenVT, Expand); setOperationAction(ISD::BSWAP, XLenVT, Expand); - setOperationAction(ISD::CTTZ, XLenVT, Expand); - setOperationAction(ISD::CTLZ, XLenVT, Expand); - setOperationAction(ISD::CTPOP, XLenVT, Expand); + + if (!Subtarget.hasStdExtZbb()) { + setOperationAction(ISD::CTTZ, XLenVT, Expand); + setOperationAction(ISD::CTLZ, XLenVT, Expand); + setOperationAction(ISD::CTPOP, XLenVT, Expand); + } ISD::CondCode FPCCToExtend[] = { ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index 34a463626e29..dc3d6cbb4fe8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -632,3 +632,79 @@ let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] i def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0), (C_ZEXTW GPRC:$rs1)>; } // Predicates = [HasStdExtZbproposedc, HasStdExtC, IsRV64] + +//===----------------------------------------------------------------------===// +// Codegen patterns +//===----------------------------------------------------------------------===// +def SLOIPat : ComplexPattern; +def SROIPat : ComplexPattern; +def SLLIUWPat : ComplexPattern; +def SLOIWPat : ComplexPattern; +def SROIWPat : ComplexPattern; + +let Predicates = [HasStdExtZbb] in { +def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SLO GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SRO GPR:$rs1, GPR:$rs2)>; +def : Pat<(SLOIPat GPR:$rs1, 
uimmlog2xlen:$shamt), + (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt), + (SROI GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>; +def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>; +def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbb, IsRV32] in +def : Pat<(sra (shl GPR:$rs1, (i32 24)), (i32 24)), (SEXTB GPR:$rs1)>; +let Predicates = [HasStdExtZbb, IsRV64] in +def : Pat<(sra (shl GPR:$rs1, (i64 56)), (i64 56)), (SEXTB GPR:$rs1)>; + +let Predicates = [HasStdExtZbb, IsRV32] in +def : Pat<(sra (shl GPR:$rs1, (i32 16)), (i32 16)), (SEXTH GPR:$rs1)>; +let Predicates = [HasStdExtZbb, IsRV64] in +def : Pat<(sra (shl GPR:$rs1, (i64 48)), (i64 48)), (SEXTH GPR:$rs1)>; + +let Predicates = [HasStdExtZbb] in { +def : Pat<(smin GPR:$rs1, GPR:$rs2), (MIN GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 20), GPR:$rs1, GPR:$rs2), + (MIN GPR:$rs1, GPR:$rs2)>; +def : Pat<(smax GPR:$rs1, GPR:$rs2), (MAX GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 20), GPR:$rs1, GPR:$rs2), + (MAX GPR:$rs1, GPR:$rs2)>; +def : Pat<(umin GPR:$rs1, GPR:$rs2), (MINU GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs1, GPR:$rs2, (XLenVT 12), GPR:$rs1, GPR:$rs2), + (MINU GPR:$rs1, GPR:$rs2)>; +def : Pat<(umax GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>; +def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2), + (MAXU GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbb, IsRV64] in { +def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)), + (ADDIWU GPR:$rs, simm12:$simm12)>; +def : Pat<(SLLIUWPat GPR:$rs1, uimmlog2xlen:$shamt), + (SLLIUW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(and (add GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), + (ADDWU GPR:$rs1, GPR:$rs2)>; +def : Pat<(and (sub GPR:$rs1, GPR:$rs2), (i64 0xFFFFFFFF)), + (SUBWU GPR:$rs1, 
GPR:$rs2)>; +def : Pat<(add GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), + (ADDUW GPR:$rs1, GPR:$rs2)>; +def : Pat<(sub GPR:$rs1, (and GPR:$rs2, (i64 0xFFFFFFFF))), + (SUBUW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SLOW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1), + (SROW GPR:$rs1, GPR:$rs2)>; +def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt), + (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt), + (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), + (CLZW GPR:$rs1)>; +// We don't pattern-match CTZW here as it has the same pattern and result as +// RV64 CTZ +def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>; +} // Predicates = [HasStdExtZbb, IsRV64] diff --git a/llvm/test/CodeGen/RISCV/rv32Zbb.ll b/llvm/test/CodeGen/RISCV/rv32Zbb.ll new file mode 100644 index 000000000000..6933bad1f8cd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv32Zbb.ll @@ -0,0 +1,1218 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IBB + +define i32 @slo_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: slo_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: slo_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: slo a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: slo_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: slo a0, a0, a1 +; RV32IBB-NEXT: ret + %neg = xor i32 %a, -1 + %shl = shl i32 %neg, %b + %neg1 = xor i32 %shl, -1 + ret i32 
%neg1 +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. + +define i64 @slo_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: slo_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a3, a2, -32 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: bltz a3, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a2, zero +; RV32I-NEXT: sll a1, a0, a3 +; RV32I-NEXT: j .LBB1_3 +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: addi a3, zero, 31 +; RV32I-NEXT: sub a3, a3, a2 +; RV32I-NEXT: srli a4, a0, 1 +; RV32I-NEXT: srl a3, a4, a3 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: sll a2, a0, a2 +; RV32I-NEXT: .LBB1_3: +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: not a0, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: slo_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a3, a2, -32 +; RV32IB-NEXT: not a0, a0 +; RV32IB-NEXT: bltz a3, .LBB1_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: mv a2, zero +; RV32IB-NEXT: sll a1, a0, a3 +; RV32IB-NEXT: j .LBB1_3 +; RV32IB-NEXT: .LBB1_2: +; RV32IB-NEXT: not a1, a1 +; RV32IB-NEXT: sll a1, a1, a2 +; RV32IB-NEXT: addi a3, zero, 31 +; RV32IB-NEXT: sub a3, a3, a2 +; RV32IB-NEXT: srli a4, a0, 1 +; RV32IB-NEXT: srl a3, a4, a3 +; RV32IB-NEXT: or a1, a1, a3 +; RV32IB-NEXT: sll a2, a0, a2 +; RV32IB-NEXT: .LBB1_3: +; RV32IB-NEXT: not a1, a1 +; RV32IB-NEXT: not a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: slo_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: addi a3, a2, -32 +; RV32IBB-NEXT: not a0, a0 +; RV32IBB-NEXT: bltz a3, .LBB1_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: mv a2, zero +; RV32IBB-NEXT: sll a1, a0, a3 +; RV32IBB-NEXT: j .LBB1_3 +; RV32IBB-NEXT: .LBB1_2: +; RV32IBB-NEXT: not a1, a1 +; RV32IBB-NEXT: sll a1, a1, a2 +; RV32IBB-NEXT: addi a3, zero, 31 +; RV32IBB-NEXT: sub a3, a3, a2 +; 
RV32IBB-NEXT: srli a4, a0, 1 +; RV32IBB-NEXT: srl a3, a4, a3 +; RV32IBB-NEXT: or a1, a1, a3 +; RV32IBB-NEXT: sll a2, a0, a2 +; RV32IBB-NEXT: .LBB1_3: +; RV32IBB-NEXT: not a1, a1 +; RV32IBB-NEXT: not a0, a2 +; RV32IBB-NEXT: ret + %neg = xor i64 %a, -1 + %shl = shl i64 %neg, %b + %neg1 = xor i64 %shl, -1 + ret i64 %neg1 +} + +define i32 @sro_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sro_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sro_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sro a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sro_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sro a0, a0, a1 +; RV32IBB-NEXT: ret + %neg = xor i32 %a, -1 + %shr = lshr i32 %neg, %b + %neg1 = xor i32 %shr, -1 + ret i32 %neg1 +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +define i64 @sro_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: sro_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a3, a2, -32 +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: bltz a3, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a2, zero +; RV32I-NEXT: srl a0, a1, a3 +; RV32I-NEXT: j .LBB3_3 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: addi a3, zero, 31 +; RV32I-NEXT: sub a3, a3, a2 +; RV32I-NEXT: slli a4, a1, 1 +; RV32I-NEXT: sll a3, a4, a3 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: srl a2, a1, a2 +; RV32I-NEXT: .LBB3_3: +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: not a1, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sro_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a3, a2, -32 +; RV32IB-NEXT: not a1, a1 +; RV32IB-NEXT: bltz a3, .LBB3_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: mv a2, zero +; RV32IB-NEXT: srl a0, a1, a3 +; RV32IB-NEXT: j .LBB3_3 +; RV32IB-NEXT: .LBB3_2: +; RV32IB-NEXT: not a0, a0 +; RV32IB-NEXT: srl a0, a0, a2 +; RV32IB-NEXT: addi a3, zero, 31 +; RV32IB-NEXT: sub a3, a3, a2 +; RV32IB-NEXT: slli a4, a1, 1 +; RV32IB-NEXT: sll a3, a4, a3 +; RV32IB-NEXT: or a0, a0, a3 +; RV32IB-NEXT: srl a2, a1, a2 +; RV32IB-NEXT: .LBB3_3: +; RV32IB-NEXT: not a0, a0 +; RV32IB-NEXT: not a1, a2 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sro_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: addi a3, a2, -32 +; RV32IBB-NEXT: not a1, a1 +; RV32IBB-NEXT: bltz a3, .LBB3_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: mv a2, zero +; RV32IBB-NEXT: srl a0, a1, a3 +; RV32IBB-NEXT: j .LBB3_3 +; RV32IBB-NEXT: .LBB3_2: +; RV32IBB-NEXT: not a0, a0 +; RV32IBB-NEXT: srl a0, a0, a2 +; RV32IBB-NEXT: addi a3, zero, 31 +; RV32IBB-NEXT: sub a3, a3, a2 +; RV32IBB-NEXT: slli a4, a1, 1 +; RV32IBB-NEXT: sll a3, a4, a3 +; RV32IBB-NEXT: or a0, a0, a3 +; RV32IBB-NEXT: srl a2, a1, a2 +; RV32IBB-NEXT: .LBB3_3: +; RV32IBB-NEXT: not a0, a0 +; RV32IBB-NEXT: not a1, a2 +; RV32IBB-NEXT: ret + %neg = xor i64 %a, -1 + %shr = lshr i64 %neg, %b + %neg1 = xor i64 %shr, -1 + ret 
i64 %neg1 +} + +define i32 @sloi_i32(i32 %a) nounwind { +; RV32I-LABEL: sloi_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: ori a0, a0, 1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sloi_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sloi a0, a0, 1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sloi_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sloi a0, a0, 1 +; RV32IBB-NEXT: ret + %neg = shl i32 %a, 1 + %neg12 = or i32 %neg, 1 + ret i32 %neg12 +} + +define i64 @sloi_i64(i64 %a) nounwind { +; RV32I-LABEL: sloi_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a2, a0, 31 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: slli a0, a0, 1 +; RV32I-NEXT: ori a0, a0, 1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sloi_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a2, zero, 1 +; RV32IB-NEXT: fsl a1, a1, a2, a0 +; RV32IB-NEXT: sloi a0, a0, 1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sloi_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: srli a2, a0, 31 +; RV32IBB-NEXT: slli a1, a1, 1 +; RV32IBB-NEXT: or a1, a1, a2 +; RV32IBB-NEXT: sloi a0, a0, 1 +; RV32IBB-NEXT: ret + %neg = shl i64 %a, 1 + %neg12 = or i64 %neg, 1 + ret i64 %neg12 +} + +define i32 @sroi_i32(i32 %a) nounwind { +; RV32I-LABEL: sroi_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sroi_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sroi a0, a0, 1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sroi_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sroi a0, a0, 1 +; RV32IBB-NEXT: ret + %neg = lshr i32 %a, 1 + %neg12 = or i32 %neg, -2147483648 + ret i32 %neg12 +} + +define i64 @sroi_i64(i64 %a) nounwind { +; RV32I-LABEL: sroi_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 31 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: lui a2, 524288 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sroi_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a2, zero, 
31 +; RV32IB-NEXT: fsl a0, a1, a2, a0 +; RV32IB-NEXT: sroi a1, a1, 1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sroi_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: slli a2, a1, 31 +; RV32IBB-NEXT: srli a0, a0, 1 +; RV32IBB-NEXT: or a0, a0, a2 +; RV32IBB-NEXT: sroi a1, a1, 1 +; RV32IBB-NEXT: ret + %neg = lshr i64 %a, 1 + %neg12 = or i64 %neg, -9223372036854775808 + ret i64 %neg12 +} + +declare i32 @llvm.ctlz.i32(i32, i1) + +define i32 @ctlz_i32(i32 %a) nounwind { +; RV32I-LABEL: ctlz_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: beqz a0, .LBB8_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: j .LBB8_3 +; RV32I-NEXT: .LBB8_2: +; RV32I-NEXT: addi a0, zero, 32 +; RV32I-NEXT: .LBB8_3: # %cond.end +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: ctlz_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: beqz a0, .LBB8_2 +; RV32IB-NEXT: # %bb.1: # %cond.false +; RV32IB-NEXT: clz a0, a0 +; RV32IB-NEXT: ret +; RV32IB-NEXT: .LBB8_2: +; RV32IB-NEXT: addi a0, zero, 32 +; 
RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ctlz_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: beqz a0, .LBB8_2 +; RV32IBB-NEXT: # %bb.1: # %cond.false +; RV32IBB-NEXT: clz a0, a0 +; RV32IBB-NEXT: ret +; RV32IBB-NEXT: .LBB8_2: +; RV32IBB-NEXT: addi a0, zero, 32 +; RV32IBB-NEXT: ret + %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + ret i32 %1 +} + +declare i64 @llvm.ctlz.i64(i64, i1) + +define i64 @ctlz_i64(i64 %a) nounwind { +; RV32I-LABEL: ctlz_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: sw s6, 0(sp) +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi s5, a2, 1365 +; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi s1, a1, 819 +; RV32I-NEXT: and a1, a0, s1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi s6, a1, -241 +; RV32I-NEXT: and a0, a0, s6 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi s0, a1, 257 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: srli a0, s4, 1 +; RV32I-NEXT: or a0, s4, a0 +; RV32I-NEXT: srli a1, a0, 2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 16 +; 
RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, s1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s6 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: bnez s3, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: j .LBB9_3 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: srli a0, s2, 24 +; RV32I-NEXT: .LBB9_3: +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: lw s6, 0(sp) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: ctlz_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: bnez a1, .LBB9_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: clz a0, a0 +; RV32IB-NEXT: addi a0, a0, 32 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; RV32IB-NEXT: .LBB9_2: +; RV32IB-NEXT: clz a0, a1 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ctlz_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: bnez a1, .LBB9_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: clz a0, a0 +; RV32IBB-NEXT: addi a0, a0, 32 +; RV32IBB-NEXT: mv a1, zero +; RV32IBB-NEXT: ret +; RV32IBB-NEXT: .LBB9_2: +; RV32IBB-NEXT: clz a0, a1 +; RV32IBB-NEXT: mv a1, zero +; RV32IBB-NEXT: ret + %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false) + ret i64 %1 +} + +declare i32 @llvm.cttz.i32(i32, i1) + +define i32 @cttz_i32(i32 %a) nounwind { +; RV32I-LABEL: cttz_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: beqz a0, .LBB10_2 +; RV32I-NEXT: # %bb.1: # %cond.false +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: and a0, a0, a1 +; 
RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: j .LBB10_3 +; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: addi a0, zero, 32 +; RV32I-NEXT: .LBB10_3: # %cond.end +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: cttz_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: beqz a0, .LBB10_2 +; RV32IB-NEXT: # %bb.1: # %cond.false +; RV32IB-NEXT: ctz a0, a0 +; RV32IB-NEXT: ret +; RV32IB-NEXT: .LBB10_2: +; RV32IB-NEXT: addi a0, zero, 32 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: cttz_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: beqz a0, .LBB10_2 +; RV32IBB-NEXT: # %bb.1: # %cond.false +; RV32IBB-NEXT: ctz a0, a0 +; RV32IBB-NEXT: ret +; RV32IBB-NEXT: .LBB10_2: +; RV32IBB-NEXT: addi a0, zero, 32 +; RV32IBB-NEXT: ret + %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false) + ret i32 %1 +} + +declare i64 @llvm.cttz.i64(i64, i1) + +define i64 @cttz_i64(i64 %a) nounwind { +; RV32I-LABEL: cttz_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: sw s6, 0(sp) +; RV32I-NEXT: mv s3, a1 +; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: not a1, s4 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi s5, a2, 1365 +; 
RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi s0, a1, 819 +; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi s6, a1, -241 +; RV32I-NEXT: and a0, a0, s6 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi s1, a1, 257 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: addi a0, s3, -1 +; RV32I-NEXT: not a1, s3 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s6 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: bnez s4, .LBB11_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: j .LBB11_3 +; RV32I-NEXT: .LBB11_2: +; RV32I-NEXT: srli a0, s2, 24 +; RV32I-NEXT: .LBB11_3: +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: lw s6, 0(sp) +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: cttz_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: bnez a0, .LBB11_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: ctz a0, a1 +; RV32IB-NEXT: addi a0, a0, 32 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; RV32IB-NEXT: .LBB11_2: +; RV32IB-NEXT: ctz a0, a0 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: cttz_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: bnez a0, .LBB11_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: ctz a0, a1 +; RV32IBB-NEXT: addi a0, a0, 32 +; 
RV32IBB-NEXT: mv a1, zero +; RV32IBB-NEXT: ret +; RV32IBB-NEXT: .LBB11_2: +; RV32IBB-NEXT: ctz a0, a0 +; RV32IBB-NEXT: mv a1, zero +; RV32IBB-NEXT: ret + %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false) + ret i64 %1 +} + +declare i32 @llvm.ctpop.i32(i32) + +define i32 @ctpop_i32(i32 %a) nounwind { +; RV32I-LABEL: ctpop_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: and a2, a0, a1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: add a0, a2, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: lw ra, 12(sp) +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: ctpop_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: pcnt a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ctpop_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: pcnt a0, a0 +; RV32IBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %1 +} + +declare i64 @llvm.ctpop.i64(i64) + +define i64 @ctpop_i64(i64 %a) nounwind { +; RV32I-LABEL: ctpop_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) +; RV32I-NEXT: sw s0, 24(sp) +; RV32I-NEXT: sw s1, 20(sp) +; RV32I-NEXT: sw s2, 16(sp) +; RV32I-NEXT: sw s3, 12(sp) +; RV32I-NEXT: sw s4, 8(sp) +; RV32I-NEXT: sw s5, 4(sp) +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi s3, a2, 1365 +; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi s0, a1, 819 +; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: srli 
a0, a0, 2 +; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi s4, a1, -241 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi s1, a1, 257 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli s5, a0, 24 +; RV32I-NEXT: srli a0, s2, 1 +; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: add a0, a0, s5 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: lw s5, 4(sp) +; RV32I-NEXT: lw s4, 8(sp) +; RV32I-NEXT: lw s3, 12(sp) +; RV32I-NEXT: lw s2, 16(sp) +; RV32I-NEXT: lw s1, 20(sp) +; RV32I-NEXT: lw s0, 24(sp) +; RV32I-NEXT: lw ra, 28(sp) +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: ctpop_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: pcnt a1, a1 +; RV32IB-NEXT: pcnt a0, a0 +; RV32IB-NEXT: add a0, a0, a1 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ctpop_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: pcnt a1, a1 +; RV32IBB-NEXT: pcnt a0, a0 +; RV32IBB-NEXT: add a0, a0, a1 +; RV32IBB-NEXT: mv a1, zero +; RV32IBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %1 +} + +define i32 @sextb_i32(i32 %a) nounwind { +; RV32I-LABEL: sextb_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: srai a0, a0, 24 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sextb_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sext.b a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sextb_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sext.b a0, a0 +; RV32IBB-NEXT: ret + %shl = shl i32 %a, 24 + %shr = ashr exact i32 %shl, 24 + ret i32 %shr +} + +define i64 @sextb_i64(i64 %a) nounwind { +; 
RV32I-LABEL: sextb_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a0, a1, 24 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sextb_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sext.b a2, a0 +; RV32IB-NEXT: slli a0, a0, 24 +; RV32IB-NEXT: srai a1, a0, 31 +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sextb_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sext.b a2, a0 +; RV32IBB-NEXT: slli a0, a0, 24 +; RV32IBB-NEXT: srai a1, a0, 31 +; RV32IBB-NEXT: mv a0, a2 +; RV32IBB-NEXT: ret + %shl = shl i64 %a, 56 + %shr = ashr exact i64 %shl, 56 + ret i64 %shr +} + +define i32 @sexth_i32(i32 %a) nounwind { +; RV32I-LABEL: sexth_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sexth_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sext.h a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sexth_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sext.h a0, a0 +; RV32IBB-NEXT: ret + %shl = shl i32 %a, 16 + %shr = ashr exact i32 %shl, 16 + ret i32 %shr +} + +define i64 @sexth_i64(i64 %a) nounwind { +; RV32I-LABEL: sexth_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a0, a1, 16 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sexth_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sext.h a2, a0 +; RV32IB-NEXT: slli a0, a0, 16 +; RV32IB-NEXT: srai a1, a0, 31 +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: sexth_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: sext.h a2, a0 +; RV32IBB-NEXT: slli a0, a0, 16 +; RV32IBB-NEXT: srai a1, a0, 31 +; RV32IBB-NEXT: mv a0, a2 +; RV32IBB-NEXT: ret + %shl = shl i64 %a, 48 + %shr = ashr exact i64 %shl, 48 + ret i64 %shr +} + +define i32 @min_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: min_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: blt a0, a1, .LBB18_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: .LBB18_2: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: min_i32: +; RV32IB: 
# %bb.0: +; RV32IB-NEXT: min a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: min_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: min a0, a0, a1 +; RV32IBB-NEXT: ret + %cmp = icmp slt i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. + +define i64 @min_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: min_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB19_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: beqz a4, .LBB19_3 +; RV32I-NEXT: j .LBB19_4 +; RV32I-NEXT: .LBB19_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: bnez a4, .LBB19_4 +; RV32I-NEXT: .LBB19_3: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB19_4: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: min_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: beq a1, a3, .LBB19_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: slt a4, a1, a3 +; RV32IB-NEXT: beqz a4, .LBB19_3 +; RV32IB-NEXT: j .LBB19_4 +; RV32IB-NEXT: .LBB19_2: +; RV32IB-NEXT: sltu a4, a0, a2 +; RV32IB-NEXT: bnez a4, .LBB19_4 +; RV32IB-NEXT: .LBB19_3: +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: mv a1, a3 +; RV32IB-NEXT: .LBB19_4: +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: min_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: beq a1, a3, .LBB19_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: slt a4, a1, a3 +; RV32IBB-NEXT: beqz a4, .LBB19_3 +; RV32IBB-NEXT: j .LBB19_4 +; RV32IBB-NEXT: .LBB19_2: +; RV32IBB-NEXT: sltu a4, a0, a2 +; RV32IBB-NEXT: bnez a4, .LBB19_4 +; RV32IBB-NEXT: .LBB19_3: +; RV32IBB-NEXT: mv a0, a2 +; RV32IBB-NEXT: mv a1, a3 +; RV32IBB-NEXT: .LBB19_4: +; RV32IBB-NEXT: ret + %cmp = icmp slt i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +define i32 @max_i32(i32 %a, i32 %b) nounwind { +; 
RV32I-LABEL: max_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: blt a1, a0, .LBB20_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: .LBB20_2: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: max_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: max a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: max_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: max a0, a0, a1 +; RV32IBB-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. + +define i64 @max_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: max_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB21_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a3, a1 +; RV32I-NEXT: beqz a4, .LBB21_3 +; RV32I-NEXT: j .LBB21_4 +; RV32I-NEXT: .LBB21_2: +; RV32I-NEXT: sltu a4, a2, a0 +; RV32I-NEXT: bnez a4, .LBB21_4 +; RV32I-NEXT: .LBB21_3: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB21_4: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: max_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: beq a1, a3, .LBB21_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: slt a4, a3, a1 +; RV32IB-NEXT: beqz a4, .LBB21_3 +; RV32IB-NEXT: j .LBB21_4 +; RV32IB-NEXT: .LBB21_2: +; RV32IB-NEXT: sltu a4, a2, a0 +; RV32IB-NEXT: bnez a4, .LBB21_4 +; RV32IB-NEXT: .LBB21_3: +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: mv a1, a3 +; RV32IB-NEXT: .LBB21_4: +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: max_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: beq a1, a3, .LBB21_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: slt a4, a3, a1 +; RV32IBB-NEXT: beqz a4, .LBB21_3 +; RV32IBB-NEXT: j .LBB21_4 +; RV32IBB-NEXT: .LBB21_2: +; RV32IBB-NEXT: sltu a4, a2, a0 +; RV32IBB-NEXT: bnez a4, .LBB21_4 +; RV32IBB-NEXT: .LBB21_3: +; RV32IBB-NEXT: mv a0, a2 +; 
RV32IBB-NEXT: mv a1, a3 +; RV32IBB-NEXT: .LBB21_4: +; RV32IBB-NEXT: ret + %cmp = icmp sgt i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +define i32 @minu_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: minu_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: bltu a0, a1, .LBB22_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: .LBB22_2: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: minu_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: minu a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: minu_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: minu a0, a0, a1 +; RV32IBB-NEXT: ret + %cmp = icmp ult i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. + +define i64 @minu_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: minu_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB23_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: beqz a4, .LBB23_3 +; RV32I-NEXT: j .LBB23_4 +; RV32I-NEXT: .LBB23_2: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: bnez a4, .LBB23_4 +; RV32I-NEXT: .LBB23_3: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB23_4: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: minu_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: beq a1, a3, .LBB23_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: sltu a4, a1, a3 +; RV32IB-NEXT: beqz a4, .LBB23_3 +; RV32IB-NEXT: j .LBB23_4 +; RV32IB-NEXT: .LBB23_2: +; RV32IB-NEXT: sltu a4, a0, a2 +; RV32IB-NEXT: bnez a4, .LBB23_4 +; RV32IB-NEXT: .LBB23_3: +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: mv a1, a3 +; RV32IB-NEXT: .LBB23_4: +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: minu_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: beq a1, a3, .LBB23_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: sltu a4, a1, 
a3 +; RV32IBB-NEXT: beqz a4, .LBB23_3 +; RV32IBB-NEXT: j .LBB23_4 +; RV32IBB-NEXT: .LBB23_2: +; RV32IBB-NEXT: sltu a4, a0, a2 +; RV32IBB-NEXT: bnez a4, .LBB23_4 +; RV32IBB-NEXT: .LBB23_3: +; RV32IBB-NEXT: mv a0, a2 +; RV32IBB-NEXT: mv a1, a3 +; RV32IBB-NEXT: .LBB23_4: +; RV32IBB-NEXT: ret + %cmp = icmp ult i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +define i32 @maxu_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: maxu_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: bltu a1, a0, .LBB24_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: maxu_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: maxu a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: maxu_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: maxu a0, a0, a1 +; RV32IBB-NEXT: ret + %cmp = icmp ugt i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +define i64 @maxu_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: maxu_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB25_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a3, a1 +; RV32I-NEXT: beqz a4, .LBB25_3 +; RV32I-NEXT: j .LBB25_4 +; RV32I-NEXT: .LBB25_2: +; RV32I-NEXT: sltu a4, a2, a0 +; RV32I-NEXT: bnez a4, .LBB25_4 +; RV32I-NEXT: .LBB25_3: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: .LBB25_4: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: maxu_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: beq a1, a3, .LBB25_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: sltu a4, a3, a1 +; RV32IB-NEXT: beqz a4, .LBB25_3 +; RV32IB-NEXT: j .LBB25_4 +; RV32IB-NEXT: .LBB25_2: +; RV32IB-NEXT: sltu a4, a2, a0 +; RV32IB-NEXT: bnez a4, .LBB25_4 +; RV32IB-NEXT: .LBB25_3: +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: mv a1, a3 +; RV32IB-NEXT: .LBB25_4: +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: maxu_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: beq a1, a3, .LBB25_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: sltu a4, a3, a1 +; RV32IBB-NEXT: beqz a4, .LBB25_3 +; RV32IBB-NEXT: j .LBB25_4 +; RV32IBB-NEXT: .LBB25_2: +; RV32IBB-NEXT: sltu a4, a2, a0 +; RV32IBB-NEXT: bnez a4, .LBB25_4 +; RV32IBB-NEXT: .LBB25_3: +; RV32IBB-NEXT: mv a0, a2 +; RV32IBB-NEXT: mv a1, a3 +; RV32IBB-NEXT: .LBB25_4: +; RV32IBB-NEXT: ret + %cmp = icmp ugt i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} diff --git a/llvm/test/CodeGen/RISCV/rv64Zbb.ll b/llvm/test/CodeGen/RISCV/rv64Zbb.ll new file mode 100644 index 000000000000..2e4b69e4997b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64Zbb.ll @@ -0,0 +1,1149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IB +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbb -verify-machineinstrs < %s 
\ +; RUN: | FileCheck %s -check-prefix=RV64IBB + +define signext i32 @slo_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: slo_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: sllw a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: slo_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: slow a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: slo_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: slow a0, a0, a1 +; RV64IBB-NEXT: ret + %neg = xor i32 %a, -1 + %shl = shl i32 %neg, %b + %neg1 = xor i32 %shl, -1 + ret i32 %neg1 +} + +define i64 @slo_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: slo_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: slo_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: slo a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: slo_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: slo a0, a0, a1 +; RV64IBB-NEXT: ret + %neg = xor i64 %a, -1 + %shl = shl i64 %neg, %b + %neg1 = xor i64 %shl, -1 + ret i64 %neg1 +} + +define signext i32 @sro_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: sro_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srlw a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sro_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: srow a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sro_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: srow a0, a0, a1 +; RV64IBB-NEXT: ret + %neg = xor i32 %a, -1 + %shr = lshr i32 %neg, %b + %neg1 = xor i32 %shr, -1 + ret i32 %neg1 +} + +define i64 @sro_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: sro_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sro_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sro a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sro_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sro a0, a0, a1 +; RV64IBB-NEXT: ret + %neg = xor 
i64 %a, -1 + %shr = lshr i64 %neg, %b + %neg1 = xor i64 %shr, -1 + ret i64 %neg1 +} + +define signext i32 @sloi_i32(i32 signext %a) nounwind { +; RV64I-LABEL: sloi_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: ori a0, a0, 1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sloi_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sloiw a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sloi_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sloiw a0, a0, 1 +; RV64IBB-NEXT: ret + %neg = shl i32 %a, 1 + %neg12 = or i32 %neg, 1 + ret i32 %neg12 +} + +define i64 @sloi_i64(i64 %a) nounwind { +; RV64I-LABEL: sloi_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: ori a0, a0, 1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sloi_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sloi a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sloi_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sloi a0, a0, 1 +; RV64IBB-NEXT: ret + %neg = shl i64 %a, 1 + %neg12 = or i64 %neg, 1 + ret i64 %neg12 +} + +define signext i32 @sroi_i32(i32 signext %a) nounwind { +; RV64I-LABEL: sroi_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a1, 524288 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sroi_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sroiw a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sroi_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sroiw a0, a0, 1 +; RV64IBB-NEXT: ret + %neg = lshr i32 %a, 1 + %neg12 = or i32 %neg, -2147483648 + ret i32 %neg12 +} + +define i64 @sroi_i64(i64 %a) nounwind { +; RV64I-LABEL: sroi_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: addi a1, zero, -1 +; RV64I-NEXT: slli a1, a1, 63 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sroi_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sroi a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sroi_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sroi a0, a0, 1 +; RV64IBB-NEXT: ret + %neg = lshr i64 %a, 1 + %neg12 = or i64 %neg, 
-9223372036854775808 + ret i64 %neg12 +} + +declare i32 @llvm.ctlz.i32(i32, i1) + +define signext i32 @ctlz_i32(i32 signext %a) nounwind { +; RV64I-LABEL: ctlz_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: beqz a0, .LBB8_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 
257 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: addi a0, a0, -32 +; RV64I-NEXT: j .LBB8_3 +; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: addi a0, zero, 32 +; RV64I-NEXT: .LBB8_3: # %cond.end +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: ctlz_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: beqz a0, .LBB8_2 +; RV64IB-NEXT: # %bb.1: # %cond.false +; RV64IB-NEXT: clzw a0, a0 +; RV64IB-NEXT: ret +; RV64IB-NEXT: .LBB8_2: +; RV64IB-NEXT: addi a0, zero, 32 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: ctlz_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: beqz a0, .LBB8_2 +; RV64IBB-NEXT: # %bb.1: # %cond.false +; RV64IBB-NEXT: clzw a0, a0 +; RV64IBB-NEXT: ret +; RV64IBB-NEXT: .LBB8_2: +; RV64IBB-NEXT: addi a0, zero, 32 +; RV64IBB-NEXT: ret + %1 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + ret i32 %1 +} + +declare i64 @llvm.ctlz.i64(i64, i1) + +define i64 @ctlz_i64(i64 %a) nounwind { +; RV64I-LABEL: ctlz_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: beqz a0, .LBB9_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, 
a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: j .LBB9_3 +; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: addi a0, zero, 64 +; RV64I-NEXT: .LBB9_3: # %cond.end +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: ctlz_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: beqz a0, .LBB9_2 +; RV64IB-NEXT: # %bb.1: # %cond.false +; RV64IB-NEXT: clz a0, a0 +; RV64IB-NEXT: ret +; RV64IB-NEXT: .LBB9_2: +; RV64IB-NEXT: addi a0, zero, 64 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: ctlz_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: beqz a0, .LBB9_2 +; RV64IBB-NEXT: # %bb.1: # %cond.false +; RV64IBB-NEXT: clz a0, a0 +; RV64IBB-NEXT: ret +; RV64IBB-NEXT: .LBB9_2: +; RV64IBB-NEXT: addi a0, zero, 64 +; RV64IBB-NEXT: ret + %1 = call i64 @llvm.ctlz.i64(i64 %a, i1 false) + ret i64 %1 +} + +declare i32 @llvm.cttz.i32(i32, i1) + +define signext i32 @cttz_i32(i32 signext %a) nounwind { +; RV64I-LABEL: cttz_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: beqz a0, .LBB10_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; 
RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: j .LBB10_3 +; RV64I-NEXT: .LBB10_2: +; RV64I-NEXT: addi a0, zero, 32 +; RV64I-NEXT: .LBB10_3: # %cond.end +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: cttz_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: beqz a0, .LBB10_2 +; RV64IB-NEXT: # %bb.1: # %cond.false +; RV64IB-NEXT: ctz a0, a0 +; RV64IB-NEXT: ret +; RV64IB-NEXT: .LBB10_2: +; RV64IB-NEXT: addi a0, zero, 32 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: cttz_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: beqz a0, .LBB10_2 +; RV64IBB-NEXT: # %bb.1: # %cond.false +; RV64IBB-NEXT: ctz a0, a0 +; RV64IBB-NEXT: ret +; RV64IBB-NEXT: .LBB10_2: +; RV64IBB-NEXT: addi a0, zero, 32 +; RV64IBB-NEXT: ret + %1 = call i32 @llvm.cttz.i32(i32 %a, i1 false) 
+ ret i32 %1 +} + +declare i64 @llvm.cttz.i64(i64, i1) + +define i64 @cttz_i64(i64 %a) nounwind { +; RV64I-LABEL: cttz_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: beqz a0, .LBB11_2 +; RV64I-NEXT: # %bb.1: # %cond.false +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: j .LBB11_3 +; RV64I-NEXT: .LBB11_2: +; RV64I-NEXT: addi a0, zero, 64 +; RV64I-NEXT: .LBB11_3: # %cond.end +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: cttz_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: beqz a0, .LBB11_2 +; RV64IB-NEXT: # %bb.1: # %cond.false +; 
RV64IB-NEXT: ctz a0, a0 +; RV64IB-NEXT: ret +; RV64IB-NEXT: .LBB11_2: +; RV64IB-NEXT: addi a0, zero, 64 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: cttz_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: beqz a0, .LBB11_2 +; RV64IBB-NEXT: # %bb.1: # %cond.false +; RV64IBB-NEXT: ctz a0, a0 +; RV64IBB-NEXT: ret +; RV64IBB-NEXT: .LBB11_2: +; RV64IBB-NEXT: addi a0, zero, 64 +; RV64IBB-NEXT: ret + %1 = call i64 @llvm.cttz.i64(i64 %a, i1 false) + ret i64 %1 +} + +declare i32 @llvm.ctpop.i32(i32) + +define signext i32 @ctpop_i32(i32 signext %a) nounwind { +; RV64I-LABEL: ctpop_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: srliw a0, a0, 1 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 2 +; RV64I-NEXT: lui a2, 13107 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: ctpop_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: pcntw a0, a0 +; 
RV64IB-NEXT: ret +; +; RV64IBB-LABEL: ctpop_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: pcntw a0, a0 +; RV64IBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %1 +} + +declare i64 @llvm.ctpop.i64(i64) + +define i64 @ctpop_i64(i64 %a) nounwind { +; RV64I-LABEL: ctpop_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a2, a0, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 257 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: ld ra, 8(sp) +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: ctpop_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: pcnt a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: ctpop_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: pcnt a0, a0 +; RV64IBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 
%1 +} + +define signext i32 @sextb_i32(i32 signext %a) nounwind { +; RV64I-LABEL: sextb_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sextb_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sext.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sextb_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sext.b a0, a0 +; RV64IBB-NEXT: ret + %shl = shl i32 %a, 24 + %shr = ashr exact i32 %shl, 24 + ret i32 %shr +} + +define i64 @sextb_i64(i64 %a) nounwind { +; RV64I-LABEL: sextb_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: srai a0, a0, 56 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sextb_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sext.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sextb_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sext.b a0, a0 +; RV64IBB-NEXT: ret + %shl = shl i64 %a, 56 + %shr = ashr exact i64 %shl, 56 + ret i64 %shr +} + +define signext i32 @sexth_i32(i32 signext %a) nounwind { +; RV64I-LABEL: sexth_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sexth_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sext.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sexth_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sext.h a0, a0 +; RV64IBB-NEXT: ret + %shl = shl i32 %a, 16 + %shr = ashr exact i32 %shl, 16 + ret i32 %shr +} + +define i64 @sexth_i64(i64 %a) nounwind { +; RV64I-LABEL: sexth_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 48 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sexth_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sext.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: sexth_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: sext.h a0, a0 +; RV64IBB-NEXT: ret + %shl = shl i64 %a, 48 + %shr = ashr exact i64 %shl, 48 + ret i64 %shr +} + +define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: min_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: blt a0, a1, .LBB18_2 
+; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB18_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: min_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: min a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: min_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: min a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp slt i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +define i64 @min_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: min_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: blt a0, a1, .LBB19_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB19_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: min_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: min a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: min_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: min a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp slt i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: max_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: blt a1, a0, .LBB20_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB20_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: max_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: max a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: max_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: max a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +define i64 @max_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: max_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: blt a1, a0, .LBB21_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB21_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: max_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: max a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: max_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: max a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp sgt i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +define signext i32 @minu_i32(i32 
signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: minu_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: bltu a0, a1, .LBB22_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB22_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: minu_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: minu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: minu_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: minu a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp ult i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +define i64 @minu_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: minu_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bltu a0, a1, .LBB23_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB23_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: minu_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: minu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: minu_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: minu a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp ult i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: maxu_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: bltu a1, a0, .LBB24_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB24_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: maxu_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: maxu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: maxu_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: maxu a0, a0, a1 +; RV64IBB-NEXT: ret + %cmp = icmp ugt i32 %a, %b + %cond = select i1 %cmp, i32 %a, i32 %b + ret i32 %cond +} + +define i64 @maxu_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: maxu_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bltu a1, a0, .LBB25_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: .LBB25_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: maxu_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: maxu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: maxu_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: maxu a0, a0, a1 +; 
RV64IBB-NEXT: ret + %cmp = icmp ugt i64 %a, %b + %cond = select i1 %cmp, i64 %a, i64 %b + ret i64 %cond +} + +; We select a i32 addi that zero-extends the result on RV64 as addiwu + +define zeroext i32 @zext_add_to_addiwu(i32 signext %a) nounwind { +; RV64I-LABEL: zext_add_to_addiwu: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, 1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: zext_add_to_addiwu: +; RV64IB: # %bb.0: +; RV64IB-NEXT: addiwu a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: zext_add_to_addiwu: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: addiwu a0, a0, 1 +; RV64IBB-NEXT: ret + %add = add i32 %a, 1 + ret i32 %add +} + +define i64 @addiwu(i64 %a) nounwind { +; RV64I-LABEL: addiwu: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, a0, 1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: addiwu: +; RV64IB: # %bb.0: +; RV64IB-NEXT: addiwu a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: addiwu: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: addiwu a0, a0, 1 +; RV64IBB-NEXT: ret + %conv = add i64 %a, 1 + %conv1 = and i64 %conv, 4294967295 + ret i64 %conv1 +} + +define i64 @slliuw(i64 %a) nounwind { +; RV64I-LABEL: slliuw: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 1 +; RV64I-NEXT: addi a1, zero, 1 +; RV64I-NEXT: slli a1, a1, 33 +; RV64I-NEXT: addi a1, a1, -2 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: slliuw: +; RV64IB: # %bb.0: +; RV64IB-NEXT: slliu.w a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: slliuw: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: slliu.w a0, a0, 1 +; RV64IBB-NEXT: ret + %conv1 = shl i64 %a, 1 + %shl = and i64 %conv1, 8589934590 + ret i64 %shl +} + +; We select a i32 add that zero-extends the result on RV64 as addwu + +define zeroext i32 @zext_add_to_addwu(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: zext_add_to_addwu: +; RV64I: # %bb.0: +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: 
srli a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: zext_add_to_addwu: +; RV64IB: # %bb.0: +; RV64IB-NEXT: addwu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: zext_add_to_addwu: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: addwu a0, a0, a1 +; RV64IBB-NEXT: ret + %add = add i32 %a, %b + ret i32 %add +} + +define i64 @addwu(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: addwu: +; RV64I: # %bb.0: +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: addwu: +; RV64IB: # %bb.0: +; RV64IB-NEXT: addwu a0, a1, a0 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: addwu: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: addwu a0, a1, a0 +; RV64IBB-NEXT: ret + %add = add i64 %b, %a + %conv1 = and i64 %add, 4294967295 + ret i64 %conv1 +} + +; We select a i32 sub that zero-extends the result on RV64 as subwu + +define zeroext i32 @zext_sub_to_subwu(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: zext_sub_to_subwu: +; RV64I: # %bb.0: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: zext_sub_to_subwu: +; RV64IB: # %bb.0: +; RV64IB-NEXT: subwu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: zext_sub_to_subwu: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: subwu a0, a0, a1 +; RV64IBB-NEXT: ret + %sub = sub i32 %a, %b + ret i32 %sub +} + +define i64 @subwu(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: subwu: +; RV64I: # %bb.0: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: subwu: +; RV64IB: # %bb.0: +; RV64IB-NEXT: subwu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: subwu: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: subwu a0, a0, a1 +; RV64IBB-NEXT: ret + %sub = sub i64 %a, %b + %conv1 = and i64 %sub, 4294967295 + ret i64 %conv1 +} + +define i64 @adduw(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: adduw: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 32 +; 
RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: adduw: +; RV64IB: # %bb.0: +; RV64IB-NEXT: addu.w a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: adduw: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: addu.w a0, a0, a1 +; RV64IBB-NEXT: ret + %and = and i64 %b, 4294967295 + %add = add i64 %and, %a + ret i64 %add +} + +define i64 @subuw(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: subuw: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: subuw: +; RV64IB: # %bb.0: +; RV64IB-NEXT: subu.w a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: subuw: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: subu.w a0, a0, a1 +; RV64IBB-NEXT: ret + %and = and i64 %b, 4294967295 + %sub = sub i64 %a, %and + ret i64 %sub +} From ffe85d6c03b91cf9294c7ec1d8192d4cc337cdfd Mon Sep 17 00:00:00 2001 From: lewis-revill Date: Wed, 15 Jul 2020 11:53:06 +0100 Subject: [PATCH 048/363] [RISCV] Add matching of codegen patterns to RISCV Bit Manipulation Zbp asm instructions This patch provides optimization of bit manipulation operations by enabling the +experimental-b target feature. It adds matching of single block patterns of instructions to specific bit-manip instructions from the permutation subset (zbp subextension) of the experimental B extension of RISC-V. It adds also the correspondent codegen tests. 
This patch is based on Claire Wolf's proposal for the bit manipulation extension of RISCV: https://github.com/riscv/riscv-bitmanip/blob/master/bitmanip-0.92.pdf Differential Revision: https://reviews.llvm.org/D79871 (cherry picked from commit 31b52b4345e36b169a2b6a89eac44651f59889dd) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 7 +- llvm/lib/Target/RISCV/RISCVInstrInfoB.td | 190 +++ llvm/test/CodeGen/RISCV/rv32Zbp.ll | 1245 +++++++++++++++++ llvm/test/CodeGen/RISCV/rv64Zbp.ll | 1343 +++++++++++++++++++ 4 files changed, 2784 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/RISCV/rv32Zbp.ll create mode 100644 llvm/test/CodeGen/RISCV/rv64Zbp.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index fb44f826eb6c..c89bb21c9701 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -151,7 +151,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ROTL, XLenVT, Expand); setOperationAction(ISD::ROTR, XLenVT, Expand); - setOperationAction(ISD::BSWAP, XLenVT, Expand); + + if (!Subtarget.hasStdExtZbp()) + setOperationAction(ISD::BSWAP, XLenVT, Expand); if (!Subtarget.hasStdExtZbb()) { setOperationAction(ISD::CTTZ, XLenVT, Expand); @@ -159,6 +161,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, XLenVT, Expand); } + if (Subtarget.hasStdExtZbp()) + setOperationAction(ISD::BITREVERSE, XLenVT, Legal); + ISD::CondCode FPCCToExtend[] = { ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index dc3d6cbb4fe8..09d5f1ef856a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -651,6 +651,97 @@ def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt), 
(SLOI GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt), (SROI GPR:$rs1, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))), + (GORCI GPR:$rs1, (i32 1))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC))), + (GORCI GPR:$rs1, (i32 2))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0))), + (GORCI GPR:$rs1, (i32 4))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF)), GPR:$rs1), + (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00))), + (GORCI GPR:$rs1, (i32 8))>; +def : Pat<(or (or (srl GPR:$rs1, (i32 16)), GPR:$rs1), + (shl GPR:$rs1, (i32 16))), + (GORCI GPR:$rs1, (i32 16))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA))), + (GORCI GPR:$rs1, (i64 1))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC))), + (GORCI GPR:$rs1, (i64 2))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0))), + (GORCI GPR:$rs1, (i64 4))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00))), + (GORCI GPR:$rs1, (i64 8))>; +def : Pat<(or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000))), + (GORCI GPR:$rs1, (i64 16))>; +def : Pat<(or (or (srl GPR:$rs1, (i64 
32)), GPR:$rs1), + (shl GPR:$rs1, (i64 32))), + (GORCI GPR:$rs1, (i64 32))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA)), + (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555))), + (GREVI GPR:$rs1, (i32 1))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 2)), (i32 0xCCCCCCCC)), + (and (srl GPR:$rs1, (i32 2)), (i32 0x33333333))), + (GREVI GPR:$rs1, (i32 2))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 4)), (i32 0xF0F0F0F0)), + (and (srl GPR:$rs1, (i32 4)), (i32 0x0F0F0F0F))), + (GREVI GPR:$rs1, (i32 4))>; +def : Pat<(or (and (shl GPR:$rs1, (i32 8)), (i32 0xFF00FF00)), + (and (srl GPR:$rs1, (i32 8)), (i32 0x00FF00FF))), + (GREVI GPR:$rs1, (i32 8))>; +def : Pat<(rotr (bswap GPR:$rs1), (i32 16)), (GREVI GPR:$rs1, (i32 8))>; +def : Pat<(or (shl GPR:$rs1, (i32 16)), (srl GPR:$rs1, (i32 16))), + (GREVI GPR:$rs1, (i32 16))>; +def : Pat<(rotl GPR:$rs1, (i32 16)), (GREVI GPR:$rs1, (i32 16))>; +def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i32 24))>; +def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i32 31))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAAAAAAAAAA)), + (and (srl GPR:$rs1, (i64 1)), (i64 0x5555555555555555))), + (GREVI GPR:$rs1, (i64 1))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCCCCCCCCCC)), + (and (srl GPR:$rs1, (i64 2)), (i64 0x3333333333333333))), + (GREVI GPR:$rs1, (i64 2))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0F0F0F0F0)), + (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F0F0F0F0F))), + (GREVI GPR:$rs1, (i64 4))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00FF00FF00)), + (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF00FF00FF))), + (GREVI GPR:$rs1, (i64 8))>; +def : Pat<(or (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000FFFF0000)), + (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF0000FFFF))), + (GREVI 
GPR:$rs1, (i64 16))>; +def : Pat<(or (shl GPR:$rs1, (i64 32)), (srl GPR:$rs1, (i64 32))), + (GREVI GPR:$rs1, (i64 32))>; +def : Pat<(rotl GPR:$rs1, (i64 32)), (GREVI GPR:$rs1, (i64 32))>; +def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>; +def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbb] in { def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>; def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>; def : Pat<(ctpop GPR:$rs1), (PCNT GPR:$rs1)>; @@ -681,6 +772,48 @@ def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbb] +let Predicates = [HasStdExtZbp, IsRV32] in { +def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)), + (and GPR:$rs1, (i32 0xFF0000FF))), + (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))), + (SHFLI GPR:$rs1, (i32 8))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)), + (and GPR:$rs1, (i32 0xF00FF00F))), + (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))), + (SHFLI GPR:$rs1, (i32 4))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)), + (and GPR:$rs1, (i32 0xC3C3C3C3))), + (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))), + (SHFLI GPR:$rs1, (i32 2))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)), + (and GPR:$rs1, (i32 0x99999999))), + (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))), + (SHFLI GPR:$rs1, (i32 1))>; +} // Predicates = [HasStdExtZbp, IsRV32] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)), + (and GPR:$rs1, (i64 0xFFFF00000000FFFF))), + (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))), + (SHFLI GPR:$rs1, (i64 16))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)), + (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))), + (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))), + (SHFLI GPR:$rs1, (i64 8))>; 
+def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)), + (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))), + (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))), + (SHFLI GPR:$rs1, (i64 4))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)), + (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))), + (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))), + (SHFLI GPR:$rs1, (i64 2))>; +def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)), + (and GPR:$rs1, (i64 0x9999999999999999))), + (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))), + (SHFLI GPR:$rs1, (i64 1))>; +} // Predicates = [HasStdExtZbp, IsRV64] + let Predicates = [HasStdExtZbb, IsRV64] in { def : Pat<(and (add GPR:$rs, simm12:$simm12), (i64 0xFFFFFFFF)), (ADDIWU GPR:$rs, simm12:$simm12)>; @@ -702,6 +835,63 @@ def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt), (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt), (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbp, IsRV64] in { +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA))), + i32), + (GORCIW GPR:$rs1, (i64 1))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC))), + i32), + (GORCIW GPR:$rs1, (i64 2))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0))), + i32), + (GORCIW GPR:$rs1, (i64 4))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00))), + i32), + (GORCIW GPR:$rs1, (i64 8))>; +def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 16)), (i64 0x0000FFFF)), + GPR:$rs1), + (and (shl GPR:$rs1, (i64 16)), (i64 0xFFFF0000))), 
+ i32), + (GORCIW GPR:$rs1, (i64 16))>; +def : Pat<(sext_inreg (or (or (srl (and GPR:$rs1, (i64 0xFFFF0000)), (i64 16)), + GPR:$rs1), + (shl GPR:$rs1, (i64 16))), i32), + (GORCIW GPR:$rs1, (i64 16))>; + +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 1)), (i64 0xAAAAAAAA)), + (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555))), + i32), + (GREVIW GPR:$rs1, (i64 1))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 2)), (i64 0xCCCCCCCC)), + (and (srl GPR:$rs1, (i64 2)), (i64 0x33333333))), + i32), + (GREVIW GPR:$rs1, (i64 2))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 4)), (i64 0xF0F0F0F0)), + (and (srl GPR:$rs1, (i64 4)), (i64 0x0F0F0F0F))), + i32), + (GREVIW GPR:$rs1, (i64 4))>; +def : Pat<(sext_inreg (or (and (shl GPR:$rs1, (i64 8)), (i64 0xFF00FF00)), + (and (srl GPR:$rs1, (i64 8)), (i64 0x00FF00FF))), + i32), + (GREVIW GPR:$rs1, (i64 8))>; +def : Pat<(sext_inreg (or (shl GPR:$rs1, (i64 16)), + (srl (and GPR:$rs1, 0xFFFF0000), (i64 16))), i32), + (GREVIW GPR:$rs1, (i64 16))>; +def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>; +def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>; +} // Predicates = [HasStdExtZbp, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), (CLZW GPR:$rs1)>; // We don't pattern-match CTZW here as it has the same pattern and result as diff --git a/llvm/test/CodeGen/RISCV/rv32Zbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbp.ll new file mode 100644 index 000000000000..8769ce77337c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv32Zbp.ll @@ -0,0 +1,1245 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbp 
-verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IBP + +define i32 @gorc1_i32(i32 %a) nounwind { +; RV32I-LABEL: gorc1_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: lui a2, 699051 +; RV32I-NEXT: addi a2, a2, -1366 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc1_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc.p a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc1_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc.p a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 1 + %shl = and i32 %and, -1431655766 + %and1 = lshr i32 %a, 1 + %shr = and i32 %and1, 1431655765 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc1_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc1_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 1 +; RV32I-NEXT: slli a3, a1, 1 +; RV32I-NEXT: lui a4, 699051 +; RV32I-NEXT: addi a4, a4, -1366 +; RV32I-NEXT: and a6, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 1 +; RV32I-NEXT: srli a5, a0, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc1_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc.p a0, a0 +; RV32IB-NEXT: orc.p a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc1_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc.p a0, a0 +; RV32IBP-NEXT: orc.p a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 1 + %shl = and i64 %and, -6148914691236517206 + %and1 = lshr i64 %a, 1 + %shr = and i64 %and1, 6148914691236517205 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define i32 @gorc2_i32(i32 %a) nounwind { +; RV32I-LABEL: 
gorc2_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: lui a2, 838861 +; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 2 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc2_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc2.n a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc2_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc2.n a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 2 + %shl = and i32 %and, -858993460 + %and1 = lshr i32 %a, 2 + %shr = and i32 %and1, 858993459 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc2_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc2_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: slli a3, a1, 2 +; RV32I-NEXT: lui a4, 838861 +; RV32I-NEXT: addi a4, a4, -820 +; RV32I-NEXT: and a6, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 2 +; RV32I-NEXT: srli a5, a0, 2 +; RV32I-NEXT: lui a3, 209715 +; RV32I-NEXT: addi a3, a3, 819 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc2_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc2.n a0, a0 +; RV32IB-NEXT: orc2.n a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc2_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc2.n a0, a0 +; RV32IBP-NEXT: orc2.n a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 2 + %shl = and i64 %and, -3689348814741910324 + %and1 = lshr i64 %a, 2 + %shr = and i64 %and1, 3689348814741910323 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define i32 @gorc4_i32(i32 %a) nounwind { +; RV32I-LABEL: gorc4_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: 
and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: addi a3, a3, -241 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc4_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc4.b a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc4_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc4.b a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 4 + %shl = and i32 %and, -252645136 + %and1 = lshr i32 %a, 4 + %shr = and i32 %and1, 252645135 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc4_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc4_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 4 +; RV32I-NEXT: slli a3, a1, 4 +; RV32I-NEXT: lui a4, 986895 +; RV32I-NEXT: addi a4, a4, 240 +; RV32I-NEXT: and a6, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 4 +; RV32I-NEXT: srli a5, a0, 4 +; RV32I-NEXT: lui a3, 61681 +; RV32I-NEXT: addi a3, a3, -241 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc4_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc4.b a0, a0 +; RV32IB-NEXT: orc4.b a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc4_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc4.b a0, a0 +; RV32IBP-NEXT: orc4.b a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 4 + %shl = and i64 %and, -1085102592571150096 + %and1 = lshr i64 %a, 4 + %shr = and i64 %and1, 1085102592571150095 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define i32 @gorc8_i32(i32 %a) nounwind { +; RV32I-LABEL: gorc8_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 1044496 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: addi a3, a3, 255 +; RV32I-NEXT: and a2, a2, a3 
+; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc8_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc8.h a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc8_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc8.h a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 8 + %shl = and i32 %and, -16711936 + %and1 = lshr i32 %a, 8 + %shr = and i32 %and1, 16711935 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc8_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc8_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: lui a4, 1044496 +; RV32I-NEXT: addi a4, a4, -256 +; RV32I-NEXT: and a6, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a4, a1, 8 +; RV32I-NEXT: srli a5, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: addi a3, a3, 255 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc8_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc8.h a0, a0 +; RV32IB-NEXT: orc8.h a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc8_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc8.h a0, a0 +; RV32IBP-NEXT: orc8.h a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 8 + %shl = and i64 %and, -71777214294589696 + %and1 = lshr i64 %a, 8 + %shr = and i64 %and1, 71777214294589695 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define i32 @gorc16_i32(i32 %a) nounwind { +; RV32I-LABEL: gorc16_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srli a2, a0, 16 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc16_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc16 a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc16_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc16 a0, a0 +; RV32IBP-NEXT: ret + %shl = shl i32 %a, 16 + %shr = lshr i32 
%a, 16 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc16_i64(i64 %a) nounwind { +; RV32I-LABEL: gorc16_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 16 +; RV32I-NEXT: slli a3, a0, 16 +; RV32I-NEXT: srli a4, a0, 16 +; RV32I-NEXT: srli a5, a1, 16 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: gorc16_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orc16 a0, a0 +; RV32IB-NEXT: orc16 a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: gorc16_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orc16 a0, a0 +; RV32IBP-NEXT: orc16 a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 16 + %shl = and i64 %and, -281470681808896 + %and1 = lshr i64 %a, 16 + %shr = and i64 %and1, 281470681808895 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define i32 @grev1_i32(i32 %a) nounwind { +; RV32I-LABEL: grev1_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: lui a2, 699051 +; RV32I-NEXT: addi a2, a2, -1366 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a2, a2, 1365 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev1_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev.p a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev1_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev.p a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 1 + %shl = and i32 %and, -1431655766 + %and1 = lshr i32 %a, 1 + %shr = and i32 %and1, 1431655765 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev1_i64(i64 %a) nounwind { +; RV32I-LABEL: grev1_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 1 +; RV32I-NEXT: slli a3, a1, 1 +; RV32I-NEXT: lui a4, 699051 +; RV32I-NEXT: addi a4, a4, -1366 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: lui a4, 
349525 +; RV32I-NEXT: addi a4, a4, 1365 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev1_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev.p a0, a0 +; RV32IB-NEXT: rev.p a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev1_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev.p a0, a0 +; RV32IBP-NEXT: rev.p a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 1 + %shl = and i64 %and, -6148914691236517206 + %and1 = lshr i64 %a, 1 + %shr = and i64 %and1, 6148914691236517205 + %or = or i64 %shl, %shr + ret i64 %or +} + +define i32 @grev2_i32(i32 %a) nounwind { +; RV32I-LABEL: grev2_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: lui a2, 838861 +; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev2_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev2.n a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev2_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev2.n a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 2 + %shl = and i32 %and, -858993460 + %and1 = lshr i32 %a, 2 + %shr = and i32 %and1, 858993459 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev2_i64(i64 %a) nounwind { +; RV32I-LABEL: grev2_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: slli a3, a1, 2 +; RV32I-NEXT: lui a4, 838861 +; RV32I-NEXT: addi a4, a4, -820 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev2_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev2.n a0, a0 +; RV32IB-NEXT: 
rev2.n a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev2_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev2.n a0, a0 +; RV32IBP-NEXT: rev2.n a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 2 + %shl = and i64 %and, -3689348814741910324 + %and1 = lshr i64 %a, 2 + %shr = and i64 %and1, 3689348814741910323 + %or = or i64 %shl, %shr + ret i64 %or +} + +define i32 @grev4_i32(i32 %a) nounwind { +; RV32I-LABEL: grev4_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev4_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev4.b a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev4_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev4.b a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 4 + %shl = and i32 %and, -252645136 + %and1 = lshr i32 %a, 4 + %shr = and i32 %and1, 252645135 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev4_i64(i64 %a) nounwind { +; RV32I-LABEL: grev4_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 4 +; RV32I-NEXT: slli a3, a1, 4 +; RV32I-NEXT: lui a4, 986895 +; RV32I-NEXT: addi a4, a4, 240 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: lui a4, 61681 +; RV32I-NEXT: addi a4, a4, -241 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev4_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev4.b a0, a0 +; RV32IB-NEXT: rev4.b a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev4_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev4.b a0, a0 +; RV32IBP-NEXT: rev4.b a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 4 + %shl = and i64 %and, -1085102592571150096 + %and1 = lshr i64 %a, 4 + %shr = and 
i64 %and1, 1085102592571150095 + %or = or i64 %shl, %shr + ret i64 %or +} + +define i32 @grev8_i32(i32 %a) nounwind { +; RV32I-LABEL: grev8_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: lui a2, 1044496 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: lui a2, 4080 +; RV32I-NEXT: addi a2, a2, 255 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev8_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev8.h a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev8_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev8.h a0, a0 +; RV32IBP-NEXT: ret + %and = shl i32 %a, 8 + %shl = and i32 %and, -16711936 + %and1 = lshr i32 %a, 8 + %shr = and i32 %and1, 16711935 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev8_i64(i64 %a) nounwind { +; RV32I-LABEL: grev8_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a1, 8 +; RV32I-NEXT: lui a4, 1044496 +; RV32I-NEXT: addi a4, a4, -256 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: lui a4, 4080 +; RV32I-NEXT: addi a4, a4, 255 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev8_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev8.h a0, a0 +; RV32IB-NEXT: rev8.h a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev8_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev8.h a0, a0 +; RV32IBP-NEXT: rev8.h a1, a1 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 8 + %shl = and i64 %and, -71777214294589696 + %and1 = lshr i64 %a, 8 + %shr = and i64 %and1, 71777214294589695 + %or = or i64 %shl, %shr + ret i64 %or +} + +define i32 @grev16_i32(i32 %a) nounwind { +; RV32I-LABEL: grev16_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; 
RV32IB-LABEL: grev16_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rori a0, a0, 16 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev16_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rori a0, a0, 16 +; RV32IBP-NEXT: ret + %shl = shl i32 %a, 16 + %shr = lshr i32 %a, 16 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev16_i64(i64 %a) nounwind { +; RV32I-LABEL: grev16_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 16 +; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: slli a0, a0, 16 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: srli a1, a1, 16 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: grev16_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rori a0, a0, 16 +; RV32IB-NEXT: rori a1, a1, 16 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: grev16_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rori a0, a0, 16 +; RV32IBP-NEXT: rori a1, a1, 16 +; RV32IBP-NEXT: ret + %and = shl i64 %a, 16 + %shl = and i64 %and, -281470681808896 + %and1 = lshr i64 %a, 16 + %shr = and i64 %and1, 281470681808895 + %or = or i64 %shl, %shr + ret i64 %or +} + +declare i32 @llvm.bswap.i32(i32) + +define i32 @bswap_i32(i32 %a) nounwind { +; RV32I-LABEL: bswap_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 24 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: bswap_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev8 a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: bswap_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev8 a0, a0 +; RV32IBP-NEXT: ret + %1 = tail call i32 @llvm.bswap.i32(i32 %a) + ret i32 %1 +} + +declare i64 @llvm.bswap.i64(i64) + +define i64 @bswap_i64(i64 %a) { +; RV32I-LABEL: bswap_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a2, a1, 8 +; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: addi 
a3, a3, -256 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: srli a4, a1, 24 +; RV32I-NEXT: or a2, a2, a4 +; RV32I-NEXT: slli a4, a1, 8 +; RV32I-NEXT: lui a5, 4080 +; RV32I-NEXT: and a4, a4, a5 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a2, a1, a2 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: srli a3, a0, 24 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: slli a3, a0, 8 +; RV32I-NEXT: and a3, a3, a5 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: bswap_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev8 a2, a1 +; RV32IB-NEXT: rev8 a1, a0 +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: bswap_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev8 a2, a1 +; RV32IBP-NEXT: rev8 a1, a0 +; RV32IBP-NEXT: mv a0, a2 +; RV32IBP-NEXT: ret + %1 = call i64 @llvm.bswap.i64(i64 %a) + ret i64 %1 +} + +declare i32 @llvm.bitreverse.i32(i32) + +define i32 @bitreverse_i32(i32 %a) nounwind { +; RV32I-LABEL: bitreverse_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a2, a0, 24 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi a1, a1, -241 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a1, a1, 4 +; RV32I-NEXT: lui a2, 986895 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi a1, a1, 819 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: lui a2, 838861 +; RV32I-NEXT: addi a2, a2, -820 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: or a0, 
a0, a1 +; RV32I-NEXT: lui a1, 349525 +; RV32I-NEXT: addi a1, a1, 1365 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: lui a2, 699051 +; RV32I-NEXT: addi a2, a2, -1366 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: bitreverse_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: bitreverse_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev a0, a0 +; RV32IBP-NEXT: ret + %1 = tail call i32 @llvm.bitreverse.i32(i32 %a) + ret i32 %1 +} + +declare i64 @llvm.bitreverse.i64(i64) + +define i64 @bitreverse_i64(i64 %a) nounwind { +; RV32I-LABEL: bitreverse_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a2, a1, 8 +; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: addi t0, a3, -256 +; RV32I-NEXT: and a2, a2, t0 +; RV32I-NEXT: srli a4, a1, 24 +; RV32I-NEXT: or a2, a2, a4 +; RV32I-NEXT: slli a4, a1, 8 +; RV32I-NEXT: lui a6, 4080 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi t1, a2, -241 +; RV32I-NEXT: and a2, a1, t1 +; RV32I-NEXT: slli a2, a2, 4 +; RV32I-NEXT: lui a5, 986895 +; RV32I-NEXT: addi t2, a5, 240 +; RV32I-NEXT: and a1, a1, t2 +; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi t3, a2, 819 +; RV32I-NEXT: and a3, a1, t3 +; RV32I-NEXT: slli a3, a3, 2 +; RV32I-NEXT: lui a4, 838861 +; RV32I-NEXT: addi a4, a4, -820 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a5, a1, a3 +; RV32I-NEXT: slli a5, a5, 1 +; RV32I-NEXT: lui a2, 699051 +; RV32I-NEXT: addi a2, a2, -1366 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: or a7, a1, a5 +; RV32I-NEXT: srli a1, a0, 8 +; RV32I-NEXT: and a1, a1, t0 +; RV32I-NEXT: srli a5, a0, 24 +; 
RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: slli a5, a0, 8 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: and a1, a0, t1 +; RV32I-NEXT: slli a1, a1, 4 +; RV32I-NEXT: and a0, a0, t2 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: and a1, a0, t3 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: and a1, a0, a3 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: bitreverse_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rev a2, a1 +; RV32IB-NEXT: rev a1, a0 +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: bitreverse_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rev a2, a1 +; RV32IBP-NEXT: rev a1, a0 +; RV32IBP-NEXT: mv a0, a2 +; RV32IBP-NEXT: ret + %1 = call i64 @llvm.bitreverse.i64(i64 %a) + ret i64 %1 +} + +define i32 @shfl1_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: shfl1_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 629146 +; RV32I-NEXT: addi a1, a1, -1639 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a2, a0, 1 +; RV32I-NEXT: lui a3, 279620 +; RV32I-NEXT: addi a3, a3, 1092 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: lui a2, 139810 +; RV32I-NEXT: addi a2, a2, 546 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl1_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip.n a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl1_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip.n a0, a0 +; RV32IBP-NEXT: ret + %and = and i32 %a, -1717986919 + %shl = shl i32 %a, 1 + %and1 = and i32 %shl, 1145324612 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 1 + %and2 = and i32 %shr, 572662306 + %or3 = or i32 %or, %and2 + ret i32 %or3 +} + +define i64 
@shfl1_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: shfl1_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 629146 +; RV32I-NEXT: addi a2, a2, -1639 +; RV32I-NEXT: and a6, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: slli a4, a1, 1 +; RV32I-NEXT: slli a5, a0, 1 +; RV32I-NEXT: lui a3, 279620 +; RV32I-NEXT: addi a3, a3, 1092 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: lui a4, 139810 +; RV32I-NEXT: addi a4, a4, 546 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl1_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip.n a0, a0 +; RV32IB-NEXT: zip.n a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl1_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip.n a0, a0 +; RV32IBP-NEXT: zip.n a1, a1 +; RV32IBP-NEXT: ret + %and = and i64 %a, -7378697629483820647 + %shl = shl i64 %a, 1 + %and1 = and i64 %shl, 4919131752989213764 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 1 + %and2 = and i64 %shr, 2459565876494606882 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i32 @shfl2_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: shfl2_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 801852 +; RV32I-NEXT: addi a1, a1, 963 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: lui a3, 197379 +; RV32I-NEXT: addi a3, a3, 48 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: lui a2, 49345 +; RV32I-NEXT: addi a2, a2, -1012 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl2_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip2.b a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl2_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip2.b a0, a0 +; RV32IBP-NEXT: ret + %and = and i32 %a, -1010580541 + %shl = shl i32 %a, 2 + 
%and1 = and i32 %shl, 808464432 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 2 + %and2 = and i32 %shr, 202116108 + %or3 = or i32 %or, %and2 + ret i32 %or3 +} + +define i64 @shfl2_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: shfl2_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 801852 +; RV32I-NEXT: addi a2, a2, 963 +; RV32I-NEXT: and a6, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: slli a4, a1, 2 +; RV32I-NEXT: slli a5, a0, 2 +; RV32I-NEXT: lui a3, 197379 +; RV32I-NEXT: addi a3, a3, 48 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: lui a4, 49345 +; RV32I-NEXT: addi a4, a4, -1012 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl2_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip2.b a0, a0 +; RV32IB-NEXT: zip2.b a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl2_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip2.b a0, a0 +; RV32IBP-NEXT: zip2.b a1, a1 +; RV32IBP-NEXT: ret + %and = and i64 %a, -4340410370284600381 + %shl = shl i64 %a, 2 + %and1 = and i64 %shl, 3472328296227680304 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 2 + %and2 = and i64 %shr, 868082074056920076 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i32 @shfl4_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: shfl4_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 983295 +; RV32I-NEXT: addi a1, a1, 15 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a2, a0, 4 +; RV32I-NEXT: lui a3, 61441 +; RV32I-NEXT: addi a3, a3, -256 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: lui a2, 3840 +; RV32I-NEXT: addi a2, a2, 240 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl4_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip4.h a0, a0 +; 
RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl4_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip4.h a0, a0 +; RV32IBP-NEXT: ret + %and = and i32 %a, -267390961 + %shl = shl i32 %a, 4 + %and1 = and i32 %shl, 251662080 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 4 + %and2 = and i32 %shr, 15728880 + %or3 = or i32 %or, %and2 + ret i32 %or3 +} + +define i64 @shfl4_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: shfl4_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 983295 +; RV32I-NEXT: addi a2, a2, 15 +; RV32I-NEXT: and a6, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: slli a4, a1, 4 +; RV32I-NEXT: slli a5, a0, 4 +; RV32I-NEXT: lui a3, 61441 +; RV32I-NEXT: addi a3, a3, -256 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: srli a0, a0, 4 +; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: lui a4, 3840 +; RV32I-NEXT: addi a4, a4, 240 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl4_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip4.h a0, a0 +; RV32IB-NEXT: zip4.h a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl4_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip4.h a0, a0 +; RV32IBP-NEXT: zip4.h a1, a1 +; RV32IBP-NEXT: ret + %and = and i64 %a, -1148435428713435121 + %shl = shl i64 %a, 4 + %and1 = and i64 %shl, 1080880403494997760 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 4 + %and2 = and i64 %shr, 67555025218437360 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i32 @shfl8_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: shfl8_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 1044480 +; RV32I-NEXT: addi a1, a1, 255 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a0, a0, a2 +; 
RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl8_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip8 a0, a0 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl8_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip8 a0, a0 +; RV32IBP-NEXT: ret + %and = and i32 %a, -16776961 + %shl = shl i32 %a, 8 + %and1 = and i32 %shl, 16711680 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 8 + %and2 = and i32 %shr, 65280 + %or3 = or i32 %or, %and2 + ret i32 %or3 +} + +define i64 @shfl8_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: shfl8_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 1044480 +; RV32I-NEXT: addi a2, a2, 255 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: slli a4, a1, 8 +; RV32I-NEXT: slli a5, a0, 8 +; RV32I-NEXT: lui a6, 4080 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: or a2, a4, a2 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: lui a4, 16 +; RV32I-NEXT: addi a4, a4, -256 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: shfl8_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: zip8 a0, a0 +; RV32IB-NEXT: zip8 a1, a1 +; RV32IB-NEXT: ret +; +; RV32IBP-LABEL: shfl8_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: zip8 a0, a0 +; RV32IBP-NEXT: zip8 a1, a1 +; RV32IBP-NEXT: ret + %and = and i64 %a, -72056494543077121 + %shl = shl i64 %a, 8 + %and1 = and i64 %shl, 71776119077928960 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 8 + %and2 = and i64 %shr, 280375465148160 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} diff --git a/llvm/test/CodeGen/RISCV/rv64Zbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbp.ll new file mode 100644 index 000000000000..ae467efaab83 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64Zbp.ll @@ -0,0 +1,1343 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; 
RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IB +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbp -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IBP + +define signext i32 @gorc1_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc1_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: lui a2, 171 +; RV64I-NEXT: addiw a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1366 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc1_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: gorciw a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc1_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: gorciw a0, a0, 1 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 1 + %shl = and i32 %and, -1431655766 + %and1 = lshr i32 %a, 1 + %shr = and i32 %and1, 1431655765 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc1_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc1_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: lui a2, 1026731 +; RV64I-NEXT: addiw a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1366 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: lui a3, 21845 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 1365 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 1365 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 
+; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc1_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orc.p a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc1_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orc.p a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 1 + %shl = and i64 %and, -6148914691236517206 + %and1 = lshr i64 %a, 1 + %shr = and i64 %and1, 6148914691236517205 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define signext i32 @gorc2_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc2_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: lui a2, 205 +; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: lui a3, 209715 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc2_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: gorciw a0, a0, 2 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc2_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: gorciw a0, a0, 2 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 2 + %shl = and i32 %and, -858993460 + %and1 = lshr i32 %a, 2 + %shr = and i32 %and1, 858993459 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc2_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc2_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: lui a2, 1035469 +; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 2 +; RV64I-NEXT: lui a3, 13107 +; RV64I-NEXT: addiw a3, a3, 819 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 819 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 819 +; RV64I-NEXT: slli a3, a3, 12 
+; RV64I-NEXT: addi a3, a3, 819 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc2_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orc2.n a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc2_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orc2.n a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 2 + %shl = and i64 %and, -3689348814741910324 + %and1 = lshr i64 %a, 2 + %shr = and i64 %and1, 3689348814741910323 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define signext i32 @gorc4_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc4_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: lui a2, 241 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: lui a3, 61681 +; RV64I-NEXT: addiw a3, a3, -241 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc4_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: gorciw a0, a0, 4 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc4_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: gorciw a0, a0, 4 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 4 + %shl = and i32 %and, -252645136 + %and1 = lshr i32 %a, 4 + %shr = and i32 %and1, 252645135 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc4_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc4_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: lui a2, 1044721 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 4 +; RV64I-NEXT: lui a3, 3855 +; RV64I-NEXT: addiw a3, a3, 241 +; RV64I-NEXT: slli a3, a3, 12 +; 
RV64I-NEXT: addi a3, a3, -241 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 241 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, -241 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc4_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orc4.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc4_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orc4.b a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 4 + %shl = and i64 %and, -1085102592571150096 + %and1 = lshr i64 %a, 4 + %shr = and i64 %and1, 1085102592571150095 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define signext i32 @gorc8_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc8_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: lui a3, 4080 +; RV64I-NEXT: addiw a3, a3, 255 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc8_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: gorciw a0, a0, 8 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc8_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: gorciw a0, a0, 8 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 8 + %shl = and i32 %and, -16711936 + %and1 = lshr i32 %a, 8 + %shr = and i32 %and1, 16711935 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc8_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc8_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: lui a2, 1044496 +; RV64I-NEXT: addiw a2, a2, -255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: lui a3, 4080 +; 
RV64I-NEXT: addiw a3, a3, 255 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, 255 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, 255 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc8_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orc8.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc8_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orc8.h a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 8 + %shl = and i64 %and, -71777214294589696 + %and1 = lshr i64 %a, 8 + %shr = and i64 %and1, 71777214294589695 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define signext i32 @gorc16_i32(i32 signext %a) nounwind { +; RV64I-LABEL: gorc16_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: srliw a2, a0, 16 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc16_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: gorciw a0, a0, 16 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc16_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: gorciw a0, a0, 16 +; RV64IBP-NEXT: ret + %shl = shl i32 %a, 16 + %shr = lshr i32 %a, 16 + %or = or i32 %shr, %a + %or2 = or i32 %or, %shl + ret i32 %or2 +} + +define i64 @gorc16_i64(i64 %a) nounwind { +; RV64I-LABEL: gorc16_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: lui a2, 1048560 +; RV64I-NEXT: addiw a2, a2, 1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 16 +; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: addiw a3, a3, -1 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, 1 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, -1 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc16_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orc16.w a0, a0 +; 
RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc16_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orc16.w a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 16 + %shl = and i64 %and, -281470681808896 + %and1 = lshr i64 %a, 16 + %shr = and i64 %and1, 281470681808895 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define i64 @gorc32(i64 %a) nounwind { +; RV64I-LABEL: gorc32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a2, a0, 32 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: gorc32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orc32 a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: gorc32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orc32 a0, a0 +; RV64IBP-NEXT: ret + %shl = shl i64 %a, 32 + %shr = lshr i64 %a, 32 + %or = or i64 %shr, %a + %or2 = or i64 %or, %shl + ret i64 %or2 +} + +define signext i32 @grev1_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev1_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: lui a2, 171 +; RV64I-NEXT: addiw a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1366 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev1_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev1_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 1 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 1 + %shl = and i32 %and, -1431655766 + %and1 = lshr i32 %a, 1 + %shr = and i32 %and1, 1431655765 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev1_i64(i64 %a) nounwind { +; RV64I-LABEL: grev1_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: lui a2, 1026731 +; RV64I-NEXT: addiw a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: slli 
a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1366 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a2, 21845 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 1365 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev1_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rev.p a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev1_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev.p a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 1 + %shl = and i64 %and, -6148914691236517206 + %and1 = lshr i64 %a, 1 + %shr = and i64 %and1, 6148914691236517205 + %or = or i64 %shl, %shr + ret i64 %or +} + +define signext i32 @grev2_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev2_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: lui a2, 205 +; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev2_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 2 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev2_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 2 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 2 + %shl = and i32 %and, -858993460 + %and1 = lshr i32 %a, 2 + %shr = and i32 %and1, 858993459 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev2_i64(i64 %a) nounwind { +; RV64I-LABEL: grev2_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: lui a2, 1035469 +; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; 
RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: lui a2, 13107 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 819 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev2_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rev2.n a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev2_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev2.n a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 2 + %shl = and i64 %and, -3689348814741910324 + %and1 = lshr i64 %a, 2 + %shr = and i64 %and1, 3689348814741910323 + %or = or i64 %shl, %shr + ret i64 %or +} + +define signext i32 @grev4_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev4_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: lui a2, 241 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev4_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 4 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev4_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 4 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 4 + %shl = and i32 %and, -252645136 + %and1 = lshr i32 %a, 4 + %shr = and i32 %and1, 252645135 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev4_i64(i64 %a) nounwind { +; RV64I-LABEL: grev4_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 4 +; RV64I-NEXT: lui a2, 1044721 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, 
a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: lui a2, 3855 +; RV64I-NEXT: addiw a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev4_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rev4.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev4_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev4.b a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 4 + %shl = and i64 %and, -1085102592571150096 + %and1 = lshr i64 %a, 4 + %shr = and i64 %and1, 1085102592571150095 + %or = or i64 %shl, %shr + ret i64 %or +} + +define signext i32 @grev8_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev8_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: addiw a2, a2, 255 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev8_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 8 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev8_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 8 +; RV64IBP-NEXT: ret + %and = shl i32 %a, 8 + %shl = and i32 %and, -16711936 + %and1 = lshr i32 %a, 8 + %shr = and i32 %and1, 16711935 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev8_i64(i64 %a) nounwind { +; RV64I-LABEL: grev8_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: lui a2, 1044496 +; RV64I-NEXT: addiw a2, a2, -255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: 
addi a2, a2, -255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: addiw a2, a2, 255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, 255 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, 255 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev8_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rev8.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev8_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev8.h a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 8 + %shl = and i64 %and, -71777214294589696 + %and1 = lshr i64 %a, 8 + %shr = and i64 %and1, 71777214294589695 + %or = or i64 %shl, %shr + ret i64 %or +} + +define signext i32 @grev16_i32(i32 signext %a) nounwind { +; RV64I-LABEL: grev16_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: srliw a0, a0, 16 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev16_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 16 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev16_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 16 +; RV64IBP-NEXT: ret + %shl = shl i32 %a, 16 + %shr = lshr i32 %a, 16 + %or = or i32 %shl, %shr + ret i32 %or +} + +define i64 @grev16_i64(i64 %a) nounwind { +; RV64I-LABEL: grev16_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: lui a2, 1048560 +; RV64I-NEXT: addiw a2, a2, 1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, 1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev16_i64: +; RV64IB: # %bb.0: +; 
RV64IB-NEXT: rev16.w a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev16_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev16.w a0, a0 +; RV64IBP-NEXT: ret + %and = shl i64 %a, 16 + %shl = and i64 %and, -281470681808896 + %and1 = lshr i64 %a, 16 + %shr = and i64 %and1, 281470681808895 + %or = or i64 %shl, %shr + ret i64 %or +} + +define i64 @grev32(i64 %a) nounwind { +; RV64I-LABEL: grev32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: grev32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rori a0, a0, 32 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: grev32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rori a0, a0, 32 +; RV64IBP-NEXT: ret + %shl = shl i64 %a, 32 + %shr = lshr i64 %a, 32 + %or = or i64 %shl, %shr + ret i64 %or +} + +declare i32 @llvm.bswap.i32(i32) + +define signext i32 @bswap_i32(i32 signext %a) nounwind { +; RV64I-LABEL: bswap_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: addi a2, zero, 255 +; RV64I-NEXT: slli a3, a2, 32 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: slli a3, a0, 24 +; RV64I-NEXT: slli a4, a2, 40 +; RV64I-NEXT: and a3, a3, a4 +; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: slli a3, a0, 40 +; RV64I-NEXT: slli a2, a2, 48 +; RV64I-NEXT: and a2, a3, a2 +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srai a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: bswap_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 24 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: bswap_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 24 +; RV64IBP-NEXT: ret + %1 = tail call i32 @llvm.bswap.i32(i32 %a) + ret i32 %1 +} + +declare i64 @llvm.bswap.i64(i64) + +define i64 @bswap_i64(i64 %a) { +; RV64I-LABEL: bswap_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: addi a3, zero, 255 +; RV64I-NEXT: 
slli a4, a3, 24 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a2, a0, 40 +; RV64I-NEXT: lui a4, 16 +; RV64I-NEXT: addiw a4, a4, -256 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: or a2, a2, a4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 24 +; RV64I-NEXT: slli a5, a3, 40 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: or a2, a4, a2 +; RV64I-NEXT: slli a4, a0, 40 +; RV64I-NEXT: slli a3, a3, 48 +; RV64I-NEXT: and a3, a4, a3 +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: bswap_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rev8 a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: bswap_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev8 a0, a0 +; RV64IBP-NEXT: ret + %1 = call i64 @llvm.bswap.i64(i64 %a) + ret i64 %1 +} + +declare i32 @llvm.bitreverse.i32(i32) + +define signext i32 @bitreverse_i32(i32 signext %a) nounwind { +; RV64I-LABEL: bitreverse_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: addi a3, zero, 255 +; RV64I-NEXT: slli a4, a3, 24 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a2, a0, 40 +; RV64I-NEXT: lui a4, 16 +; RV64I-NEXT: addiw a4, a4, -256 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: or a2, a2, a4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 24 +; RV64I-NEXT: slli a5, a3, 40 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: or a2, a4, a2 +; RV64I-NEXT: slli a4, a0, 40 +; RV64I-NEXT: slli a3, a3, 48 +; RV64I-NEXT: and a3, a4, a3 +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a2 +; 
RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a1, a1, 4 +; RV64I-NEXT: lui a2, 1044721 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a1, a1, 2 +; RV64I-NEXT: lui a2, 1035469 +; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, 349525 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: lui a2, 873813 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a2, a2, 33 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srai a0, a0, 32 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: bitreverse_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: greviw a0, a0, 31 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: bitreverse_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: greviw a0, a0, 31 +; RV64IBP-NEXT: ret + %1 = tail call i32 @llvm.bitreverse.i32(i32 
%a) + ret i32 %1 +} + +declare i64 @llvm.bitreverse.i64(i64) + +define i64 @bitreverse_i64(i64 %a) nounwind { +; RV64I-LABEL: bitreverse_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: lui a2, 4080 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a2, a0, 8 +; RV64I-NEXT: addi a3, zero, 255 +; RV64I-NEXT: slli a4, a3, 24 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a2, a0, 40 +; RV64I-NEXT: lui a4, 16 +; RV64I-NEXT: addiw a4, a4, -256 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: or a2, a2, a4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: slli a4, a0, 24 +; RV64I-NEXT: slli a5, a3, 40 +; RV64I-NEXT: and a4, a4, a5 +; RV64I-NEXT: or a2, a4, a2 +; RV64I-NEXT: slli a4, a0, 40 +; RV64I-NEXT: slli a3, a3, 48 +; RV64I-NEXT: and a3, a4, a3 +; RV64I-NEXT: slli a0, a0, 56 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, 3855 +; RV64I-NEXT: addiw a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a1, a1, 4 +; RV64I-NEXT: lui a2, 1044721 +; RV64I-NEXT: addiw a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, 13107 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 819 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a1, 
a1, 2 +; RV64I-NEXT: lui a2, 1035469 +; RV64I-NEXT: addiw a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -819 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -820 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: lui a1, 21845 +; RV64I-NEXT: addiw a1, a1, 1365 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 1365 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: lui a2, 1026731 +; RV64I-NEXT: addiw a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1365 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1366 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: bitreverse_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rev a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: bitreverse_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rev a0, a0 +; RV64IBP-NEXT: ret + %1 = call i64 @llvm.bitreverse.i64(i64 %a) + ret i64 %1 +} + +; There's no [un]shfliw instruction as slliu.w occupies the encoding slot that +; would be occupied by shfliw. 
+ +define i64 @shfl1_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: shfl1_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1035469 +; RV64I-NEXT: addiw a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -819 +; RV64I-NEXT: slli a1, a1, 13 +; RV64I-NEXT: addi a1, a1, -1639 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 1 +; RV64I-NEXT: lui a3, 4369 +; RV64I-NEXT: addiw a3, a3, 273 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 273 +; RV64I-NEXT: slli a3, a3, 12 +; RV64I-NEXT: addi a3, a3, 273 +; RV64I-NEXT: slli a4, a3, 14 +; RV64I-NEXT: addi a4, a4, 1092 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: slli a2, a3, 13 +; RV64I-NEXT: addi a2, a2, 546 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl1_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip.n a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl1_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip.n a0, a0 +; RV64IBP-NEXT: ret + %and = and i64 %a, -7378697629483820647 + %shl = shl i64 %a, 1 + %and1 = and i64 %shl, 4919131752989213764 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 1 + %and2 = and i64 %shr, 2459565876494606882 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i64 @shfl2_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: shfl2_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1044721 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 241 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, -241 +; RV64I-NEXT: slli a1, a1, 14 +; RV64I-NEXT: addi a1, a1, 963 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 2 +; RV64I-NEXT: lui a3, 48 +; RV64I-NEXT: addiw a3, a3, 771 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, 771 +; RV64I-NEXT: slli a4, a3, 16 +; RV64I-NEXT: addi a4, a4, 771 +; RV64I-NEXT: slli a4, a4, 12 +; RV64I-NEXT: addi 
a4, a4, 48 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: slli a2, a3, 14 +; RV64I-NEXT: addi a2, a2, 193 +; RV64I-NEXT: slli a2, a2, 12 +; RV64I-NEXT: addi a2, a2, -1012 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl2_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip2.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl2_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip2.b a0, a0 +; RV64IBP-NEXT: ret + %and = and i64 %a, -4340410370284600381 + %shl = shl i64 %a, 2 + %and1 = and i64 %shl, 3472328296227680304 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 2 + %and2 = and i64 %shr, 868082074056920076 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i64 @shfl4_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: shfl4_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1048560 +; RV64I-NEXT: addiw a1, a1, 255 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 255 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, 255 +; RV64I-NEXT: slli a1, a1, 12 +; RV64I-NEXT: addi a1, a1, 15 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 4 +; RV64I-NEXT: lui a3, 240 +; RV64I-NEXT: addiw a3, a3, 15 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, 15 +; RV64I-NEXT: slli a4, a3, 12 +; RV64I-NEXT: addi a4, a4, 1 +; RV64I-NEXT: slli a4, a4, 12 +; RV64I-NEXT: addi a4, a4, -256 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: slli a2, a3, 20 +; RV64I-NEXT: addi a2, a2, 240 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl4_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip4.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl4_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip4.h a0, a0 +; RV64IBP-NEXT: ret + %and = and i64 %a, -1148435428713435121 + %shl = shl i64 %a, 4 + %and1 = and i64 %shl, 1080880403494997760 + %or = or i64 %and1, %and + %shr = 
lshr i64 %a, 4 + %and2 = and i64 %shr, 67555025218437360 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i64 @shfl8_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: shfl8_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1048560 +; RV64I-NEXT: addiw a1, a1, 1 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: addi a1, a1, 255 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: addi a3, zero, 255 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: addi a4, a4, 255 +; RV64I-NEXT: slli a4, a4, 16 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: slli a2, a3, 24 +; RV64I-NEXT: addi a2, a2, 1 +; RV64I-NEXT: slli a2, a2, 16 +; RV64I-NEXT: addi a2, a2, -256 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl8_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip8.w a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl8_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip8.w a0, a0 +; RV64IBP-NEXT: ret + %and = and i64 %a, -72056494543077121 + %shl = shl i64 %a, 8 + %and1 = and i64 %shl, 71776119077928960 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 8 + %and2 = and i64 %shr, 280375465148160 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} + +define i64 @shfl16(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: shfl16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a1, zero, -1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: addi a1, a1, 1 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: addi a1, a1, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 16 +; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: addiw a3, a3, -1 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: and a2, a2, a4 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 16 +; RV64I-NEXT: slli a2, a3, 16 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl16: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip16 a0, a0 +; 
RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl16: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip16 a0, a0 +; RV64IBP-NEXT: ret + %and = and i64 %a, -281474976645121 + %shl = shl i64 %a, 16 + %and1 = and i64 %shl, 281470681743360 + %or = or i64 %and1, %and + %shr = lshr i64 %a, 16 + %and2 = and i64 %shr, 4294901760 + %or3 = or i64 %or, %and2 + ret i64 %or3 +} From 7776c991d06e1d84ffb5e709024bfff5e51f7e8e Mon Sep 17 00:00:00 2001 From: lewis-revill Date: Wed, 15 Jul 2020 11:55:44 +0100 Subject: [PATCH 049/363] [RISCV] Add matching of codegen patterns to RISCV Bit Manipulation Zbbp asm instructions This patch provides optimization of bit manipulation operations by enabling the +experimental-b target feature. It adds matching of single block patterns of instructions to specific bit-manip instructions belonging to both the permutation and the base subsets of the experimental B extension of RISC-V. It adds also the correspondent codegen tests. This patch is based on Claire Wolf's proposal for the bit manipulation extension of RISCV: https://github.com/riscv/riscv-bitmanip/blob/master/bitmanip-0.92.pdf Differential Revision: https://reviews.llvm.org/D79873 (cherry picked from commit 6144f0a1e52e7f5439a67267ca65f2d72c21aaa6) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 85 ++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 2 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- llvm/lib/Target/RISCV/RISCVInstrInfoB.td | 71 ++ llvm/test/CodeGen/RISCV/rv32Zbbp.ll | 892 ++++++++++++++++++++ llvm/test/CodeGen/RISCV/rv64Zbbp.ll | 517 ++++++++++++ 6 files changed, 1571 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rv32Zbbp.ll create mode 100644 llvm/test/CodeGen/RISCV/rv64Zbbp.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 99e5135b424f..fd1a91f68802 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -272,6 +272,44 @@ bool 
RISCVDAGToDAGISel::SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt) { return false; } +// Check that it is a RORI (Rotate Right Immediate). We first check that +// it is the right node tree: +// +// (ROTL RS1, VC) +// +// The compiler translates immediate rotations to the right given by the call +// to the rotateright32/rotateright64 intrinsics as rotations to the left. +// Since the rotation to the left can be easily emulated as a rotation to the +// right by negating the constant, there is no encoding for ROLI. +// We then select the immediate left rotations as RORI by the complementary +// constant: +// +// Shamt == XLen - VC + +bool RISCVDAGToDAGISel::SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt) { + MVT XLenVT = Subtarget->getXLenVT(); + if (N.getOpcode() == ISD::ROTL) { + if (isa(N.getOperand(1))) { + if (XLenVT == MVT::i64) { + uint64_t VC = N.getConstantOperandVal(1); + Shamt = CurDAG->getTargetConstant((64 - VC), SDLoc(N), + N.getOperand(1).getValueType()); + RS1 = N.getOperand(0); + return true; + } + if (XLenVT == MVT::i32) { + uint32_t VC = N.getConstantOperandVal(1); + Shamt = CurDAG->getTargetConstant((32 - VC), SDLoc(N), + N.getOperand(1).getValueType()); + RS1 = N.getOperand(0); + return true; + } + } + } + return false; +} + + // Check that it is a SLLIUW (Shift Logical Left Immediate Unsigned i32 // on RV64). // SLLIUW is the same as SLLI except for the fact that it clears the bits @@ -374,6 +412,53 @@ bool RISCVDAGToDAGISel::SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt) { return false; } +// Check that it is a RORIW (i32 Right Rotate Immediate on RV64). 
+// We first check that it is the right node tree: +// +// (SIGN_EXTEND_INREG (OR (SHL (AsserSext RS1, i32), VC2), +// (SRL (AND (AssertSext RS2, i32), VC3), VC1))) +// +// Then we check that the constant operands respect these constraints: +// +// VC2 == 32 - VC1 +// VC3 == maskLeadingOnes(VC2) +// +// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32 +// and VC3 a 32 bit mask of (32 - VC1) leading ones. + +bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) { + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && + Subtarget->getXLenVT() == MVT::i64 && + cast(N.getOperand(1))->getVT() == MVT::i32) { + if (N.getOperand(0).getOpcode() == ISD::OR) { + SDValue Or = N.getOperand(0); + if (Or.getOperand(0).getOpcode() == ISD::SHL && + Or.getOperand(1).getOpcode() == ISD::SRL) { + SDValue Shl = Or.getOperand(0); + SDValue Srl = Or.getOperand(1); + if (Srl.getOperand(0).getOpcode() == ISD::AND) { + SDValue And = Srl.getOperand(0); + if (isa(Srl.getOperand(1)) && + isa(Shl.getOperand(1)) && + isa(And.getOperand(1))) { + uint32_t VC1 = Srl.getConstantOperandVal(1); + uint32_t VC2 = Shl.getConstantOperandVal(1); + uint32_t VC3 = And.getConstantOperandVal(1); + if (VC2 == (32 - VC1) && + VC3 == maskLeadingOnes(VC2)) { + RS1 = Shl.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + } + return false; +} + // Merge an ADDI into the offset of a load/store instruction where possible. 
// (load (addi base, off1), off2) -> (load base, off1+off2) // (store val, (addi base, off1), off2) -> (store val, base, off1+off2) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index 4e382ee58500..bc1655b673d7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -47,9 +47,11 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { bool SelectSLOI(SDValue N, SDValue &RS1, SDValue &Shamt); bool SelectSROI(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectRORI(SDValue N, SDValue &RS1, SDValue &Shamt); bool SelectSLLIUW(SDValue N, SDValue &RS1, SDValue &Shamt); bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt); bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt); // Include the pieces autogenerated from the target description. #include "RISCVGenDAGISel.inc" diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c89bb21c9701..7cad9f9bd43e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -149,8 +149,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, XLenVT, Custom); setOperationAction(ISD::SRA_PARTS, XLenVT, Custom); - setOperationAction(ISD::ROTL, XLenVT, Expand); - setOperationAction(ISD::ROTR, XLenVT, Expand); + if (!(Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp())) { + setOperationAction(ISD::ROTL, XLenVT, Expand); + setOperationAction(ISD::ROTR, XLenVT, Expand); + } if (!Subtarget.hasStdExtZbp()) setOperationAction(ISD::BSWAP, XLenVT, Expand); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index 09d5f1ef856a..45eb41f93b2e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -638,21 +638,46 @@ def : 
CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0), //===----------------------------------------------------------------------===// def SLOIPat : ComplexPattern; def SROIPat : ComplexPattern; +def RORIPat : ComplexPattern; def SLLIUWPat : ComplexPattern; def SLOIWPat : ComplexPattern; def SROIWPat : ComplexPattern; +def RORIWPat : ComplexPattern; + +let Predicates = [HasStdExtZbbOrZbp] in { +def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>; +def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp] let Predicates = [HasStdExtZbb] in { def : Pat<(xor (shl (xor GPR:$rs1, -1), GPR:$rs2), -1), (SLO GPR:$rs1, GPR:$rs2)>; def : Pat<(xor (srl (xor GPR:$rs1, -1), GPR:$rs2), -1), (SRO GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb] + +let Predicates = [HasStdExtZbbOrZbp] in { +def : Pat<(rotl GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>; +def : Pat<(fshl GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROL GPR:$rs1, GPR:$rs2)>; +def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp] + +let Predicates = [HasStdExtZbb] in { def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt), (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(SROIPat GPR:$rs1, uimmlog2xlen:$shamt), (SROI GPR:$rs1, uimmlog2xlen:$shamt)>; } // Predicates = [HasStdExtZbb] +// There's no encoding for roli in the current version of the 'B' extension +// (v0.92) as it can be implemented with rori by negating the immediate. +// For this reason we pattern-match only against rori[w]. 
+let Predicates = [HasStdExtZbbOrZbp] in +def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt), + (RORI GPR:$rs1, uimmlog2xlen:$shamt)>; + let Predicates = [HasStdExtZbp, IsRV32] in { def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1), (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))), @@ -772,6 +797,23 @@ def : Pat<(riscv_selectcc GPR:$rs2, GPR:$rs1, (XLenVT 12), GPR:$rs1, GPR:$rs2), (MAXU GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbb] +let Predicates = [HasStdExtZbbOrZbp, IsRV32] in +def : Pat<(or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16))), + (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32))), + (PACK GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV32] in +def : Pat<(or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16))), + (PACKU GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32))), + (PACKU GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbbOrZbp] in +def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFF00), + (and GPR:$rs1, 0x00FF)), + (PACKH GPR:$rs1, GPR:$rs2)>; + let Predicates = [HasStdExtZbp, IsRV32] in { def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)), (and GPR:$rs1, (i32 0xFF0000FF))), @@ -831,12 +873,30 @@ def : Pat<(xor (riscv_sllw (xor GPR:$rs1, -1), GPR:$rs2), -1), (SLOW GPR:$rs1, GPR:$rs2)>; def : Pat<(xor (riscv_srlw (xor GPR:$rs1, -1), GPR:$rs2), -1), (SROW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), + (riscv_srlw (assertsexti32 GPR:$rs1), + (sub (i64 0), (assertsexti32 GPR:$rs2)))), + (ROLW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), + (sub (i64 0), (assertsexti32 GPR:$rs2))), + (riscv_srlw 
(assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2))), + (RORW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] + +let Predicates = [HasStdExtZbb, IsRV64] in { def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt), (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>; def : Pat<(SROIWPat GPR:$rs1, uimmlog2xlen:$shamt), (SROIW GPR:$rs1, uimmlog2xlen:$shamt)>; } // Predicates = [HasStdExtZbb, IsRV64] +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in +def : Pat<(RORIWPat GPR:$rs1, uimmlog2xlen:$shamt), + (RORIW GPR:$rs1, uimmlog2xlen:$shamt)>; + let Predicates = [HasStdExtZbp, IsRV64] in { def : Pat<(sext_inreg (or (or (and (srl GPR:$rs1, (i64 1)), (i64 0x55555555)), GPR:$rs1), @@ -898,3 +958,14 @@ def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), // RV64 CTZ def : Pat<(ctpop (and GPR:$rs1, (i64 0xFFFFFFFF))), (PCNTW GPR:$rs1)>; } // Predicates = [HasStdExtZbb, IsRV64] + +let Predicates = [HasStdExtZbbOrZbp, IsRV64] in { +def : Pat<(sext_inreg (or (shl (assertsexti32 GPR:$rs2), (i64 16)), + (and (assertsexti32 GPR:$rs1), 0x000000000000FFFF)), + i32), + (PACKW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000), + (srl (and (assertsexti32 GPR:$rs1), 0x00000000FFFF0000), + (i64 16))), + (PACKUW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbbOrZbp, IsRV64] diff --git a/llvm/test/CodeGen/RISCV/rv32Zbbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbbp.ll new file mode 100644 index 000000000000..0e6288928f0c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv32Zbbp.ll @@ -0,0 +1,892 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IBB +; RUN: llc 
-mtriple=riscv32 -mattr=+experimental-zbp -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IBP + +define i32 @andn_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: andn_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: andn_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andn a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: andn_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: andn a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: andn_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: andn a0, a0, a1 +; RV32IBP-NEXT: ret + %neg = xor i32 %b, -1 + %and = and i32 %neg, %a + ret i32 %and +} + +define i64 @andn_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: andn_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: and a0, a2, a0 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: andn_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andn a0, a0, a2 +; RV32IB-NEXT: andn a1, a1, a3 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: andn_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: andn a0, a0, a2 +; RV32IBB-NEXT: andn a1, a1, a3 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: andn_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: andn a0, a0, a2 +; RV32IBP-NEXT: andn a1, a1, a3 +; RV32IBP-NEXT: ret + %neg = xor i64 %b, -1 + %and = and i64 %neg, %a + ret i64 %and +} + +define i32 @orn_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: orn_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: orn_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orn a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: orn_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: orn a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: orn_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orn a0, a0, a1 +; RV32IBP-NEXT: ret + %neg = xor i32 %b, -1 + %or = or i32 %neg, %a + ret i32 %or +} + +define i64 @orn_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: orn_i64: +; 
RV32I: # %bb.0: +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: orn_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: orn a0, a0, a2 +; RV32IB-NEXT: orn a1, a1, a3 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: orn_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: orn a0, a0, a2 +; RV32IBB-NEXT: orn a1, a1, a3 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: orn_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: orn a0, a0, a2 +; RV32IBP-NEXT: orn a1, a1, a3 +; RV32IBP-NEXT: ret + %neg = xor i64 %b, -1 + %or = or i64 %neg, %a + ret i64 %or +} + +define i32 @xnor_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: xnor_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: xor a0, a0, a1 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: xnor_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: xnor a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: xnor_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: xnor a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: xnor_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: xnor a0, a0, a1 +; RV32IBP-NEXT: ret + %neg = xor i32 %a, -1 + %xor = xor i32 %neg, %b + ret i32 %xor +} + +define i64 @xnor_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: xnor_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: xor a1, a1, a3 +; RV32I-NEXT: xor a0, a0, a2 +; RV32I-NEXT: not a0, a0 +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: xnor_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: xnor a0, a0, a2 +; RV32IB-NEXT: xnor a1, a1, a3 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: xnor_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: xnor a0, a0, a2 +; RV32IBB-NEXT: xnor a1, a1, a3 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: xnor_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: xnor a0, a0, a2 +; RV32IBP-NEXT: xnor a1, a1, a3 +; RV32IBP-NEXT: ret + %neg = xor i64 %a, -1 + %xor = xor i64 %neg, %b + ret i64 %xor +} + +declare i32 @llvm.fshl.i32(i32, i32, i32) + +define i32 @rol_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: rol_i32: +; 
RV32I: # %bb.0: +; RV32I-NEXT: sll a2, a0, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: rol_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rol a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: rol_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: rol a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: rol_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rol a0, a0, a1 +; RV32IBP-NEXT: ret + %or = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %b) + ret i32 %or +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. + +declare i64 @llvm.fshl.i64(i64, i64, i64) + +define i64 @rol_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: rol_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 63 +; RV32I-NEXT: addi t1, a3, -32 +; RV32I-NEXT: addi a6, zero, 31 +; RV32I-NEXT: bltz t1, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sll a7, a0, t1 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: sll a4, a1, a2 +; RV32I-NEXT: sub a3, a6, a3 +; RV32I-NEXT: srli a5, a0, 1 +; RV32I-NEXT: srl a3, a5, a3 +; RV32I-NEXT: or a7, a4, a3 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: andi a5, a4, 63 +; RV32I-NEXT: addi a3, a5, -32 +; RV32I-NEXT: bltz a3, .LBB7_7 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t0, zero +; RV32I-NEXT: bgez a3, .LBB7_8 +; RV32I-NEXT: .LBB7_5: +; RV32I-NEXT: srl a3, a0, a4 +; RV32I-NEXT: sub a4, a6, a5 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: sll a1, a1, a4 +; RV32I-NEXT: or a4, a3, a1 +; RV32I-NEXT: or a1, a7, t0 +; RV32I-NEXT: bgez t1, .LBB7_9 +; RV32I-NEXT: .LBB7_6: +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB7_7: +; RV32I-NEXT: srl t0, a1, a4 +; RV32I-NEXT: bltz a3, .LBB7_5 
+; RV32I-NEXT: .LBB7_8: +; RV32I-NEXT: srl a4, a1, a3 +; RV32I-NEXT: or a1, a7, t0 +; RV32I-NEXT: bltz t1, .LBB7_6 +; RV32I-NEXT: .LBB7_9: +; RV32I-NEXT: or a0, zero, a4 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: rol_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andi a3, a2, 63 +; RV32IB-NEXT: addi t1, a3, -32 +; RV32IB-NEXT: addi a6, zero, 31 +; RV32IB-NEXT: bltz t1, .LBB7_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: sll a7, a0, t1 +; RV32IB-NEXT: j .LBB7_3 +; RV32IB-NEXT: .LBB7_2: +; RV32IB-NEXT: sll a4, a1, a2 +; RV32IB-NEXT: sub a3, a6, a3 +; RV32IB-NEXT: srli a5, a0, 1 +; RV32IB-NEXT: srl a3, a5, a3 +; RV32IB-NEXT: or a7, a4, a3 +; RV32IB-NEXT: .LBB7_3: +; RV32IB-NEXT: neg a4, a2 +; RV32IB-NEXT: andi a5, a4, 63 +; RV32IB-NEXT: addi a3, a5, -32 +; RV32IB-NEXT: bltz a3, .LBB7_7 +; RV32IB-NEXT: # %bb.4: +; RV32IB-NEXT: mv t0, zero +; RV32IB-NEXT: bgez a3, .LBB7_8 +; RV32IB-NEXT: .LBB7_5: +; RV32IB-NEXT: srl a3, a0, a4 +; RV32IB-NEXT: sub a4, a6, a5 +; RV32IB-NEXT: slli a1, a1, 1 +; RV32IB-NEXT: sll a1, a1, a4 +; RV32IB-NEXT: or a4, a3, a1 +; RV32IB-NEXT: or a1, a7, t0 +; RV32IB-NEXT: bgez t1, .LBB7_9 +; RV32IB-NEXT: .LBB7_6: +; RV32IB-NEXT: sll a0, a0, a2 +; RV32IB-NEXT: or a0, a0, a4 +; RV32IB-NEXT: ret +; RV32IB-NEXT: .LBB7_7: +; RV32IB-NEXT: srl t0, a1, a4 +; RV32IB-NEXT: bltz a3, .LBB7_5 +; RV32IB-NEXT: .LBB7_8: +; RV32IB-NEXT: srl a4, a1, a3 +; RV32IB-NEXT: or a1, a7, t0 +; RV32IB-NEXT: bltz t1, .LBB7_6 +; RV32IB-NEXT: .LBB7_9: +; RV32IB-NEXT: or a0, zero, a4 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: rol_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: andi a3, a2, 63 +; RV32IBB-NEXT: addi t1, a3, -32 +; RV32IBB-NEXT: addi a6, zero, 31 +; RV32IBB-NEXT: bltz t1, .LBB7_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: sll a7, a0, t1 +; RV32IBB-NEXT: j .LBB7_3 +; RV32IBB-NEXT: .LBB7_2: +; RV32IBB-NEXT: sll a4, a1, a2 +; RV32IBB-NEXT: sub a3, a6, a3 +; RV32IBB-NEXT: srli a5, a0, 1 +; RV32IBB-NEXT: srl a3, a5, a3 +; RV32IBB-NEXT: or a7, a4, a3 +; RV32IBB-NEXT: .LBB7_3: +; RV32IBB-NEXT: 
neg a4, a2 +; RV32IBB-NEXT: andi a5, a4, 63 +; RV32IBB-NEXT: addi a3, a5, -32 +; RV32IBB-NEXT: bltz a3, .LBB7_7 +; RV32IBB-NEXT: # %bb.4: +; RV32IBB-NEXT: mv t0, zero +; RV32IBB-NEXT: bgez a3, .LBB7_8 +; RV32IBB-NEXT: .LBB7_5: +; RV32IBB-NEXT: srl a3, a0, a4 +; RV32IBB-NEXT: sub a4, a6, a5 +; RV32IBB-NEXT: slli a1, a1, 1 +; RV32IBB-NEXT: sll a1, a1, a4 +; RV32IBB-NEXT: or a4, a3, a1 +; RV32IBB-NEXT: or a1, a7, t0 +; RV32IBB-NEXT: bgez t1, .LBB7_9 +; RV32IBB-NEXT: .LBB7_6: +; RV32IBB-NEXT: sll a0, a0, a2 +; RV32IBB-NEXT: or a0, a0, a4 +; RV32IBB-NEXT: ret +; RV32IBB-NEXT: .LBB7_7: +; RV32IBB-NEXT: srl t0, a1, a4 +; RV32IBB-NEXT: bltz a3, .LBB7_5 +; RV32IBB-NEXT: .LBB7_8: +; RV32IBB-NEXT: srl a4, a1, a3 +; RV32IBB-NEXT: or a1, a7, t0 +; RV32IBB-NEXT: bltz t1, .LBB7_6 +; RV32IBB-NEXT: .LBB7_9: +; RV32IBB-NEXT: or a0, zero, a4 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: rol_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: andi a3, a2, 63 +; RV32IBP-NEXT: addi t1, a3, -32 +; RV32IBP-NEXT: addi a6, zero, 31 +; RV32IBP-NEXT: bltz t1, .LBB7_2 +; RV32IBP-NEXT: # %bb.1: +; RV32IBP-NEXT: sll a7, a0, t1 +; RV32IBP-NEXT: j .LBB7_3 +; RV32IBP-NEXT: .LBB7_2: +; RV32IBP-NEXT: sll a4, a1, a2 +; RV32IBP-NEXT: sub a3, a6, a3 +; RV32IBP-NEXT: srli a5, a0, 1 +; RV32IBP-NEXT: srl a3, a5, a3 +; RV32IBP-NEXT: or a7, a4, a3 +; RV32IBP-NEXT: .LBB7_3: +; RV32IBP-NEXT: neg a4, a2 +; RV32IBP-NEXT: andi a5, a4, 63 +; RV32IBP-NEXT: addi a3, a5, -32 +; RV32IBP-NEXT: bltz a3, .LBB7_7 +; RV32IBP-NEXT: # %bb.4: +; RV32IBP-NEXT: mv t0, zero +; RV32IBP-NEXT: bgez a3, .LBB7_8 +; RV32IBP-NEXT: .LBB7_5: +; RV32IBP-NEXT: srl a3, a0, a4 +; RV32IBP-NEXT: sub a4, a6, a5 +; RV32IBP-NEXT: slli a1, a1, 1 +; RV32IBP-NEXT: sll a1, a1, a4 +; RV32IBP-NEXT: or a4, a3, a1 +; RV32IBP-NEXT: or a1, a7, t0 +; RV32IBP-NEXT: bgez t1, .LBB7_9 +; RV32IBP-NEXT: .LBB7_6: +; RV32IBP-NEXT: sll a0, a0, a2 +; RV32IBP-NEXT: or a0, a0, a4 +; RV32IBP-NEXT: ret +; RV32IBP-NEXT: .LBB7_7: +; RV32IBP-NEXT: srl t0, a1, a4 +; RV32IBP-NEXT: bltz 
a3, .LBB7_5 +; RV32IBP-NEXT: .LBB7_8: +; RV32IBP-NEXT: srl a4, a1, a3 +; RV32IBP-NEXT: or a1, a7, t0 +; RV32IBP-NEXT: bltz t1, .LBB7_6 +; RV32IBP-NEXT: .LBB7_9: +; RV32IBP-NEXT: or a0, zero, a4 +; RV32IBP-NEXT: ret + %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) + ret i64 %or +} + +declare i32 @llvm.fshr.i32(i32, i32, i32) + +define i32 @ror_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: ror_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a2, a0, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: ror_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: ror a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ror_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: ror a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: ror_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: ror a0, a0, a1 +; RV32IBP-NEXT: ret + %or = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %b) + ret i32 %or +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +declare i64 @llvm.fshr.i64(i64, i64, i64) + +define i64 @ror_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: ror_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 63 +; RV32I-NEXT: addi t1, a3, -32 +; RV32I-NEXT: addi a6, zero, 31 +; RV32I-NEXT: bltz t1, .LBB9_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a7, a1, t1 +; RV32I-NEXT: j .LBB9_3 +; RV32I-NEXT: .LBB9_2: +; RV32I-NEXT: srl a4, a0, a2 +; RV32I-NEXT: sub a3, a6, a3 +; RV32I-NEXT: slli a5, a1, 1 +; RV32I-NEXT: sll a3, a5, a3 +; RV32I-NEXT: or a7, a4, a3 +; RV32I-NEXT: .LBB9_3: +; RV32I-NEXT: neg a4, a2 +; RV32I-NEXT: andi a5, a4, 63 +; RV32I-NEXT: addi a3, a5, -32 +; RV32I-NEXT: bltz a3, .LBB9_7 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t0, zero +; RV32I-NEXT: bgez a3, .LBB9_8 +; RV32I-NEXT: .LBB9_5: +; RV32I-NEXT: sll a3, a1, a4 +; RV32I-NEXT: sub a4, a6, a5 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: or a4, a3, a0 +; RV32I-NEXT: or a0, t0, a7 +; RV32I-NEXT: bgez t1, .LBB9_9 +; RV32I-NEXT: .LBB9_6: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB9_7: +; RV32I-NEXT: sll t0, a0, a4 +; RV32I-NEXT: bltz a3, .LBB9_5 +; RV32I-NEXT: .LBB9_8: +; RV32I-NEXT: sll a4, a0, a3 +; RV32I-NEXT: or a0, t0, a7 +; RV32I-NEXT: bltz t1, .LBB9_6 +; RV32I-NEXT: .LBB9_9: +; RV32I-NEXT: or a1, a4, zero +; RV32I-NEXT: ret +; +; RV32IB-LABEL: ror_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andi a3, a2, 63 +; RV32IB-NEXT: addi t1, a3, -32 +; RV32IB-NEXT: addi a6, zero, 31 +; RV32IB-NEXT: bltz t1, .LBB9_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: srl a7, a1, t1 +; RV32IB-NEXT: j .LBB9_3 +; RV32IB-NEXT: .LBB9_2: +; RV32IB-NEXT: srl a4, a0, a2 +; RV32IB-NEXT: sub a3, a6, a3 +; RV32IB-NEXT: slli a5, a1, 1 +; RV32IB-NEXT: sll a3, a5, a3 +; RV32IB-NEXT: or a7, a4, a3 +; RV32IB-NEXT: .LBB9_3: +; RV32IB-NEXT: neg a4, a2 +; RV32IB-NEXT: andi a5, a4, 63 +; RV32IB-NEXT: addi a3, a5, -32 +; RV32IB-NEXT: bltz a3, .LBB9_7 +; RV32IB-NEXT: # %bb.4: +; RV32IB-NEXT: mv t0, 
zero +; RV32IB-NEXT: bgez a3, .LBB9_8 +; RV32IB-NEXT: .LBB9_5: +; RV32IB-NEXT: sll a3, a1, a4 +; RV32IB-NEXT: sub a4, a6, a5 +; RV32IB-NEXT: srli a0, a0, 1 +; RV32IB-NEXT: srl a0, a0, a4 +; RV32IB-NEXT: or a4, a3, a0 +; RV32IB-NEXT: or a0, t0, a7 +; RV32IB-NEXT: bgez t1, .LBB9_9 +; RV32IB-NEXT: .LBB9_6: +; RV32IB-NEXT: srl a1, a1, a2 +; RV32IB-NEXT: or a1, a4, a1 +; RV32IB-NEXT: ret +; RV32IB-NEXT: .LBB9_7: +; RV32IB-NEXT: sll t0, a0, a4 +; RV32IB-NEXT: bltz a3, .LBB9_5 +; RV32IB-NEXT: .LBB9_8: +; RV32IB-NEXT: sll a4, a0, a3 +; RV32IB-NEXT: or a0, t0, a7 +; RV32IB-NEXT: bltz t1, .LBB9_6 +; RV32IB-NEXT: .LBB9_9: +; RV32IB-NEXT: or a1, a4, zero +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ror_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: andi a3, a2, 63 +; RV32IBB-NEXT: addi t1, a3, -32 +; RV32IBB-NEXT: addi a6, zero, 31 +; RV32IBB-NEXT: bltz t1, .LBB9_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: srl a7, a1, t1 +; RV32IBB-NEXT: j .LBB9_3 +; RV32IBB-NEXT: .LBB9_2: +; RV32IBB-NEXT: srl a4, a0, a2 +; RV32IBB-NEXT: sub a3, a6, a3 +; RV32IBB-NEXT: slli a5, a1, 1 +; RV32IBB-NEXT: sll a3, a5, a3 +; RV32IBB-NEXT: or a7, a4, a3 +; RV32IBB-NEXT: .LBB9_3: +; RV32IBB-NEXT: neg a4, a2 +; RV32IBB-NEXT: andi a5, a4, 63 +; RV32IBB-NEXT: addi a3, a5, -32 +; RV32IBB-NEXT: bltz a3, .LBB9_7 +; RV32IBB-NEXT: # %bb.4: +; RV32IBB-NEXT: mv t0, zero +; RV32IBB-NEXT: bgez a3, .LBB9_8 +; RV32IBB-NEXT: .LBB9_5: +; RV32IBB-NEXT: sll a3, a1, a4 +; RV32IBB-NEXT: sub a4, a6, a5 +; RV32IBB-NEXT: srli a0, a0, 1 +; RV32IBB-NEXT: srl a0, a0, a4 +; RV32IBB-NEXT: or a4, a3, a0 +; RV32IBB-NEXT: or a0, t0, a7 +; RV32IBB-NEXT: bgez t1, .LBB9_9 +; RV32IBB-NEXT: .LBB9_6: +; RV32IBB-NEXT: srl a1, a1, a2 +; RV32IBB-NEXT: or a1, a4, a1 +; RV32IBB-NEXT: ret +; RV32IBB-NEXT: .LBB9_7: +; RV32IBB-NEXT: sll t0, a0, a4 +; RV32IBB-NEXT: bltz a3, .LBB9_5 +; RV32IBB-NEXT: .LBB9_8: +; RV32IBB-NEXT: sll a4, a0, a3 +; RV32IBB-NEXT: or a0, t0, a7 +; RV32IBB-NEXT: bltz t1, .LBB9_6 +; RV32IBB-NEXT: .LBB9_9: +; RV32IBB-NEXT: or a1, 
a4, zero +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: ror_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: andi a3, a2, 63 +; RV32IBP-NEXT: addi t1, a3, -32 +; RV32IBP-NEXT: addi a6, zero, 31 +; RV32IBP-NEXT: bltz t1, .LBB9_2 +; RV32IBP-NEXT: # %bb.1: +; RV32IBP-NEXT: srl a7, a1, t1 +; RV32IBP-NEXT: j .LBB9_3 +; RV32IBP-NEXT: .LBB9_2: +; RV32IBP-NEXT: srl a4, a0, a2 +; RV32IBP-NEXT: sub a3, a6, a3 +; RV32IBP-NEXT: slli a5, a1, 1 +; RV32IBP-NEXT: sll a3, a5, a3 +; RV32IBP-NEXT: or a7, a4, a3 +; RV32IBP-NEXT: .LBB9_3: +; RV32IBP-NEXT: neg a4, a2 +; RV32IBP-NEXT: andi a5, a4, 63 +; RV32IBP-NEXT: addi a3, a5, -32 +; RV32IBP-NEXT: bltz a3, .LBB9_7 +; RV32IBP-NEXT: # %bb.4: +; RV32IBP-NEXT: mv t0, zero +; RV32IBP-NEXT: bgez a3, .LBB9_8 +; RV32IBP-NEXT: .LBB9_5: +; RV32IBP-NEXT: sll a3, a1, a4 +; RV32IBP-NEXT: sub a4, a6, a5 +; RV32IBP-NEXT: srli a0, a0, 1 +; RV32IBP-NEXT: srl a0, a0, a4 +; RV32IBP-NEXT: or a4, a3, a0 +; RV32IBP-NEXT: or a0, t0, a7 +; RV32IBP-NEXT: bgez t1, .LBB9_9 +; RV32IBP-NEXT: .LBB9_6: +; RV32IBP-NEXT: srl a1, a1, a2 +; RV32IBP-NEXT: or a1, a4, a1 +; RV32IBP-NEXT: ret +; RV32IBP-NEXT: .LBB9_7: +; RV32IBP-NEXT: sll t0, a0, a4 +; RV32IBP-NEXT: bltz a3, .LBB9_5 +; RV32IBP-NEXT: .LBB9_8: +; RV32IBP-NEXT: sll a4, a0, a3 +; RV32IBP-NEXT: or a0, t0, a7 +; RV32IBP-NEXT: bltz t1, .LBB9_6 +; RV32IBP-NEXT: .LBB9_9: +; RV32IBP-NEXT: or a1, a4, zero +; RV32IBP-NEXT: ret + %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) + ret i64 %or +} + +define i32 @rori_i32(i32 %a) nounwind { +; RV32I-LABEL: rori_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: rori_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: rori a0, a0, 1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: rori_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: rori a0, a0, 1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: rori_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: rori a0, a0, 1 +; RV32IBP-NEXT: ret + %1 = tail call i32 
@llvm.fshl.i32(i32 %a, i32 %a, i32 31) + ret i32 %1 +} + +define i64 @rori_i64(i64 %a) nounwind { +; RV32I-LABEL: rori_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 31 +; RV32I-NEXT: srli a3, a0, 1 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: slli a0, a0, 31 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: rori_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a3, zero, 31 +; RV32IB-NEXT: fsl a2, a1, a3, a0 +; RV32IB-NEXT: fsl a1, a0, a3, a1 +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: rori_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: slli a2, a1, 31 +; RV32IBB-NEXT: srli a3, a0, 1 +; RV32IBB-NEXT: or a2, a3, a2 +; RV32IBB-NEXT: srli a1, a1, 1 +; RV32IBB-NEXT: slli a0, a0, 31 +; RV32IBB-NEXT: or a1, a0, a1 +; RV32IBB-NEXT: mv a0, a2 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: rori_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: slli a2, a1, 31 +; RV32IBP-NEXT: srli a3, a0, 1 +; RV32IBP-NEXT: or a2, a3, a2 +; RV32IBP-NEXT: srli a1, a1, 1 +; RV32IBP-NEXT: slli a0, a0, 31 +; RV32IBP-NEXT: or a1, a0, a1 +; RV32IBP-NEXT: mv a0, a2 +; RV32IBP-NEXT: ret + %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63) + ret i64 %1 +} + +define i32 @pack_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: pack_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a1, 16 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: pack_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: pack a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: pack_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: pack a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: pack_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: pack a0, a0, a1 +; RV32IBP-NEXT: ret + %shl = and i32 %a, 65535 + %shl1 = shl i32 %b, 16 + %or = or i32 %shl1, %shl + ret i32 %or +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet 
any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. + +define i64 @pack_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: pack_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: pack_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: mv a1, a2 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: pack_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: mv a1, a2 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: pack_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: mv a1, a2 +; RV32IBP-NEXT: ret + %shl = and i64 %a, 4294967295 + %shl1 = shl i64 %b, 32 + %or = or i64 %shl1, %shl + ret i64 %or +} + +define i32 @packu_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: packu_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a0, a0, 16 +; RV32I-NEXT: lui a2, 1048560 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: packu_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: packu a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: packu_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: packu a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: packu_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: packu a0, a0, a1 +; RV32IBP-NEXT: ret + %shr = lshr i32 %a, 16 + %shr1 = and i32 %b, -65536 + %or = or i32 %shr1, %shr + ret i32 %or +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +define i64 @packu_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: packu_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: packu_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: mv a0, a1 +; RV32IB-NEXT: mv a1, a3 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: packu_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: mv a0, a1 +; RV32IBB-NEXT: mv a1, a3 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: packu_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: mv a0, a1 +; RV32IBP-NEXT: mv a1, a3 +; RV32IBP-NEXT: ret + %shr = lshr i64 %a, 32 + %shr1 = and i64 %b, -4294967296 + %or = or i64 %shr1, %shr + ret i64 %or +} + +define i32 @packh_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: packh_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: packh_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: packh a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: packh_i32: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: packh a0, a0, a1 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: packh_i32: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: packh a0, a0, a1 +; RV32IBP-NEXT: ret + %and = and i32 %a, 255 + %and1 = shl i32 %b, 8 + %shl = and i32 %and1, 65280 + %or = or i32 %shl, %and + ret i32 %or +} + +define i64 @packh_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: packh_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a0, a0, 255 +; RV32I-NEXT: slli a1, a2, 8 +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -256 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: ret +; +; RV32IB-LABEL: packh_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: packh a0, a0, a2 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: packh_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: packh a0, a0, a2 +; RV32IBB-NEXT: mv a1, zero +; RV32IBB-NEXT: ret +; +; 
RV32IBP-LABEL: packh_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: packh a0, a0, a2 +; RV32IBP-NEXT: mv a1, zero +; RV32IBP-NEXT: ret + %and = and i64 %a, 255 + %and1 = shl i64 %b, 8 + %shl = and i64 %and1, 65280 + %or = or i64 %shl, %and + ret i64 %or +} diff --git a/llvm/test/CodeGen/RISCV/rv64Zbbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbbp.ll new file mode 100644 index 000000000000..c3a6799739d2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64Zbbp.ll @@ -0,0 +1,517 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IB +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IBB +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbp -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IBP + +define signext i32 @andn_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: andn_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: andn_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: andn a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: andn_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: andn a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: andn_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: andn a0, a0, a1 +; RV64IBP-NEXT: ret + %neg = xor i32 %b, -1 + %and = and i32 %neg, %a + ret i32 %and +} + +define i64 @andn_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: andn_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: andn_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: andn a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: andn_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: andn a0, a0, a1 +; RV64IBB-NEXT: ret +; +; 
RV64IBP-LABEL: andn_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: andn a0, a0, a1 +; RV64IBP-NEXT: ret + %neg = xor i64 %b, -1 + %and = and i64 %neg, %a + ret i64 %and +} + +define signext i32 @orn_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: orn_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: orn_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orn a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: orn_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: orn a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: orn_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orn a0, a0, a1 +; RV64IBP-NEXT: ret + %neg = xor i32 %b, -1 + %or = or i32 %neg, %a + ret i32 %or +} + +define i64 @orn_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: orn_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: orn_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: orn a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: orn_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: orn a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: orn_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: orn a0, a0, a1 +; RV64IBP-NEXT: ret + %neg = xor i64 %b, -1 + %or = or i64 %neg, %a + ret i64 %or +} + +define signext i32 @xnor_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: xnor_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: not a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: xnor_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: xnor a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: xnor_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: xnor a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: xnor_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: xnor a0, a0, a1 +; RV64IBP-NEXT: ret + %neg = xor i32 %a, -1 + %xor = xor i32 %neg, %b + ret i32 %xor +} + +define i64 @xnor_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: xnor_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: not a0, 
a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: xnor_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: xnor a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: xnor_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: xnor a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: xnor_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: xnor a0, a0, a1 +; RV64IBP-NEXT: ret + %neg = xor i64 %a, -1 + %xor = xor i64 %neg, %b + ret i64 %xor +} + +declare i32 @llvm.fshl.i32(i32, i32, i32) + +define signext i32 @rol_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: rol_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sllw a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: srlw a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: rol_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rolw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: rol_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: rolw a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: rol_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rolw a0, a0, a1 +; RV64IBP-NEXT: ret + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 %b) + ret i32 %1 +} + +declare i64 @llvm.fshl.i64(i64, i64, i64) + +define i64 @rol_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: rol_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: sll a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: rol_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rol a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: rol_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: rol a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: rol_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rol a0, a0, a1 +; RV64IBP-NEXT: ret + %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) + ret i64 %or +} + +declare i32 @llvm.fshr.i32(i32, i32, i32) + +define signext i32 @ror_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: ror_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srlw a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: sllw a0, a0, a1 
+; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: ror_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rorw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: ror_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: rorw a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: ror_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rorw a0, a0, a1 +; RV64IBP-NEXT: ret + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 %b) + ret i32 %1 +} + +declare i64 @llvm.fshr.i64(i64, i64, i64) + +define i64 @ror_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: ror_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srl a2, a0, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: sll a0, a0, a1 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: ror_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: ror a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: ror_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: ror a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: ror_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: ror a0, a0, a1 +; RV64IBP-NEXT: ret + %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) + ret i64 %or +} + +define signext i32 @rori_i32(i32 signext %a) nounwind { +; RV64I-LABEL: rori_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 31 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: rori_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fsriw a0, a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: rori_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: roriw a0, a0, 1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: rori_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: roriw a0, a0, 1 +; RV64IBP-NEXT: ret + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 31) + ret i32 %1 +} + +define i64 @rori_i64(i64 %a) nounwind { +; RV64I-LABEL: rori_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 63 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: rori_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: rori 
a0, a0, 1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: rori_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: rori a0, a0, 1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: rori_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: rori a0, a0, 1 +; RV64IBP-NEXT: ret + %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63) + ret i64 %1 +} + +define signext i32 @pack_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: pack_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: slli a1, a1, 16 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: pack_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: packw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: pack_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: packw a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: pack_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: packw a0, a0, a1 +; RV64IBP-NEXT: ret + %shl = and i32 %a, 65535 + %shl1 = shl i32 %b, 16 + %or = or i32 %shl1, %shl + ret i32 %or +} + +define i64 @pack_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: pack_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: pack_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: pack a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: pack_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: pack a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: pack_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: pack a0, a0, a1 +; RV64IBP-NEXT: ret + %shl = and i64 %a, 4294967295 + %shl1 = shl i64 %b, 32 + %or = or i64 %shl1, %shl + ret i64 %or +} + +define signext i32 @packu_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: packu_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a0, a0, 16 +; RV64I-NEXT: lui a2, 1048560 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: packu_i32: +; 
RV64IB: # %bb.0: +; RV64IB-NEXT: packuw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: packu_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: packuw a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: packu_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: packuw a0, a0, a1 +; RV64IBP-NEXT: ret + %shr = lshr i32 %a, 16 + %shr1 = and i32 %b, -65536 + %or = or i32 %shr1, %shr + ret i32 %or +} + +define i64 @packu_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: packu_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: addi a2, zero, -1 +; RV64I-NEXT: slli a2, a2, 32 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: packu_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: packu a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: packu_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: packu a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: packu_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: packu a0, a0, a1 +; RV64IBP-NEXT: ret + %shr = lshr i64 %a, 32 + %shr1 = and i64 %b, -4294967296 + %or = or i64 %shr1, %shr + ret i64 %or +} + +define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: packh_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: packh_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: packh a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: packh_i32: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: packh a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: packh_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: packh a0, a0, a1 +; RV64IBP-NEXT: ret + %and = and i32 %a, 255 + %and1 = shl i32 %b, 8 + %shl = and i32 %and1, 65280 + %or = or i32 %shl, %and + ret i32 %or +} + +define i64 @packh_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: packh_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a0, a0, 255 +; RV64I-NEXT: slli a1, 
a1, 8 +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -256 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: packh_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: packh a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBB-LABEL: packh_i64: +; RV64IBB: # %bb.0: +; RV64IBB-NEXT: packh a0, a0, a1 +; RV64IBB-NEXT: ret +; +; RV64IBP-LABEL: packh_i64: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: packh a0, a0, a1 +; RV64IBP-NEXT: ret + %and = and i64 %a, 255 + %and1 = shl i64 %b, 8 + %shl = and i64 %and1, 65280 + %or = or i64 %shl, %and + ret i64 %or +} From 36d3eaf1ea686db5635ab504dac8d26277167bb0 Mon Sep 17 00:00:00 2001 From: lewis-revill Date: Wed, 15 Jul 2020 11:57:29 +0100 Subject: [PATCH 050/363] [RISCV] Add matching of codegen patterns to RISCV Bit Manipulation Zbs asm instructions This patch provides optimization of bit manipulation operations by enabling the +experimental-b target feature. It adds matching of single block patterns of instructions to specific bit-manip instructions from the single-bit subset (zbs subextension) of the experimental B extension of RISC-V. It adds also the correspondent codegen tests. 
This patch is based on Claire Wolf's proposal for the bit manipulation extension of RISCV: https://github.com/riscv/riscv-bitmanip/blob/master/bitmanip-0.92.pdf Differential Revision: https://reviews.llvm.org/D79874 (cherry picked from commit d4be33374c07ea9a9362892876aa76b227298181) --- llvm/lib/Target/RISCV/RISCVInstrInfoB.td | 53 ++++ llvm/test/CodeGen/RISCV/rv32Zbs.ll | 361 +++++++++++++++++++++++ llvm/test/CodeGen/RISCV/rv64Zbs.ll | 235 +++++++++++++++ 3 files changed, 649 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rv32Zbs.ll create mode 100644 llvm/test/CodeGen/RISCV/rv64Zbs.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index 45eb41f93b2e..aa1ed7ff79cd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -664,6 +664,38 @@ def : Pat<(rotr GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; def : Pat<(fshr GPR:$rs1, GPR:$rs1, GPR:$rs2), (ROR GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbbOrZbp] +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(and (xor (shl 1, (and GPR:$rs2, 31)), -1), GPR:$rs1), + (SBCLR GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(and (xor (shl 1, (and GPR:$rs2, 63)), -1), GPR:$rs1), + (SBCLR GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs] in +def : Pat<(and (rotl -2, GPR:$rs2), GPR:$rs1), (SBCLR GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(or (shl 1, (and GPR:$rs2, 31)), GPR:$rs1), + (SBSET GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(or (shl 1, (and GPR:$rs2, 63)), GPR:$rs1), + (SBSET GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] in +def : Pat<(xor (shl 1, (and GPR:$rs2, 31)), GPR:$rs1), + (SBINV GPR:$rs1, GPR:$rs2)>; +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(xor (shl 1, (and GPR:$rs2, 63)), GPR:$rs1), + (SBINV GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV32] 
in +def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 31)), 1), + (SBEXT GPR:$rs1, GPR:$rs2)>; + +let Predicates = [HasStdExtZbs, IsRV64] in +def : Pat<(and (srl GPR:$rs1, (and GPR:$rs2, 63)), 1), + (SBEXT GPR:$rs1, GPR:$rs2)>; + let Predicates = [HasStdExtZbb] in { def : Pat<(SLOIPat GPR:$rs1, uimmlog2xlen:$shamt), (SLOI GPR:$rs1, uimmlog2xlen:$shamt)>; @@ -678,6 +710,12 @@ let Predicates = [HasStdExtZbbOrZbp] in def : Pat<(RORIPat GPR:$rs1, uimmlog2xlen:$shamt), (RORI GPR:$rs1, uimmlog2xlen:$shamt)>; +// We don't pattern-match sbclri[w], sbseti[w], sbinvi[w] because they are +// pattern-matched by simple andi, ori, and xori. +let Predicates = [HasStdExtZbs] in +def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)), + (SBEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; + let Predicates = [HasStdExtZbp, IsRV32] in { def : Pat<(or (or (and (srl GPR:$rs1, (i32 1)), (i32 0x55555555)), GPR:$rs1), (and (shl GPR:$rs1, (i32 1)), (i32 0xAAAAAAAA))), @@ -886,6 +924,21 @@ def : Pat<(or (riscv_sllw (assertsexti32 GPR:$rs1), (RORW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbbOrZbp, IsRV64] +let Predicates = [HasStdExtZbs, IsRV64] in { +def : Pat<(and (xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), -1), + (assertsexti32 GPR:$rs1)), + (SBCLRW GPR:$rs1, GPR:$rs2)>; +def : Pat<(or (riscv_sllw 1, (assertsexti32 GPR:$rs2)), + (assertsexti32 GPR:$rs1)), + (SBSETW GPR:$rs1, GPR:$rs2)>; +def : Pat<(xor (riscv_sllw 1, (assertsexti32 GPR:$rs2)), + (assertsexti32 GPR:$rs1)), + (SBINVW GPR:$rs1, GPR:$rs2)>; +def : Pat<(and (riscv_srlw (assertsexti32 GPR:$rs1), (assertsexti32 GPR:$rs2)), + 1), + (SBEXTW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtZbs, IsRV64] + let Predicates = [HasStdExtZbb, IsRV64] in { def : Pat<(SLOIWPat GPR:$rs1, uimmlog2xlen:$shamt), (SLOIW GPR:$rs1, uimmlog2xlen:$shamt)>; diff --git a/llvm/test/CodeGen/RISCV/rv32Zbs.ll b/llvm/test/CodeGen/RISCV/rv32Zbs.ll new file mode 100644 index 000000000000..16da34e49c66 --- /dev/null +++ 
b/llvm/test/CodeGen/RISCV/rv32Zbs.ll @@ -0,0 +1,361 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbs -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IBS + +define i32 @sbclr_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sbclr_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: sll a1, a2, a1 +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbclr_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sbclr a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbclr_i32: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: sbclr a0, a0, a1 +; RV32IBS-NEXT: ret + %and = and i32 %b, 31 + %shl = shl nuw i32 1, %and + %neg = xor i32 %shl, -1 + %and1 = and i32 %neg, %a + ret i32 %and1 +} + +define i64 @sbclr_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: sbclr_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 63 +; RV32I-NEXT: addi a4, a3, -32 +; RV32I-NEXT: addi a3, zero, 1 +; RV32I-NEXT: bltz a4, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a2, zero +; RV32I-NEXT: sll a4, a3, a4 +; RV32I-NEXT: j .LBB1_3 +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: mv a4, zero +; RV32I-NEXT: sll a2, a3, a2 +; RV32I-NEXT: .LBB1_3: +; RV32I-NEXT: not a3, a4 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: and a0, a2, a0 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbclr_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andi a3, a2, 63 +; RV32IB-NEXT: addi a4, a3, -32 +; RV32IB-NEXT: addi a3, zero, 1 +; RV32IB-NEXT: bltz a4, .LBB1_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: mv a2, zero +; RV32IB-NEXT: sll a4, a3, a4 +; RV32IB-NEXT: j .LBB1_3 +; RV32IB-NEXT: .LBB1_2: +; RV32IB-NEXT: mv a4, zero +; RV32IB-NEXT: sll a2, a3, 
a2 +; RV32IB-NEXT: .LBB1_3: +; RV32IB-NEXT: andn a0, a0, a2 +; RV32IB-NEXT: andn a1, a1, a4 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbclr_i64: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: andi a3, a2, 63 +; RV32IBS-NEXT: addi a4, a3, -32 +; RV32IBS-NEXT: addi a3, zero, 1 +; RV32IBS-NEXT: bltz a4, .LBB1_2 +; RV32IBS-NEXT: # %bb.1: +; RV32IBS-NEXT: mv a2, zero +; RV32IBS-NEXT: sll a4, a3, a4 +; RV32IBS-NEXT: j .LBB1_3 +; RV32IBS-NEXT: .LBB1_2: +; RV32IBS-NEXT: mv a4, zero +; RV32IBS-NEXT: sll a2, a3, a2 +; RV32IBS-NEXT: .LBB1_3: +; RV32IBS-NEXT: not a3, a4 +; RV32IBS-NEXT: not a2, a2 +; RV32IBS-NEXT: and a0, a2, a0 +; RV32IBS-NEXT: and a1, a3, a1 +; RV32IBS-NEXT: ret + %and = and i64 %b, 63 + %shl = shl nuw i64 1, %and + %neg = xor i64 %shl, -1 + %and1 = and i64 %neg, %a + ret i64 %and1 +} + +define i32 @sbset_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sbset_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: sll a1, a2, a1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbset_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sbset a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbset_i32: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: sbset a0, a0, a1 +; RV32IBS-NEXT: ret + %and = and i32 %b, 31 + %shl = shl nuw i32 1, %and + %or = or i32 %shl, %a + ret i32 %or +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +define i64 @sbset_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: sbset_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a3, zero, 1 +; RV32I-NEXT: sll a2, a3, a2 +; RV32I-NEXT: srai a3, a2, 31 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbset_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a3, zero, 1 +; RV32IB-NEXT: sll a2, a3, a2 +; RV32IB-NEXT: srai a3, a2, 31 +; RV32IB-NEXT: or a0, a2, a0 +; RV32IB-NEXT: or a1, a3, a1 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbset_i64: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: addi a3, zero, 1 +; RV32IBS-NEXT: sll a2, a3, a2 +; RV32IBS-NEXT: srai a3, a2, 31 +; RV32IBS-NEXT: or a0, a2, a0 +; RV32IBS-NEXT: or a1, a3, a1 +; RV32IBS-NEXT: ret + %1 = trunc i64 %b to i32 + %conv = and i32 %1, 63 + %shl = shl nuw i32 1, %conv + %conv1 = sext i32 %shl to i64 + %or = or i64 %conv1, %a + ret i64 %or +} + +define i32 @sbinv_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sbinv_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, zero, 1 +; RV32I-NEXT: sll a1, a2, a1 +; RV32I-NEXT: xor a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbinv_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sbinv a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbinv_i32: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: sbinv a0, a0, a1 +; RV32IBS-NEXT: ret + %and = and i32 %b, 31 + %shl = shl nuw i32 1, %and + %xor = xor i32 %shl, %a + ret i32 %xor +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +define i64 @sbinv_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: sbinv_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a3, zero, 1 +; RV32I-NEXT: sll a2, a3, a2 +; RV32I-NEXT: srai a3, a2, 31 +; RV32I-NEXT: xor a0, a2, a0 +; RV32I-NEXT: xor a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbinv_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a3, zero, 1 +; RV32IB-NEXT: sll a2, a3, a2 +; RV32IB-NEXT: srai a3, a2, 31 +; RV32IB-NEXT: xor a0, a2, a0 +; RV32IB-NEXT: xor a1, a3, a1 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbinv_i64: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: addi a3, zero, 1 +; RV32IBS-NEXT: sll a2, a3, a2 +; RV32IBS-NEXT: srai a3, a2, 31 +; RV32IBS-NEXT: xor a0, a2, a0 +; RV32IBS-NEXT: xor a1, a3, a1 +; RV32IBS-NEXT: ret + %1 = trunc i64 %b to i32 + %conv = and i32 %1, 63 + %shl = shl nuw i32 1, %conv + %conv1 = sext i32 %shl to i64 + %xor = xor i64 %conv1, %a + ret i64 %xor +} + +define i32 @sbext_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sbext_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbext_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sbext a0, a0, a1 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbext_i32: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: sbext a0, a0, a1 +; RV32IBS-NEXT: ret + %and = and i32 %b, 31 + %shr = lshr i32 %a, %and + %and1 = and i32 %shr, 1 + ret i32 %and1 +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet any matching bit manipulation instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions suitable for this pattern. 
+ +define i64 @sbext_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: sbext_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 63 +; RV32I-NEXT: addi a4, a3, -32 +; RV32I-NEXT: bltz a4, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a0, a1, a4 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: addi a2, zero, 31 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbext_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andi a3, a2, 63 +; RV32IB-NEXT: addi a4, a3, -32 +; RV32IB-NEXT: bltz a4, .LBB7_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: srl a0, a1, a4 +; RV32IB-NEXT: j .LBB7_3 +; RV32IB-NEXT: .LBB7_2: +; RV32IB-NEXT: srl a0, a0, a2 +; RV32IB-NEXT: addi a2, zero, 31 +; RV32IB-NEXT: sub a2, a2, a3 +; RV32IB-NEXT: slli a1, a1, 1 +; RV32IB-NEXT: sll a1, a1, a2 +; RV32IB-NEXT: or a0, a0, a1 +; RV32IB-NEXT: .LBB7_3: +; RV32IB-NEXT: andi a0, a0, 1 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbext_i64: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: andi a3, a2, 63 +; RV32IBS-NEXT: addi a4, a3, -32 +; RV32IBS-NEXT: bltz a4, .LBB7_2 +; RV32IBS-NEXT: # %bb.1: +; RV32IBS-NEXT: srl a0, a1, a4 +; RV32IBS-NEXT: j .LBB7_3 +; RV32IBS-NEXT: .LBB7_2: +; RV32IBS-NEXT: srl a0, a0, a2 +; RV32IBS-NEXT: addi a2, zero, 31 +; RV32IBS-NEXT: sub a2, a2, a3 +; RV32IBS-NEXT: slli a1, a1, 1 +; RV32IBS-NEXT: sll a1, a1, a2 +; RV32IBS-NEXT: or a0, a0, a1 +; RV32IBS-NEXT: .LBB7_3: +; RV32IBS-NEXT: andi a0, a0, 1 +; RV32IBS-NEXT: mv a1, zero +; RV32IBS-NEXT: ret + %conv = and i64 %b, 63 + %shr = lshr i64 %a, %conv + %and1 = and i64 %shr, 1 + ret i64 %and1 +} + +define i32 @sbexti_i32(i32 %a) nounwind { +; RV32I-LABEL: sbexti_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a0, a0, 5 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbexti_i32: +; 
RV32IB: # %bb.0: +; RV32IB-NEXT: sbexti a0, a0, 5 +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbexti_i32: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: sbexti a0, a0, 5 +; RV32IBS-NEXT: ret + %shr = lshr i32 %a, 5 + %and = and i32 %shr, 1 + ret i32 %and +} + +define i64 @sbexti_i64(i64 %a) nounwind { +; RV32I-LABEL: sbexti_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a0, a0, 5 +; RV32I-NEXT: andi a0, a0, 1 +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: ret +; +; RV32IB-LABEL: sbexti_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sbexti a0, a0, 5 +; RV32IB-NEXT: mv a1, zero +; RV32IB-NEXT: ret +; +; RV32IBS-LABEL: sbexti_i64: +; RV32IBS: # %bb.0: +; RV32IBS-NEXT: sbexti a0, a0, 5 +; RV32IBS-NEXT: mv a1, zero +; RV32IBS-NEXT: ret + %shr = lshr i64 %a, 5 + %and = and i64 %shr, 1 + ret i64 %and +} diff --git a/llvm/test/CodeGen/RISCV/rv64Zbs.ll b/llvm/test/CodeGen/RISCV/rv64Zbs.ll new file mode 100644 index 000000000000..f7990b36dec8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64Zbs.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IB +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbs -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IBS + +define signext i32 @sbclr_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: sbclr_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: sllw a1, a2, a1 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbclr_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbclrw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbclr_i32: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbclrw a0, a0, a1 +; RV64IBS-NEXT: ret + %and = and i32 %b, 31 + %shl = shl nuw i32 1, %and + %neg = xor i32 %shl, -1 + %and1 = and i32 
%neg, %a + ret i32 %and1 +} + +define i64 @sbclr_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: sbclr_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: sll a1, a2, a1 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbclr_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbclr a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbclr_i64: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbclr a0, a0, a1 +; RV64IBS-NEXT: ret + %and = and i64 %b, 63 + %shl = shl nuw i64 1, %and + %neg = xor i64 %shl, -1 + %and1 = and i64 %neg, %a + ret i64 %and1 +} + +define signext i32 @sbset_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: sbset_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: sllw a1, a2, a1 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbset_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbsetw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbset_i32: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbsetw a0, a0, a1 +; RV64IBS-NEXT: ret + %and = and i32 %b, 31 + %shl = shl nuw i32 1, %and + %or = or i32 %shl, %a + ret i32 %or +} + +define i64 @sbset_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: sbset_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: sll a1, a2, a1 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbset_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbset a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbset_i64: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbset a0, a0, a1 +; RV64IBS-NEXT: ret + %conv = and i64 %b, 63 + %shl = shl nuw i64 1, %conv + %or = or i64 %shl, %a + ret i64 %or +} + +define signext i32 @sbinv_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: sbinv_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: sllw a1, a2, a1 +; RV64I-NEXT: xor a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbinv_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbinvw a0, a0, a1 +; RV64IB-NEXT: ret +; 
+; RV64IBS-LABEL: sbinv_i32: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbinvw a0, a0, a1 +; RV64IBS-NEXT: ret + %and = and i32 %b, 31 + %shl = shl nuw i32 1, %and + %xor = xor i32 %shl, %a + ret i32 %xor +} + +define i64 @sbinv_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: sbinv_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: sll a1, a2, a1 +; RV64I-NEXT: xor a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbinv_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbinv a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbinv_i64: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbinv a0, a0, a1 +; RV64IBS-NEXT: ret + %conv = and i64 %b, 63 + %shl = shl nuw i64 1, %conv + %xor = xor i64 %shl, %a + ret i64 %xor +} + +define signext i32 @sbext_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: sbext_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srlw a0, a0, a1 +; RV64I-NEXT: andi a0, a0, 1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbext_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbextw a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbext_i32: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbextw a0, a0, a1 +; RV64IBS-NEXT: ret + %and = and i32 %b, 31 + %shr = lshr i32 %a, %and + %and1 = and i32 %shr, 1 + ret i32 %and1 +} + +define i64 @sbext_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: sbext_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: andi a0, a0, 1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbext_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbext a0, a0, a1 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbext_i64: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbext a0, a0, a1 +; RV64IBS-NEXT: ret + %conv = and i64 %b, 63 + %shr = lshr i64 %a, %conv + %and1 = and i64 %shr, 1 + ret i64 %and1 +} + +define signext i32 @sbexti_i32(i32 signext %a) nounwind { +; RV64I-LABEL: sbexti_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 5 +; RV64I-NEXT: andi a0, a0, 1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbexti_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbexti a0, a0, 5 +; 
RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbexti_i32: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbexti a0, a0, 5 +; RV64IBS-NEXT: ret + %shr = lshr i32 %a, 5 + %and = and i32 %shr, 1 + ret i32 %and +} + +define i64 @sbexti_i64(i64 %a) nounwind { +; RV64I-LABEL: sbexti_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 5 +; RV64I-NEXT: andi a0, a0, 1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: sbexti_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: sbexti a0, a0, 5 +; RV64IB-NEXT: ret +; +; RV64IBS-LABEL: sbexti_i64: +; RV64IBS: # %bb.0: +; RV64IBS-NEXT: sbexti a0, a0, 5 +; RV64IBS-NEXT: ret + %shr = lshr i64 %a, 5 + %and = and i64 %shr, 1 + ret i64 %and +} From 1daf1144ad1ea302d3a5f208c450da7774b4b607 Mon Sep 17 00:00:00 2001 From: lewis-revill Date: Wed, 15 Jul 2020 11:59:47 +0100 Subject: [PATCH 051/363] [RISCV] Add matching of codegen patterns to RISCV Bit Manipulation Zbt asm instructions This patch provides optimization of bit manipulation operations by enabling the +experimental-b target feature. It adds matching of single block patterns of instructions to specific bit-manip instructions from the ternary subset (zbt subextension) of the experimental B extension of RISC-V. It adds also the correspondent codegen tests. 
This patch is based on Claire Wolf's proposal for the bit manipulation extension of RISCV: https://github.com/riscv/riscv-bitmanip/blob/master/bitmanip-0.92.pdf Differential Revision: https://reviews.llvm.org/D79875 (cherry picked from commit c9c955ada8e65205312f2bc41b46eefa0e98b36c) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 49 ++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 1 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 + llvm/lib/Target/RISCV/RISCVInstrInfoB.td | 39 ++ llvm/test/CodeGen/RISCV/rv32Zbt.ll | 569 ++++++++++++++++++++ llvm/test/CodeGen/RISCV/rv64Zbt.ll | 266 +++++++++ 6 files changed, 929 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rv32Zbt.ll create mode 100644 llvm/test/CodeGen/RISCV/rv64Zbt.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index fd1a91f68802..7570385e38e3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -459,6 +459,55 @@ bool RISCVDAGToDAGISel::SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt) { return false; } +// Check that it is a FSRIW (i32 Funnel Shift Right Immediate on RV64). +// We first check that it is the right node tree: +// +// (SIGN_EXTEND_INREG (OR (SHL (AsserSext RS1, i32), VC2), +// (SRL (AND (AssertSext RS2, i32), VC3), VC1))) +// +// Then we check that the constant operands respect these constraints: +// +// VC2 == 32 - VC1 +// VC3 == maskLeadingOnes(VC2) +// +// being VC1 the Shamt we need, VC2 the complementary of Shamt over 32 +// and VC3 a 32 bit mask of (32 - VC1) leading ones. 
+ +bool RISCVDAGToDAGISel::SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, + SDValue &Shamt) { + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG && + Subtarget->getXLenVT() == MVT::i64 && + cast(N.getOperand(1))->getVT() == MVT::i32) { + if (N.getOperand(0).getOpcode() == ISD::OR) { + SDValue Or = N.getOperand(0); + if (Or.getOperand(0).getOpcode() == ISD::SHL && + Or.getOperand(1).getOpcode() == ISD::SRL) { + SDValue Shl = Or.getOperand(0); + SDValue Srl = Or.getOperand(1); + if (Srl.getOperand(0).getOpcode() == ISD::AND) { + SDValue And = Srl.getOperand(0); + if (isa(Srl.getOperand(1)) && + isa(Shl.getOperand(1)) && + isa(And.getOperand(1))) { + uint32_t VC1 = Srl.getConstantOperandVal(1); + uint32_t VC2 = Shl.getConstantOperandVal(1); + uint32_t VC3 = And.getConstantOperandVal(1); + if (VC2 == (32 - VC1) && + VC3 == maskLeadingOnes(VC2)) { + RS1 = Shl.getOperand(0); + RS2 = And.getOperand(0); + Shamt = CurDAG->getTargetConstant(VC1, SDLoc(N), + Srl.getOperand(1).getValueType()); + return true; + } + } + } + } + } + } + return false; +} + // Merge an ADDI into the offset of a load/store instruction where possible. // (load (addi base, off1), off2) -> (load base, off1+off2) // (store val, (addi base, off1), off2) -> (store val, base, off1+off2) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index bc1655b673d7..0ca12510a230 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -52,6 +52,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { bool SelectSLOIW(SDValue N, SDValue &RS1, SDValue &Shamt); bool SelectSROIW(SDValue N, SDValue &RS1, SDValue &Shamt); bool SelectRORIW(SDValue N, SDValue &RS1, SDValue &Shamt); + bool SelectFSRIW(SDValue N, SDValue &RS1, SDValue &RS2, SDValue &Shamt); // Include the pieces autogenerated from the target description. 
#include "RISCVGenDAGISel.inc" diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7cad9f9bd43e..03d9eefd59d0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -166,6 +166,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtZbp()) setOperationAction(ISD::BITREVERSE, XLenVT, Legal); + if (Subtarget.hasStdExtZbt()) { + setOperationAction(ISD::FSHL, XLenVT, Legal); + setOperationAction(ISD::FSHR, XLenVT, Legal); + } + ISD::CondCode FPCCToExtend[] = { ISD::SETOGT, ISD::SETOGE, ISD::SETONE, ISD::SETUEQ, ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td index aa1ed7ff79cd..afac509f743d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td @@ -643,6 +643,7 @@ def SLLIUWPat : ComplexPattern; def SLOIWPat : ComplexPattern; def SROIWPat : ComplexPattern; def RORIWPat : ComplexPattern; +def FSRIWPat : ComplexPattern; let Predicates = [HasStdExtZbbOrZbp] in { def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>; @@ -804,6 +805,19 @@ def : Pat<(bswap GPR:$rs1), (GREVI GPR:$rs1, (i64 56))>; def : Pat<(bitreverse GPR:$rs1), (GREVI GPR:$rs1, (i64 63))>; } // Predicates = [HasStdExtZbp, IsRV64] +let Predicates = [HasStdExtZbt] in { +def : Pat<(or (and (xor GPR:$rs2, -1), GPR:$rs3), (and GPR:$rs2, GPR:$rs1)), + (CMIX GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_selectcc GPR:$rs2, (XLenVT 0), (XLenVT 17), GPR:$rs3, GPR:$rs1), + (CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshl GPR:$rs1, GPR:$rs2, GPR:$rs3), + (FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs2, GPR:$rs3), + (FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(fshr GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), + (FSRI GPR:$rs1, GPR:$rs2, 
uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbt] + let Predicates = [HasStdExtZbb] in { def : Pat<(ctlz GPR:$rs1), (CLZ GPR:$rs1)>; def : Pat<(cttz GPR:$rs1), (CTZ GPR:$rs1)>; @@ -1004,6 +1018,31 @@ def : Pat<(sra (bswap GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 24))>; def : Pat<(sra (bitreverse GPR:$rs1), (i64 32)), (GREVIW GPR:$rs1, (i64 31))>; } // Predicates = [HasStdExtZbp, IsRV64] +let Predicates = [HasStdExtZbt, IsRV64] in { +def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), + (i64 0), + (i64 17), + (assertsexti32 GPR:$rs1), + (or (riscv_sllw (assertsexti32 GPR:$rs1), + (and (assertsexti32 GPR:$rs3), 31)), + (riscv_srlw (assertsexti32 GPR:$rs2), + (sub (i64 32), + (assertsexti32 GPR:$rs3))))), + (FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(riscv_selectcc (and (assertsexti32 GPR:$rs3), 31), + (i64 0), + (i64 17), + (assertsexti32 GPR:$rs2), + (or (riscv_sllw (assertsexti32 GPR:$rs1), + (sub (i64 32), + (assertsexti32 GPR:$rs3))), + (riscv_srlw (assertsexti32 GPR:$rs2), + (and (assertsexti32 GPR:$rs3), 31)))), + (FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>; +def : Pat<(FSRIWPat GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt), + (FSRIW GPR:$rs1, GPR:$rs2, uimmlog2xlen:$shamt)>; +} // Predicates = [HasStdExtZbt, IsRV64] + let Predicates = [HasStdExtZbb, IsRV64] in { def : Pat<(add (ctlz (and GPR:$rs1, (i64 0xFFFFFFFF))), (i64 -32)), (CLZW GPR:$rs1)>; diff --git a/llvm/test/CodeGen/RISCV/rv32Zbt.ll b/llvm/test/CodeGen/RISCV/rv32Zbt.ll new file mode 100644 index 000000000000..54b5b79778f4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv32Zbt.ll @@ -0,0 +1,569 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbt -verify-machineinstrs < %s \ +; RUN: | 
FileCheck %s -check-prefix=RV32IBT + +define i32 @cmix_i32(i32 %a, i32 %b, i32 %c) nounwind { +; RV32I-LABEL: cmix_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: not a1, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: cmix_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: cmix a0, a1, a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: cmix_i32: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: cmix a0, a1, a0, a2 +; RV32IBT-NEXT: ret + %and = and i32 %b, %a + %neg = xor i32 %b, -1 + %and1 = and i32 %neg, %c + %or = or i32 %and1, %and + ret i32 %or +} + +define i64 @cmix_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV32I-LABEL: cmix_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: and a0, a2, a0 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: and a3, a3, a5 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: cmix_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: cmix a0, a2, a0, a4 +; RV32IB-NEXT: cmix a1, a3, a1, a5 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: cmix_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: cmix a0, a2, a0, a4 +; RV32IBT-NEXT: cmix a1, a3, a1, a5 +; RV32IBT-NEXT: ret + %and = and i64 %b, %a + %neg = xor i64 %b, -1 + %and1 = and i64 %neg, %c + %or = or i64 %and1, %and + ret i64 %or +} + +define i32 @cmov_i32(i32 %a, i32 %b, i32 %c) nounwind { +; RV32I-LABEL: cmov_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: beqz a1, .LBB2_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a2, a0 +; RV32I-NEXT: .LBB2_2: +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: cmov_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: cmov a0, a1, a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: cmov_i32: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: cmov a0, a1, a0, a2 +; RV32IBT-NEXT: ret + %tobool.not = icmp eq i32 %b, 0 + %cond = select i1 %tobool.not, i32 %c, i32 %a + ret i32 %cond +} + +define i64 @cmov_i64(i64 %a, i64 %b, i64 %c) nounwind { 
+; RV32I-LABEL: cmov_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: beqz a2, .LBB3_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv a4, a0 +; RV32I-NEXT: mv a5, a1 +; RV32I-NEXT: .LBB3_2: +; RV32I-NEXT: mv a0, a4 +; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: cmov_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: or a2, a2, a3 +; RV32IB-NEXT: cmov a0, a2, a0, a4 +; RV32IB-NEXT: cmov a1, a2, a1, a5 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: cmov_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: or a2, a2, a3 +; RV32IBT-NEXT: cmov a0, a2, a0, a4 +; RV32IBT-NEXT: cmov a1, a2, a1, a5 +; RV32IBT-NEXT: ret + %tobool.not = icmp eq i64 %b, 0 + %cond = select i1 %tobool.not, i64 %c, i64 %a + ret i64 %cond +} + +declare i32 @llvm.fshl.i32(i32, i32, i32) + +define i32 @fshl_i32(i32 %a, i32 %b, i32 %c) nounwind { +; RV32I-LABEL: fshl_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 31 +; RV32I-NEXT: beqz a3, .LBB4_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: addi a2, zero, 32 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: .LBB4_2: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: fshl_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: fsl a0, a0, a2, a1 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshl_i32: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: fsl a0, a0, a2, a1 +; RV32IBT-NEXT: ret + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet an efficient pattern-matching with bit manipulation +; instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions that can match more efficiently this pattern. 
+ +declare i64 @llvm.fshl.i64(i64, i64, i64) + +define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV32I-LABEL: fshl_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: andi t1, a4, 63 +; RV32I-NEXT: addi a6, t1, -32 +; RV32I-NEXT: addi a7, zero, 31 +; RV32I-NEXT: bltz a6, .LBB5_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sll t0, a0, a6 +; RV32I-NEXT: j .LBB5_3 +; RV32I-NEXT: .LBB5_2: +; RV32I-NEXT: sll t0, a1, a4 +; RV32I-NEXT: sub t2, a7, t1 +; RV32I-NEXT: srli a5, a0, 1 +; RV32I-NEXT: srl a5, a5, t2 +; RV32I-NEXT: or t0, t0, a5 +; RV32I-NEXT: .LBB5_3: +; RV32I-NEXT: addi a5, zero, 32 +; RV32I-NEXT: sub t4, a5, t1 +; RV32I-NEXT: addi a5, zero, 64 +; RV32I-NEXT: sub t2, a5, t1 +; RV32I-NEXT: bltz t4, .LBB5_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t3, zero +; RV32I-NEXT: bnez t1, .LBB5_6 +; RV32I-NEXT: j .LBB5_7 +; RV32I-NEXT: .LBB5_5: +; RV32I-NEXT: srl t3, a3, t2 +; RV32I-NEXT: beqz t1, .LBB5_7 +; RV32I-NEXT: .LBB5_6: +; RV32I-NEXT: or a1, t0, t3 +; RV32I-NEXT: .LBB5_7: +; RV32I-NEXT: bltz t4, .LBB5_10 +; RV32I-NEXT: # %bb.8: +; RV32I-NEXT: srl a2, a3, t4 +; RV32I-NEXT: bgez a6, .LBB5_11 +; RV32I-NEXT: .LBB5_9: +; RV32I-NEXT: sll a3, a0, a4 +; RV32I-NEXT: bnez t1, .LBB5_12 +; RV32I-NEXT: j .LBB5_13 +; RV32I-NEXT: .LBB5_10: +; RV32I-NEXT: srl a2, a2, t2 +; RV32I-NEXT: sub a5, a7, t2 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: sll a3, a3, a5 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: bltz a6, .LBB5_9 +; RV32I-NEXT: .LBB5_11: +; RV32I-NEXT: mv a3, zero +; RV32I-NEXT: beqz t1, .LBB5_13 +; RV32I-NEXT: .LBB5_12: +; RV32I-NEXT: or a0, a3, a2 +; RV32I-NEXT: .LBB5_13: +; RV32I-NEXT: ret +; +; RV32IB-LABEL: fshl_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andi t1, a4, 63 +; RV32IB-NEXT: addi a6, t1, -32 +; RV32IB-NEXT: addi a7, zero, 31 +; RV32IB-NEXT: bltz a6, .LBB5_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: sll t0, a0, a6 +; RV32IB-NEXT: j .LBB5_3 +; RV32IB-NEXT: .LBB5_2: +; RV32IB-NEXT: sll t0, a1, a4 +; RV32IB-NEXT: sub t2, a7, t1 +; RV32IB-NEXT: srli a5, a0, 1 +; 
RV32IB-NEXT: srl a5, a5, t2 +; RV32IB-NEXT: or t0, t0, a5 +; RV32IB-NEXT: .LBB5_3: +; RV32IB-NEXT: addi a5, zero, 32 +; RV32IB-NEXT: sub t4, a5, t1 +; RV32IB-NEXT: addi a5, zero, 64 +; RV32IB-NEXT: sub t2, a5, t1 +; RV32IB-NEXT: bltz t4, .LBB5_7 +; RV32IB-NEXT: # %bb.4: +; RV32IB-NEXT: mv t3, zero +; RV32IB-NEXT: or t0, t0, t3 +; RV32IB-NEXT: bgez t4, .LBB5_8 +; RV32IB-NEXT: .LBB5_5: +; RV32IB-NEXT: srl a2, a2, t2 +; RV32IB-NEXT: sub a5, a7, t2 +; RV32IB-NEXT: slli a3, a3, 1 +; RV32IB-NEXT: sll a3, a3, a5 +; RV32IB-NEXT: or a2, a2, a3 +; RV32IB-NEXT: cmov a1, t1, t0, a1 +; RV32IB-NEXT: bgez a6, .LBB5_9 +; RV32IB-NEXT: .LBB5_6: +; RV32IB-NEXT: sll a3, a0, a4 +; RV32IB-NEXT: j .LBB5_10 +; RV32IB-NEXT: .LBB5_7: +; RV32IB-NEXT: srl t3, a3, t2 +; RV32IB-NEXT: or t0, t0, t3 +; RV32IB-NEXT: bltz t4, .LBB5_5 +; RV32IB-NEXT: .LBB5_8: +; RV32IB-NEXT: srl a2, a3, t4 +; RV32IB-NEXT: cmov a1, t1, t0, a1 +; RV32IB-NEXT: bltz a6, .LBB5_6 +; RV32IB-NEXT: .LBB5_9: +; RV32IB-NEXT: mv a3, zero +; RV32IB-NEXT: .LBB5_10: +; RV32IB-NEXT: or a2, a3, a2 +; RV32IB-NEXT: cmov a0, t1, a2, a0 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshl_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: andi t1, a4, 63 +; RV32IBT-NEXT: addi a6, t1, -32 +; RV32IBT-NEXT: addi a7, zero, 31 +; RV32IBT-NEXT: bltz a6, .LBB5_2 +; RV32IBT-NEXT: # %bb.1: +; RV32IBT-NEXT: sll t0, a0, a6 +; RV32IBT-NEXT: j .LBB5_3 +; RV32IBT-NEXT: .LBB5_2: +; RV32IBT-NEXT: sll t0, a1, a4 +; RV32IBT-NEXT: sub t2, a7, t1 +; RV32IBT-NEXT: srli a5, a0, 1 +; RV32IBT-NEXT: srl a5, a5, t2 +; RV32IBT-NEXT: or t0, t0, a5 +; RV32IBT-NEXT: .LBB5_3: +; RV32IBT-NEXT: addi a5, zero, 32 +; RV32IBT-NEXT: sub t4, a5, t1 +; RV32IBT-NEXT: addi a5, zero, 64 +; RV32IBT-NEXT: sub t2, a5, t1 +; RV32IBT-NEXT: bltz t4, .LBB5_7 +; RV32IBT-NEXT: # %bb.4: +; RV32IBT-NEXT: mv t3, zero +; RV32IBT-NEXT: or t0, t0, t3 +; RV32IBT-NEXT: bgez t4, .LBB5_8 +; RV32IBT-NEXT: .LBB5_5: +; RV32IBT-NEXT: srl a2, a2, t2 +; RV32IBT-NEXT: sub a5, a7, t2 +; RV32IBT-NEXT: slli a3, a3, 1 +; 
RV32IBT-NEXT: sll a3, a3, a5 +; RV32IBT-NEXT: or a2, a2, a3 +; RV32IBT-NEXT: cmov a1, t1, t0, a1 +; RV32IBT-NEXT: bgez a6, .LBB5_9 +; RV32IBT-NEXT: .LBB5_6: +; RV32IBT-NEXT: sll a3, a0, a4 +; RV32IBT-NEXT: j .LBB5_10 +; RV32IBT-NEXT: .LBB5_7: +; RV32IBT-NEXT: srl t3, a3, t2 +; RV32IBT-NEXT: or t0, t0, t3 +; RV32IBT-NEXT: bltz t4, .LBB5_5 +; RV32IBT-NEXT: .LBB5_8: +; RV32IBT-NEXT: srl a2, a3, t4 +; RV32IBT-NEXT: cmov a1, t1, t0, a1 +; RV32IBT-NEXT: bltz a6, .LBB5_6 +; RV32IBT-NEXT: .LBB5_9: +; RV32IBT-NEXT: mv a3, zero +; RV32IBT-NEXT: .LBB5_10: +; RV32IBT-NEXT: or a2, a3, a2 +; RV32IBT-NEXT: cmov a0, t1, a2, a0 +; RV32IBT-NEXT: ret + %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) + ret i64 %1 +} + +declare i32 @llvm.fshr.i32(i32, i32, i32) + +define i32 @fshr_i32(i32 %a, i32 %b, i32 %c) nounwind { +; RV32I-LABEL: fshr_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a3, a2, 31 +; RV32I-NEXT: beqz a3, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl a1, a1, a2 +; RV32I-NEXT: addi a2, zero, 32 +; RV32I-NEXT: sub a2, a2, a3 +; RV32I-NEXT: sll a0, a0, a2 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: mv a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: fshr_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: fsr a0, a0, a2, a1 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshr_i32: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: fsr a0, a0, a2, a1 +; RV32IBT-NEXT: ret + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +; As we are not matching directly i64 code patterns on RV32 some i64 patterns +; don't have yet an efficient pattern-matching with bit manipulation +; instructions on RV32. +; This test is presented here in case future expansions of the experimental-b +; extension introduce instructions that can match more efficiently this pattern. 
+ +declare i64 @llvm.fshr.i64(i64, i64, i64) + +define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV32I-LABEL: fshr_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv a6, a2 +; RV32I-NEXT: andi a5, a4, 63 +; RV32I-NEXT: addi t2, a5, -32 +; RV32I-NEXT: addi a7, zero, 31 +; RV32I-NEXT: bltz t2, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srl t0, t1, t2 +; RV32I-NEXT: j .LBB7_3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: srl t0, a6, a4 +; RV32I-NEXT: sub a3, a7, a5 +; RV32I-NEXT: slli a2, t1, 1 +; RV32I-NEXT: sll a2, a2, a3 +; RV32I-NEXT: or t0, t0, a2 +; RV32I-NEXT: .LBB7_3: +; RV32I-NEXT: addi a2, zero, 32 +; RV32I-NEXT: sub a3, a2, a5 +; RV32I-NEXT: addi a2, zero, 64 +; RV32I-NEXT: sub a2, a2, a5 +; RV32I-NEXT: bltz a3, .LBB7_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: mv t3, zero +; RV32I-NEXT: bnez a5, .LBB7_6 +; RV32I-NEXT: j .LBB7_7 +; RV32I-NEXT: .LBB7_5: +; RV32I-NEXT: sll t3, a0, a2 +; RV32I-NEXT: beqz a5, .LBB7_7 +; RV32I-NEXT: .LBB7_6: +; RV32I-NEXT: or a6, t3, t0 +; RV32I-NEXT: .LBB7_7: +; RV32I-NEXT: bltz a3, .LBB7_10 +; RV32I-NEXT: # %bb.8: +; RV32I-NEXT: sll a0, a0, a3 +; RV32I-NEXT: bgez t2, .LBB7_11 +; RV32I-NEXT: .LBB7_9: +; RV32I-NEXT: srl a1, t1, a4 +; RV32I-NEXT: bnez a5, .LBB7_12 +; RV32I-NEXT: j .LBB7_13 +; RV32I-NEXT: .LBB7_10: +; RV32I-NEXT: sll a1, a1, a2 +; RV32I-NEXT: sub a2, a7, a2 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a0, a0, a2 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: bltz t2, .LBB7_9 +; RV32I-NEXT: .LBB7_11: +; RV32I-NEXT: mv a1, zero +; RV32I-NEXT: beqz a5, .LBB7_13 +; RV32I-NEXT: .LBB7_12: +; RV32I-NEXT: or t1, a0, a1 +; RV32I-NEXT: .LBB7_13: +; RV32I-NEXT: mv a0, a6 +; RV32I-NEXT: mv a1, t1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: fshr_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: andi t1, a4, 63 +; RV32IB-NEXT: addi a6, t1, -32 +; RV32IB-NEXT: addi a7, zero, 31 +; RV32IB-NEXT: bltz a6, .LBB7_2 +; RV32IB-NEXT: # %bb.1: +; RV32IB-NEXT: srl t0, a3, a6 +; RV32IB-NEXT: j .LBB7_3 +; RV32IB-NEXT: .LBB7_2: +; 
RV32IB-NEXT: srl t0, a2, a4 +; RV32IB-NEXT: sub t2, a7, t1 +; RV32IB-NEXT: slli a5, a3, 1 +; RV32IB-NEXT: sll a5, a5, t2 +; RV32IB-NEXT: or t0, t0, a5 +; RV32IB-NEXT: .LBB7_3: +; RV32IB-NEXT: addi a5, zero, 32 +; RV32IB-NEXT: sub t4, a5, t1 +; RV32IB-NEXT: addi a5, zero, 64 +; RV32IB-NEXT: sub t2, a5, t1 +; RV32IB-NEXT: bltz t4, .LBB7_7 +; RV32IB-NEXT: # %bb.4: +; RV32IB-NEXT: mv t3, zero +; RV32IB-NEXT: or t0, t3, t0 +; RV32IB-NEXT: bgez t4, .LBB7_8 +; RV32IB-NEXT: .LBB7_5: +; RV32IB-NEXT: sll a1, a1, t2 +; RV32IB-NEXT: sub a5, a7, t2 +; RV32IB-NEXT: srli a0, a0, 1 +; RV32IB-NEXT: srl a0, a0, a5 +; RV32IB-NEXT: or a1, a1, a0 +; RV32IB-NEXT: cmov a0, t1, t0, a2 +; RV32IB-NEXT: bgez a6, .LBB7_9 +; RV32IB-NEXT: .LBB7_6: +; RV32IB-NEXT: srl a2, a3, a4 +; RV32IB-NEXT: j .LBB7_10 +; RV32IB-NEXT: .LBB7_7: +; RV32IB-NEXT: sll t3, a0, t2 +; RV32IB-NEXT: or t0, t3, t0 +; RV32IB-NEXT: bltz t4, .LBB7_5 +; RV32IB-NEXT: .LBB7_8: +; RV32IB-NEXT: sll a1, a0, t4 +; RV32IB-NEXT: cmov a0, t1, t0, a2 +; RV32IB-NEXT: bltz a6, .LBB7_6 +; RV32IB-NEXT: .LBB7_9: +; RV32IB-NEXT: mv a2, zero +; RV32IB-NEXT: .LBB7_10: +; RV32IB-NEXT: or a1, a1, a2 +; RV32IB-NEXT: cmov a1, t1, a1, a3 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshr_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: andi t1, a4, 63 +; RV32IBT-NEXT: addi a6, t1, -32 +; RV32IBT-NEXT: addi a7, zero, 31 +; RV32IBT-NEXT: bltz a6, .LBB7_2 +; RV32IBT-NEXT: # %bb.1: +; RV32IBT-NEXT: srl t0, a3, a6 +; RV32IBT-NEXT: j .LBB7_3 +; RV32IBT-NEXT: .LBB7_2: +; RV32IBT-NEXT: srl t0, a2, a4 +; RV32IBT-NEXT: sub t2, a7, t1 +; RV32IBT-NEXT: slli a5, a3, 1 +; RV32IBT-NEXT: sll a5, a5, t2 +; RV32IBT-NEXT: or t0, t0, a5 +; RV32IBT-NEXT: .LBB7_3: +; RV32IBT-NEXT: addi a5, zero, 32 +; RV32IBT-NEXT: sub t4, a5, t1 +; RV32IBT-NEXT: addi a5, zero, 64 +; RV32IBT-NEXT: sub t2, a5, t1 +; RV32IBT-NEXT: bltz t4, .LBB7_7 +; RV32IBT-NEXT: # %bb.4: +; RV32IBT-NEXT: mv t3, zero +; RV32IBT-NEXT: or t0, t3, t0 +; RV32IBT-NEXT: bgez t4, .LBB7_8 +; RV32IBT-NEXT: .LBB7_5: +; 
RV32IBT-NEXT: sll a1, a1, t2 +; RV32IBT-NEXT: sub a5, a7, t2 +; RV32IBT-NEXT: srli a0, a0, 1 +; RV32IBT-NEXT: srl a0, a0, a5 +; RV32IBT-NEXT: or a1, a1, a0 +; RV32IBT-NEXT: cmov a0, t1, t0, a2 +; RV32IBT-NEXT: bgez a6, .LBB7_9 +; RV32IBT-NEXT: .LBB7_6: +; RV32IBT-NEXT: srl a2, a3, a4 +; RV32IBT-NEXT: j .LBB7_10 +; RV32IBT-NEXT: .LBB7_7: +; RV32IBT-NEXT: sll t3, a0, t2 +; RV32IBT-NEXT: or t0, t3, t0 +; RV32IBT-NEXT: bltz t4, .LBB7_5 +; RV32IBT-NEXT: .LBB7_8: +; RV32IBT-NEXT: sll a1, a0, t4 +; RV32IBT-NEXT: cmov a0, t1, t0, a2 +; RV32IBT-NEXT: bltz a6, .LBB7_6 +; RV32IBT-NEXT: .LBB7_9: +; RV32IBT-NEXT: mv a2, zero +; RV32IBT-NEXT: .LBB7_10: +; RV32IBT-NEXT: or a1, a1, a2 +; RV32IBT-NEXT: cmov a1, t1, a1, a3 +; RV32IBT-NEXT: ret + %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + ret i64 %1 +} + +define i32 @fshri_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: fshri_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a1, a1, 5 +; RV32I-NEXT: slli a0, a0, 27 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: fshri_i32: +; RV32IB: # %bb.0: +; RV32IB-NEXT: fsri a0, a0, a1, 5 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshri_i32: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: fsri a0, a0, a1, 5 +; RV32IBT-NEXT: ret + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +define i64 @fshri_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: fshri_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a3, 27 +; RV32I-NEXT: srli a2, a2, 5 +; RV32I-NEXT: or a2, a2, a1 +; RV32I-NEXT: srli a1, a3, 5 +; RV32I-NEXT: slli a0, a0, 27 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IB-LABEL: fshri_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: addi a1, zero, 27 +; RV32IB-NEXT: fsl a2, a3, a1, a2 +; RV32IB-NEXT: fsl a1, a0, a1, a3 +; RV32IB-NEXT: mv a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshri_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: addi a1, zero, 27 +; RV32IBT-NEXT: fsl a2, a3, a1, a2 +; RV32IBT-NEXT: fsl a1, a0, a1, a3 +; 
RV32IBT-NEXT: mv a0, a2 +; RV32IBT-NEXT: ret + %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 5) + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/rv64Zbt.ll b/llvm/test/CodeGen/RISCV/rv64Zbt.ll new file mode 100644 index 000000000000..22e25fadbd91 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64Zbt.ll @@ -0,0 +1,266 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-b -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IB +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zbt -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IBT + +define signext i32 @cmix_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind { +; RV64I-LABEL: cmix_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: cmix_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: cmix a0, a1, a0, a2 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: cmix_i32: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: cmix a0, a1, a0, a2 +; RV64IBT-NEXT: ret + %and = and i32 %b, %a + %neg = xor i32 %b, -1 + %and1 = and i32 %neg, %c + %or = or i32 %and1, %and + ret i32 %or +} + +define i64 @cmix_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV64I-LABEL: cmix_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: cmix_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: cmix a0, a1, a0, a2 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: cmix_i64: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: cmix a0, a1, a0, a2 +; RV64IBT-NEXT: ret + %and = and i64 %b, %a + %neg = xor i64 %b, -1 + %and1 = and i64 %neg, %c + %or = or i64 %and1, %and + ret i64 %or +} + +define signext i32 @cmov_i32(i32 signext %a, i32 
signext %b, i32 signext %c) nounwind { +; RV64I-LABEL: cmov_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: beqz a1, .LBB2_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: .LBB2_2: +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: cmov_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: cmov a0, a1, a0, a2 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: cmov_i32: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: cmov a0, a1, a0, a2 +; RV64IBT-NEXT: ret + %tobool.not = icmp eq i32 %b, 0 + %cond = select i1 %tobool.not, i32 %c, i32 %a + ret i32 %cond +} + +define i64 @cmov_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV64I-LABEL: cmov_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: beqz a1, .LBB3_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a2, a0 +; RV64I-NEXT: .LBB3_2: +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: cmov_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: cmov a0, a1, a0, a2 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: cmov_i64: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: cmov a0, a1, a0, a2 +; RV64IBT-NEXT: ret + %tobool.not = icmp eq i64 %b, 0 + %cond = select i1 %tobool.not, i64 %c, i64 %a + ret i64 %cond +} + +declare i32 @llvm.fshl.i32(i32, i32, i32) + +define signext i32 @fshl_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind { +; RV64I-LABEL: fshl_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 31 +; RV64I-NEXT: beqz a3, .LBB4_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: addi a4, zero, 32 +; RV64I-NEXT: sub a2, a4, a2 +; RV64I-NEXT: srlw a1, a1, a2 +; RV64I-NEXT: sllw a0, a0, a3 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: .LBB4_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: fshl_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fslw a0, a0, a2, a1 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: fshl_i32: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: fslw a0, a0, a2, a1 +; RV64IBT-NEXT: ret + %1 = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +declare i64 @llvm.fshl.i64(i64, i64, i64) + +define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV64I-LABEL: 
fshl_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 63 +; RV64I-NEXT: beqz a3, .LBB5_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sll a0, a0, a2 +; RV64I-NEXT: addi a2, zero, 64 +; RV64I-NEXT: sub a2, a2, a3 +; RV64I-NEXT: srl a1, a1, a2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: .LBB5_2: +; RV64I-NEXT: ret +; +; RV64IB-LABEL: fshl_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fsl a0, a0, a2, a1 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: fshl_i64: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: fsl a0, a0, a2, a1 +; RV64IBT-NEXT: ret + %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) + ret i64 %1 +} + +declare i32 @llvm.fshr.i32(i32, i32, i32) + +define signext i32 @fshr_i32(i32 signext %a, i32 signext %b, i32 signext %c) nounwind { +; RV64I-LABEL: fshr_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 31 +; RV64I-NEXT: beqz a3, .LBB6_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: srlw a1, a1, a3 +; RV64I-NEXT: addi a3, zero, 32 +; RV64I-NEXT: sub a2, a3, a2 +; RV64I-NEXT: sllw a0, a0, a2 +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: .LBB6_2: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: fshr_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fsrw a0, a0, a2, a1 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: fshr_i32: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: fsrw a0, a0, a2, a1 +; RV64IBT-NEXT: ret + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) + ret i32 %1 +} + +declare i64 @llvm.fshr.i64(i64, i64, i64) + +define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind { +; RV64I-LABEL: fshr_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: andi a3, a2, 63 +; RV64I-NEXT: beqz a3, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: srl a1, a1, a2 +; RV64I-NEXT: addi a2, zero, 64 +; RV64I-NEXT: sub a2, a2, a3 +; RV64I-NEXT: sll a0, a0, a2 +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: fshr_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fsr a0, a0, a2, a1 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: fshr_i64: +; RV64IBT: # %bb.0: +; 
RV64IBT-NEXT: fsr a0, a0, a2, a1 +; RV64IBT-NEXT: ret + %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + ret i64 %1 +} + +define signext i32 @fshri_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: fshri_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: srliw a1, a1, 5 +; RV64I-NEXT: slli a0, a0, 27 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: fshri_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fsriw a0, a0, a1, 5 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: fshri_i32: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: fsriw a0, a0, a1, 5 +; RV64IBT-NEXT: ret + %1 = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 5) + ret i32 %1 +} + +define i64 @fshri_i64(i64 %a, i64 %b) nounwind { +; RV64I-LABEL: fshri_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a1, a1, 5 +; RV64I-NEXT: slli a0, a0, 59 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: fshri_i64: +; RV64IB: # %bb.0: +; RV64IB-NEXT: fsri a0, a0, a1, 5 +; RV64IB-NEXT: ret +; +; RV64IBT-LABEL: fshri_i64: +; RV64IBT: # %bb.0: +; RV64IBT-NEXT: fsri a0, a0, a1, 5 +; RV64IBT-NEXT: ret + %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 5) + ret i64 %1 +} From f3a043717d265105ab854c5f84ec03c2f67d9a63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 21 Jul 2020 23:39:37 +0300 Subject: [PATCH 052/363] [MC] [COFF] Make sure that weak external symbols are undefined symbols For comdats (e.g. caused by -ffunction-sections), Section is already set here; make sure it's null, for the weak external symbol to be undefined. This fixes PR46779. 
Differential Revision: https://reviews.llvm.org/D84507 (cherry picked from commit 9e81d8bbf19d72fca3d87b7334c613d1aa2a5795) --- llvm/lib/MC/WinCOFFObjectWriter.cpp | 1 + llvm/test/MC/COFF/weak-comdat.s | 34 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 llvm/test/MC/COFF/weak-comdat.s diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 4796ef531054..8e7bf1eb0169 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -375,6 +375,7 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym, COFFSymbol *Local = nullptr; if (cast(MCSym).isWeakExternal()) { Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; + Sym->Section = nullptr; COFFSymbol *WeakDefault = getLinkedSymbol(MCSym); if (!WeakDefault) { diff --git a/llvm/test/MC/COFF/weak-comdat.s b/llvm/test/MC/COFF/weak-comdat.s new file mode 100644 index 000000000000..8605da6b521d --- /dev/null +++ b/llvm/test/MC/COFF/weak-comdat.s @@ -0,0 +1,34 @@ +// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s -o %t.o +// RUN: llvm-readobj --symbols %t.o | FileCheck %s + +// Test that the weak symbol is properly undefined, while originally being +// the leader symbol for a comdat. (This can easily happen if building with +// -ffunction-sections). 
+ + .section .text$func,"xr",one_only,func + .weak func +func: + ret + +// CHECK: Symbol { +// CHECK: Name: func +// CHECK-NEXT: Value: 0 +// CHECK-NEXT: Section: IMAGE_SYM_UNDEFINED (0) +// CHECK-NEXT: BaseType: Null (0x0) +// CHECK-NEXT: ComplexType: Null (0x0) +// CHECK-NEXT: StorageClass: WeakExternal (0x69) +// CHECK-NEXT: AuxSymbolCount: 1 +// CHECK-NEXT: AuxWeakExternal { +// CHECK-NEXT: Linked: .weak.func.default (10) +// CHECK-NEXT: Search: Alias (0x3) +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: Symbol { +// CHECK-NEXT: Name: .weak.func.default +// CHECK-NEXT: Value: 0 +// CHECK-NEXT: Section: .text$func (4) +// CHECK-NEXT: BaseType: Null (0x0) +// CHECK-NEXT: ComplexType: Null (0x0) +// CHECK-NEXT: StorageClass: External (0x2) +// CHECK-NEXT: AuxSymbolCount: 0 +// CHECK-NEXT: } From 405d10e4ad134a5eb573d6d1fdd7ad99f3f8c5ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 24 Jul 2020 00:05:55 +0300 Subject: [PATCH 053/363] [llvm-lib] Support adding short import library objects with llvm-lib This fixes PR 42837. 
Differential Revision: https://reviews.llvm.org/D84465 (cherry picked from commit 4d09ed953b5b8c70d9ca0aeaed8f26a237b612c6) --- llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 6 ++++-- llvm/test/tools/llvm-lib/implibs.test | 12 ++++++++++++ llvm/test/tools/llvm-lib/invalid.test | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/llvm-lib/implibs.test diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index c40901255424..cd39428b9c38 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -191,9 +191,11 @@ static void appendFile(std::vector &Members, file_magic Magic = identify_magic(MB.getBuffer()); if (Magic != file_magic::coff_object && Magic != file_magic::bitcode && - Magic != file_magic::archive && Magic != file_magic::windows_resource) { + Magic != file_magic::archive && Magic != file_magic::windows_resource && + Magic != file_magic::coff_import_library) { llvm::errs() << MB.getBufferIdentifier() - << ": not a COFF object, bitcode, archive or resource file\n"; + << ": not a COFF object, bitcode, archive, import library or " + "resource file\n"; exit(1); } diff --git a/llvm/test/tools/llvm-lib/implibs.test b/llvm/test/tools/llvm-lib/implibs.test new file mode 100644 index 000000000000..ebff4bb4608f --- /dev/null +++ b/llvm/test/tools/llvm-lib/implibs.test @@ -0,0 +1,12 @@ +Test that import libraries (and the members thereof) can be added to another +static library. 
+ +RUN: rm -rf %t +RUN: mkdir -p %t + +RUN: echo -e "EXPORTS\nMyFunc" > %t/lib.def +RUN: llvm-dlltool -m i386:x86-64 -l %t/lib.lib -d %t/lib.def -D lib.dll +RUN: llvm-lib -out:%t/newlib.lib %t/lib.lib + +RUN: llvm-ar t %t/newlib.lib | FileCheck %s +CHECK: lib.dll diff --git a/llvm/test/tools/llvm-lib/invalid.test b/llvm/test/tools/llvm-lib/invalid.test index 57266400cdc8..a4b06a03358b 100644 --- a/llvm/test/tools/llvm-lib/invalid.test +++ b/llvm/test/tools/llvm-lib/invalid.test @@ -1,2 +1,2 @@ RUN: not llvm-lib %S/Inputs/cl-gl.obj 2>&1 | FileCheck %s -CHECK: not a COFF object, bitcode, archive or resource file +CHECK: not a COFF object, bitcode, archive, import library or resource file From d3600f84c4a7a6826f33d45b2545b4e915ab3a5c Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 15 Jul 2020 17:32:02 -0400 Subject: [PATCH 054/363] [OPENMP] Fix PR46730: Fix compiler crash on taskloop over constructible loop counters. Summary: If the variable is constrcutible, its copy is created by calling a constructor. Such variables are duplicated and thus, must be captured. 
Reviewers: jdoerfert Subscribers: yaxunl, guansong, cfe-commits, sstefan1, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D83909 (cherry picked from commit 9840208db6980f690d09b209e6ad6d57133ec5e5) --- clang/lib/Sema/SemaOpenMP.cpp | 6 +++++- clang/test/OpenMP/taskloop_codegen.cpp | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 8bf605e5e76b..533c5b1f6ff0 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2244,7 +2244,11 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, [](OpenMPDirectiveKind K) { return isOpenMPTaskingDirective(K); }, Level)) { bool IsTriviallyCopyable = - D->getType().getNonReferenceType().isTriviallyCopyableType(Context); + D->getType().getNonReferenceType().isTriviallyCopyableType(Context) && + !D->getType() + .getNonReferenceType() + .getCanonicalType() + ->getAsCXXRecordDecl(); OpenMPDirectiveKind DKind = DSAStack->getDirective(Level); SmallVector CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, DKind); diff --git a/clang/test/OpenMP/taskloop_codegen.cpp b/clang/test/OpenMP/taskloop_codegen.cpp index 55e43ff3a115..7402c2ad65eb 100644 --- a/clang/test/OpenMP/taskloop_codegen.cpp +++ b/clang/test/OpenMP/taskloop_codegen.cpp @@ -229,4 +229,20 @@ struct S { // CHECK: br label % // CHECK: ret i32 0 +class St { +public: + operator int(); + St &operator+=(int); +}; + +// CHECK-LABEL: taskloop_with_class +void taskloop_with_class() { + St s1; + // CHECK: [[TD:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @{{.+}}, i32 [[GTID:%.+]], i32 1, i64 88, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[TD_TYPE:%.+]]*)* @{{.+}} to i32 (i32, i8*)*)) + // CHECK: call void @__kmpc_taskloop(%struct.ident_t* @{{.+}}, i32 [[GTID]], i8* [[TD]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* bitcast (void ([[TD_TYPE]]*, [[TD_TYPE]]*, i32)* @{{.+}} to 
i8*)) +#pragma omp taskloop + for (St s = St(); s < s1; s += 1) { + } +} + #endif From 152c2b1befb1879e690f228d95bbfe1bd381a1da Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 23 Jul 2020 18:43:40 -0700 Subject: [PATCH 055/363] [LegalizeTypes] Teach DAGTypeLegalizer::GenWidenVectorLoads to pad with undef if needed when concatenating small or loads to match a larger load In the included test case the align 16 allowed the v23f32 load to handled as load v16f32, load v4f32, and load v4f32(one element not used). These loads all need to be concatenated together into a final vector. In this case we tried to concatenate the two v4f32 loads to match the type of the v16f32 load so we could do a second concat_vectors, but those loads alone only add up to v8f32. So we need to two v4f32 undefs to pad it. It appears we've tried to hack around a similar issue in this code before by adding undef padding to loads in one of the earlier loops in this function. Originally in r147964 by padding all loads narrower than previous loads to the same size. Later modifed to only the last load in r293088. This patch removes that earlier code and just handles it on demand where we know we need it. 
Fixes PR46820 Differential Revision: https://reviews.llvm.org/D84463 (cherry picked from commit 8131e190647ac2b5b085b48a6e3b48c1d7520a66) --- .../SelectionDAG/LegalizeVectorTypes.cpp | 27 +++++------ llvm/test/CodeGen/X86/pr46820.ll | 47 +++++++++++++++++++ 2 files changed, 59 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr46820.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 414ba25ffd5f..b2299931021c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4913,7 +4913,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, int LdWidth = LdVT.getSizeInBits(); int WidthDiff = WidenWidth - LdWidth; - // Allow wider loads. + // Allow wider loads if they are sufficiently aligned to avoid memory faults + // and if the original load is simple. unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment(); // Find the vector type that can load from. @@ -4965,19 +4966,6 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, LD->getPointerInfo().getWithOffset(Offset), LD->getOriginalAlign(), MMOFlags, AAInfo); LdChain.push_back(L.getValue(1)); - if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) { - // Later code assumes the vector loads produced will be mergeable, so we - // must pad the final entry up to the previous width. Scalars are - // combined separately. 
- SmallVector Loads; - Loads.push_back(L); - unsigned size = L->getValueSizeInBits(0); - while (size < LdOp->getValueSizeInBits(0)) { - Loads.push_back(DAG.getUNDEF(L->getValueType(0))); - size += L->getValueSizeInBits(0); - } - L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads); - } } else { L = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo().getWithOffset(Offset), @@ -5018,8 +5006,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl &LdChain, EVT NewLdTy = LdOps[i].getValueType(); if (NewLdTy != LdTy) { // Create a larger vector. + unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits(); + assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0); + SmallVector WidenOps(NumOps); + unsigned j = 0; + for (; j != End-Idx; ++j) + WidenOps[j] = ConcatOps[Idx+j]; + for (; j != NumOps; ++j) + WidenOps[j] = DAG.getUNDEF(LdTy); + ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy, - makeArrayRef(&ConcatOps[Idx], End - Idx)); + WidenOps); Idx = End - 1; LdTy = NewLdTy; } diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll new file mode 100644 index 000000000000..76093801f9d0 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s + +; The alignment of 16 causes type legalization to split this as 3 loads, +; v16f32, v4f32, and v4f32. This loads 24 elements, but the load is aligned +; to 16 bytes so this i safe. There was an issue with type legalization building +; the proper concat_vectors for this because the two v4f32s don't add up to +; v16f32 and require padding. 
+ +define <23 x float> @load23(<23 x float>* %p) { +; CHECK-LABEL: load23: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: vmovups 64(%rsi), %ymm0 +; CHECK-NEXT: vmovups (%rsi), %zmm1 +; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss %xmm3, 88(%rdi) +; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %t0 = load <23 x float>, <23 x float>* %p, align 16 + ret <23 x float> %t0 +} + +; Same test as above with minimal alignment just to demonstrate the different +; codegen. +define <23 x float> @load23_align_1(<23 x float>* %p) { +; CHECK-LABEL: load23_align_1: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: vmovups (%rsi), %zmm0 +; CHECK-NEXT: vmovups 64(%rsi), %xmm1 +; CHECK-NEXT: movq 80(%rsi), %rcx +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss %xmm2, 88(%rdi) +; CHECK-NEXT: movq %rcx, 80(%rdi) +; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %t0 = load <23 x float>, <23 x float>* %p, align 1 + ret <23 x float> %t0 +} From ca49a47b8f87fb942be7c043da2000375346d8b4 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Fri, 24 Jul 2020 15:38:27 -0400 Subject: [PATCH 056/363] [PowerPC] Fix computation of offset for load-and-splat for permuted loads Unfortunately this is another regression from my canonicalization patch (1fed131660b2). The patch contained two implicit assumptions: 1. That we would have a permuted load only if we are loading a partial vector 2. That a partial vector load would necessarily be as wide as the splat However, assumption 2 is not correct since it is possible to do a wider load and only splat a half of it. 
This patch corrects this assumption by simply checking if the load is permuted and adjusting the offset if it is. (cherry picked from commit 7d076e19e31a2a32e357cbdcf0183f88fe1fb0fb) --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 26 ++++-- .../PowerPC/canonical-merge-shuffles.ll | 88 +++++++++++++++++++ 2 files changed, 106 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 11454841cab7..980b5ea2fb7d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9111,13 +9111,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } -static const SDValue *getNormalLoadInput(const SDValue &Op) { +static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; if (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || - InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) + InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { + IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED; InputLoad = &InputLoad->getOperand(0); + } if (InputLoad->getOpcode() != ISD::LOAD) return nullptr; LoadSDNode *LD = cast(*InputLoad); @@ -9289,7 +9291,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!BVNIsConstantSplat || SplatBitSize > 32) { - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9912,7 +9915,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this is a load-and-splat, we can do that with a single instruction // in some cases. 
However if the load has multiple uses, we don't want to // combine it because that will just produce multiple loads. - const SDValue *InputLoad = getNormalLoadInput(V1); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad); if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && InputLoad->hasOneUse()) { @@ -9920,6 +9924,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + // The splat index for permuted loads will be in the left half of the vector + // which is strictly wider than the loaded value by 8 bytes. So we need to + // adjust the splat index to point to the correct address in memory. + if (IsPermutedLoad) { + assert(isLittleEndian && "Unexpected permuted load on big endian target"); + SplatIdx += IsFourByte ? 2 : 1; + assert(SplatIdx < IsFourByte ? 4 : 2 && + "Splat of a value outside of the loaded memory"); + } + LoadSDNode *LD = cast(*InputLoad); // For 4-byte load-and-splat, we need Power9. if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { @@ -9929,10 +9943,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, else Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; - // If we are loading a partial vector, it does not make sense to adjust - // the base pointer. This happens with (splat (s_to_v_permuted (ld))). - if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 
32 : 64)) - Offset = 0; SDValue BasePtr = LD->getBasePtr(); if (Offset != 0) BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index cc349ec228f4..58984d385afa 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -446,5 +446,93 @@ entry: ret <16 x i8> %shuffle } +define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat4Low: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: mtfprd f0, r3 +; CHECK-P8-NEXT: xxspltw v2, vs0, 0 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat4Low: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: addi r3, r3, 4 +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat4Low: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addi r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: vspltw v2, v2, 2 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> + %1 = bitcast <16 x i8> %vecinit18 to <4 x i32> + ret <4 x i32> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat4hi: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: ld r3, 0(r3) +; CHECK-P8-NEXT: mtfprd f0, r3 +; CHECK-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat4hi: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat4hi: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addi 
r4, r1, -16 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: vspltw v2, v2, 3 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> + %1 = bitcast <16 x i8> %vecinit22 to <4 x i32> + ret <4 x i32> %1 +} + +; Function Attrs: norecurse nounwind readonly +define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 { +; CHECK-P8-LABEL: testSplat8: +; CHECK-P8: # %bb.0: # %entry +; CHECK-P8-NEXT: lxvdsx v2, 0, r3 +; CHECK-P8-NEXT: blr +; +; CHECK-P9-LABEL: testSplat8: +; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-P9-NEXT: blr +; +; CHECK-NOVSX-LABEL: testSplat8: +; CHECK-NOVSX: # %bb.0: # %entry +; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha +; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l +; CHECK-NOVSX-NEXT: lvx v2, 0, r4 +; CHECK-NOVSX-NEXT: std r3, -16(r1) +; CHECK-NOVSX-NEXT: addi r3, r1, -16 +; CHECK-NOVSX-NEXT: lvx v3, 0, r3 +; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2 +; CHECK-NOVSX-NEXT: blr +entry: + %0 = load <8 x i8>, <8 x i8>* %ptr, align 8 + %vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> + %1 = bitcast <16 x i8> %vecinit30 to <2 x i64> + ret <2 x i64> %1 +} + declare double @dummy() local_unnamed_addr attributes #0 = { nounwind } From eb3f43bb3a472cbd4dd71b6c7acdf9a298990c29 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Sat, 25 Jul 2020 20:28:52 -0400 Subject: [PATCH 057/363] [PowerPC][NFC] Fix an assert that cannot trip from 7d076e19e31a I mixed up the precedence of operators in the assert and thought I had it right since there was no compiler warning. This just adds the parentheses in the expression as needed. 
(cherry picked from commit cdead4f89c0eecf11f50092bc088e3a9c6511825) --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 980b5ea2fb7d..5c1a4cb16568 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -9292,7 +9292,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (!BVNIsConstantSplat || SplatBitSize > 32) { bool IsPermutedLoad = false; - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); + const SDValue *InputLoad = + getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9930,7 +9931,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (IsPermutedLoad) { assert(isLittleEndian && "Unexpected permuted load on big endian target"); SplatIdx += IsFourByte ? 2 : 1; - assert(SplatIdx < IsFourByte ? 4 : 2 && + assert((SplatIdx < (IsFourByte ? 4 : 2)) && "Splat of a value outside of the loaded memory"); } From 97edd8fdf06170fb5f7c60e7a2324b5e0491e492 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 27 Jul 2020 15:07:51 +0300 Subject: [PATCH 058/363] [JumpThreading] ProcessBranchOnXOR(): bailout if any pred ends in indirect branch (PR46857) SplitBlockPredecessors() can not split blocks that have such terminators, and in two other places we already ensure that we don't end up calling SplitBlockPredecessors() on such blocks. Do so in one more place. 
Fixes https://bugs.llvm.org/show_bug.cgi?id=46857 (cherry picked from commit 1da9834557cd4302a5183b8228ce063e69f82602) --- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 8 +++ .../JumpThreading/pr46857-callbr.ll | 52 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 llvm/test/Transforms/JumpThreading/pr46857-callbr.ll diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9d0500419a7f..2f379b7f6160 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1859,6 +1859,14 @@ bool JumpThreadingPass::ProcessBranchOnXOR(BinaryOperator *BO) { return true; } + // If any of predecessors end with an indirect goto, we can't change its + // destination. Same for CallBr. + if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) { + return isa(Pred->getTerminator()) || + isa(Pred->getTerminator()); + })) + return false; + // Try to duplicate BB into PredBB. return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } diff --git a/llvm/test/Transforms/JumpThreading/pr46857-callbr.ll b/llvm/test/Transforms/JumpThreading/pr46857-callbr.ll new file mode 100644 index 000000000000..3de7d6265136 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/pr46857-callbr.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -jump-threading -S | FileCheck %s + +; CHECK-ALL-LABEL: @func( + +define i1 @func(i1 %arg, i32 %arg1, i1 %arg2) { +; CHECK-LABEL: @func( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[I:%.*]] = icmp eq i32 [[ARG1:%.*]], 0 +; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK: bb4: +; CHECK-NEXT: callbr void asm sideeffect "", "X"(i8* blockaddress(@func, [[BB7]])) +; CHECK-NEXT: to label [[BB5:%.*]] [label %bb7] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB7]] +; CHECK: bb7: +; CHECK-NEXT: [[I8:%.*]] = phi i1 [ [[I]], 
[[BB3]] ], [ [[ARG2:%.*]], [[BB5]] ], [ [[ARG2]], [[BB4]] ] +; CHECK-NEXT: [[I9:%.*]] = xor i1 [[I8]], [[ARG]] +; CHECK-NEXT: br i1 [[I9]], label [[BB11:%.*]], label [[BB11]] +; CHECK: bb11: +; CHECK-NEXT: ret i1 [[I9]] +; +bb: + br i1 %arg, label %bb3, label %bb4 + +bb3: + %i = icmp eq i32 %arg1, 0 + br label %bb7 + +bb4: + callbr void asm sideeffect "", "X"(i8* blockaddress(@func, %bb6)) + to label %bb5 [label %bb6] + +bb5: + br label %bb6 + +bb6: + br label %bb7 + +bb7: + %i8 = phi i1 [ %i, %bb3 ], [ %arg2, %bb6 ] + %i9 = xor i1 %i8, %arg + br i1 %i9, label %bb11, label %bb10 + +bb10: + br label %bb11 + +bb11: + ret i1 %i9 +} From 1b1ad651eacb67385f187eb92a798165deb72c64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 25 Jul 2020 15:01:48 +0300 Subject: [PATCH 059/363] [LLD] [COFF] Fix test to properly test all aspects of c3b1d730d6. NFC. Previously, the test could pass with one part of c3b1d730d6 removed. (cherry picked from commit 8dc820393219c7ee440b4ec86c9a201301943276) --- lld/test/COFF/associative-comdat-mingw-i386.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/COFF/associative-comdat-mingw-i386.s b/lld/test/COFF/associative-comdat-mingw-i386.s index 3f5e02330d50..8d89478d4eb0 100644 --- a/lld/test/COFF/associative-comdat-mingw-i386.s +++ b/lld/test/COFF/associative-comdat-mingw-i386.s @@ -30,7 +30,7 @@ _main: .scl 2; .type 32; .endef - .section .text$foo,"xr",discard,foo + .section .text$foo,"xr",discard,_foo .globl _foo .p2align 4 _foo: From 9dea95b78082a1d9739a60be8a40d721788e4447 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 25 Jul 2020 12:25:19 +0300 Subject: [PATCH 060/363] [LLD] [COFF] Fix mingw comdat associativity for leader symbols with a different name For a weak symbol func in a comdat, the actual leader symbol ends up named like .weak.func.default*. 
Likewise, for stdcall on i386, the symbol may be named _func@4, while the section suffix only is "func", which the previous implementation didn't handle. This fixes unwinding through weak functions when using -ffunction-sections in mingw environments. Differential Revision: https://reviews.llvm.org/D84607 (cherry picked from commit 343ffa70fc4c55f4dc0d717cf8c168865beaa9c4) --- lld/COFF/InputFiles.cpp | 8 +-- lld/test/COFF/associative-comdat-mingw-i386.s | 21 ++++++- lld/test/COFF/associative-comdat-mingw-weak.s | 63 +++++++++++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 lld/test/COFF/associative-comdat-mingw-weak.s diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 0adc2b91bd99..4346b3a2ffa7 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -348,13 +348,13 @@ void ObjFile::recordPrevailingSymbolForMingw( // of the section chunk we actually include instead of discarding it, // add the symbol to a map to allow using it for implicitly // associating .[px]data$ sections to it. + // Use the suffix from the .text$ instead of the leader symbol + // name, for cases where the names differ (i386 mangling/decorations, + // cases where the leader is a weak symbol named .weak.func.default*). 
int32_t sectionNumber = sym.getSectionNumber(); SectionChunk *sc = sparseChunks[sectionNumber]; if (sc && sc->getOutputCharacteristics() & IMAGE_SCN_MEM_EXECUTE) { - StringRef name; - name = check(coffObj->getSymbolName(sym)); - if (getMachineType() == I386) - name.consume_front("_"); + StringRef name = sc->getSectionName().split('$').second; prevailingSectionMap[name] = sectionNumber; } } diff --git a/lld/test/COFF/associative-comdat-mingw-i386.s b/lld/test/COFF/associative-comdat-mingw-i386.s index 8d89478d4eb0..3ba8c1cd9a75 100644 --- a/lld/test/COFF/associative-comdat-mingw-i386.s +++ b/lld/test/COFF/associative-comdat-mingw-i386.s @@ -1,10 +1,14 @@ # REQUIRES: x86 -# RUN: llvm-mc -triple=i686-windows-gnu %s -filetype=obj -o %t.obj +# RUN: llvm-mc -triple=i686-windows-gnu %s -defsym stdcall=0 -filetype=obj -o %t.obj # RUN: lld-link -lldmingw -entry:main %t.obj -out:%t.exe # RUN: llvm-objdump -s %t.exe | FileCheck %s +# RUN: llvm-mc -triple=i686-windows-gnu %s -defsym stdcall=1 -filetype=obj -o %t.stdcall.obj +# RUN: lld-link -lldmingw -entry:main %t.stdcall.obj -out:%t.stdcall.exe +# RUN: llvm-objdump -s %t.stdcall.exe | FileCheck %s + # Check that the .eh_frame comdat was included, even if it had no symbols, # due to associativity with the symbol _foo. 
@@ -19,19 +23,34 @@ .globl _main .p2align 4, 0x90 _main: +.if stdcall==0 call _foo +.else + call _foo@0 +.endif ret .section .eh_frame$foo,"dr" .linkonce discard .byte 0x42 +.if stdcall==0 .def _foo; +.else + .def _foo@0; +.endif .scl 2; .type 32; .endef +.if stdcall==0 .section .text$foo,"xr",discard,_foo .globl _foo .p2align 4 _foo: +.else + .section .text$foo,"xr",discard,_foo@0 + .globl _foo@0 + .p2align 4 +_foo@0: +.endif ret diff --git a/lld/test/COFF/associative-comdat-mingw-weak.s b/lld/test/COFF/associative-comdat-mingw-weak.s new file mode 100644 index 000000000000..80c738b436be --- /dev/null +++ b/lld/test/COFF/associative-comdat-mingw-weak.s @@ -0,0 +1,63 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-windows-gnu %s -filetype=obj -o %t.obj +# RUN: llvm-readobj --symbols %t.obj | FileCheck %s --check-prefix=SYMBOL + +# RUN: lld-link -lldmingw -entry:main %t.obj -out:%t.exe -lldmap:%t.map -verbose +# RUN: llvm-readobj --sections %t.exe | FileCheck %s + +# CHECK: Sections [ +# CHECK: Section { +# CHECK: Number: 2 +# CHECK-LABEL: Name: .rdata (2E 72 64 61 74 61 00 00) +# This is the critical check to show that .xdata$foo was +# retained, while .xdata$bar wasn't. This *must* be 0x24 +# (0x4 for the .xdata section and 0x20 for the +# .ctors/.dtors headers/ends). +# CHECK-NEXT: VirtualSize: 0x24 + +# Check that the weak symbols still are emitted as it was when the test was +# written, to make sure the test still actually tests what was intended. 
+ +# SYMBOL: Symbol { +# SYMBOL: Name: foo +# SYMBOL-NEXT: Value: 0 +# SYMBOL-NEXT: Section: IMAGE_SYM_UNDEFINED (0) +# SYMBOL-NEXT: BaseType: Null (0x0) +# SYMBOL-NEXT: ComplexType: Null (0x0) +# SYMBOL-NEXT: StorageClass: WeakExternal (0x69) +# SYMBOL-NEXT: AuxSymbolCount: 1 +# SYMBOL-NEXT: AuxWeakExternal { +# SYMBOL-NEXT: Linked: .weak.foo.default.main (19) +# SYMBOL-NEXT: Search: Alias (0x3) +# SYMBOL-NEXT: } +# SYMBOL-NEXT: } + + .text + .globl main +main: + call foo + retq + +# See associative-comdat-mingw.s for the general setup. Here, the leader +# symbols are weak, which causes the functions foo and bar to be undefined +# weak externals, while the actual leader symbols are named like +# .weak.foo.default.main. + + .section .xdata$foo,"dr" + .linkonce discard + .long 42 + + .section .xdata$bar,"dr" + .linkonce discard + .long 43 + + .section .text$foo,"xr",discard,foo + .weak foo +foo: + ret + + .section .text$bar,"xr",discard,bar + .weak bar +bar: + ret From 22324a62231082a18c59052232d16c055c1a2968 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 16 Jul 2020 13:37:57 +0100 Subject: [PATCH 061/363] [BasicAA] Add additional negative phi tests. 
NFC (cherry picked from commit 30fa57662760e1489cf70cb411c55fbe9fc189fe) --- llvm/test/Analysis/BasicAA/recphi.ll | 106 +++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/llvm/test/Analysis/BasicAA/recphi.ll b/llvm/test/Analysis/BasicAA/recphi.ll index 130058c74560..3c8397cc0fec 100644 --- a/llvm/test/Analysis/BasicAA/recphi.ll +++ b/llvm/test/Analysis/BasicAA/recphi.ll @@ -83,3 +83,109 @@ if.then: ; preds = %f.exit if.end: ; preds = %f.exit ret i32 0 } + +; CHECK-LABEL: Function: reverse: 6 pointers, 0 call sites +; CHECK: MustAlias: [10 x i32]* %tab, i8* %0 +; CHECK: MustAlias: [10 x i32]* %tab, i32* %arrayidx +; CHECK: MustAlias: i32* %arrayidx, i8* %0 +; CHECK: PartialAlias: [10 x i32]* %tab, i32* %arrayidx1 +; CHECK: NoAlias: i32* %arrayidx1, i8* %0 +; CHECK: NoAlias: i32* %arrayidx, i32* %arrayidx1 +; CHECK: MayAlias: [10 x i32]* %tab, i32* %p.addr.05.i +; CHECK: NoAlias: i32* %p.addr.05.i, i8* %0 +; CHECK: NoAlias: i32* %arrayidx, i32* %p.addr.05.i +; CHECK: MayAlias: i32* %arrayidx1, i32* %p.addr.05.i +; CHECK: MayAlias: [10 x i32]* %tab, i32* %incdec.ptr.i +; CHECK: MayAlias: i32* %incdec.ptr.i, i8* %0 +; CHECK: MayAlias: i32* %arrayidx, i32* %incdec.ptr.i +; CHECK: MayAlias: i32* %arrayidx1, i32* %incdec.ptr.i +; CHECK: NoAlias: i32* %incdec.ptr.i, i32* %p.addr.05.i +define i32 @reverse() nounwind { +entry: + %tab = alloca [10 x i32], align 4 + %0 = bitcast [10 x i32]* %tab to i8* + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %tab, i32 0, i32 0 + store i32 0, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32]* %tab, i32 0, i32 9 + store i32 0, i32* %arrayidx1, align 4 + %1 = add i32 1, 1 + %cmp4.i = icmp slt i32 %1, 2 + br i1 %cmp4.i, label %while.body.i, label %f.exit + +while.body.i: ; preds = %while.body.i, %entry + %2 = phi i32 [ 1, %while.body.i ], [ %1, %entry ] + %foo.06.i = phi i32 [ %sub.i, %while.body.i ], [ 2, %entry ] + %p.addr.05.i = phi i32* [ %incdec.ptr.i, 
%while.body.i ], [ %arrayidx1, %entry ] + %sub.i = sub nsw i32 %foo.06.i, %2 + %incdec.ptr.i = getelementptr inbounds i32, i32* %p.addr.05.i, i32 -1 + store i32 %sub.i, i32* %p.addr.05.i, align 4 + %cmp.i = icmp sgt i32 %sub.i, 1 + br i1 %cmp.i, label %while.body.i, label %f.exit + +f.exit: ; preds = %entry, %while.body.i + %3 = load i32, i32* %arrayidx1, align 4 + %cmp = icmp eq i32 %3, 2 + %4 = load i32, i32* %arrayidx, align 4 + %cmp4 = icmp eq i32 %4, 1 + %or.cond = and i1 %cmp, %cmp4 + br i1 %or.cond, label %if.end, label %if.then + +if.then: ; preds = %f.exit + unreachable + +if.end: ; preds = %f.exit + ret i32 0 +} + +; CHECK-LABEL: Function: negative: 6 pointers, 1 call sites +; CHECK: NoAlias: [3 x i16]* %int_arr.10, i16** %argv.6.par +; CHECK: NoAlias: i16* %_tmp1, i16** %argv.6.par +; CHECK: PartialAlias: [3 x i16]* %int_arr.10, i16* %_tmp1 +; CHECK: NoAlias: i16* %ls1.9.0, i16** %argv.6.par +; CHECK: MayAlias: [3 x i16]* %int_arr.10, i16* %ls1.9.0 +; CHECK: MayAlias: i16* %_tmp1, i16* %ls1.9.0 +; CHECK: NoAlias: i16* %_tmp7, i16** %argv.6.par +; CHECK: MayAlias: [3 x i16]* %int_arr.10, i16* %_tmp7 +; CHECK: MayAlias: i16* %_tmp1, i16* %_tmp7 +; CHECK: NoAlias: i16* %_tmp7, i16* %ls1.9.0 +; CHECK: NoAlias: i16* %_tmp11, i16** %argv.6.par +; CHECK: PartialAlias: [3 x i16]* %int_arr.10, i16* %_tmp11 +; CHECK: NoAlias: i16* %_tmp1, i16* %_tmp11 +; CHECK: NoAlias: i16* %_tmp11, i16* %ls1.9.0 +; CHECK: MayAlias: i16* %_tmp11, i16* %_tmp7 +; CHECK: Both ModRef: Ptr: i16** %argv.6.par <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: NoModRef: Ptr: [3 x i16]* %int_arr.10 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: NoModRef: Ptr: i16* %_tmp1 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: Both ModRef: Ptr: i16* %ls1.9.0 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: Both ModRef: Ptr: i16* %_tmp7 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +; CHECK: NoModRef: Ptr: i16* %_tmp11 <-> %_tmp16 = call i16 @call(i32 %_tmp13) +define i16 @negative(i16 
%argc.5.par, i16** nocapture readnone %argv.6.par) { + %int_arr.10 = alloca [3 x i16], align 1 + %_tmp1 = getelementptr inbounds [3 x i16], [3 x i16]* %int_arr.10, i16 0, i16 2 + br label %bb1 + +bb1: ; preds = %bb1, %0 + %i.7.0 = phi i16 [ 2, %0 ], [ %_tmp5, %bb1 ] + %ls1.9.0 = phi i16* [ %_tmp1, %0 ], [ %_tmp7, %bb1 ] + store i16 %i.7.0, i16* %ls1.9.0, align 1 + %_tmp5 = add nsw i16 %i.7.0, -1 + %_tmp7 = getelementptr i16, i16* %ls1.9.0, i16 -1 + %_tmp9 = icmp sgt i16 %i.7.0, 0 + br i1 %_tmp9, label %bb1, label %bb3 + +bb3: ; preds = %bb1 + %_tmp11 = getelementptr inbounds [3 x i16], [3 x i16]* %int_arr.10, i16 0, i16 1 + %_tmp12 = load i16, i16* %_tmp11, align 1 + %_tmp13 = sext i16 %_tmp12 to i32 + %_tmp16 = call i16 @call(i32 %_tmp13) + %_tmp18.not = icmp eq i16 %_tmp12, 1 + br i1 %_tmp18.not, label %bb5, label %bb4 + +bb4: ; preds = %bb3 + ret i16 1 + +bb5: ; preds = %bb3, %bb4 + ret i16 0 +} + +declare i16 @call(i32) From 00ed5355e45b4d7e84366619dbbbb9df9b4aa816 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 16 Jul 2020 15:42:05 +0100 Subject: [PATCH 062/363] [BasicAA] Fix -basicaa-recphi for geps with negative offsets As shown in D82998, the basic-aa-recphi option can cause miscompiles for gep's with negative constants. The option checks for recursive phi, that recurse through a contant gep. If it finds one, it performs aliasing calculations using the other phi operands with an unknown size, to specify that an unknown number of elements after the initial value are potentially accessed. This works fine expect where the constant is negative, as the size is still considered to be positive. So this patch expands the check to make sure that the constant is also positive. 
Differential Revision: https://reviews.llvm.org/D83576 (cherry picked from commit 311fafd2c90aed5b3fed9566503eebe629f1e979) --- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 55 +++++++++++++----------- llvm/test/Analysis/BasicAA/recphi.ll | 10 ++--- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 74664098ce1d..33f122728d2a 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1648,8 +1648,32 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, } SmallVector V1Srcs; + // For a recursive phi, that recurses through a contant gep, we can perform + // aliasing calculations using the other phi operands with an unknown size to + // specify that an unknown number of elements after the initial value are + // potentially accessed. bool isRecursive = false; - if (PV) { + auto CheckForRecPhi = [&](Value *PV) { + if (!EnableRecPhiAnalysis) + return false; + if (GEPOperator *PVGEP = dyn_cast(PV)) { + // Check whether the incoming value is a GEP that advances the pointer + // result of this PHI node (e.g. in a loop). If this is the case, we + // would recurse and always get a MayAlias. Handle this case specially + // below. We need to ensure that the phi is inbounds and has a constant + // positive operand so that we can check for alias with the initial value + // and an unknown but positive size. + if (PVGEP->getPointerOperand() == PN && PVGEP->isInBounds() && + PVGEP->getNumIndices() == 1 && isa(PVGEP->idx_begin()) && + !cast(PVGEP->idx_begin())->isNegative()) { + isRecursive = true; + return true; + } + } + return false; + }; + + if (PV) { // If we have PhiValues then use it to get the underlying phi values. 
const PhiValues::ValueSet &PhiValueSet = PV->getValuesForPhi(PN); // If we have more phi values than the search depth then return MayAlias @@ -1660,19 +1684,8 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, return MayAlias; // Add the values to V1Srcs for (Value *PV1 : PhiValueSet) { - if (EnableRecPhiAnalysis) { - if (GEPOperator *PV1GEP = dyn_cast(PV1)) { - // Check whether the incoming value is a GEP that advances the pointer - // result of this PHI node (e.g. in a loop). If this is the case, we - // would recurse and always get a MayAlias. Handle this case specially - // below. - if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && - isa(PV1GEP->idx_begin())) { - isRecursive = true; - continue; - } - } - } + if (CheckForRecPhi(PV1)) + continue; V1Srcs.push_back(PV1); } } else { @@ -1687,18 +1700,8 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, // and 'n' are the number of PHI sources. return MayAlias; - if (EnableRecPhiAnalysis) - if (GEPOperator *PV1GEP = dyn_cast(PV1)) { - // Check whether the incoming value is a GEP that advances the pointer - // result of this PHI node (e.g. in a loop). If this is the case, we - // would recurse and always get a MayAlias. Handle this case specially - // below. 
- if (PV1GEP->getPointerOperand() == PN && PV1GEP->getNumIndices() == 1 && - isa(PV1GEP->idx_begin())) { - isRecursive = true; - continue; - } - } + if (CheckForRecPhi(PV1)) + continue; if (UniqueSrc.insert(PV1).second) V1Srcs.push_back(PV1); diff --git a/llvm/test/Analysis/BasicAA/recphi.ll b/llvm/test/Analysis/BasicAA/recphi.ll index 3c8397cc0fec..dfc88937bf69 100644 --- a/llvm/test/Analysis/BasicAA/recphi.ll +++ b/llvm/test/Analysis/BasicAA/recphi.ll @@ -92,8 +92,8 @@ if.end: ; preds = %f.exit ; CHECK: NoAlias: i32* %arrayidx1, i8* %0 ; CHECK: NoAlias: i32* %arrayidx, i32* %arrayidx1 ; CHECK: MayAlias: [10 x i32]* %tab, i32* %p.addr.05.i -; CHECK: NoAlias: i32* %p.addr.05.i, i8* %0 -; CHECK: NoAlias: i32* %arrayidx, i32* %p.addr.05.i +; CHECK: MayAlias: i32* %p.addr.05.i, i8* %0 +; CHECK: MayAlias: i32* %arrayidx, i32* %p.addr.05.i ; CHECK: MayAlias: i32* %arrayidx1, i32* %p.addr.05.i ; CHECK: MayAlias: [10 x i32]* %tab, i32* %incdec.ptr.i ; CHECK: MayAlias: i32* %incdec.ptr.i, i8* %0 @@ -141,17 +141,17 @@ if.end: ; preds = %f.exit ; CHECK: NoAlias: [3 x i16]* %int_arr.10, i16** %argv.6.par ; CHECK: NoAlias: i16* %_tmp1, i16** %argv.6.par ; CHECK: PartialAlias: [3 x i16]* %int_arr.10, i16* %_tmp1 -; CHECK: NoAlias: i16* %ls1.9.0, i16** %argv.6.par +; CHECK: MayAlias: i16* %ls1.9.0, i16** %argv.6.par ; CHECK: MayAlias: [3 x i16]* %int_arr.10, i16* %ls1.9.0 ; CHECK: MayAlias: i16* %_tmp1, i16* %ls1.9.0 -; CHECK: NoAlias: i16* %_tmp7, i16** %argv.6.par +; CHECK: MayAlias: i16* %_tmp7, i16** %argv.6.par ; CHECK: MayAlias: [3 x i16]* %int_arr.10, i16* %_tmp7 ; CHECK: MayAlias: i16* %_tmp1, i16* %_tmp7 ; CHECK: NoAlias: i16* %_tmp7, i16* %ls1.9.0 ; CHECK: NoAlias: i16* %_tmp11, i16** %argv.6.par ; CHECK: PartialAlias: [3 x i16]* %int_arr.10, i16* %_tmp11 ; CHECK: NoAlias: i16* %_tmp1, i16* %_tmp11 -; CHECK: NoAlias: i16* %_tmp11, i16* %ls1.9.0 +; CHECK: MayAlias: i16* %_tmp11, i16* %ls1.9.0 ; CHECK: MayAlias: i16* %_tmp11, i16* %_tmp7 ; CHECK: Both ModRef: Ptr: i16** 
%argv.6.par <-> %_tmp16 = call i16 @call(i32 %_tmp13) ; CHECK: NoModRef: Ptr: [3 x i16]* %int_arr.10 <-> %_tmp16 = call i16 @call(i32 %_tmp13) From bf2c0fb8a254345e62f18d61ed8c3ca84307b0d8 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 27 Jul 2020 17:13:49 +0200 Subject: [PATCH 063/363] Drop the 'git' suffix from various version variables --- libcxx/CMakeLists.txt | 2 +- libcxxabi/CMakeLists.txt | 2 +- libunwind/CMakeLists.txt | 2 +- llvm/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index aabe31fa6ec1..f37d729a8a15 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -32,7 +32,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXX_STANDALONE_BUIL project(libcxx CXX C) set(PACKAGE_NAME libcxx) - set(PACKAGE_VERSION 11.0.0git) + set(PACKAGE_VERSION 11.0.0) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index 8881a5018dc4..6cb139b311c0 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -25,7 +25,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXXABI_STANDALONE_B project(libcxxabi CXX C) set(PACKAGE_NAME libcxxabi) - set(PACKAGE_VERSION 11.0.0git) + set(PACKAGE_VERSION 11.0.0) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index 7065112627a2..02c130ad1bd5 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -83,7 +83,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBUNWIND_STANDALONE_B endif() set(PACKAGE_NAME libunwind) - set(PACKAGE_VERSION 11.0.0git) + set(PACKAGE_VERSION 11.0.0) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/llvm/CMakeLists.txt 
b/llvm/CMakeLists.txt index eacf8d5e5501..038139a24090 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -33,7 +33,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH) set(LLVM_VERSION_PATCH 0) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX git) + set(LLVM_VERSION_SUFFIX "") endif() if (NOT PACKAGE_VERSION) From e47a6a224a4b6d0ce98028f560a8b3806d145907 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 27 Jul 2020 21:11:54 -0700 Subject: [PATCH 064/363] [X86] Detect if EFLAGs is live across XBEGIN pseudo instruction. Add it as livein to the basic blocks created when expanding the pseudo XBEGIN causes several based blocks to be inserted. If flags are live across it we need to make eflags live in the new basic blocks to avoid machine verifier errors. Fixes PR46827 Reviewed By: ivanbaev Differential Revision: https://reviews.llvm.org/D84479 (cherry picked from commit 647e861e080382593648b234668ad2f5a376ac5e) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 57 ++++++++++++++++--------- llvm/test/CodeGen/X86/pr46827.ll | 39 +++++++++++++++++ 2 files changed, 75 insertions(+), 21 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr46827.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f8b6b7eb3aff..7d846e4f2a77 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30953,6 +30953,34 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { // X86 Scheduler Hooks //===----------------------------------------------------------------------===// +// Returns true if EFLAG is consumed after this iterator in the rest of the +// basic block or any successors of the basic block. +static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, + MachineBasicBlock *BB) { + // Scan forward through BB for a use/def of EFLAGS. 
+ for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end(); + miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return true; + // If we found a def, we can stop searching. + if (mi.definesRegister(X86::EFLAGS)) + return false; + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return true; + } + + return false; +} + /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { @@ -30985,6 +31013,12 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, MF->insert(I, fallMBB); MF->insert(I, sinkMBB); + if (isEFLAGSLiveAfter(MI, MBB)) { + mainMBB->addLiveIn(X86::EFLAGS); + fallMBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); @@ -31373,27 +31407,8 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { - // Scan forward through BB for a use/def of EFLAGS. - MachineBasicBlock::iterator miI(std::next(SelectItr)); - for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { - const MachineInstr& mi = *miI; - if (mi.readsRegister(X86::EFLAGS)) - return false; - if (mi.definesRegister(X86::EFLAGS)) - break; // Should have kill-flag - update below. - } - - // If we hit the end of the block, check whether EFLAGS is live into a - // successor. 
- if (miI == BB->end()) { - for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), - sEnd = BB->succ_end(); - sItr != sEnd; ++sItr) { - MachineBasicBlock* succ = *sItr; - if (succ->isLiveIn(X86::EFLAGS)) - return false; - } - } + if (isEFLAGSLiveAfter(SelectItr, BB)) + return false; // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. diff --git a/llvm/test/CodeGen/X86/pr46827.ll b/llvm/test/CodeGen/X86/pr46827.ll new file mode 100644 index 000000000000..438b13c3400f --- /dev/null +++ b/llvm/test/CodeGen/X86/pr46827.ll @@ -0,0 +1,39 @@ +; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+rtm -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s + +; CHECK: body: | +; CHECK: bb.0.bb107: +; CHECK: successors: %bb.3(0x40000000), %bb.4(0x40000000) +; CHECK: %0:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0, align 16) +; CHECK: %1:gr32 = SUB32ri8 %0, 1, implicit-def $eflags +; CHECK: XBEGIN_4 %bb.4, implicit-def $eax +; CHECK: bb.3.bb107: +; CHECK: successors: %bb.5(0x80000000) +; CHECK: liveins: $eflags +; CHECK: %3:gr32 = MOV32ri -1 +; CHECK: JMP_1 %bb.5 +; CHECK: bb.4.bb107: +; CHECK: successors: %bb.5(0x80000000) +; CHECK: liveins: $eflags +; CHECK: XABORT_DEF implicit-def $eax +; CHECK: %4:gr32 = COPY $eax +; CHECK: bb.5.bb107: +; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) +; CHECK: liveins: $eflags +; CHECK: %2:gr32 = PHI %3, %bb.3, %4, %bb.4 +; CHECK: JCC_1 %bb.2, 5, implicit $eflags +; CHECK: JMP_1 %bb.1 + +declare i32 @llvm.x86.xbegin() #0 + +define void @wobble.12(i32 %tmp116) { +bb107: ; preds = %bb42 + %tmp117 = icmp eq i32 %tmp116, 1 + %tmp127 = tail call i32 @llvm.x86.xbegin() #0 + br i1 %tmp117, label %bb129, label %bb250 + +bb129: ; preds = %bb107 + unreachable + +bb250: ; preds = %bb107 + unreachable +} From 592454c367dec9d3dd0abfb840cbaa664bc67bb8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 23 Jul 2020 
18:22:07 +0100 Subject: [PATCH 065/363] [X86][SSE] Add additional (f)add(shuffle(x,y),shuffle(x,y)) tests for D83789 (cherry picked from commit bfc4294ef61d5cf69fffe6b64287a323c003d90f) --- llvm/test/CodeGen/X86/haddsub-4.ll | 405 +++++++++++++++++++++++++++++ 1 file changed, 405 insertions(+) create mode 100644 llvm/test/CodeGen/X86/haddsub-4.ll diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll new file mode 100644 index 000000000000..5c8e9a7c72f2 --- /dev/null +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -0,0 +1,405 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST + +define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { +; SSE-LABEL: hadd_reverse_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pshufb %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pshufb %xmm2, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] +; SSE-NEXT: pshufb %xmm2, %xmm1 +; SSE-NEXT: pshufb %xmm2, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: paddw %xmm4, %xmm0 +; SSE-NEXT: retq +; +; 
AVX-LABEL: hadd_reverse_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: retq + %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + %add = add <8 x i16> %lhs, %rhs + ret <8 x i16> %add +} + +define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { +; SSE-LABEL: hadd_reverse2_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; SSE-NEXT: pshufb %xmm2, %xmm0 +; SSE-NEXT: pshufb %xmm2, %xmm1 +; SSE-NEXT: phaddw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: hadd_reverse2_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> + %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> + %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> + %add = add <8 x i16> %lhs, %rhs + ret <8 x i16> %add +} + +define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: hadd_reverse_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pshufb %xmm0, %xmm5 +; SSE-NEXT: movdqa 
%xmm4, %xmm6 +; SSE-NEXT: pshufb %xmm0, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pshufb %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pshufb %xmm0, %xmm7 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] +; SSE-NEXT: pshufb %xmm0, %xmm2 +; SSE-NEXT: pshufb %xmm0, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: paddw %xmm6, %xmm4 +; SSE-NEXT: pshufb %xmm0, %xmm3 +; SSE-NEXT: pshufb %xmm0, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: paddw %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: hadd_reverse_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm6 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] +; AVX1-NEXT: vpaddw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: hadd_reverse_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = 
ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,0],ymm3[6,4],ymm2[6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: retq + %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> + %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> + %add = add <16 x i16> %lhs, %rhs + ret <16 x i16> %add +} + +define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { +; SSE-LABEL: hadd_reverse2_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; SSE-NEXT: pshufb %xmm0, %xmm4 +; SSE-NEXT: pshufb %xmm0, %xmm1 +; SSE-NEXT: pshufb %xmm0, %xmm2 +; SSE-NEXT: phaddw %xmm2, %xmm4 +; SSE-NEXT: pshufb %xmm0, %xmm3 +; SSE-NEXT: phaddw %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: hadd_reverse2_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vphaddw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, 
%xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: hadd_reverse2_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> + %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> + %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> + %add = add <16 x i16> %lhs, %rhs + ret <16 x i16> %add +} + +define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { +; SSE-LABEL: hadd_reverse_v8f64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm1, %xmm8 +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: haddpd %xmm7, %xmm3 +; SSE-NEXT: haddpd %xmm6, %xmm2 +; SSE-NEXT: haddpd %xmm5, %xmm8 +; SSE-NEXT: haddpd %xmm4, %xmm9 +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm8, %xmm2 +; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: hadd_reverse_v8f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: hadd_reverse_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,3,2,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,2,1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = 
ymm3[0,3,2,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,1] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,0,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-NEXT: vaddpd %ymm0, %ymm4, %ymm2 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[2,1,0,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-NEXT: vaddpd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vmovapd %ymm2, %ymm1 +; AVX2-NEXT: retq + %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> + %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> + %fadd = fadd <8 x double> %lhs, %rhs + ret <8 x double> %fadd +} + +define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { +; SSE-LABEL: hadd_reverse2_v8f64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm1, %xmm8 +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1,0] +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0] +; SSE-NEXT: haddpd %xmm4, %xmm9 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1,0] +; SSE-NEXT: haddpd %xmm5, %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1,0] +; SSE-NEXT: haddpd %xmm6, %xmm2 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1,0] +; SSE-NEXT: haddpd %xmm7, %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm8, %xmm2 +; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: hadd_reverse2_v8f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,3,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX1-NEXT: 
vpermilpd {{.*#+}} ymm1 = ymm2[1,0,3,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-NEXT: vhaddpd %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm3[1,0,3,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vhaddpd %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: hadd_reverse2_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[3,2,1,0] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[3,2,1,0] +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,2,1,0] +; AVX2-NEXT: vhaddpd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: retq + %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> + %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> + %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> + %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> + %fadd = fadd <8 x double> %lhs, %rhs + ret <8 x double> %fadd +} + +define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { +; SSE-LABEL: hadd_reverse_v16f32: +; SSE: # %bb.0: +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[3,1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[3,1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm6[3,1] +; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[3,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,0] +; SSE-NEXT: addps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] +; SSE-NEXT: addps %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,0] +; SSE-NEXT: addps %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] +; SSE-NEXT: addps %xmm11, %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; 
SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: hadd_reverse_v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm0[3,1],ymm4[7,5],ymm0[7,5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,1],ymm3[3,1],ymm5[7,5],ymm3[7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm0[2,0],ymm4[6,4],ymm0[6,4] +; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm1 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm3[2,0],ymm5[6,4],ymm3[6,4] +; AVX1-NEXT: vaddps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: hadd_reverse_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm2[3,1],ymm0[7,5],ymm2[7,5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,0,3,1] +; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1],ymm3[3,1],ymm1[7,5],ymm3[7,5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,0,3,1] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] +; AVX2-NEXT: vaddps %ymm0, %ymm4, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] +; AVX2-NEXT: vaddps %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: retq + %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> + %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> + %fadd = fadd <16 x float> %lhs, %rhs + ret <16 x float> %fadd +} + +define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { +; SSE-LABEL: hadd_reverse2_v16f32: +; SSE: # %bb.0: +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,2],xmm0[1,0] +; 
SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,2],xmm1[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,2,1,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2,1,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] +; SSE-NEXT: haddps %xmm4, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2,1,0] +; SSE-NEXT: haddps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2,1,0] +; SSE-NEXT: haddps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2,1,0] +; SSE-NEXT: haddps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: retq +; +; AVX1-LABEL: hadd_reverse2_v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: hadd_reverse2_v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vhaddps %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: retq + %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> + %shuf1 = shufflevector <16 x float> %a1, 
<16 x float> undef, <16 x i32> + %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> + %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> + %fadd = fadd <16 x float> %lhs, %rhs + ret <16 x float> %fadd +} From d6875948aaade1cd39e5d9b373d02749dd1e58f2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 28 Jul 2020 09:52:38 +0100 Subject: [PATCH 066/363] [X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) An initial backend patch towards fixing the various poor HADD combines (PR34724, PR41813, PR45747 etc.). This extends isHorizontalBinOp to check if we have per-element horizontal ops (odd+even element pairs), but not in the expected serial order - in which case we build a "post shuffle mask" that we can apply to the HOP result, assuming we have fast-hops/optsize etc. The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as suggested on PR41813 - accepting more post-shuffle masks even on slow-hop targets if we can fold it into another shuffle. 
Differential Revision: https://reviews.llvm.org/D83789 (cherry picked from commit 182111777b4ec215eeebe8ab5cc2a324e2f055ff) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 74 ++++++-- llvm/test/CodeGen/X86/haddsub-3.ll | 54 ++++-- llvm/test/CodeGen/X86/haddsub-4.ll | 169 +++++------------- llvm/test/CodeGen/X86/haddsub-shuf.ll | 125 ++++--------- llvm/test/CodeGen/X86/haddsub-undef.ll | 98 ++++++---- .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 48 +++-- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 162 ++++------------- 7 files changed, 321 insertions(+), 409 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7d846e4f2a77..86aa85e965f6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44364,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - bool IsCommutative) { + const X86Subtarget &Subtarget, bool IsCommutative, + SmallVectorImpl &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -44458,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, RMask.push_back(i); } + // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split). + if (!Subtarget.hasAVX2() && VT.isFloatingPoint() && + (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) || + isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask))) + return false; + // If A and B occur in reverse order in RHS, then canonicalize by commuting // RHS operands and shuffle mask. 
if (A != C) { @@ -44468,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, if (!(A == C && B == D)) return false; + PostShuffleMask.clear(); + PostShuffleMask.append(NumElts, SM_SentinelUndef); + // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask @@ -44476,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; + unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { @@ -44487,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; + // Check that successive odd/even elements are being operated on. If not, + // this is not a horizontal operation. + if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && + !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) + return false; + + // Compute the post-shuffle mask index based on where the element + // is stored in the HOP result, and where it needs to be moved to. + int Base = LIdx & ~1u; + int Index = ((Base % NumEltsPer128BitChunk) / 2) + + ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); + // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. - unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; - unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; - - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. 
- int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) + Index += NumEltsPer64BitChunk; + PostShuffleMask[i + j] = Index; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. - if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + bool IsIdentityPostShuffle = + isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); + if (IsIdentityPostShuffle) + PostShuffleMask.clear(); + + // Assume a SingleSource HOP if we only shuffle one input and don't need to + // shuffle the result. + if (!shouldUseHorizontalOp(LHS == RHS && + (NumShuffles < 2 || !IsIdentityPostShuffle), + DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); @@ -44524,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. + SmallVector PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) - return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; + } // NOTE: isHorizontalBinOp may have changed LHS/RHS variables. 
@@ -47620,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, bool IsAdd = N->getOpcode() == ISD::ADD; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); + SmallVector PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) { auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { - return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, - DL, Ops[0].getValueType(), Ops); + return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL, + Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HOpBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } return SDValue(); diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index f603ace202a1..6abba1bbfe9f 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -17,22 +17,46 @@ define float @pr26491(<4 x float> %a0) { ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: pr26491: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: addps %xmm0, %xmm1 -; SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: pr26491: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 +; 
SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: pr26491: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: pr26491: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSSE3-FAST-NEXT: addss %xmm0, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: pr26491: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: pr26491: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: pr26491: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = fadd <4 x float> %1, %a0 %3 = extractelement <4 x float> %2, i32 2 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll index 5c8e9a7c72f2..4c1dc71982aa 100644 --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -9,30 +9,16 @@ define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshufb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, 
%xmm4 -; SSE-NEXT: pshufb %xmm2, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pshufb %xmm2, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: paddw %xmm4, %xmm0 +; SSE-NEXT: phaddw %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE-NEXT: retq ; ; AVX-LABEL: hadd_reverse_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX-NEXT: retq %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> @@ -67,67 +53,34 @@ define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind { define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; SSE-LABEL: hadd_reverse_v16i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pshufb %xmm0, %xmm6 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pshufb %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pshufb %xmm0, %xmm7 -; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; SSE-NEXT: pshufb %xmm0, %xmm2 -; SSE-NEXT: pshufb %xmm0, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: paddw %xmm6, %xmm4 -; SSE-NEXT: pshufb %xmm0, %xmm3 -; SSE-NEXT: pshufb %xmm0, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: paddw %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: phaddw %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: phaddw %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0] -; AVX1-NEXT: vpaddw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,7,6,5,4] +; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,0],ymm3[6,4],ymm2[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,u,u,u,u,u,u,u,u,28,29,24,25,20,21,16,17,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> @@ -209,21 +162,11 @@ define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) noun ; ; AVX2-LABEL: hadd_reverse_v8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,3,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,2,1] -; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,1] -; AVX2-NEXT: vunpckhpd {{.*#+}} 
ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,0,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-NEXT: vaddpd %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[2,1,0,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: vaddpd %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovapd %ymm2, %ymm1 +; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vmovapd %ymm3, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> @@ -290,22 +233,14 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nou ; SSE: # %bb.0: ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[3,1] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[3,1] -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm6[3,1] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm2[3,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,0] -; SSE-NEXT: addps %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: addps %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,0] -; SSE-NEXT: addps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] -; SSE-NEXT: addps %xmm11, %xmm3 +; SSE-NEXT: haddps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] +; SSE-NEXT: haddps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0,3,2] +; SSE-NEXT: haddps %xmm0, %xmm5 +; SSE-NEXT: 
shufps {{.*#+}} xmm5 = xmm5[1,0,3,2] +; SSE-NEXT: haddps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0,3,2] ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm7, %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm2 @@ -316,29 +251,23 @@ define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nou ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm0[3,1],ymm4[7,5],ymm0[7,5] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm3[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,1],ymm3[3,1],ymm5[7,5],ymm3[7,5] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm0[2,0],ymm4[6,4],ymm0[6,4] -; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm1 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm3[2,0],ymm5[6,4],ymm3[6,4] -; AVX1-NEXT: vaddps %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm2[3,1],ymm0[7,5],ymm2[7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,0,3,1] -; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1],ymm3[3,1],ymm1[7,5],ymm3[7,5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,0,3,1] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] -; AVX2-NEXT: vaddps %ymm0, %ymm4, %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] -; AVX2-NEXT: vaddps %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vmovaps %ymm2, %ymm1 +; AVX2-NEXT: vhaddps 
%ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1] +; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1] +; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 7bedbeb58109..76ef7afbebf3 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -879,77 +879,59 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_1: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_SLOW-NEXT: addps %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_1: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[2,0],xmm0[2,3] ; SSSE3_FAST-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_FAST-NEXT: addps %xmm0, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_1: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_1: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; 
AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_1: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_1: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> @@ -964,78 +946,49 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_2: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] -; 
SSSE3_SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_2: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm3 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: addps %xmm3, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_2: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, 
%xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_2: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_2: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_2: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; 
AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index f8648f5b7018..ae53f2d8905f 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -818,12 +818,25 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-LABEL: PR44694: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: PR44694: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-SLOW-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: PR44694: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-LABEL: PR44694: +; AVX512: # %bb.0: +; AVX512-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %5 = fadd <4 x double> %3, %4 @@ -831,20 +844,30 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { } define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_1: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-NEXT: addps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; 
SSE-SLOW-LABEL: PR45747_1: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_1: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %a %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> @@ -852,19 +875,32 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { } define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_2: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movaps %xmm1, %xmm0 +; 
SSE-FAST-NEXT: haddps %xmm1, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %b %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index 5195f5f0e0c7..1fd61912ed4a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1766,12 +1766,24 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) { } define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: add_v4f64_0246_1357: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: add_v4f64_0246_1357: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: add_v4f64_0246_1357: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: add_v4f64_0246_1357: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-NEXT: retq entry: %shuffle = shufflevector <4 
x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -1780,12 +1792,24 @@ entry: } define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) { -; ALL-LABEL: add_v4f64_4602_5713: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX1-LABEL: add_v4f64_4602_5713: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: add_v4f64_4602_5713: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: add_v4f64_4602_5713: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX512VL-NEXT: retq entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 973045696fbd..9edd6a187643 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3039,32 +3039,11 @@ define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vaddps %ymm0, %ymm2, 
%ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8f32_02468ACE_13579BDF: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -3080,32 +3059,11 @@ define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] 
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8f32_8ACE0246_9BDF1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2 -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2ps %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vaddps %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8f32_8ACE0246_9BDF1357: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -3116,45 +3074,21 @@ entry: define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: add_v8i32_02468ACE_13579BDF: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; 
AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 -; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8i32_02468ACE_13579BDF: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3165,45 +3099,21 @@ entry: define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357: ; AVX1: # %bb.0: # 
%entry -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[2],ymm0[2] ; AVX1-NEXT: retq ; -; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2 -; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX512VL-SLOW-NEXT: retq -; -; AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, 
%zmm2 -; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] -; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3 -; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX2OR512VL-LABEL: add_v8i32_8ACE0246_9BDF1357: +; AVX2OR512VL: # %bb.0: # %entry +; AVX2OR512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2OR512VL-NEXT: retq entry: %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> From 70b2872f4810569173c7042c51333d83deb16d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 27 Jul 2020 23:44:41 +0300 Subject: [PATCH 067/363] [LLD] [MinGW] Implement the --no-seh flag Previously this flag was just ignored. If set, set the IMAGE_DLL_CHARACTERISTICS_NO_SEH bit, regardless of the normal safeSEH machinery. In mingw configurations, the safeSEH bit might not be set in e.g. object files built from handwritten assembly, making it impossible to use the normal safeseh flag. As mingw setups don't generally use SEH on 32 bit x86 at all, it should be fine to set that flag bit though - hook up the existing GNU ld flag for controlling that. 
Differential Revision: https://reviews.llvm.org/D84701 (cherry picked from commit 745eb02496b515cc8292dd7f9d7f0db43e162013) --- lld/COFF/Config.h | 1 + lld/COFF/Driver.cpp | 7 ++++--- lld/COFF/Options.td | 1 + lld/COFF/Writer.cpp | 2 +- lld/MinGW/Driver.cpp | 2 ++ lld/MinGW/Options.td | 2 +- lld/test/COFF/noseh.s | 19 +++++++++++++++++++ lld/test/MinGW/driver.test | 4 ++++ 8 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 lld/test/COFF/noseh.s diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 72d826b8bd17..7c439176f3a4 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -140,6 +140,7 @@ struct Configuration { bool safeSEH = false; Symbol *sehTable = nullptr; Symbol *sehCount = nullptr; + bool noSEH = false; // Used for /opt:lldlto=N unsigned ltoo = 2; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 7372505bb616..9ceccef86779 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1700,9 +1700,10 @@ void LinkerDriver::link(ArrayRef argsArr) { config->wordsize = config->is64() ? 8 : 4; // Handle /safeseh, x86 only, on by default, except for mingw. 
- if (config->machine == I386 && - args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw)) - config->safeSEH = true; + if (config->machine == I386) { + config->safeSEH = args.hasFlag(OPT_safeseh, OPT_safeseh_no, !config->mingw); + config->noSEH = args.hasArg(OPT_noseh); + } // Handle /functionpadmin for (auto *arg : args.filtered(OPT_functionpadmin, OPT_functionpadmin_opt)) diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index 212879e1d60b..087d53b5d2dd 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -204,6 +204,7 @@ def include_optional : Joined<["/", "-", "/?", "-?"], "includeoptional:">, HelpText<"Add symbol as undefined, but allow it to remain undefined">; def kill_at : F<"kill-at">; def lldmingw : F<"lldmingw">; +def noseh : F<"noseh">; def output_def : Joined<["/", "-", "/?", "-?"], "output-def:">; def pdb_source_path : P<"pdbsourcepath", "Base path used to make relative source file path absolute in PDB">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 3bcc1777f7ac..082de5b8c1d6 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1393,7 +1393,7 @@ template void Writer::writeHeader() { pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_GUARD_CF; if (config->integrityCheck) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY; - if (setNoSEHCharacteristic) + if (setNoSEHCharacteristic || config->noSEH) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_NO_SEH; if (config->terminalServerAware) pe->DLLCharacteristics |= IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE; diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp index f33b5e19502c..d60765c70c09 100644 --- a/lld/MinGW/Driver.cpp +++ b/lld/MinGW/Driver.cpp @@ -288,6 +288,8 @@ bool mingw::link(ArrayRef argsArr, bool canExitEarly, add("-kill-at"); if (args.hasArg(OPT_appcontainer)) add("-appcontainer"); + if (args.hasArg(OPT_no_seh)) + add("-noseh"); if (args.getLastArgValue(OPT_m) != "thumb2pe" && args.getLastArgValue(OPT_m) != "arm64pe" 
&& !args.hasArg(OPT_dynamicbase)) diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td index 3281951dc89d..fe4416660050 100644 --- a/lld/MinGW/Options.td +++ b/lld/MinGW/Options.td @@ -56,6 +56,7 @@ defm minor_subsystem_version: EqLong<"minor-subsystem-version", "Set the OS and subsystem minor version">; def no_insert_timestamp: F<"no-insert-timestamp">, HelpText<"Don't include PE header timestamp">; +def no_seh: F<"no-seh">, HelpText<"Set the 'no SEH' flag in the executable">; def no_whole_archive: F<"no-whole-archive">, HelpText<"No longer include all object files for following archives">; def large_address_aware: Flag<["--"], "large-address-aware">, @@ -111,7 +112,6 @@ def: Flag<["--"], "full-shutdown">; def: F<"high-entropy-va">; def: S<"major-image-version">; def: S<"minor-image-version">; -def: F<"no-seh">; def: F<"nxcompat">; def: F<"pic-executable">; def: S<"plugin">; diff --git a/lld/test/COFF/noseh.s b/lld/test/COFF/noseh.s new file mode 100644 index 000000000000..442952286229 --- /dev/null +++ b/lld/test/COFF/noseh.s @@ -0,0 +1,19 @@ +# REQUIRES: x86 +# RUN: llvm-mc -triple i686-w64-mingw32 %s -filetype=obj -o %t.obj +# RUN: lld-link -lldmingw %t.obj -out:%t.exe -entry:main +# RUN: llvm-readobj --file-headers %t.exe | FileCheck %s --check-prefix=DEFAULT +# RUN: lld-link -lldmingw %t.obj -out:%t.noseh.exe -entry:main -noseh +# RUN: llvm-readobj --file-headers %t.noseh.exe | FileCheck %s --check-prefix=NOSEH + +# DEFAULT: Characteristics [ +# DEFAULT-NOT: IMAGE_DLL_CHARACTERISTICS_NO_SEH +# DEFAULT: ] + +# NOSEH: Characteristics [ +# NOSEH: IMAGE_DLL_CHARACTERISTICS_NO_SEH +# NOSEH: ] + + .text + .globl _main +_main: + ret diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test index 385822c7e1f7..faac3a0be57d 100644 --- a/lld/test/MinGW/driver.test +++ b/lld/test/MinGW/driver.test @@ -256,3 +256,7 @@ RUN: ld.lld -### -m i386pep foo.o -section-alignment 0x2000 | FileCheck -check-p RUN: ld.lld -### -m i386pep foo.o 
--section-alignment=0x2000 | FileCheck -check-prefix ALIGN %s RUN: ld.lld -### -m i386pep foo.o -section-alignment=0x2000 | FileCheck -check-prefix ALIGN %s ALIGN: -align:0x2000 + +RUN: ld.lld -### -m i386pe foo.o -no-seh | FileCheck -check-prefix NOSEH %s +RUN: ld.lld -### -m i386pe foo.o --no-seh | FileCheck -check-prefix NOSEH %s +NOSEH: -noseh From 7f2a078b11316e11b89ee09215b0e7c0b78f359b Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 28 Jul 2020 15:02:36 -0400 Subject: [PATCH 068/363] [InstCombine] avoid crashing on vector constant expression (PR46872) (cherry picked from commit f75cf240d6ed528e1ce7770bbe09b417338b40ef) --- .../InstCombine/InstructionCombining.cpp | 2 +- llvm/test/Transforms/InstCombine/vec_shuffle.ll | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index b3254c10a0b2..17a5ec3f87fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1543,7 +1543,7 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { Constant *C; if (match(&Inst, m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))), - m_Constant(C))) && + m_Constant(C))) && !isa(C) && cast(V1->getType())->getNumElements() <= NumElts) { assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() && "Shuffle should not change scalar type"); diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index e7e55b07b7cd..3f3431c5d904 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -1745,3 +1745,18 @@ define <4 x i32> @splat_assoc_add_mul(<4 x i32> %x, <4 x i32> %y) { %r = mul <4 x i32> %splatx, %a ret <4 x i32> %r } + + +; Do not crash on constant expressions. 
+ +define <4 x i32> @PR46872(<4 x i32> %x) { +; CHECK-LABEL: @PR46872( +; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[A:%.*]] = and <4 x i32> [[S]], bitcast (<2 x i64> (<4 x i32>)* @PR46872 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64)> to <4 x i32>) +; CHECK-NEXT: ret <4 x i32> [[A]] +; + %s = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> + %a = and <4 x i32> %s, bitcast (<2 x i64> (<4 x i32>)* @PR46872 to i64), i64 ptrtoint (<4 x i32> (<4 x i32>)* @PR46872 to i64)> to <4 x i32>) + ret <4 x i32> %a +} + From baf2999b49c6aff2fcd5448c7d299eb2f4bf8b86 Mon Sep 17 00:00:00 2001 From: Nathan James Date: Wed, 29 Jul 2020 15:35:28 +0100 Subject: [PATCH 069/363] [clang-tidy] Fix RedundantStringCStrCheck with r values The previous fix for this, https://reviews.llvm.org/D76761, Passed test cases but failed in the real world as std::string has a non trivial destructor so creates a CXXBindTemporaryExpr. This handles that shortfall and updates the test case std::basic_string implementation to use a non trivial destructor to reflect real world behaviour. 
Reviewed By: gribozavr2 Differential Revision: https://reviews.llvm.org/D84831 (cherry picked from commit b99630e432614d06b380afb15c466665065eaa0a) --- .../readability/RedundantStringCStrCheck.cpp | 14 ++++++++------ .../checkers/readability-redundant-string-cstr.cpp | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp index bea02a6ba111..1f371eed2db8 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantStringCStrCheck.cpp @@ -92,16 +92,18 @@ void RedundantStringCStrCheck::registerMatchers( callee(memberExpr().bind("member")), callee(cxxMethodDecl(hasAnyName("c_str", "data")))) .bind("call"); - + const auto HasRValueTempParent = + hasParent(materializeTemporaryExpr(unless(isBoundToLValue()))); // Detect redundant 'c_str()' calls through a string constructor. // If CxxConstructExpr is the part of some CallExpr we need to // check that matched ParamDecl of the ancestor CallExpr is not rvalue. 
Finder->addMatcher( - traverse(ast_type_traits::TK_AsIs, - cxxConstructExpr(StringConstructorExpr, - hasArgument(0, StringCStrCallExpr), - unless(hasParent(materializeTemporaryExpr( - unless(isBoundToLValue())))))), + traverse( + ast_type_traits::TK_AsIs, + cxxConstructExpr( + StringConstructorExpr, hasArgument(0, StringCStrCallExpr), + unless(anyOf(HasRValueTempParent, hasParent(cxxBindTemporaryExpr( + HasRValueTempParent)))))), this); // Detect: 's == str.c_str()' -> 's == str' diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp index 2561b81805bd..e1df8cccc10b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-redundant-string-cstr.cpp @@ -15,6 +15,8 @@ struct basic_string { basic_string(); basic_string(const C *p, const A &a = A()); + ~basic_string(); + const C *c_str() const; const C *data() const; From fdb1299e70c24fd35dae7804323769bd470c06b8 Mon Sep 17 00:00:00 2001 From: AndreyChurbanov Date: Tue, 21 Jul 2020 16:31:17 +0300 Subject: [PATCH 070/363] [OpenMP] add missed REQUIRES:ompt for 2 OMPT tests (cherry picked from commit 617787ea77a22f752ba1fcd4ac7cb9a62a710756) --- openmp/runtime/test/ompt/tasks/task_early_fulfill.c | 1 + openmp/runtime/test/ompt/tasks/task_late_fulfill.c | 1 + 2 files changed, 2 insertions(+) diff --git a/openmp/runtime/test/ompt/tasks/task_early_fulfill.c b/openmp/runtime/test/ompt/tasks/task_early_fulfill.c index f1d07a1503c8..e1324e6af681 100644 --- a/openmp/runtime/test/ompt/tasks/task_early_fulfill.c +++ b/openmp/runtime/test/ompt/tasks/task_early_fulfill.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' \ // RUN: %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt // Checked gcc 10.1 still does not support detach clause on task construct. 
// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 diff --git a/openmp/runtime/test/ompt/tasks/task_late_fulfill.c b/openmp/runtime/test/ompt/tasks/task_late_fulfill.c index 4824f3358cfd..13a2a54a60df 100644 --- a/openmp/runtime/test/ompt/tasks/task_late_fulfill.c +++ b/openmp/runtime/test/ompt/tasks/task_late_fulfill.c @@ -1,5 +1,6 @@ // RUN: %libomp-compile -fopenmp-version=50 && env OMP_NUM_THREADS='3' \ // RUN: %libomp-run | %sort-threads | FileCheck %s +// REQUIRES: ompt // Checked gcc 10.1 still does not support detach clause on task construct. // UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7, gcc-8, gcc-9, gcc-10 From b88690b737518f8776c8f95063b89e8bbbd97428 Mon Sep 17 00:00:00 2001 From: Sebastian Neubauer Date: Tue, 21 Jul 2020 10:28:12 +0200 Subject: [PATCH 071/363] [AMDGPU] Don't combine memory intrs to v3i16 v3i16 and v3f16 currently cannot be legalized and lowered so they should not be emitted by inst combining. Moved the check down to still allow extracting 1 or 2 elements via the dmask. Fixes image intrinsics being combined to return v3x16. Differential Revision: https://reviews.llvm.org/D84223 (cherry picked from commit 2c659082bda6319732118e746fe025d8d5f9bfac) --- .../InstCombineSimplifyDemanded.cpp | 11 ++-- .../AMDGPU/amdgcn-demanded-vector-elts.ll | 58 +++++++++++++++++++ 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 7cfe4c8b5892..c7f2f4ec3ca1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1030,12 +1030,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, APInt DemandedElts, int DMaskIdx) { - // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. 
- if (DMaskIdx < 0 && - II->getType()->getScalarSizeInBits() != 32 && - DemandedElts.getActiveBits() == 3) - return nullptr; - auto *IIVTy = cast(II->getType()); unsigned VWidth = IIVTy->getNumElements(); if (VWidth == 1) @@ -1124,6 +1118,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, if (!NewNumElts) return UndefValue::get(II->getType()); + // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are + // fully supported. + if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3) + return nullptr; + if (NewNumElts >= VWidth && DemandedElts.isMask()) { if (DMaskIdx >= 0) II->setArgOperand(DMaskIdx, Args[DMaskIdx]); diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index c2f5e2857be0..3d58f0608081 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -2965,6 +2965,64 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl ; llvm.amdgcn.image.sample.cd.cl ; -------------------------------------------------------------------- +; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %elt0 = extractelement <4 x half> %data, i32 3 + ret half %elt0 +} + +; CHECK-LABEL: 
@extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %elt0 = extractelement <4 x half> %data, i32 2 + ret half %elt0 +} + +; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: ret half %data +define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %elt0 = extractelement <4 x half> %data, i32 1 + ret half %elt0 +} + +; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32). 
+; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> +; CHECK-NEXT: ret <4 x half> %res +define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> + ret <4 x half> %res +} + +; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) +; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> undef, <4 x i32> +; CHECK-NEXT: ret <4 x half> %res +define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> + ret <4 x half> %res +} + +; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32( +; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, 
i32 0) +; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0 +; CHECK-NEXT: ret <4 x half> %res +define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) + %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> + ret <4 x half> %res +} + ; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32( ; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) ; CHECK-NEXT: ret half %data From 3286126de175755a887e7fa335583811075be4f7 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 31 Jul 2020 17:22:49 +0200 Subject: [PATCH 072/363] Add flang to export.sh to it gets source tarballs in releases (cherry picked from commit 9853786ce39b9510eeb2688baaef7a364d58e113) --- llvm/utils/release/export.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh index 02a77afd0533..c3277de38b53 100755 --- a/llvm/utils/release/export.sh +++ b/llvm/utils/release/export.sh @@ -13,7 +13,7 @@ set -e -projects="llvm clang test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind" +projects="llvm clang test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind flang" release="" rc="" From 4fd4ec63813fd5b22d81adb6e201cb16ccf72b69 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 14 Jul 2020 13:02:58 -0700 Subject: [PATCH 073/363] [AArch64][SVE] Add support for trunc to . This isn't a natively supported operation, so convert it to a mask+compare. 
In addition to the operation itself, fix up some surrounding stuff to make the testcase work: we need concat_vectors on i1 vectors, we need legalization of i1 vector truncates, and we need to fix up all the relevant uses of getVectorNumElements(). Differential Revision: https://reviews.llvm.org/D83811 (cherry picked from commit b8f765a1e17f8d212ab1cd8f630d35adc7495556) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 ++- .../SelectionDAG/LegalizeVectorTypes.cpp | 11 +- .../Target/AArch64/AArch64ISelLowering.cpp | 15 ++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 22 ++++ llvm/test/CodeGen/AArch64/sve-trunc.ll | 120 ++++++++++++++++++ 5 files changed, 175 insertions(+), 12 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f14b3dba4f31..a026d3960026 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11372,9 +11372,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Stop if more than one members are non-undef. if (NumDefs > 1) break; + VTs.push_back(EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - X.getValueType().getVectorNumElements())); + X.getValueType().getVectorElementCount())); } if (NumDefs == 0) @@ -18795,6 +18796,11 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT OpVT = N->getOperand(0).getValueType(); + + // We currently can't generate an appropriate shuffle for a scalable vector. + if (VT.isScalableVector()) + return SDValue(); + int NumElts = VT.getVectorNumElements(); int NumOpElts = OpVT.getVectorNumElements(); @@ -19055,11 +19061,14 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return V; // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR - // nodes often generate nop CONCAT_VECTOR nodes. 
- // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that - // place the incoming vectors at the exact same location. + // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR + // operands and look for a CONCAT operations that place the incoming vectors + // at the exact same location. + // + // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled. SDValue SingleSource = SDValue(); - unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); + unsigned PartNumElem = + N->getOperand(0).getValueType().getVectorMinNumElements(); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index b2299931021c..1394f084c6dc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2151,7 +2151,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), - InVT.getVectorNumElements()); + InVT.getVectorElementCount()); if (N->isStrictFPOpcode()) { Lo = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other }, @@ -2559,13 +2559,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { SDValue InVec = N->getOperand(OpNo); EVT InVT = InVec->getValueType(0); EVT OutVT = N->getValueType(0); - unsigned NumElements = OutVT.getVectorNumElements(); + ElementCount NumElements = OutVT.getVectorElementCount(); bool IsFloat = OutVT.isFloatingPoint(); - // Widening should have already made sure this is a power-two vector - // if we're trying to split it at all. assert() that's true, just in case. 
- assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - unsigned InElementSize = InVT.getScalarSizeInBits(); unsigned OutElementSize = OutVT.getScalarSizeInBits(); @@ -2595,6 +2591,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) { GetSplitVector(InVec, InLoVec, InHiVec); // Truncate them to 1/2 the element size. + // + // This assumes the number of elements is a power of two; any vector that + // isn't should be widened, not split. EVT HalfElementVT = IsFloat ? EVT::getFloatingPointVT(InElementSize/2) : EVT::getIntegerVT(*DAG.getContext(), InElementSize/2); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 85db14ab66fe..d9951b7b8c5b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -932,8 +932,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); - if (VT.getScalarType() == MVT::i1) + if (VT.getScalarType() == MVT::i1) { setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + } } } @@ -8858,6 +8861,16 @@ SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); + if (VT.getScalarType() == MVT::i1) { + // Lower i1 truncate to `(x & 1) != 0`. 
+ SDLoc dl(Op); + EVT OpVT = Op.getOperand(0).getValueType(); + SDValue Zero = DAG.getConstant(0, dl, OpVT); + SDValue One = DAG.getConstant(1, dl, OpVT); + SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One); + return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE); + } + if (!VT.isVector() || VT.isScalableVector()) return Op; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 28a54e6f7d79..3449a8bd16d2 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1109,6 +1109,28 @@ multiclass sve_prefetch; defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; + // Extract lo/hi halves of legal predicate types. + def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), + (ZIP1_PPP_S PPR:$Ps, (PFALSE))>; + def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), + (ZIP2_PPP_S PPR:$Ps, (PFALSE))>; + def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (ZIP1_PPP_H PPR:$Ps, (PFALSE))>; + def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (ZIP2_PPP_H PPR:$Ps, (PFALSE))>; + def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (ZIP1_PPP_B PPR:$Ps, (PFALSE))>; + def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (ZIP2_PPP_B PPR:$Ps, (PFALSE))>; + + // Concatenate two predicates. 
+ def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), + (UZP1_PPP_S $p1, $p2)>; + def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)), + (UZP1_PPP_H $p1, $p2)>; + def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)), + (UZP1_PPP_B $p1, $p2)>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 876003a3962c..3743301cfa9b 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -59,3 +59,123 @@ entry: %out = trunc %in to ret %out } + +; Truncating to i1 requires convert it to a cmp + +define @trunc_i64toi1( %in) { +; CHECK-LABEL: trunc_i64toi1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} + +define @trunc_i64toi1_split( %in) { +; CHECK-LABEL: trunc_i64toi1_split: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} + +define @trunc_i64toi1_split2( %in) { +; CHECK-LABEL: trunc_i64toi1_split2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z3.d, z3.d, #0x1 +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: cmpne p2.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s +; CHECK-NEXT: uzp1 p0.h, p0.h, 
p1.h +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} + +define @trunc_i64toi1_split3( %in) { +; CHECK-LABEL: trunc_i64toi1_split3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z7.d, z7.d, #0x1 +; CHECK-NEXT: and z6.d, z6.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z3.d, z3.d, #0x1 +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z7.d, #0 +; CHECK-NEXT: cmpne p2.d, p0/z, z6.d, #0 +; CHECK-NEXT: cmpne p3.d, p0/z, z5.d, #0 +; CHECK-NEXT: cmpne p4.d, p0/z, z4.d, #0 +; CHECK-NEXT: and z1.d, z1.d, #0x1 +; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: cmpne p2.d, p0/z, z3.d, #0 +; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s +; CHECK-NEXT: cmpne p4.d, p0/z, z2.d, #0 +; CHECK-NEXT: uzp1 p2.s, p4.s, p2.s +; CHECK-NEXT: cmpne p4.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: uzp1 p0.s, p0.s, p4.s +; CHECK-NEXT: uzp1 p1.h, p3.h, p1.h +; CHECK-NEXT: uzp1 p0.h, p0.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} + + +define @trunc_i32toi1( %in) { +; CHECK-LABEL: trunc_i32toi1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} + +define @trunc_i16toi1( %in) { +; CHECK-LABEL: trunc_i16toi1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} + +define @trunc_i8toi1( %in) { +; CHECK-LABEL: trunc_i8toi1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ret +entry: + %out = trunc %in to + ret %out +} From 328269f3834d793bd4a7287d4344aa266d6641b9 Mon Sep 17 00:00:00 
2001 From: Sander de Smalen Date: Mon, 20 Jul 2020 14:43:50 +0100 Subject: [PATCH 074/363] [AArch64][SVE] Fix PCS for functions taking/returning scalable types. The default calling convention needs to save/restore the SVE callee saves according to the SVE PCS when the function takes or returns scalable types, even when the `aarch64_sve_vector_pcs` CC is not specified for the function. Reviewers: efriedma, paulwalker-arm, david-arm, rengolin Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D84041 (cherry picked from commit 9bacf1588583014538a0217add18f370acb95788) --- .../Target/AArch64/AArch64RegisterInfo.cpp | 10 ++++++++ .../CodeGen/AArch64/sve-calling-convention.ll | 23 +++++++++++++++++++ llvm/test/CodeGen/AArch64/sve-trunc.ll | 9 ++++++++ 3 files changed, 42 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 886158ca4490..de1ae4759210 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -40,6 +40,14 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) AArch64_MC::initLLVMToCVRegMapping(this); } +static bool hasSVEArgsOrReturn(const MachineFunction *MF) { + const Function &F = MF->getFunction(); + return isa(F.getReturnType()) || + any_of(F.args(), [](const Argument &Arg) { + return isa(Arg.getType()); + }); +} + const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); @@ -75,6 +83,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { // This is for OSes other than Windows; Windows is a separate case further // above. 
return CSR_AArch64_AAPCS_X18_SaveList; + if (hasSVEArgsOrReturn(MF)) + return CSR_AArch64_SVE_AAPCS_SaveList; return CSR_AArch64_AAPCS_SaveList; } diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll index 767a3cd8acfe..f95e749ad5ee 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -stop-after=finalize-isel < %s 2>%t | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -stop-after=prologepilog < %s 2>%t | FileCheck %s --check-prefix=CHECKCSR ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. @@ -123,3 +124,25 @@ define @sve_signature_pred_caller( %arg1, @sve_signature_pred( %arg2, %arg1) ret %res } + +; Test that functions returning or taking SVE arguments use the correct +; callee-saved set when using the default C calling convention (as opposed +; to aarch64_sve_vector_pcs) + +; CHECKCSR-LABEL: name: sve_signature_vec_ret_callee +; CHECKCSR: callee-saved-register: '$z8' +; CHECKCSR: callee-saved-register: '$p4' +; CHECKCSR: RET_ReallyLR +define @sve_signature_vec_ret_callee() nounwind { + call void asm sideeffect "nop", "~{z8},~{p4}"() + ret zeroinitializer +} + +; CHECKCSR-LABEL: name: sve_signature_vec_arg_callee +; CHECKCSR: callee-saved-register: '$z8' +; CHECKCSR: callee-saved-register: '$p4' +; CHECKCSR: RET_ReallyLR +define void @sve_signature_vec_arg_callee( %v) nounwind { + call void asm sideeffect "nop", "~{z8},~{p4}"() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 3743301cfa9b..46d152bbf7ac 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -113,6 +113,12 @@ entry: define @trunc_i64toi1_split3( %in) { ; CHECK-LABEL: 
trunc_i64toi1_split3: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset p4, -16 +; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z7.d, z7.d, #0x1 ; CHECK-NEXT: and z6.d, z6.d, #0x1 @@ -134,9 +140,12 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: cmpne p4.d, p0/z, z1.d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: uzp1 p0.s, p0.s, p4.s +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: uzp1 p1.h, p3.h, p1.h ; CHECK-NEXT: uzp1 p0.h, p0.h, p2.h ; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: %out = trunc %in to From fd2d5a0c4cdc9ccb0b88f264ae452e3a0e8dcc09 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 22 Jul 2020 10:04:36 +0100 Subject: [PATCH 075/363] [AArch64][SVE] Correctly allocate scavenging slot in presence of SVE. This patch addresses two issues: * Forces the availability of the base-pointer (x19) when the frame has both scalable vectors and variable-length arrays. Otherwise it will be expensive to access non-SVE locals. * In presence of SVE stack objects, it will allocate the emergency scavenging slot close to the SP, so that they can be accessed from the SP or BP if available. If accessed from the frame-pointer, it will otherwise need an extra register to access the scavenging slot because of mixed scalable/non-scalable addressing modes. 
Reviewers: efriedma, ostannard, cameron.mcinally, rengolin, david-arm Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D70174 (cherry picked from commit bef56f7fe2382ed1476aa67a55626b364635b44e) --- .../Target/AArch64/AArch64RegisterInfo.cpp | 18 +++++++++++- .../AArch64/framelayout-scavengingslot.mir | 27 ++++++++++++++++++ .../AArch64/framelayout-sve-basepointer.mir | 23 +++++++++++++++ .../framelayout-sve-scavengingslot.mir | 28 +++++++++++++++++++ llvm/test/CodeGen/AArch64/framelayout-sve.mir | 22 +++++++-------- 5 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/framelayout-scavengingslot.mir create mode 100644 llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir create mode 100644 llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index de1ae4759210..83a488afc797 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -353,6 +353,15 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { if (MFI.hasVarSizedObjects() || MF.hasEHFunclets()) { if (needsStackRealignment(MF)) return true; + + if (MF.getSubtarget().hasSVE()) { + const AArch64FunctionInfo *AFI = MF.getInfo(); + // Frames that have variable sized objects and scalable SVE objects, + // should always use a basepointer. + if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE()) + return true; + } + // Conservatively estimate whether the negative offset from the frame // pointer will be sufficient to reach. If a function has a smallish // frame, it's less likely to have lots of spills and callee saved @@ -389,8 +398,15 @@ AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { // (closer to SP). // // The beginning works most reliably if we have a frame pointer. 
+ // In the presence of any non-constant space between FP and locals, + // (e.g. in case of stack realignment or a scalable SVE area), it is + // better to use SP or BP. const AArch64FrameLowering &TFI = *getFrameLowering(MF); - return TFI.hasFP(MF); + const AArch64FunctionInfo *AFI = MF.getInfo(); + assert((!MF.getSubtarget().hasSVE() || + AFI->hasCalculatedStackSizeSVE()) && + "Expected SVE area to be calculated by this point"); + return TFI.hasFP(MF) && !needsStackRealignment(MF) && !AFI->getStackSizeSVE(); } bool AArch64RegisterInfo::requiresFrameIndexScavenging( diff --git a/llvm/test/CodeGen/AArch64/framelayout-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-scavengingslot.mir new file mode 100644 index 000000000000..d1252435f874 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-scavengingslot.mir @@ -0,0 +1,27 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +--- +# This test verifies that the emergency scavenging slot is located near +# the SP when the stack is realigned. +name: LateScavengingSlotRealignment +# CHECK-LABEL: name: LateScavengingSlotRealignment +# CHECK: bb.0: +# CHECK: STRXui killed $[[SCRATCH:x[0-9]+]], $sp, 3 +# CHECK-NEXT: $[[SCRATCH]] = ADDXri $sp, 40, 0 +# CHECK-NEXT: STRXui $x0, killed $[[SCRATCH]], 4095 +# CHECK-NEXT: $[[SCRATCH]] = LDRXui $sp, 3 +# CHECK: bb.1: +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +stack: + - { id: 0, size: 16, alignment: 16 } + - { id: 1, size: 32768, alignment: 32 } +body: | + bb.0: + liveins: $x0, $x8 + STRXui $x0, %stack.0, 0 + B %bb.1 + bb.1: + liveins: $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $lr + RET_ReallyLR implicit $x19, implicit $x20, implicit $x21, implicit $x22, implicit $x23, implicit $x24, implicit $x25, implicit $x26, implicit $x27, implicit $x28, implicit $lr +... 
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir new file mode 100644 index 000000000000..a366744d8fa2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir @@ -0,0 +1,23 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog -mattr=+sve %s -o - | FileCheck %s +--- +# This test verifies that the basepointer is available in presence of SVE stack objects. +name: hasBasepointer +# CHECK-LABEL: name: hasBasepointer +# CHECK: bb.0: +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 +# CHECK: STRXui $x0, $x19, 0 +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +stack: + - { id: 0, type: variable-sized, alignment: 1 } + - { id: 1, name: '', size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 16, alignment: 16 } +body: | + bb.0: + liveins: $x0 + STRXui $x0, %stack.1, 0 + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir new file mode 100644 index 000000000000..2ee6007db289 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir @@ -0,0 +1,28 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog -mattr=+sve %s -o - | FileCheck %s +--- +# This test verifies that the emergency scavenging slot is located near the SP/BP. 
+name: LateScavengingSlot +# CHECK-LABEL: name: LateScavengingSlot +# CHECK: bb.0: +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 8, 12 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK: STRXui killed $[[SCRATCH:x[0-9]+]], $sp, 1 +# CHECK-NEXT: $[[SCRATCH]] = ADDVL_XXI $fp, -1 +# CHECK-NEXT: STRXui $x0, killed $[[SCRATCH]], 0 +# CHECK: bb.1: +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +stack: + - { id: 0, name: '', size: 32761, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 16 } +body: | + bb.0: + liveins: $x0, $x8 + STRXui $x0, %stack.1, 0 + B %bb.1 + bb.1: + liveins: $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $lr + RET_ReallyLR implicit $x19, implicit $x20, implicit $x21, implicit $x22, implicit $x23, implicit $x24, implicit $x25, implicit $x26, implicit $x27, implicit $x28, implicit $lr +... 
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 046357b860b3..7903df64863b 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mattr=+sve -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s # # Test allocation and deallocation of SVE objects on the stack, # as well as using a combination of scalable and non-scalable @@ -30,7 +30,7 @@ define void @test_address_sve_fp() nounwind { entry: unreachable } define void @test_stack_arg_sve() nounwind { entry: unreachable } define void @test_address_sve_out_of_range() nounwind { entry: unreachable } - define void @test_address_gpr_vla_nobp() nounwind { entry: unreachable } + define void @test_address_gpr_vla() nounwind { entry: unreachable } define aarch64_sve_vector_pcs void @save_restore_pregs_sve() nounwind { entry: unreachable } define aarch64_sve_vector_pcs void @save_restore_zregs_sve() nounwind { entry: unreachable } define aarch64_sve_vector_pcs void @save_restore_sve() nounwind { entry: unreachable } @@ -335,23 +335,23 @@ body: | RET_ReallyLR --- ... -# Test that non-SVE objects are accessed from FP when there is no BP, -# but the SP cannot be used because of variable-length arrays. +# Test that non-SVE objects are accessed from BP when there are +# variable length arrays, because it will be more expensive to +# access from the FP when there are also SVE objects on the stack. 
# # +----------+ <- FP # | %fstack.0| // 16 scalable bytes # +----------+ <- @FP - 16 scalable bytes # | %stack.0 | // 16 bytes -# +----------+ <- @FP - 16 scalable bytes - 16b +# +----------+ <- @BP # : %stack.1 : // variable length # +----------+ <- SP -# CHECK-LABEL: name: test_address_gpr_vla_nobp -# CHECK: bb.0.entry: -# CHECK: $[[TMP:x[0-9]+]] = ADDVL_XXI $fp, -1 -# CHECK-NEXT: STURXi $xzr, killed $[[TMP]], -16 -# CHECK: RET_ReallyLR -name: test_address_gpr_vla_nobp +# CHECK-LABEL: name: test_address_gpr_vla +# CHECK: bb.0.entry: +# CHECK: STRXui $xzr, $x19, 0 +# CHECK: RET_ReallyLR +name: test_address_gpr_vla frameInfo: maxAlignment: 16 fixedStack: From 48eb1aa387eb1d356632b82efaf6438d1fcb6640 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Wed, 22 Jul 2020 17:06:47 -0700 Subject: [PATCH 076/363] [AArch64][SVE] Teach copyPhysReg to copy ZPR2/3/4. It's sort of tricky to hit this in practice, but not impossible. I have a synthetic C testcase if anyone is interested. The implementation is identical to the equivalent NEON register copies. Differential Revision: https://reviews.llvm.org/D84373 (cherry picked from commit 993c1a3219a8ae69f1d700183bf174d75f3815d4) --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 29 +++++++ .../test/CodeGen/AArch64/sve-copy-zprpair.mir | 78 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5139ae5ccaf1..08f80c9aa361 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2744,6 +2744,35 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Copy a Z register pair by copying the individual sub-registers. 
+ if (AArch64::ZPR2RegClass.contains(DestReg) && + AArch64::ZPR2RegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, + Indices); + return; + } + + // Copy a Z register triple by copying the individual sub-registers. + if (AArch64::ZPR3RegClass.contains(DestReg) && + AArch64::ZPR3RegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, + AArch64::zsub2}; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, + Indices); + return; + } + + // Copy a Z register quad by copying the individual sub-registers. + if (AArch64::ZPR4RegClass.contains(DestReg) && + AArch64::ZPR4RegClass.contains(SrcReg)) { + static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, + AArch64::zsub2, AArch64::zsub3}; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, + Indices); + return; + } + if (AArch64::GPR64spRegClass.contains(DestReg) && (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { diff --git a/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir b/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir new file mode 100644 index 000000000000..83a0b5dd1c14 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir @@ -0,0 +1,78 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=postrapseudos -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: copy_zpr2 +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$z0_z1' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0: + liveins: $z0_z1 + ; CHECK-LABEL: name: copy_zpr2 + ; CHECK: liveins: $z0_z1 + ; CHECK: $z2 = ORR_ZZZ $z1, $z1 + ; CHECK: $z1 = ORR_ZZZ $z0, $z0 + ; CHECK: $z0 = ORR_ZZZ $z1, $z1 + ; CHECK: $z1 = ORR_ZZZ $z2, $z2 + ; CHECK: 
RET_ReallyLR + $z1_z2 = COPY $z0_z1 + $z0_z1 = COPY $z1_z2 + RET_ReallyLR + +... +--- +name: copy_zpr3 +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$z0_z1_z2' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0: + liveins: $z0_z1_z2 + ; CHECK-LABEL: name: copy_zpr3 + ; CHECK: liveins: $z0_z1_z2 + ; CHECK: $z3 = ORR_ZZZ $z2, $z2 + ; CHECK: $z2 = ORR_ZZZ $z1, $z1 + ; CHECK: $z1 = ORR_ZZZ $z0, $z0 + ; CHECK: $z0 = ORR_ZZZ $z1, $z1 + ; CHECK: $z1 = ORR_ZZZ $z2, $z2 + ; CHECK: $z2 = ORR_ZZZ $z3, $z3 + ; CHECK: RET_ReallyLR + $z1_z2_z3 = COPY $z0_z1_z2 + $z0_z1_z2 = COPY $z1_z2_z3 + RET_ReallyLR + +... +--- +name: copy_zpr4 +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$z0_z1_z2_z3' } +frameInfo: + maxCallFrameSize: 0 +body: | + bb.0: + liveins: $z0_z1_z2_z3 + ; CHECK-LABEL: name: copy_zpr4 + ; CHECK: liveins: $z0_z1_z2_z3 + ; CHECK: $z4 = ORR_ZZZ $z3, $z3 + ; CHECK: $z3 = ORR_ZZZ $z2, $z2 + ; CHECK: $z2 = ORR_ZZZ $z1, $z1 + ; CHECK: $z1 = ORR_ZZZ $z0, $z0 + ; CHECK: $z0 = ORR_ZZZ $z1, $z1 + ; CHECK: $z1 = ORR_ZZZ $z2, $z2 + ; CHECK: $z2 = ORR_ZZZ $z3, $z3 + ; CHECK: $z3 = ORR_ZZZ $z4, $z4 + ; CHECK: RET_ReallyLR + $z1_z2_z3_z4 = COPY $z0_z1_z2_z3 + $z0_z1_z2_z3 = COPY $z1_z2_z3_z4 + RET_ReallyLR + +... From 967b84c7a7e2a39d01ab4266bf5eac8c2de98ce5 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 27 Jul 2020 12:57:41 +0100 Subject: [PATCH 077/363] [AArch64][SVE] Don't support fixedStack for SVE objects. Fixed stack objects are preallocated and defined to be allocated before any of the regular stack objects. These are normally used to model stack arguments. The AAPCS does not support passing SVE registers on the stack by value (only by reference). The current layout also doesn't place them before all stack objects, but rather before all SVE objects. Removing this simplifies the code that emits the allocation/deallocation around callee-saved registers (D84042). 
This patch also removes all uses of fixedStack from from framelayout-sve.mir, where this was used purely for testing purposes. Reviewers: paulwalker-arm, efriedma, rengolin Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D84538 (cherry picked from commit 54492a5843a34684ce21ae201dd8ca3e509288fd) --- .../Target/AArch64/AArch64FrameLowering.cpp | 13 +- llvm/test/CodeGen/AArch64/framelayout-sve.mir | 125 +++++++++--------- 2 files changed, 66 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index efa3fd5ca9ce..cc563dd70632 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2595,20 +2595,21 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex, bool AssignOffsets) { +#ifndef NDEBUG // First process all fixed stack objects. - int64_t Offset = 0; for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) - if (MFI.getStackID(I) == TargetStackID::SVEVector) { - int64_t FixedOffset = -MFI.getObjectOffset(I); - if (FixedOffset > Offset) - Offset = FixedOffset; - } + assert(MFI.getStackID(I) != TargetStackID::SVEVector && + "SVE vectors should never be passed on the stack by value, only by " + "reference."); +#endif auto Assign = [&MFI](int FI, int64_t Offset) { LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); MFI.setObjectOffset(FI, Offset); }; + int64_t Offset = 0; + // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { // Make sure to align the last callee save slot. 
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 7903df64863b..575c839fbd15 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -41,10 +41,10 @@ # +----------+ # |scratchreg| // x29 is used as scratch reg. # +----------+ -# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, -# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) # +----------+ -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve @@ -60,10 +60,9 @@ # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR name: test_allocate_sve -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: RET_ReallyLR @@ -73,10 +72,9 @@ body: | # | x20, x21 | // callee saves # |scratchreg| // x29 is used as scratch reg. 
# +----------+ -# | %fixed- | // scalable objects -# | stack.0 | +# | %stack.0 | // scalable objects # +----------+ -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves @@ -95,10 +93,9 @@ body: | # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 32 # CHECK-NEXT: RET_ReallyLR name: test_allocate_sve_gpr_callee_saves -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: $x20 = IMPLICIT_DEF @@ -109,11 +106,10 @@ body: | # +----------+ # | lr, fp | // frame record # +----------+ <- FP -# | %fixed- | // scalable objects -# | stack.0 | +# | %stack.0 | // scalable objects # +----------+ # |//////////| // alignment gap -# | %stack.0 | // not scalable +# | %stack.1 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_allocate_sve_gpr_realigned # CHECK: stackSize: 32 @@ -128,10 +124,9 @@ body: | # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR name: test_allocate_sve_gpr_realigned -fixedStack: - - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 32 } + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } + - { id: 1, stack-id: default, size: 16, alignment: 32 } body: | bb.0.entry: RET_ReallyLR @@ -144,7 +139,7 @@ body: | # | %stack.1 | // scalable @ SP + 16b + 16 scalable bytes # | %stack.2 | // scalable @ SP + 16b + 14 scalable bytes # +----------+ -# | %stack.0 | // not scalable +# | %stack.3 | // not scalable # +----------+ <- SP # CHECK-LABEL: name: test_address_sve @@ -169,19 +164,18 @@ body: | name: test_address_sve frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: 
-16 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 8, offset: -32 } - - { id: 2, stack-id: sve-vec, size: 2, alignment: 2, offset: -34 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: liveins: $z0, $z1, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_ZXI $z1, %fixed-stack.1, 0 - STR_PXI $p0, %fixed-stack.2, 0 + STR_ZXI $z0, %stack.0, 0 + STR_ZXI $z1, %stack.1, 0 + STR_PXI $p0, %stack.2, 0 RET_ReallyLR --- @@ -190,11 +184,11 @@ body: | # | x20, x21 | // callee saves # | lr, fp | // frame record # +-----------+ <- FP -# | %fstack.0 | // scalable @ FP - 16 scalable bytes -# | %fstack.1 | // scalable @ FP - 32 scalable bytes -# | %fstack.2 | // scalable @ FP - 34 scalable bytes +# | %stack.0 | // scalable @ FP - 16 scalable bytes +# | %stack.1 | // scalable @ FP - 32 scalable bytes +# | %stack.2 | // scalable @ FP - 34 scalable bytes # +-----------+ -# | %stack.0 | // not scalable +# | %stack.3 | // not scalable # +-----------+ <- SP # CHECK-LABEL: name: test_address_sve_fp @@ -218,19 +212,18 @@ name: test_address_sve_fp frameInfo: maxAlignment: 16 isFrameAddressTaken: true -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 8, offset: -32 } - - { id: 2, stack-id: sve-vec, size: 2, alignment: 2, offset: -34 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 2, stack-id: sve-vec, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } body: | bb.0.entry: liveins: $z0, $z1, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_ZXI $z1, %fixed-stack.1, 0 - STR_PXI $p0, 
%fixed-stack.2, 0 + STR_ZXI $z0, %stack.0, 0 + STR_ZXI $z1, %stack.1, 0 + STR_PXI $p0, %stack.2, 0 RET_ReallyLR --- @@ -240,9 +233,9 @@ body: | # +-----------+ # |callee save| // register saved as scratch reg. # +-----------+ -# | %fstack.1 | // vector of 16 scalable bytes +# | %stack.0 | // vector of 16 scalable bytes # +---------- + -# | %stack.0 | // not scalable, 16 bytes +# | %stack.1 | // not scalable, 16 bytes # +-----------+ <- SP # CHECK-LABEL: name: test_stack_arg_sve # CHECK: stackSize: 32 @@ -262,9 +255,9 @@ body: | name: test_stack_arg_sve fixedStack: - { id: 0, stack-id: default, size: 16, alignment: 16, offset: 0 } - - { id: 1, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 16 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 16, alignment: 16 } body: | bb.0.entry: liveins: $x0 @@ -320,17 +313,17 @@ body: | name: test_address_sve_out_of_range frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } - - { id: 1, stack-id: sve-vec, size: 3584, alignment: 16, offset: -3600 } - - { id: 2, stack-id: sve-vec, size: 512, alignment: 16, offset: -4112 } +stack: + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: sve-vec, size: 3584, alignment: 16 } + - { id: 2, stack-id: sve-vec, size: 512, alignment: 16 } body: | bb.0.entry: liveins: $z0, $p0 - STR_ZXI $z0, %fixed-stack.0, 0 - STR_PXI $p0, %fixed-stack.1, 0 + STR_ZXI $z0, %stack.0, 0 + STR_PXI $p0, %stack.1, 0 RET_ReallyLR --- @@ -340,11 +333,11 @@ body: | # access from the FP when there are also SVE objects on the stack. 
# # +----------+ <- FP -# | %fstack.0| // 16 scalable bytes +# | %stack.0 | // 16 scalable bytes # +----------+ <- @FP - 16 scalable bytes -# | %stack.0 | // 16 bytes +# | %stack.1 | // 16 bytes # +----------+ <- @BP -# : %stack.1 : // variable length +# : %stack.2 : // variable length # +----------+ <- SP # CHECK-LABEL: name: test_address_gpr_vla @@ -354,16 +347,15 @@ body: | name: test_address_gpr_vla frameInfo: maxAlignment: 16 -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 8, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 8 } - - { id: 1, stack-id: default, type: variable-sized } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 8 } + - { id: 1, stack-id: default, size: 16, alignment: 8 } + - { id: 2, stack-id: default, type: variable-sized } body: | bb.0.entry: liveins: $xzr - STRXui $xzr, %stack.0, 0 + STRXui $xzr, %stack.1, 0 RET_ReallyLR --- @@ -429,7 +421,7 @@ body: | # CHECK-LABEL: name: save_restore_sve # CHECK: $sp = frame-setup STPXpre killed ${{[a-z0-9]+}}, killed $x21, $sp, -4 # CHECK: frame-setup STPXi killed $x20, killed $x19, $sp, 2 -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -19 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -18 # CHECK: frame-setup STR_PXI killed $p15, $sp, 4 # CHECK: frame-setup STR_PXI killed $p14, $sp, 5 # CHECK: frame-setup STR_PXI killed $p5, $sp, 14 @@ -438,9 +430,11 @@ body: | # CHECK: frame-setup STR_ZXI killed $z22, $sp, 3 # CHECK: frame-setup STR_ZXI killed $z9, $sp, 16 # CHECK: frame-setup STR_ZXI killed $z8, $sp, 17 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 +# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 @@ -449,15 +443,14 @@ body: | # CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = 
frame-destroy LDR_ZXI $sp, 17 -# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 19 +# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18 # CHECK: $x20, $x19 = frame-destroy LDPXi $sp, 2 # CHECK: $sp, ${{[a-z0-9]+}}, $x21 = frame-destroy LDPXpost $sp, 4 # CHECK: RET_ReallyLR name: save_restore_sve -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 32, alignment: 16 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 32, alignment: 16 } body: | bb.0.entry: @@ -494,7 +487,7 @@ body: | # CHECK-LABEL: name: save_restore_sve_realign # CHECK: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 # CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -19 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -18 # CHECK-NEXT: STR_PXI killed $p15, $sp, 4 # CHECK-NEXT: STR_PXI killed $p14, $sp, 5 # CHECK: STR_PXI killed $p5, $sp, 14 @@ -503,6 +496,7 @@ body: | # CHECK-NEXT: STR_ZXI killed $z22, $sp, 3 # CHECK: STR_ZXI killed $z9, $sp, 16 # CHECK-NEXT: STR_ZXI killed $z8, $sp, 17 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] @@ -519,10 +513,9 @@ body: | # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR name: save_restore_sve_realign -fixedStack: - - { id: 0, stack-id: sve-vec, size: 16, alignment: 16, offset: -16 } stack: - - { id: 0, stack-id: default, size: 16, alignment: 32 } + - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } + - { id: 1, stack-id: default, size: 16, alignment: 32 } body: | bb.0.entry: From 5596693504af263d53d6676ec6f0f4669ac706b0 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 27 Jul 2020 14:16:55 +0100 Subject: [PATCH 078/363] [AArch64][SVE] Don't align the last SVE callee save. 
Instead of aligning the last callee-saved-register slot to the stack alignment (16 bytes), just align the SVE callee-saved block. This also simplifies the code that allocates space for the callee-saves. This change is needed to make sure the offset to which the callee-saved register is spilled, corresponds to the offset used for e.g. unwind call frame instructions. Reviewers: efriedma, paulwalker-arm, david-arm, rengolin Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D84042 (cherry picked from commit 26b4ef3694973ea2fa656d3d3a7f67f16f135654) --- .../Target/AArch64/AArch64FrameLowering.cpp | 25 +++++++------------ llvm/test/CodeGen/AArch64/framelayout-sve.mir | 2 +- llvm/test/CodeGen/AArch64/sve-trunc.ll | 2 +- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index cc563dd70632..1b49c692f293 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1192,7 +1192,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Process the SVE callee-saves to determine what space needs to be // allocated. - if (AFI->getSVECalleeSavedStackSize()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { // Find callee save instructions in frame. 
CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1200,11 +1200,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; CalleeSavesEnd = MBBI; - int64_t OffsetToFirstCalleeSaveFromSP = - MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); - StackOffset OffsetToCalleeSavesFromSP = - StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; - AllocateBefore -= OffsetToCalleeSavesFromSP; + AllocateBefore = {CalleeSavedSize, MVT::nxv1i8}; AllocateAfter = SVEStackSize - AllocateBefore; } @@ -1582,7 +1578,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // deallocated. StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; - if (AFI->getSVECalleeSavedStackSize()) { + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { RestoreBegin = std::prev(RestoreEnd);; while (IsSVECalleeSave(RestoreBegin) && RestoreBegin != MBB.begin()) @@ -1592,12 +1588,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, assert(IsSVECalleeSave(RestoreBegin) && IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); - int64_t OffsetToFirstCalleeSaveFromSP = - MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); - StackOffset OffsetToCalleeSavesFromSP = - StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; - DeallocateBefore = OffsetToCalleeSavesFromSP; - DeallocateAfter = SVEStackSize - DeallocateBefore; + StackOffset CalleeSavedSizeAsOffset = {CalleeSavedSize, MVT::nxv1i8}; + DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset; + DeallocateAfter = CalleeSavedSizeAsOffset; } // Deallocate the SVE area. @@ -2612,9 +2605,6 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Then process all callee saved slots. if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { - // Make sure to align the last callee save slot. 
- MFI.setObjectAlignment(MaxCSFrameIndex, Align(16)); - // Assign offsets to the callee save slots. for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { Offset += MFI.getObjectSize(I); @@ -2624,6 +2614,9 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, } } + // Ensure that the Callee-save area is aligned to 16bytes. + Offset = alignTo(Offset, Align(16U)); + // Create a buffer of SVE objects to allocate and sort it. SmallVector ObjectsToAllocate; for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 575c839fbd15..75a65a6ad522 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -573,7 +573,7 @@ body: | # CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$z23', # CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2, # CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$p4', -# CHECK: - { id: 9, name: '', type: spill-slot, offset: -48, size: 2, alignment: 16, +# CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2, # CHECK-NEXT: stack-id: sve-vec, callee-saved-register: '$p15', # CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16, # CHECK-NEXT: stack-id: default, callee-saved-register: '$fp', diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 46d152bbf7ac..af50176f6b10 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -117,7 +117,7 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset p4, -16 +; CHECK-NEXT: .cfi_offset p4, -2 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z7.d, z7.d, #0x1 From 
13fb3d3007f0c2ddfc14a1819def5bec1533a0b0 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 28 Jul 2020 12:11:09 +0100 Subject: [PATCH 079/363] [AArch64][SVE] Fix epilogue for SVE when the stack is realigned. While deallocating the stackframe, the offset used to reload the callee-saved registers was not pointing to the SVE callee-saves, but rather to the whole SVE area. +--------------+ | GRP callee | | saves | +--------------+ <- FP | SVE callee | | saves | +--------------+ <- Should restore SVE callee saves from here | SVE Spills | | and Locals | +--------------+ <- instead of from here. | | : : | | +--------------+ <- SP Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D84539 (cherry picked from commit cda2eb3ad2bbe923e74d6eb083af196a0622d800) --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 9 +++++---- llvm/test/CodeGen/AArch64/framelayout-sve.mir | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 1b49c692f293..4789a9f02937 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1596,12 +1596,13 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Deallocate the SVE area. if (SVEStackSize) { if (AFI->isStackRealigned()) { - if (AFI->getSVECalleeSavedStackSize()) - // Set SP to start of SVE area, from which the callee-save reloads - // can be done. The code below will deallocate the stack space + if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) + // Set SP to start of SVE callee-save area from which they can + // be reloaded. The code below will deallocate the stack space // space by moving FP -> SP. 
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, - -SVEStackSize, TII, MachineInstr::FrameDestroy); + {-CalleeSavedSize, MVT::nxv1i8}, TII, + MachineInstr::FrameDestroy); } else { if (AFI->getSVECalleeSavedStackSize()) { // Deallocate the non-SVE locals first before we can deallocate (and diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 75a65a6ad522..668b243dd79e 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -500,7 +500,7 @@ body: | # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] -# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -19 +# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 From a3532c58be5c3a4107549c2462613be76507fe55 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 14 Jul 2020 16:20:00 +0100 Subject: [PATCH 080/363] [SVE] Don't use LocalStackAllocation for SVE objects I have introduced a new TargetFrameLowering query function: isStackIdSafeForLocalArea that queries whether or not it is safe for objects of a given stack id to be bundled into the local area. The default behaviour is to always bundle regardless of the stack id, however for AArch64 this is overriden so that it's only safe for fixed-size stack objects. There is future work here to extend this algorithm for multiple local areas so that SVE stack objects can be bundled together and accessed from their own virtual base-pointer. 
Differential Revision: https://reviews.llvm.org/D83859 (cherry picked from commit 14bc85e0ebb6c00c1672158ab6a692bfbb11e1cc) --- .../llvm/CodeGen/TargetFrameLowering.h | 6 ++ llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 4 ++ .../lib/Target/AArch64/AArch64FrameLowering.h | 6 ++ .../CodeGen/AArch64/sve-localstackalloc.mir | 61 +++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sve-localstackalloc.mir diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index c3a11b199675..d6580430daf7 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -134,6 +134,12 @@ class TargetFrameLowering { /// was called). virtual unsigned getStackAlignmentSkew(const MachineFunction &MF) const; + /// This method returns whether or not it is safe for an object with the + /// given stack id to be bundled into the local area. + virtual bool isStackIdSafeForLocalArea(unsigned StackId) const { + return true; + } + /// getOffsetOfLocalArea - This method returns the offset of the local area /// from the stack pointer on entrance to a function. 
/// diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 6c5ef0255a08..204fb556d810 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -220,6 +220,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (StackProtectorFI == (int)i) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; switch (MFI.getObjectSSPLayout(i)) { case MachineFrameInfo::SSPLK_None: @@ -254,6 +256,8 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { continue; if (ProtectedObjs.count(i)) continue; + if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i))) + continue; AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 9d0a6d9eaf25..444740cb50ab 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -105,6 +105,12 @@ class AArch64FrameLowering : public TargetFrameLowering { } } + bool isStackIdSafeForLocalArea(unsigned StackId) const override { + // We don't support putting SVE objects into the pre-allocated local + // frame block at the moment. 
+ return StackId != TargetStackID::SVEVector; + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; diff --git a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir new file mode 100644 index 000000000000..c20846c54b6a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir @@ -0,0 +1,61 @@ +# RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -run-pass=localstackalloc -o - %s | FileCheck %s + +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-unknown-linux-gnu" + + define @insert_32i8_idx( %a, i8 %elt, i64 %idx) #0 { + %ins = insertelement %a, i8 %elt, i64 %idx + ret %ins + } + + attributes #0 = { "target-features"="+sve" } + +... +--- +name: insert_32i8_idx +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: zpr, preferred-register: '' } + - { id: 1, class: zpr, preferred-register: '' } + - { id: 2, class: gpr32, preferred-register: '' } + - { id: 3, class: gpr64, preferred-register: '' } + - { id: 5, class: ppr_3b, preferred-register: '' } + - { id: 6, class: gpr64sp, preferred-register: '' } + - { id: 7, class: zpr, preferred-register: '' } + - { id: 8, class: zpr, preferred-register: '' } +liveins: + - { reg: '$z0', virtual-reg: '%0' } + - { reg: '$z1', virtual-reg: '%1' } + - { reg: '$w0', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +# CHECK-LABEL: name: insert_32i8_idx +# CHECK: localFrameSize: 0 +stack: + - { id: 0, name: '', type: default, offset: 0, size: 32, alignment: 16, + stack-id: sve-vec, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: {} +body: | + bb.0 (%ir-block.0): + liveins: $z0, $z1, $w0 + + %2:gpr32 = COPY $w0 + %1:zpr = COPY $z1 + %0:zpr = COPY $z0 + %5:ppr_3b = PTRUE_B 31 + 
%6:gpr64sp = ADDXri %stack.0, 0, 0 + ST1B_IMM %1, %5, %6, 1 :: (store unknown-size, align 16) + ST1B_IMM %0, %5, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) + %7:zpr = LD1B_IMM %5, %6, 1 :: (load unknown-size from %stack.0 + 16, align 16) + %8:zpr = LD1B_IMM %5, %stack.0, 0 :: (load unknown-size from %stack.0, align 16) + $z0 = COPY %8 + $z1 = COPY %7 + RET_ReallyLR implicit $z0, implicit $z1 + +... From 75e5d4f42ad9556f5d53bb2984a23082d6c6a830 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 3 Jul 2020 13:41:34 +0100 Subject: [PATCH 081/363] [CodeGen] Remove calls to getVectorNumElements in DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR In DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR I have replaced calls to getVectorNumElements with getVectorMinNumElements, since this code path works for both fixed and scalable vector types. For scalable vectors the index will be multiplied by VSCALE. Fixes warnings in this test: sve-sext-zext.ll Differential revision: https://reviews.llvm.org/D83198 (cherry picked from commit 5d84eafc6b86a42e261af8d753c3a823e0e7c67e) --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1394f084c6dc..6963de2e5029 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2197,13 +2197,19 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) { SDValue Idx = N->getOperand(1); SDLoc dl(N); SDValue Lo, Hi; + + if (SubVT.isScalableVector() != + N->getOperand(0).getValueType().isScalableVector()) + report_fatal_error("Extracting a fixed-length vector from an illegal " + "scalable vector is not yet supported"); + GetSplitVector(N->getOperand(0), Lo, Hi); - uint64_t LoElts = Lo.getValueType().getVectorNumElements(); + uint64_t LoElts = 
Lo.getValueType().getVectorMinNumElements(); uint64_t IdxVal = cast(Idx)->getZExtValue(); if (IdxVal < LoElts) { - assert(IdxVal + SubVT.getVectorNumElements() <= LoElts && + assert(IdxVal + SubVT.getVectorMinNumElements() <= LoElts && "Extracted subvector crosses vector split!"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx); } else { From 07a7044b805a422469041928c7c6ee55bcdda2a4 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 3 Jul 2020 14:20:59 +0100 Subject: [PATCH 082/363] [SVE] Add checks for no warnings in CodeGen/AArch64/sve-sext-zext.ll Previous patches fixed up all the warnings in this test: llvm/test/CodeGen/AArch64/sve-sext-zext.ll and this change simply checks that no new warnings are added in future. Differential revision: https://reviews.llvm.org/D83205 (cherry picked from commit f43b5c7a76ab83dcc80e6769d41d5c4b761312b1) --- llvm/test/CodeGen/AArch64/sve-sext-zext.ll | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll index 24cf433306bb..1275811d175e 100644 --- a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning define @sext_i1_i8( %a) { ; CHECK-LABEL: sext_i1_i8: From bec306442de89c71c2268e7e2629b4d454895a56 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 16 Jul 2020 11:29:25 +0100 Subject: [PATCH 083/363] [SVE][CodeGen] Add simple integer add tests for SVE tuple types I have added tests to: CodeGen/AArch64/sve-intrinsics-int-arith.ll for doing simple integer add operations on tuple types. 
Since these tests introduced new warnings due to incorrect use of getVectorNumElements() I have also fixed up these warnings in the same patch. These fixes are: 1. In narrowExtractedVectorBinOp I have changed the code to bail out early for scalable vector types, since we've not yet hit a case that proves the optimisations are profitable for scalable vectors. 2. In DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS I have replaced calls to getVectorNumElements with getVectorMinNumElements in cases that work with scalable vectors. For the other cases I have added asserts that the vector is not scalable because we should not be using shuffle vectors and build vectors in such cases. Differential revision: https://reviews.llvm.org/D84016 (cherry picked from commit 207877175944656bd9b52d36f391a092854572be) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++- .../SelectionDAG/LegalizeVectorTypes.cpp | 19 +++++++--- .../AArch64/sve-intrinsics-int-arith.ll | 37 +++++++++++++++++++ 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a026d3960026..ec384d2a7c56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19190,7 +19190,10 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { // The binop must be a vector type, so we can extract some fraction of it. EVT WideBVT = BinOp.getValueType(); - if (!WideBVT.isVector()) + // The optimisations below currently assume we are dealing with fixed length + // vectors. It is possible to add support for scalable vectors, but at the + // moment we've done no analysis to prove whether they are profitable or not. 
+ if (!WideBVT.isFixedLengthVector()) return SDValue(); EVT VT = Extract->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 6963de2e5029..c81d03cac81b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3610,16 +3610,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { EVT InVT = N->getOperand(0).getValueType(); EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); - unsigned WidenNumElts = WidenVT.getVectorNumElements(); - unsigned NumInElts = InVT.getVectorNumElements(); unsigned NumOperands = N->getNumOperands(); bool InputWidened = false; // Indicates we need to widen the input. if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) { - if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) { + unsigned WidenNumElts = WidenVT.getVectorMinNumElements(); + unsigned NumInElts = InVT.getVectorMinNumElements(); + if (WidenNumElts % NumInElts == 0) { // Add undef vectors to widen to correct length. - unsigned NumConcat = WidenVT.getVectorNumElements() / - InVT.getVectorNumElements(); + unsigned NumConcat = WidenNumElts / NumInElts; SDValue UndefVal = DAG.getUNDEF(InVT); SmallVector Ops(NumConcat); for (unsigned i=0; i < NumOperands; ++i) @@ -3643,6 +3642,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { return GetWidenedVector(N->getOperand(0)); if (NumOperands == 2) { + assert(!WidenVT.isScalableVector() && + "Cannot use vector shuffles to widen CONCAT_VECTOR result"); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumInElts = InVT.getVectorNumElements(); + // Replace concat of two operands with a shuffle. 
SmallVector MaskOps(WidenNumElts, -1); for (unsigned i = 0; i < NumInElts; ++i) { @@ -3657,6 +3661,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { } } + assert(!WidenVT.isScalableVector() && + "Cannot use build vectors to widen CONCAT_VECTOR result"); + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumInElts = InVT.getVectorNumElements(); + // Fall back to use extracts and build vector. EVT EltVT = WidenVT.getVectorElementType(); SmallVector Ops(WidenNumElts); diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll index 8a5d669e4241..fa67d92c2ae0 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll @@ -325,6 +325,39 @@ define @uqsub_i64( %a, % ret %out } +; ADD (tuples) + +define @add_i64_tuple2(* %out, %in1, %in2) { +; CHECK-LABEL: add_i64_tuple2 +; CHECK: add z0.d, z0.d, z0.d +; CHECK: add z1.d, z1.d, z1.d + %tuple = tail call @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64( %in1, %in2) + %res = add %tuple, %tuple + ret %res +} + +define @add_i64_tuple3(* %out, %in1, %in2, %in3) { +; CHECK-LABEL: add_i64_tuple3 +; CHECK: add z0.d, z0.d, z0.d +; CHECK: add z1.d, z1.d, z1.d +; CHECK: add z2.d, z2.d, z2.d + %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %in1, %in2, %in3) + %res = add %tuple, %tuple + ret %res +} + +define @add_i64_tuple4(* %out, %in1, %in2, %in3, %in4) { +; CHECK-LABEL: add_i64_tuple4 +; CHECK: add z0.d, z0.d, z0.d +; CHECK: add z1.d, z1.d, z1.d +; CHECK: add z2.d, z2.d, z2.d +; CHECK: add z3.d, z3.d, z3.d + %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %in1, %in2, %in3, %in4) + %res = add %tuple, %tuple + ret %res +} + + declare @llvm.aarch64.sve.abs.nxv16i8(, , ) declare @llvm.aarch64.sve.abs.nxv8i16(, , ) declare @llvm.aarch64.sve.abs.nxv4i32(, , ) @@ -366,3 +399,7 @@ declare @llvm.aarch64.sve.uqsub.x.nxv16i8(, declare 
@llvm.aarch64.sve.uqsub.x.nxv8i16(, ) declare @llvm.aarch64.sve.uqsub.x.nxv4i32(, ) declare @llvm.aarch64.sve.uqsub.x.nxv2i64(, ) + +declare @llvm.aarch64.sve.tuple.create2.nxv4i64.nxv2i64(, ) +declare @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64(, , ) +declare @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64(, , , ) From 16a68abcebc0c09edaa03bde11e0f452cfd4abdf Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 10 Jul 2020 10:26:33 +0100 Subject: [PATCH 084/363] [SVE] Don't consider scalable vector types in SLPVectorizerPass::vectorizeChainsInBlock In vectorizeChainsInBlock we try to collect chains of PHI nodes that have the same element type, but the code is relying upon the implicit conversion from TypeSize -> uint64_t. For now, I have modified the code to ignore PHI nodes with scalable types. Differential Revision: https://reviews.llvm.org/D83542 (cherry picked from commit 9ad7c980bb47edd7db8f8db828b487cc7dfc9921) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++++-- .../SLPVectorizer/AArch64/scalable-vector.ll | 28 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5bc35aa4695f..f950d0d4eb2b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7397,8 +7397,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Look for the next elements with the same type. SmallVector::iterator SameTypeIt = IncIt; Type *EltTy = (*IncIt)->getType(); - unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy) - : MaxVecRegSize; + + assert(EltTy->isSized() && + "Instructions should all be sized at this point"); + TypeSize EltTS = DL->getTypeSizeInBits(EltTy); + if (EltTS.isScalable()) { + // For now, just ignore vectorizing scalable types. 
+ ++IncIt; + continue; + } + + unsigned EltSize = EltTS.getFixedSize(); unsigned MaxNumElts = MaxVecRegSize / EltSize; if (MaxNumElts < 2) { ++IncIt; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll index 70ce0dc4d7ba..99c60912f9db 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -slp-vectorizer -S 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" @@ -21,5 +24,28 @@ define void @test() { ret void } +define @scalable_phi( %a, i32 %b) { +; CHECK-LABEL: @scalable_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi [ [[A:%.*]], [[ENTRY:%.*]] ], [ zeroinitializer, [[IF_THEN]] ] +; CHECK-NEXT: ret [[RETVAL]] +; +entry: + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %if.then, label %end + +if.then: + br label %end + +end: + %retval = phi [ %a, %entry ], [ zeroinitializer, %if.then ] + ret %retval +} + declare @llvm.masked.load.nxv16i8.p0nxv16i8(*, i32 immarg, , ) declare void @llvm.masked.store.nxv16i8.p0nxv16i8(, *, i32 immarg, ) From 5583444d188015fbcf97d16c946b2617af81698a Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 28 Jul 2020 13:28:14 +0100 Subject: [PATCH 085/363] [SVE][CodeGen] At -O0 fallback to DAG ISel when translating alloca with scalable types When building code at -O0 We weren't falling back to DAG ISel correctly when encountering alloca 
instructions with scalable vector types. This is because the alloca has no operands that are scalable. I've fixed this by adding a check in AArch64ISelLowering::fallBackToDAGISel for alloca instructions with scalable types. Differential Revision: https://reviews.llvm.org/D84746 (cherry picked from commit 23ad660b5d34930b2b5362f1bba63daee78f6aa4) --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 5 +++++ llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d9951b7b8c5b..2c992c07fad9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14922,6 +14922,11 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (isa(Inst.getOperand(i)->getType())) return true; + if (const AllocaInst *AI = dyn_cast(&Inst)) { + if (isa(AI->getAllocatedType())) + return true; + } + return false; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index cf596c98d462..ea382af14933 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -244,6 +244,14 @@ define i8 @scalable_call(i8* %addr) #1 { ret i8 %res } +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction{{.*}}scalable_alloca +; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_alloca +define void @scalable_alloca() #1 { + %local0 = alloca + load volatile , * %local0 + ret void +} + ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to translate instruction{{.*}}asm_indirect_output ; FALLBACK-WITH-REPORT-OUT-LABEL: asm_indirect_output define void @asm_indirect_output() { From 6b66be512110acb2dfdab00d9755d86e185f1e3d Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Tue, 7 Jul 2020 19:03:13 +0000 Subject: [PATCH 
086/363] [llvm][sve] Reg + Imm addressing mode for ld1ro. Reviewers: kmclaughlin, efriedma, sdesmalen Subscribers: tschuett, hiraditya, psnobl, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D83357 (cherry picked from commit 809600d6642773f71245f76995dab355effc73af) --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 + .../lib/Target/AArch64/AArch64InstrFormats.td | 5 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 10 +- ...ntrinsics-ld1ro-addressing-mode-reg-imm.ll | 174 ++++++++++++++++++ 4 files changed, 188 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2c992c07fad9..1500da2fdfc7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12301,6 +12301,9 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { "Unsupported opcode."); SDLoc DL(N); EVT VT = N->getValueType(0); + if (VT == MVT::nxv8bf16 && + !static_cast(DAG.getSubtarget()).hasBF16()) + return SDValue(); EVT LoadVT = VT; if (VT.isFloatingPoint()) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 6df7970f4d82..4f4ba692c2db 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -495,6 +495,9 @@ def SImmS4XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64); }]>; +def SImmS32XForm : SDNodeXFormgetTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64); +}]>; // simm6sN predicate - True if the immediate is a multiple of N in the range // [-32 * N, 31 * N]. 
@@ -546,7 +549,7 @@ def simm4s16 : Operand, ImmLeaf, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { +[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> { let PrintMethod = "printImmScale<32>"; let ParserMatchClass = SImm4s32Operand; let DecoderMethod = "DecodeSImm<4>"; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a005d1e65abe..c56a65b9e212 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7718,9 +7718,13 @@ multiclass sve_mem_ldor_si sz, string asm, RegisterOperand listty, (!cast(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; // Base addressing mode - def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)), - (!cast(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>; - + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)), + (!cast(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>; + let AddedComplexity = 2 in { + // Reg + Imm addressing mode + def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))), + (!cast(NAME) $Pg, $base, simm4s32:$imm)>; + } } class sve_mem_ldor_ss sz, string asm, RegisterOperand VecList, diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll new file mode 100644 index 000000000000..e7edfc9d6bdd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1ro-addressing-mode-reg-imm.ll @@ -0,0 +1,174 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; WARN-NOT: warning + +; +; LD1ROB +; + +define @ld1rob_i8( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8: +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #32] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* 
%base) + ret %load +} + +; +; LD1ROH +; + +define @ld1roh_i16( %pg, i16* %a) nounwind { +; CHECK-LABEL: ld1roh_i16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr i16, i16* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8i16( %pg, i16* %base) + ret %load +} + +define @ld1roh_f16( %pg, half* %a) nounwind { +; CHECK-LABEL: ld1roh_f16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr half, half* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8f16( %pg, half* %base) + ret %load +} + +define @ld1roh_bf16( %pg, bfloat* %a) nounwind #0 { +; CHECK-LABEL: ld1roh_bf16: +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64] +; CHECK-NEXT: ret + %base = getelementptr bfloat, bfloat* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv8bf16( %pg, bfloat* %base) + ret %load +} + +; +; LD1ROW +; + +define @ld1row_i32( %pg, i32* %a) nounwind { +; CHECK-LABEL: ld1row_i32: +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128] +; CHECK-NEXT: ret + %base = getelementptr i32, i32* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv4i32( %pg, i32* %base) + ret %load +} + +define @ld1row_f32( %pg, float* %a) nounwind { +; CHECK-LABEL: ld1row_f32: +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128] +; CHECK-NEXT: ret + %base = getelementptr float, float* %a, i64 32 + %load = call @llvm.aarch64.sve.ld1ro.nxv4f32( %pg, float* %base) + ret %load +} + +; +; LD1ROD +; + +define @ld1rod_i64( %pg, i64* %a) nounwind { +; CHECK-LABEL: ld1rod_i64: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-64] +; CHECK-NEXT: ret + %base = getelementptr i64, i64* %a, i64 -8 + %load = call @llvm.aarch64.sve.ld1ro.nxv2i64( %pg, i64* %base) + ret %load +} + +define @ld1rod_f64( %pg, double* %a) nounwind { +; CHECK-LABEL: ld1rod_f64: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-128] +; CHECK-NEXT: ret + %base = getelementptr double, double* %a, i64 -16 + %load = call @llvm.aarch64.sve.ld1ro.nxv2f64( %pg, double* %base) + ret %load +} 
+ + +;;;;;;;;;;;;;; +; range checks: immediate must be a multiple of 32 in the range -256, ..., 224 + +; lower bound +define @ld1rob_i8_lower_bound( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_lower_bound: +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #-256] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 -256 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; below lower bound +define @ld1roh_i16_below_lower_bound( %pg, i16* %a) nounwind { +; CHECK-LABEL: ld1roh_i16_below_lower_bound: +; CHECK-NEXT: sub x[[BASE:[0-9]+]], x0, #258 +; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x[[BASE]]] +; CHECK-NEXT: ret + %base = getelementptr i16, i16* %a, i64 -129 + %load = call @llvm.aarch64.sve.ld1ro.nxv8i16( %pg, i16* %base) + ret %load +} + +define @ld1rob_i8_below_lower_bound_01( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_below_lower_bound_01: +; CHECK-NEXT: mov x[[OFFSET:[0-9]+]], #-257 +; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 -257 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +; not a multiple of 32 +define @ld1row_i32_not_multiple( %pg, i32* %a) nounwind { +; CHECK-LABEL: ld1row_i32_not_multiple: +; CHECK-NEXT: add x[[BASE:[0-9]+]], x0, #12 +; CHECK-NEXT: ld1row { z0.s }, p0/z, [x[[BASE]]] +; CHECK-NEXT: ret + %base = getelementptr i32, i32* %a, i64 3 + %load = call @llvm.aarch64.sve.ld1ro.nxv4i32( %pg, i32* %base) + ret %load +} + +; upper bound +define @ld1rod_i64_upper_bound( %pg, i64* %a) nounwind { +; CHECK-LABEL: ld1rod_i64_upper_bound: +; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #224] +; CHECK-NEXT: ret + %base = getelementptr i64, i64* %a, i64 28 + %load = call @llvm.aarch64.sve.ld1ro.nxv2i64( %pg, i64* %base) + ret %load +} + +define @ld1rob_i8_beyond_upper_bound( %pg, i8* %a) nounwind { +; CHECK-LABEL: ld1rob_i8_beyond_upper_bound: +; CHECK-NEXT: mov w[[OFFSET:[0-9]+]], #225 +; CHECK-NEXT: ld1rob { z0.b }, 
p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret + %base = getelementptr i8, i8* %a, i64 225 + %load = call @llvm.aarch64.sve.ld1ro.nxv16i8( %pg, i8* %base) + ret %load +} + +declare @llvm.aarch64.sve.ld1ro.nxv16i8(, i8*) + +declare @llvm.aarch64.sve.ld1ro.nxv8i16(, i16*) +declare @llvm.aarch64.sve.ld1ro.nxv8f16(, half*) +declare @llvm.aarch64.sve.ld1ro.nxv8bf16(, bfloat*) + +declare @llvm.aarch64.sve.ld1ro.nxv4i32(, i32*) +declare @llvm.aarch64.sve.ld1ro.nxv4f32(, float*) + +declare @llvm.aarch64.sve.ld1ro.nxv2i64(, i64*) +declare @llvm.aarch64.sve.ld1ro.nxv2f64(, double*) + + +; +bf16 is required for the bfloat version. +attributes #0 = { "target-features"="+sve,+f64mm,+bf16" } From db306412bf65f4b6fa4314dd5611752448bbc80c Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Mon, 27 Jul 2020 16:24:18 +0000 Subject: [PATCH 087/363] [NFC][AArch64] Replace some template methods/invocations... ...with the non-template version, as the template version might increase the size of the compiler build. Methods affected: 1.`findAddrModeSVELoadStore` 2. `SelectPredicatedStore` Also, remove the `const` qualifier from the `unsigned` parameters of the methods to conform with other similar methods in the class. 
(cherry picked from commit dbeb184b7f54db2d3ef20ac153b1c77f81cf0b99) --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 67 +++++++------------ 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10c477853353..dbd7db7ee8e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -262,14 +262,12 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - template - void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr, - const unsigned Opc_ri); - template + void SelectPredicatedStore(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); std::tuple - findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset); + findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, unsigned Opc_ri, + const SDValue &OldBase, const SDValue &OldOffset, + unsigned Scale); bool tryBitfieldExtractOp(SDNode *N); bool tryBitfieldExtractOpFromSExt(SDNode *N); @@ -1414,12 +1412,12 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, /// Optimize \param OldBase and \param OldOffset selecting the best addressing /// mode. Returns a tuple consisting of an Opcode, an SDValue representing the /// new Base and an SDValue representing the new offset. 
-template std::tuple -AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, - const unsigned Opc_ri, +AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, + unsigned Opc_ri, const SDValue &OldBase, - const SDValue &OldOffset) { + const SDValue &OldOffset, + unsigned Scale) { SDValue NewBase = OldBase; SDValue NewOffset = OldOffset; // Detect a possible Reg+Imm addressing mode. @@ -1429,7 +1427,7 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr, // Detect a possible reg+reg addressing mode, but only if we haven't already // detected a Reg+Imm one. const bool IsRegReg = - !IsRegImm && SelectSVERegRegAddrMode(OldBase, NewBase, NewOffset); + !IsRegImm && SelectSVERegRegAddrMode(OldBase, Scale, NewBase, NewOffset); // Select the instruction. return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset); @@ -1479,10 +1477,9 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, ReplaceNode(N, St); } -template void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, - const unsigned Opc_rr, - const unsigned Opc_ri) { + unsigned Scale, unsigned Opc_rr, + unsigned Opc_ri) { SDLoc dl(N); // Form a REG_SEQUENCE to force register allocation. @@ -1492,9 +1489,9 @@ void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, // Optimize addressing mode. 
unsigned Opc; SDValue Offset, Base; - std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3), - CurDAG->getTargetConstant(0, dl, MVT::i64)); + CurDAG->getTargetConstant(0, dl, MVT::i64), Scale); SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate Base, // address @@ -4085,63 +4082,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case Intrinsic::aarch64_sve_st2: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 2, AArch64::ST2B, - AArch64::ST2B_IMM); + SelectPredicatedStore(Node, 2, 0, AArch64::ST2B, AArch64::ST2B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 2, AArch64::ST2H, - AArch64::ST2H_IMM); + SelectPredicatedStore(Node, 2, 1, AArch64::ST2H, AArch64::ST2H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 2, AArch64::ST2W, - AArch64::ST2W_IMM); + SelectPredicatedStore(Node, 2, 2, AArch64::ST2W, AArch64::ST2W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 2, AArch64::ST2D, - AArch64::ST2D_IMM); + SelectPredicatedStore(Node, 2, 3, AArch64::ST2D, AArch64::ST2D_IMM); return; } break; } case Intrinsic::aarch64_sve_st3: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 3, AArch64::ST3B, - AArch64::ST3B_IMM); + SelectPredicatedStore(Node, 3, 0, AArch64::ST3B, AArch64::ST3B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 3, AArch64::ST3H, - AArch64::ST3H_IMM); + SelectPredicatedStore(Node, 3, 1, AArch64::ST3H, AArch64::ST3H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 3, AArch64::ST3W, - AArch64::ST3W_IMM); + SelectPredicatedStore(Node, 3, 2, AArch64::ST3W, AArch64::ST3W_IMM); 
return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 3, AArch64::ST3D, - AArch64::ST3D_IMM); + SelectPredicatedStore(Node, 3, 3, AArch64::ST3D, AArch64::ST3D_IMM); return; } break; } case Intrinsic::aarch64_sve_st4: { if (VT == MVT::nxv16i8) { - SelectPredicatedStore(Node, 4, AArch64::ST4B, - AArch64::ST4B_IMM); + SelectPredicatedStore(Node, 4, 0, AArch64::ST4B, AArch64::ST4B_IMM); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedStore(Node, 4, AArch64::ST4H, - AArch64::ST4H_IMM); + SelectPredicatedStore(Node, 4, 1, AArch64::ST4H, AArch64::ST4H_IMM); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedStore(Node, 4, AArch64::ST4W, - AArch64::ST4W_IMM); + SelectPredicatedStore(Node, 4, 2, AArch64::ST4W, AArch64::ST4W_IMM); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedStore(Node, 4, AArch64::ST4D, - AArch64::ST4D_IMM); + SelectPredicatedStore(Node, 4, 3, AArch64::ST4D, AArch64::ST4D_IMM); return; } break; From f3fa634264e3504ab684dfd29307594488878f14 Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Mon, 6 Jul 2020 17:46:59 +0000 Subject: [PATCH 088/363] [llvm][CodeGen] Addressing modes for SVE ldN. 
Reviewers: c-rhodes, efriedma, sdesmalen Subscribers: huihuiz, tschuett, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D77251 (cherry picked from commit adb28e0fb2b0e97ea9dce422c09b36979cf7cd2f) --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 64 ++- .../sve-intrinsics-ldN-reg+imm-addr-mode.ll | 495 ++++++++++++++++++ .../sve-intrinsics-ldN-reg+reg-addr-mode.ll | 259 +++++++++ 3 files changed, 798 insertions(+), 20 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index dbd7db7ee8e6..7799ebfbd68e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -245,7 +245,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { unsigned SubRegIdx); void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc); + void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale, + unsigned Opc_rr, unsigned Opc_ri); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. @@ -1434,14 +1435,23 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr, } void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs, - const unsigned Opc) { + unsigned Scale, unsigned Opc_ri, + unsigned Opc_rr) { + assert(Scale < 4 && "Invalid scaling value."); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); + // Optimize addressing mode. 
+ SDValue Base, Offset; + unsigned Opc; + std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore( + N, Opc_rr, Opc_ri, N->getOperand(2), + CurDAG->getTargetConstant(0, DL, MVT::i64), Scale); + SDValue Ops[] = {N->getOperand(1), // Predicate - N->getOperand(2), // Memory operand - CurDAG->getTargetConstant(0, DL, MVT::i64), Chain}; + Base, // Memory operand + Offset, Chain}; const EVT ResTys[] = {MVT::Untyped, MVT::Other}; @@ -4726,51 +4736,51 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { } case AArch64ISD::SVE_LD2_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM); + SelectPredicatedLoad(Node, 2, 0, AArch64::LD2B_IMM, AArch64::LD2B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM); + SelectPredicatedLoad(Node, 2, 1, AArch64::LD2H_IMM, AArch64::LD2H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM); + SelectPredicatedLoad(Node, 2, 2, AArch64::LD2W_IMM, AArch64::LD2W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM); + SelectPredicatedLoad(Node, 2, 3, AArch64::LD2D_IMM, AArch64::LD2D); return; } break; } case AArch64ISD::SVE_LD3_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM); + SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM); + SelectPredicatedLoad(Node, 3, 1, AArch64::LD3H_IMM, AArch64::LD3H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM); + SelectPredicatedLoad(Node, 3, 2, AArch64::LD3W_IMM, AArch64::LD3W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - 
SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM); + SelectPredicatedLoad(Node, 3, 3, AArch64::LD3D_IMM, AArch64::LD3D); return; } break; } case AArch64ISD::SVE_LD4_MERGE_ZERO: { if (VT == MVT::nxv16i8) { - SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM); + SelectPredicatedLoad(Node, 4, 0, AArch64::LD4B_IMM, AArch64::LD4B); return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) { - SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM); + SelectPredicatedLoad(Node, 4, 1, AArch64::LD4H_IMM, AArch64::LD4H); return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM); + SelectPredicatedLoad(Node, 4, 2, AArch64::LD4W_IMM, AArch64::LD4W); return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM); + SelectPredicatedLoad(Node, 4, 3, AArch64::LD4D_IMM, AArch64::LD4D); return; } break; @@ -4790,10 +4800,14 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, /// When \p PredVT is a scalable vector predicate in the form /// MVT::nxxi1, it builds the correspondent scalable vector of -/// integers MVT::nxxi s.t. M x bits = 128. If the input +/// integers MVT::nxxi s.t. M x bits = 128. When targeting +/// structured vectors (NumVec >1), the output data type is +/// MVT::nxxi s.t. M x bits = 128. If the input /// PredVT is not in the form MVT::nxxi1, it returns an invalid /// EVT. 
-static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { +static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, + unsigned NumVec) { + assert(NumVec > 0 && NumVec < 5 && "Invalid number of vectors."); if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1) return EVT(); @@ -4803,7 +4817,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) { ElementCount EC = PredVT.getVectorElementCount(); EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); - EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC); + EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); + return MemVT; } @@ -4827,6 +4842,15 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { return cast(Root->getOperand(3))->getVT(); case AArch64ISD::ST1_PRED: return cast(Root->getOperand(4))->getVT(); + case AArch64ISD::SVE_LD2_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/2); + case AArch64ISD::SVE_LD3_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/3); + case AArch64ISD::SVE_LD4_MERGE_ZERO: + return getPackedVectorTypeFromPredicateType( + Ctx, Root->getOperand(1)->getValueType(0), /*NumVec=*/4); default: break; } @@ -4842,7 +4866,7 @@ static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) { // We are using an SVE prefetch intrinsic. Type must be inferred // from the width of the predicate. 
return getPackedVectorTypeFromPredicateType( - Ctx, Root->getOperand(2)->getValueType(0)); + Ctx, Root->getOperand(2)->getValueType(0), /*NumVec=*/1); } /// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode: diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll new file mode 100644 index 000000000000..1ffa78ec2735 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+imm-addr-mode.ll @@ -0,0 +1,495 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s + +; NOTE: invalid, upper and lower bound immediate values of the regimm +; addressing mode are checked only for the byte version of each +; instruction (`ldb`), as the code for detecting the immediate is +; common to all instructions, and varies only for the number of +; elements of the structure store, which is = 2, 3, 4. + +; ld2b +define @ld2.nxv32i8( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 2 +%base_ptr = bitcast * %base to i8* +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_lower_bound: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_upper_bound: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_not_multiple_of_2( %Pg, *%addr) { +; CHECK-LABEL: 
ld2.nxv32i8_not_multiple_of_2: +; CHECK: rdvl x[[OFFSET:[0-9]]], #3 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 3 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_outside_lower_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #-18 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -18 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld2.nxv32i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld2.nxv32i8_outside_upper_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #16 +; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 16 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld2h +define @ld2.nxv16i16( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv16i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld2.nxv16f16( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv16f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld2.nxv16bf16( %Pg, * %addr) #0 { +; CHECK-LABEL: ld2.nxv16bf16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 12 +%base_ptr = bitcast * %base to bfloat * +%res = call 
@llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld2w +define @ld2.nxv8i32( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv8i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld2.nxv8f32( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv8f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld2d +define @ld2.nxv4i64( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv4i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 14 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld2.nxv4f64( %Pg, * %addr) { +; CHECK-LABEL: ld2.nxv4f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -16 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%base_ptr) +ret %res +} + +; ld3b +define @ld3.nxv48i8( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 3 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_lower_bound: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( 
%Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_upper_bound: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_not_multiple_of_3_01( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01: +; CHECK: rdvl x[[OFFSET:[0-9]]], #4 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 4 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_not_multiple_of_3_02( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02: +; CHECK: rdvl x[[OFFSET:[0-9]]], #5 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 5 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #-27 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -27 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld3.nxv48i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound: +; CHECK: rdvl x[[OFFSET:[0-9]]], #24 +; CHECK-NEXT: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 24 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld3h +define 
@ld3.nxv24i16( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv24i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld3.nxv24f16( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv24f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld3.nxv24bf16( %Pg, *%addr) #0 { +; CHECK-LABEL: ld3.nxv24bf16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld3w +define @ld3.nxv12i32( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv12i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld3.nxv12f32( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv12f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld3d +define @ld3.nxv6i64( %Pg, *%addr) { +; CHECK-LABEL: ld3.nxv6i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #21, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 21 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld3.nxv6f64( %Pg, *%addr) { +; 
CHECK-LABEL: ld3.nxv6f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, #-24, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -24 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%base_ptr) +ret %res +} + +; ; ld4b +define @ld4.nxv64i8( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 4 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_lower_bound: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_upper_bound: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_01( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01: +; CHECK: rdvl x[[OFFSET:[0-9]]], #5 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 5 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_02( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02: +; CHECK: rdvl x[[OFFSET:[0-9]]], #6 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 6 
+%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_not_multiple_of_4_03( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03: +; CHECK: rdvl x[[OFFSET:[0-9]]], #7 +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 7 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_outside_lower_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_outside_lower_bound: +; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #9) +; xM = -9 * 2^6 +; xP = RDVL * 2^-4 +; xOFFSET = RDVL * 2^-4 * -9 * 2^6 = RDVL * -36 +; CHECK: rdvl x[[N:[0-9]]], #1 +; CHECK-DAG: mov x[[M:[0-9]]], #-576 +; CHECK-DAG: lsr x[[P:[0-9]]], x[[N]], #4 +; CHECK-DAG: mul x[[OFFSET:[0-9]]], x[[P]], x[[M]] +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -36 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +define @ld4.nxv64i8_outside_upper_bound( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv64i8_outside_upper_bound: +; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2) +; xM = 2^9 +; xP = RDVL * 2^-4 +; xOFFSET = RDVL * 2^-4 * 2^9 = RDVL * 32 +; CHECK: rdvl x[[N:[0-9]]], #1 +; CHECK-DAG: mov w[[M:[0-9]]], #512 +; CHECK-DAG: lsr x[[P:[0-9]]], x[[N]], #4 +; CHECK-DAG: mul x[[OFFSET:[0-9]]], x[[P]], x[[M]] +; CHECK-NEXT: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x[[OFFSET]]] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 32 +%base_ptr = bitcast * %base to i8 * +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%base_ptr) +ret %res +} + +; ld4h +define @ld4.nxv32i16( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv32i16: +; CHECK: 
ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #8, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 8 +%base_ptr = bitcast * %base to i16 * +%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%base_ptr) +ret %res +} + +define @ld4.nxv32f16( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv32f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to half * +%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%base_ptr) +ret %res +} + +define @ld4.nxv32bf16( %Pg, *%addr) #0 { +; CHECK-LABEL: ld4.nxv32bf16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to bfloat * +%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%base_ptr) +ret %res +} + +; ld4w +define @ld4.nxv16i32( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv16i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i32 * +%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%base_ptr) +ret %res +} + +define @ld4.nxv16f32( %Pg, * %addr) { +; CHECK-LABEL: ld4.nxv16f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to float * +%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%base_ptr) +ret %res +} + +; ld4d +define @ld4.nxv8i64( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv8i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #28, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 28 +%base_ptr = bitcast * %base to i64 * +%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%base_ptr) +ret %res +} + +define @ld4.nxv8f64( %Pg, *%addr) { +; CHECK-LABEL: ld4.nxv8f64: +; CHECK: ld4d 
{ z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, #-32, mul vl] +; CHECK-NEXT: ret +%base = getelementptr , * %addr, i64 -32 +%base_ptr = bitcast * %base to double * +%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double * %base_ptr) +ret %res +} + +declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) + +; +bf16 is required for the bfloat version. 
+attributes #0 = { "target-features"="+sve,+bf16" } diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll new file mode 100644 index 000000000000..ab59c8413795 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldN-reg+reg-addr-mode.ll @@ -0,0 +1,259 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s + +; ld2b +define @ld2.nxv32i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv32i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld2h +define @ld2.nxv16i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv16i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld2.nxv16f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv16f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld2.nxv16bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld2.nxv16bf16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld2w +define @ld2.nxv8i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv8i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld2.nxv8f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: 
ld2.nxv8f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld2d +define @ld2.nxv4i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv4i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld2.nxv4f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld2.nxv4f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +; ld3b +define @ld3.nxv48i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv48i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld3h +define @ld3.nxv24i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv24i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld3.nxv24f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv24f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld3.nxv24bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld3.nxv24bf16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call 
@llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld3w +define @ld3.nxv12i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv12i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld3.nxv12f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv12f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld3d +define @ld3.nxv6i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv6i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld3.nxv6f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld3.nxv6f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +; ld4b +define @ld4.nxv64i8( %Pg, i8 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv64i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x1] +; CHECK-NEXT: ret +%addr2 = getelementptr i8, i8 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8( %Pg, i8 *%addr2) +ret %res +} + +; ld4h +define @ld4.nxv32i16( %Pg, i16 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv32i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr i16, i16 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16( %Pg, i16 *%addr2) +ret %res +} + +define @ld4.nxv32f16( %Pg, half *%addr, i64 %a) { +; CHECK-LABEL: 
ld4.nxv32f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr half, half * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16( %Pg, half *%addr2) +ret %res +} + +define @ld4.nxv32bf16( %Pg, bfloat *%addr, i64 %a) #0 { +; CHECK-LABEL: ld4.nxv32bf16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1] +; CHECK-NEXT: ret +%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16( %Pg, bfloat *%addr2) +ret %res +} + +; ld4w +define @ld4.nxv16i32( %Pg, i32 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv16i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr i32, i32 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32( %Pg, i32 *%addr2) +ret %res +} + +define @ld4.nxv16f32( %Pg, float *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv16f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2] +; CHECK-NEXT: ret +%addr2 = getelementptr float, float * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32( %Pg, float *%addr2) +ret %res +} + +; ld4d +define @ld4.nxv8i64( %Pg, i64 *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv8i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr i64, i64 * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64( %Pg, i64 *%addr2) +ret %res +} + +define @ld4.nxv8f64( %Pg, double *%addr, i64 %a) { +; CHECK-LABEL: ld4.nxv8f64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3] +; CHECK-NEXT: ret +%addr2 = getelementptr double, double * %addr, i64 %a +%res = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64( %Pg, double *%addr2) +ret %res +} + +declare @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(, i32*) 
+declare @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(, double*) + +declare @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(, i8*) +declare @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(, i16*) +declare @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(, i32*) +declare @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(, i64*) +declare @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(, half*) +declare @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(, bfloat*) +declare @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(, float*) +declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(, double*) + +; +bf16 is required for the bfloat version. 
+attributes #0 = { "target-features"="+sve,+bf16" } From 821e924f0d3e0fc9b5991a126fd094eec12bd535 Mon Sep 17 00:00:00 2001 From: Peiyuan Song Date: Thu, 30 Jul 2020 23:32:37 +0300 Subject: [PATCH 089/363] [LLD] [Mingw] Don't export symbols from profile generate Differential Revision: https://reviews.llvm.org/D84756 (cherry picked from commit da324f9904634855a0a3549284758cb079723cdf) --- lld/COFF/MinGW.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp index bded985f04d0..e24cdca6ee34 100644 --- a/lld/COFF/MinGW.cpp +++ b/lld/COFF/MinGW.cpp @@ -34,6 +34,11 @@ AutoExporter::AutoExporter() { "libclang_rt.builtins-arm", "libclang_rt.builtins-i386", "libclang_rt.builtins-x86_64", + "libclang_rt.profile", + "libclang_rt.profile-aarch64", + "libclang_rt.profile-arm", + "libclang_rt.profile-i386", + "libclang_rt.profile-x86_64", "libc++", "libc++abi", "libunwind", @@ -57,6 +62,10 @@ AutoExporter::AutoExporter() { "__builtin_", // Artificial symbols such as .refptr ".", + // profile generate symbols + "__profc_", + "__profd_", + "__profvp_", }; excludeSymbolSuffixes = { From df10da2ff0bd378917665fab295f025295413271 Mon Sep 17 00:00:00 2001 From: Joachim Protze Date: Thu, 30 Jul 2020 09:28:17 +0200 Subject: [PATCH 090/363] [OpenMP] Use weak attribute in interface only for static library This is to address the issue reported at: https://bugs.llvm.org/show_bug.cgi?id=46863 Since weak is meaningless for a shared library interface function, this patch disables the attribute, when the OpenMP library is built as shared library. ompt_start_tool is not an interface function, but a internally called function possibly implemented by an OMPT tool. This function needs to be weak if possible to allow overwriting ompt_start_tool with a function implementation built into the application. 
Differential Revision: https://reviews.llvm.org/D84871 (cherry picked from commit 03116a9f8c2fc98577e153083aaf9b6a701ab8f9) --- openmp/runtime/src/kmp_ftn_entry.h | 8 ++++---- openmp/runtime/src/kmp_os.h | 10 ++++++++-- openmp/runtime/src/ompt-specific.cpp | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index ab57907e088e..b4b0dea0d1af 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -939,7 +939,7 @@ void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) { // Get number of NON-HOST devices. // libomptarget, if loaded, provides this function in api.cpp. -int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) return 0; @@ -957,13 +957,13 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) { // This function always returns true when called on host device. // Compiler/libomptarget should handle when it is called inside target region. -int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) { return 1; // This is the host } // libomptarget, if loaded, provides this function -int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) { #if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB) return KMP_HOST_DEVICE; @@ -1318,7 +1318,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) { // This function will be defined in libomptarget. 
When libomptarget is not // loaded, we assume we are on the host and return KMP_HOST_DEVICE. // Compiler/libomptarget will handle this if called inside target. -int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE; +int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE_EXTERNAL; int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return KMP_HOST_DEVICE; } // Compiler will ensure that this is only called from host in sequential region diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index bfe7765b2a96..d1511904e94b 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -338,10 +338,16 @@ extern "C" { #define KMP_ALIAS(alias_of) __attribute__((alias(alias_of))) #endif +#if KMP_HAVE_WEAK_ATTRIBUTE && !KMP_DYNAMIC_LIB +#define KMP_WEAK_ATTRIBUTE_EXTERNAL __attribute__((weak)) +#else +#define KMP_WEAK_ATTRIBUTE_EXTERNAL /* Nothing */ +#endif + #if KMP_HAVE_WEAK_ATTRIBUTE -#define KMP_WEAK_ATTRIBUTE __attribute__((weak)) +#define KMP_WEAK_ATTRIBUTE_INTERNAL __attribute__((weak)) #else -#define KMP_WEAK_ATTRIBUTE /* Nothing */ +#define KMP_WEAK_ATTRIBUTE_INTERNAL /* Nothing */ #endif // Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp index a7288f08a661..9be699110fc6 100644 --- a/openmp/runtime/src/ompt-specific.cpp +++ b/openmp/runtime/src/ompt-specific.cpp @@ -27,7 +27,7 @@ #define THREAD_LOCAL __thread #endif -#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE +#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE_INTERNAL //****************************************************************************** // macros From 903c872b169dc88f434cf84c0aee32e429e1cc56 Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Fri, 31 Jul 2020 10:28:14 +0200 Subject: [PATCH 091/363] [analyzer] Fix out-of-tree only clang build by not relaying on private header It turned out that the D78704 included a private LLVM header, which is excluded from the LLVM install target. 
I'm substituting that `#include` with the public one by moving the necessary `#define` into that. There was a discussion about this at D78704 and on the cfe-dev mailing list. I'm also placing a note to remind others of this pitfall. Reviewed By: mgorny Differential Revision: https://reviews.llvm.org/D84929 (cherry picked from commit 63d3aeb529a7b0fb95c2092ca38ad21c1f5cfd74) --- .../StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp | 2 +- llvm/include/llvm/Config/config.h.cmake | 6 +++--- llvm/include/llvm/Config/llvm-config.h.cmake | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp b/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp index 7c151c182113..e67dcacca0a9 100644 --- a/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp +++ b/clang/unittests/StaticAnalyzer/FalsePositiveRefutationBRVisitorTest.cpp @@ -16,7 +16,7 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Frontend/AnalysisConsumer.h" #include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h" -#include "llvm/Config/config.h" +#include "llvm/Config/llvm-config.h" #include "gtest/gtest.h" // FIXME: Use GTEST_SKIP() instead if GTest is updated to version 1.10.0 diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 290f74bd02d2..9a682481ccaf 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -1,6 +1,9 @@ #ifndef CONFIG_H #define CONFIG_H +// Include this header only under the llvm source tree. +// This is a private header. 
+ /* Exported configuration */ #include "llvm/Config/llvm-config.h" @@ -338,9 +341,6 @@ /* Whether GlobalISel rule coverage is being collected */ #cmakedefine01 LLVM_GISEL_COV_ENABLED -/* Define if we have z3 and want to build it */ -#cmakedefine LLVM_WITH_Z3 ${LLVM_WITH_Z3} - /* Define to the default GlobalISel coverage file prefix */ #cmakedefine LLVM_GISEL_COV_PREFIX "${LLVM_GISEL_COV_PREFIX}" diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 82b682ddb3dc..c1556e61f040 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -79,6 +79,9 @@ */ #cmakedefine01 LLVM_FORCE_ENABLE_STATS +/* Define if we have z3 and want to build it */ +#cmakedefine LLVM_WITH_Z3 ${LLVM_WITH_Z3} + /* Define if LLVM was built with a dependency to the libtensorflow dynamic library */ #cmakedefine LLVM_HAVE_TF_API From 15bf939137283027fae04e7da8c018346657b254 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 29 Jul 2020 13:01:31 -0700 Subject: [PATCH 092/363] [ELF][test] Fix ppc64-reloc-pcrel34-overflow.s (cherry picked from commit ed7bde0e4b40cbf8a7c833fd8240c957fcda176e) --- lld/test/ELF/ppc64-reloc-pcrel34-overflow.s | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s b/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s index a97160a430dd..ad80ed720b63 100644 --- a/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s +++ b/lld/test/ELF/ppc64-reloc-pcrel34-overflow.s @@ -1,21 +1,22 @@ # REQUIRES: ppc # RUN: echo 'SECTIONS { \ -# RUN: .text_low 0x10010000: { *(.text_low) } \ -# RUN: .text_overflow 0x1000000000 : { *(.text_overflow) } \ -# RUN: }' > %t.script +# RUN: .text 0x10000: { *(.text) } \ +# RUN: .data 0x200010000 : { *(.data) } \ +# RUN: }' > %t.script # RUN: llvm-mc -filetype=obj -triple=powerpc64le %s -o %t.o -# RUN: not ld.lld -T %t.script %t.o -o %t +# RUN: not ld.lld -T %t.script %t.o -o 
/dev/null 2>&1 | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t.o -# RUN: not ld.lld -T %t.script %t.o -o %t +# RUN: not ld.lld -T %t.script %t.o -o /dev/null 2>&1 | FileCheck %s -.section .text_low, "ax", %progbits -# CHECK: relocation R_PPC64_PCREL34 out of range -GlobIntOverflow: +# CHECK: relocation R_PPC64_PCREL34 out of range: 8589934592 is not in [-8589934592, 8589934591] plwa 3, glob_overflow@PCREL(0), 1 - blr -.section .text_overflow, "ax", %progbits + +# CHECK-NOT: relocation + plwa 3, .data@PCREL(0), 1 + +.data glob_overflow: .long 0 .size glob_overflow, 4 From 3ae25b7a09ded12ff63acec0efcf8c7d715114fe Mon Sep 17 00:00:00 2001 From: Aleksandr Platonov Date: Thu, 30 Jul 2020 12:45:07 +0300 Subject: [PATCH 093/363] [clangd] findNearbyIdentifier(): fix the word search in the token stream. Without this patch the word occurrence search always returns the first token of the file. Despite of that, `findNeardyIdentifier()` returns the correct result (but inefficently) until there are several matched tokens with the same value `floor(log2( - ))` (e.g. several matched tokens on the same line). Reviewed By: kadircet Differential Revision: https://reviews.llvm.org/D84912 (cherry picked from commit 05b173466142596b3297ab02e423574cb74b3799) --- clang-tools-extra/clangd/XRefs.cpp | 2 +- clang-tools-extra/clangd/unittests/XRefsTests.cpp | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index c208e953f2ab..2a82dfd66499 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -518,7 +518,7 @@ const syntax::Token *findNearbyIdentifier(const SpelledWord &Word, // Find where the word occurred in the token stream, to search forward & back. 
auto *I = llvm::partition_point(SpelledTokens, [&](const syntax::Token &T) { assert(SM.getFileID(T.location()) == SM.getFileID(Word.Location)); - return T.location() >= Word.Location; // Comparison OK: same file. + return T.location() < Word.Location; // Comparison OK: same file. }); // Search for matches after the cursor. for (const syntax::Token &Tok : llvm::makeArrayRef(I, SpelledTokens.end())) diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 0428303f5b0a..0a8f85ed5317 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -1197,7 +1197,14 @@ TEST(LocateSymbol, NearbyIdentifier) { // h^i )cpp", - }; + R"cpp( + // prefer nearest occurrence even if several matched tokens + // have the same value of `floor(log2( - ))`. + int hello; + int x = hello, y = hello; + int z = [[hello]]; + // h^ello + )cpp"}; for (const char *Test : Tests) { Annotations T(Test); auto AST = TestTU::withCode(T.code()).build(); From a45dd85fe4ccf721dc5ab01768c79bce73ff3474 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Thu, 30 Jul 2020 09:19:58 +0200 Subject: [PATCH 094/363] [Concepts] Fix a deserialization crash. `TemplateTypeParmDecl::hasTypeConstraint` is not a safe guard for checking `TemplateTypeParmDecl::getTypeConstraint()` result is null. in somecases (e.g. implicit deduction guide templates synthesized from the constructor, immediately-declared constraint is not formed because of an error), hasTypeConstraint returns false, and getTypeConstraint returns a nullptr. 
Fix https://bugs.llvm.org/show_bug.cgi?id=46790 Differential Revision: https://reviews.llvm.org/D84455 (cherry picked from commit 73c12bd8ff1a9cd8375a357ea06f171e127ec1b8) --- clang/lib/Serialization/ASTReaderDecl.cpp | 8 +++--- clang/test/PCH/cxx2a-constraints-crash.cpp | 29 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 clang/test/PCH/cxx2a-constraints-crash.cpp diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index eef4ab16ec15..117eb598bd5e 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2912,9 +2912,11 @@ static bool isSameTemplateParameter(const NamedDecl *X, return false; if (TX->hasTypeConstraint() != TY->hasTypeConstraint()) return false; - if (TX->hasTypeConstraint()) { - const TypeConstraint *TXTC = TX->getTypeConstraint(); - const TypeConstraint *TYTC = TY->getTypeConstraint(); + const TypeConstraint *TXTC = TX->getTypeConstraint(); + const TypeConstraint *TYTC = TY->getTypeConstraint(); + if (!TXTC != !TYTC) + return false; + if (TXTC && TYTC) { if (TXTC->getNamedConcept() != TYTC->getNamedConcept()) return false; if (TXTC->hasExplicitTemplateArgs() != TYTC->hasExplicitTemplateArgs()) diff --git a/clang/test/PCH/cxx2a-constraints-crash.cpp b/clang/test/PCH/cxx2a-constraints-crash.cpp new file mode 100644 index 000000000000..637c55f0c879 --- /dev/null +++ b/clang/test/PCH/cxx2a-constraints-crash.cpp @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -std=c++2a -emit-pch %s -o %t +// RUN: %clang_cc1 -std=c++2a -include-pch %t -verify %s + +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +template +concept not_same_as = true; + +template +struct subrange { + template R> + subrange(R) requires(Kind == 0); + + template R> + subrange(R) requires(Kind != 0); +}; + +template +subrange(R) -> subrange<42>; + +int main() { + int c; + subrange s(c); +} + +#endif From 2cf9a07fdcf683fdb2dfad6eeb79a78d8d3a8e3a Mon 
Sep 17 00:00:00 2001 From: Brendon Cahoon Date: Thu, 30 Jul 2020 09:50:59 -0500 Subject: [PATCH 095/363] Align store conditional address In cases where the alignment of the datatype is smaller than expected by the instruction, the address is aligned. The aligned address is used for the load, but wasn't used for the store conditional, which resulted in a run-time alignment exception. (cherry picked from commit 7b114446c320de542c50c4c02f566e5d18adee33) --- llvm/lib/CodeGen/AtomicExpandPass.cpp | 3 ++- llvm/test/CodeGen/Hexagon/atomic-store-byte.ll | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/Hexagon/atomic-store-byte.ll diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index a5030305435c..c61531c5141a 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1239,7 +1239,8 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Value *NewValueInsert = insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV); Value *StoreSuccess = - TLI->emitStoreConditional(Builder, NewValueInsert, Addr, MemOpOrder); + TLI->emitStoreConditional(Builder, NewValueInsert, PMV.AlignedAddr, + MemOpOrder); StoreSuccess = Builder.CreateICmpEQ( StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success"); BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB; diff --git a/llvm/test/CodeGen/Hexagon/atomic-store-byte.ll b/llvm/test/CodeGen/Hexagon/atomic-store-byte.ll new file mode 100644 index 000000000000..e3febe0264ad --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/atomic-store-byte.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +; Test that the address for a store conditional for a byte is aligned +; correctly to use the memw_locked instruction. 
+ +; CHECK: [[REG:(r[0-9]+)]] = and(r{{[0-9]+}},#-4) +; CHECK: = memw_locked([[REG]]) +; CHECK: memw_locked([[REG]],p{{[0-4]}}) = + +@foo.a00 = internal global i8 0, align 1 + +; Function Attrs: nofree norecurse nounwind +define dso_local void @foo() local_unnamed_addr #0 { +entry: + %0 = cmpxchg volatile i8* @foo.a00, i8 0, i8 1 seq_cst seq_cst + ret void +} + From 542a08dcb7a8044c0ba52146d866515603fad122 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 30 Jul 2020 19:12:28 +0100 Subject: [PATCH 096/363] [LAA] Avoid adding pointers to the checks if they are not needed. Currently we skip alias sets with only reads or a single write and no reads, but still add the pointers to the list of pointers in RtCheck. This can lead to cases where we try to access a pointer that does not exist when grouping checks. In most cases, the way we access PositionMap masked that, as the value would default to index 0. But in the example in PR46854 it causes a crash. This patch updates the logic to avoid adding pointers for alias sets that do not need any checks. It makes things slightly more verbose, by first checking the numbers of reads/writes and bailing out early if we don't need checks for the alias set. I think this makes the logic a bit simpler to follow. 
Reviewed By: anemet Differential Revision: https://reviews.llvm.org/D84608 (cherry picked from commit 2062b3707c1ef698deaa9abc571b937fdd077168) --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 60 ++++++++------- .../LoopLoadElim/pr46854-adress-spaces.ll | 77 +++++++++++++++++++ 2 files changed, 110 insertions(+), 27 deletions(-) create mode 100644 llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index ae282a7a1095..f409cd322146 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -393,7 +393,10 @@ void RuntimePointerChecking::groupChecks( // equivalence class, the iteration order is deterministic. for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end(); MI != ME; ++MI) { - unsigned Pointer = PositionMap[MI->getPointer()]; + auto PointerI = PositionMap.find(MI->getPointer()); + assert(PointerI != PositionMap.end() && + "pointer in equivalence class not found in PositionMap"); + unsigned Pointer = PointerI->second; bool Merged = false; // Mark this pointer as seen. Seen.insert(Pointer); @@ -726,52 +729,55 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck, SmallVector Retries; + // First, count how many write and read accesses are in the alias set. Also + // collect MemAccessInfos for later. + SmallVector AccessInfos; for (auto A : AS) { Value *Ptr = A.getValue(); bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true)); - MemAccessInfo Access(Ptr, IsWrite); if (IsWrite) ++NumWritePtrChecks; else ++NumReadPtrChecks; + AccessInfos.emplace_back(Ptr, IsWrite); + } + // We do not need runtime checks for this alias set, if there are no writes + // or a single write and no reads. 
+ if (NumWritePtrChecks == 0 || + (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) { + assert((AS.size() <= 1 || + all_of(AS, + [this](auto AC) { + MemAccessInfo AccessWrite(AC.getValue(), true); + return DepCands.findValue(AccessWrite) == DepCands.end(); + })) && + "Can only skip updating CanDoRT below, if all entries in AS " + "are reads or there is at most 1 entry"); + continue; + } + + for (auto &Access : AccessInfos) { if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop, RunningDepId, ASId, ShouldCheckWrap, false)) { - LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n'); + LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" + << *Access.getPointer() << '\n'); Retries.push_back(Access); CanDoAliasSetRT = false; } } - // If we have at least two writes or one write and a read then we need to - // check them. But there is no need to checks if there is only one - // dependence set for this alias set. - // // Note that this function computes CanDoRT and MayNeedRTCheck // independently. For example CanDoRT=false, MayNeedRTCheck=false means that // we have a pointer for which we couldn't find the bounds but we don't // actually need to emit any checks so it does not matter. - bool NeedsAliasSetRTCheck = false; - if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2)) { - NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 || - (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1)); - // For alias sets without at least 2 writes or 1 write and 1 read, there - // is no need to generate RT checks and CanDoAliasSetRT for this alias set - // does not impact whether runtime checks can be generated. 
- if (!NeedsAliasSetRTCheck) { - assert((AS.size() <= 1 || - all_of(AS, - [this](auto AC) { - MemAccessInfo AccessWrite(AC.getValue(), true); - return DepCands.findValue(AccessWrite) == - DepCands.end(); - })) && - "Can only skip updating CanDoRT below, if all entries in AS " - "are reads or there is at most 1 entry"); - continue; - } - } + // + // We need runtime checks for this alias set, if there are at least 2 + // dependence sets (in which case RunningDepId > 2) or if we need to re-try + // any bound checks (because in that case the number of dependence sets is + // incomplete). + bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty(); // We need to perform run-time alias checks, but some pointers had bounds // that couldn't be checked. diff --git a/llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll b/llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll new file mode 100644 index 000000000000..396899d8d280 --- /dev/null +++ b/llvm/test/Transforms/LoopLoadElim/pr46854-adress-spaces.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: amdgpu-registered-target + +; RUN: opt -globals-aa -loop-load-elim -S %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" +target triple = "amdgcn-amd-amdhsa" + +%struct.foo = type { %struct.pluto, i8, i8*, i32 } +%struct.pluto = type { i32, i32, i32, %struct.wombat*, i32, i32, i32 } +%struct.wombat = type { %struct.barney } +%struct.barney = type { <2 x float> } + +@global = external protected local_unnamed_addr addrspace(4) externally_initialized global %struct.foo, align 8 +@global.1 = internal unnamed_addr addrspace(3) constant [4000 x float] undef, align 16 + +; Function Attrs: nounwind +define protected amdgpu_kernel void @widget(i32 %arg, i32 %arg1) #0 { +; CHECK-LABEL: @widget( +; 
CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 [[ARG:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.wombat*, %struct.wombat* addrspace(4)* getelementptr inbounds (%struct.foo, [[STRUCT_FOO:%.*]] addrspace(4)* @global, i64 0, i32 0, i32 3), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_WOMBAT:%.*]], %struct.wombat* [[TMP2]], i64 undef, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.barney* [[TMP3]] to i64* +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb5.loopexit: +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: br label [[BB6:%.*]] +; CHECK: bb6: +; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ undef, [[BB5]] ], [ [[TMP19:%.*]], [[BB6]] ] +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw i32 [[TMP7]], undef +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], undef +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_WOMBAT]], %struct.wombat* [[TMP2]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast %struct.wombat* [[TMP11]] to i64* +; CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = srem i32 1, [[ARG1:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float addrspace(3)* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, float addrspace(3)* [[TMP]], align 4 +; CHECK-NEXT: store i64 [[TMP13]], i64* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP19]] = add nsw i32 [[TMP7]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = icmp slt i32 [[TMP7]], 3 +; CHECK-NEXT: br i1 [[TMP20]], label [[BB6]], label [[BB5_LOOPEXIT:%.*]] +; +bb: + %tmp = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 %arg + %tmp2 = load %struct.wombat*, %struct.wombat* 
addrspace(4)* getelementptr inbounds (%struct.foo, %struct.foo addrspace(4)* @global, i64 0, i32 0, i32 3), align 8 + %tmp3 = getelementptr inbounds %struct.wombat, %struct.wombat* %tmp2, i64 undef, i32 0 + %tmp4 = bitcast %struct.barney* %tmp3 to i64* + br label %bb5 + +bb5: ; preds = %bb6, %bb + br label %bb6 + +bb6: ; preds = %bb6, %bb5 + %tmp7 = phi i32 [ undef, %bb5 ], [ %tmp19, %bb6 ] + %tmp8 = mul nsw i32 %tmp7, undef + %tmp9 = add i32 %tmp8, undef + %tmp10 = sext i32 %tmp9 to i64 + %tmp11 = getelementptr inbounds %struct.wombat, %struct.wombat* %tmp2, i64 %tmp10 + %tmp12 = bitcast %struct.wombat* %tmp11 to i64* + %tmp13 = load i64, i64* %tmp12, align 8 + %tmp14 = srem i32 1, %arg1 + %tmp15 = add nuw nsw i32 %tmp14, 1 + %tmp16 = getelementptr inbounds [4000 x float], [4000 x float] addrspace(3)* @global.1, i32 0, i32 %tmp15 + %tmp17 = load float, float addrspace(3)* %tmp16, align 4 + %tmp18 = load float, float addrspace(3)* %tmp, align 4 + store i64 %tmp13, i64* %tmp4, align 8 + %tmp19 = add nsw i32 %tmp7, 1 + %tmp20 = icmp slt i32 %tmp7, 3 + br i1 %tmp20, label %bb6, label %bb5 +} + +attributes #0 = { nounwind } From 921838e68fcd379ff779da0a3d9ebb6fc0fb96f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Fri, 31 Jul 2020 00:11:40 +0200 Subject: [PATCH 097/363] [CMake] Pass bugreport URL to standalone clang build BUG_REPORT_URL is currently used both in LLVM and in Clang but declared only in the latter. This means that it's missing in standalone clang builds and the driver ends up outputting: PLEASE submit a bug report to and include [...] (note the missing URL) To fix this, include LLVM_PACKAGE_BUGREPORT in LLVMConfig.cmake (similarly to how we pass PACKAGE_VERSION) and use it to fill BUG_REPORT_URL when building clang standalone. 
Differential Revision: https://reviews.llvm.org/D84987 (cherry picked from commit 21c165de2a1bcca9dceb452f637d9e8959fba113) --- clang/CMakeLists.txt | 2 ++ llvm/cmake/modules/LLVMConfig.cmake.in | 1 + 2 files changed, 3 insertions(+) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 7f8e0718c2eb..2e06c5fd9028 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -121,6 +121,8 @@ if( CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR ) include(LLVMDistributionSupport) set(PACKAGE_VERSION "${LLVM_PACKAGE_VERSION}") + set(BUG_REPORT_URL "${LLVM_PACKAGE_BUGREPORT}" CACHE STRING + "Default URL where bug reports are to be submitted.") if (NOT DEFINED LLVM_INCLUDE_TESTS) set(LLVM_INCLUDE_TESTS ON) diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index e729a839f614..4d8e33711d27 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -7,6 +7,7 @@ set(LLVM_VERSION_MINOR @LLVM_VERSION_MINOR@) set(LLVM_VERSION_PATCH @LLVM_VERSION_PATCH@) set(LLVM_VERSION_SUFFIX @LLVM_VERSION_SUFFIX@) set(LLVM_PACKAGE_VERSION @PACKAGE_VERSION@) +set(LLVM_PACKAGE_BUGREPORT @PACKAGE_BUGREPORT@) set(LLVM_BUILD_TYPE @CMAKE_BUILD_TYPE@) From a9430a1c9e9c99151361374f0462d751457fa15c Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 30 Jul 2020 14:37:06 -0700 Subject: [PATCH 098/363] AMDGPU: Put inexpensive ops first in AMDGPUAnnotateUniformValues::visitLoadInst Summary: This is in response to the review of https://reviews.llvm.org/D84873: The expensive check should be reordered last Reviewers: arsenm Differential Revision: https://reviews.llvm.org/D84890 (cherry picked from commit 243376cdc7b719d443f42c8c4667e5d96af53dcc) --- .../AMDGPU/AMDGPUAnnotateUniformValues.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 
b09e92c07f9b..45f515c5115e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -131,10 +131,20 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { // We're tracking up to the Function boundaries, and cannot go beyond because // of FunctionPass restrictions. We can ensure that is memory not clobbered // for memory operations that are live in to entry points only. - bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I); Instruction *PtrI = dyn_cast(Ptr); - if (!PtrI && NotClobbered && isGlobalLoad(I)) { - if (isa(Ptr) || isa(Ptr)) { + + if (!isEntryFunc) { + if (PtrI) + setUniformMetadata(PtrI); + return; + } + + bool NotClobbered = false; + if (PtrI) + NotClobbered = !isClobberedInFunction(&I); + else if (isa(Ptr) || isa(Ptr)) { + if (isGlobalLoad(I) && !isClobberedInFunction(&I)) { + NotClobbered = true; // Lookup for the existing GEP if (noClobberClones.count(Ptr)) { PtrI = noClobberClones[Ptr]; From a19ff10e6d74468505815123647ee89a42683b5f Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 5 Aug 2020 17:12:51 +0200 Subject: [PATCH 099/363] Bump forgotten version nbr in llvm/docs/conf.py --- llvm/docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/docs/conf.py b/llvm/docs/conf.py index aed5e06b6f50..a13eb63a632b 100644 --- a/llvm/docs/conf.py +++ b/llvm/docs/conf.py @@ -66,9 +66,9 @@ # built documents. # # The short version. -version = '10' +version = '11' # The full version, including alpha/beta/rc tags. -release = '10' +release = '11' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 145694f1ba6688c5c79466490e13e2c3236914d6 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 16 Jul 2020 21:38:49 -0700 Subject: [PATCH 100/363] [llvm] Add RISCVTargetParser.def to the module map This fixes the modules build. 
(cherry picked from commit 1b3c25e7b61f44b80788f8758f0d7f0b013135b5) --- llvm/include/llvm/module.modulemap | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/module.modulemap b/llvm/include/llvm/module.modulemap index b262311a96a0..778a17c8aeee 100644 --- a/llvm/include/llvm/module.modulemap +++ b/llvm/include/llvm/module.modulemap @@ -388,7 +388,7 @@ module LLVM_Utils { umbrella "Support" module * { export * } - + // Exclude this; it should only be used on Windows. exclude header "Support/Windows/WindowsSupport.h" @@ -397,8 +397,9 @@ module LLVM_Utils { exclude header "Support/Solaris/sys/regset.h" // These are intended for textual inclusion. - textual header "Support/ARMTargetParser.def" textual header "Support/AArch64TargetParser.def" + textual header "Support/ARMTargetParser.def" + textual header "Support/RISCVTargetParser.def" textual header "Support/TargetOpcodes.def" textual header "Support/X86TargetParser.def" } From 280653d2ea4a40f2968b3f4662aa7c6d254c6cb9 Mon Sep 17 00:00:00 2001 From: Adrian Pop Date: Tue, 4 Aug 2020 23:15:17 +0300 Subject: [PATCH 101/363] [OpenMP] support build on msys2/mingw with clang or gcc RTM Adaptive Locks are supported on msys2/mingw for clang and gcc. 
Differential Revision: https://reviews.llvm.org/D81776 (cherry picked from commit bf2aa74e51997ee190f3b34dd26a1b564e59e267) --- openmp/runtime/src/kmp_lock.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 8bf7ef2deb71..775693253db2 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -1706,7 +1706,8 @@ static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck, #if (KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300) || \ (KMP_COMPILER_MSVC && _MSC_VER >= 1700) || \ - (KMP_COMPILER_CLANG && KMP_MSVC_COMPAT) + (KMP_COMPILER_CLANG && (KMP_MSVC_COMPAT || __MINGW32__)) || \ + (KMP_COMPILER_GCC && __MINGW32__) #include #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) From d11e17309414b91c2e7dfc611f098041a62de201 Mon Sep 17 00:00:00 2001 From: AndreyChurbanov Date: Tue, 4 Aug 2020 18:48:25 +0300 Subject: [PATCH 102/363] [OpenMP] Don't use MSVC workaround with MinGW Patch by mati865@gmail.com Differential Revision: https://reviews.llvm.org/D85210 (cherry picked from commit 4a04bc8995639e1d333790518e4d42e0961f740e) --- openmp/runtime/cmake/LibompExports.cmake | 4 +-- openmp/runtime/cmake/LibompMicroTests.cmake | 2 +- openmp/runtime/src/CMakeLists.txt | 34 ++++++++++++--------- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/openmp/runtime/cmake/LibompExports.cmake b/openmp/runtime/cmake/LibompExports.cmake index f98de2631b83..44fb97631b28 100644 --- a/openmp/runtime/cmake/LibompExports.cmake +++ b/openmp/runtime/cmake/LibompExports.cmake @@ -83,11 +83,11 @@ add_custom_command(TARGET omp POST_BUILD # Copy Windows import library into exports/ directory post build if(WIN32) - get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY) + get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ${LIBOMP_IMP_LIB_TARGET} ARCHIVE_OUTPUT_DIRECTORY) if(NOT LIBOMPIMP_OUTPUT_DIRECTORY) 
set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - add_custom_command(TARGET ompimp POST_BUILD + add_custom_command(TARGET ${LIBOMP_IMP_LIB_TARGET} POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR} COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR} ) diff --git a/openmp/runtime/cmake/LibompMicroTests.cmake b/openmp/runtime/cmake/LibompMicroTests.cmake index dc44e2c1e2fc..1ca3412edc8e 100644 --- a/openmp/runtime/cmake/LibompMicroTests.cmake +++ b/openmp/runtime/cmake/LibompMicroTests.cmake @@ -40,7 +40,7 @@ # get library location if(WIN32) get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY) - get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY) + get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ${LIBOMP_IMP_LIB_TARGET} ARCHIVE_OUTPUT_DIRECTORY) if(NOT LIBOMPIMP_OUTPUT_DIRECTORY) set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 1211441876eb..81275c0483dd 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -202,21 +202,27 @@ if(WIN32) IMPORT_PREFIX "" IMPORT_SUFFIX "" # control generated import library name when building omp ARCHIVE_OUTPUT_NAME ${LIBOMP_GENERATED_IMP_LIB_FILENAME} ) - # Get generated import library from creating omp - get_target_property(LIBOMP_IMPORT_LIB_DIRECTORY omp ARCHIVE_OUTPUT_DIRECTORY) - if(LIBOMP_IMPORT_LIB_DIRECTORY) - set(LIBOMP_GENERATED_IMP_LIB ${LIBOMP_IMPORT_LIB_DIRECTORY}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + + if(MSVC) + # Get generated import library from creating omp + get_target_property(LIBOMP_IMPORT_LIB_DIRECTORY omp ARCHIVE_OUTPUT_DIRECTORY) + if(LIBOMP_IMPORT_LIB_DIRECTORY) + set(LIBOMP_GENERATED_IMP_LIB ${LIBOMP_IMPORT_LIB_DIRECTORY}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + else() + set(LIBOMP_GENERATED_IMP_LIB 
${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + endif() + set_source_files_properties(${LIBOMP_GENERATED_IMP_LIB} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE) + # Create new import library that is just the previously created one + kmp_import.cpp + add_library(ompimp STATIC ${LIBOMP_GENERATED_IMP_LIB} kmp_import.cpp) + set_target_properties(ompimp PROPERTIES + PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_IMP_LIB_FILE}" + LINKER_LANGUAGE C + ) + add_dependencies(ompimp omp) # ensure generated import library is created first + set(LIBOMP_IMP_LIB_TARGET ompimp) else() - set(LIBOMP_GENERATED_IMP_LIB ${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_GENERATED_IMP_LIB_FILENAME}) + set(LIBOMP_IMP_LIB_TARGET omp) endif() - set_source_files_properties(${LIBOMP_GENERATED_IMP_LIB} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE) - # Create new import library that is just the previously created one + kmp_import.cpp - add_library(ompimp STATIC ${LIBOMP_GENERATED_IMP_LIB} kmp_import.cpp) - set_target_properties(ompimp PROPERTIES - PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_IMP_LIB_FILE}" - LINKER_LANGUAGE C - ) - add_dependencies(ompimp omp) # ensure generated import library is created first # Create def file to designate exported functions libomp_get_gdflags(LIBOMP_GDFLAGS) # generate-def.pl flags (Windows only) @@ -290,7 +296,7 @@ else() endif() if(WIN32) install(TARGETS omp RUNTIME DESTINATION bin) - install(TARGETS ompimp ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") + install(TARGETS ${LIBOMP_IMP_LIB_TARGET} ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") # Create aliases (regular copies) of the library for backwards compatibility set(LIBOMP_ALIASES "libiomp5md") foreach(alias IN LISTS LIBOMP_ALIASES) From 4c1f394c69fec616d73af1f333154eba0a95590f Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 23 Jul 2020 11:17:16 +0200 Subject: [PATCH 103/363] RuntimeDyldELF: report_fatal_error instead of asserting for unimplemented relocations (PR46816) This fixes the 
ExecutionEngine/MCJIT/stubs-sm-pic.ll test in no-asserts builds which is set to XFAIL on some platforms like 32-bit x86. More importantly, we probably don't want to silently error in these cases. Differential revision: https://reviews.llvm.org/D84390 (cherry picked from commit 6a3b07a4bf14be32569550f2e9814d8797d27d31) --- .../ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 7c39ddc8b1da..7ed8a718ed3c 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -269,7 +269,7 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section, uint64_t SymOffset) { switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_X86_64_NONE: break; @@ -359,7 +359,7 @@ void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section, default: // There are other relocation types, but it appears these are the // only ones currently used by the LLVM ELF object writer - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; } } @@ -382,7 +382,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section, switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_AARCH64_ABS16: { uint64_t Result = Value + Addend; @@ -721,7 +721,7 @@ void RuntimeDyldELF::resolvePPC32Relocation(const SectionEntry &Section, uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented 
yet!"); break; case ELF::R_PPC_ADDR16_LO: writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); @@ -741,7 +741,7 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_PPC64_ADDR16: writeInt16BE(LocalAddress, applyPPClo(Value + Addend)); @@ -835,7 +835,7 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section, uint8_t *LocalAddress = Section.getAddressWithOffset(Offset); switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_390_PC16DBL: case ELF::R_390_PLT16DBL: { @@ -890,7 +890,7 @@ void RuntimeDyldELF::resolveBPFRelocation(const SectionEntry &Section, switch (Type) { default: - llvm_unreachable("Relocation type not implemented yet!"); + report_fatal_error("Relocation type not implemented yet!"); break; case ELF::R_BPF_NONE: break; From a862618aab2551540e175dff3a6a1d1d7c4b4a29 Mon Sep 17 00:00:00 2001 From: Peiyuan Song Date: Thu, 30 Jul 2020 23:37:17 +0300 Subject: [PATCH 104/363] [compiler-rt] [profile] fix profile generate for mingw x86_64 Differential Revision: https://reviews.llvm.org/D84757 (cherry picked from commit 14c1b4017422cbf374086ea4c4fa74e16fb56779) --- compiler-rt/lib/profile/CMakeLists.txt | 6 +++--- compiler-rt/lib/profile/InstrProfilingPort.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index ece674b2daa1..29c6c02f2d03 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -1,11 +1,11 @@ CHECK_CXX_SOURCE_COMPILES(" -#ifdef _MSC_VER -#include /* Workaround for PR19898. 
*/ +#ifdef _WIN32 +#include /* Workaround for PR19898. */ #include #endif int main() { -#ifdef _MSC_VER +#ifdef _WIN32 volatile LONG val = 1; MemoryBarrier(); InterlockedCompareExchange(&val, 0, 1); diff --git a/compiler-rt/lib/profile/InstrProfilingPort.h b/compiler-rt/lib/profile/InstrProfilingPort.h index 20cf5d660c6a..4493dd512ff0 100644 --- a/compiler-rt/lib/profile/InstrProfilingPort.h +++ b/compiler-rt/lib/profile/InstrProfilingPort.h @@ -53,9 +53,9 @@ #endif #if COMPILER_RT_HAS_ATOMICS == 1 -#ifdef _MSC_VER +#ifdef _WIN32 #include -#if _MSC_VER < 1900 +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf #endif #if defined(_WIN64) @@ -73,7 +73,7 @@ (DomType *)InterlockedExchangeAdd((LONG volatile *)&PtrVar, \ (LONG)sizeof(DomType) * PtrIncr) #endif -#else /* !defined(_MSC_VER) */ +#else /* !defined(_WIN32) */ #define COMPILER_RT_BOOL_CMPXCHG(Ptr, OldV, NewV) \ __sync_bool_compare_and_swap(Ptr, OldV, NewV) #define COMPILER_RT_PTR_FETCH_ADD(DomType, PtrVar, PtrIncr) \ From 0835988de17ef8ea70ac790d3d99fea832fcc3e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 4 Aug 2020 10:24:32 +0300 Subject: [PATCH 105/363] [llvm-rc] Allow string table values split into multiple string literals This can practically easily be a product of combining strings with macros in resource files. This fixes https://github.com/mstorsjo/llvm-mingw/issues/140. As string literals within llvm-rc are handled as StringRefs, each referencing an uninterpreted slice of the input file, with actual interpretation of the input string (codepage handling, unescaping etc) done only right before writing them out to disk, it's hard to concatenate them other than just bundling them up in a vector, without rearchitecting a large part of llvm-rc. This matches how the same already is supported in VersionInfoValue, with a std::vector Values. 
MS rc.exe only supports concatenated string literals in version info values (already supported), string tables (implemented in this patch) and user data resources (easily implemented in a separate patch, but hasn't been requested by any end user yet), while GNU windres supports string immediates split into multiple strings anywhere (e.g. like (100 ICON "myicon" ".ico"). Not sure if concatenation in other statements actually is used in the wild though, in resource files normally built by GNU windres. Differential Revision: https://reviews.llvm.org/D85183 (cherry picked from commit b989fcbae6f179ad887d19ceef83ace1c00b87cc) --- .../llvm-rc/Inputs/tag-stringtable-basic.rc | 4 ++-- llvm/tools/llvm-rc/ResourceFileWriter.cpp | 17 ++++++++++------- llvm/tools/llvm-rc/ResourceFileWriter.h | 5 +++-- llvm/tools/llvm-rc/ResourceScriptParser.cpp | 8 +++++++- llvm/tools/llvm-rc/ResourceScriptStmt.cpp | 8 ++++++-- llvm/tools/llvm-rc/ResourceScriptStmt.h | 6 +++--- 6 files changed, 31 insertions(+), 17 deletions(-) diff --git a/llvm/test/tools/llvm-rc/Inputs/tag-stringtable-basic.rc b/llvm/test/tools/llvm-rc/Inputs/tag-stringtable-basic.rc index afda2f3af63d..7c929bb4a326 100644 --- a/llvm/test/tools/llvm-rc/Inputs/tag-stringtable-basic.rc +++ b/llvm/test/tools/llvm-rc/Inputs/tag-stringtable-basic.rc @@ -13,8 +13,8 @@ STRINGTABLE { STRINGTABLE VERSION 100 LANGUAGE 4, 7 { - 16 "hello" - 17 "world" + 16 "hel" "lo" + 17 "wor" L"ld" } STRINGTABLE diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp index d8d401412486..09b078c94cd2 100644 --- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp +++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp @@ -1246,7 +1246,8 @@ Error ResourceFileWriter::visitStringTableBundle(const RCResource *Res) { } Error ResourceFileWriter::insertStringIntoBundle( - StringTableInfo::Bundle &Bundle, uint16_t StringID, StringRef String) { + StringTableInfo::Bundle &Bundle, uint16_t StringID, + const std::vector &String) { 
uint16_t StringLoc = StringID & 15; if (Bundle.Data[StringLoc]) return createError("Multiple STRINGTABLE strings located under ID " + @@ -1261,13 +1262,15 @@ Error ResourceFileWriter::writeStringTableBundleBody(const RCResource *Base) { // The string format is a tiny bit different here. We // first output the size of the string, and then the string itself // (which is not null-terminated). - bool IsLongString; SmallVector Data; - RETURN_IF_ERROR(processString(Res->Bundle.Data[ID].getValueOr(StringRef()), - NullHandlingMethod::CutAtDoubleNull, - IsLongString, Data, Params.CodePage)); - if (AppendNull && Res->Bundle.Data[ID]) - Data.push_back('\0'); + if (Res->Bundle.Data[ID]) { + bool IsLongString; + for (StringRef S : *Res->Bundle.Data[ID]) + RETURN_IF_ERROR(processString(S, NullHandlingMethod::CutAtDoubleNull, + IsLongString, Data, Params.CodePage)); + if (AppendNull) + Data.push_back('\0'); + } RETURN_IF_ERROR( checkNumberFits(Data.size(), "STRINGTABLE string size")); writeInt(Data.size()); diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.h b/llvm/tools/llvm-rc/ResourceFileWriter.h index 673830601e86..d545a7a9cab1 100644 --- a/llvm/tools/llvm-rc/ResourceFileWriter.h +++ b/llvm/tools/llvm-rc/ResourceFileWriter.h @@ -103,7 +103,7 @@ class ResourceFileWriter : public Visitor { using BundleKey = std::pair; // Each bundle is in fact an array of 16 strings. 
struct Bundle { - std::array, 16> Data; + std::array>, 16> Data; ObjectInfo DeclTimeInfo; uint16_t MemoryFlags; Bundle(const ObjectInfo &Info, uint16_t Flags) @@ -157,7 +157,8 @@ class ResourceFileWriter : public Visitor { Error visitStringTableBundle(const RCResource *); Error writeStringTableBundleBody(const RCResource *); Error insertStringIntoBundle(StringTableInfo::Bundle &Bundle, - uint16_t StringID, StringRef String); + uint16_t StringID, + const std::vector &String); // User defined resource Error writeUserDefinedBody(const RCResource *); diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp index 36b305645fb8..2155985c61b8 100644 --- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp +++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp @@ -698,8 +698,14 @@ RCParser::ParseType RCParser::parseStringTableResource() { // between, however we strictly adhere to the single statement definition. ASSIGN_OR_RETURN(IDResult, readInt()); consumeOptionalType(Kind::Comma); + + std::vector Strings; ASSIGN_OR_RETURN(StrResult, readString()); - Table->addString(*IDResult, *StrResult); + Strings.push_back(*StrResult); + while (isNextTokenKind(Kind::String)) + Strings.push_back(read().value()); + + Table->addStrings(*IDResult, std::move(Strings)); } return std::move(Table); diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp index a0d4adbe6418..ef8c34541881 100644 --- a/llvm/tools/llvm-rc/ResourceScriptStmt.cpp +++ b/llvm/tools/llvm-rc/ResourceScriptStmt.cpp @@ -118,8 +118,12 @@ raw_ostream &MenuResource::log(raw_ostream &OS) const { raw_ostream &StringTableResource::log(raw_ostream &OS) const { OS << "StringTable:\n"; OptStatements->log(OS); - for (const auto &String : Table) - OS << " " << String.first << " => " << String.second << "\n"; + for (const auto &String : Table) { + OS << " " << String.first << " =>"; + for (const auto &S : String.second) + OS << " " << S; + OS << "\n"; + } 
return OS; } diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h index 7076eca96a23..1a23d1e27d1f 100644 --- a/llvm/tools/llvm-rc/ResourceScriptStmt.h +++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h @@ -581,12 +581,12 @@ class MenuResource : public OptStatementsRCResource { // Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa381050(v=vs.85).aspx class StringTableResource : public OptStatementsRCResource { public: - std::vector> Table; + std::vector>> Table; StringTableResource(OptionalStmtList &&List, uint16_t Flags) : OptStatementsRCResource(std::move(List), Flags) {} - void addString(uint32_t ID, StringRef String) { - Table.emplace_back(ID, String); + void addStrings(uint32_t ID, std::vector &&Strings) { + Table.emplace_back(ID, Strings); } raw_ostream &log(raw_ostream &) const override; Twine getResourceTypeName() const override { return "STRINGTABLE"; } From 0b617ebb82e11d6e146d4ef71effb0f4db3c1439 Mon Sep 17 00:00:00 2001 From: Nathan James Date: Tue, 4 Aug 2020 09:27:01 +0100 Subject: [PATCH 106/363] [clang-tidy] Fix regression in RenamerClangTidy See bug https://bugs.llvm.org/show_bug.cgi\?id\=46976 (cherry picked from commit 7c4782ce91d66a8447a851362b99bb86a42b7c08) --- clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp | 5 +++-- .../clang-tidy/checkers/readability-identifier-naming.cpp | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 040378d980f1..2d67ca4a1618 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -157,6 +157,9 @@ void RenamerClangTidyCheck::addUsage( RenamerClangTidyCheck::NamingCheckFailure &Failure = NamingCheckFailures[Decl]; + if (!Failure.RawUsageLocs.insert(FixLocation.getRawEncoding()).second) + return; + if (!Failure.ShouldFix()) 
return; @@ -165,8 +168,6 @@ void RenamerClangTidyCheck::addUsage( if (!utils::rangeCanBeFixed(Range, SourceMgr)) Failure.FixStatus = RenamerClangTidyCheck::ShouldFixStatus::InsideMacro; - - Failure.RawUsageLocs.insert(FixLocation.getRawEncoding()); } void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl, SourceRange Range, diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp index 24c1c4270dec..fed362bbecde 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp @@ -578,3 +578,8 @@ void Foo() { #undef M1 #undef DUP } // namespace scratchspace + +template +auto GetRes(type_t& Param) -> decltype(Param.res()); +// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: invalid case style for parameter 'Param' +// CHECK-FIXES: auto GetRes(type_t& a_param) -> decltype(a_param.res()); From 3cab8184f32067464f3ab6bdfd6e123ddd38ef0f Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Wed, 22 Jul 2020 06:01:52 -0400 Subject: [PATCH 107/363] [PowerPC] fixupIsDeadOrKill start and end in different block fixing In fixupIsDeadOrKill, we assume StartMI and EndMI not exist in same basic block, so we add an assertion in that function. This is wrong before RA, as before RA the true definition may exist in another block through copy like instructions. 
Reviewed By: nemanjai Differential Revision: https://reviews.llvm.org/D83365 (cherry picked from commit 36f9fe2d3493717dbc6866d96b2e989839ce1a4c) --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 9 ++++++++ llvm/lib/Target/PowerPC/PPCInstrInfo.h | 14 ++++++++----- .../PowerPC/fixup-kill-dead-flag-crash.mir | 21 +++++++++++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 11c97210ead9..9a4c57fedac2 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2655,6 +2655,15 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, unsigned RegNo) const { + // Conservatively clear kill flag for the register if the instructions are in + // different basic blocks and in SSA form, because the kill flag may no longer + // be right. There is no need to bother with dead flags since defs with no + // uses will be handled by DCE. + MachineRegisterInfo &MRI = StartMI.getParent()->getParent()->getRegInfo(); + if (MRI.isSSA() && (StartMI.getParent() != EndMI.getParent())) { + MRI.clearKillFlags(RegNo); + return; + } // Instructions between [StartMI, EndMI] should be in same basic block. assert((StartMI.getParent() == EndMI.getParent()) && diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d98597f48340..43973c627fcf 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -565,14 +565,18 @@ class PPCInstrInfo : public PPCGenInstrInfo { int64_t OffsetImm) const; /// Fixup killed/dead flag for register \p RegNo between instructions [\p - /// StartMI, \p EndMI]. 
Some PostRA transformations may violate register - /// killed/dead flags semantics, this function can be called to fix up. Before - /// calling this function, + /// StartMI, \p EndMI]. Some pre-RA or post-RA transformations may violate + /// register killed/dead flags semantics, this function can be called to fix + /// up. Before calling this function, /// 1. Ensure that \p RegNo liveness is killed after instruction \p EndMI. /// 2. Ensure that there is no new definition between (\p StartMI, \p EndMI) /// and possible definition for \p RegNo is \p StartMI or \p EndMI. - /// 3. Ensure that all instructions between [\p StartMI, \p EndMI] are in same - /// basic block. + /// 3. We can do accurate fixup for the case when all instructions between + /// [\p StartMI, \p EndMI] are in same basic block. + /// 4. For the case when \p StartMI and \p EndMI are not in same basic block, + /// we conservatively clear kill flag for all uses of \p RegNo for pre-RA + /// and for post-RA, we give an assertion as without reaching definition + /// analysis post-RA, \p StartMI and \p EndMI are hard to keep right. 
void fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, unsigned RegNo) const; void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const; diff --git a/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir b/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir new file mode 100644 index 000000000000..be2671fa9b5d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir @@ -0,0 +1,21 @@ +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -start-before ppc-mi-peepholes \ +# RUN: -stop-after ppc-mi-peepholes %s -o - | FileCheck %s + +--- +name: test +#CHECK : name : test +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:g8rc = LI8 63 + + bb.1: + %3:gprc = COPY %2.sub_32:g8rc + ; CHECK: %4:gprc = LI 0 + %4:gprc = XORI killed %3:gprc, 63 + STW killed %4:gprc, %4:gprc, 100 + BLR8 implicit $lr8, implicit $rm +... From 3aec1c6a493f543d41caf99e5df9056776856941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 4 Aug 2020 14:41:48 +0300 Subject: [PATCH 108/363] [AArch64] [Windows] Error out on unsupported symbol locations These might occur in seemingly generic assembly. Previously when targeting COFF, they were silently ignored, which certainly won't give the right result. Instead clearly error out, to make it clear that the assembly needs to be adjusted for this target. Also change a preexisting report_fatal_error into a proper error message, pointing out the offending source instruction. This isn't strictly an internal error, as it can be triggered by user input. 
Differential Revision: https://reviews.llvm.org/D85242 (cherry picked from commit f5e6fbac24f198d075a7c4bc0879426e79040bcf) --- .../AArch64WinCOFFObjectWriter.cpp | 28 +++++++++++- llvm/test/MC/AArch64/coff-relocations-diags.s | 43 +++++++++++++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 llvm/test/MC/AArch64/coff-relocations-diags.s diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index aa50bd05cb71..aaadc8dc1b60 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -48,10 +49,33 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( : Target.getSymA()->getKind(); const MCExpr *Expr = Fixup.getValue(); + if (const AArch64MCExpr *A64E = dyn_cast(Expr)) { + AArch64MCExpr::VariantKind RefKind = A64E->getKind(); + switch (AArch64MCExpr::getSymbolLoc(RefKind)) { + case AArch64MCExpr::VK_ABS: + case AArch64MCExpr::VK_SECREL: + // Supported + break; + default: + Ctx.reportError(Fixup.getLoc(), "relocation variant " + + A64E->getVariantKindName() + + " unsupported on COFF targets"); + return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value + } + } + switch (static_cast(Fixup.getKind())) { default: { - const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); - report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + if (const AArch64MCExpr *A64E = dyn_cast(Expr)) { + Ctx.reportError(Fixup.getLoc(), "relocation type " + + A64E->getVariantKindName() + + " unsupported on COFF targets"); + } else { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + 
Ctx.reportError(Fixup.getLoc(), Twine("relocation type ") + Info.Name + + " unsupported on COFF targets"); + } + return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value } case FK_Data_4: diff --git a/llvm/test/MC/AArch64/coff-relocations-diags.s b/llvm/test/MC/AArch64/coff-relocations-diags.s new file mode 100644 index 000000000000..24869ce8349e --- /dev/null +++ b/llvm/test/MC/AArch64/coff-relocations-diags.s @@ -0,0 +1,43 @@ +// RUN: not llvm-mc -triple aarch64-win32 -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s + + adrp x0, :got:symbol + // CHECK: error: relocation variant :got: unsupported on COFF targets + // CHECK-NEXT: adrp x0, :got:symbol + // CHECK-NEXT: ^ + + ldr x0, [x0, :got_lo12:symbol] + // CHECK: error: relocation variant :got_lo12: unsupported on COFF targets + // CHECK-NEXT: ldr x0, [x0, :got_lo12:symbol] + // CHECK-NEXT: ^ + + adrp x0, :tlsdesc:symbol + // CHECK: error: relocation variant :tlsdesc: unsupported on COFF targets + // CHECK-NEXT: adrp x0, :tlsdesc:symbol + // CHECK-NEXT: ^ + add x0, x0, :tlsdesc_lo12:symbol + // CHECK: error: relocation variant :tlsdesc_lo12: unsupported on COFF targets + // CHECK-NEXT: add x0, x0, :tlsdesc_lo12:symbol + // CHECK-NEXT: ^ + + adrp x0, :gottprel:symbol + // CHECK: error: relocation variant :gottprel: unsupported on COFF targets + // CHECK-NEXT: adrp x0, :gottprel:symbol + // CHECK-NEXT: ^ + ldr x0, [x0, :gottprel_lo12:symbol] + // CHECK: error: relocation variant :gottprel_lo12: unsupported on COFF targets + // CHECK-NEXT: ldr x0, [x0, :gottprel_lo12:symbol] + // CHECK-NEXT: ^ + + add x0, x0, #:dtprel_hi12:symbol, lsl #12 + // CHECK: error: relocation variant :dtprel_hi12: unsupported on COFF targets + // CHECK-NEXT: add x0, x0, #:dtprel_hi12:symbol, lsl #12 + // CHECK-NEXT: ^ + add x0, x0, :dtprel_lo12:symbol + // CHECK: error: relocation variant :dtprel_lo12: unsupported on COFF targets + // CHECK-NEXT: add x0, x0, :dtprel_lo12:symbol + // CHECK-NEXT: ^ + + movz x0, #:abs_g0:symbol + // 
CHECK: error: relocation type :abs_g0: unsupported on COFF targets + // CHECK-NEXT: movz x0, #:abs_g0:symbol + // CHECK-NEXT: ^ From b067f5eb56684476b5dad4ebd8d6bc5291603d4e Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 6 Aug 2020 14:26:10 +0200 Subject: [PATCH 109/363] [GlobalISel][InlineAsm] Fix matching input constraint to physreg Add given input and mark it as tied. Doesn't create additional copy compared to matching input constraint to virtual register. Differential Revision: https://reviews.llvm.org/D85122 (cherry picked from commit d893278bba01b0e1209e8b8accbdd5cfa75a0932) --- .../CodeGen/GlobalISel/InlineAsmLowering.cpp | 18 +++++++++++------- .../GlobalISel/irtranslator-inline-asm.ll | 12 ++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp index 2ce1d414e755..1e2a82615da8 100644 --- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp @@ -455,19 +455,23 @@ bool InlineAsmLowering::lowerInlineAsm( unsigned DefRegIdx = InstFlagIdx + 1; Register Def = Inst->getOperand(DefRegIdx).getReg(); - // Copy input to new vreg with same reg class as Def - const TargetRegisterClass *RC = MRI->getRegClass(Def); ArrayRef SrcRegs = GetOrCreateVRegs(*OpInfo.CallOperandVal); assert(SrcRegs.size() == 1 && "Single register is expected here"); - Register Tmp = MRI->createVirtualRegister(RC); - if (!buildAnyextOrCopy(Tmp, SrcRegs[0], MIRBuilder)) - return false; - // Add Flag and input register operand (Tmp) to Inst. Tie Tmp to Def. + // When Def is physreg: use given input. + Register In = SrcRegs[0]; + // When Def is vreg: copy input to new vreg with same reg class as Def. + if (Def.isVirtual()) { + In = MRI->createVirtualRegister(MRI->getRegClass(Def)); + if (!buildAnyextOrCopy(In, SrcRegs[0], MIRBuilder)) + return false; + } + + // Add Flag and input register operand (In) to Inst. 
Tie In to Def. unsigned UseFlag = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, 1); unsigned Flag = InlineAsm::getFlagWordForMatchingOp(UseFlag, DefIdx); Inst.addImm(Flag); - Inst.addReg(Tmp); + Inst.addReg(In); Inst->tieOperands(DefRegIdx, Inst->getNumOperands() - 1); break; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll index f8b23ef84721..bfe96827dfe3 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll @@ -243,3 +243,15 @@ define i16 @test_anyext_input_with_matching_constraint() { %1 = call i16 asm sideeffect "", "=r,0"(i16 1) ret i16 %1 } + +define i64 @test_input_with_matching_constraint_to_physical_register() { + ; CHECK-LABEL: name: test_input_with_matching_constraint_to_physical_register + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: INLINEASM &"", 0 /* attdialect */, 10 /* regdef */, implicit-def $x2, 2147483657 /* reguse tiedto:$0 */, [[C]](tied-def 3)(s64) + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: $x0 = COPY [[COPY]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %1 = tail call i64 asm "", "={x2},0"(i64 0) + ret i64 %1 +} From 279922f108c26fe09667ba2525ab18b4735b28b2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 6 Aug 2020 12:34:16 -0700 Subject: [PATCH 110/363] [ELF] Change tombstone values to (.debug_ranges/.debug_loc) 1 and (other .debug_*) 0 tl;dr See D81784 for the 'tombstone value' concept. This patch changes our behavior to be almost the same as GNU ld (except that we also use 1 for .debug_loc): * .debug_ranges & .debug_loc: 1 (LLD<11: 0+addend; GNU ld uses 1 for .debug_ranges) * .debug_*: 0 (LLD<11: 0+addend; GNU ld uses 0; future LLD: -1) We make the tweaks because: 1) The new tombstone is novel and needs more time to be adopted by consumers before it's the default. 
2) The old (gold) strategy had problems with zero-length functions - so rather than going back that, we're going to the GNU ld strategy which doesn't have that problem. 3) One slight tweak to (2) is to apply the .debug_ranges workaround to .debug_loc for the same reasons it applies to debug_ranges - to avoid terminating lists early. ----- http://lists.llvm.org/pipermail/llvm-dev/2020-July/143482.html The tombstone value -1 in .debug_line caused problems to lldb (fixed by D83957; will be included in 11.0.0) and breakpad (fixed by https://crrev.com/c/2321300). It may potentially affects other DWARF consumers. For .debug_ranges & .debug_loc: 1, an argument preferring 1 (GNU ld for .debug_ranges) over -2 is that: ``` {-1, -2} <<< base address selection entry {0, length} <<< address range ``` may create a situation where low_pc is greater than high_pc. So we use 1, the GNU ld behavior for .debug_ranges For other .debug_* sections, there haven't been many reports. One issue is that bloaty (src/dwarf.cc) can incorrectly count address ranges in .debug_ranges . To reduce similar disruption, this patch changes the tombstone values to be similar to GNU ld. This does mean another behavior change to the default trunk behavior. Sorry about it. The default trunk behavior will be similar to release/11.x while we work on a transition plan for LLD users. 
Reviewed By: dblaikie, echristo Differential Revision: https://reviews.llvm.org/D84825 (cherry picked from commit 004be4037e1e9c6092323c5c9268acb3ecf9176c) --- lld/ELF/InputSection.cpp | 11 +++++++---- lld/test/ELF/dead-reloc-in-nonalloc.s | 2 +- lld/test/ELF/debug-dead-reloc-32.s | 8 ++++---- lld/test/ELF/debug-dead-reloc-icf.s | 2 +- lld/test/ELF/debug-dead-reloc-tls-arm.s | 2 +- lld/test/ELF/debug-dead-reloc-tls.s | 4 ++-- lld/test/ELF/debug-dead-reloc.s | 14 +++++++------- 7 files changed, 23 insertions(+), 20 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 7a7ebd974909..a6c97a3506ba 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -938,14 +938,17 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // the folded-in function, so exclude .debug_line. // // For pre-DWARF-v5 .debug_loc and .debug_ranges, -1 is a reserved value - // (base address selection entry), so -2 is used. + // (base address selection entry), use 1 (which is used by GNU ld for + // .debug_ranges). + // + // TODO To reduce disruption, we use 0 instead of -1 as the tombstone + // value. Enable -1 in a future release. auto *ds = dyn_cast(&sym); if (!sym.getOutputSection() || (ds && ds->section->repl != ds->section && !isDebugLine)) { // If -z dead-reloc-in-nonalloc= is specified, respect it. - const uint64_t value = - tombstone ? SignExtend64(*tombstone) - : (isDebugLocOrRanges ? UINT64_MAX - 1 : UINT64_MAX); + const uint64_t value = tombstone ? SignExtend64(*tombstone) + : (isDebugLocOrRanges ? 
1 : 0); target->relocateNoSym(bufLoc, type, value); continue; } diff --git a/lld/test/ELF/dead-reloc-in-nonalloc.s b/lld/test/ELF/dead-reloc-in-nonalloc.s index 00d3d2cbc4a8..9e93a0cf32af 100644 --- a/lld/test/ELF/dead-reloc-in-nonalloc.s +++ b/lld/test/ELF/dead-reloc-in-nonalloc.s @@ -11,7 +11,7 @@ # RUN: -z dead-reloc-in-nonalloc=.not_debug=0xbbbbbbbb %t.o -o - | cmp %t - # COMMON: Contents of section .debug_addr: -# COMMON-NEXT: 0000 [[ADDR:[0-9a-f]+]] 00000000 ffffffff ffffffff +# COMMON-NEXT: 0000 [[ADDR:[0-9a-f]+]] 00000000 00000000 00000000 # AA: Contents of section .debug_info: # AA-NEXT: 0000 [[ADDR]] 00000000 aaaaaaaa 00000000 diff --git a/lld/test/ELF/debug-dead-reloc-32.s b/lld/test/ELF/debug-dead-reloc-32.s index a7496798c580..b2708a744f28 100644 --- a/lld/test/ELF/debug-dead-reloc-32.s +++ b/lld/test/ELF/debug-dead-reloc-32.s @@ -8,11 +8,11 @@ # RUN: llvm-objdump -s %t | FileCheck %s # CHECK: Contents of section .debug_loc: -# CHECK-NEXT: 0000 feffffff +# CHECK-NEXT: 0000 01000000 # CHECK-NEXT: Contents of section .debug_ranges: -# CHECK-NEXT: 0000 feffffff +# CHECK-NEXT: 0000 01000000 # CHECK-NEXT: Contents of section .debug_addr: -# CHECK-NEXT: 0000 ffffffff +# CHECK-NEXT: 0000 00000000 .section .text.1,"axe" .byte 0 @@ -24,6 +24,6 @@ .section .debug_ranges .long .text.1+16 -## Resolved to UINT32_C(-1), with the addend ignored. +## Resolved to UINT32_C(0), with the addend ignored. 
.section .debug_addr .long .text.1+8 diff --git a/lld/test/ELF/debug-dead-reloc-icf.s b/lld/test/ELF/debug-dead-reloc-icf.s index 716e245c12c9..282838e3f216 100644 --- a/lld/test/ELF/debug-dead-reloc-icf.s +++ b/lld/test/ELF/debug-dead-reloc-icf.s @@ -9,7 +9,7 @@ # RUN: llvm-objdump -s %t | FileCheck %s # CHECK: Contents of section .debug_info: -# CHECK-NEXT: 0000 {{[0-9a-f]+}}000 00000000 ffffffff ffffffff +# CHECK-NEXT: 0000 {{[0-9a-f]+}}000 00000000 00000000 00000000 # CHECK: Contents of section .debug_line: # CHECK-NEXT: 0000 [[ADDR:[0-9a-f]+]] 00000000 # CHECK-SAME: [[ADDR]] 00000000 diff --git a/lld/test/ELF/debug-dead-reloc-tls-arm.s b/lld/test/ELF/debug-dead-reloc-tls-arm.s index 7fa5bcaae19e..3fad5306facc 100644 --- a/lld/test/ELF/debug-dead-reloc-tls-arm.s +++ b/lld/test/ELF/debug-dead-reloc-tls-arm.s @@ -7,7 +7,7 @@ # RUN: llvm-objdump -s %t | FileCheck %s # CHECK: Contents of section .debug_info: -# CHECK-NEXT: 0000 ffffffff +# CHECK-NEXT: 0000 00000000 .globl _start _start: diff --git a/lld/test/ELF/debug-dead-reloc-tls.s b/lld/test/ELF/debug-dead-reloc-tls.s index 1b26a920d3dd..066627738544 100644 --- a/lld/test/ELF/debug-dead-reloc-tls.s +++ b/lld/test/ELF/debug-dead-reloc-tls.s @@ -7,8 +7,8 @@ # RUN: llvm-objdump -s %t | FileCheck %s # CHECK: Contents of section .debug_info: -# CHECK-NEXT: 0000 ffffffff ffffffff ffffffff ffffffff -# CHECK-NEXT: 0010 ffffffff ffffffff +# CHECK-NEXT: 0000 00000000 00000000 00000000 00000000 +# CHECK-NEXT: 0010 00000000 ffffffff .globl _start _start: diff --git a/lld/test/ELF/debug-dead-reloc.s b/lld/test/ELF/debug-dead-reloc.s index d784519e9af4..e1adf4e2a25e 100644 --- a/lld/test/ELF/debug-dead-reloc.s +++ b/lld/test/ELF/debug-dead-reloc.s @@ -9,15 +9,15 @@ # RUN: llvm-objdump -s %t | FileCheck %s # CHECK: Contents of section .debug_loc: -# CHECK-NEXT: 0000 feffffff ffffffff feffffff ffffffff +# CHECK-NEXT: 0000 01000000 00000000 01000000 00000000 # CHECK-NEXT: Contents of section .debug_ranges: -# CHECK-NEXT: 0000 
feffffff ffffffff feffffff ffffffff +# CHECK-NEXT: 0000 01000000 00000000 01000000 00000000 # CHECK-NEXT: Contents of section .debug_addr: # CHECK-NEXT: 0000 {{.*}}000 00000000 {{.*}}000 00000000 -# CHECK-NEXT: 0010 ffffffff ffffffff {{.*}}000 00000000 +# CHECK-NEXT: 0010 00000000 00000000 {{.*}}000 00000000 # CHECK-NEXT: Contents of section .debug_foo: -# CHECK-NEXT: 0000 ffffffff ffffffff 08000000 00000000 -# CHECK-NEXT: 0010 ffffffff ffffffff 08000000 00000000 +# CHECK-NEXT: 0000 00000000 00000000 08000000 00000000 +# CHECK-NEXT: 0010 00000000 00000000 08000000 00000000 ## -z dead-reloc-in-nonalloc= can override the tombstone value. # RUN: ld.lld --gc-sections -z dead-reloc-in-nonalloc=.debug_loc=42 %t.o %t1.o %t1.o -o %t42 @@ -35,7 +35,7 @@ group: .byte 0 -## Resolved to UINT64_C(-2), with the addend ignored. +## Resolved to UINT64_C(1), with the addend ignored. ## UINT64_C(-1) is a reserved value (base address selection entry) which can't be used. .section .debug_loc .quad .text.1+8 @@ -44,7 +44,7 @@ group: .section .debug_addr ## .text.3 is a local symbol. The symbol defined in a non-prevailing group is -## discarded. Resolved to UINT64_C(-1). +## discarded. Resolved to UINT64_C(0). .quad .text.3+24 ## group is a non-local symbol. The relocation from the second %t1.o gets ## resolved to the prevailing copy. From f0c41f1d63627a29055474e6df73f78761ca8213 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Mon, 10 Aug 2020 12:38:24 +0200 Subject: [PATCH 111/363] [clangd] Release notes for 11.x --- clang-tools-extra/docs/ReleaseNotes.rst | 191 +++++++++++++++++++++++- 1 file changed, 190 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 0238ef5149b0..9f96d6eab38e 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -47,7 +47,196 @@ Major New Features Improvements to clangd ---------------------- -The improvements are... 
+Performance +^^^^^^^^^^^ + +- Eliminated long delays after adding/removing includes ("async preambles") + +- Faster indexing + +- Less memory used to index headers used by open files ("dynamic index") + +- Many requests are implicitly cancelled rather than queued when the file is + edited, preventing a backlog + +- Background indexing can be selectively disabled per-path through config + +Selecting and targeting +^^^^^^^^^^^^^^^^^^^^^^^ + +- Improved understanding and selection around broken code ("recovery AST") + +- Operations like "go-to-definition" will target things on the left of the + cursor, if there is nothing eligible on the right. + +- Arguments to ``assert()``-like macros can be properly selected. + +Diagnostics +^^^^^^^^^^^ + +- When a header is saved, diagnostics for files that use it are updated. + +- Calls ``std::make_unique`` produce diagnostics for the constructor call. + (Template functions *in general* are not expanded for performance reasons). + +- Diagnostics update more quickly for files that build quickly (no 500ms delay) + +- Automatic fixes are offered even when they affect macro arguments. + +- Warnings from included headers are not shown (but errors still are). + +- A handful of high-quality clang-tidy checks are enabled by default: + + - readability-misleading-indentation, + + - readability-deleted-default, + + - bugprone-integer-division, + + - bugprone-sizeof-expression, + + - bugprone-suspicious-missing-comma, + + - bugprone-unused-raii, + + - bugprone-unused-return-value, + + - misc-unused-using-decls, + + - misc-unused-alias-decls, + + - misc-definitions-in-headers + +Refactorings +^^^^^^^^^^^^ + +- Rename applies across the project, using the index. + +- Accuracy of rename improved in many places. + +- New refactoring: add using declaration for qualified name. + +- New refactoring: move function definition out-of-line. + +Code completion +^^^^^^^^^^^^^^^ + +- Function call parentheses are not inserted if they already exist. 
+ +- Completion of ``#include`` filenames triggers earlier (after ``<``, ``"``, and + ``/``) and is less aggressive about replacing existing text. + +- Documentation is reflowed in the same way as on hover. + +Go-to-definition +^^^^^^^^^^^^^^^^ + +- Dependent names in templates may be heuristically resolved + +- Identifiers in comments may be resolved using other occurrences in the file + or in the index. + +- Go-to-definition on an ``override`` or ``final`` specifier jumps to the + overridden method. + +Hover +^^^^^ + +- Expressions passed as function arguments show parameter name, conversions etc. + +- Members now include the access specifier in the displayed declaration. + +- Classes and fields show memory layout information (size and offset). + +- Somewhat improved understanding of formatting in documentation comments. + +- Trivial inline getters/setters are implicitly documented as such. + +Highlighting +^^^^^^^^^^^^ + +- The ``semanticTokens`` protocol from LSP 3.16 is supported. + (Only token types are exposed, no modifiers yet). + +- The non-standard ``textDocument/semanticHighlighting`` notification is + deprecated and will be removed in clangd 12. + +- Placing the cursor on a control flow keyword highlights related flow + (e.g. ``break`` -> ``for``). + +Language support +^^^^^^^^^^^^^^^^ + +- clangd features now work inside templates on windows. + (MSVC-compatible delayed-template-parsing is no longer used). + +- Objective-C properties can be targeted and cross-references are indexed. + +- Field names in designated initializers (C++20) can be targeted, and code + completion works in many cases. + +- ``goto`` labels: go-to-defintion, cross-references, and rename all work. + +- Concepts (C++20): go-to-definition on concept names, and some limited code + completion support for concept members. + +System integration +^^^^^^^^^^^^^^^^^^ + +- The project index is now written to ``$PROJECT/.cache/clangd/index``. 
+  ``$PROJECT/.clangd`` is now expected to be a configuration file.
+
+  Old ``$PROJECT/.clangd`` directories can safely be deleted.
+
+  We recommend including both ``.cache/`` and ``.clangd/`` (with trailing slash)
+  in ``.gitignore``, for backward-compatibility with earlier releases of clangd.
+
+- For non-project files (those without a compilation database), the index
+  location better reflects OS conventions:
+
+  - ``%LocalAppData%\clangd\index`` on Windows
+
+  - ``$(getconf DARWIN_USER_CACHE_DIR)/clangd/index`` on Mac
+
+  - ``$XDG_CACHE_HOME/clangd/index`` or ``~/.cache/clangd/index`` on others
+
+  Old ``~/.clangd/index`` directories can safely be deleted.
+
+- clangd now reads configuration from ``.clangd`` files inside your project,
+  and from a user configuration file in an OS-specific location:
+
+  - ``%LocalAppData%\clangd\config.yaml`` on Windows
+
+  - ``~/Library/clangd/config.yaml`` on Mac
+
+  - ``$XDG_CONFIG_HOME/clangd/config.yaml`` or ``~/.config/clangd/config.yaml``
+    on others
+
+  See `clangd configuration format <https://clangd.llvm.org/config.html>`_.
+
+- clangd will search for compilation databases (``compile_commands.json``) in
+  a ``build/`` subdirectory, as well as in the project root.
+  This follows CMake conventions, avoiding the need for a symlink in many cases.
+
+- Compile flags can be selectively modified per-path, using configuration.
+
+- Improved filtering of unhelpful compile flags (such as those relating to
+  pre-compiled headers).
+
+- Improved detection of standard library headers location.
+
+Miscellaneous
+^^^^^^^^^^^^^
+
+- Background indexing status is reported using LSP 3.15 progress events
+  (``window/workDoneProgress/create``).
+
+- Infrastructure for gathering internal metrics.
+  (Off by default, set ``$CLANGD_METRICS`` to generate a named CSV file).
+
+- Document versions are now tracked, version is reported along with diagnostics.
+
+- Too many stability and correctness fixes to mention.
Improvements to clang-doc ------------------------- From a450654a52874b094c264e0366c31126c03fdf2d Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Mon, 10 Aug 2020 16:45:11 +0200 Subject: [PATCH 112/363] [clangd] Fix error in release notes --- clang-tools-extra/docs/ReleaseNotes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 9f96d6eab38e..83ae2c6605fd 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -207,7 +207,7 @@ System integration - ``%LocalAppData%\clangd\config.yaml`` on Windows - - ``~/Library/clangd/config.yaml`` on Mac + - ``~/Library/Preferences/clangd/config.yaml`` on Mac - ``$XDG_CONFIG_HOME/clangd/config.yaml`` or ``~/.config/clangd/config.yaml`` on others From ff47911ddfc10d023ef0debf229a60c9fce9443a Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 12 Aug 2020 16:53:02 -0700 Subject: [PATCH 113/363] PR47143: Don't crash while constant-evaluating value-initialization of an array of unknown bound as the initializer of an array new expression. 
(cherry picked from commit bd08e0cf1cb1f1f294e4253ba5907ec4c81b05fe) --- clang/lib/AST/ExprConstant.cpp | 18 ++++++++++++++---- .../test/SemaCXX/constant-expression-cxx2a.cpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 41a4ae4b91c8..8367ffc6f48c 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -8974,6 +8974,7 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { const Expr *Init = E->getInitializer(); const InitListExpr *ResizedArrayILE = nullptr; const CXXConstructExpr *ResizedArrayCCE = nullptr; + bool ValueInit = false; QualType AllocType = E->getAllocatedType(); if (Optional ArraySize = E->getArraySize()) { @@ -9017,7 +9018,14 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { // -- the new-initializer is a braced-init-list and the number of // array elements for which initializers are provided [...] // exceeds the number of elements to initialize - if (Init && !isa(Init)) { + if (!Init) { + // No initialization is performed. + } else if (isa(Init) || + isa(Init)) { + ValueInit = true; + } else if (auto *CCE = dyn_cast(Init)) { + ResizedArrayCCE = CCE; + } else { auto *CAT = Info.Ctx.getAsConstantArrayType(Init->getType()); assert(CAT && "unexpected type for array initializer"); @@ -9040,8 +9048,6 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { // special handling for this case when we initialize. 
if (InitBound != AllocBound) ResizedArrayILE = cast(Init); - } else if (Init) { - ResizedArrayCCE = cast(Init); } AllocType = Info.Ctx.getConstantArrayType(AllocType, ArrayBound, nullptr, @@ -9102,7 +9108,11 @@ bool PointerExprEvaluator::VisitCXXNewExpr(const CXXNewExpr *E) { return false; } - if (ResizedArrayILE) { + if (ValueInit) { + ImplicitValueInitExpr VIE(AllocType); + if (!EvaluateInPlace(*Val, Info, Result, &VIE)) + return false; + } else if (ResizedArrayILE) { if (!EvaluateArrayNewInitList(Info, Result, *Val, ResizedArrayILE, AllocType)) return false; diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp index f66f380b635f..344797bafb11 100644 --- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp @@ -950,6 +950,20 @@ namespace dynamic_alloc { p = new ((std::align_val_t)n) char[n]; p = new char(n); } + + namespace PR47143 { + constexpr char *f(int n) { + return new char[n](); + } + const char *p = f(3); + constexpr bool test() { + char *p = f(3); + bool result = !p[0] && !p[1] && !p[2]; + delete [] p; + return result; + } + static_assert(test()); + } } struct placement_new_arg {}; From e6ec96f4215a4f5302e4dd5d0ac287a1b0563586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 17 Aug 2020 11:17:15 +0300 Subject: [PATCH 114/363] [docs] Add release notes for the 11.x release --- clang/docs/ReleaseNotes.rst | 10 ++++++++++ lld/docs/ReleaseNotes.rst | 9 +++++++-- llvm/docs/ReleaseNotes.rst | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3264846506c6..6f336088750f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -191,6 +191,16 @@ Attribute Changes in Clang Windows Support --------------- +- Don't warn about `ms_struct may not produce Microsoft-compatible layouts + for classes with base 
classes or virtual functions` if the option is + enabled globally, as opposed to enabled on a specific class/struct or + on a specific section in the source files. This avoids needing to + couple `-mms-bitfields` with `-Wno-incompatible-ms-struct` if building + C++ code. + +- Enable `-mms-bitfields` by default for MinGW targets, matching a similar + change in GCC 4.7. + C Language Changes in Clang --------------------------- diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index fe3de8306cd8..f0482c2428c4 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -40,12 +40,17 @@ Breaking changes COFF Improvements ----------------- -* ... +* Fixed exporting symbols whose names contain a period (``.``), which was + a regression in lld 7. MinGW Improvements ------------------ -* ... +* Implemented new options for disabling auto import and runtime pseudo + relocations (``--disable-auto-import`` and + ``--disable-runtime-pseudo-reloc``), the ``--no-seh`` flag and options + for selecting file and section alignment (``--file-alignment`` and + ``--section-alignment``). MachO Improvements ------------------ diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index ed1718a95054..c9ac61d29676 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -81,6 +81,15 @@ Changes to the LLVM IR Changes to building LLVM ------------------------ +Changes to the AArch64 Backend +------------------------------ + +* Back up and restore x18 in functions with windows calling convention on + non-windows OSes. + +* Clearly error out on unsupported relocations when targeting COFF, instead + of silently accepting some (without being able to do what was requested). + Changes to the ARM Backend -------------------------- @@ -157,6 +166,12 @@ Changes to the WebAssembly Target * `__attribute__((visibility("protected")))` now evokes a warning, as WebAssembly does not support "protected" visibility. 
+Changes to the Windows Target +----------------------------- + +* Produce COFF weak external symbols for IR level weak symbols without a comdat + (e.g. for `__attribute__((weak))` in C) + Changes to the OCaml bindings ----------------------------- @@ -195,6 +210,12 @@ Changes to the LLVM tools symbols, i.e. mapping symbols on ARM and AArch64, by default. This matches the GNU nm behavior. +* llvm-rc now tolerates -1 as menu item ID, supports the language id option + and allows string table values to be split into multiple string literals + +* llvm-lib supports adding import library objects in addition to regular + object files + Changes to LLDB =============== From 3d0470ae802ae443a2df39f495d2ddd92805c8f8 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 4 Aug 2020 10:58:47 +0100 Subject: [PATCH 115/363] [AArch64][SVE] Fix CFA calculation in presence of SVE objects. The CFA is calculated as (SP/FP + offset), but when there are SVE objects on the stack the SP offset is partly scalable and should instead be expressed as the DWARF expression: SP + offset + scalable_offset * VG where VG is the Vector Granule register, containing the number of 64bits 'granules' in a scalable vector. 
Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D84043 (cherry picked from commit fd6584a22043b254a323635c142b28ce80ae5b5b) --- llvm/include/llvm/MC/MCDwarf.h | 15 +- .../CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 1 + .../Target/AArch64/AArch64FrameLowering.cpp | 74 ++++++- .../lib/Target/AArch64/AArch64FrameLowering.h | 7 + .../lib/Target/AArch64/AArch64RegisterInfo.td | 3 + llvm/lib/Target/AArch64/AArch64StackOffset.h | 12 ++ llvm/test/CodeGen/AArch64/framelayout-sve.mir | 186 +++++++++++++++--- llvm/test/CodeGen/AArch64/sve-trunc.ll | 2 +- 8 files changed, 267 insertions(+), 33 deletions(-) diff --git a/llvm/include/llvm/MC/MCDwarf.h b/llvm/include/llvm/MC/MCDwarf.h index e3cea0ae64cf..70da5f76e766 100644 --- a/llvm/include/llvm/MC/MCDwarf.h +++ b/llvm/include/llvm/MC/MCDwarf.h @@ -467,10 +467,12 @@ class MCCFIInstruction { unsigned Register2; }; std::vector Values; + std::string Comment; - MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V) + MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V, + StringRef Comment = "") : Operation(Op), Label(L), Register(R), Offset(O), - Values(V.begin(), V.end()) { + Values(V.begin(), V.end()), Comment(Comment) { assert(Op != OpRegister); } @@ -570,8 +572,9 @@ class MCCFIInstruction { /// .cfi_escape Allows the user to add arbitrary bytes to the unwind /// info. 
- static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals) { - return MCCFIInstruction(OpEscape, L, 0, 0, Vals); + static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, + StringRef Comment = "") { + return MCCFIInstruction(OpEscape, L, 0, 0, Vals, Comment); } /// A special wrapper for .cfi_escape that indicates GNU_ARGS_SIZE @@ -606,6 +609,10 @@ class MCCFIInstruction { assert(Operation == OpEscape); return StringRef(&Values[0], Values.size()); } + + StringRef getComment() const { + return Comment; + } }; struct MCDwarfFrameInfo { diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index d81a9be26d39..b6a9a9568360 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -241,6 +241,7 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const { OutStreamer->emitCFIGnuArgsSize(Inst.getOffset()); break; case MCCFIInstruction::OpEscape: + OutStreamer->AddComment(Inst.getComment()); OutStreamer->emitCFIEscape(Inst.getValues()); break; case MCCFIInstruction::OpRestore: diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4789a9f02937..177d5e24fdb3 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -148,6 +148,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LEB128.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -399,6 +400,64 @@ static bool ShouldSignReturnAddress(MachineFunction &MF) { return false; } +// Convenience function to create a DWARF expression for +// Expr + NumBytes + NumVGScaledBytes * AArch64::VG +static void appendVGScaledOffsetExpr(SmallVectorImpl &Expr, + int NumBytes, int NumVGScaledBytes, unsigned VG, + 
llvm::raw_string_ostream &Comment) { + uint8_t buffer[16]; + + if (NumBytes) { + Expr.push_back(dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); + } + + if (NumVGScaledBytes) { + Expr.push_back((uint8_t)dwarf::DW_OP_consts); + Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); + + Expr.push_back((uint8_t)dwarf::DW_OP_bregx); + Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); + Expr.push_back(0); + + Expr.push_back((uint8_t)dwarf::DW_OP_mul); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + + Comment << (NumVGScaledBytes < 0 ? " - " : " + ") + << std::abs(NumVGScaledBytes) << " * VG"; + } +} + +// Creates an MCCFIInstruction: +// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } +MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( + const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const { + int64_t NumBytes, NumVGScaledBytes; + OffsetFromSP.getForDwarfOffset(NumBytes, NumVGScaledBytes); + + std::string CommentBuffer = "sp"; + llvm::raw_string_ostream Comment(CommentBuffer); + + // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> Expr; + Expr.push_back(dwarf::DW_OP_breg0 + /*SP*/ 31); + Expr.push_back(0); + appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_def_cfa. 
+ SmallString<64> DefCfaExpr; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + uint8_t buffer[16]; + DefCfaExpr.append(buffer, + buffer + encodeULEB128(Expr.size(), buffer)); + DefCfaExpr.append(Expr.str()); + return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), + Comment.str()); +} + void AArch64FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); @@ -1383,9 +1442,18 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { - // Encode the stack size of the leaf function. - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); + unsigned CFIIndex; + if (SVEStackSize) { + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + StackOffset TotalSize = + SVEStackSize + StackOffset((int64_t)MFI.getStackSize(), MVT::i8); + CFIIndex = MF.addFrameInst(createDefCFAExpressionFromSP(TRI, TotalSize)); + } else { + // Encode the stack size of the leaf function. 
+ CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize())); + } BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 444740cb50ab..753593df2b4d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -18,6 +18,8 @@ namespace llvm { +class MCCFIInstruction; + class AArch64FrameLowering : public TargetFrameLowering { public: explicit AArch64FrameLowering() @@ -119,6 +121,11 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; + MCCFIInstruction + createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI, + const StackOffset &OffsetFromSP) const; + MCCFIInstruction createCfaOffset(const MCRegisterInfo &MRI, unsigned DwarfReg, + const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; }; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index bd05c56009a1..54b351fda053 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -133,6 +133,9 @@ def NZCV : AArch64Reg<0, "nzcv">; // First fault status register def FFR : AArch64Reg<0, "ffr">, DwarfRegNum<[47]>; +// Purely virtual Vector Granule (VG) Dwarf register +def VG : AArch64Reg<0, "vg">, DwarfRegNum<[46]>; + // GPR register classes with the intersections of GPR32/GPR32sp and // GPR64/GPR64sp for use by the coalescer. 
def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h index 6fa1c744f77e..24751a81797d 100644 --- a/llvm/lib/Target/AArch64/AArch64StackOffset.h +++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h @@ -123,6 +123,18 @@ class StackOffset { } } + void getForDwarfOffset(int64_t &ByteSized, int64_t &VGSized) const { + assert(isValid() && "Invalid frame offset"); + + // VGSized offsets are divided by '2', because the VG register is the + // the number of 64bit granules as opposed to 128bit vector chunks, + // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. + // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. + // VG = n * 2 and the dwarf offset must be VG * 8 bytes. + ByteSized = Bytes; + VGSized = ScalableBytes / 2; + } + /// Returns whether the offset is known zero. explicit operator bool() const { return Bytes || ScalableBytes; } diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 668b243dd79e..9e2077855c11 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -1,4 +1,8 @@ # RUN: llc -mattr=+sve -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -start-before=prologepilog %s -filetype=obj -o %t +# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO +# RUN: rm -rf %t # # Test allocation and deallocation of SVE objects on the stack, # as well as using a combination of scalable and non-scalable @@ -23,19 +27,19 @@ # --- | - define void @test_allocate_sve() nounwind { entry: unreachable } - define void @test_allocate_sve_gpr_callee_saves() nounwind { entry: unreachable 
} - define void @test_allocate_sve_gpr_realigned() nounwind { entry: unreachable } - define void @test_address_sve() nounwind { entry: unreachable } - define void @test_address_sve_fp() nounwind { entry: unreachable } - define void @test_stack_arg_sve() nounwind { entry: unreachable } - define void @test_address_sve_out_of_range() nounwind { entry: unreachable } - define void @test_address_gpr_vla() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_pregs_sve() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_zregs_sve() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_sve() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @save_restore_sve_realign() nounwind { entry: unreachable } - define aarch64_sve_vector_pcs void @frame_layout() nounwind { entry: unreachable } + define void @test_allocate_sve() { entry: unreachable } + define void @test_allocate_sve_gpr_callee_saves() { entry: unreachable } + define void @test_allocate_sve_gpr_realigned() { entry: unreachable } + define void @test_address_sve() { entry: unreachable } + define void @test_address_sve_fp() { entry: unreachable } + define void @test_stack_arg_sve() { entry: unreachable } + define void @test_address_sve_out_of_range() { entry: unreachable } + define void @test_address_gpr_vla() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_pregs_sve() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_zregs_sve() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_sve() { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_sve_realign() { entry: unreachable } + define aarch64_sve_vector_pcs void @frame_layout() { entry: unreachable } ... 
# +----------+ @@ -54,11 +58,19 @@ # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: test_allocate_sve stack: - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } @@ -85,6 +97,8 @@ body: | # CHECK-NEXT: frame-setup STPXi killed $x21, killed $x20, $sp, 2 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-4: frame-setup CFI_INSTRUCTION +# # CHECK-NEXT: $x20 = IMPLICIT_DEF # CHECK-NEXT: $x21 = IMPLICIT_DEF # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 @@ -92,6 +106,17 @@ body: | # CHECK-NEXT: $x21, $x20 = frame-destroy LDPXi $sp, 2 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 32 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_allocate_sve_gpr_callee_saves: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG +# ASM-NEXT: .cfi_offset w20, -8 +# ASM-NEXT: .cfi_offset w21, -16 +# ASM-NEXT: .cfi_offset w29, -32 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg20 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg21 -16 +# 
UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 name: test_allocate_sve_gpr_callee_saves stack: - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } @@ -120,9 +145,20 @@ body: | # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-COUNT-3: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_allocate_sve_gpr_realigned: +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: test_allocate_sve_gpr_realigned stack: - { id: 0, stack-id: sve-vec, size: 18, alignment: 2 } @@ -149,6 +185,7 @@ body: | # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16 # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 2 @@ -161,6 +198,14 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_address_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: test_address_sve frameInfo: maxAlignment: 16 @@ -199,6 +244,7 @@ body: | # CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # 
CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-3: frame-setup CFI_INSTRUCTION # CHECK-NEXT: STR_ZXI $z0, $fp, -1 # CHECK-NEXT: STR_ZXI $z1, $fp, -2 @@ -208,6 +254,15 @@ body: | # CHECK: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_address_sve_fp: +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: test_address_sve_fp frameInfo: maxAlignment: 16 @@ -244,6 +299,7 @@ body: | # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK: $[[TMP:x[0-9]+]] = ADDVL_XXI $sp, 1 # CHECK-NEXT: $x0 = LDRXui killed $[[TMP]], 4 @@ -252,6 +308,14 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_stack_arg_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: test_stack_arg_sve fixedStack: - { id: 0, stack-id: default, size: 16, alignment: 16, offset: 0 } @@ -292,6 +356,7 @@ body: | # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -32 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK-NEXT: $[[TMP2:x[0-9]+]] = ADDVL_XXI $sp, 1 # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP2]], 255 @@ -310,6 
+375,13 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 9 # CHECK-NEXT: $sp, $[[SCRATCH]] = frame-destroy LDRXpost $sp, 16 # CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: test_address_sve_out_of_range: +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2056 * VG +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +2056, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: test_address_sve_out_of_range frameInfo: maxAlignment: 16 @@ -344,6 +416,17 @@ body: | # CHECK: bb.0.entry: # CHECK: STRXui $xzr, $x19, 0 # CHECK: RET_ReallyLR +# +# ASM-LABEL: test_address_gpr_vla: +# ASM: .cfi_def_cfa w29, 32 +# ASM-NEXT: .cfi_offset w19, -16 +# ASM-NEXT: .cfi_offset w30, -24 +# ASM-NEXT: .cfi_offset w29, -32 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +32 +# UNWINDINFO-NEXT: DW_CFA_offset: reg19 -16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -24 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 name: test_address_gpr_vla frameInfo: maxAlignment: 16 @@ -366,6 +449,7 @@ body: | # CHECK: frame-setup STR_PXI killed $p5, $sp, 6 # CHECK: frame-setup STR_PXI killed $p4, $sp, 7 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-COUNT-5: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK: $p6 = frame-destroy LDR_PXI $sp, 5 @@ -373,6 +457,15 @@ body: | # CHECK: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK: RET_ReallyLR +# +# ASM-LABEL: save_restore_pregs_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 8 * VG +# ASM-COUNT-3: .cfi_offset +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-COUNT-3: 
DW_CFA_offset +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_pregs_sve stack: - { id: 0, stack-id: default, size: 32, alignment: 16 } @@ -387,18 +480,29 @@ body: | --- ... # CHECK-LABEL: name: save_restore_zregs_sve -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -3 -# CHECK: frame-setup STR_ZXI killed $z10, $sp, 0 -# CHECK: frame-setup STR_ZXI killed $z9, $sp, 1 -# CHECK: frame-setup STR_ZXI killed $z8, $sp, 2 -# CHECK: $sp = frame-setup SUBXri $sp, 32, 0 - -# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 -# CHECK: $z10 = frame-destroy LDR_ZXI $sp, 0 -# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 1 -# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 2 -# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 3 -# CHECK: RET_ReallyLR +# CHECK: $sp = frame-setup STRXpre killed $fp, $sp, -16 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 +# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 +# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 +# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-COUNT-5: frame-setup CFI_INSTRUCTION + +# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 +# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 +# CHECK-NEXT: $sp, $fp = frame-destroy LDRXpost $sp, 16 +# CHECK-NEXT: RET_ReallyLR +# +# ASM-LABEL: save_restore_zregs_sve: +# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 24 * VG +# ASM-COUNT-3: .cfi_offset +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-COUNT-3: DW_CFA_offset +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_zregs_sve stack: - { id: 0, stack-id: default, size: 32, alignment: 16 } @@ -432,6 +536,7 @@ body: | # CHECK: 
frame-setup STR_ZXI killed $z8, $sp, 17 # CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-COUNT-33: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 @@ -447,6 +552,22 @@ body: | # CHECK: $x20, $x19 = frame-destroy LDPXi $sp, 2 # CHECK: $sp, ${{[a-z0-9]+}}, $x21 = frame-destroy LDPXpost $sp, 4 # CHECK: RET_ReallyLR +# +# ASM-LABEL: save_restore_sve: +# ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG +# ASM-COUNT-28: .cfi_offset +# ASM-NEXT: .cfi_offset w19, -8 +# ASM-NEXT: .cfi_offset w20, -16 +# ASM-NEXT: .cfi_offset w21, -24 +# ASM-NEXT: .cfi_offset w29, -32 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-COUNT-28: DW_CFA_offset +# UNWINDINFO-NEXT: DW_CFA_offset: reg19 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg20 -16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg21 -24 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 + name: save_restore_sve stack: - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } @@ -499,6 +620,7 @@ body: | # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-COUNT-31: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 @@ -512,6 +634,11 @@ body: | # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-COUNT-28: DW_CFA_offset +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_sve_realign stack: - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } @@ 
-586,6 +713,15 @@ body: | # CHECK-NEXT: STR_ZXI killed $z23, $sp, 1 # CHECK-NEXT: STR_ZXI killed $z8, $sp, 2 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -7 +# CHECK-COUNT-6: frame-setup CFI_INSTRUCTION +# ASM-LABEL: frame_layout: +# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 80 * VG +# ASM-COUNT-4: .cfi_offset +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +80, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-COUNT-4: DW_CFA_offset +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: frame_layout stack: - { id: 0, type: default, size: 32, alignment: 16, stack-id: sve-vec } diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index af50176f6b10..191df22eda50 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -116,7 +116,7 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset p4, -2 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d From 8a755a4c3ee7738bbeea92829dfc2876445ff366 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 4 Aug 2020 11:10:32 +0100 Subject: [PATCH 116/363] [AArch64][SVE] Add missing unwind info for SVE registers. This patch adds a CFI entry for each SVE callee saved register that needs unwind info at an offset from the CFA. The offset is a DWARF expression because the offset is partly scalable. The CFI entries only cover a subset of the SVE callee-saves and only encodes the lower 64-bits, thus implementing the lowest common denominator ABI. 
Existing unwinders may support VG but only restore the lower 64-bits. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D84044 (cherry picked from commit bb3344c7d8c2703c910dd481ada43ecaf11536a6) --- .../Target/AArch64/AArch64FrameLowering.cpp | 60 +++++++++++-- .../lib/Target/AArch64/AArch64FrameLowering.h | 2 +- .../Target/AArch64/AArch64RegisterInfo.cpp | 23 +++++ llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 1 + llvm/test/CodeGen/AArch64/framelayout-sve.mir | 89 +++++++++++++------ llvm/test/CodeGen/AArch64/sve-trunc.ll | 1 - 6 files changed, 142 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 177d5e24fdb3..30666009801c 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -458,12 +458,44 @@ MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( Comment.str()); } +MCCFIInstruction AArch64FrameLowering::createCfaOffset( + const TargetRegisterInfo &TRI, unsigned Reg, + const StackOffset &OffsetFromDefCFA) const { + int64_t NumBytes, NumVGScaledBytes; + OffsetFromDefCFA.getForDwarfOffset(NumBytes, NumVGScaledBytes); + + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + + // Non-scalable offsets can use DW_CFA_offset directly. 
+ if (!NumVGScaledBytes) + return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); + + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + Comment << printReg(Reg, &TRI) << " @ cfa"; + + // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) + SmallString<64> OffsetExpr; + appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, + TRI.getDwarfRegNum(AArch64::VG, true), Comment); + + // Wrap this into DW_CFA_expression + SmallString<64> CfaExpr; + CfaExpr.push_back(dwarf::DW_CFA_expression); + uint8_t buffer[16]; + CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); + CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); + CfaExpr.append(OffsetExpr.str()); + + return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); +} + void AArch64FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetSubtargetInfo &STI = MF.getSubtarget(); - const MCRegisterInfo *MRI = STI.getRegisterInfo(); + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); const TargetInstrInfo *TII = STI.getInstrInfo(); DebugLoc DL = MBB.findDebugLoc(MBBI); @@ -474,11 +506,26 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = - MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + + // Not all unwinders may know about SVE registers, so assume the lowest + // common demoninator. 
+ unsigned NewReg; + if (static_cast(TRI)->regNeedsCFI(Reg, NewReg)) + Reg = NewReg; + else + continue; + + StackOffset Offset; + if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::SVEVector) { + AArch64FunctionInfo *AFI = MF.getInfo(); + Offset = StackOffset(MFI.getObjectOffset(Info.getFrameIdx()), MVT::nxv1i8) - + StackOffset(AFI->getCalleeSavedStackSize(MFI), MVT::i8); + } else { + Offset = {MFI.getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea(), + MVT::i8}; + } + unsigned CFIIndex = MF.addFrameInst(createCfaOffset(*TRI, Reg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -2074,6 +2121,7 @@ static void computeCalleeSaveRegisterPairs( // available unwind codes. This flag assures that the alignment fixup is done // only once, as intened. bool FixupDone = false; + for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 753593df2b4d..1ca8c3e9e2bf 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -124,7 +124,7 @@ class AArch64FrameLowering : public TargetFrameLowering { MCCFIInstruction createDefCFAExpressionFromSP(const TargetRegisterInfo &TRI, const StackOffset &OffsetFromSP) const; - MCCFIInstruction createCfaOffset(const MCRegisterInfo &MRI, unsigned DwarfReg, + MCCFIInstruction createCfaOffset(const TargetRegisterInfo &MRI, unsigned DwarfReg, const StackOffset &OffsetFromDefCFA) const; bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, unsigned StackBumpBytes) const; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 83a488afc797..62cc865fd1c3 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -40,6 
+40,29 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) AArch64_MC::initLLVMToCVRegMapping(this); } +/// Return whether the register needs a CFI entry. Not all unwinders may know +/// about SVE registers, so we assume the lowest common denominator, i.e. the +/// callee-saves required by the base ABI. For the SVE registers z8-z15 only the +/// lower 64-bits (d8-d15) need to be saved. The lower 64-bits subreg is +/// returned in \p RegToUseForCFI. +bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg, + unsigned &RegToUseForCFI) const { + if (AArch64::PPRRegClass.contains(Reg)) + return false; + + if (AArch64::ZPRRegClass.contains(Reg)) { + RegToUseForCFI = getSubReg(Reg, AArch64::dsub); + for (int I = 0; CSR_AArch64_AAPCS_SaveList[I]; ++I) { + if (CSR_AArch64_AAPCS_SaveList[I] == RegToUseForCFI) + return true; + } + return false; + } + + RegToUseForCFI = Reg; + return true; +} + static bool hasSVEArgsOrReturn(const MachineFunction *MF) { const Function &F = MF->getFunction(); return isa(F.getReturnType()) || diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 22a8ba76c611..91064787d3da 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -122,6 +122,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { MachineFunction &MF) const override; unsigned getLocalAddressRegister(const MachineFunction &MF) const; + bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const; }; } // end namespace llvm diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 9e2077855c11..e6eb9e7a3d3e 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -449,7 +449,7 @@ body: | # CHECK: frame-setup STR_PXI killed $p5, $sp, 6 # CHECK: frame-setup STR_PXI killed $p4, $sp, 7 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 -# 
CHECK-COUNT-5: frame-setup CFI_INSTRUCTION +# CHECK-COUNT-2: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK: $p6 = frame-destroy LDR_PXI $sp, 5 @@ -460,11 +460,9 @@ body: | # # ASM-LABEL: save_restore_pregs_sve: # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 8 * VG -# ASM-COUNT-3: .cfi_offset # ASM-NEXT: .cfi_offset w29, -16 # # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -# UNWINDINFO-COUNT-3: DW_CFA_offset # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_pregs_sve stack: @@ -498,11 +496,16 @@ body: | # # ASM-LABEL: save_restore_zregs_sve: # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 24 * VG -# ASM-COUNT-3: .cfi_offset -# -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -# UNWINDINFO-COUNT-3: DW_CFA_offset -# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG + +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: 
DW_CFA_expression: reg74 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 + name: save_restore_zregs_sve stack: - { id: 0, stack-id: default, size: 32, alignment: 16 } @@ -536,7 +539,7 @@ body: | # CHECK: frame-setup STR_ZXI killed $z8, $sp, 17 # CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 -# CHECK-COUNT-33: frame-setup CFI_INSTRUCTION +# CHECK-COUNT-13: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 @@ -555,18 +558,32 @@ body: | # # ASM-LABEL: save_restore_sve: # ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG -# ASM-COUNT-28: .cfi_offset +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 32 - 40 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG # ASM-NEXT: .cfi_offset w19, -8 # ASM-NEXT: .cfi_offset w20, -16 # ASM-NEXT: .cfi_offset w21, -24 # ASM-NEXT: 
.cfi_offset w29, -32 # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -# UNWINDINFO-COUNT-28: DW_CFA_offset -# UNWINDINFO-NEXT: DW_CFA_offset: reg19 -8 -# UNWINDINFO-NEXT: DW_CFA_offset: reg20 -16 -# UNWINDINFO-NEXT: DW_CFA_offset: reg21 -24 -# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -40, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg19 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg20 -16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg21 -24 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 name: save_restore_sve stack: @@ -620,7 +637,7 @@ body: | # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 # CHECK-NEXT: $sp = 
ANDXri killed $[[TMP]] -# CHECK-COUNT-31: frame-setup CFI_INSTRUCTION +# CHECK-COUNT-11: frame-setup CFI_INSTRUCTION # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 @@ -635,10 +652,30 @@ body: | # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 # CHECK-NEXT: RET_ReallyLR # -# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 -# UNWINDINFO-COUNT-28: DW_CFA_offset -# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 -# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# ASM-LABEL: save_restore_sve_realign: +# ASM: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +# ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus 
+# UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -32, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -40, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: save_restore_sve_realign stack: - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 } @@ -713,15 +750,15 @@ body: | # CHECK-NEXT: STR_ZXI killed $z23, $sp, 1 # CHECK-NEXT: STR_ZXI killed $z8, $sp, 2 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -7 -# CHECK-COUNT-6: frame-setup CFI_INSTRUCTION +# CHECK-COUNT-3: frame-setup CFI_INSTRUCTION # ASM-LABEL: frame_layout: # ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 80 * VG -# ASM-COUNT-4: .cfi_offset +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG # ASM-NEXT: .cfi_offset w29, -16 # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +80, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -# UNWINDINFO-COUNT-4: DW_CFA_offset -# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +80, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# 
UNWINDINFO-NEXT: DW_CFA_expression: reg72 DW_OP_consts -16, DW_OP_plus, DW_OP_consts -8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 name: frame_layout stack: - { id: 0, type: default, size: 32, alignment: 16, stack-id: sve-vec } diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 191df22eda50..7c0e9e9f4d9b 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -117,7 +117,6 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset p4, -2 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z7.d, z7.d, #0x1 From d485dbdcd1e90dd0a715e78c6532227e7ed10e4d Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 5 Aug 2020 09:17:57 +0100 Subject: [PATCH 117/363] [AArch64][SVE] Disable tail calls if callee does not preserve SVE regs. 
This fixes an issue triggered by the following code, where emitEpilogue got confused when trying to restore the SVE registers after the call, whereas the call to bar() is implemented as a TCReturn: int non_sve(); int sve(svint32_t x) { return non_sve(); } Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D84869 (cherry picked from commit f2916636f83dfeb4808a16045db0025783743471) --- .../Target/AArch64/AArch64ISelLowering.cpp | 38 ++++--- .../Target/AArch64/AArch64RegisterInfo.cpp | 2 +- llvm/lib/Target/AArch64/AArch64RegisterInfo.h | 2 + llvm/test/CodeGen/AArch64/sve-tailcall.ll | 107 ++++++++++++++++++ 4 files changed, 134 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-tailcall.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1500da2fdfc7..45bfa85bdc07 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4107,6 +4107,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) { static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: + case CallingConv::AArch64_SVE_VectorCall: case CallingConv::PreserveMost: case CallingConv::Swift: return true; @@ -4126,6 +4127,15 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); + + // If this function uses the C calling convention but has an SVE signature, + // then it preserves more registers and should assume the SVE_VectorCall CC. + // The check for matching callee-saved regs will determine whether it is + // eligible for TCO. 
+ if (CallerCC == CallingConv::C && + AArch64RegisterInfo::hasSVEArgsOrReturn(&MF)) + CallerCC = CallingConv::AArch64_SVE_VectorCall; + bool CCMatch = CallerCC == CalleeCC; // When using the Windows calling convention on a non-windows OS, we want @@ -4313,6 +4323,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsSibCall = false; + // Check callee args/returns for SVE registers and set calling convention + // accordingly. + if (CallConv == CallingConv::C) { + bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ + return Out.VT.isScalableVector(); + }); + bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ + return In.VT.isScalableVector(); + }); + + if (CalleeInSVE || CalleeOutSVE) + CallConv = CallingConv::AArch64_SVE_VectorCall; + } + if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization( @@ -4666,20 +4690,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); - // Check callee args/returns for SVE registers and set calling convention - // accordingly. - if (CallConv == CallingConv::C) { - bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ - return Out.VT.isScalableVector(); - }); - bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ - return In.VT.isScalableVector(); - }); - - if (CalleeInSVE || CalleeOutSVE) - CallConv = CallingConv::AArch64_SVE_VectorCall; - } - // Add a register mask operand representing the call-preserved registers. 
const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 62cc865fd1c3..3e9c8c7b6df2 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -63,7 +63,7 @@ bool AArch64RegisterInfo::regNeedsCFI(unsigned Reg, return true; } -static bool hasSVEArgsOrReturn(const MachineFunction *MF) { +bool AArch64RegisterInfo::hasSVEArgsOrReturn(const MachineFunction *MF) { const Function &F = MF->getFunction(); return isa(F.getReturnType()) || any_of(F.args(), [](const Argument &Arg) { diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 91064787d3da..7b20f181e76d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -42,6 +42,8 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo { void UpdateCustomCallPreservedMask(MachineFunction &MF, const uint32_t **Mask) const; + static bool hasSVEArgsOrReturn(const MachineFunction *MF); + /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const; diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll new file mode 100644 index 000000000000..81e98ba54287 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64 -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + +; Check that a tail call from an SVE function to another SVE function +; can use a tail-call, as the same registers will be preserved by the +; callee. +define @sve_caller_sve_callee() nounwind { +; CHECK-LABEL: sve_caller_sve_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: b sve_callee + tail call void asm sideeffect "", "~{z9},~{z10}"() + %call = tail call @sve_callee() + ret %call +} + +declare @sve_callee() + +; Check that a tail call from an SVE function to a non-SVE function +; does not use a tail-call, because after the call many of the SVE +; registers may be clobbered and needs to be restored. +define i32 @sve_caller_non_sve_callee( %arg) nounwind { +; CHECK-LABEL: sve_caller_non_sve_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-18 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: bl non_sve_callee +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 
2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #18 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{z9},~{z10}"() + %call = tail call i32 
@non_sve_callee() + ret i32 %call +} + +declare i32 @non_sve_callee() From 2d52adc4ade0d194a3341742b01d088c5fdd70c6 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 6 Aug 2020 16:53:13 +0100 Subject: [PATCH 118/363] [SVE][CodeGen] Fix bug with store of unpacked FP scalable vectors Fixed an incorrect pattern in lib/Target/AArch64/AArch64SVEInstrInfo.td for storing out unpacked scalable vectors. Added a couple of tests to test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll Differential Revision: https://reviews.llvm.org/D85441 (cherry picked from commit 0905d9f31ead399d054c5d2a2c353e690f5c8daa) --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 +- .../sve-st1-addressing-mode-reg-imm.ll | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3449a8bd16d2..4f29f2f18185 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1765,7 +1765,7 @@ multiclass sve_prefetch; defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>; defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>; - defm : unpred_store< store, nxv4f32, ST1W_D_IMM, PTRUE_D>; + defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>; defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>; multiclass unpred_load %data, * %a) { store %data, * %base ret void } + + +; Splat stores of unpacked FP scalable vectors + +define void @store_nxv2f32(* %out) { +; CHECK-LABEL: store_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1w { z0.d }, p0, [x0] +; CHECK-NEXT: ret + %ins = insertelement undef, float 1.0, i32 0 + %splat = shufflevector %ins, undef, zeroinitializer + store %splat, * %out + ret void +} + +define void @store_nxv4f16(* %out) { +; CHECK-LABEL: store_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.h, #1.00000000 +; CHECK-NEXT: 
ptrue p0.s +; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %ins = insertelement undef, half 1.0, i32 0 + %splat = shufflevector %ins, undef, zeroinitializer + store %splat, * %out + ret void +} From a318950c0f9b166245b3d912be2642af7584a078 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 4 Aug 2020 10:41:27 -0700 Subject: [PATCH 119/363] Fix -Wconstant-conversion warning with explicit cast Introduced by fd6584a22043b254a323635c142b28ce80ae5b5b Following similar use of casts in AsmParser.cpp, for instance - ideally this type would use unsigned chars as they're more representative of raw data and don't get confused around implementation defined choices of char's signedness, but this is what it is & the signed/unsigned conversions are (so far as I understand) safe/bit preserving in this usage and what's intended, given the API design here. (cherry picked from commit e31cfc4cd3e393300002e9c519787c96e3b67bab) --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 30666009801c..83653dcbb8cf 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -442,7 +442,7 @@ MCCFIInstruction AArch64FrameLowering::createDefCFAExpressionFromSP( // Build up the expression (SP + NumBytes + NumVGScaledBytes * AArch64::VG) SmallString<64> Expr; - Expr.push_back(dwarf::DW_OP_breg0 + /*SP*/ 31); + Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + /*SP*/ 31)); Expr.push_back(0); appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, TRI.getDwarfRegNum(AArch64::VG, true), Comment); From 97319d39d143810e9b1d3ef1bf11009fd8df6571 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 18 Aug 2020 09:54:35 +0200 Subject: [PATCH 120/363] lld release notes: increased default max page size on arm By Tobias Hieta! 
--- lld/docs/ReleaseNotes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index f0482c2428c4..513ad37e278e 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -28,6 +28,10 @@ ELF Improvements chrome://tracing. The file can be specified with ``--time-trace-file``. Trace granularity can be specified with ``--time-trace-granularity``. (`D71060 `_) +* For ARM architectures the default max page size was increased to 64k. + This increases compatibility with systems where a non standard page + size was configured. This also is inline with GNU ld defaults. + (`D77330 `_) * ... Breaking changes From da72df44005c909e40af867843168c4c456baa3d Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 13 Aug 2020 10:44:12 +0100 Subject: [PATCH 121/363] [SVE] Fix bug in SVEIntrinsicOpts::optimizePTest The code wasn't taking into account that the two operands passed to ptest could be identical and was trying to erase them twice. 
Differential Revision: https://reviews.llvm.org/D85892 (cherry picked from commit 6c7957c9901714b7ad0a8d2743a8c431b57fd0c9) --- llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp | 2 +- llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index 74fe0cdd1ea7..0245dd1d611a 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -160,7 +160,7 @@ bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) { I->eraseFromParent(); if (Op1->use_empty()) Op1->eraseFromParent(); - if (Op2->use_empty()) + if (Op1 != Op2 && Op2->use_empty()) Op2->eraseFromParent(); return true; diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll index 191fddacffd1..9af34676b342 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll @@ -44,6 +44,16 @@ define i1 @ptest_first( %a) { ret i1 %out } +define i1 @ptest_first_same_ops( %a) { +; OPT-LABEL: ptest_first_same_ops +; OPT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv2i1( %a, %a) +; OPT-NOT: convert +; OPT-NEXT: ret i1 %[[OUT]] + %1 = tail call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %a) + %2 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1( %1, %1) + ret i1 %2 +} + define i1 @ptest_last( %a) { ; OPT-LABEL: ptest_last ; OPT: %mask = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 0) From 8cf2c031632f88a44eea68b48235496cf1c5d6ec Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Mon, 10 Aug 2020 10:55:22 -0400 Subject: [PATCH 122/363] [PowerPC] Make StartMI ignore COPY like instructions. 
Reviewed By: lkail Differential Revision: https://reviews.llvm.org/D85659 (cherry picked from commit 4d52ebb9b9c72b656c1ccb6a1424841f246cd791) --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 49 ++++++++++++------- llvm/lib/Target/PowerPC/PPCInstrInfo.h | 6 ++- .../PowerPC/fixup-kill-dead-flag-crash.mir | 17 +++++++ 3 files changed, 52 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 9a4c57fedac2..e428e7155e5e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2653,22 +2653,35 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { return LoadSpillOpcodesArray[getSpillTarget()]; } -void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, +void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr *StartMI, MachineInstr *EndMI, unsigned RegNo) const { // Conservatively clear kill flag for the register if the instructions are in // different basic blocks and in SSA form, because the kill flag may no longer // be right. There is no need to bother with dead flags since defs with no // uses will be handled by DCE. - MachineRegisterInfo &MRI = StartMI.getParent()->getParent()->getRegInfo(); - if (MRI.isSSA() && (StartMI.getParent() != EndMI.getParent())) { + MachineRegisterInfo &MRI = StartMI->getParent()->getParent()->getRegInfo(); + if (MRI.isSSA() && (StartMI->getParent() != EndMI->getParent())) { MRI.clearKillFlags(RegNo); return; } // Instructions between [StartMI, EndMI] should be in same basic block. - assert((StartMI.getParent() == EndMI.getParent()) && + assert((StartMI->getParent() == EndMI->getParent()) && "Instructions are not in same basic block"); + // If before RA, StartMI may be def through COPY, we need to adjust it to the + // real def. See function getForwardingDefMI. 
+ if (MRI.isSSA()) { + bool Reads, Writes; + std::tie(Reads, Writes) = StartMI->readsWritesVirtualRegister(RegNo); + if (!Reads && !Writes) { + assert(Register::isVirtualRegister(RegNo) && + "Must be a virtual register"); + // Get real def and ignore copies. + StartMI = MRI.getVRegDef(RegNo); + } + } + bool IsKillSet = false; auto clearOperandKillInfo = [=] (MachineInstr &MI, unsigned Index) { @@ -2681,21 +2694,21 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, // Set killed flag for EndMI. // No need to do anything if EndMI defines RegNo. int UseIndex = - EndMI.findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); + EndMI->findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); if (UseIndex != -1) { - EndMI.getOperand(UseIndex).setIsKill(true); + EndMI->getOperand(UseIndex).setIsKill(true); IsKillSet = true; // Clear killed flag for other EndMI operands related to RegNo. In some // upexpected cases, killed may be set multiple times for same register // operand in same MI. - for (int i = 0, e = EndMI.getNumOperands(); i != e; ++i) + for (int i = 0, e = EndMI->getNumOperands(); i != e; ++i) if (i != UseIndex) - clearOperandKillInfo(EndMI, i); + clearOperandKillInfo(*EndMI, i); } // Walking the inst in reverse order (EndMI -> StartMI]. - MachineBasicBlock::reverse_iterator It = EndMI; - MachineBasicBlock::reverse_iterator E = EndMI.getParent()->rend(); + MachineBasicBlock::reverse_iterator It = *EndMI; + MachineBasicBlock::reverse_iterator E = EndMI->getParent()->rend(); // EndMI has been handled above, skip it here. It++; MachineOperand *MO = nullptr; @@ -2721,13 +2734,13 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, } else if ((MO = It->findRegisterDefOperand(RegNo, false, true, &getRegisterInfo()))) { // No use found, set dead for its def. 
- assert(&*It == &StartMI && "No new def between StartMI and EndMI."); + assert(&*It == StartMI && "No new def between StartMI and EndMI."); MO->setIsDead(true); break; } } - if ((&*It) == &StartMI) + if ((&*It) == StartMI) break; } // Ensure RegMo liveness is killed after EndMI. @@ -3858,7 +3871,7 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, // ForwardingOperandReg = LI imm1 // y = op2 imm2, ForwardingOperandReg(killed) if (IsForwardingOperandKilled) - fixupIsDeadOrKill(DefMI, MI, ForwardingOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardingOperandReg); LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -3950,9 +3963,9 @@ bool PPCInstrInfo::transformToNewImmFormFedByAdd( // Update kill flag if (RegMO->isKill() || IsKilledFor(RegMO->getReg())) - fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + fixupIsDeadOrKill(&DefMI, &MI, RegMO->getReg()); if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); } LLVM_DEBUG(dbgs() << "With:\n"); @@ -4063,12 +4076,12 @@ bool PPCInstrInfo::transformToImmFormFedByAdd( // x = ADD reg(killed), imm // y = XOP 0, x if (IsFwdFeederRegKilled || RegMO->isKill()) - fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + fixupIsDeadOrKill(&DefMI, &MI, RegMO->getReg()); // Pattern 3: // ForwardKilledOperandReg = ADD reg, imm // y = XOP 0, ForwardKilledOperandReg(killed) if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -4224,7 +4237,7 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, // ForwardKilledOperandReg = LI imm // y = XOP reg, ForwardKilledOperandReg(killed) if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); return true; } diff --git 
a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 43973c627fcf..556c95fef3bd 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -570,14 +570,16 @@ class PPCInstrInfo : public PPCGenInstrInfo { /// up. Before calling this function, /// 1. Ensure that \p RegNo liveness is killed after instruction \p EndMI. /// 2. Ensure that there is no new definition between (\p StartMI, \p EndMI) - /// and possible definition for \p RegNo is \p StartMI or \p EndMI. + /// and possible definition for \p RegNo is \p StartMI or \p EndMI. For + /// pre-RA cases, definition may be \p StartMI through COPY, \p StartMI + /// will be adjust to true definition. /// 3. We can do accurate fixup for the case when all instructions between /// [\p StartMI, \p EndMI] are in same basic block. /// 4. For the case when \p StartMI and \p EndMI are not in same basic block, /// we conservatively clear kill flag for all uses of \p RegNo for pre-RA /// and for post-RA, we give an assertion as without reaching definition /// analysis post-RA, \p StartMI and \p EndMI are hard to keep right. - void fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, + void fixupIsDeadOrKill(MachineInstr *StartMI, MachineInstr *EndMI, unsigned RegNo) const; void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const; void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo, diff --git a/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir b/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir index be2671fa9b5d..84c9b183e86d 100644 --- a/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir +++ b/llvm/test/CodeGen/PowerPC/fixup-kill-dead-flag-crash.mir @@ -19,3 +19,20 @@ body: | STW killed %4:gprc, %4:gprc, 100 BLR8 implicit $lr8, implicit $rm ... 
+--- +name: test2 +#CHECK : name : test2 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $r3 + %0:gprc = COPY $r3 + %1:gprc_and_gprc_nor0 = LI 0 + ; CHECK: dead %2:gprc = COPY %1 + %2:gprc = COPY %1:gprc_and_gprc_nor0 + ; CHECK: %3:gprc = LI 1 + %3:gprc = ORI killed %2:gprc, 1 + ; CHECK: STW killed %3, %0, 100 + STW killed %3:gprc, %0:gprc, 100 + BLR8 implicit $lr8, implicit $rm +... From cea0ff34238d164a5d667898bbedf18f0d3ad11e Mon Sep 17 00:00:00 2001 From: AlexisPerry Date: Mon, 27 Jul 2020 09:57:31 -0700 Subject: [PATCH 123/363] [flang] Temp Driver - pass the flag to change the default integer kind through to F18_FC fixes BUG 46307 Differential Revision: https://reviews.llvm.org/D84266 (cherry picked from commit 4a4cafabc9067fced5890a245b03ef5897ad988b) --- flang/tools/f18/f18.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 574a37074e52..884b21322b73 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -544,6 +544,11 @@ int main(int argc, char *const argv[]) { defaultKinds.set_defaultIntegerKind(8); defaultKinds.set_subscriptIntegerKind(8); defaultKinds.set_sizeIntegerKind(8); + if (isPGF90) { + driver.F18_FCArgs.push_back("-i8"); + } else { + driver.F18_FCArgs.push_back("-fdefault-integer-8"); + } } else if (arg == "-Mlargearray") { } else if (arg == "-Mnolargearray") { } else if (arg == "-flarge-sizes") { From 556e65b8e9a1ca5d2a62fa397a7fa5eee5b58ac4 Mon Sep 17 00:00:00 2001 From: Camille Coti Date: Mon, 27 Jul 2020 16:58:39 -0600 Subject: [PATCH 124/363] Order of libraries and source files in the f18 frontend When the f18 frontend calls the link editor, put the libraries and object files in the correct order. 
Fixes the issues reported here https://github.com/flang-compiler/flang/issues/897 Reviewed By: sscalpone, AlexisPerry Differential Revision: https://reviews.llvm.org/D84340 (cherry picked from commit ca0bf440dbf9977340db4a32ba61740930c2be03) --- flang/tools/f18/f18.cpp | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 884b21322b73..77af3964bba8 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -368,20 +368,24 @@ std::string CompileOtherLanguage(std::string path, DriverOptions &driver) { return {}; } -void Link(std::vector &relocatables, DriverOptions &driver) { +void Link(std::vector &liblist, std::vector &objects, + DriverOptions &driver) { if (!ParentProcess()) { std::vector argv; for (size_t j{0}; j < driver.F18_FCArgs.size(); ++j) { argv.push_back(driver.F18_FCArgs[j].data()); } - for (auto &relo : relocatables) { - argv.push_back(relo.data()); + for (auto &obj : objects) { + argv.push_back(obj.data()); } if (!driver.outputPath.empty()) { char dashO[3] = "-o"; argv.push_back(dashO); argv.push_back(driver.outputPath.data()); } + for (auto &lib : liblist) { + argv.push_back(lib.data()); + } Exec(argv, driver.verbose); } } @@ -396,6 +400,7 @@ int main(int argc, char *const argv[]) { bool isPGF90{driver.F18_FCArgs.back().rfind("pgf90") != std::string::npos}; std::list args{argList(argc, argv)}; + std::vector objlist, liblist; std::string prefix{args.front()}; args.pop_front(); prefix += ": "; @@ -412,32 +417,37 @@ int main(int argc, char *const argv[]) { Fortran::common::IntrinsicTypeDefaultKinds defaultKinds; - std::vector fortranSources, otherSources, relocatables; + std::vector fortranSources, otherSources; bool anyFiles{false}; while (!args.empty()) { std::string arg{std::move(args.front())}; + auto dot{arg.rfind(".")}; + std::string suffix{arg.substr(dot + 1)}; + std::string prefix{arg.substr(0, 2)}; args.pop_front(); if 
(arg.empty()) { } else if (arg.at(0) != '-') { anyFiles = true; - auto dot{arg.rfind(".")}; if (dot == std::string::npos) { driver.F18_FCArgs.push_back(arg); } else { - std::string suffix{arg.substr(dot + 1)}; if (suffix == "f" || suffix == "F" || suffix == "ff" || suffix == "f90" || suffix == "F90" || suffix == "ff90" || suffix == "f95" || suffix == "F95" || suffix == "ff95" || suffix == "cuf" || suffix == "CUF" || suffix == "f18" || suffix == "F18" || suffix == "ff18") { fortranSources.push_back(arg); - } else if (suffix == "o" || suffix == "a") { - relocatables.push_back(arg); + } else if (suffix == "o" || suffix == "so") { + objlist.push_back(arg); + } else if (suffix == "a") { + liblist.push_back(arg); } else { otherSources.push_back(arg); } } + } else if (prefix == "-l" || suffix == "a") { + liblist.push_back(arg); } else if (arg == "-") { fortranSources.push_back("-"); } else if (arg == "--") { @@ -679,17 +689,17 @@ int main(int argc, char *const argv[]) { for (const auto &path : fortranSources) { std::string relo{CompileFortran(path, options, driver, defaultKinds)}; if (!driver.compileOnly && !relo.empty()) { - relocatables.push_back(relo); + objlist.push_back(relo); } } for (const auto &path : otherSources) { std::string relo{CompileOtherLanguage(path, driver)}; if (!driver.compileOnly && !relo.empty()) { - relocatables.push_back(relo); + objlist.push_back(relo); } } - if (!relocatables.empty()) { - Link(relocatables, driver); + if (!objlist.empty()) { + Link(liblist, objlist, driver); } return exitStatus; } From dfc845904b27d5da3b6ae7a389ade01590454331 Mon Sep 17 00:00:00 2001 From: Richard Barton Date: Fri, 17 Jul 2020 09:15:21 +0100 Subject: [PATCH 125/363] [flang] Make interactive behaviour more obvious When flang is invoked with no files it waits for input on stdin. Make it print a message saying this to prevent the user being surprised. 
Differential Revision: https://reviews.llvm.org/D84855 (cherry picked from commit dd5ea5674b86bade4904fab4c66a1156b3df033e) --- flang/test/Driver/Inputs/hello.f90 | 3 +++ flang/test/Driver/no_files.f90 | 10 ++++++++++ flang/tools/f18/f18.cpp | 2 ++ 3 files changed, 15 insertions(+) create mode 100644 flang/test/Driver/Inputs/hello.f90 create mode 100644 flang/test/Driver/no_files.f90 diff --git a/flang/test/Driver/Inputs/hello.f90 b/flang/test/Driver/Inputs/hello.f90 new file mode 100644 index 000000000000..d0c7eb94f53c --- /dev/null +++ b/flang/test/Driver/Inputs/hello.f90 @@ -0,0 +1,3 @@ +program hello + write (*,*), "hello world" +end program hello diff --git a/flang/test/Driver/no_files.f90 b/flang/test/Driver/no_files.f90 new file mode 100644 index 000000000000..718985dce4ca --- /dev/null +++ b/flang/test/Driver/no_files.f90 @@ -0,0 +1,10 @@ +! RUN: %f18 < %S/Inputs/hello.f90 | FileCheck %s + + +! CHECK: Enter Fortran source +! CHECK: Use EOF character (^D) to end file + +! CHECK: Parse tree comprises {{.*}} objects and occupies {{.*}} total bytes +! CHECK: PROGRAM hello +! CHECK: WRITE (*, *) "hello world" +! CHECK: END PROGRAM hello diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 77af3964bba8..83db520d8ff9 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -683,6 +683,8 @@ int main(int argc, char *const argv[]) { if (!anyFiles) { driver.measureTree = true; driver.dumpUnparse = true; + llvm::outs() << "Enter Fortran source\n" + << "Use EOF character (^D) to end file\n"; CompileFortran("-", options, driver, defaultKinds); return exitStatus; } From 4ad21aadae561b95cc9d4bb98e91499cb5342367 Mon Sep 17 00:00:00 2001 From: Richard Barton Date: Thu, 25 Jun 2020 16:01:56 +0100 Subject: [PATCH 126/363] [flang] Add details to --help screen on default behaviour Add a usage string and a defaults section that clarifies: * If no input files are given, f18 reads from stdin * If no input files are given, f18 dumps the parse tree. 
* The default behaviour is to exec F18_FC. * The fefault F18_FC setting is 'gfortran' Adds a simple regression test which tests the top and tail of the help screen and the exit status. Depends on D84855 Differential Revision: https://reviews.llvm.org/D84856 (cherry picked from commit b068d19a151d9d3a73b0265df27836d9fd0ad1e3) --- flang/test/Driver/help.f90 | 9 +++++++++ flang/tools/f18/f18.cpp | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 flang/test/Driver/help.f90 diff --git a/flang/test/Driver/help.f90 b/flang/test/Driver/help.f90 new file mode 100644 index 000000000000..66dd14aa5a86 --- /dev/null +++ b/flang/test/Driver/help.f90 @@ -0,0 +1,9 @@ +! RUN: %f18 -help 2>&1 | FileCheck %s +! RUN: %f18 --help 2>&1 | FileCheck %s +! RUN: %f18 -? 2>&1 | FileCheck %s + +! CHECK: f18: LLVM Fortran compiler + +! CHECK: -help print this again +! CHECK: Unrecognised options are passed through to the external compiler +! CHECK: set by F18_FC (see defaults). diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 83db520d8ff9..9189f6a46aff 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -602,6 +602,19 @@ int main(int argc, char *const argv[]) { driver.getSymbolsSources = true; } else if (arg == "-help" || arg == "--help" || arg == "-?") { llvm::errs() + << "f18: LLVM Fortran compiler\n" + << "\n" + << "Usage: f18 [options] \n" + << "\n" + << "Defaults:\n" + << " When invoked with input files, and no options to tell\n" + << " it otherwise, f18 will unparse its input and pass that on to an\n" + << " external compiler to continue the compilation.\n" + << " The external compiler is specified by the F18_FC environment\n" + << " variable. 
The default is 'gfortran'.\n" + << " If invoked with no input files, f18 reads source code from\n" + << " stdin and runs with -fdebug-measure-parse-tree -funparse.\n" + << "\n" << "f18 options:\n" << " -Mfixed | -Mfree | -ffixed-form | -ffree-form force the " "source form\n" @@ -635,7 +648,8 @@ int main(int argc, char *const argv[]) { << " -fget-symbols-sources\n" << " -v -c -o -I -D -U have their usual meanings\n" << " -help print this again\n" - << "Other options are passed through to the compiler.\n"; + << "Unrecognised options are passed through to the external compiler\n" + << "set by F18_FC (see defaults).\n"; return exitStatus; } else if (arg == "-V") { llvm::errs() << "\nf18 compiler (under development)\n"; From 633865571bfdd104d84eadfd7ed9dedbdf6027ee Mon Sep 17 00:00:00 2001 From: Richard Barton Date: Sat, 18 Jul 2020 14:22:18 +0100 Subject: [PATCH 127/363] [flang] Add -h as a synonym for help As expected by user in http://lists.llvm.org/pipermail/flang-dev/2020-June/000404.html Depends on D84856 Differential Revision: https://reviews.llvm.org/D84857 (cherry picked from commit 30e45f339eb0841dc7fe27fad119cc5db0c052f3) --- flang/test/Driver/help.f90 | 1 + flang/tools/f18/f18.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/test/Driver/help.f90 b/flang/test/Driver/help.f90 index 66dd14aa5a86..d6162954a872 100644 --- a/flang/test/Driver/help.f90 +++ b/flang/test/Driver/help.f90 @@ -1,3 +1,4 @@ +! RUN: %f18 -h 2>&1 | FileCheck %s ! RUN: %f18 -help 2>&1 | FileCheck %s ! RUN: %f18 --help 2>&1 | FileCheck %s ! RUN: %f18 -? 
2>&1 | FileCheck %s diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 9189f6a46aff..03c0f7afe810 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -600,7 +600,7 @@ int main(int argc, char *const argv[]) { driver.getDefinitionArgs = {arguments[0], arguments[1], arguments[2]}; } else if (arg == "-fget-symbols-sources") { driver.getSymbolsSources = true; - } else if (arg == "-help" || arg == "--help" || arg == "-?") { + } else if (arg == "-h" || arg == "-help" || arg == "--help" || arg == "-?") { llvm::errs() << "f18: LLVM Fortran compiler\n" << "\n" From 844f018114b52325b36e1042c8a8fc0ea82d9c17 Mon Sep 17 00:00:00 2001 From: Camille Coti Date: Mon, 10 Aug 2020 16:16:35 +0100 Subject: [PATCH 128/363] [flang] Version information in flang/f18 Fixed some version information in flang/f18: - fixed the behavior of the -v switch: this flag enables verbosity with used with arguments, but just displays the version when used alone (related to this bug: https://bugs.llvm.org/show_bug.cgi?id=46017) - added __FLANG, __FLANG_MAJOR__, __FLANG_MINOR__ and __FLANG_PATCHLEVEL__ (similar to their __F18* counterparts) for compatibility purpose Reviewed By: sscalpone, AlexisPerry, richard.barton.arm, tskeith Differential Revision: https://reviews.llvm.org/D84334 (cherry picked from commit 89a9db438f85c6d4c0f11ecd0448bb71e1deac24) --- flang/test/Driver/version_test.f90 | 7 ++++-- .../Preprocessing/compiler_defined_macros.F90 | 12 +++++++++ flang/tools/f18/CMakeLists.txt | 1 + flang/tools/f18/f18.cpp | 25 ++++++++++++++++--- flang/tools/f18/f18_version.h.in | 9 +++++++ 5 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 flang/test/Preprocessing/compiler_defined_macros.F90 create mode 100644 flang/tools/f18/f18_version.h.in diff --git a/flang/test/Driver/version_test.f90 b/flang/test/Driver/version_test.f90 index 08ea35ba49ea..7fe229e2be17 100644 --- a/flang/test/Driver/version_test.f90 +++ b/flang/test/Driver/version_test.f90 @@ -1,7 
+1,10 @@ ! Check that lit configuration works by checking the compiler version -! RUN: %f18 -V 2>&1 | FileCheck -check-prefix=VERSION %s ! VERSION-NOT:{{![[:space:]]}} ! VERSION:{{[[:space:]]}} -! VERSION-SAME:f18 compiler (under development) +! VERSION-SAME:f18 compiler (under development), version {{[1-9][0-9]*.[0-9]*.[0-9]*}} ! VERSION-EMPTY: + +! RUN: %f18 -V 2>&1 | FileCheck -check-prefix=VERSION %s +! RUN: %f18 -v 2>&1 | FileCheck -check-prefix=VERSION %s +! RUN: %f18 --version 2>&1 | FileCheck -check-prefix=VERSION %s diff --git a/flang/test/Preprocessing/compiler_defined_macros.F90 b/flang/test/Preprocessing/compiler_defined_macros.F90 new file mode 100644 index 000000000000..80852cfb4472 --- /dev/null +++ b/flang/test/Preprocessing/compiler_defined_macros.F90 @@ -0,0 +1,12 @@ +! Check that the macros that give the verion number are set properly + +!CHECK: flang_major = {{[1-9][0-9]*$}} +!CHECK: flang_minor = {{[0-9]+$}} +!CHECK: flang_patchlevel = {{[0-9]+$}} +!RUN: %f18 -E %s | FileCheck --ignore-case %s + + +integer, parameter :: flang_major = __flang_major__ +integer, parameter :: flang_minor = __flang_minor__ +integer, parameter :: flang_patchlevel = __flang_patchlevel__ + diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 46c38fa43a2e..3dfce3437948 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -64,5 +64,6 @@ file(COPY ${CMAKE_BINARY_DIR}/tools/flang/bin/flang DESTINATION ${CMAKE_BINARY_D # The flang script to be installed needs a different path to the headers. 
set(FLANG_INTRINSIC_MODULES_DIR ${CMAKE_INSTALL_PREFIX}/include/flang) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/flang.sh.in ${FLANG_BINARY_DIR}/bin/flang-install.sh @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/f18_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/f18_version.h @ONLY) install(PROGRAMS ${FLANG_BINARY_DIR}/bin/flang-install.sh DESTINATION bin RENAME flang PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE) diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 03c0f7afe810..23b104ee520c 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -38,6 +38,8 @@ #include #include +#include "f18_version.h" + static std::list argList(int argc, char *const argv[]) { std::list result; for (int j = 0; j < argc; ++j) { @@ -390,6 +392,13 @@ void Link(std::vector &liblist, std::vector &objects, } } +int printVersion() { + llvm::errs() << "\nf18 compiler (under development), version " + << __FLANG_MAJOR__ << "." << __FLANG_MINOR__ << "." + << __FLANG_PATCHLEVEL__ << "\n"; + return exitStatus; +} + int main(int argc, char *const argv[]) { atexit(CleanUpAtExit); @@ -411,6 +420,11 @@ int main(int argc, char *const argv[]) { options.predefinitions.emplace_back("__F18_MAJOR__", "1"); options.predefinitions.emplace_back("__F18_MINOR__", "1"); options.predefinitions.emplace_back("__F18_PATCHLEVEL__", "1"); + options.predefinitions.emplace_back("__flang__", __FLANG__); + options.predefinitions.emplace_back("__flang_major__", __FLANG_MAJOR__); + options.predefinitions.emplace_back("__flang_minor__", __FLANG_MINOR__); + options.predefinitions.emplace_back( + "__flang_patchlevel__", __FLANG_PATCHLEVEL__); #if __x86_64__ options.predefinitions.emplace_back("__x86_64__", "1"); #endif @@ -651,13 +665,16 @@ int main(int argc, char *const argv[]) { << "Unrecognised options are passed through to the external compiler\n" << "set by F18_FC (see defaults).\n"; return exitStatus; - } else if (arg == "-V") { - llvm::errs() << "\nf18 compiler (under 
development)\n"; - return exitStatus; + } else if (arg == "-V" || arg == "--version") { + return printVersion(); } else { driver.F18_FCArgs.push_back(arg); if (arg == "-v") { - driver.verbose = true; + if (args.size() > 1) { + driver.verbose = true; + } else { + return printVersion(); + } } else if (arg == "-I") { driver.F18_FCArgs.push_back(args.front()); driver.searchDirectories.push_back(args.front()); diff --git a/flang/tools/f18/f18_version.h.in b/flang/tools/f18/f18_version.h.in new file mode 100644 index 000000000000..0c8d5227cd00 --- /dev/null +++ b/flang/tools/f18/f18_version.h.in @@ -0,0 +1,9 @@ +#ifndef _F18_H_ +#define _F18_H_ + +#define __FLANG__ "1" +#define __FLANG_MAJOR__ "@LLVM_VERSION_MAJOR@" +#define __FLANG_MINOR__ "@LLVM_VERSION_MINOR@" +#define __FLANG_PATCHLEVEL__ "@LLVM_VERSION_PATCH@" + +#endif // _F18_H_ From 529b2229acb25f005633d43cdb67e0ee093b45e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Wed, 12 Aug 2020 18:36:06 +0200 Subject: [PATCH 129/363] [Driver] Change -fnostack-clash-protection to -fno-stack-clash-protection Clang command line docs mention `-fno-stack-clash-protection`, and GCC also uses -fno-stack-clash-protection. 
Fixes PR47139 Reviewed By: tstellar Differential Revision: https://reviews.llvm.org/D85844 (cherry picked from commit df3bfaa39071a1382a59a94658ee1a2da30d92fd) --- clang/include/clang/Driver/Options.td | 2 +- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- clang/test/Driver/stack-clash-protection.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b20b8a288221..f818acb39d51 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1780,7 +1780,7 @@ def fstack_protector_all : Flag<["-"], "fstack-protector-all">, Group, HelpText<"Enable stack protectors for all functions">; def fstack_clash_protection : Flag<["-"], "fstack-clash-protection">, Group, Flags<[CC1Option]>, HelpText<"Enable stack clash protection">; -def fnostack_clash_protection : Flag<["-"], "fnostack-clash-protection">, Group, +def fno_stack_clash_protection : Flag<["-"], "fno-stack-clash-protection">, Group, HelpText<"Disable stack clash protection">; def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">, Group, HelpText<"Enable stack protectors for some functions vulnerable to stack smashing. 
" diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 25fc837e803b..828606bbd0f0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2971,7 +2971,7 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args, return; if (Args.hasFlag(options::OPT_fstack_clash_protection, - options::OPT_fnostack_clash_protection, false)) + options::OPT_fno_stack_clash_protection, false)) CmdArgs.push_back("-fstack-clash-protection"); } diff --git a/clang/test/Driver/stack-clash-protection.c b/clang/test/Driver/stack-clash-protection.c index a2cf3f82a8fd..5217ed26a5b1 100644 --- a/clang/test/Driver/stack-clash-protection.c +++ b/clang/test/Driver/stack-clash-protection.c @@ -1,6 +1,6 @@ // RUN: %clang -target i386-unknown-linux -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386 -// RUN: %clang -target i386-unknown-linux -fnostack-clash-protection -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386 -// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -fnostack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386-NO +// RUN: %clang -target i386-unknown-linux -fno-stack-clash-protection -fstack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386 +// RUN: %clang -target i386-unknown-linux -fstack-clash-protection -fno-stack-clash-protection -### %s 2>&1 | FileCheck %s -check-prefix=SCP-i386-NO // SCP-i386: "-fstack-clash-protection" // SCP-i386-NO-NOT: "-fstack-clash-protection" From 522eeb66edfb0d6c4656d3f7a3f635544def30db Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 12 Aug 2020 21:06:28 +0300 Subject: [PATCH 130/363] [InstCombine] Sanitize undef vector constant to 1 in X*(2^C) with X << C (PR47133) While x*undef is undef, shift-by-undef is poison, which we must avoid introducing. Also log2(iN undef) is *NOT* iN undef, because log2(iN undef) u< N. 
See https://bugs.llvm.org/show_bug.cgi?id=47133 (cherry picked from commit 12d93a27e7b78d58dd00817cb737f273d2dba8ae) --- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 6 +++++- llvm/test/Transforms/InstCombine/getelementptr.ll | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index c6233a68847d..2f1325e80d2f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -216,7 +216,11 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { // Replace X*(2^C) with X << C, where C is either a scalar or a vector. - if (Constant *NewCst = getLogBase2(NewOp->getType(), C1)) { + // Note that we need to sanitize undef multipliers to 1, + // to avoid introducing poison. + Constant *SafeC1 = Constant::replaceUndefsWith( + C1, ConstantInt::get(C1->getType()->getScalarType(), 1)); + if (Constant *NewCst = getLogBase2(NewOp->getType(), SafeC1)) { BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst); if (I.hasNoUnsignedWrap()) diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 0257725d12de..17362b088241 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -216,7 +216,7 @@ define <2 x i1> @test13_vector(<2 x i64> %X, <2 x %S*> %P) nounwind { define <2 x i1> @test13_vector2(i64 %X, <2 x %S*> %P) nounwind { ; CHECK-LABEL: @test13_vector2( ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[DOTSPLATINSERT]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[DOTSPLATINSERT]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], ; CHECK-NEXT: [[C:%.*]] = shufflevector <2 x i1> 
[[TMP2]], <2 x i1> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[C]] @@ -231,7 +231,7 @@ define <2 x i1> @test13_vector2(i64 %X, <2 x %S*> %P) nounwind { define <2 x i1> @test13_vector3(i64 %X, <2 x %S*> %P) nounwind { ; CHECK-LABEL: @test13_vector3( ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[X:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[DOTSPLATINSERT]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[DOTSPLATINSERT]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], ; CHECK-NEXT: [[C:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[C]] From d9b3d7557a8e98f91cca6f1ce2b1f1f784eaa355 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 12 Aug 2020 11:35:08 -0700 Subject: [PATCH 131/363] [WebAssembly] Don't depend on the flags set by handleTargetFeatures in initFeatureMap. Properly set "simd128" in the feature map when "unimplemented-simd128" is requested. initFeatureMap is used to create the feature vector used by handleTargetFeatures. There are later calls to initFeatureMap in CodeGen that were using these flags to recreate the map. But the original feature vector should be passed to those calls. So that should be enough to rebuild the map. The only issue seemed to be that simd128 was not enabled in the map by the first call to initFeatureMap. Using the SIMDLevel set by handleTargetFeatures in the later calls allowed simd128 to be set in the later versions of the map. To fix this I've added an override of setFeatureEnabled that will update the map the first time with the correct simd dependency. 
Differential Revision: https://reviews.llvm.org/D85806 (cherry picked from commit 2b8ad6b6040833f4f8702721ebaa7749e5c23e60) --- clang/lib/Basic/Targets/WebAssembly.cpp | 60 +++++++++++++------------ clang/lib/Basic/Targets/WebAssembly.h | 6 ++- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp index 6746768090f5..dcb3d8fd7790 100644 --- a/clang/lib/Basic/Targets/WebAssembly.cpp +++ b/clang/lib/Basic/Targets/WebAssembly.cpp @@ -96,19 +96,43 @@ void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts, } void WebAssemblyTargetInfo::setSIMDLevel(llvm::StringMap &Features, - SIMDEnum Level) { + SIMDEnum Level, bool Enabled) { + if (Enabled) { + switch (Level) { + case UnimplementedSIMD128: + Features["unimplemented-simd128"] = true; + LLVM_FALLTHROUGH; + case SIMD128: + Features["simd128"] = true; + LLVM_FALLTHROUGH; + case NoSIMD: + break; + } + return; + } + switch (Level) { - case UnimplementedSIMD128: - Features["unimplemented-simd128"] = true; - LLVM_FALLTHROUGH; + case NoSIMD: case SIMD128: - Features["simd128"] = true; + Features["simd128"] = false; LLVM_FALLTHROUGH; - case NoSIMD: + case UnimplementedSIMD128: + Features["unimplemented-simd128"] = false; break; } } +void WebAssemblyTargetInfo::setFeatureEnabled(llvm::StringMap &Features, + StringRef Name, + bool Enabled) const { + if (Name == "simd128") + setSIMDLevel(Features, SIMD128, Enabled); + else if (Name == "unimplemented-simd128") + setSIMDLevel(Features, UnimplementedSIMD128, Enabled); + else + Features[Name] = Enabled; +} + bool WebAssemblyTargetInfo::initFeatureMap( llvm::StringMap &Features, DiagnosticsEngine &Diags, StringRef CPU, const std::vector &FeaturesVec) const { @@ -119,30 +143,8 @@ bool WebAssemblyTargetInfo::initFeatureMap( Features["atomics"] = true; Features["mutable-globals"] = true; Features["tail-call"] = true; - setSIMDLevel(Features, SIMD128); + setSIMDLevel(Features, SIMD128, 
true); } - // Other targets do not consider user-configured features here, but while we - // are actively developing new features it is useful to let user-configured - // features control availability of builtins - setSIMDLevel(Features, SIMDLevel); - if (HasNontrappingFPToInt) - Features["nontrapping-fptoint"] = true; - if (HasSignExt) - Features["sign-ext"] = true; - if (HasExceptionHandling) - Features["exception-handling"] = true; - if (HasBulkMemory) - Features["bulk-memory"] = true; - if (HasAtomics) - Features["atomics"] = true; - if (HasMutableGlobals) - Features["mutable-globals"] = true; - if (HasMultivalue) - Features["multivalue"] = true; - if (HasTailCall) - Features["tail-call"] = true; - if (HasReferenceTypes) - Features["reference-types"] = true; return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index 77a2fe9ae117..0068ccb5d71f 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -69,7 +69,8 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo { MacroBuilder &Builder) const override; private: - static void setSIMDLevel(llvm::StringMap &Features, SIMDEnum Level); + static void setSIMDLevel(llvm::StringMap &Features, SIMDEnum Level, + bool Enabled); bool initFeatureMap(llvm::StringMap &Features, DiagnosticsEngine &Diags, @@ -77,6 +78,9 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo { const std::vector &FeaturesVec) const override; bool hasFeature(StringRef Feature) const final; + void setFeatureEnabled(llvm::StringMap &Features, StringRef Name, + bool Enabled) const final; + bool handleTargetFeatures(std::vector &Features, DiagnosticsEngine &Diags) final; From 2d010325ea233fc97e61c00806b46ab2abab3a94 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 12 Aug 2020 11:47:29 -0700 Subject: [PATCH 132/363] [Target] Cache the command line derived 
feature map in TargetOptions. We can use this to remove some calls to initFeatureMap from Sema and CodeGen when a function doesn't have a target attribute. This reduces compile time of the linux kernel where this map is needed to diagnose some inline assembly constraints based on whether sse, avx, or avx512 is enabled. Differential Revision: https://reviews.llvm.org/D85807 (cherry picked from commit 5c1fe4e20f887286baac6989943a0875e12834fe) --- clang/include/clang/Basic/TargetOptions.h | 4 ++++ clang/lib/AST/ASTContext.cpp | 3 +-- clang/lib/Basic/Targets.cpp | 5 ++--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 6 +----- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index bbe86aebb074..4a5d469b8e54 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -54,6 +54,10 @@ class TargetOptions { /// be a list of strings starting with by '+' or '-'. std::vector Features; + /// The map of which features have been enabled disabled based on the command + /// line. + llvm::StringMap FeatureMap; + /// Supported OpenCL extensions and optional core features. 
OpenCLOptions SupportedOpenCLOptions; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 2ba643f12a82..e3798bb46e86 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -11147,8 +11147,7 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap &FeatureMap, std::vector Features(FeaturesTmp.begin(), FeaturesTmp.end()); Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features); } else { - Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, - Target->getTargetOpts().Features); + FeatureMap = Target->getTargetOpts().FeatureMap; } } diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 6bbcafa27dfe..206fb9b3f1a2 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -662,14 +662,13 @@ TargetInfo::CreateTargetInfo(DiagnosticsEngine &Diags, // Compute the default target features, we need the target to handle this // because features may have dependencies on one another. - llvm::StringMap Features; - if (!Target->initFeatureMap(Features, Diags, Opts->CPU, + if (!Target->initFeatureMap(Opts->FeatureMap, Diags, Opts->CPU, Opts->FeaturesAsWritten)) return nullptr; // Add the features to the compile options. Opts->Features.clear(); - for (const auto &F : Features) + for (const auto &F : Opts->FeatureMap) Opts->Features.push_back((F.getValue() ? "+" : "-") + F.getKey().str()); // Sort here, so we handle the features in a predictable order. (This matters // when we're dealing with features that overlap.) 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index ac6ec742335c..1f79b33772f3 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -4956,11 +4956,7 @@ bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD, static CudaArch getCudaArch(CodeGenModule &CGM) { if (!CGM.getTarget().hasFeature("ptx")) return CudaArch::UNKNOWN; - llvm::StringMap Features; - CGM.getTarget().initFeatureMap(Features, CGM.getDiags(), - CGM.getTarget().getTargetOpts().CPU, - CGM.getTarget().getTargetOpts().Features); - for (const auto &Feature : Features) { + for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { if (Feature.getValue()) { CudaArch Arch = StringToCudaArch(Feature.getKey()); if (Arch != CudaArch::UNKNOWN) From 536f65f47fcb314f269355c4001c0f2b4744c0b6 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 4 Aug 2020 17:50:06 -0700 Subject: [PATCH 133/363] [X86] Optimize getImpliedDisabledFeatures & getImpliedEnabledFeatures after D83273 Previously the time complexity is O(|number of paths from the root to an implied feature| * CPU_FWATURE_MAX) where CPU_FEATURE_MAX is 92. The number of paths can be large (theoretically exponential). For an inline asm statement, there is a code path `clang::Parser::ParseAsmStatement -> clang::Sema::ActOnGCCAsmStmt -> ASTContext::getFunctionFeatureMap` leading to potentially many calls of getImpliedEnabledFeatures (41 for my -march=native case). We should improve the performance a bit in case the number of inline asm statements is large (Linux kernel builds). 
Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D85257 (cherry picked from commit 0c7af8c83bd1acb0ca78f35ddde29b6fde4363a0) --- llvm/lib/Support/X86TargetParser.cpp | 39 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index 572d1203aaf2..c629f872df12 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -37,6 +37,10 @@ class FeatureBitset { set(I); } + bool any() const { + return llvm::any_of(Bits, [](uint64_t V) { return V != 0; }); + } + constexpr FeatureBitset &set(unsigned I) { // GCC <6.2 crashes if this is written in a single statement. uint32_t NewBits = Bits[I / 32] | (uint32_t(1) << (I % 32)); @@ -89,6 +93,13 @@ class FeatureBitset { Result.Bits[I] = ~Bits[I]; return Result; } + + constexpr bool operator!=(const FeatureBitset &RHS) const { + for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) + if (Bits[I] != RHS.Bits[I]) + return true; + return false; + } }; struct ProcInfo { @@ -552,11 +563,17 @@ void llvm::X86::getFeaturesForCPU(StringRef CPU, // For each feature that is (transitively) implied by this feature, set it. static void getImpliedEnabledFeatures(FeatureBitset &Bits, const FeatureBitset &Implies) { + // Fast path: Implies is often empty. 
+ if (!Implies.any()) + return; + FeatureBitset Prev; Bits |= Implies; - for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) { - if (Implies[i]) - getImpliedEnabledFeatures(Bits, FeatureInfos[i].ImpliedFeatures); - } + do { + Prev = Bits; + for (unsigned i = CPU_FEATURE_MAX; i;) + if (Bits[--i]) + Bits |= FeatureInfos[i].ImpliedFeatures; + } while (Prev != Bits); } /// Create bit vector of features that are implied disabled if the feature @@ -564,12 +581,14 @@ static void getImpliedEnabledFeatures(FeatureBitset &Bits, static void getImpliedDisabledFeatures(FeatureBitset &Bits, unsigned Value) { // Check all features looking for any dependent on this feature. If we find // one, mark it and recursively find any feature that depend on it. - for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) { - if (FeatureInfos[i].ImpliedFeatures[Value]) { - Bits.set(i); - getImpliedDisabledFeatures(Bits, i); - } - } + FeatureBitset Prev; + Bits.set(Value); + do { + Prev = Bits; + for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) + if ((FeatureInfos[i].ImpliedFeatures & Bits).any()) + Bits.set(i); + } while (Prev != Bits); } void llvm::X86::getImpliedFeatures( From a3e8436475242e4d4b8669840f3bf954a538f23f Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 1 Aug 2020 16:26:36 +0200 Subject: [PATCH 134/363] On FreeBSD, add -pthread to ASan dynamic compile flags for tests Otherwise, lots of these tests fail with a CHECK error similar to: ==12345==AddressSanitizer CHECK failed: compiler-rt/lib/asan/asan_posix.cpp:120 "((0)) == ((pthread_key_create(&tsd_key, destructor)))" (0x0, 0x4e) This is because the default pthread stubs in FreeBSD's libc always return failures (such as ENOSYS for pthread_key_create) in case the pthread library is not linked in. 
Reviewed By: arichardson Differential Revision: https://reviews.llvm.org/D85082 (cherry picked from commit 3aecf4bdf3f87e674724ad58d94c4b728feecb2e) --- compiler-rt/test/asan/lit.cfg.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 9045c6ec8f9c..63c02f7ddeeb 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -91,9 +91,12 @@ def push_dynamic_library_lookup_path(config, new_path): asan_dynamic_flags = [] if config.asan_dynamic: asan_dynamic_flags = ["-shared-libasan"] - # On Windows, we need to simulate "clang-cl /MD" on the clang driver side. if platform.system() == 'Windows': + # On Windows, we need to simulate "clang-cl /MD" on the clang driver side. asan_dynamic_flags += ["-D_MT", "-D_DLL", "-Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames"] + elif platform.system() == 'FreeBSD': + # On FreeBSD, we need to add -pthread to ensure pthread functions are available. 
+ asan_dynamic_flags += ['-pthread'] config.available_features.add("asan-dynamic-runtime") else: config.available_features.add("asan-static-runtime") From c89e9d67721ab2bdbb25656bc2da2a52c1116917 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sat, 8 Aug 2020 13:49:45 -0400 Subject: [PATCH 135/363] Change the default target CPU for OpenBSD/i386 to i586 (cherry picked from commit cd5ab56bc406c3f9a6f593f98c63dafb53547ab1) --- clang/lib/Driver/ToolChains/Arch/X86.cpp | 6 +++--- clang/test/Driver/openbsd.c | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 2cc44c09917f..6b82abec6f65 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -93,13 +93,13 @@ const char *x86::getX86TargetCPU(const ArgList &Args, return "x86-64"; switch (Triple.getOS()) { - case llvm::Triple::FreeBSD: - return "i686"; case llvm::Triple::NetBSD: - case llvm::Triple::OpenBSD: return "i486"; case llvm::Triple::Haiku: + case llvm::Triple::OpenBSD: return "i586"; + case llvm::Triple::FreeBSD: + return "i686"; default: // Fallback to p4. 
return "pentium4"; diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c index 51a5b4380f45..e17d05dc76da 100644 --- a/clang/test/Driver/openbsd.c +++ b/clang/test/Driver/openbsd.c @@ -14,6 +14,11 @@ // CHECK-PG: clang{{.*}}" "-cc1" "-triple" "i686-pc-openbsd" // CHECK-PG: ld{{.*}}" "-e" "__start" "--eh-frame-hdr" "-Bdynamic" "-dynamic-linker" "{{.*}}ld.so" "-nopie" "-o" "a.out" "{{.*}}gcrt0.o" "{{.*}}crtbegin.o" "{{.*}}.o" "-lcompiler_rt" "-lpthread_p" "-lc_p" "-lcompiler_rt" "{{.*}}crtend.o" +// Check CPU type for i386 +// RUN: %clang -target i386-unknown-openbsd -### -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-i386-CPU %s +// CHECK-i386-CPU: "-target-cpu" "i586" + // Check CPU type for MIPS64 // RUN: %clang -target mips64-unknown-openbsd -### -c %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-MIPS64-CPU %s From d09901e2d8aa4ffdf35d50622dd096062d5e6eef Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sun, 9 Aug 2020 19:43:16 -0400 Subject: [PATCH 136/363] int64_t and intmax_t are always (signed) long long on OpenBSD. (cherry picked from commit 92e82a2890c38bbb158cbf9dd592328b4c383696) --- clang/lib/Basic/Targets/OSTargets.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index cfa362bef1b1..a885aa96cc76 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -465,6 +465,8 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo { public: OpenBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : OSTargetInfo(Triple, Opts) { + this->IntMaxType = TargetInfo::SignedLongLong; + this->Int64Type = TargetInfo::SignedLongLong; switch (Triple.getArch()) { case llvm::Triple::x86: case llvm::Triple::x86_64: From c2f52e2c1288b00eae528825fb92668b1f3df732 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sun, 16 Aug 2020 06:50:50 -0400 Subject: [PATCH 137/363] Create strict aligned code for OpenBSD/arm64. 
(cherry picked from commit 44613bbec88be9e86b8c52c4f40bb1b1ab48d84c) --- clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 4 +++- clang/test/Driver/arm-alignment.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index 487c50dfc466..dd4545d6c48f 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -370,9 +370,11 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, V8_6Pos = Features.insert(std::next(V8_6Pos), {"+i8mm", "+bf16"}); if (Arg *A = Args.getLastArg(options::OPT_mno_unaligned_access, - options::OPT_munaligned_access)) + options::OPT_munaligned_access)) { if (A->getOption().matches(options::OPT_mno_unaligned_access)) Features.push_back("+strict-align"); + } else if (Triple.isOSOpenBSD()) + Features.push_back("+strict-align"); if (Args.hasArg(options::OPT_ffixed_x1)) Features.push_back("+reserve-x1"); diff --git a/clang/test/Driver/arm-alignment.c b/clang/test/Driver/arm-alignment.c index e0b4946d0a4b..b2bc8a35dfc6 100644 --- a/clang/test/Driver/arm-alignment.c +++ b/clang/test/Driver/arm-alignment.c @@ -80,6 +80,9 @@ // RUN: %clang -target aarch64-none-gnueabi -mkernel -mno-unaligned-access -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s +// RUN: %clang -target aarch64-unknown-openbsd -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-ALIGNED-AARCH64 < %t %s + // CHECK-ALIGNED-ARM: "-target-feature" "+strict-align" // CHECK-ALIGNED-AARCH64: "-target-feature" "+strict-align" From 28a1b6ea4db9e405d563ad36ca4b3bb1916f92ed Mon Sep 17 00:00:00 2001 From: Richard Barton Date: Tue, 18 Aug 2020 12:17:13 +0100 Subject: [PATCH 138/363] Revert "[flang] Version information in flang/f18" This reverts commit 844f018114b52325b36e1042c8a8fc0ea82d9c17. 
--- flang/test/Driver/version_test.f90 | 7 ++---- .../Preprocessing/compiler_defined_macros.F90 | 12 --------- flang/tools/f18/CMakeLists.txt | 1 - flang/tools/f18/f18.cpp | 25 +++---------------- flang/tools/f18/f18_version.h.in | 9 ------- 5 files changed, 6 insertions(+), 48 deletions(-) delete mode 100644 flang/test/Preprocessing/compiler_defined_macros.F90 delete mode 100644 flang/tools/f18/f18_version.h.in diff --git a/flang/test/Driver/version_test.f90 b/flang/test/Driver/version_test.f90 index 7fe229e2be17..08ea35ba49ea 100644 --- a/flang/test/Driver/version_test.f90 +++ b/flang/test/Driver/version_test.f90 @@ -1,10 +1,7 @@ ! Check that lit configuration works by checking the compiler version +! RUN: %f18 -V 2>&1 | FileCheck -check-prefix=VERSION %s ! VERSION-NOT:{{![[:space:]]}} ! VERSION:{{[[:space:]]}} -! VERSION-SAME:f18 compiler (under development), version {{[1-9][0-9]*.[0-9]*.[0-9]*}} +! VERSION-SAME:f18 compiler (under development) ! VERSION-EMPTY: - -! RUN: %f18 -V 2>&1 | FileCheck -check-prefix=VERSION %s -! RUN: %f18 -v 2>&1 | FileCheck -check-prefix=VERSION %s -! RUN: %f18 --version 2>&1 | FileCheck -check-prefix=VERSION %s diff --git a/flang/test/Preprocessing/compiler_defined_macros.F90 b/flang/test/Preprocessing/compiler_defined_macros.F90 deleted file mode 100644 index 80852cfb4472..000000000000 --- a/flang/test/Preprocessing/compiler_defined_macros.F90 +++ /dev/null @@ -1,12 +0,0 @@ -! 
Check that the macros that give the verion number are set properly - -!CHECK: flang_major = {{[1-9][0-9]*$}} -!CHECK: flang_minor = {{[0-9]+$}} -!CHECK: flang_patchlevel = {{[0-9]+$}} -!RUN: %f18 -E %s | FileCheck --ignore-case %s - - -integer, parameter :: flang_major = __flang_major__ -integer, parameter :: flang_minor = __flang_minor__ -integer, parameter :: flang_patchlevel = __flang_patchlevel__ - diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 3dfce3437948..46c38fa43a2e 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -64,6 +64,5 @@ file(COPY ${CMAKE_BINARY_DIR}/tools/flang/bin/flang DESTINATION ${CMAKE_BINARY_D # The flang script to be installed needs a different path to the headers. set(FLANG_INTRINSIC_MODULES_DIR ${CMAKE_INSTALL_PREFIX}/include/flang) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/flang.sh.in ${FLANG_BINARY_DIR}/bin/flang-install.sh @ONLY) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/f18_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/f18_version.h @ONLY) install(PROGRAMS ${FLANG_BINARY_DIR}/bin/flang-install.sh DESTINATION bin RENAME flang PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE) diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp index 23b104ee520c..03c0f7afe810 100644 --- a/flang/tools/f18/f18.cpp +++ b/flang/tools/f18/f18.cpp @@ -38,8 +38,6 @@ #include #include -#include "f18_version.h" - static std::list argList(int argc, char *const argv[]) { std::list result; for (int j = 0; j < argc; ++j) { @@ -392,13 +390,6 @@ void Link(std::vector &liblist, std::vector &objects, } } -int printVersion() { - llvm::errs() << "\nf18 compiler (under development), version " - << __FLANG_MAJOR__ << "." << __FLANG_MINOR__ << "." 
- << __FLANG_PATCHLEVEL__ << "\n"; - return exitStatus; -} - int main(int argc, char *const argv[]) { atexit(CleanUpAtExit); @@ -420,11 +411,6 @@ int main(int argc, char *const argv[]) { options.predefinitions.emplace_back("__F18_MAJOR__", "1"); options.predefinitions.emplace_back("__F18_MINOR__", "1"); options.predefinitions.emplace_back("__F18_PATCHLEVEL__", "1"); - options.predefinitions.emplace_back("__flang__", __FLANG__); - options.predefinitions.emplace_back("__flang_major__", __FLANG_MAJOR__); - options.predefinitions.emplace_back("__flang_minor__", __FLANG_MINOR__); - options.predefinitions.emplace_back( - "__flang_patchlevel__", __FLANG_PATCHLEVEL__); #if __x86_64__ options.predefinitions.emplace_back("__x86_64__", "1"); #endif @@ -665,16 +651,13 @@ int main(int argc, char *const argv[]) { << "Unrecognised options are passed through to the external compiler\n" << "set by F18_FC (see defaults).\n"; return exitStatus; - } else if (arg == "-V" || arg == "--version") { - return printVersion(); + } else if (arg == "-V") { + llvm::errs() << "\nf18 compiler (under development)\n"; + return exitStatus; } else { driver.F18_FCArgs.push_back(arg); if (arg == "-v") { - if (args.size() > 1) { - driver.verbose = true; - } else { - return printVersion(); - } + driver.verbose = true; } else if (arg == "-I") { driver.F18_FCArgs.push_back(args.front()); driver.searchDirectories.push_back(args.front()); diff --git a/flang/tools/f18/f18_version.h.in b/flang/tools/f18/f18_version.h.in deleted file mode 100644 index 0c8d5227cd00..000000000000 --- a/flang/tools/f18/f18_version.h.in +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _F18_H_ -#define _F18_H_ - -#define __FLANG__ "1" -#define __FLANG_MAJOR__ "@LLVM_VERSION_MAJOR@" -#define __FLANG_MINOR__ "@LLVM_VERSION_MINOR@" -#define __FLANG_PATCHLEVEL__ "@LLVM_VERSION_PATCH@" - -#endif // _F18_H_ From 5fc0afee3f09abeceff4064f8ed364c4c6369886 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 5 Aug 2020 15:17:27 -0700 Subject: [PATCH 
139/363] [X86] Add test case for PR47000. NFC (cherry picked from commit 13796d14238baabff972e15ceddb4ae61b1584b8) --- llvm/test/CodeGen/X86/pr47000.ll | 151 +++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100755 llvm/test/CodeGen/X86/pr47000.ll diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll new file mode 100755 index 000000000000..e6ddf3d97c19 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_rip +; RUN: llc < %s -mcpu=pentium4 -O0 | FileCheck %s + +target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-unknown" + +define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { +; CHECK-LABEL: doTheTestMod: +; CHECK: # %bb.0: # %Entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $124, %esp +; CHECK-NEXT: movl 144(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movw 176(%esp), %dx +; CHECK-NEXT: movw 172(%esp), %si +; CHECK-NEXT: movw 164(%esp), %di +; CHECK-NEXT: movw 166(%esp), %bx +; CHECK-NEXT: movw 160(%esp), %bp +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movw 156(%esp), %ax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw 148(%esp), %ax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw 150(%esp), %ax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload +; CHECK-NEXT: movw %ax, 112(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload +; CHECK-NEXT: movw %ax, 114(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload +; CHECK-NEXT: movw %ax, 116(%esp) +; CHECK-NEXT: movw %bp, 
118(%esp) +; CHECK-NEXT: movw %dx, 110(%esp) +; CHECK-NEXT: movw %si, 108(%esp) +; CHECK-NEXT: movw %bx, 106(%esp) +; CHECK-NEXT: movw %di, 104(%esp) +; CHECK-NEXT: movzwl 118(%esp), %edx +; CHECK-NEXT: movzwl 116(%esp), %esi +; CHECK-NEXT: movzwl 114(%esp), %edi +; CHECK-NEXT: movzwl 112(%esp), %ebx +; CHECK-NEXT: movzwl 110(%esp), %ebp +; CHECK-NEXT: movzwl 108(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl 106(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl 104(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movl %ebx, (%eax) +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll fmodf +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll __gnu_f2h_ieee +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: fstpt 
{{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll fmodf +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll __gnu_f2h_ieee +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll fmodf +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll __gnu_f2h_ieee +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-NEXT: calll __gnu_h2f_ieee +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll fmodf +; CHECK-NEXT: movl %esp, %eax +; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: calll __gnu_f2h_ieee +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movw %ax, 6(%ecx) +; CHECK-NEXT: movw 
{{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload +; CHECK-NEXT: movw %ax, 4(%ecx) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload +; CHECK-NEXT: movw %dx, 2(%ecx) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %si # 2-byte Reload +; CHECK-NEXT: movw %si, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: addl $124, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl $4 +Entry: + %x = alloca <4 x half>, align 8 + %y = alloca <4 x half>, align 8 + store <4 x half> %0, <4 x half>* %x, align 8 + store <4 x half> %1, <4 x half>* %y, align 8 + %2 = load <4 x half>, <4 x half>* %x, align 8 + %3 = load <4 x half>, <4 x half>* %y, align 8 + %4 = frem <4 x half> %2, %3 + ret <4 x half> %4 +} + From 6fed1b7bcb50743d9aad54a08276c9c179d86487 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 5 Aug 2020 15:35:16 -0700 Subject: [PATCH 140/363] [X86] Disable copy elision in LowerMemArgument for scalarized vectors when the loc VT is a different size than the original element. For example a v4f16 argument is scalarized to 4 i32 values. So the values are spread out instead of being packed tightly like in the original vector. Fixes PR47000. 
(cherry picked from commit 08b2d0a963dbbf54317a137d69f430b347d1bfae) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 14 ++++++++++++-- llvm/test/CodeGen/X86/pr47000.ll | 14 ++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 86aa85e965f6..1671917157f4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3208,13 +3208,23 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, return DAG.getFrameIndex(FI, PtrVT); } + EVT ArgVT = Ins[i].ArgVT; + + // If this is a vector that has been split into multiple parts, and the + // scalar size of the parts don't match the vector element size, then we can't + // elide the copy. The parts will have padding between them instead of being + // packed like a vector. + bool ScalarizedAndExtendedVector = + ArgVT.isVector() && !VA.getLocVT().isVector() && + VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits(); + // This is an argument in memory. We might be able to perform copy elision. // If the argument is passed directly in memory without any extension, then we // can perform copy elision. Large vector types, for example, may be passed // indirectly by pointer. 
if (Flags.isCopyElisionCandidate() && - VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) { - EVT ArgVT = Ins[i].ArgVT; + VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem && + !ScalarizedAndExtendedVector) { SDValue PartAddr; if (Ins[i].PartOffset == 0) { // If this is a one-part value or the first part of a multi-part value, diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll index e6ddf3d97c19..083aa780a07c 100755 --- a/llvm/test/CodeGen/X86/pr47000.ll +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -16,17 +16,15 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movw 176(%esp), %dx ; CHECK-NEXT: movw 172(%esp), %si -; CHECK-NEXT: movw 164(%esp), %di -; CHECK-NEXT: movw 166(%esp), %bx +; CHECK-NEXT: movw 168(%esp), %di +; CHECK-NEXT: movw 164(%esp), %bx ; CHECK-NEXT: movw 160(%esp), %bp ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movw 156(%esp), %ax ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw 148(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw 150(%esp), %ax +; CHECK-NEXT: movw 152(%esp), %ax ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload +; CHECK-NEXT: movw 148(%esp), %ax ; CHECK-NEXT: movw %ax, 112(%esp) ; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload ; CHECK-NEXT: movw %ax, 114(%esp) @@ -35,8 +33,8 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: movw %bp, 118(%esp) ; CHECK-NEXT: movw %dx, 110(%esp) ; CHECK-NEXT: movw %si, 108(%esp) -; CHECK-NEXT: movw %bx, 106(%esp) -; CHECK-NEXT: movw %di, 104(%esp) +; CHECK-NEXT: movw %di, 106(%esp) +; CHECK-NEXT: movw %bx, 104(%esp) ; CHECK-NEXT: movzwl 118(%esp), %edx ; CHECK-NEXT: movzwl 116(%esp), %esi ; CHECK-NEXT: movzwl 114(%esp), %edi From 
7e6bf0bfe6de9e0d0e58764a66f93210f296bbfa Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Wed, 12 Aug 2020 19:53:07 +0000 Subject: [PATCH 141/363] [release][docs] Update contributions to LLVM 11 for SVE. Differential Revision: https://reviews.llvm.org/D85977 --- llvm/docs/ReleaseNotes.rst | 64 +++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index c9ac61d29676..612a5417df95 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -66,7 +66,9 @@ Changes to the LLVM IR added to describe the mapping between scalar functions and vector functions, to enable vectorization of call sites. The information provided by the attribute is interfaced via the API provided by the - ``VFDatabase`` class. + ``VFDatabase`` class. When scanning through the set of vector + functions associated with a scalar call, the loop vectorizer now + relies on ``VFDatabase``, instead of ``TargetLibraryInfo``. * `dereferenceable` attributes and metadata on pointers no longer imply anything about the alignment of the pointer in question. Previously, some @@ -78,6 +80,17 @@ Changes to the LLVM IR information. This information is used to represent Fortran modules debug info at IR level. +* LLVM IR now supports two distinct ``llvm::FixedVectorType`` and + ``llvm::ScalableVectorType`` vector types, both derived from the + base class ``llvm::VectorType``. A number of algorithms dealing with + IR vector types have been updated to make sure they work for both + scalable and fixed vector types. Where possible, the code has been + made generic to cover both cases using the base class. Specifically, + places that were using the type ``unsigned`` to count the number of + lanes of a vector are now using ``llvm::ElementCount``. In places + where ``uint64_t`` was used to denote the size in bits of a IR type + we have partially migrated the codebase to using ``llvm::TypeSize``. 
+ Changes to building LLVM ------------------------ @@ -110,6 +123,55 @@ During this release ... default may wish to specify ``-fno-omit-frame-pointer`` to get the old behavior. This improves compatibility with GCC. +* Clang adds support for the following macros that enable the + C-intrinsics from the `Arm C language extensions for SVE + `_ (version + ``00bet5``, see section 2.1 for the list of intrinsics associated to + each macro): + + + ================================= ================= + Preprocessor macro Target feature + ================================= ================= + ``__ARM_FEATURE_SVE`` ``+sve`` + ``__ARM_FEATURE_SVE_BF16`` ``+sve+bf16`` + ``__ARM_FEATURE_SVE_MATMUL_FP32`` ``+sve+f32mm`` + ``__ARM_FEATURE_SVE_MATMUL_FP64`` ``+sve+f64mm`` + ``__ARM_FEATURE_SVE_MATMUL_INT8`` ``+sve+i8mm`` + ``__ARM_FEATURE_SVE2`` ``+sve2`` + ``__ARM_FEATURE_SVE2_AES`` ``+sve2-aes`` + ``__ARM_FEATURE_SVE2_BITPERM`` ``+sve2-bitperm`` + ``__ARM_FEATURE_SVE2_SHA3`` ``+sve2-sha3`` + ``__ARM_FEATURE_SVE2_SM4`` ``+sve2-sm4`` + ================================= ================= + + The macros enable users to write C/C++ `Vector Length Agnostic + (VLA)` loops, that can be executed on any CPU that implements the + underlying instructions supported by the C intrinsics, independently + of the hardware vector register size. + + For example, the ``__ARM_FEATURE_SVE`` macro is enabled when + targeting AArch64 code generation by setting ``-march=armv8-a+sve`` + on the command line. + + .. code-block:: c + :caption: Example of VLA addition of two arrays with SVE ACLE. 
+ + // Compile with: + // `clang++ -march=armv8a+sve ...` (for c++) + // `clang -stc=c11 -march=armv8a+sve ...` (for c) + #include + + void VLA_add_arrays(double *x, double *y, double *out, unsigned N) { + for (unsigned i = 0; i < N; i += svcntd()) { + svbool_t Pg = svwhilelt_b64(i, N); + svfloat64_t vx = svld1(Pg, &x[i]); + svfloat64_t vy = svld1(Pg, &y[i]); + svfloat64_t vout = svadd_x(Pg, vx, vy); + svst1(Pg, &out[i], vout); + } + } + Changes to the MIPS Target -------------------------- From cb7f903994646c5b9223e0bb6cee3792190991f7 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sat, 8 Aug 2020 17:51:19 -0400 Subject: [PATCH 142/363] Hook up OpenBSD 64-bit PowerPC support (cherry picked from commit 4eb4ebf76a6e26b0632968dd299d1dc6ad07e694) --- clang/lib/Basic/Targets.cpp | 4 ++++ clang/lib/Basic/Targets/OSTargets.h | 2 ++ clang/lib/Basic/Targets/PPC.h | 4 ++-- clang/lib/Driver/ToolChains/Clang.cpp | 4 ++-- clang/test/Driver/ppc-abi.c | 1 + clang/test/Preprocessor/init-ppc64.c | 1 + clang/test/Preprocessor/init.c | 2 ++ 7 files changed, 14 insertions(+), 4 deletions(-) diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 206fb9b3f1a2..e4456ea7fa0f 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -346,6 +346,8 @@ TargetInfo *AllocateTarget(const llvm::Triple &Triple, return new FreeBSDTargetInfo(Triple, Opts); case llvm::Triple::NetBSD: return new NetBSDTargetInfo(Triple, Opts); + case llvm::Triple::OpenBSD: + return new OpenBSDTargetInfo(Triple, Opts); case llvm::Triple::AIX: return new AIXPPC64TargetInfo(Triple, Opts); default: @@ -358,6 +360,8 @@ TargetInfo *AllocateTarget(const llvm::Triple &Triple, return new LinuxTargetInfo(Triple, Opts); case llvm::Triple::NetBSD: return new NetBSDTargetInfo(Triple, Opts); + case llvm::Triple::OpenBSD: + return new OpenBSDTargetInfo(Triple, Opts); default: return new PPC64TargetInfo(Triple, Opts); } diff --git a/clang/lib/Basic/Targets/OSTargets.h 
b/clang/lib/Basic/Targets/OSTargets.h index a885aa96cc76..8d56f561ba9f 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -478,6 +478,8 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo { case llvm::Triple::mips64: case llvm::Triple::mips64el: case llvm::Triple::ppc: + case llvm::Triple::ppc64: + case llvm::Triple::ppc64le: case llvm::Triple::sparcv9: this->MCountName = "_mcount"; break; diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 858059bacb86..bda6cb7dc89f 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -414,8 +414,8 @@ class LLVM_LIBRARY_VISIBILITY PPC64TargetInfo : public PPCTargetInfo { ABI = "elfv1"; } - if (Triple.isOSFreeBSD() || Triple.getOS() == llvm::Triple::AIX || - Triple.isMusl()) { + if (Triple.isOSFreeBSD() || Triple.isOSOpenBSD() || + Triple.getOS() == llvm::Triple::AIX || Triple.isMusl()) { LongDoubleWidth = LongDoubleAlign = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 828606bbd0f0..c4b25ba3713c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1879,8 +1879,8 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, ABIName = "elfv1-qpx"; break; } - - if (T.isMusl() || (T.isOSFreeBSD() && T.getOSMajorVersion() >= 13)) + if ((T.isOSFreeBSD() && T.getOSMajorVersion() >= 13) || + T.isOSOpenBSD() || T.isMusl() ABIName = "elfv2"; else ABIName = "elfv1"; diff --git a/clang/test/Driver/ppc-abi.c b/clang/test/Driver/ppc-abi.c index acc4981a2eee..8508e818f34c 100644 --- a/clang/test/Driver/ppc-abi.c +++ b/clang/test/Driver/ppc-abi.c @@ -28,6 +28,7 @@ // RUN: %clang -target powerpc64-unknown-freebsd12 %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv1 %s // RUN: %clang -target powerpc64-unknown-freebsd13 %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE %s // RUN: 
%clang -target powerpc64-unknown-freebsd14 %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE %s +// RUN: %clang -target powerpc64-unknown-openbsd %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE-PIE % // RUN: %clang -target powerpc64-linux-musl %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE-PIE %s // CHECK-ELFv1: "-mrelocation-model" "static" diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c index ed8601636554..edaf794db2a8 100644 --- a/clang/test/Preprocessor/init-ppc64.c +++ b/clang/test/Preprocessor/init-ppc64.c @@ -1078,6 +1078,7 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd11 -target-abi elfv1 -xc /dev/null | FileCheck --check-prefix=PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd12 -target-abi elfv1 -xc /dev/null | FileCheck --check-prefix=PPC64-ELFv1 %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd13 -target-abi elfv2 -xc /dev/null | FileCheck --check-prefix=PPC64-ELFv2 %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-openbsd -target-abi elfv2 -xc /dev/null | FileCheck --check-prefix=PPC64-ELFv2 %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-linux-musl -target-abi elfv2 -xc /dev/null | FileCheck --check-prefix=PPC64-ELFv2 %s // PPC64-ELFv1:#define _CALL_ELF 1 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index d6f3225bd04b..9e085a8f9fe8 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -7321,6 +7321,8 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-unknown-openbsd6.1-gnueabi < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-openbsd6.1 < /dev/null | FileCheck 
-match-full-lines -check-prefix OPENBSD %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s From f2a53ad5f953889fbc5c027ab17c54b5b8947db2 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sat, 8 Aug 2020 17:58:13 -0400 Subject: [PATCH 143/363] fix typo (cherry picked from commit 430db35bf21505015c618e292e98793e2ed49169) --- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index c4b25ba3713c..c77ae5a44a0e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1880,7 +1880,7 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, break; } if ((T.isOSFreeBSD() && T.getOSMajorVersion() >= 13) || - T.isOSOpenBSD() || T.isMusl() + T.isOSOpenBSD() || T.isMusl()) ABIName = "elfv2"; else ABIName = "elfv1"; From ad51ff4854e5b32b310f352077039e10df425eab Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sat, 8 Aug 2020 18:39:43 -0400 Subject: [PATCH 144/363] Backout a test that is dependent on an uncommited diff. Fix another. 
(cherry picked from commit f4aba9d76c61cc4c87b45e4edb57b1968eb7194c) --- clang/test/Driver/ppc-abi.c | 2 +- clang/test/Preprocessor/init.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/test/Driver/ppc-abi.c b/clang/test/Driver/ppc-abi.c index 8508e818f34c..aef8d8576ada 100644 --- a/clang/test/Driver/ppc-abi.c +++ b/clang/test/Driver/ppc-abi.c @@ -28,7 +28,7 @@ // RUN: %clang -target powerpc64-unknown-freebsd12 %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv1 %s // RUN: %clang -target powerpc64-unknown-freebsd13 %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE %s // RUN: %clang -target powerpc64-unknown-freebsd14 %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE %s -// RUN: %clang -target powerpc64-unknown-openbsd %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE-PIE % +// RUN: %clang -target powerpc64-unknown-openbsd %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE-PIE %s // RUN: %clang -target powerpc64-linux-musl %s -### 2>&1 | FileCheck --check-prefix=CHECK-ELFv2-BE-PIE %s // CHECK-ELFv1: "-mrelocation-model" "static" diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 9e085a8f9fe8..d6f3225bd04b 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -7321,8 +7321,6 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-unknown-openbsd6.1-gnueabi < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-openbsd6.1 < /dev/null | FileCheck 
-match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s From f2b2668267edd599f5e354ebdb98c0d96e06d91b Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Sun, 9 Aug 2020 20:52:43 -0400 Subject: [PATCH 145/363] Re-enable OpenBSD PowerPC64 tests. (cherry picked from commit f5fdb6141c5e7a76a10ea702d6fc046692827c43) --- clang/test/Preprocessor/init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index d6f3225bd04b..9e085a8f9fe8 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -7321,6 +7321,8 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-unknown-openbsd6.1-gnueabi < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines 
-check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s From 33c13cd8c5772871ec79f0b061e7f7a2fb287383 Mon Sep 17 00:00:00 2001 From: Sterling Augustine Date: Tue, 18 Aug 2020 12:05:07 -0700 Subject: [PATCH 146/363] Default to disabling the libunwind frameheader cache. Although it works fine with glibc, as currently implemented the frameheader cache is incompatible with certain platforms with slightly different locking semantics inside dl_iterate_phdr. Therefore only enable it when it is turned on explicitly with a configure-time option. Differential Revision: https://reviews.llvm.org/D86163 (cherry picked from commit a20f5fe70810e0a768c1814d69d10862965c21e4) --- libunwind/CMakeLists.txt | 5 +++++ libunwind/src/AddressSpace.hpp | 6 ++++++ libunwind/test/frameheadercache_test.pass.cpp | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index 02c130ad1bd5..bd8176c67925 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -137,6 +137,7 @@ option(LIBUNWIND_ENABLE_THREADS "Build libunwind with threading support." ON) option(LIBUNWIND_WEAK_PTHREAD_LIB "Use weak references to refer to pthread functions." OFF) option(LIBUNWIND_USE_COMPILER_RT "Use compiler-rt instead of libgcc" OFF) option(LIBUNWIND_INCLUDE_DOCS "Build the libunwind documentation." ${LLVM_INCLUDE_DOCS}) +option(LIBUNWIND_USE_FRAME_HEADER_CACHE "Cache frame headers for unwinding. Requires locking dl_iterate_phdr." OFF) set(LIBUNWIND_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING "Define suffix of library directory name (32/64)") @@ -365,6 +366,10 @@ if (LIBUNWIND_ENABLE_ARM_WMMX) add_compile_flags(-D__ARM_WMMX) endif() +if(LIBUNWIND_USE_FRAME_HEADER_CACHE) + add_compile_definitions(_LIBUNWIND_USE_FRAME_HEADER_CACHE) +endif() + # This is the _ONLY_ place where add_definitions is called. 
if (MSVC) add_definitions(-D_CRT_SECURE_NO_WARNINGS) diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp index a4564cb67328..e40c23291f84 100644 --- a/libunwind/src/AddressSpace.hpp +++ b/libunwind/src/AddressSpace.hpp @@ -452,10 +452,12 @@ struct _LIBUNWIND_HIDDEN dl_iterate_cb_data { #error "_LIBUNWIND_SUPPORT_DWARF_UNWIND requires _LIBUNWIND_SUPPORT_DWARF_INDEX on this platform." #endif +#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) #include "FrameHeaderCache.hpp" // There should be just one of these per process. static FrameHeaderCache ProcessFrameHeaderCache; +#endif static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base, dl_iterate_cb_data *cbdata) { @@ -476,8 +478,10 @@ int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t pinfo_size, auto cbdata = static_cast(data); if (pinfo->dlpi_phnum == 0 || cbdata->targetAddr < pinfo->dlpi_addr) return 0; +#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) if (ProcessFrameHeaderCache.find(pinfo, pinfo_size, data)) return 1; +#endif Elf_Addr image_base = calculateImageBase(pinfo); bool found_obj = false; @@ -505,7 +509,9 @@ int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t pinfo_size, found_obj = checkAddrInSegment(phdr, image_base, cbdata); } if (found_obj && found_hdr) { +#if defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) ProcessFrameHeaderCache.add(cbdata->sects); +#endif return 1; } } diff --git a/libunwind/test/frameheadercache_test.pass.cpp b/libunwind/test/frameheadercache_test.pass.cpp index 9397e70d66cb..ebbc00464e07 100644 --- a/libunwind/test/frameheadercache_test.pass.cpp +++ b/libunwind/test/frameheadercache_test.pass.cpp @@ -6,7 +6,7 @@ // The frame header cache should work fine for other architectures, // but the #ifdefs end up being even more complicated than this. -#ifdef __x86_64__ +#if defined(__x86_64__) && defined(_LIBUNWIND_USE_FRAME_HEADER_CACHE) // This #if chain is ugly, but see the comments in AddressSpace.hpp for // the reasoning. 
From 9b0e9ed0ac5f9047538106c55c84082f12a1945c Mon Sep 17 00:00:00 2001 From: Amy Huang Date: Tue, 11 Aug 2020 14:50:56 -0700 Subject: [PATCH 147/363] [globalopt] Change so that emitting fragments doesn't use the type size of DIVariables When turning on -debug-info-kind=constructor we ran into a "fragment covers entire variable" error during thinlto. The fragment is currently always emitted if there is no type size, but sometimes the variable has a forward declared struct type which doesn't have a size. This changes the code to get the type size from the GlobalVariable instead. Differential Revision: https://reviews.llvm.org/D85572 (cherry picked from commit 54b6cca0f28484395ae43bcda4c9f929bc51cfe3) --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 14 ++--- .../Generic/global-sra-struct-fwd-decl.ll | 63 +++++++++++++++++++ 2 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 llvm/test/DebugInfo/Generic/global-sra-struct-fwd-decl.ll diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index d9fb820f7cb5..9524d9a36204 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -468,19 +468,16 @@ static bool CanDoGlobalSRA(GlobalVariable *GV) { /// Copy over the debug info for a variable to its SRA replacements. static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, uint64_t FragmentOffsetInBits, - uint64_t FragmentSizeInBits) { + uint64_t FragmentSizeInBits, + uint64_t VarSize) { SmallVector GVs; GV->getDebugInfo(GVs); for (auto *GVE : GVs) { DIVariable *Var = GVE->getVariable(); - Optional VarSize = Var->getSizeInBits(); - DIExpression *Expr = GVE->getExpression(); // If the FragmentSize is smaller than the variable, // emit a fragment expression. - // If the variable size is unknown a fragment must be - // emitted to be safe. 
- if (!VarSize || FragmentSizeInBits < *VarSize) { + if (FragmentSizeInBits < VarSize) { if (auto E = DIExpression::createFragmentExpression( Expr, FragmentOffsetInBits, FragmentSizeInBits)) Expr = *E; @@ -505,6 +502,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { assert(GV->hasLocalLinkage()); Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); + uint64_t VarSize = DL.getTypeSizeInBits(Ty); std::map NewGlobals; @@ -560,7 +558,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Copy over the debug info for the variable. uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType()); uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx); - transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size); + transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize); } else { uint64_t EltSize = DL.getTypeAllocSize(ElTy); Align EltAlign = DL.getABITypeAlign(ElTy); @@ -573,7 +571,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx, - FragmentSizeInBits); + FragmentSizeInBits, VarSize); } } diff --git a/llvm/test/DebugInfo/Generic/global-sra-struct-fwd-decl.ll b/llvm/test/DebugInfo/Generic/global-sra-struct-fwd-decl.ll new file mode 100644 index 000000000000..caef2dd4ef78 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/global-sra-struct-fwd-decl.ll @@ -0,0 +1,63 @@ +; RUN: opt -S -globalopt < %s | FileCheck %s +; Generated at -O2 -g from: +; typedef struct {} a; +; static struct { +; long b; +; a c; +; } d; +; e() { +; long f = d.b + 1; +; d.b = f; +; } +; (IR is modified so that d's struct type is forward declared.) + +; Check that the global variable "d" is not +; emitted as a fragment if its struct type is +; forward declared but d.c has zero length, so +; a fragment shouldn't be emitted. 
+ +source_filename = "t.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon = type { i64, %struct.a } +%struct.a = type {} + +; CHECK: @d.0 = internal unnamed_addr global i64 0, align 8, !dbg ![[GVE:.*]] +@d = internal global %struct.anon zeroinitializer, align 8, !dbg !0 + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @e() #0 !dbg !18 { +entry: + %0 = load i64, i64* getelementptr inbounds (%struct.anon, %struct.anon* @d, i32 0, i32 0), align 8 + %add = add nsw i64 %0, 1 + call void @llvm.dbg.value(metadata i64 %add, metadata !24, metadata !DIExpression()), !dbg !25 + store i64 %add, i64* getelementptr inbounds (%struct.anon, %struct.anon* @d, i32 0, i32 0), align 8 + ret i32 undef +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!14, !15} + +; CHECK: ![[GVE]] = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "d", scope: !2, file: !3, line: 6, type: !7, isLocal: true, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !{}, globals: !{!0}, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "t.c", directory: "/") +!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !3, line: 3, flags: DIFlagFwdDecl) +!10 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) +!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "a", file: !3, line: 2, baseType: !13) +!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !3, line: 1, elements: 
!{}) +!14 = !{i32 7, !"Dwarf Version", i32 4} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!18 = distinct !DISubprogram(name: "e", scope: !3, file: !3, line: 7, type: !19, scopeLine: 7, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !{}) +!19 = !DISubroutineType(types: !{!21}) +!21 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!24 = !DILocalVariable(name: "f", scope: !18, file: !3, line: 8, type: !10) +!25 = !DILocation(line: 0, scope: !18) From a2fa88a05f40d8ba96e96b48e0f2fcd091420e54 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Tue, 18 Aug 2020 19:56:19 -0400 Subject: [PATCH 148/363] WCharType and WIntType are always signed int on OpenBSD. (cherry picked from commit d9ff48d03817f83d4059b610a776c797308de2e5) --- clang/lib/Basic/Targets/OSTargets.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index 8d56f561ba9f..2a9e4f91d478 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -465,6 +465,7 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo { public: OpenBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : OSTargetInfo(Triple, Opts) { + this->WCharType = this->WIntType = this->SignedInt; this->IntMaxType = TargetInfo::SignedLongLong; this->Int64Type = TargetInfo::SignedLongLong; switch (Triple.getArch()) { From 709830a7538fb7ad339d75474dc1da500bde0d12 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Tue, 18 Aug 2020 18:59:55 -0400 Subject: [PATCH 149/363] Hook up OpenBSD 64-bit RISC-V support (cherry picked from commit 592b8996bf9b55eec21e1c9e563f51b6108ec2d2) --- clang/lib/Basic/Targets.cpp | 2 ++ clang/test/Preprocessor/init.c | 1 + 2 files changed, 3 insertions(+) diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index e4456ea7fa0f..818133f66f3f 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -391,6 +391,8 @@ TargetInfo 
*AllocateTarget(const llvm::Triple &Triple, switch (os) { case llvm::Triple::FreeBSD: return new FreeBSDTargetInfo(Triple, Opts); + case llvm::Triple::OpenBSD: + return new OpenBSDTargetInfo(Triple, Opts); case llvm::Triple::Fuchsia: return new FuchsiaTargetInfo(Triple, Opts); case llvm::Triple::Linux: diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 9e085a8f9fe8..3cc36302aa0e 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -7326,6 +7326,7 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=riscv64-unknown-openbsd6.1 < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD %s // OPENBSD:#define __ELF__ 1 // OPENBSD:#define __INT16_TYPE__ short // OPENBSD:#define __INT32_TYPE__ int From 1a14ce26e006f1cfd438943308d0db43fb80efd4 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Mon, 17 Aug 2020 12:25:45 +0100 Subject: [PATCH 150/363] [RISCV] Indirect branch generation in position independent code This fixes the "Unable to insert indirect branch" fatal error sometimes seen when generating position-independent code. 
Patch by msizanoen1 Reviewed By: jrtc27 Differential Revision: https://reviews.llvm.org/D84833 (cherry picked from commit 5f9ecc5d857fa5d95f6ea36153be19db40576f8a) --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 21 +++++++++----------- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 4 ++-- llvm/test/CodeGen/RISCV/branch-relaxation.ll | 8 +++++--- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index d39ec505127c..7b6ea002c7b7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -279,7 +279,7 @@ bool RISCVInstrInfo::analyzeBranch(MachineBasicBlock &MBB, // Handle a single unconditional branch. if (NumTerminators == 1 && I->getDesc().isUnconditionalBranch()) { - TBB = I->getOperand(0).getMBB(); + TBB = getBranchDestBlock(*I); return false; } @@ -293,7 +293,7 @@ bool RISCVInstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (NumTerminators == 2 && std::prev(I)->getDesc().isConditionalBranch() && I->getDesc().isUnconditionalBranch()) { parseCondBranch(*std::prev(I), TBB, Cond); - FBB = I->getOperand(0).getMBB(); + FBB = getBranchDestBlock(*I); return false; } @@ -384,10 +384,6 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - const auto &TM = static_cast(MF->getTarget()); - - if (TM.isPositionIndependent()) - report_fatal_error("Unable to insert indirect branch"); if (!isInt<32>(BrOffset)) report_fatal_error( @@ -399,15 +395,13 @@ unsigned RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); auto II = MBB.end(); - MachineInstr &LuiMI = *BuildMI(MBB, II, DL, get(RISCV::LUI), ScratchReg) - .addMBB(&DestBB, RISCVII::MO_HI); - BuildMI(MBB, II, DL, get(RISCV::PseudoBRIND)) - .addReg(ScratchReg, RegState::Kill) - .addMBB(&DestBB, RISCVII::MO_LO); + 
MachineInstr &MI = *BuildMI(MBB, II, DL, get(RISCV::PseudoJump)) + .addReg(ScratchReg, RegState::Define | RegState::Dead) + .addMBB(&DestBB, RISCVII::MO_CALL); RS->enterBasicBlockEnd(MBB); unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass, - LuiMI.getIterator(), false, 0); + MI.getIterator(), false, 0); MRI.replaceRegWith(ScratchReg, Scav); MRI.clearVirtRegs(); RS->setRegUsed(Scav); @@ -431,6 +425,7 @@ RISCVInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, int64_t BrOffset) const { + unsigned XLen = STI.getXLen(); // Ideally we could determine the supported branch offset from the // RISCVII::FormMask, but this can't be used for Pseudo instructions like // PseudoBR. @@ -447,6 +442,8 @@ bool RISCVInstrInfo::isBranchOffsetInRange(unsigned BranchOp, case RISCV::JAL: case RISCV::PseudoBR: return isIntN(21, BrOffset); + case RISCV::PseudoJump: + return isIntN(32, SignExtend64(BrOffset + 0x800, XLen)); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index b9483062ddeb..8547f791092b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1012,8 +1012,8 @@ def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)), def : Pat<(riscv_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; -let isCall = 0, isBarrier = 0, isCodeGenOnly = 0, hasSideEffects = 0, - mayStore = 0, mayLoad = 0 in +let isCall = 0, isBarrier = 1, isBranch = 1, isTerminator = 1, + isCodeGenOnly = 0, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []> { let AsmString = "jump\t$target, $rd"; } diff --git a/llvm/test/CodeGen/RISCV/branch-relaxation.ll b/llvm/test/CodeGen/RISCV/branch-relaxation.ll index 3d617bf0b26b..5925e17ae407 100644 --- a/llvm/test/CodeGen/RISCV/branch-relaxation.ll +++ b/llvm/test/CodeGen/RISCV/branch-relaxation.ll @@ -1,7 +1,11 
@@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -filetype=obj < %s \ ; RUN: -o /dev/null 2>&1 +; RUN: llc -mtriple=riscv32 -relocation-model=pic -verify-machineinstrs \ +; RUN: -filetype=obj < %s -o /dev/null 2>&1 ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -relocation-model=pic -verify-machineinstrs < %s \ +; RUN: | FileCheck %s define void @relax_bcc(i1 %a) nounwind { ; CHECK-LABEL: relax_bcc: @@ -25,15 +29,13 @@ tail: ret void } -; TODO: Extend simm12's MCOperandPredicate so the jalr zero is printed as a jr. define i32 @relax_jal(i1 %a) nounwind { ; CHECK-LABEL: relax_jal: ; CHECK: # %bb.0: ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: bnez a0, .LBB1_1 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: lui a0, %hi(.LBB1_2) -; CHECK-NEXT: jalr zero, %lo(.LBB1_2)(a0) +; CHECK-NEXT: jump .LBB1_2, a0 ; CHECK-NEXT: .LBB1_1: # %iftrue ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP From 7bcb12aef366681c78559b4ce4b89fc5dea6eb90 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Tue, 11 Aug 2020 00:04:24 -0400 Subject: [PATCH 151/363] [Sparc] Define __GCC_HAVE_SYNC_COMPARE_AND_SWAP macros on SPARCv9 (cherry picked from commit 5fe171321c018a811debc306a776dbdf27a306dd) --- clang/lib/Basic/Targets/Sparc.cpp | 5 +++++ clang/test/Preprocessor/predefined-arch-macros.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/clang/lib/Basic/Targets/Sparc.cpp b/clang/lib/Basic/Targets/Sparc.cpp index 13aa964d4716..48f36c5ba1c6 100644 --- a/clang/lib/Basic/Targets/Sparc.cpp +++ b/clang/lib/Basic/Targets/Sparc.cpp @@ -240,6 +240,11 @@ void SparcV9TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__sparc_v9__"); Builder.defineMacro("__sparcv9__"); } + + Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1"); + Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2"); + Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4"); + 
Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8"); } void SparcV9TargetInfo::fillValidCPUList( diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index e457a0479b33..abab9274ffbb 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -3174,6 +3174,14 @@ // CHECK_SPARCV9: #define __sparcv9 1 // CHECK_SPARCV9: #define __sparcv9__ 1 +// RUN: %clang -E -dM %s -o - 2>&1 \ +// RUN: -target sparcv9-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARCV9_GCC_ATOMICS +// CHECK_SPARCV9_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 +// CHECK_SPARCV9_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 +// CHECK_SPARCV9_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 +// CHECK_SPARCV9_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 + // Begin SystemZ/GCC/Linux tests ---------------- // RUN: %clang -march=arch8 -E -dM %s -o - 2>&1 \ From b0b18ec9e8a7e7c7ca6e01d1a6f636622fa59941 Mon Sep 17 00:00:00 2001 From: sameeran joshi Date: Fri, 7 Aug 2020 01:03:54 +0530 Subject: [PATCH 152/363] [Flang] Fix release blocker issue #46931 related to documentation. Summary: Fixes bug : https://bugs.llvm.org/show_bug.cgi?id=46931 This commit add a new flag -DLLVM_ENABLE_SPHINX=ON to cmake command to generate sphinx documentation, along with new cmake targets `docs-flang-html`. `ninja docs-flang-html` - generates sphinx documentation. Generated release notes are present in /tools/flang/docs/html/ folder. 
Reviewers: richard.barton.arm, DavidTruby Tags: #flang Differential Revision: https://reviews.llvm.org/D85470 (cherry picked from commit 2fc86ccdd25309b9f77739aaed4a7b06f1c6f111) --- flang/docs/ReleaseNotes.rst | 96 +++++ flang/docs/_static/llvm.css | 112 ++++++ flang/docs/_themes/llvm-theme/layout.html | 23 ++ .../_themes/llvm-theme/static/contents.png | Bin 0 -> 202 bytes .../_themes/llvm-theme/static/llvm-theme.css | 371 ++++++++++++++++++ flang/docs/_themes/llvm-theme/static/logo.png | Bin 0 -> 9864 bytes .../_themes/llvm-theme/static/navigation.png | Bin 0 -> 218 bytes flang/docs/_themes/llvm-theme/theme.conf | 4 + flang/docs/conf.py | 252 ++++++++++++ 9 files changed, 858 insertions(+) create mode 100644 flang/docs/ReleaseNotes.rst create mode 100644 flang/docs/_static/llvm.css create mode 100644 flang/docs/_themes/llvm-theme/layout.html create mode 100644 flang/docs/_themes/llvm-theme/static/contents.png create mode 100644 flang/docs/_themes/llvm-theme/static/llvm-theme.css create mode 100644 flang/docs/_themes/llvm-theme/static/logo.png create mode 100644 flang/docs/_themes/llvm-theme/static/navigation.png create mode 100644 flang/docs/_themes/llvm-theme/theme.conf create mode 100644 flang/docs/conf.py diff --git a/flang/docs/ReleaseNotes.rst b/flang/docs/ReleaseNotes.rst new file mode 100644 index 000000000000..bbc7377412d6 --- /dev/null +++ b/flang/docs/ReleaseNotes.rst @@ -0,0 +1,96 @@ +======================================== +Flang 11.0.0 (In-Progress) Release Notes +======================================== + +.. contents:: + :local: + :depth: 2 + +.. warning:: + + These are in-progress notes for the upcoming LLVM 11.0.0 release. + Release notes for previous releases can be found on + `the Download Page `_. + +Introduction +============ + +This document contains the release notes for the Flang Fortran +frontend, part of the LLVM Compiler Infrastructure, release 11.0.0. 
Here we +describe the status of Flang in some detail, including major +improvements from the previous release and new feature work. For the +general LLVM release notes, see `the LLVM +documentation `_. All LLVM +releases may be downloaded from the `LLVM releases web +site `_. + +Note that if you are reading this file from a Git checkout, this document +applies to the *next* release, not +the current one. To see the release notes for a specific release, please +see the `releases page `_. + +Known Issues +============ + +These are issues that couldn't be fixed before the release. See the bug reports for the latest status. + +- ... + +Introducing Flang +================= + +Flang is LLVM's Fortran front end and is new for the LLVM 11 release. + +Flang is still a work in progress for this release and is included for +experimentation and feedback. + +Flang status +------------ + +Flang is able to parse a comprehensive subset of the Fortran language +and check it for correctness. Flang is not yet able to generate LLVM IR for +the source code and thus is unable to compile a running binary. + +Flang is able to unparse the input source code into a canonical form and emit +it to allow testing. Flang can also invoke an external Fortran compiler on this +canonical input. + +Flang's parser has comprehensive support for: +- Fortran 2018 +- OpenMP 4.5 +- OpenACC 3.0 + +Major missing features +---------------------- + +- Flang is not supported on Windows platforms. + +Using Flang +=========== + +Usage: ``flang hello.f90 -o hello.bin`` + +Flang will parse the Fortran file ``hello.f90`` then unparse it to a canonical +Fortran source file. Flang will then invoke an external Fortran compiler to +compile this source file and link it, placing the resulting executable +in ``hello.bin``. + +To specify the external Fortran compiler, set the ``F18_FC`` environment +variable to the name of the compiler binary and ensure it is on your ``PATH``. 
+The default value for ``F18_FC`` is ``gfortran``. + +When invoked with no source input, Flang will wait for input on standard in. +When invoked in this way, Flang performs the same actions as if called with +``-fdebug-measure-parse-tree -funparse`` and does not invoke ``F18_FC``. + +For a full list of options that Flang supports, run ``flang --help``. + +Additional Information +====================== + +Flang's documentation is located in the ``flang/docs/`` directory in +the LLVM monorepo. + +If you have any questions or comments about Flang, please feel free to +contact us via the `mailing +list `_. diff --git a/flang/docs/_static/llvm.css b/flang/docs/_static/llvm.css new file mode 100644 index 000000000000..53eeed95c6c0 --- /dev/null +++ b/flang/docs/_static/llvm.css @@ -0,0 +1,112 @@ +/* + * LLVM documentation style sheet + */ + +/* Common styles */ +.body { color: black; background: white; margin: 0 0 0 0 } + +/* No borders on image links */ +a:link img, a:visited img { border-style: none } + +address img { float: right; width: 88px; height: 31px; } +address { clear: right; } + +table { text-align: center; border: 2px solid black; + border-collapse: collapse; margin-top: 1em; margin-left: 1em; + margin-right: 1em; margin-bottom: 1em; } +tr, td { border: 2px solid gray; padding: 4pt 4pt 2pt 2pt; } +th { border: 2px solid gray; font-weight: bold; font-size: 105%; + background: url("lines.gif"); + font-family: "Georgia,Palatino,Times,Roman,SanSerif"; + text-align: center; vertical-align: middle; } +/* + * Documentation + */ +/* Common for title and header */ +.doc_title, .doc_section, .doc_subsection, h1, h2, h3 { + color: black; background: url("lines.gif"); + font-family: "Georgia,Palatino,Times,Roman,SanSerif"; font-weight: bold; + border-width: 1px; + border-style: solid none solid none; + text-align: center; + vertical-align: middle; + padding-left: 8pt; + padding-top: 1px; + padding-bottom: 2px +} + +h1, .doc_title, .title { text-align: left; font-size: 
25pt } + +h2, .doc_section { text-align: center; font-size: 22pt; + margin: 20pt 0pt 5pt 0pt; } + +h3, .doc_subsection { width: 75%; + text-align: left; font-size: 12pt; + padding: 4pt 4pt 4pt 4pt; + margin: 1.5em 0.5em 0.5em 0.5em } + +h4, .doc_subsubsection { margin: 2.0em 0.5em 0.5em 0.5em; + font-weight: bold; font-style: oblique; + border-bottom: 1px solid #999999; font-size: 12pt; + width: 75%; } + +.doc_author { text-align: left; font-weight: bold; padding-left: 20pt } +.doc_text { text-align: left; padding-left: 20pt; padding-right: 10pt } + +.doc_footer { text-align: left; padding: 0 0 0 0 } + +.doc_hilite { color: blue; font-weight: bold; } + +.doc_table { text-align: center; width: 90%; + padding: 1px 1px 1px 1px; border: 1px; } + +.doc_warning { color: red; font-weight: bold } + +/*
would use this class, and
adds more padding */ +.doc_code, .literal-block + { border: solid 1px gray; background: #eeeeee; + margin: 0 1em 0 1em; + padding: 0 1em 0 1em; + display: table; + } + +blockquote pre { + padding: 1em 2em 1em 1em; + border: solid 1px gray; + background: #eeeeee; + margin: 0 1em 0 1em; + display: table; +} + +h2+div, h2+p {text-align: left; padding-left: 20pt; padding-right: 10pt;} +h3+div, h3+p {text-align: left; padding-left: 20pt; padding-right: 10pt;} +h4+div, h4+p {text-align: left; padding-left: 20pt; padding-right: 10pt;} + +/* It is preferable to use
 everywhere instead of the
+ * 
...
construct. + * + * Once all docs use
 for code regions, this style can  be merged with the
+ * one above, and we can drop the [pre] qualifier.
+ */
+pre.doc_code, .literal-block { padding: 1em 2em 1em 1em }
+
+.doc_notes      { background: #fafafa; border: 1px solid #cecece;
+                  display: table; padding: 0 1em 0 .1em }
+
+table.layout    { text-align: left; border: none; border-collapse: collapse;
+                  padding: 4px 4px 4px 4px; }
+tr.layout, td.layout, td.left, td.right
+                { border: none; padding: 4pt 4pt 2pt 2pt; vertical-align: top; }
+td.left         { text-align: left }
+td.right        { text-align: right }
+th.layout       { border: none; font-weight: bold; font-size: 105%;
+                  text-align: center; vertical-align: middle; }
+
+/* Left align table cell */
+.td_left        { border: 2px solid gray; text-align: left; }
+
+/* ReST-specific */
+.title { margin-top: 0 }
+.topic-title{ display: none }
+div.contents ul { list-style-type: decimal }
+.toc-backref    { color: black; text-decoration: none; }
diff --git a/flang/docs/_themes/llvm-theme/layout.html b/flang/docs/_themes/llvm-theme/layout.html
new file mode 100644
index 000000000000..746c2f56c82a
--- /dev/null
+++ b/flang/docs/_themes/llvm-theme/layout.html
@@ -0,0 +1,23 @@
+{#
+    sphinxdoc/layout.html
+    ~~~~~~~~~~~~~~~~~~~~~
+
+    Sphinx layout template for the sphinxdoc theme.
+
+    :copyright: Copyright 2007-2010 by the Sphinx team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+#}
+{% extends "basic/layout.html" %}
+
+{% block relbar1 %}
+
+{{ super() }}
+{% endblock %}
+
+{# put the sidebar before the body #}
+{% block sidebar1 %}{{ sidebar() }}{% endblock %}
+{% block sidebar2 %}{% endblock %}
diff --git a/flang/docs/_themes/llvm-theme/static/contents.png b/flang/docs/_themes/llvm-theme/static/contents.png
new file mode 100644
index 0000000000000000000000000000000000000000..7fb82154a1748d507925865d3fbf7508d62483e5
GIT binary patch
literal 202
zcmeAS@N?(olHy`uVBq!ia0vp^j6kfx!3HGlw@oMq2^0spJ29*~C-V}>;VkfoEM{Qf
z76xHPhFNnYfP(BLp1!W^HyC+E#mt?nx10eANtU=qlsM<-=BDPAFgO>bCYGe8D3oWG
zWGJ|M`UZqI@`(c#nR~i8hHzY8+H1+jpulh_>fir3VfEN66+L tt {
+    background-color: #f2f2f2;
+    border-bottom: 1px solid #ddd;
+    color: #333;
+}
+
+tt.descname, tt.descclassname, tt.xref {
+    border: 0;
+}
+
+hr {
+    border: 1px solid #abc;
+    margin: 2em;
+}
+
+p a tt {
+    border: 0;
+    color: #CA7900;
+}
+
+p a tt:hover {
+    color: #2491CF;
+}
+
+a tt {
+    border: none;
+}
+
+pre {
+    font-family: 'Consolas', 'Deja Vu Sans Mono',
+                 'Bitstream Vera Sans Mono', monospace;
+    font-size: 0.95em;
+    line-height: 120%;
+    padding: 0.5em;
+    border: 1px solid #ccc;
+    background-color: #f8f8f8;
+}
+
+pre a {
+    color: inherit;
+    text-decoration: underline;
+}
+
+td.linenos pre {
+    padding: 0.5em 0;
+}
+
+div.quotebar {
+    background-color: #f8f8f8;
+    max-width: 250px;
+    float: right;
+    padding: 2px 7px;
+    border: 1px solid #ccc;
+}
+
+div.topic {
+    background-color: #f8f8f8;
+}
+
+table {
+    border-collapse: collapse;
+    margin: 0 -0.5em 0 -0.5em;
+}
+
+table td, table th {
+    padding: 0.2em 0.5em 0.2em 0.5em;
+}
+
+div.admonition, div.warning {
+    font-size: 0.9em;
+    margin: 1em 0 1em 0;
+    border: 1px solid #86989B;
+    background-color: #f7f7f7;
+    padding: 0;
+}
+
+div.admonition p, div.warning p {
+    margin: 0.5em 1em 0.5em 1em;
+    padding: 0;
+}
+
+div.admonition pre, div.warning pre {
+    margin: 0.4em 1em 0.4em 1em;
+}
+
+div.admonition p.admonition-title,
+div.warning p.admonition-title {
+    margin: 0;
+    padding: 0.1em 0 0.1em 0.5em;
+    color: white;
+    border-bottom: 1px solid #86989B;
+    font-weight: bold;
+    background-color: #AFC1C4;
+}
+
+div.warning {
+    border: 1px solid #940000;
+}
+
+div.warning p.admonition-title {
+    background-color: #CF0000;
+    border-bottom-color: #940000;
+}
+
+div.admonition ul, div.admonition ol,
+div.warning ul, div.warning ol {
+    margin: 0.1em 0.5em 0.5em 3em;
+    padding: 0;
+}
+
+div.versioninfo {
+    margin: 1em 0 0 0;
+    border: 1px solid #ccc;
+    background-color: #DDEAF0;
+    padding: 8px;
+    line-height: 1.3em;
+    font-size: 0.9em;
+}
+
+.viewcode-back {
+    font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva',
+                 'Verdana', sans-serif;
+}
+
+div.viewcode-block:target {
+    background-color: #f4debf;
+    border-top: 1px solid #ac9;
+    border-bottom: 1px solid #ac9;
+}
diff --git a/flang/docs/_themes/llvm-theme/static/logo.png b/flang/docs/_themes/llvm-theme/static/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..18d424c53c09a76f87bf92ff6276155f9b4a1771
GIT binary patch
literal 9864
zcmd^F1ydePvt5E)AZTz69$a4B-QC?GxVr_n;O_1g9-QFr?(Q1g;Xe8P#ywTDUAtR#
zX1b@Rx2I>r739Q`5pWRz06>Sig9Kks4{%Sw0^VLi(+L0&(f>6FAU*Rd
z03gU)3JWVJSlBt+Ia%1*6H5vU6Wcr5nOXiY1pxPzEM;>QKy4GyHGzs)55OS-jEZ5w)BuzhU@$~R<_`Q!12Dv{
z)p@RHt5FYWz?Mp@=V>A56b;aZ`|bd%u1-%#H6e*ji@|RA$uM1jzQ-dChdF>1W$R`P
z0CM9n!P?%uxleqqo|s^d0e#m0e)0$AgVe}q_kDk|!d?IXww-;a-{}|{aQ(Sq{B7Rz
zzg<9C=pp91JVct+qX;wtxyLK&H}?N*BbxWCWqEmjZ*NJePe|KvMBV4zyhp!F{q4t-
zFVE}4-B#xgc>uF+fH>6SR?pb2OcDNMJp50iwS!pk*Cxb|SAtp6K1rh%4H}GHMKp&P
zk@Tn=<|5K?p?K>4Z!;ezJDYSbkbIkLK&=%Ygjd}6Qe9)ndud8k4d`?%zP6VLqy_?t<_I{<*52sypVNCW>M8~}*q2GG87M>*GG0qSW56&cK)~9iXc3ZN|Czo=wfQrzJ<78o
zTy{5AdjLF?;1C?TA<0S@gi(YBSqvO@TJR%Dw*(YgBsuYL7=|LLbR3%mlTw5Vv8n{Y
zHLv3*&QKk();R7VfI09rq+OgTF`!_orAs;*^OqQ+g~`ZpM`3Z(QCe
z_PoQX3Ja8#SU%xF#;&8Uuq=H)80iOkh3fvJgHJFXts|{Osu;knRjY(-3%@g-taF1x
z3lQysXNPhVA~qxxl+cjTkXw={CB`Rb{jvhj0T208v4@g0p;Y=eMeCQRzOP2iOraVQ
zbYu%?vFHq_USYz09OP)>!pxK{zdlLU6{^gj&4|s|Dv=%2m1VF>;F2W_ub5cZ!*V9l
z5z`I@?`7@v?y>CQ?U7t-BV-u~I21lA?@%d?SV`gRqwcfr!W&TJ&kpb8@g(`o8q8cLd-ss;^+#v3zk1R+T(
zi6RM)hC!vQytsU$98vW~byOv`Oi$UTT1=HidAAIHF1bRrJWJtJ8Bf`~jJq_n+`QCI
zg}=~O^DP`kvsb29Ubs*(w@!bzeSjFz7^lIVj>HTO
zyeyAXm_>?3y+P?F(TpE;!byq4KiwdI>@amMkyVT{V5VbrD`(
zn~2*V+mega>`p!ZY#FCBm1J9r
zsf)Q~xMc>%$Hw!=7t+`=c$pTN4>Jxk7Be=R47A*}=s=~QW>d-fJ*{HR3sAdluokCg
zk%p?4Zi84=R@JwKwyL(0FC{H<333(L3+BDGSD;}~Srbm}{u0b;HOL0!V`y%KZQj&h
z-FuPHk>Jzn6aB^uiyXTAxq?88z}M-wBLf{p3E(m>;
zy3aC8Ib&Ab9NWCbDd;9FGF&=vls<)zJ_B!HeW+tk%K6SgciVI3E-RxUqvyPIwdR8T
z!tx??EoY21hdNu6#c8T{NO((ef0`dLCyCK|v(i^ySU$kK@sNzcm?2KRYO%Jd$gTXr
z@DiKvp3gC7mv5JcV}p4^Q-6%3t7)idRKC|PTR(VH*pKYP3#k3xnNXoiy)?8Y+zO3es>j##n9QWDw2o8)QPwW%g@0IzDEmF
zu!aYQk-ff1l)<6F>LL6w_F)>h(@ZIBZqsYRPMadnBFQ2f8Rks#oHgI2*l765>8*HH
zI9gd-`3LCDG{@#OvNR-rNaJ(T(^RwD{^;C%J--Rv$=X5O^}VN1d~G=y^35;U@&@n>=E
zY#!wOEJbwtEk7yCeznUW%dBT`Wn-ng65BBIa&<5T#mD_}5Gp5;O#qpnm>-zAj#iom
z*T47MMm8}JCczF{H~gxE*8GkD5-L7Y(V|+Ufvq<;RX6)H9zMdET#-#Xem>SPvigf7
z^D^h-9IXd!J04UNBCYjjBk43WW)1Se3b|#la)u)L+VCl*mC7Ka5lmGdNL|2Da*%~{%Or%H6P@4XgslL
zg<^Nvit4;QBl;0}$lcC8p>y5rd|kh~kFMUVo?gSHZ*$^&!tqz-FG1|85MtM=yF5j+y
zD{wBjEW92%&#(Qywsg7XXVvAFdo#Tz{s0pkH2EHP7mFec9h(+A8iE{hn*ARiC+7~o
z2|7bY9AJO2Qu-@1nPpu2YoL
z7AC!0wU_g5w>39{%Ysc5`8{u9KdKLlr@A}ksgjojoP>lAD9F^CS7Vm%tKfW-*jPeI
z763fI0f2uH06cwwCb4z-rrlu{eZ1(r}&(F_Y{EM%y
zu9Z|&kB^VF%;VYEIo8)VctuQp*w_V^FDx&w%+D`qY3URem5PXpB_^l+1~ogmdW?-v
zWM=1y+m@Es4!gK}MMeKAE-mjL7{dPgHRe}BcTZnSTgT6k@QH~jDQTIZ;gPz!`h|tX
zzW#y0prEa-t?JtP+q*|cXBT%*pO24^{e#2gl=Rov*V6LJ)by<3k+F@9&D#3kzgwq#
z{ep7yi@ZyhZS5WX1A}Gd6zYbKI@&u!!Xke+HaU5v+B-V;^bO?Y=bM;X@To-U8yFe<
zNH43buKx`JHMhwrDn}&zwsOvDZfRRtS&fTN{PioYv#ZD2)O2QMHt=U?@UIe^oVlHy
zT@Pm$Pj6opHI4Yhl-Bl6V?8}{3oGxSEPE@f@SnlurDZF7cZo?UJG*-$V-rLD18GUg
z{$Aesg(WHx{aPxj>Ka;ESvj~QwCNey-adW4g2JVRh26dVx|&)sN$Hu%sgsjaCQjd@
zqoaoUdJ+n5UT)se;gQ{)-T2sFaVeGCTiZ>IjjJmwgQKfsqho(M|7>n-rl+K05>Q6Q
zX0*4p2glUNh)YC;h7sc7lM|B&2nfzC?LouBU0(h@IlcPRx3IZ=Ix?}ox4U+xPeK@9pjFRoqw3GW>Lu#X((?0+K>uDf-h>Jn=fj)a0nX&=NudNWbEt7jnDe0W4B}5PP`N
zs_2T6C8SESvt^lvcBHLC)w+wRwO`Y_0LFQF
z?%t`YJhto^1eb&32TbM!;%`V!FpL@FU-K53ZC#)8FA=lq9(2~*;>kdieO0Ntv|i{8
z&R#@Eb%JBfqY0tieFl3dKAT6DO$$Da?~z$6aJLR$AfyE~_KHdDc0{4D2c{73VV0ir
zum?Xe-N6xxHH=Ur>|I-qo9y-0%4zw4bJNlH~zKGRZ12X<@_AV
z*5zGNLbseK1(D+~(@{&%4@f?=WQv*V401VL?Y&o}vJ0z`y(k?{*SjTzo6M+*X=!Pa
zHEBd#aCe2{qNz3;pv4-5+c{VW#ep^HTeu`rtr=@Z3)Xa*H?!My75MBqKhD#U^KO>V
z8TrnqUZ;h3CShW`|x*Lc;834nfOa{|wo2hU;ZZO#bC5q5F0bLz8n--rn@uQ{E~;
z*38{J^g|=jZfz{n**hxS1@d$DnFX@mN1t?ilh*5h_@6^>D)nk6H3Lj?eCx?Ri4N-W
zcmJ2XB4$-2_aZVVCJsjE7>bYg04OQXISuaj8#K!OmoQ@E2Phv}>mo6?+((HMFBk~uz
z^v2n5l(x%BfnTaygt_vf(3Yg~+2GBGLojl7zm=tB-00hefLbQ6n%_tWVLiz_>_!sv
z$ebKD!5o%|xODT4h~LoqFL;dtkbKB8&PU&W#u_)g0yP{WhjclGnR>n;PjiQ>IF)=U
znj|R?9;}LeWRw0q37^fP(`Hl6QF}E(1*xQDt^HZ4EV*UqCM*fk!|?2FF;;e#W?_=^
zfN0rs9I2ncLA0-|xE+_z4`Lr?!@=Wy^NNas2--do(zsG3$)a|SUzo>a>pm$$_M`R5
zva);DjeKKjW}M2>-=zY?L$}X7@>h{P@)B$GbpCeBQKk9Fpr-Jp
z32=6JdlT!+BQO6
zMnFD+YNtp3+{{^+AVW;-AMO8mJFP2dX=w}cX{t#_hIK65^z7j-PAhkMK81Hrt%{=1
zkUcRr&QiCD<*YK=-HK|6H;o7j^~7WcaJJF^u6nwRq1PRb{Li77!HA=vjZn(0LEI5K
z@V@zW_@~xpqcu8s%-E!U(Z-8z+3=sceRQK2p)g=9jJ`oTx2;r+?i@t(g=qE~Uu<(*
zt%kiky$5F9x8>`;(DO5Yv3F#|-`}YW6Y-p>m
zi1(%Eh19j=&UtZBSV3HzIb%JO)67zt2u9|?4
zi&h(^nXHaXjLpm05%|y8(POgg6J?d?CQ;e$E+%rY&5;$u1GVP`Kdrej;_u)Rr!Xd_6H)mm
zj)sO~Xmd|jZ^9!xA5<4SH!tgxy@}UFIrT3Br|JD-W8)9P42>J4_Wm)9E#alGjv6;!E-1X?&xXdO+kTQ6)oe#3i8!5{l-rYjtY&3O*o!yN72W*
z#jnIRSRjEH&&+@16+E1r_7JnqabC&v1IC*TdAoCO3JdkL%IuB$T24m#dhU-PKCiNH
zgT$YtLp1E%I7Tm6qhU1JnFW)!J5E=U!4PR3w8k6&pu)g+m}N8eC2yR>wIvegA$63Z
z`4R@P+f8(^@p#78WW9MOwvE87qy5qB-=IY@CilQ8|Nef~a(j!N+ZaD8l%^_*ORco`
zqMBM*xKh*#TOB?=c&p*96|2r3*o$ZFh`lAlaT)_8&!^4V6}MF5>ePg9#U2C
z_S-e;fuwn`1Y0*VfrCjbjquVA&Q@IOZ1~Q0+F{x~;4a-Sd3}e0AzK5t-LE&D&HBD_
zSy}mI5SEC?`w;>P{`|~j&W#mO%F3rA(o}?cGIlj|;aN-s`Px=%Mm{
zYj(TSvw)?*+m*&7*T@IsF&+nQHJW_#C4>WV&d{a(Ip1kv7+Q=Dr+%~FnPLW@OV_*u
zZUOyutUJlaqJC`$b%n@3SZ2sA?d}%xEc=%z`0tAhzE2sOuP<@1C=43c?^`o$9&DC(
z!z~W*o%8)hq&5BW8=S_6B+Z+NWEv@Kp-bm;)tTFCOylqYyRI)yX_LZG-*y~>cwwL8
zjvzz7mfRx+uicA(C;YdD?mAaWD`;cr|9jmct9iRl`JWcB?*WK72v{~?HN^0-P@X0|
z8!y%9C%(HjJPM#pa~JE>YPQe3@mUQ-SBc+dMH8X?IN@u&Jh-bZ!()#`Ih7;V(+wj=
zj3IoVXNi3sWFUN6+=iAOC;@$zs~oN{6!;-1X=9WarQtO`E9LxA7rLy+d?653_m~O>
zPR4U{!^Rke!P*a3(AVB~p?-8vea-tAN{%;=hqSFp8>~zI`2wS1bz-={cJJGxK>fGV
zG;>0oS|D8WfC!8Ex1p1)7O%E~QX5Dq8OE2VAM)8|<|L~m2cPp+a`UQx!TMcOlcPFJ
z1%JZWHPU12Xu3c=mfjslbKkG`!*yuN7lXIM)n*^}@2(h!_uXp3=q9J@QG&7;g^;cM
z`p|P^dD%zZx1rv=iciAR=-Z^%wWsKK*H$wMLD&IA9M9U(M1S0_&$tKK
zU2;~i!A?U%9x?h+zS4K9+N?(X3MJlh01&nbW5VCLwk%49R|
zGPtDEWDXFIHp_$z%P`3riV^0kZ`8_Rz(*u%#v9>3g$!vxXhj$|#LgM+c?wj0ods~g
zLKbSbawp1m@ApH9+}UflhwIxZFdu0{s#Iz`19798O8Im&({AxZu?M1P$7KuZs`S5L^agtN!>cfQ|w@++HH
zI7sBCCN|jJdH1e&`h$Xmr5_@D$i9L-_!xe?xw(q4>0Uj|>w~^e(Gd
z!>xe<1n7;MrpN(+ZG1<}s-HcZVEr5i>l2_!e0fx{_~+$0C;LBp$EUqI;i!vZ4cV?p
z1RnjAs>2}<8V6BRqaqnPtjPZG@ol_&SlCN_GK3n*iqLfEZ;c}lB`elzbv5$$v!m~S
z`^EM5_+yDpYjC3nA;S06#AU&T7RWb(t)_qkhKeqZ0IUf}g0a92r;FE;9G{y$FUrYI
zmfB;9%Mm?UUtzTDipa6;A+Uifg>)1%KJT+rihNN8_nJTMv1&ErNem*U-=@ibeXDP8
z94FP5U`gVrSFU2B%I3Ad?!u!@20Jnx2~KoP(4z{6b2*d?pdo`877B>KU{$m&C|_de
z%H}EPOjnUFwRPKhKVN;9_qjc=i*2tp7FQzcsNi79WrUyH=eTov)lec;Jiwzg5YF=P
zuzq$dD5Tjun3(wO8J4xmr}^?VkeKNs2{WE<_o`@FGWevp>GRo=vqj-f4HZWqvGmD3
zZqww~0^K+aq}HKNrD())%_>;&>uRTGN3355MhqkSg&I6XH|sBsQJTE3_6He?F1#b<
ztO~IukZgh_5vs>`it11LL^gjq3yN!Pq`$pbOZAC3mj2-fGtH>Kmped8URxF0&N=;<8>i^ULaq3R#jMTOnoRc
zy`9%g_&qOkY$hsd`;3eR9?^QSK;l~+mKBO5c96^cTGi2aqJBIWw{gHUs?(roix|_*
z);DLx4WQE72@xREgishX<}myUSheEHZ%wg$Z;#0gAce?MEi+E5
ztI`#hxHo(~M0wFfOm&Lq1&YE!c&HXK%lxe(^zLTaVV8(!ey#U>vX
z8aI?1Fs0N)o-I16Xx>{(E_$>P)pI0*kA8P`g;AztAqx3rGLbc
zHh&7^q&(|QXd22Ytvxl);mQd#1l(eT@U_&%;YkLh-V7z_H(8p2)fG#NrK)_WUiIcZ}s18y9BW_(+nMw_y
z0*!Z{oB@DAk%9=IVKaZ0jgOYs25)jYJios>&CpTrushH}kuYo`vyJQ_HAYGOSEsX@
znp=Ew`*B!XgAOZpaJNpQ603zMwBT~mcE%w{t5M5JRaH?@QA%p0TbB{3myw4{rY-pX
z2kS-S<$zG+3L&W1#2KFz)WR;OCo5eO*O=_y?Z>C#
zea9A^i^IWf`&-WCI4-uFjq-V=ic%XML8#9FFw?qeosTm6Ly7|@dO)7wXwSWZK=%kv
zFCZzTI?Ns3Gw@plM6FQ)3kec{p??x+`z@g|6>iCcRA`zS_%&oZs7v?UqF!Q2kziS4
zKQuaG(NK5eA{)2-#_7pBsCN6D`FdxO5
zaY|_z^lLW$1S^$m;_R#Q4~0XV^?k3ksZFo_d2Nb>xIMNs{Osjq{Zx~nMK)10^M}K|
zf4KcucR6PMgep9_
zvj)ub5l@8bx3$xT=$!mNlCuml3Kw(XYkQrZy!rI27LMnv_;Mx>9j_2XmFOBr4p#_%
z(6~AakhNH52?bb!1Gm)|1WBw67PZyuBz9-qZ%Jnd2aFXx=&k9b;n*JY&vAyzQrDHc
z=d{0WahX@I5&x-xz(Y9cg26*rqd?Avk*uy-*S(&rOxJ@0W0wa2rL_uFQB8+=HqUOj
z-sR9slR0H3=H-x)F$e7yMIcw!JoQ`g;hwVPL2Y3M4eD?<3x&r*@?fEW!QCtOcA-28
zDJd(eu@n_dwtzMvSTy&LjG$mZ<2yK_!jH3s2RS%lMT|ccFe8Yav)?M!V**2@^{Ux%
zQ_lIZ7pYENg3g{Pg&YgHmWV?PbU!R491}xAdU{>~
zzq>SUQPm8T%UkMFo?gbtFhL~XLguUiW8c8N6#1BLMfY2T+!Zxt9GL0HEt({Q7Wl7@
z!iwA<>Zli;;N2{<;^ROrF;|wJg2L4NKd34w2$XPKl;c7@eCHanvoU>6Oi36S!Nm&-
z3_Q9=!@!txN3Di^V0v~e#|mZu^N!#$1v}e2Q#R*JLs+Jl&x4rL_^3Jy5{dBpgI@4_
zavLqurRG8Zo`SIt>RDrZhG?Fg?ndP(D*gA$N?Thy4aCg*O!n>1=*NWD`Ps=re*Uo~
zxTZF5)u?4NUp=h|{Yae)33N?$X;&Z@<{uyDCk8RfC<$y)2KjC#{;Pz4;JlS6zS{xq
YG?i=yuH1&;qdq`VR8FK?$RP0l0Cb!GPXGV_

literal 0
HcmV?d00001

diff --git a/flang/docs/_themes/llvm-theme/static/navigation.png b/flang/docs/_themes/llvm-theme/static/navigation.png
new file mode 100644
index 0000000000000000000000000000000000000000..1081dc1439fb984dfa7ef627afe3c7dc476fdbce
GIT binary patch
literal 218
zcmeAS@N?(olHy`uVBq!ia0vp^j6iI|!3HFkf4uMuBv2gW?!>U}oXkrghqJ&VvY3H^
zTNs2H8D`Cq01C2~c>21s-(chw7$R|bZ|_0D0|q>YSbqDzW^|HYIk%*-&O)* v documentation".
+html_title = 'The Flang Compiler'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+html_context = {
+    'css_files': [
+        '_static/llvm.css'
+        ],
+    }
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a  tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Flangdoc'
+
+# If true, the reST sources are included in the HTML build as
+# _sources/name. The default is True.
+html_copy_source = False
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('ReleaseNotes', 'Flang.tex', u'Flang Documentation',
+   u'The Flang Team', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = []
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  ('ReleaseNotes', 'Flang', u'Flang Documentation',
+   u'The Flang Team', 'Flang', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'

From 07de36dbc817b67609e60e1e645d2faf3198da6e Mon Sep 17 00:00:00 2001
From: Hans Wennborg 
Date: Thu, 20 Aug 2020 16:44:14 +0200
Subject: [PATCH 153/363] lld docs config: Use a list key in html_sidebars

Otherwise the docs-lld-html target fails to build using recent Sphinx
with the following not very helpful error message:

An error happened in rendering the page index.
Reason: TemplateNotFound()

It turns out the values in the html_sidebars dictionary always need to be lists
now. See https://github.com/sphinx-doc/sphinx/issues/6186
---
 lld/docs/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/docs/conf.py b/lld/docs/conf.py
index 7d4fc0c5ad75..ee93c01f7f32 100644
--- a/lld/docs/conf.py
+++ b/lld/docs/conf.py
@@ -134,7 +134,7 @@
 #html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-html_sidebars = {'index': 'indexsidebar.html'}
+html_sidebars = {'index': ['indexsidebar.html']}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.

From c7c68c7965190393ffa594d0c8bec79c4ca7dbfb Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli 
Date: Wed, 19 Aug 2020 15:27:47 +0000
Subject: [PATCH 154/363] [release][docs] Note on lazy binding and SVE.

---
 llvm/docs/ReleaseNotes.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 612a5417df95..116898aeb75a 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -172,6 +172,11 @@ During this release ...
        }
      }
 
+  Please note that support for lazy binding of SVE function calls is
+  incomplete. When you interface user code with SVE functions that are
+  provided through shared libraries, avoid using lazy binding. If you
+  use lazy binding, the results could be corrupted.
+
 Changes to the MIPS Target
 --------------------------
 

From 414f32a9e862b11f51063b75729278f8d81b12e9 Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli 
Date: Thu, 20 Aug 2020 16:24:59 +0000
Subject: [PATCH 155/363] [release][docs] Move SVE release notes to AArch64
 section.

---
 llvm/docs/ReleaseNotes.rst | 40 +++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 116898aeb75a..aea1550960e8 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -103,26 +103,6 @@ Changes to the AArch64 Backend
 * Clearly error out on unsupported relocations when targeting COFF, instead
   of silently accepting some (without being able to do what was requested).
 
-Changes to the ARM Backend
---------------------------
-
-During this release ...
-
-* Implemented C-language intrinsics for the full Arm v8.1-M MVE instruction
-  set. ``<arm_mve.h>`` now supports the complete API defined in the Arm C
-  Language Extensions.
-
-* Added support for assembly for the optional Custom Datapath Extension (CDE)
-  for Arm M-profile targets.
-
-* Implemented C-language intrinsics ``<arm_cde.h>`` for the CDE instruction set.
-
-* Clang now defaults to ``-fomit-frame-pointer`` when targeting non-Android
-  Linux for arm and thumb when optimizations are enabled. Users that were
-  previously not specifying a value and relying on the implicit compiler
-  default may wish to specify ``-fno-omit-frame-pointer`` to get the old
-  behavior. This improves compatibility with GCC.
-
 * Clang adds support for the following macros that enable the
   C-intrinsics from the `Arm C language extensions for SVE
   <https://developer.arm.com/documentation/100987/latest>`_ (version
@@ -177,6 +157,26 @@ During this release ...
   provided through shared libraries, avoid using lazy binding. If you
   use lazy binding, the results could be corrupted.
 
+Changes to the ARM Backend
+--------------------------
+
+During this release ...
+
+* Implemented C-language intrinsics for the full Arm v8.1-M MVE instruction
+  set. ``<arm_mve.h>`` now supports the complete API defined in the Arm C
+  Language Extensions.
+
+* Added support for assembly for the optional Custom Datapath Extension (CDE)
+  for Arm M-profile targets.
+
+* Implemented C-language intrinsics ``<arm_cde.h>`` for the CDE instruction set.
+
+* Clang now defaults to ``-fomit-frame-pointer`` when targeting non-Android
+  Linux for arm and thumb when optimizations are enabled. Users that were
+  previously not specifying a value and relying on the implicit compiler
+  default may wish to specify ``-fno-omit-frame-pointer`` to get the old
+  behavior. This improves compatibility with GCC.
+
 Changes to the MIPS Target
 --------------------------
 

From 1708358fbbf58d4a870be015ef21eeea9adae87b Mon Sep 17 00:00:00 2001
From: Josh Stone 
Date: Mon, 17 Aug 2020 15:31:32 -0700
Subject: [PATCH 156/363] lld: link libatomic if needed for Timer

D80298 made Timer::total atomic, but this requires linking libatomic
on some targets.

Reviewed By: aaronpuchert

Differential Revision: https://reviews.llvm.org/D85691

(cherry picked from commit b26b32b5d3b85812a12f5e3bf011428612f78e19)
---
 lld/CMakeLists.txt        | 1 +
 lld/Common/CMakeLists.txt | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt
index 5090c935e75a..040bb2c8f6d7 100644
--- a/lld/CMakeLists.txt
+++ b/lld/CMakeLists.txt
@@ -54,6 +54,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   include(AddLLVM)
   include(TableGen)
   include(HandleLLVMOptions)
+  include(CheckAtomic)
 
   if(LLVM_INCLUDE_TESTS)
     if(CMAKE_VERSION VERSION_LESS 3.12)
diff --git a/lld/Common/CMakeLists.txt b/lld/Common/CMakeLists.txt
index 53649032bd98..41eb58c15198 100644
--- a/lld/Common/CMakeLists.txt
+++ b/lld/Common/CMakeLists.txt
@@ -2,6 +2,12 @@ if(NOT LLD_BUILT_STANDALONE)
   set(tablegen_deps intrinsics_gen)
 endif()
 
+set(LLD_SYSTEM_LIBS ${LLVM_PTHREAD_LIB})
+
+if(NOT HAVE_CXX_ATOMICS64_WITHOUT_LIB)
+  list(APPEND LLD_SYSTEM_LIBS atomic)
+endif()
+
 find_first_existing_vc_file("${LLVM_MAIN_SRC_DIR}" llvm_vc)
 find_first_existing_vc_file("${LLD_SOURCE_DIR}" lld_vc)
 
@@ -54,7 +60,7 @@ add_lld_library(lldCommon
   Target
 
   LINK_LIBS
-  ${LLVM_PTHREAD_LIB}
+  ${LLD_SYSTEM_LIBS}
 
   DEPENDS
   ${tablegen_deps}

From 0c37a9165611880b99b1f9632179864ecb3f2e13 Mon Sep 17 00:00:00 2001
From: Bas Zalmstra 
Date: Sat, 22 Aug 2020 23:04:22 +0300
Subject: [PATCH 157/363] [LLD][COFF] Reset outputSections for successive runs

The global variable outputSections in the COFF writer was not
cleared between runs which caused successive calls to lld::coff::link
to generate invalid binaries. These binaries when loaded would result
in "invalid win32 applications" and/or "bad image" errors.

Differential Revision: https://reviews.llvm.org/D86401

(cherry picked from commit 54f5a4ea4c859cf7f34f0d4955abc3a2f44bd0dc)
---
 lld/COFF/Writer.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 082de5b8c1d6..0188f0971a75 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -599,6 +599,9 @@ void Writer::finalizeAddresses() {
 void Writer::run() {
   ScopedTimer t1(codeLayoutTimer);
 
+  // First, clear the output sections from previous runs
+  outputSections.clear();
+
   createImportTables();
   createSections();
   createMiscChunks();

From 82e48a579024d0ffbc352702ec0c52b47a6fe691 Mon Sep 17 00:00:00 2001
From: "Mott, Jeffrey T" 
Date: Fri, 17 Jul 2020 09:50:08 -0700
Subject: [PATCH 158/363] Disable use of _ExtInt with '__atomic' builtins

We're (temporarily) disabling ExtInt for the '__atomic' builtins so we can better design their behavior later. The idea is until we do an audit/design for the way atomic builtins are supposed to work with _ExtInt, we should leave them restricted so they don't limit our future options, such as by binding us to a sub-optimal implementation via ABI.

Example after this change:

    $ cat test.c

        void f(_ExtInt(64) *ptr) {
          __atomic_fetch_add(ptr, 1, 0);
        }

    $ clang -c test.c

        test.c:2:22: error: argument to atomic builtin of type '_ExtInt' is not supported
          __atomic_fetch_add(ptr, 1, 0);
                             ^
        1 error generated.

Differential Revision: https://reviews.llvm.org/D84049

(cherry picked from commit ca77ab494aa29f7521ff797d230cd1b36cbe4e62)
---
 clang/include/clang/Basic/DiagnosticSemaKinds.td |  7 ++++---
 clang/lib/Sema/SemaChecking.cpp                  |  5 +++++
 clang/lib/Sema/SemaType.cpp                      |  5 +----
 clang/test/Sema/builtins.c                       |  4 ++++
 clang/test/SemaCXX/ext-int.cpp                   |  5 +++--
 libcxx/test/libcxx/atomics/ext-int.verify.cpp    | 11 +++++++++++
 6 files changed, 28 insertions(+), 9 deletions(-)
 create mode 100644 libcxx/test/libcxx/atomics/ext-int.verify.cpp

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index aa4de2812312..941f2cafc372 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6021,9 +6021,8 @@ def err_func_def_incomplete_result : Error<
 def err_atomic_specifier_bad_type
     : Error<"_Atomic cannot be applied to "
             "%select{incomplete |array |function |reference |atomic |qualified "
-            "|sizeless ||integer |integer }0type "
-            "%1 %select{|||||||which is not trivially copyable|with less than "
-            "1 byte of precision|with a non power of 2 precision}0">;
+            "|sizeless ||integer }0type "
+            "%1 %select{|||||||which is not trivially copyable|}0">;
 
 // Expressions.
 def ext_sizeof_alignof_function_type : Extension<
@@ -7941,6 +7940,8 @@ def err_atomic_exclusive_builtin_pointer_size : Error<
   " 1,2,4 or 8 byte type (%0 invalid)">;
 def err_atomic_builtin_ext_int_size : Error<
   "Atomic memory operand must have a power-of-two size">;
+def err_atomic_builtin_ext_int_prohibit : Error<
+  "argument to atomic builtin of type '_ExtInt' is not supported">;
 def err_atomic_op_needs_atomic : Error<
   "address argument to atomic operation must be a pointer to _Atomic "
   "type (%0 invalid)">;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 509d88e25000..b00d2ff5f1d5 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -4956,6 +4956,11 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange,
                 ? 0
                 : 1);
 
+  if (ValType->isExtIntType()) {
+    Diag(Ptr->getExprLoc(), diag::err_atomic_builtin_ext_int_prohibit);
+    return ExprError();
+  }
+
   return AE;
 }
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index b8f7f1a58159..cc151a048b98 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -8880,11 +8880,8 @@ QualType Sema::BuildAtomicType(QualType T, SourceLocation Loc) {
     else if (!T.isTriviallyCopyableType(Context))
       // Some other non-trivially-copyable type (probably a C++ class)
       DisallowedKind = 7;
-    else if (auto *ExtTy = T->getAs<ExtIntType>()) {
-      if (ExtTy->getNumBits() < 8)
+    else if (T->isExtIntType()) {
         DisallowedKind = 8;
-      else if (!llvm::isPowerOf2_32(ExtTy->getNumBits()))
-        DisallowedKind = 9;
     }
 
     if (DisallowedKind != -1) {
diff --git a/clang/test/Sema/builtins.c b/clang/test/Sema/builtins.c
index 90c033e47cd1..3e0a9cbfdeaf 100644
--- a/clang/test/Sema/builtins.c
+++ b/clang/test/Sema/builtins.c
@@ -285,12 +285,16 @@ void test_ei_i42i(_ExtInt(42) *ptr, int value) {
   __sync_fetch_and_add(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
   // expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}}
   __sync_nand_and_fetch(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+
+  __atomic_fetch_add(ptr, 1, 0); // expected-error {{argument to atomic builtin of type '_ExtInt' is not supported}}
 }
 
 void test_ei_i64i(_ExtInt(64) *ptr, int value) {
   __sync_fetch_and_add(ptr, value); // expect success
   // expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}}
   __sync_nand_and_fetch(ptr, value); // expect success
+
+  __atomic_fetch_add(ptr, 1, 0); // expected-error {{argument to atomic builtin of type '_ExtInt' is not supported}}
 }
 
 void test_ei_ii42(int *ptr, _ExtInt(42) value) {
diff --git a/clang/test/SemaCXX/ext-int.cpp b/clang/test/SemaCXX/ext-int.cpp
index 14f11a6bb961..f4c2dc4752ee 100644
--- a/clang/test/SemaCXX/ext-int.cpp
+++ b/clang/test/SemaCXX/ext-int.cpp
@@ -91,10 +91,11 @@ typedef _ExtInt(32) __attribute__((vector_size(16))) VecTy;
 _Complex _ExtInt(3) Cmplx;
 
 // Reject cases of _Atomic:
-// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(4)' with less than 1 byte of precision}}
+// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(4)'}}
 _Atomic _ExtInt(4) TooSmallAtomic;
-// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(9)' with a non power of 2 precision}}
+// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(9)'}}
 _Atomic _ExtInt(9) NotPow2Atomic;
+// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(128)'}}
 _Atomic _ExtInt(128) JustRightAtomic;
 
 // Test result types of Unary/Bitwise/Binary Operations:
diff --git a/libcxx/test/libcxx/atomics/ext-int.verify.cpp b/libcxx/test/libcxx/atomics/ext-int.verify.cpp
new file mode 100644
index 000000000000..3f57437f43cc
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/ext-int.verify.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: clang-11
+
+#include <atomic>
+
+int main(int, char**)
+{
+  // expected-error@atomic:*1 {{_Atomic cannot be applied to integer type '_ExtInt(32)'}}
+  std::atomic<_ExtInt(32)> x {42};
+
+  return 0;
+}

From dcdf2aff02a154f9f862d6f01feff3de1389cc47 Mon Sep 17 00:00:00 2001
From: Kang Zhang 
Date: Fri, 21 Aug 2020 01:10:52 +0000
Subject: [PATCH 159/363] [PowerPC] Fix a typo for InstAlias of mfsprg

D77531 has a typo: it emits mfsprg where it should be mtsprg. This patch
fixes the typo.

(cherry picked from commit 95e18b2d9d5f93c209ea81df79c2e18ef77de506)
---
 llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 6956c40a70be..de42d354a048 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1026,8 +1026,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR8 g8rc:$Rx, 29)>;
 foreach SPRG = 0-3 in {
   def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR8 g8rc:$RT, !add(SPRG, 272))>;
   def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR8 g8rc:$RT, !add(SPRG, 272))>;
-  def : InstAlias<"mfsprg "#SPRG#", $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>;
-  def : InstAlias<"mfsprg"#SPRG#" $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>;
+  def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>;
+  def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>;
 }
 
 def : InstAlias<"mfasr $RT", (MFSPR8 g8rc:$RT, 280)>;

From d6d03d09e3f7498f60e2976b8cea235080f55fe7 Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Wed, 19 Aug 2020 22:48:28 -0700
Subject: [PATCH 160/363] [ELF][test] Fix some llvm-objdump RUN lines which
 don't actually test anything

(cherry picked from commit ac46bc35e98d922f1b05b451341f03dcaccd1527)
---
 lld/test/ELF/arm-ldrlit.s                |  1 -
 lld/test/ELF/arm-thumb-interwork-ifunc.s | 20 ++++++++++----------
 lld/test/ELF/arm-thumb2-adr.s            |  1 -
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/lld/test/ELF/arm-ldrlit.s b/lld/test/ELF/arm-ldrlit.s
index a466b0a8e1d1..b6ee2d8e9da0 100644
--- a/lld/test/ELF/arm-ldrlit.s
+++ b/lld/test/ELF/arm-ldrlit.s
@@ -1,6 +1,5 @@
 // REQUIRES: arm
 // RUN: llvm-mc --triple=armv7a-none-eabi --arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: llvm-objdump -d -r --triple=armv7a-none-eabi %t.o
 // RUN: echo "SECTIONS { \
 // RUN:                 .rodata.low 0x8012  : { *(.rodata.low) } \
 // RUN:                 .text.low   0x8f00  : { *(.text.low) } \
diff --git a/lld/test/ELF/arm-thumb-interwork-ifunc.s b/lld/test/ELF/arm-thumb-interwork-ifunc.s
index 319737a08aad..947bc2dd7786 100644
--- a/lld/test/ELF/arm-thumb-interwork-ifunc.s
+++ b/lld/test/ELF/arm-thumb-interwork-ifunc.s
@@ -1,7 +1,7 @@
 // REQUIRES: arm
 // RUN: llvm-mc --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
 // RUN: ld.lld %t.o -o %t
-// RUN: llvm-objdump --triple=armv7a-none-linux-gnueabi -d --no-show-raw-insn %t
+// RUN: llvm-objdump --triple=armv7a-none-linux-gnueabi -d --no-show-raw-insn %t | FileCheck %s
 
 /// Non-preemptible ifuncs are called via a PLT entry which is always Arm
 /// state, expect the ARM callers to go direct to the PLT entry, Thumb
@@ -30,23 +30,23 @@ thumb_caller:
  b.w foo
  bl foo
 
-// CHECK: 00012004 _start:
-// CHECK-NEXT: b       #36
-// CHECK-NEXT: bl      #32
+// CHECK:      00021004 <_start>:
+// CHECK-NEXT: b       #36 <$a>
+// CHECK-NEXT: bl      #32 <$a>
 
-// CHECK: 0001200c thumb_caller:
+// CHECK:      0002100c :
 // CHECK-NEXT: b.w     #8
 // CHECK-NEXT: b.w     #4
 // CHECK-NEXT: blx     #24
 
-// CHECK: 00012018 __Thumbv7ABSLongThunk_foo:
-// CHECK-NEXT: movw    r12, #8240
-// CHECK-NEXT: movt    r12, #1
+// CHECK:      00021018 <__Thumbv7ABSLongThunk_foo>:
+// CHECK-NEXT: movw    r12, #4144
+// CHECK-NEXT: movt    r12, #2
 // CHECK-NEXT: bx      r12
 
 // CHECK: Disassembly of section .iplt:
 
-// CHECK: 00012030 $a:
+// CHECK:      00021030 <$a>:
 // CHECK-NEXT: add     r12, pc, #0, #12
-// CHECK-NEXT: add     r12, r12, #4096
+// CHECK-NEXT: add     r12, r12, #16, #20
 // CHECK-NEXT: ldr     pc, [r12, #8]!
diff --git a/lld/test/ELF/arm-thumb2-adr.s b/lld/test/ELF/arm-thumb2-adr.s
index a6895bc878b6..c0c7cfcc3fd2 100644
--- a/lld/test/ELF/arm-thumb2-adr.s
+++ b/lld/test/ELF/arm-thumb2-adr.s
@@ -10,7 +10,6 @@
 // RUN:               } " > %t.script
 // RUN: ld.lld --script %t.script %t.o -o %t
 // RUN: llvm-readobj --symbols %t | FileCheck %s --check-prefix=SYMS
-// RUN: llvm-objdump -d --triple=thumbv7m-none-eabi %t
 // RUN: llvm-objdump -d --no-show-raw-insn --triple=thumbv7m-none-eabi %t | FileCheck %s
 
 /// Test the various legal cases for the R_ARM_THM_ALU_PREL_11_0 relocation

From c4e216711d001316df2313a85c8af73a85d804c0 Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Mon, 24 Aug 2020 12:17:48 -0700
Subject: [PATCH 161/363] [ELF] Keep st_type for symbol assignment

PR46970: for `alias = aliasee`, the alias can be used in relocation processing
and on ARM st_type does affect Thumb interworking. It is thus desirable for the
alias to get the same st_type.

Note that the st_size field should not be inherited because some tools use
st_size=0 as a heuristic to detect aliases. Retaining st_size can thwart such
heuristics and cause aliases to be preferred over the original symbols.

Differential Revision: https://reviews.llvm.org/D86263

(cherry picked from commit 9670029b6b302c75bb373fb1814f4e02790c4da8)
The test symbol-assign-type.s was rewritten to not depend on 'split-file'.
---
 lld/ELF/LinkerScript.cpp                      | 13 +++++--
 lld/ELF/LinkerScript.h                        |  4 ++
 lld/docs/ELF/linker_script.rst                | 19 +++++++++
 lld/test/ELF/arm-thumb-interwork-ifunc.s      | 11 ++++++
 lld/test/ELF/linkerscript/common-assign.s     |  4 +-
 .../ELF/linkerscript/symbol-assign-type.s     | 39 +++++++++++++++++++
 6 files changed, 85 insertions(+), 5 deletions(-)
 create mode 100644 lld/test/ELF/linkerscript/symbol-assign-type.s

diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp
index 72e2ebff9b8c..6de2cd65b973 100644
--- a/lld/ELF/LinkerScript.cpp
+++ b/lld/ELF/LinkerScript.cpp
@@ -180,7 +180,7 @@ void LinkerScript::addSymbol(SymbolAssignment *cmd) {
   // write expressions like this: `alignment = 16; . = ALIGN(., alignment)`.
   uint64_t symValue = value.sec ? 0 : value.getValue();
 
-  Defined newSym(nullptr, cmd->name, STB_GLOBAL, visibility, STT_NOTYPE,
+  Defined newSym(nullptr, cmd->name, STB_GLOBAL, visibility, value.type,
                  symValue, 0, sec);
 
   Symbol *sym = symtab->insert(cmd->name);
@@ -317,6 +317,7 @@ void LinkerScript::assignSymbol(SymbolAssignment *cmd, bool inSec) {
     cmd->sym->section = v.sec;
     cmd->sym->value = v.getSectionOffset();
   }
+  cmd->sym->type = v.type;
 }
 
 static std::string getFilename(InputFile *file) {
@@ -1215,8 +1216,14 @@ ExprValue LinkerScript::getSymbolValue(StringRef name, const Twine &loc) {
   }
 
   if (Symbol *sym = symtab->find(name)) {
-    if (auto *ds = dyn_cast<Defined>(sym))
-      return {ds->section, false, ds->value, loc};
+    if (auto *ds = dyn_cast<Defined>(sym)) {
+      ExprValue v{ds->section, false, ds->value, loc};
+      // Retain the original st_type, so that the alias will get the same
+      // behavior in relocation processing. Any operation will reset st_type to
+      // STT_NOTYPE.
+      v.type = ds->type;
+      return v;
+    }
     if (isa<SharedSymbol>(sym))
       if (!errorOnMissingSection)
         return {nullptr, false, 0, loc};
diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h
index ec4fc22db486..4a1a5fd71b67 100644
--- a/lld/ELF/LinkerScript.h
+++ b/lld/ELF/LinkerScript.h
@@ -59,6 +59,10 @@ struct ExprValue {
   uint64_t val;
   uint64_t alignment = 1;
 
+  // The original st_type if the expression represents a symbol. Any operation
+  // resets type to STT_NOTYPE.
+  uint8_t type = llvm::ELF::STT_NOTYPE;
+
   // Original source location. Used for error messages.
   std::string loc;
 };
diff --git a/lld/docs/ELF/linker_script.rst b/lld/docs/ELF/linker_script.rst
index 0f409b2020ac..debddbf511b6 100644
--- a/lld/docs/ELF/linker_script.rst
+++ b/lld/docs/ELF/linker_script.rst
@@ -17,6 +17,25 @@ possible. We reserve the right to make different implementation choices where
 it is appropriate for LLD. Intentional deviations will be documented in this
 file.
 
+Symbol assignment
+~~~~~~~~~~~~~~~~~
+
+A symbol assignment looks like:
+
+::
+
+  symbol = expression;
+  symbol += expression;
+
+The first form defines ``symbol``. If ``symbol`` is already defined, it will be
+overridden. The other form requires ``symbol`` to be already defined.
+
+For a simple assignment like ``alias = aliasee;``, the ``st_type`` field is
+copied from the original symbol. Any arithmetic operation (e.g. ``+ 0``) will
+reset ``st_type`` to ``STT_NOTYPE``.
+
+The ``st_size`` field is set to 0.
+
 Output section description
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lld/test/ELF/arm-thumb-interwork-ifunc.s b/lld/test/ELF/arm-thumb-interwork-ifunc.s
index 947bc2dd7786..f77439c6c50b 100644
--- a/lld/test/ELF/arm-thumb-interwork-ifunc.s
+++ b/lld/test/ELF/arm-thumb-interwork-ifunc.s
@@ -3,6 +3,11 @@
 // RUN: ld.lld %t.o -o %t
 // RUN: llvm-objdump --triple=armv7a-none-linux-gnueabi -d --no-show-raw-insn %t | FileCheck %s
 
+/// A symbol assignment defined alias inherits st_type and gets the same treatment.
+// RUN: llvm-mc --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj --defsym ALIAS=1 -o %t1.o %s
+// RUN: ld.lld --defsym foo=foo1 %t1.o -o %t1
+// RUN: llvm-objdump --triple=armv7a-none-linux-gnueabi -d --no-show-raw-insn %t1 | FileCheck %s
+
 /// Non-preemptible ifuncs are called via a PLT entry which is always Arm
 /// state, expect the ARM callers to go direct to the PLT entry, Thumb
 /// branches are indirected via state change thunks, the bl is changed to blx.
@@ -10,9 +15,15 @@
  .syntax unified
  .text
  .balign 0x1000
+.ifdef ALIAS
+ .type foo1 STT_GNU_IFUNC
+ .globl foo1
+foo1:
+.else
  .type foo STT_GNU_IFUNC
  .globl foo
 foo:
+.endif
  bx lr
 
  .section .text.1, "ax", %progbits
diff --git a/lld/test/ELF/linkerscript/common-assign.s b/lld/test/ELF/linkerscript/common-assign.s
index ef0ad14ce92d..f0d783886e4d 100644
--- a/lld/test/ELF/linkerscript/common-assign.s
+++ b/lld/test/ELF/linkerscript/common-assign.s
@@ -27,7 +27,7 @@
 # CHECK-NEXT:     Value: [[FOO]]
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Global
-# CHECK-NEXT:     Type: None
+# CHECK-NEXT:     Type: Object
 # CHECK-NEXT:     Other: 0
 # CHECK-NEXT:     Section: .bss
 # CHECK-NEXT:   }
@@ -36,7 +36,7 @@
 # CHECK-NEXT:     Value: [[BAR]]
 # CHECK-NEXT:     Size: 0
 # CHECK-NEXT:     Binding: Global
-# CHECK-NEXT:     Type: None
+# CHECK-NEXT:     Type: Object
 # CHECK-NEXT:     Other: 0
 # CHECK-NEXT:     Section: .bss
 # CHECK-NEXT:   }
diff --git a/lld/test/ELF/linkerscript/symbol-assign-type.s b/lld/test/ELF/linkerscript/symbol-assign-type.s
new file mode 100644
index 000000000000..c3db8ce8c8ae
--- /dev/null
+++ b/lld/test/ELF/linkerscript/symbol-assign-type.s
@@ -0,0 +1,39 @@
+# REQUIRES: x86
+## Keep st_type for simple assignment (`alias = aliasee`). This property is
+## desired on some targets, where symbol types can affect relocation processing
+## (e.g. Thumb interworking). However, the st_size field should not be retained
+## because some tools use st_size=0 as a heuristic to detect aliases. With any
+## operation, it can be argued that the new symbol may not be of the same type,
+## so reset st_type to STT_NOTYPE.
+
+## NOTE: GNU ld retains st_type for many operations.
+
+# RUN: echo 'retain1 = _start; \
+# RUN:   retain2 = 1 ? _start : 0; \
+# RUN:   drop1 = _start + 0; \
+# RUN:   drop2 = 0 ? _start : 1; \
+# RUN:   drop3 = -_start; \
+# RUN: ' > %t.lds
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld -T %t.lds %t.o -o %t1
+# RUN: llvm-readelf -s %t1 | FileCheck %s
+
+# CHECK:      Size Type   Bind   Vis     Ndx Name
+# CHECK:         1 FUNC   GLOBAL DEFAULT   1 _start
+# CHECK:         0 FUNC   GLOBAL DEFAULT   1 retain1
+# CHECK-NEXT:    0 FUNC   GLOBAL DEFAULT   1 retain2
+# CHECK-NEXT:    0 NOTYPE GLOBAL DEFAULT   1 drop1
+# CHECK-NEXT:    0 NOTYPE GLOBAL DEFAULT ABS drop2
+# CHECK-NEXT:    0 NOTYPE GLOBAL DEFAULT ABS drop3
+
+# RUN: ld.lld --defsym 'retain=_start' --defsym 'drop=_start+0' %t.o -o %t2
+# RUN: llvm-readelf -s %t2 | FileCheck %s --check-prefix=DEFSYM
+
+# DEFSYM:        0 FUNC   GLOBAL DEFAULT   1 retain
+# DEFSYM-NEXT:   0 NOTYPE GLOBAL DEFAULT   1 drop
+
+.globl _start
+.type _start, @function
+_start:
+  ret
+.size _start, 1

From 6406b6fa5ac8192b0861c343509d98368b555d12 Mon Sep 17 00:00:00 2001
From: Dimitry Andric 
Date: Fri, 21 Aug 2020 21:03:00 +0200
Subject: [PATCH 162/363] Instantiate Error in Target::GetEntryPointAddress()
 only when necessary

When `Target::GetEntryPointAddress()` calls `exe_module->GetObjectFile()->GetEntryPointAddress()`, and the returned
`entry_addr` is valid, it can immediately be returned.

However, just before that, an `llvm::Error` value has been setup, but in this case it is not consumed before returning, like is done further below in the function.

In https://bugs.freebsd.org/248745 we got a bug report for this, where a very simple test case aborts and dumps core:

```
* thread #1, name = 'testcase', stop reason = breakpoint 1.1
    frame #0: 0x00000000002018d4 testcase`main(argc=1, argv=0x00007fffffffea18) at testcase.c:3:5
   1	int main(int argc, char *argv[])
   2	{
-> 3	    return 0;
   4	}
(lldb) p argc
Program aborted due to an unhandled Error:
Error value was Success. (Note: Success values must still be checked prior to being destroyed).

Thread 1 received signal SIGABRT, Aborted.
thr_kill () at thr_kill.S:3
3	thr_kill.S: No such file or directory.
(gdb) bt
#0  thr_kill () at thr_kill.S:3
#1  0x00000008049a0004 in __raise (s=6) at /usr/src/lib/libc/gen/raise.c:52
#2  0x0000000804916229 in abort () at /usr/src/lib/libc/stdlib/abort.c:67
#3  0x000000000451b5f5 in fatalUncheckedError () at /usr/src/contrib/llvm-project/llvm/lib/Support/Error.cpp:112
#4  0x00000000019cf008 in GetEntryPointAddress () at /usr/src/contrib/llvm-project/llvm/include/llvm/Support/Error.h:267
#5  0x0000000001bccbd8 in ConstructorSetup () at /usr/src/contrib/llvm-project/lldb/source/Target/ThreadPlanCallFunction.cpp:67
#6  0x0000000001bcd2c0 in ThreadPlanCallFunction () at /usr/src/contrib/llvm-project/lldb/source/Target/ThreadPlanCallFunction.cpp:114
#7  0x00000000020076d4 in InferiorCallMmap () at /usr/src/contrib/llvm-project/lldb/source/Plugins/Process/Utility/InferiorCallPOSIX.cpp:97
#8  0x0000000001f4be33 in DoAllocateMemory () at /usr/src/contrib/llvm-project/lldb/source/Plugins/Process/FreeBSD/ProcessFreeBSD.cpp:604
#9  0x0000000001fe51b9 in AllocatePage () at /usr/src/contrib/llvm-project/lldb/source/Target/Memory.cpp:347
#10 0x0000000001fe5385 in AllocateMemory () at /usr/src/contrib/llvm-project/lldb/source/Target/Memory.cpp:383
#11 0x0000000001974da2 in AllocateMemory () at /usr/src/contrib/llvm-project/lldb/source/Target/Process.cpp:2301
#12 CanJIT () at /usr/src/contrib/llvm-project/lldb/source/Target/Process.cpp:2331
#13 0x0000000001a1bf3d in Evaluate () at /usr/src/contrib/llvm-project/lldb/source/Expression/UserExpression.cpp:190
#14 0x00000000019ce7a2 in EvaluateExpression () at /usr/src/contrib/llvm-project/lldb/source/Target/Target.cpp:2372
#15 0x0000000001ad784c in EvaluateExpression () at /usr/src/contrib/llvm-project/lldb/source/Commands/CommandObjectExpression.cpp:414
#16 0x0000000001ad86ae in DoExecute () at /usr/src/contrib/llvm-project/lldb/source/Commands/CommandObjectExpression.cpp:646
#17 0x0000000001a5e3ed in Execute () at /usr/src/contrib/llvm-project/lldb/source/Interpreter/CommandObject.cpp:1003
#18 0x0000000001a6c4a3 in HandleCommand () at /usr/src/contrib/llvm-project/lldb/source/Interpreter/CommandInterpreter.cpp:1762
#19 0x0000000001a6f98c in IOHandlerInputComplete () at /usr/src/contrib/llvm-project/lldb/source/Interpreter/CommandInterpreter.cpp:2760
#20 0x0000000001a90b08 in Run () at /usr/src/contrib/llvm-project/lldb/source/Core/IOHandler.cpp:548
#21 0x00000000019a6c6a in ExecuteIOHandlers () at /usr/src/contrib/llvm-project/lldb/source/Core/Debugger.cpp:903
#22 0x0000000001a70337 in RunCommandInterpreter () at /usr/src/contrib/llvm-project/lldb/source/Interpreter/CommandInterpreter.cpp:2946
#23 0x0000000001d9d812 in RunCommandInterpreter () at /usr/src/contrib/llvm-project/lldb/source/API/SBDebugger.cpp:1169
#24 0x0000000001918be8 in MainLoop () at /usr/src/contrib/llvm-project/lldb/tools/driver/Driver.cpp:675
#25 0x000000000191a114 in main () at /usr/src/contrib/llvm-project/lldb/tools/driver/Driver.cpp:890```

Fix the incorrect error catch by only instantiating an `Error` object if it is necessary.

Reviewed By: JDevlieghere

Differential Revision: https://reviews.llvm.org/D86355

(cherry picked from commit 1ce07cd614beab5150a5440c7faf195009f99e2c)
---
 lldb/source/Target/Target.cpp | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index dad56376005c..707344f99fcb 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -2401,21 +2401,13 @@ lldb::addr_t Target::GetPersistentSymbol(ConstString name) {
 
 llvm::Expected<Address> Target::GetEntryPointAddress() {
   Module *exe_module = GetExecutableModulePointer();
-  llvm::Error error = llvm::Error::success();
-  assert(!error); // Check the success value when assertions are enabled.
 
-  if (!exe_module || !exe_module->GetObjectFile()) {
-    error = llvm::make_error<llvm::StringError>("No primary executable found",
-                                                llvm::inconvertibleErrorCode());
-  } else {
+  // Try to find the entry point address in the primary executable.
+  const bool has_primary_executable = exe_module && exe_module->GetObjectFile();
+  if (has_primary_executable) {
     Address entry_addr = exe_module->GetObjectFile()->GetEntryPointAddress();
     if (entry_addr.IsValid())
       return entry_addr;
-
-    error = llvm::make_error<llvm::StringError>(
-        "Could not find entry point address for executable module \"" +
-            exe_module->GetFileSpec().GetFilename().GetStringRef() + "\"",
-        llvm::inconvertibleErrorCode());
   }
 
   const ModuleList &modules = GetImages();
@@ -2426,14 +2418,21 @@ llvm::Expected<Address> Target::GetEntryPointAddress() {
       continue;
 
     Address entry_addr = module_sp->GetObjectFile()->GetEntryPointAddress();
-    if (entry_addr.IsValid()) {
-      // Discard the error.
-      llvm::consumeError(std::move(error));
+    if (entry_addr.IsValid())
       return entry_addr;
-    }
   }
 
-  return std::move(error);
+  // We haven't found the entry point address. Return an appropriate error.
+  if (!has_primary_executable)
+    return llvm::make_error<llvm::StringError>(
+        "No primary executable found and could not find entry point address in "
+        "any executable module",
+        llvm::inconvertibleErrorCode());
+
+  return llvm::make_error<llvm::StringError>(
+      "Could not find entry point address for primary executable module \"" +
+          exe_module->GetFileSpec().GetFilename().GetStringRef() + "\"",
+      llvm::inconvertibleErrorCode());
 }
 
 lldb::addr_t Target::GetCallableLoadAddress(lldb::addr_t load_addr,

From 0c001a171c7d671f0129d69c6a47b159544cdca6 Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Sun, 23 Aug 2020 20:01:38 -0400
Subject: [PATCH 163/363] [clang][Driver] Implement AddClangSystemIncludeArgs
 and HasNativeLLVMSupport for the OpenBSD clang driver.

If not overridden, AddClangSystemIncludeArgs's implementation is empty, so by
default, no system include args are added to the Clang driver. This means that
invoking Clang without the frontend must include a manual -I/usr/include flag,
which is inconsistent behavior. Therefore, override and implement this method
to match. Some boilerplate is also borrowed for handling of the other driver
flags.

While we are here, also override and enable HasNativeLLVMSupport.

Patch by: 3405691582 (dana koch)

Differential Revision: https://reviews.llvm.org/D86412

(cherry picked from commit 2b37174b9a5db235e493cb72e4454cc08a1b1791)
---
 clang/lib/Driver/ToolChains/OpenBSD.cpp | 37 +++++++++++++++++++++++++
 clang/lib/Driver/ToolChains/OpenBSD.h   |  6 ++++
 clang/lib/Frontend/InitHeaderSearch.cpp |  2 ++
 3 files changed, 45 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index 9c1a9c5f8228..b0174ac62b58 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -10,10 +10,12 @@
 #include "Arch/Mips.h"
 #include "Arch/Sparc.h"
 #include "CommonArgs.h"
+#include "clang/Config/config.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/SanitizerArgs.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/Path.h"
 
 using namespace clang::driver;
 using namespace clang::driver::tools;
@@ -278,3 +280,38 @@ void OpenBSD::addClangTargetOptions(const ArgList &DriverArgs,
                           options::OPT_fno_use_init_array, false))
     CC1Args.push_back("-fno-use-init-array");
 }
+
+void OpenBSD::AddClangSystemIncludeArgs(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  const Driver &D = getDriver();
+
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+    return;
+
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    SmallString<128> Dir(D.ResourceDir);
+    llvm::sys::path::append(Dir, "include");
+    addSystemInclude(DriverArgs, CC1Args, Dir.str());
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
+    return;
+
+  // Check for configure-time C include directories.
+  StringRef CIncludeDirs(C_INCLUDE_DIRS);
+  if (CIncludeDirs != "") {
+    SmallVector<StringRef, 5> dirs;
+    CIncludeDirs.split(dirs, ":");
+    for (StringRef dir : dirs) {
+      StringRef Prefix =
+          llvm::sys::path::is_absolute(dir) ? StringRef(D.SysRoot) : "";
+      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
+    }
+    return;
+  }
+
+  addExternCSystemInclude(DriverArgs, CC1Args, D.SysRoot + "/usr/include");
+}
+
+bool OpenBSD::HasNativeLLVMSupport() const { return true; }
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.h b/clang/lib/Driver/ToolChains/OpenBSD.h
index 897eee57ab68..9924aa22e9d9 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.h
+++ b/clang/lib/Driver/ToolChains/OpenBSD.h
@@ -65,6 +65,12 @@ class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF {
     return ToolChain::CST_Libcxx;
   }
 
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+
+  bool HasNativeLLVMSupport() const override;
+
   void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
                            llvm::opt::ArgStringList &CmdArgs) const override;
 
diff --git a/clang/lib/Frontend/InitHeaderSearch.cpp b/clang/lib/Frontend/InitHeaderSearch.cpp
index 16f1f1670e8d..bc31445d6d08 100644
--- a/clang/lib/Frontend/InitHeaderSearch.cpp
+++ b/clang/lib/Frontend/InitHeaderSearch.cpp
@@ -270,6 +270,7 @@ void InitHeaderSearch::AddDefaultCIncludePaths(const llvm::Triple &triple,
   case llvm::Triple::Linux:
   case llvm::Triple::Hurd:
   case llvm::Triple::Solaris:
+  case llvm::Triple::OpenBSD:
     llvm_unreachable("Include management is handled in the driver.");
 
   case llvm::Triple::CloudABI: {
@@ -423,6 +424,7 @@ void InitHeaderSearch::AddDefaultIncludePaths(const LangOptions &Lang,
   case llvm::Triple::Emscripten:
   case llvm::Triple::Linux:
   case llvm::Triple::Hurd:
+  case llvm::Triple::OpenBSD:
   case llvm::Triple::Solaris:
   case llvm::Triple::WASI:
   case llvm::Triple::AIX:

From e4f4d48665526c6d8001e8101dc539a7f7653aec Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Sun, 23 Aug 2020 20:44:29 -0400
Subject: [PATCH 164/363] [clang][Driver] Implement addLibCxxIncludePaths and
 getCompilerRT for the OpenBSD clang driver.

(cherry picked from commit bf3577ef64c300ba7841a90a4e09e1e305271976)
---
 clang/lib/Driver/ToolChains/OpenBSD.cpp | 60 +++++++++++++++----------
 clang/lib/Driver/ToolChains/OpenBSD.h   |  9 +++-
 2 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index b0174ac62b58..4f2d04058d24 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -258,29 +258,6 @@ OpenBSD::OpenBSD(const Driver &D, const llvm::Triple &Triple,
   getFilePaths().push_back(getDriver().SysRoot + "/usr/lib");
 }
 
-void OpenBSD::AddCXXStdlibLibArgs(const ArgList &Args,
-                                  ArgStringList &CmdArgs) const {
-  bool Profiling = Args.hasArg(options::OPT_pg);
-
-  CmdArgs.push_back(Profiling ? "-lc++_p" : "-lc++");
-  CmdArgs.push_back(Profiling ? "-lc++abi_p" : "-lc++abi");
-}
-
-Tool *OpenBSD::buildAssembler() const {
-  return new tools::openbsd::Assembler(*this);
-}
-
-Tool *OpenBSD::buildLinker() const { return new tools::openbsd::Linker(*this); }
-
-void OpenBSD::addClangTargetOptions(const ArgList &DriverArgs,
-                                    ArgStringList &CC1Args,
-                                    Action::OffloadKind) const {
-  // Support for .init_array is still new (Aug 2016).
-  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
-                          options::OPT_fno_use_init_array, false))
-    CC1Args.push_back("-fno-use-init-array");
-}
-
 void OpenBSD::AddClangSystemIncludeArgs(
     const llvm::opt::ArgList &DriverArgs,
     llvm::opt::ArgStringList &CC1Args) const {
@@ -314,4 +291,41 @@ void OpenBSD::AddClangSystemIncludeArgs(
   addExternCSystemInclude(DriverArgs, CC1Args, D.SysRoot + "/usr/include");
 }
 
+void OpenBSD::addLibCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                                    llvm::opt::ArgStringList &CC1Args) const {
+  addSystemInclude(DriverArgs, CC1Args,
+                   getDriver().SysRoot + "/usr/include/c++/v1");
+}
+
+void OpenBSD::AddCXXStdlibLibArgs(const ArgList &Args,
+                                  ArgStringList &CmdArgs) const {
+  bool Profiling = Args.hasArg(options::OPT_pg);
+
+  CmdArgs.push_back(Profiling ? "-lc++_p" : "-lc++");
+  CmdArgs.push_back(Profiling ? "-lc++abi_p" : "-lc++abi");
+}
+
+std::string OpenBSD::getCompilerRT(const ArgList &Args,
+                                   StringRef Component,
+                                   FileType Type) const {
+  SmallString<128> Path(getDriver().SysRoot);
+  llvm::sys::path::append(Path, "/usr/lib/libcompiler_rt.a");
+  return std::string(Path.str());
+}
+
+void OpenBSD::addClangTargetOptions(const ArgList &DriverArgs,
+                                    ArgStringList &CC1Args,
+                                    Action::OffloadKind) const {
+  // Support for .init_array is still new (Aug 2016).
+  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
+                          options::OPT_fno_use_init_array, false))
+    CC1Args.push_back("-fno-use-init-array");
+}
+
+Tool *OpenBSD::buildAssembler() const {
+  return new tools::openbsd::Assembler(*this);
+}
+
+Tool *OpenBSD::buildLinker() const { return new tools::openbsd::Linker(*this); }
+
 bool OpenBSD::HasNativeLLVMSupport() const { return true; }
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.h b/clang/lib/Driver/ToolChains/OpenBSD.h
index 9924aa22e9d9..09595faf9d6b 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.h
+++ b/clang/lib/Driver/ToolChains/OpenBSD.h
@@ -54,6 +54,8 @@ class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF {
   OpenBSD(const Driver &D, const llvm::Triple &Triple,
           const llvm::opt::ArgList &Args);
 
+  bool HasNativeLLVMSupport() const override;
+
   bool IsMathErrnoDefault() const override { return false; }
   bool IsObjCNonFragileABIDefault() const override { return true; }
   bool isPIEDefault() const override { return true; }
@@ -69,11 +71,14 @@ class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF {
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
 
-  bool HasNativeLLVMSupport() const override;
-
+  void addLibCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                             llvm::opt::ArgStringList &CC1Args) const override;
   void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
                            llvm::opt::ArgStringList &CmdArgs) const override;
 
+  std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component,
+                            FileType Type = ToolChain::FT_Static) const override;
+
   unsigned GetDefaultStackProtectorLevel(bool KernelOrKext) const override {
     return 2;
   }

From 90c2c51a524f28c23c41c67eb71d7e278ede7524 Mon Sep 17 00:00:00 2001
From: Amy Huang 
Date: Tue, 28 Jul 2020 11:23:59 -0700
Subject: [PATCH 165/363] Revert "Switch to using -debug-info-kind=constructor
 as default (from =limited)"

This reverts commit 227db86a1b7dd6f96f7df14890fcd071bc4fe1f5.

Causing debug info errors in google3 LTO builds; also causes a
debuginfo-test failure.

(cherry picked from commit 394db2259575ef3cac8d3d37836b11eb2373c435)
---
 clang/lib/Driver/ToolChains/Clang.cpp            | 16 +++++++---------
 clang/test/Driver/cl-options.c                   |  6 +++---
 clang/test/Driver/clang-g-opts.c                 |  2 +-
 clang/test/Driver/cuda-dwarf-2.cu                |  2 +-
 clang/test/Driver/debug-options-as.c             |  2 +-
 clang/test/Driver/debug-options.c                |  8 ++++----
 clang/test/Driver/integrated-as.s                | 10 +++++-----
 clang/test/Driver/myriad-toolchain.c             |  2 +-
 clang/test/Driver/openmp-offload-gpu.c           |  2 +-
 clang/test/Driver/split-debug.c                  | 10 +++++-----
 .../SymbolFile/PDB/Inputs/ClassLayoutTest.cpp    |  1 -
 11 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c77ae5a44a0e..f0a5451322aa 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -498,7 +498,7 @@ static codegenoptions::DebugInfoKind DebugLevelToInfoKind(const Arg &A) {
     return codegenoptions::DebugLineTablesOnly;
   if (A.getOption().matches(options::OPT_gline_directives_only))
     return codegenoptions::DebugDirectivesOnly;
-  return codegenoptions::DebugInfoConstructor;
+  return codegenoptions::LimitedDebugInfo;
 }
 
 static bool mustUseNonLeafFramePointerForTarget(const llvm::Triple &Triple) {
@@ -2380,7 +2380,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
           CmdArgs.push_back(Value.data());
         } else {
           RenderDebugEnablingArgs(Args, CmdArgs,
-                                  codegenoptions::DebugInfoConstructor,
+                                  codegenoptions::LimitedDebugInfo,
                                   DwarfVersion, llvm::DebuggerKind::Default);
         }
       } else if (Value.startswith("-mcpu") || Value.startswith("-mfpu") ||
@@ -3653,7 +3653,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
   if (const Arg *A =
           Args.getLastArg(options::OPT_g_Group, options::OPT_gsplit_dwarf,
                           options::OPT_gsplit_dwarf_EQ)) {
-    DebugInfoKind = codegenoptions::DebugInfoConstructor;
+    DebugInfoKind = codegenoptions::LimitedDebugInfo;
 
     // If the last option explicitly specified a debug-info level, use it.
     if (checkDebugInfoOption(A, Args, D, TC) &&
@@ -3758,7 +3758,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
     if (checkDebugInfoOption(A, Args, D, TC)) {
       if (DebugInfoKind != codegenoptions::DebugLineTablesOnly &&
           DebugInfoKind != codegenoptions::DebugDirectivesOnly) {
-        DebugInfoKind = codegenoptions::DebugInfoConstructor;
+        DebugInfoKind = codegenoptions::LimitedDebugInfo;
         CmdArgs.push_back("-dwarf-ext-refs");
         CmdArgs.push_back("-fmodule-format=obj");
       }
@@ -3778,9 +3778,7 @@ static void RenderDebugOptions(const ToolChain &TC, const Driver &D,
           TC.GetDefaultStandaloneDebug());
   if (const Arg *A = Args.getLastArg(options::OPT_fstandalone_debug))
     (void)checkDebugInfoOption(A, Args, D, TC);
-  if ((DebugInfoKind == codegenoptions::LimitedDebugInfo ||
-       DebugInfoKind == codegenoptions::DebugInfoConstructor) &&
-      NeedFullDebug)
+  if (DebugInfoKind == codegenoptions::LimitedDebugInfo && NeedFullDebug)
     DebugInfoKind = codegenoptions::FullDebugInfo;
 
   if (Args.hasFlag(options::OPT_gembed_source, options::OPT_gno_embed_source,
@@ -6566,7 +6564,7 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
                           options::OPT_gline_tables_only)) {
     *EmitCodeView = true;
     if (DebugInfoArg->getOption().matches(options::OPT__SLASH_Z7))
-      *DebugInfoKind = codegenoptions::DebugInfoConstructor;
+      *DebugInfoKind = codegenoptions::LimitedDebugInfo;
     else
       *DebugInfoKind = codegenoptions::DebugLineTablesOnly;
   } else {
@@ -6863,7 +6861,7 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA,
     // the guard for source type, however there is a test which asserts
     // that some assembler invocation receives no -debug-info-kind,
     // and it's not clear whether that test is just overly restrictive.
-    DebugInfoKind = (WantDebug ? codegenoptions::DebugInfoConstructor
+    DebugInfoKind = (WantDebug ? codegenoptions::LimitedDebugInfo
                                : codegenoptions::NoDebugInfo);
     // Add the -fdebug-compilation-dir flag if needed.
     addDebugCompDirArg(Args, CmdArgs, C.getDriver().getVFS());
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 0dcaf6108806..d0c48ae41d9a 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -524,11 +524,11 @@
 
 // RUN: %clang_cl /Zi /c -### -- %s 2>&1 | FileCheck -check-prefix=Zi %s
 // Zi: "-gcodeview"
-// Zi: "-debug-info-kind=constructor"
+// Zi: "-debug-info-kind=limited"
 
 // RUN: %clang_cl /Z7 /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7 %s
 // Z7: "-gcodeview"
-// Z7: "-debug-info-kind=constructor"
+// Z7: "-debug-info-kind=limited"
 
 // RUN: %clang_cl /Zd /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7GMLT %s
 // Z7GMLT: "-gcodeview"
@@ -557,7 +557,7 @@
 // which made it "win". This test could not detect that bug.
 // RUN: %clang_cl /Z7 -gdwarf /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7_gdwarf %s
 // Z7_gdwarf: "-gcodeview"
-// Z7_gdwarf: "-debug-info-kind=constructor"
+// Z7_gdwarf: "-debug-info-kind=limited"
 // Z7_gdwarf: "-dwarf-version=4"
 
 // RUN: %clang_cl -fmsc-version=1800 -TP -### -- %s 2>&1 | FileCheck -check-prefix=CXX11 %s
diff --git a/clang/test/Driver/clang-g-opts.c b/clang/test/Driver/clang-g-opts.c
index 60c97790b7da..bc714b6c9379 100644
--- a/clang/test/Driver/clang-g-opts.c
+++ b/clang/test/Driver/clang-g-opts.c
@@ -31,7 +31,7 @@
 // RUN:             | FileCheck --check-prefix=CHECK-WITH-G-DWARF2 %s
 
 // CHECK-WITHOUT-G-NOT: -debug-info-kind
-// CHECK-WITH-G: "-debug-info-kind=constructor"
+// CHECK-WITH-G: "-debug-info-kind=limited"
 // CHECK-WITH-G: "-dwarf-version=4"
 // CHECK-WITH-G-DWARF2: "-dwarf-version=2"
 
diff --git a/clang/test/Driver/cuda-dwarf-2.cu b/clang/test/Driver/cuda-dwarf-2.cu
index 92b8919729fc..bcfb2444bc51 100644
--- a/clang/test/Driver/cuda-dwarf-2.cu
+++ b/clang/test/Driver/cuda-dwarf-2.cu
@@ -49,7 +49,7 @@
 
 // HAS_DEBUG-NOT: warning: debug
 // HAS_DEBUG: "-fcuda-is-device"
-// HAS_DEBUG-SAME: "-debug-info-kind={{constructor|line-tables-only}}"
+// HAS_DEBUG-SAME: "-debug-info-kind={{limited|line-tables-only}}"
 // HAS_DEBUG-SAME: "-dwarf-version=2"
 // HAS_DEBUG: ptxas
 // HAS_DEBUG-SAME: "-g"
diff --git a/clang/test/Driver/debug-options-as.c b/clang/test/Driver/debug-options-as.c
index 4808219702e7..51475680e9b1 100644
--- a/clang/test/Driver/debug-options-as.c
+++ b/clang/test/Driver/debug-options-as.c
@@ -23,7 +23,7 @@
 // RUN:   | FileCheck %s
 //
 // CHECK: "-cc1as"
-// CHECK: "-debug-info-kind=constructor"
+// CHECK: "-debug-info-kind=limited"
 
 // Check to make sure clang with -g on a .s file gets passed -dwarf-debug-producer.
 // rdar://12955296
diff --git a/clang/test/Driver/debug-options.c b/clang/test/Driver/debug-options.c
index 2d1a0b2d5cd8..189c1f9addeb 100644
--- a/clang/test/Driver/debug-options.c
+++ b/clang/test/Driver/debug-options.c
@@ -274,18 +274,18 @@
 // GLIO_ONLY_DWARF2: "-dwarf-version=2"
 //
 // G_ONLY: "-cc1"
-// G_ONLY: "-debug-info-kind=constructor"
+// G_ONLY: "-debug-info-kind=limited"
 //
 // These tests assert that "-gline-tables-only" "-g" uses the latter,
 // but otherwise not caring about the DebugInfoKind.
 // G_ONLY_DWARF2: "-cc1"
-// G_ONLY_DWARF2: "-debug-info-kind={{standalone|constructor}}"
+// G_ONLY_DWARF2: "-debug-info-kind={{standalone|limited}}"
 // G_ONLY_DWARF2: "-dwarf-version=2"
 //
 // G_STANDALONE: "-cc1"
 // G_STANDALONE: "-debug-info-kind=standalone"
 // G_LIMITED: "-cc1"
-// G_LIMITED: "-debug-info-kind=constructor"
+// G_LIMITED: "-debug-info-kind=limited"
 // G_DWARF2: "-dwarf-version=2"
 // G_DWARF4: "-dwarf-version=4"
 //
@@ -339,7 +339,7 @@
 // NOCI: "-gno-column-info"
 //
 // GEXTREFS: "-dwarf-ext-refs" "-fmodule-format=obj"
-// GEXTREFS: "-debug-info-kind={{standalone|constructor}}"
+// GEXTREFS: "-debug-info-kind={{standalone|limited}}"
 
 // RUN: not %clang -cc1 -debug-info-kind=watkind 2>&1 | FileCheck -check-prefix=BADSTRING1 %s
 // BADSTRING1: error: invalid value 'watkind' in '-debug-info-kind=watkind'
diff --git a/clang/test/Driver/integrated-as.s b/clang/test/Driver/integrated-as.s
index 05999cfe002b..0194a3d5a438 100644
--- a/clang/test/Driver/integrated-as.s
+++ b/clang/test/Driver/integrated-as.s
@@ -27,19 +27,19 @@
 // XA_INCLUDE2: "-Ifoo_dir"
 
 // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-4 -gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2 %s
-// DWARF2: "-debug-info-kind=constructor" "-dwarf-version=2"
+// DWARF2: "-debug-info-kind=limited" "-dwarf-version=2"
 
 // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-3 2>&1 | FileCheck --check-prefix=DWARF3 %s
-// DWARF3: "-debug-info-kind=constructor" "-dwarf-version=3"
+// DWARF3: "-debug-info-kind=limited" "-dwarf-version=3"
 
 // RUN: %clang -### -target x86_64--- -c -integrated-as %s -gdwarf-4 2>&1 | FileCheck --check-prefix=DWARF4 %s
-// DWARF4: "-debug-info-kind=constructor" "-dwarf-version=4"
+// DWARF4: "-debug-info-kind=limited" "-dwarf-version=4"
 
 // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Xassembler -gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2XASSEMBLER %s
-// DWARF2XASSEMBLER: "-debug-info-kind=constructor" "-dwarf-version=2"
+// DWARF2XASSEMBLER: "-debug-info-kind=limited" "-dwarf-version=2"
 
 // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Wa,-gdwarf-2 2>&1 | FileCheck --check-prefix=DWARF2WA %s
-// DWARF2WA: "-debug-info-kind=constructor" "-dwarf-version=2"
+// DWARF2WA: "-debug-info-kind=limited" "-dwarf-version=2"
 
 // A dwarf version number that driver can't parse is just stuffed in.
 // RUN: %clang -### -target x86_64--- -c -integrated-as %s -Wa,-gdwarf-huh 2>&1 | FileCheck --check-prefix=BOGODWARF %s
diff --git a/clang/test/Driver/myriad-toolchain.c b/clang/test/Driver/myriad-toolchain.c
index a4bd260a1498..215a02fd0dec 100644
--- a/clang/test/Driver/myriad-toolchain.c
+++ b/clang/test/Driver/myriad-toolchain.c
@@ -83,7 +83,7 @@
 // NOSTDLIB-NOT: "-lc"
 
 // RUN: %clang -### -c -g %s -target sparc-myriad 2>&1 | FileCheck -check-prefix=G_SPARC %s
-// G_SPARC: "-debug-info-kind=constructor" "-dwarf-version=2"
+// G_SPARC: "-debug-info-kind=limited" "-dwarf-version=2"
 
 // RUN: %clang -### -c %s -target sparc-myriad-rtems -fuse-init-array 2>&1 \
 // RUN: | FileCheck -check-prefix=USE-INIT-ARRAY %s
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index 3ddd6446d117..6415f1d61b72 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -241,7 +241,7 @@
 
 // HAS_DEBUG-NOT: warning: debug
 // HAS_DEBUG: "-triple" "nvptx64-nvidia-cuda"
-// HAS_DEBUG-SAME: "-debug-info-kind={{constructor|line-tables-only}}"
+// HAS_DEBUG-SAME: "-debug-info-kind={{limited|line-tables-only}}"
 // HAS_DEBUG-SAME: "-dwarf-version=2"
 // HAS_DEBUG-SAME: "-fopenmp-is-device"
 // HAS_DEBUG: ptxas
diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c
index 70f8d91d48e0..d40207d5ae3b 100644
--- a/clang/test/Driver/split-debug.c
+++ b/clang/test/Driver/split-debug.c
@@ -68,18 +68,18 @@
 // RUN: FileCheck -check-prefix=CHECK-NOINLINE-WITHOUT-SPLIT < %t %s
 //
 // CHECK-NOINLINE-WITHOUT-SPLIT: "-fno-split-dwarf-inlining"
-// CHECK-NOINLINE-WITHOUT-SPLIT: "-debug-info-kind=constructor"
+// CHECK-NOINLINE-WITHOUT-SPLIT: "-debug-info-kind=limited"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -gmlt -gsplit-dwarf -fno-split-dwarf-inlining -S -### %s 2> %t
 // RUN: FileCheck -check-prefix=CHECK-SPLIT-WITH-GMLT < %t %s
 //
-// CHECK-SPLIT-WITH-GMLT: "-debug-info-kind=constructor"
+// CHECK-SPLIT-WITH-GMLT: "-debug-info-kind=limited"
 // CHECK-SPLIT-WITH-GMLT: "-split-dwarf-output"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -fno-split-dwarf-inlining -S -### %s 2> %t
 // RUN: FileCheck -check-prefix=CHECK-SPLIT-WITH-NOINL < %t %s
 //
-// CHECK-SPLIT-WITH-NOINL: "-debug-info-kind=constructor"
+// CHECK-SPLIT-WITH-NOINL: "-debug-info-kind=limited"
 // CHECK-SPLIT-WITH-NOINL: "-split-dwarf-output"
 
 // RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -gmlt -fsplit-dwarf-inlining -S -### %s 2> %t
@@ -92,7 +92,7 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -gmlt -gsplit-dwarf -S -### %s 2> %t
 // RUN: FileCheck -check-prefix=CHECK-SPLIT-OVER-GMLT < %t %s
 //
-// CHECK-SPLIT-OVER-GMLT: "-debug-info-kind=constructor"
+// CHECK-SPLIT-OVER-GMLT: "-debug-info-kind=limited"
 // CHECK-SPLIT-OVER-GMLT: "-split-dwarf-file"
 // CHECK-SPLIT-OVER-GMLT: "-split-dwarf-output"
 
@@ -117,6 +117,6 @@
 // RUN: %clang -target x86_64-unknown-linux-gnu -g0 -gsplit-dwarf=split -S -### %s 2> %t
 // RUN: FileCheck -check-prefix=CHECK-SPLIT-OVER-G0 < %t %s
 //
-// CHECK-SPLIT-OVER-G0: "-debug-info-kind=constructor"
+// CHECK-SPLIT-OVER-G0: "-debug-info-kind=limited"
 // CHECK-SPLIT-OVER-G0: "-split-dwarf-file"
 // CHECK-SPLIT-OVER-G0: "-split-dwarf-output"
diff --git a/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp b/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp
index 503939680c50..3c4b005cdf1b 100644
--- a/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp
+++ b/lldb/test/Shell/SymbolFile/PDB/Inputs/ClassLayoutTest.cpp
@@ -106,7 +106,6 @@ class Class : public Base { // Test base class.
 int main() {
   MemberTest::Base B1;
   B1.Get();
-  MemberTest::Class C1;
   MemberTest::Class::StaticMemberFunc(1, 10, 2);
   return 0;
 }

From 83338bed0c2078870b36a81fe9a36723bd3be2e5 Mon Sep 17 00:00:00 2001
From: Ryan Prichard 
Date: Sat, 22 Aug 2020 17:12:52 -0700
Subject: [PATCH 166/363] [libunwind] Make findUnwindSectionsByPhdr static

Currently, this function is present in the dynsym table of
libunwind.so (on ELF targets). Make the function static instead.

In the previous release (LLVM 10.x), this function was instead a lambda
function inside LocalAddressSpace::findUnwindSections, and because
LocalAddressSpace was marked with _LIBUNWIND_HIDDEN, the lambda
function was also a hidden symbol.

Differential Revision: https://reviews.llvm.org/D86372

(cherry picked from commit 3c1b2e338dfdf4f305b1cb40e2ebcb93a7e470c3)
---
 libunwind/src/AddressSpace.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp
index e40c23291f84..93395ffb3b1d 100644
--- a/libunwind/src/AddressSpace.hpp
+++ b/libunwind/src/AddressSpace.hpp
@@ -473,8 +473,8 @@ static bool checkAddrInSegment(const Elf_Phdr *phdr, size_t image_base,
   return false;
 }
 
-int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t pinfo_size,
-                             void *data) {
+static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo,
+                                    size_t pinfo_size, void *data) {
   auto cbdata = static_cast<dl_iterate_cb_data *>(data);
   if (pinfo->dlpi_phnum == 0 || cbdata->targetAddr < pinfo->dlpi_addr)
     return 0;
@@ -523,7 +523,8 @@ int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t pinfo_size,
 // Given all the #ifdef's above, the code here is for
 // defined(LIBUNWIND_ARM_EHABI)
 
-int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t, void *data) {
+static int findUnwindSectionsByPhdr(struct dl_phdr_info *pinfo, size_t,
+                                    void *data) {
+  auto *cbdata = static_cast<dl_iterate_cb_data *>(data);
   bool found_obj = false;
   bool found_hdr = false;

From c160ff1564d8047c852f54d64ba4e9a81d080cac Mon Sep 17 00:00:00 2001
From: Richard Smith 
Date: Mon, 24 Aug 2020 22:49:41 -0700
Subject: [PATCH 167/363] PR37556: Don't diagnose conflicts between
 instantiated unqualified friend declarations and declarations found in inline
 namespaces within the target context.

(cherry picked from commit 04ba18563390ec87400fa068a9b4981b235ebaa6)
---
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  7 +++++++
 clang/test/SemaTemplate/friend.cpp            | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 2efb7acb9724..baec13ba627c 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2053,6 +2053,13 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl(
     // typedef (C++ [dcl.typedef]p4).
     if (Previous.isSingleTagDecl())
       Previous.clear();
+
+    // Filter out previous declarations that don't match the scope. The only
+    // effect this has is to remove declarations found in inline namespaces
+    // for friend declarations with unqualified names.
+    SemaRef.FilterLookupForScope(Previous, DC, /*Scope*/ nullptr,
+                                 /*ConsiderLinkage*/ true,
+                                 QualifierLoc.hasQualifier());
   }
 
   SemaRef.CheckFunctionDeclaration(/*Scope*/ nullptr, Function, Previous,
diff --git a/clang/test/SemaTemplate/friend.cpp b/clang/test/SemaTemplate/friend.cpp
index 777682be3f1b..283c7732ccff 100644
--- a/clang/test/SemaTemplate/friend.cpp
+++ b/clang/test/SemaTemplate/friend.cpp
@@ -122,3 +122,22 @@ namespace qualified_friend_finds_nothing {
   namespace N { void f(int); }
   B bi; // ok?!
 }
+
+namespace PR37556 {
+  inline namespace N { int x1, x2, y1, y2; } // expected-note 2{{previous}}
+  struct X {
+    friend void x1(int);
+    friend void PR37556::x2(int); // expected-error {{different kind}}
+  };
+  template<typename T> struct Y {
+    friend void y1(T);
+    friend void PR37556::y2(T); // expected-error {{different kind}}
+  };
+  template struct Y<int>;
+  template<typename T> struct Z {
+    friend void z1(T);
+    friend void PR37556::z2(T); // expected-error {{does not match any}}
+  };
+  inline namespace N { int z1, z2; }
+  template struct Z<int>;
+}

From 9f4a92a4349ff1794379b706a4851c678899d5d2 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Thu, 28 May 2020 09:41:01 -0500
Subject: [PATCH 168/363] Reuse OMPIRBuilder `struct ident_t` handling in Clang

Replace the `ident_t` handling in Clang with the methods offered by the
OMPIRBuilder. This cuts down on the clang code as well as the
differences between the two, making further transitions easier. Tests
have changed but there should not be a real functional change. The most
interesting difference is probably that we stop generating local ident_t
allocations for now and just use globals. Given that this happens only
with debug info, the location part of the `ident_t` is probably bigger
than the test anyway. As the location part is already a global, we can
avoid the allocation, memcpy, and store in favor of a constant global
that is slightly bigger. This can be revisited if there are
complications.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D80735
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 121 ++---------
 clang/lib/CodeGen/CGOpenMPRuntime.h           |  12 +-
 clang/test/OpenMP/distribute_codegen.cpp      |   4 +-
 ...ibute_parallel_for_num_threads_codegen.cpp |   2 +-
 ...tribute_parallel_for_proc_bind_codegen.cpp |   2 +-
 ..._parallel_for_simd_num_threads_codegen.cpp |   2 +-
 ...te_parallel_for_simd_proc_bind_codegen.cpp |   2 +-
 clang/test/OpenMP/distribute_simd_codegen.cpp |   4 +-
 clang/test/OpenMP/for_codegen.cpp             |   4 +-
 .../test/OpenMP/for_firstprivate_codegen.cpp  |   2 +-
 clang/test/OpenMP/for_lastprivate_codegen.cpp |   2 +-
 clang/test/OpenMP/for_linear_codegen.cpp      |   2 +-
 clang/test/OpenMP/for_reduction_codegen.cpp   |   6 +-
 .../test/OpenMP/for_reduction_codegen_UDR.cpp |   4 +-
 .../master_taskloop_in_reduction_codegen.cpp  |  14 +-
 .../master_taskloop_reduction_codegen.cpp     |   4 +-
 ...ter_taskloop_simd_in_reduction_codegen.cpp |  14 +-
 ...master_taskloop_simd_reduction_codegen.cpp |   4 +-
 clang/test/OpenMP/openmp_win_codegen.cpp      |   8 +-
 clang/test/OpenMP/ordered_codegen.cpp         |   2 +-
 clang/test/OpenMP/parallel_codegen.cpp        |  24 +--
 clang/test/OpenMP/parallel_copyin_codegen.cpp |   4 +-
 clang/test/OpenMP/parallel_for_codegen.cpp    |   2 +-
 clang/test/OpenMP/parallel_master_codegen.cpp |  38 ++--
 ...llel_master_taskloop_reduction_codegen.cpp |   4 +-
 ...master_taskloop_simd_reduction_codegen.cpp |   4 +-
 .../OpenMP/parallel_num_threads_codegen.cpp   |   2 +-
 .../OpenMP/parallel_proc_bind_codegen.cpp     |   2 +-
 .../OpenMP/parallel_reduction_codegen.cpp     |   2 +-
 clang/test/OpenMP/sections_codegen.cpp        |   4 +-
 .../OpenMP/sections_firstprivate_codegen.cpp  |   2 +-
 .../OpenMP/sections_lastprivate_codegen.cpp   |   2 +-
 .../OpenMP/sections_reduction_codegen.cpp     |   4 +-
 clang/test/OpenMP/single_codegen.cpp          |   2 +-
 .../OpenMP/single_firstprivate_codegen.cpp    |   2 +-
 clang/test/OpenMP/target_depend_codegen.cpp   |  62 +++---
 clang/test/OpenMP/target_parallel_codegen.cpp |   2 +-
 .../OpenMP/target_parallel_depend_codegen.cpp |  67 +++---
 .../OpenMP/target_parallel_for_codegen.cpp    |   2 +-
 .../target_parallel_for_simd_codegen.cpp      |   2 +-
 .../OpenMP/target_parallel_if_codegen.cpp     |   2 +-
 .../target_parallel_num_threads_codegen.cpp   |   2 +-
 .../OpenMP/target_simd_depend_codegen.cpp     |  66 +++---
 clang/test/OpenMP/target_teams_codegen.cpp    |   2 +-
 .../OpenMP/target_teams_depend_codegen.cpp    |  66 +++---
 .../target_teams_distribute_codegen.cpp       |   2 +-
 ...tribute_parallel_for_proc_bind_codegen.cpp |   2 +-
 ...te_parallel_for_simd_proc_bind_codegen.cpp |   2 +-
 .../target_teams_distribute_simd_codegen.cpp  |   2 +-
 .../OpenMP/target_teams_num_teams_codegen.cpp |   2 +-
 .../target_teams_thread_limit_codegen.cpp     |   2 +-
 .../test/OpenMP/task_in_reduction_codegen.cpp |  14 +-
 .../OpenMP/taskloop_in_reduction_codegen.cpp  |  14 +-
 .../OpenMP/taskloop_reduction_codegen.cpp     |   4 +-
 .../taskloop_simd_in_reduction_codegen.cpp    |  14 +-
 .../taskloop_simd_reduction_codegen.cpp       |   4 +-
 clang/test/OpenMP/teams_codegen.cpp           |   6 +-
 ...ibute_parallel_for_num_threads_codegen.cpp |   2 +-
 ...tribute_parallel_for_proc_bind_codegen.cpp |   2 +-
 ..._parallel_for_simd_num_threads_codegen.cpp |   2 +-
 ...te_parallel_for_simd_proc_bind_codegen.cpp |   2 +-
 clang/test/OpenMP/threadprivate_codegen.cpp   | 202 +++++++++---------
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  11 +-
 llvm/include/llvm/IR/IRBuilder.h              |  14 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  62 ++++--
 llvm/lib/IR/IRBuilder.cpp                     |  13 +-
 llvm/test/Transforms/OpenMP/deduplication.ll  |  14 +-
 67 files changed, 461 insertions(+), 517 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index a7e1fe8560b6..b221deab0174 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -29,7 +29,6 @@
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalValue.h"
@@ -1064,23 +1063,6 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, StringRef FirstSeparator,
                                  StringRef Separator)
     : CGM(CGM), FirstSeparator(FirstSeparator), Separator(Separator),
       OMPBuilder(CGM.getModule()), OffloadEntriesInfoManager(CGM) {
-  ASTContext &C = CGM.getContext();
-  RecordDecl *RD = C.buildImplicitRecord("ident_t");
-  QualType KmpInt32Ty = C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
-  RD->startDefinition();
-  // reserved_1
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  // flags
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  // reserved_2
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  // reserved_3
-  addFieldToRecordDecl(C, RD, KmpInt32Ty);
-  // psource
-  addFieldToRecordDecl(C, RD, C.VoidPtrTy);
-  RD->completeDefinition();
-  IdentQTy = C.getRecordType(RD);
-  IdentTy = CGM.getTypes().ConvertRecordDeclType(RD);
   KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8);
 
   // Initialize Types used in OpenMPIRBuilder from OMPKinds.def
@@ -1397,39 +1379,6 @@ createConstantGlobalStructAndAddToParent(CodeGenModule &CGM, QualType Ty,
   Fields.finishAndAddTo(Parent);
 }
 
-Address CGOpenMPRuntime::getOrCreateDefaultLocation(unsigned Flags) {
-  CharUnits Align = CGM.getContext().getTypeAlignInChars(IdentQTy);
-  unsigned Reserved2Flags = getDefaultLocationReserved2Flags();
-  FlagsTy FlagsKey(Flags, Reserved2Flags);
-  llvm::Value *Entry = OpenMPDefaultLocMap.lookup(FlagsKey);
-  if (!Entry) {
-    if (!DefaultOpenMPPSource) {
-      // Initialize default location for psource field of ident_t structure of
-      // all ident_t objects. Format is ";file;function;line;column;;".
-      // Taken from
-      // https://github.com/llvm/llvm-project/blob/master/openmp/runtime/src/kmp_str.cpp
-      DefaultOpenMPPSource =
-          CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer();
-      DefaultOpenMPPSource =
-          llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
-    }
-
-    llvm::Constant *Data[] = {
-        llvm::ConstantInt::getNullValue(CGM.Int32Ty),
-        llvm::ConstantInt::get(CGM.Int32Ty, Flags),
-        llvm::ConstantInt::get(CGM.Int32Ty, Reserved2Flags),
-        llvm::ConstantInt::getNullValue(CGM.Int32Ty), DefaultOpenMPPSource};
-    llvm::GlobalValue *DefaultOpenMPLocation =
-        createGlobalStruct(CGM, IdentQTy, isDefaultLocationConstant(), Data, "",
-                           llvm::GlobalValue::PrivateLinkage);
-    DefaultOpenMPLocation->setUnnamedAddr(
-        llvm::GlobalValue::UnnamedAddr::Global);
-
-    OpenMPDefaultLocMap[FlagsKey] = Entry = DefaultOpenMPLocation;
-  }
-  return Address(Entry, Align);
-}
-
 void CGOpenMPRuntime::setLocThreadIdInsertPt(CodeGenFunction &CGF,
                                              bool AtCurrentPoint) {
   auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
@@ -1458,62 +1407,24 @@ void CGOpenMPRuntime::clearLocThreadIdInsertPt(CodeGenFunction &CGF) {
 llvm::Value *CGOpenMPRuntime::emitUpdateLocation(CodeGenFunction &CGF,
                                                  SourceLocation Loc,
                                                  unsigned Flags) {
-  Flags |= OMP_IDENT_KMPC;
-  // If no debug info is generated - return global default location.
+  llvm::Constant *SrcLocStr;
   if (CGM.getCodeGenOpts().getDebugInfo() == codegenoptions::NoDebugInfo ||
-      Loc.isInvalid())
-    return getOrCreateDefaultLocation(Flags).getPointer();
-
-  assert(CGF.CurFn && "No function in current CodeGenFunction.");
-
-  CharUnits Align = CGM.getContext().getTypeAlignInChars(IdentQTy);
-  Address LocValue = Address::invalid();
-  auto I = OpenMPLocThreadIDMap.find(CGF.CurFn);
-  if (I != OpenMPLocThreadIDMap.end())
-    LocValue = Address(I->second.DebugLoc, Align);
-
-  // OpenMPLocThreadIDMap may have null DebugLoc and non-null ThreadID, if
-  // GetOpenMPThreadID was called before this routine.
-  if (!LocValue.isValid()) {
-    // Generate "ident_t .kmpc_loc.addr;"
-    Address AI = CGF.CreateMemTemp(IdentQTy, ".kmpc_loc.addr");
-    auto &Elem = OpenMPLocThreadIDMap.FindAndConstruct(CGF.CurFn);
-    Elem.second.DebugLoc = AI.getPointer();
-    LocValue = AI;
-
-    if (!Elem.second.ServiceInsertPt)
-      setLocThreadIdInsertPt(CGF);
-    CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
-    CGF.Builder.SetInsertPoint(Elem.second.ServiceInsertPt);
-    CGF.Builder.CreateMemCpy(LocValue, getOrCreateDefaultLocation(Flags),
-                             CGF.getTypeSize(IdentQTy));
-  }
-
-  // char **psource = &.kmpc_loc_.addr.psource;
-  LValue Base = CGF.MakeAddrLValue(LocValue, IdentQTy);
-  auto Fields = cast(IdentQTy->getAsTagDecl())->field_begin();
-  LValue PSource =
-      CGF.EmitLValueForField(Base, *std::next(Fields, IdentField_PSource));
-
-  llvm::Value *OMPDebugLoc = OpenMPDebugLocMap.lookup(Loc.getRawEncoding());
-  if (OMPDebugLoc == nullptr) {
-    SmallString<128> Buffer2;
-    llvm::raw_svector_ostream OS2(Buffer2);
-    // Build debug location
-    PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc);
-    OS2 << ";" << PLoc.getFilename() << ";";
+      Loc.isInvalid()) {
+    SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr();
+  } else {
+    std::string FunctionName = "";
     if (const auto *FD = dyn_cast_or_null(CGF.CurFuncDecl))
-      OS2 << FD->getQualifiedNameAsString();
-    OS2 << ";" << PLoc.getLine() << ";" << PLoc.getColumn() << ";;";
-    OMPDebugLoc = CGF.Builder.CreateGlobalStringPtr(OS2.str());
-    OpenMPDebugLocMap[Loc.getRawEncoding()] = OMPDebugLoc;
+      FunctionName = FD->getQualifiedNameAsString();
+    PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc);
+    const char *FileName = PLoc.getFilename();
+    unsigned Line = PLoc.getLine();
+    unsigned Column = PLoc.getColumn();
+    SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(FunctionName.c_str(), FileName,
+                                                Line, Column);
   }
-  // *psource = ";;;;;;";
-  CGF.EmitStoreOfScalar(OMPDebugLoc, PSource);
-
-  // Our callers always pass this to a runtime function, so for
-  // convenience, go ahead and return a naked pointer.
-  return LocValue.getPointer();
+  unsigned Reserved2Flags = getDefaultLocationReserved2Flags();
+  return OMPBuilder.getOrCreateIdent(SrcLocStr, llvm::omp::IdentFlag(Flags),
+                                     Reserved2Flags);
 }
 
 llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF,
@@ -1595,7 +1506,7 @@ void CGOpenMPRuntime::functionFinished(CodeGenFunction &CGF) {
 }
 
 llvm::Type *CGOpenMPRuntime::getIdentTyPointerTy() {
-  return IdentTy->getPointerTo();
+  return OMPBuilder.IdentPtr;
 }
 
 llvm::Type *CGOpenMPRuntime::getKmpc_MicroPointerTy() {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index eb22f155f5ef..cf3dbf59634d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -374,17 +374,7 @@ class CGOpenMPRuntime {
 private:
   /// An OpenMP-IR-Builder instance.
   llvm::OpenMPIRBuilder OMPBuilder;
-  /// Default const ident_t object used for initialization of all other
-  /// ident_t objects.
-  llvm::Constant *DefaultOpenMPPSource = nullptr;
-  using FlagsTy = std::pair;
-  /// Map of flags and corresponding default locations.
-  using OpenMPDefaultLocMapTy = llvm::DenseMap;
-  OpenMPDefaultLocMapTy OpenMPDefaultLocMap;
-  Address getOrCreateDefaultLocation(unsigned Flags);
-
-  QualType IdentQTy;
-  llvm::StructType *IdentTy = nullptr;
+
   /// Map for SourceLocation and OpenMP runtime library debug locations.
   typedef llvm::DenseMap OpenMPDebugLocMapTy;
   OpenMPDebugLocMapTy OpenMPDebugLocMap;
diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp
index 4e8bcb44f63d..bbece45c6e31 100644
--- a/clang/test/OpenMP/distribute_codegen.cpp
+++ b/clang/test/OpenMP/distribute_codegen.cpp
@@ -55,8 +55,8 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
-// CHECK-DAG: [[DEF_LOC_DISTRIBUTE_0:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_DISTRIBUTE_0:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void without_schedule_clause(float *a, float *b, float *c, float *d) {
diff --git a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
index 8d941391c75b..21d094945427 100644
--- a/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp
@@ -15,7 +15,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
index 3e2a65e47f0e..fce005be80fb 100644
--- a/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp
@@ -16,7 +16,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
index 318fc1401963..014fb9523fe5 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -15,7 +15,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
index 716d7d7fa2e9..4fb1f5b0274d 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -16,7 +16,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp
index 7229c8095f0e..a0c3c6accc0f 100644
--- a/clang/test/OpenMP/distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -68,8 +68,8 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
-// CHECK-DAG: [[DEF_LOC_DISTRIBUTE_0:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_DISTRIBUTE_0:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void without_schedule_clause(float *a, float *b, float *c, float *d) {
diff --git a/clang/test/OpenMP/for_codegen.cpp b/clang/test/OpenMP/for_codegen.cpp
index 26b09c574f3c..5c4f984e8fc1 100644
--- a/clang/test/OpenMP/for_codegen.cpp
+++ b/clang/test/OpenMP/for_codegen.cpp
@@ -22,8 +22,8 @@
 // PROF-INSTR-PATH: constant [25 x i8] c"for_codegen-test.profraw\00"
 
 // CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
-// CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 514, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 514, i32 0, i32 0, i8*
 // CHECK-DAG: [[I:@.+]] = global i8 1,
 // CHECK-DAG: [[J:@.+]] = global i8 2,
 // CHECK-DAG: [[K:@.+]] = global i8 3,
diff --git a/clang/test/OpenMP/for_firstprivate_codegen.cpp b/clang/test/OpenMP/for_firstprivate_codegen.cpp
index 1cfd94af9d4e..756665523b7c 100644
--- a/clang/test/OpenMP/for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/for_firstprivate_codegen.cpp
@@ -65,7 +65,7 @@ S s_arr[] = {1, 2};
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S var(3);
 // CHECK: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: ([[S_FLOAT_TY]]*)* [[S_FLOAT_TY_DESTR:@[^ ]+]] {{[^,]+}}, {{.+}}([[S_FLOAT_TY]]* [[TEST]]
diff --git a/clang/test/OpenMP/for_lastprivate_codegen.cpp b/clang/test/OpenMP/for_lastprivate_codegen.cpp
index fd7cad07e8b4..fbbb6ad6bc3d 100644
--- a/clang/test/OpenMP/for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/for_lastprivate_codegen.cpp
@@ -172,7 +172,7 @@ char cnt;
 // BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-DAG: [[X:@.+]] = global double 0.0
 // CHECK-DAG: [[F:@.+]] = global float 0.0
 // CHECK-DAG: [[CNT:@.+]] = global i8 0
diff --git a/clang/test/OpenMP/for_linear_codegen.cpp b/clang/test/OpenMP/for_linear_codegen.cpp
index 2f4e8dd531dd..fd9d89c38dcb 100644
--- a/clang/test/OpenMP/for_linear_codegen.cpp
+++ b/clang/test/OpenMP/for_linear_codegen.cpp
@@ -112,7 +112,7 @@ struct SST {
 // BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-DAG: [[F:@.+]] = global float 0.0
 // CHECK-DAG: [[CNT:@.+]] = global i8 0
 template 
diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp
index 61f7afd6b460..5a360fb24684 100644
--- a/clang/test/OpenMP/for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen.cpp
@@ -29,9 +29,9 @@ struct S {
 
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
-// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
 // CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
 
 template 
diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
index 31168bc325e3..5a20fa187e9c 100644
--- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
@@ -53,8 +53,8 @@ void init_plus(BaseS1&, const BaseS1&);
 
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, i{{[0-9]+}} }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8*
-// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 0, i8*
 // CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
 
 #pragma omp declare reduction(operator* : S : omp_out.f = 17 * omp_in.f) initializer(omp_priv = S())
diff --git a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
index 5d6d1645408f..e6cc39c5345a 100644
--- a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
@@ -39,22 +39,22 @@ int main(int argc, char **argv) {
 }
 
 // CHECK-LABEL: @main
-// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID:%.+]])
+// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID:%.+]])
 // CHECK:       [[TD1:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 3, i8* %
 // CHECK-NEXT:  store i8* [[TD1]], i8** [[TD1_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 // CHECK:       [[TD2:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 2, i8* %
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 
 // CHECK:       define internal void [[OMP_PARALLEL]](
 // CHECK:       [[RES:%.+]] = call {{.*}}i32 @__kmpc_master(
 // CHECK-NEXT:  [[IS_MASTER:%.+]] = icmp ne i32 [[RES]], 0
 // CHECK-NEXT:  br i1 [[IS_MASTER]], label {{%?}}[[THEN:.+]], label {{%?}}[[EXIT:.+]]
 // CHECK:       [[THEN]]
-// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
 // CHECK-NEXT:  [[TASK_T_WITH_PRIVS:%.+]] = bitcast i8* [[TASK_T]] to [[T]]*
 // CHECK:       [[PRIVS:%.+]] = getelementptr inbounds [[T]], [[T]]* [[TASK_T_WITH_PRIVS]], i32 0, i32 1
 // CHECK:       [[TD1_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 0
@@ -63,7 +63,7 @@ int main(int argc, char **argv) {
 // CHECK-NEXT:  [[TD2_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 1
 // CHECK-NEXT:  [[TD2:%.+]] = load i8*, i8** %{{.+}},
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_REF]],
-// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK_T]], i32 1,
+// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK_T]], i32 1,
 // CHECK:  call {{.*}}void @__kmpc_end_master(
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
 // CHECK:       [[EXIT]]
diff --git a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp
index 2c67e49caf43..4d151bed649d 100644
--- a/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_reduction_codegen.cpp
@@ -161,8 +161,8 @@ sum = 0.0;
 // CHECK:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]],
-// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
-// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* {{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
+// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* {{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
 // CHECK:    call void @__kmpc_end_taskgroup(%struct.ident_t*
 // CHECK:  call {{.*}}void @__kmpc_end_master(
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
index 58f1b0e034b0..aca7f0f47244 100644
--- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
@@ -39,18 +39,18 @@ int main(int argc, char **argv) {
 }
 
 // CHECK-LABEL: @main
-// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID:%.+]])
+// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID:%.+]])
 // CHECK:       [[TD1:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 3, i8* %
 // CHECK-NEXT:  store i8* [[TD1]], i8** [[TD1_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 // CHECK:       [[TD2:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 2, i8* %
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 
 // CHECK:       define internal void [[OMP_PARALLEL]](
-// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
 // CHECK-NEXT:  [[TASK_T_WITH_PRIVS:%.+]] = bitcast i8* [[TASK_T]] to [[T]]*
 // CHECK:       [[PRIVS:%.+]] = getelementptr inbounds [[T]], [[T]]* [[TASK_T_WITH_PRIVS]], i32 0, i32 1
 // CHECK:       [[TD1_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 0
@@ -59,7 +59,7 @@ int main(int argc, char **argv) {
 // CHECK-NEXT:  [[TD2_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 1
 // CHECK-NEXT:  [[TD2:%.+]] = load i8*, i8** %{{.+}},
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_REF]],
-// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK_T]], i32 1,
+// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK_T]], i32 1,
 // CHECK:       ret void
 // CHECK-NEXT:  }
 
diff --git a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp
index 0e31b2f4eb49..c48a52029ebb 100644
--- a/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_simd_reduction_codegen.cpp
@@ -157,8 +157,8 @@ sum = 0.0;
 // CHECK:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]],
-// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @{{.+}} to i32 (i32, i8*)*))
-// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* {{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @{{.+}} to i32 (i32, i8*)*))
+// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* {{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
 // CHECK:    call void @__kmpc_end_taskgroup(%struct.ident_t*
 
 // CHECK:    ret i32
diff --git a/clang/test/OpenMP/openmp_win_codegen.cpp b/clang/test/OpenMP/openmp_win_codegen.cpp
index 4b330eccc669..11f5adb39fe5 100644
--- a/clang/test/OpenMP/openmp_win_codegen.cpp
+++ b/clang/test/OpenMP/openmp_win_codegen.cpp
@@ -33,7 +33,7 @@ struct Test {
 int main() {
   // CHECK: call void @{{.+}}main
   Test::main();
-  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* {{.*}}@0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*))
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* {{.*}}@1, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*))
 #pragma omp parallel
   {
     try {
@@ -53,12 +53,12 @@ int main() {
 // CHECK: invoke void @{{.+}}foo
 // CHECK: [[CATCHSWITCH:%.+]] = catchswitch within none
 // CHECK: [[CATCHPAD:%.+]] = catchpad within [[CATCHSWITCH]]
-// CHECK: call void @__kmpc_critical(%struct.ident_t* {{.*}}@0, i32 [[GID:%.+]],
+// CHECK: call void @__kmpc_critical(%struct.ident_t* {{.*}}@1, i32 [[GID:%.+]],
 // CHECK: invoke void @{{.+}}bar
-// CHECK: call void @__kmpc_end_critical(%struct.ident_t* {{.*}}@0, i32 [[GID]],
+// CHECK: call void @__kmpc_end_critical(%struct.ident_t* {{.*}}@1, i32 [[GID]],
 // CHECK: catchret from [[CATCHPAD]] to
 // CHECK:      cleanuppad within [[CATCHPAD]] []
-// CHECK-NEXT: call void @__kmpc_end_critical(%struct.ident_t* {{.*}}@0, i32 [[GID]],
+// CHECK-NEXT: call void @__kmpc_end_critical(%struct.ident_t* {{.*}}@1, i32 [[GID]],
 // CHECK-NEXT: cleanupret from {{.*}} unwind label %[[CATCHTERM:[^ ]+]]
 // CHECK:      cleanuppad within none []
 // CHECK-NEXT: call void @"?terminate@@YAXXZ"() #{{[0-9]+}} [ "funclet"(token %{{.*}}) ]
diff --git a/clang/test/OpenMP/ordered_codegen.cpp b/clang/test/OpenMP/ordered_codegen.cpp
index 07ecee45974c..85235f31a0ea 100644
--- a/clang/test/OpenMP/ordered_codegen.cpp
+++ b/clang/test/OpenMP/ordered_codegen.cpp
@@ -15,7 +15,7 @@
 #define HEADER
 
 // CHECK: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
-// CHECK: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void static_not_chunked(float *a, float *b, float *c, float *d) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp
index 6fd394c0bbc9..bceab0637f6a 100644
--- a/clang/test/OpenMP/parallel_codegen.cpp
+++ b/clang/test/OpenMP/parallel_codegen.cpp
@@ -17,10 +17,9 @@
 #define HEADER
 // ALL-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // ALL-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// ALL-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// ALL-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CHECK-DEBUG-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
-// CHECK-DEBUG-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DEBUG-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
 // CHECK-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}parallel_codegen.cpp;main;[[@LINE+23]];1;;\00"
 // CHECK-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}parallel_codegen.cpp;tmain;[[@LINE+11]];1;;\00"
 // IRBUILDER-DEBUG-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
@@ -72,14 +71,11 @@ int main (int argc, char **argv) {
 // ALL:       ret i32
 // ALL-NEXT:  }
 // ALL-DEBUG-LABEL: define i32 @main(i32 %argc, i8** %argv)
-// CHECK-DEBUG:       [[LOC_2_ADDR:%.+]] = alloca %struct.ident_t
-// CHECK-DEBUG:       [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %struct.ident_t* [[LOC_2_ADDR]] to i8*
-// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[KMPC_LOC_VOIDPTR]], i8* align 8 bitcast (%struct.ident_t* [[DEF_LOC_2]] to i8*), i64 24, i1 false)
+
 // ALL-DEBUG:       store i32 %argc, i32* [[ARGC_ADDR:%.+]],
 // ALL-DEBUG:       [[VLA:%.+]] = alloca i32, i64 [[VLA_SIZE:%[^,]+]],
-// CHECK-DEBUG:       [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %struct.ident_t, %struct.ident_t* [[LOC_2_ADDR]], i32 0, i32 4
-// CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC1]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
-// CHECK-DEBUG:       call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[LOC_2_ADDR]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i32*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 [[VLA_SIZE]], i32* [[VLA]])
+
+// CHECK-DEBUG:       call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.*}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i32*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 [[VLA_SIZE]], i32* [[VLA]])
 // IRBUILDER-DEBUG:       call {{.*}}void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.*}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* [[VLA]])
 // ALL-DEBUG:        [[ARGV:%.+]] = load i8**, i8*** {{%[a-z0-9.]+}}
 // ALL-DEBUG:        [[RET:%.+]] = call i32 [[TMAIN:@.+tmain.+]](i8** [[ARGV]])
@@ -144,13 +140,9 @@ int main (int argc, char **argv) {
 // ALL:  ret i32 0
 // ALL-NEXT:  }
 // ALL-DEBUG:       define linkonce_odr i32 [[TMAIN]](i8** %argc)
-// CHECK-DEBUG-DAG:   [[LOC_2_ADDR:%.+]] = alloca %struct.ident_t
-// CHECK-DEBUG:       [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %struct.ident_t* [[LOC_2_ADDR]] to i8*
-// CHECK-DEBUG-NEXT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[KMPC_LOC_VOIDPTR]], i8* align 8 bitcast (%struct.ident_t* [[DEF_LOC_2]] to i8*), i64 24, i1 false)
-// CHECK-DEBUG-NEXT:  store i8** %argc, i8*** [[ARGC_ADDR:%.+]],
-// CHECK-DEBUG:  [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %struct.ident_t, %struct.ident_t* [[LOC_2_ADDR]], i32 0, i32 4
-// CHECK-DEBUG-NEXT:  store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
-// CHECK-DEBUG-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[LOC_2_ADDR]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i8***, i64)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8*** [[ARGC_ADDR]], i64 %{{.+}})
+
+// CHECK-DEBUG:       store i8** %argc, i8*** [[ARGC_ADDR:%.+]],
+// CHECK-DEBUG:       call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.*}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i8***, i64)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8*** [[ARGC_ADDR]], i64 %{{.+}})
 // IRBUILDER-DEBUG:   call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.*}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i8***, i64)* [[OMP_OUTLINED:@.+]] to void (i32*, i32*, ...)*), i8*** [[ARGC_ADDR]], i64 %{{.+}})
 // ALL-DEBUG:  ret i32 0
 // ALL-DEBUG-NEXT:  }
diff --git a/clang/test/OpenMP/parallel_copyin_codegen.cpp b/clang/test/OpenMP/parallel_copyin_codegen.cpp
index 1331a2b8c0e4..0f974af5ec54 100644
--- a/clang/test/OpenMP/parallel_copyin_codegen.cpp
+++ b/clang/test/OpenMP/parallel_copyin_codegen.cpp
@@ -48,10 +48,10 @@ struct S {
 
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // TLS-CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // TLS-CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// TLS-CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// TLS-CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 
 // CHECK-DAG: [[T_VAR:@.+]] = internal global i{{[0-9]+}} 1122,
 // CHECK-DAG: [[VEC:@.+]] = internal global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
diff --git a/clang/test/OpenMP/parallel_for_codegen.cpp b/clang/test/OpenMP/parallel_for_codegen.cpp
index de445634470b..cc5bb8f4858f 100644
--- a/clang/test/OpenMP/parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/parallel_for_codegen.cpp
@@ -29,7 +29,7 @@
 
 #ifndef OMP5
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
-// CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 514, i32 0, i32 0, i8*
+// CHECK-DAG: [[LOOP_LOC:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 514, i32 0, i32 0, i8*
 
 // CHECK-LABEL: with_var_schedule
 void with_var_schedule() {
diff --git a/clang/test/OpenMP/parallel_master_codegen.cpp b/clang/test/OpenMP/parallel_master_codegen.cpp
index 82e18c80f103..850a650ca7ad 100644
--- a/clang/test/OpenMP/parallel_master_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_codegen.cpp
@@ -15,7 +15,7 @@
 
 // CK1-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK1-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK1-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK1-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CK1-LABEL: foo
 void foo() {}
@@ -52,7 +52,7 @@ void parallel_master() {
 
 // CK2-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK2-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK2-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK2-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void parallel_master_private() {
   int a;
@@ -98,12 +98,12 @@ void parallel_master_private() {
 
 // CK3-LABEL: define void @{{.+}}parallel_master{{.+}}
 // CK3:       [[A_VAL:%.+]] = alloca i32
-// CK3:       call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* {{.+}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* [[OMP_OUTLINED:@.+]] to void 
+// CK3:       call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* {{.+}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* [[OMP_OUTLINED:@.+]] to void
 
 // CK3:       define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias [[GTID:%.+]], i32* noalias [[BTID:%.+]], i32* nonnull align 4 dereferenceable(4) [[A_VAL]])
 // CK3:       [[GTID_ADDR:%.+]] = alloca i32*
 // CK3:       [[BTID_ADDR:%.+]] = alloca i32*
-// CK3:       [[A_ADDR:%.+]] = alloca i32* 
+// CK3:       [[A_ADDR:%.+]] = alloca i32*
 // CK3:       store i32* [[GTID]], i32** [[GTID_ADDR]]
 // CK3:       store i32* [[BTID]], i32** [[BTID_ADDR]]
 // CK3:       store i32* [[A_VAL]], i32** [[A_ADDR]]
@@ -145,7 +145,7 @@ void parallel_master_default_firstprivate() {
 // CK31:       [[CONV:%.+]] = bitcast i64* [[A_CASTED]] to i32*
 // CK31:       store i32 [[ZERO_VAL]], i32* [[CONV]]
 // CK31:       [[ONE_VAL:%.+]] = load i64, i64* [[A_CASTED]]
-// CK31:       call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i64 [[ONE_VAL]])
+// CK31:       call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.*}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i64 [[ONE_VAL]])
 // CK31:       ret void
 
 // CK31:       [[GLOBAL_TID_ADDR:%.+]] = alloca i32*
@@ -157,14 +157,14 @@ void parallel_master_default_firstprivate() {
 // CK31:       [[CONV]] = bitcast i64* [[A_ADDR]]
 // CK31:       [[ZERO_VAL]] = load i32*, i32** [[GLOBAL_TID_ADDR]]
 // CK31:       [[ONE_VAL]] = load i32, i32* [[ZERO_VAL]]
-// CK31:       [[TWO_VAL:%.+]] = call i32 @__kmpc_master(%struct.ident_t* @0, i32 [[ONE_VAL]])
+// CK31:       [[TWO_VAL:%.+]] = call i32 @__kmpc_master(%struct.ident_t* @{{.*}}, i32 [[ONE_VAL]])
 // CK31:       [[THREE:%.+]] = icmp ne i32 [[TWO_VAL]], 0
 // CK31:       br i1 %3, label [[OMP_IF_THEN:%.+]], label [[OMP_IF_END:%.+]]
 
 // CK31:       [[FOUR:%.+]] = load i32, i32* [[CONV:%.+]]
 // CK31:       [[INC:%.+]] = add nsw i32 [[FOUR]]
 // CK31:       store i32 [[INC]], i32* [[CONV]]
-// CK31:       call void @__kmpc_end_master(%struct.ident_t* @0, i32 [[ONE_VAL]])
+// CK31:       call void @__kmpc_end_master(%struct.ident_t* @{{.*}}, i32 [[ONE_VAL]])
 // CK31:       br label [[OMP_IF_END]]
 
 // CK31:       ret void
@@ -287,7 +287,7 @@ void parallel_master_default_firstprivate() {
 
 // CK4-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK4-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK4-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK4-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void parallel_master_firstprivate() {
   int a;
@@ -307,7 +307,7 @@ void parallel_master_firstprivate() {
 // CK4:       define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias [[GLOBAL_TID:%.+]], i32* noalias [[BOUND_TID:%.+]], i64 [[A_VAL]])
 // CK4:       [[GLOBAL_TID_ADDR:%.+]] = alloca i32*
 // CK4:       [[BOUND_TID_ADDR:%.+]] = alloca i32*
-// CK4:       [[A_ADDR:%.+]] = alloca i64 
+// CK4:       [[A_ADDR:%.+]] = alloca i64
 // CK4:       store i32* [[GLOBAL_TID]], i32** [[GLOBAL_TID_ADDR]]
 // CK4:       store i32* [[BOUND_TID]], i32** [[BOUND_TID_ADDR]]
 // CK4:       store i64 [[A_VAL]], i64* [[A_ADDR]]
@@ -345,14 +345,14 @@ void parallel_master_firstprivate() {
 // CK5-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK5-DAG: [[A:@.+]] = {{.+}} i32 0
 // CK5-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK5-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK5-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CK5-DAG: [[A_CACHE:@.+]] = common global i8** null
-// CK5-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK5-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // TLS-CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // TLS-CHECK-DAG: [[A:@.+]] = thread_local global i32 0
 // TLS-CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// TLS-CHECK-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
-// TLS-CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// TLS-CHECK-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// TLS-CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 int a;
 #pragma omp threadprivate(a)
@@ -443,9 +443,9 @@ void parallel_master_copyin() {
 
 // CK6-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK6-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK6-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK6-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CK6-DAG: [[GOMP:@.+]] = common global [8 x i32] zeroinitializer
-// CK6-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 18, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK6-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void parallel_master_reduction() {
   int g;
@@ -510,7 +510,7 @@ void parallel_master_reduction() {
 
 // CK7-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK7-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK7-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK7-DAG: [[DEF_LOC_1:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void parallel_master_if() {
 #pragma omp parallel master if (parallel: false)
@@ -525,7 +525,7 @@ void parallel_master_if() {
 // CK7:       ret void
 
 // CK7:       define internal void @.omp_outlined.(i32* noalias [[GTID:%.+]], i32* noalias [[BTID:%.+]])
-// CK7:       [[EXECUTE:%.+]] = call i32 @__kmpc_master(%struct.ident_t* @0, i32 %1)
+// CK7:       [[EXECUTE:%.+]] = call i32 @__kmpc_master(%struct.ident_t* @1, i32 %1)
 // CK7:       call void @__kmpc_end_master(%struct.ident_t* [[DEF_LOC_1]], i32 %1)
 
 #endif
@@ -544,7 +544,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CK8-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CK8-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK8-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK8-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
@@ -600,7 +600,7 @@ int main() {
 // SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
 // CK9-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK9-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK9-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK9-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 typedef void **omp_allocator_handle_t;
 extern const omp_allocator_handle_t omp_null_allocator;
 extern const omp_allocator_handle_t omp_default_mem_alloc;
diff --git a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp
index 1c24e4e1d084..c75e35f9ed41 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_reduction_codegen.cpp
@@ -161,8 +161,8 @@ sum = 0.0;
 // CHECK:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]],
-// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
-// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* {{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
+// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* {{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
 // CHECK:    call void @__kmpc_end_taskgroup(%struct.ident_t*
 // CHECK:  call {{.*}}void @__kmpc_end_master(
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp
index c83589f34c78..bf5b2dc88d24 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_simd_reduction_codegen.cpp
@@ -161,8 +161,8 @@ sum = 0.0;
 // CHECK:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]],
-// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
-// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* {{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
+// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* {{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
 // CHECK:    call void @__kmpc_end_taskgroup(%struct.ident_t*
 // CHECK:  call {{.*}}void @__kmpc_end_master(
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
diff --git a/clang/test/OpenMP/parallel_num_threads_codegen.cpp b/clang/test/OpenMP/parallel_num_threads_codegen.cpp
index 79615b934168..9ec712f83c53 100644
--- a/clang/test/OpenMP/parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/parallel_num_threads_codegen.cpp
@@ -15,7 +15,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/parallel_proc_bind_codegen.cpp
index 4747a8182e58..8b9e09191b24 100644
--- a/clang/test/OpenMP/parallel_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/parallel_proc_bind_codegen.cpp
@@ -14,7 +14,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/parallel_reduction_codegen.cpp b/clang/test/OpenMP/parallel_reduction_codegen.cpp
index 21f9efb32223..3b4348e4bc1d 100644
--- a/clang/test/OpenMP/parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/parallel_reduction_codegen.cpp
@@ -84,7 +84,7 @@ struct SST {
 // BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
 // CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
 
 //CHECK: foo_array_sect
diff --git a/clang/test/OpenMP/sections_codegen.cpp b/clang/test/OpenMP/sections_codegen.cpp
index 68fd38f7d0bb..ba918c385fc3 100644
--- a/clang/test/OpenMP/sections_codegen.cpp
+++ b/clang/test/OpenMP/sections_codegen.cpp
@@ -9,8 +9,8 @@
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
-// CHECK-DAG: [[IMPLICIT_BARRIER_SECTIONS_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
-// CHECK-DAG: [[SECTIONS_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 1026, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
+// CHECK-DAG: [[SECTIONS_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 1026, i32 0, i32 0, i8*
 // CHECK-LABEL: foo
 void foo() {};
 // CHECK-LABEL: bar
diff --git a/clang/test/OpenMP/sections_firstprivate_codegen.cpp b/clang/test/OpenMP/sections_firstprivate_codegen.cpp
index 4ba4cf70eb2b..8d73c5dcfca1 100644
--- a/clang/test/OpenMP/sections_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/sections_firstprivate_codegen.cpp
@@ -65,7 +65,7 @@ S s_arr[] = {1, 2};
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S var(3);
 // CHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
-// CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
+// CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: ([[S_FLOAT_TY]]*)* [[S_FLOAT_TY_DESTR:@[^ ]+]] {{[^,]+}}, {{.+}}([[S_FLOAT_TY]]* [[TEST]]
diff --git a/clang/test/OpenMP/sections_lastprivate_codegen.cpp b/clang/test/OpenMP/sections_lastprivate_codegen.cpp
index 12a64d17e727..acbda9dddd90 100644
--- a/clang/test/OpenMP/sections_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/sections_lastprivate_codegen.cpp
@@ -44,7 +44,7 @@ volatile int g = 1212;
 
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
-// CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
+// CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
 // CHECK-DAG: [[X:@.+]] = global double 0.0
 // OMP50-DAG: [[LAST_IV_X:@.+]] = {{.*}}common global i32 0
 // OMP50-DAG: [[LAST_X:@.+]] = {{.*}}common global double 0.000000e+00,
diff --git a/clang/test/OpenMP/sections_reduction_codegen.cpp b/clang/test/OpenMP/sections_reduction_codegen.cpp
index a583606c5677..19f57fd19feb 100644
--- a/clang/test/OpenMP/sections_reduction_codegen.cpp
+++ b/clang/test/OpenMP/sections_reduction_codegen.cpp
@@ -28,8 +28,8 @@ struct S {
 
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
-// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
 // CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
 
 template 
diff --git a/clang/test/OpenMP/single_codegen.cpp b/clang/test/OpenMP/single_codegen.cpp
index a56cdb0ae81a..e5b2c86b995b 100644
--- a/clang/test/OpenMP/single_codegen.cpp
+++ b/clang/test/OpenMP/single_codegen.cpp
@@ -34,7 +34,7 @@ class TestClass {
 // CHECK-DAG:   [[SST_TY:%.+]] = type { double }
 // CHECK-DAG:   [[SS_TY:%.+]] = type { i32, i8, i32* }
 // CHECK-DAG:   [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
-// CHECK:       [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+// CHECK:       [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 
 // CHECK:       define void [[FOO:@.+]]()
 
diff --git a/clang/test/OpenMP/single_firstprivate_codegen.cpp b/clang/test/OpenMP/single_firstprivate_codegen.cpp
index aadec94270fd..0c1d7df370df 100644
--- a/clang/test/OpenMP/single_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/single_firstprivate_codegen.cpp
@@ -63,7 +63,7 @@ int vec[] = {1, 2};
 S s_arr[] = {1, 2};
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S var(3);
-// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr global %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
+// CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: ([[S_FLOAT_TY]]*)* [[S_FLOAT_TY_DESTR:@[^ ]+]] {{[^,]+}}, {{.+}}([[S_FLOAT_TY]]* [[TEST]]
diff --git a/clang/test/OpenMP/target_depend_codegen.cpp b/clang/test/OpenMP/target_depend_codegen.cpp
index e8b07ace5fb0..b2e375f21186 100644
--- a/clang/test/OpenMP/target_depend_codegen.cpp
+++ b/clang/test/OpenMP/target_depend_codegen.cpp
@@ -75,22 +75,24 @@ int foo(int n) {
   TT d;
   static long *plocal;
 
-  // CHECK:       [[ADD:%.+]] = add nsw i32
-  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
-  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
-  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
-  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START:%.+]], i[[SZ]] 1
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 2
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 3
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target device(global + a) depend(in: global) depend(out: a, b, cn[4])
+// CHECK:       [[ADD:%.+]] = add nsw i32
+// CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+// CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       store i32 [[DEV]], i32* [[GEP]],
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START:%.+]], i[[SZ]] 1
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 2
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 3
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target device(global + a) depend(in                   \
+                                             : global) depend(out \
+                                                              : a, b, cn[4])
   {
   }
 
@@ -121,12 +123,12 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @{{.*}}, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START:%.+]], i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
 
   // CHECK:       [[ELSE]]:
@@ -137,30 +139,32 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START:%.+]], i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP_START]], i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* [[DEP_START]] to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
   // CHECK:       [[EXIT]]:
 
-  #pragma omp target device(global + a) nowait depend(inout: global, a, bn) if(a)
+#pragma omp target device(global + a) nowait depend(inout \
+                                                    : global, a, bn) if (a)
   {
     static int local1;
     *plocal = global;
     local1 = global;
   }
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target if(0) firstprivate(global) depend(out:global)
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target if (0) firstprivate(global) depend(out \
+                                                      : global)
   {
     global += 1;
   }
diff --git a/clang/test/OpenMP/target_parallel_codegen.cpp b/clang/test/OpenMP/target_parallel_codegen.cpp
index 2e094c294dfa..88a227565070 100644
--- a/clang/test/OpenMP/target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
 // CHECK-DAG: [[S1:%.+]] = type { double }
diff --git a/clang/test/OpenMP/target_parallel_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_depend_codegen.cpp
index 71d02ec19b4b..44ee8695611a 100644
--- a/clang/test/OpenMP/target_parallel_depend_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_depend_codegen.cpp
@@ -75,23 +75,25 @@ int foo(int n) {
   TT d;
   static long *plocal;
 
-  // CHECK:       [[ADD:%.+]] = add nsw i32
-  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
-  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
-  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
-  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 3
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target parallel device(global + a) depend(in: global) depend(out: a, b, cn[4])
+// CHECK:       [[ADD:%.+]] = add nsw i32
+// CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+// CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       store i32 [[DEV]], i32* [[GEP]],
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 3
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target parallel device(global + a) depend(in                   \
+                                                      : global) depend(out \
+                                                                       : a, b, cn[4])
   {
   }
 
@@ -122,13 +124,13 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @{{.*}}, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
 
   // CHECK:       [[ELSE]]:
@@ -139,32 +141,35 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
   // CHECK:       [[EXIT]]:
 
-  #pragma omp target parallel device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+#pragma omp target parallel device(global + a) nowait depend(inout                       \
+                                                             : global, a, bn) if (target \
+                                                                                  : a)
   {
     static int local1;
     *plocal = global;
     local1 = global;
   }
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target parallel if(0) firstprivate(global) depend(out:global)
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target parallel if (0) firstprivate(global) depend(out \
+                                                               : global)
   {
     global += 1;
   }
diff --git a/clang/test/OpenMP/target_parallel_for_codegen.cpp b/clang/test/OpenMP/target_parallel_for_codegen.cpp
index e8590530a0d8..78494dba97e8 100644
--- a/clang/test/OpenMP/target_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
 // CHECK-DAG: [[S1:%.+]] = type { double }
diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
index 055d5dce28bb..02d08b6329be 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -67,7 +67,7 @@
 #define HEADER
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
 // CHECK-DAG: [[S1:%.+]] = type { double }
diff --git a/clang/test/OpenMP/target_parallel_if_codegen.cpp b/clang/test/OpenMP/target_parallel_if_codegen.cpp
index b315362735fe..9f3f13e6aa9a 100644
--- a/clang/test/OpenMP/target_parallel_if_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_if_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[S1:%.+]] = type { double }
 // CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
diff --git a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
index f12248d6458c..bb231b3328e6 100644
--- a/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_num_threads_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[S1:%.+]] = type { double }
 // CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
diff --git a/clang/test/OpenMP/target_simd_depend_codegen.cpp b/clang/test/OpenMP/target_simd_depend_codegen.cpp
index 72cd550207b6..d45001c9eaa7 100644
--- a/clang/test/OpenMP/target_simd_depend_codegen.cpp
+++ b/clang/test/OpenMP/target_simd_depend_codegen.cpp
@@ -75,23 +75,25 @@ int foo(int n) {
   TT d;
   static long *plocal;
 
-  // CHECK:       [[ADD:%.+]] = add nsw i32
-  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
-  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
-  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
-  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 3
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target simd device(global + a) depend(in: global) depend(out: a, b, cn[4])
+// CHECK:       [[ADD:%.+]] = add nsw i32
+// CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+// CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       store i32 [[DEV]], i32* [[GEP]],
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 3
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target simd device(global + a) depend(in                   \
+                                                  : global) depend(out \
+                                                                   : a, b, cn[4])
   for (int i = 0; i < 10; ++i) {
   }
 
@@ -122,13 +124,13 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @{{.*}}, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
 
   // CHECK:       [[ELSE]]:
@@ -139,32 +141,34 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
   // CHECK:       [[EXIT]]:
 
-  #pragma omp target simd device(global + a) nowait depend(inout: global, a, bn) if(a)
+#pragma omp target simd device(global + a) nowait depend(inout \
+                                                         : global, a, bn) if (a)
   for (int i = 0; i < *plocal; ++i) {
     static int local1;
     *plocal = global;
     local1 = global;
   }
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target simd if(0) firstprivate(global) depend(out:global)
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target simd if (0) firstprivate(global) depend(out \
+                                                           : global)
   for (int i = 0; i < global; ++i) {
     global += 1;
   }
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp
index 9011c3c0ff80..af4831cb8792 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
 // CHECK-DAG: [[S1:%.+]] = type { double }
diff --git a/clang/test/OpenMP/target_teams_depend_codegen.cpp b/clang/test/OpenMP/target_teams_depend_codegen.cpp
index 9a58e40de750..69c749945de6 100644
--- a/clang/test/OpenMP/target_teams_depend_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_depend_codegen.cpp
@@ -75,23 +75,25 @@ int foo(int n) {
   TT d;
   static long *plocal;
 
-  // CHECK:       [[ADD:%.+]] = add nsw i32
-  // CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
-  // CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
-  // CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
-  // CHECK:       store i32 [[DEV]], i32* [[GEP]],
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 3
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target teams device(global + a) depend(in: global) depend(out: a, b, cn[4])
+// CHECK:       [[ADD:%.+]] = add nsw i32
+// CHECK:       store i32 [[ADD]], i32* [[DEVICE_CAP:%.+]],
+// CHECK:       [[GEP:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK:       [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]],
+// CHECK:       store i32 [[DEV]], i32* [[GEP]],
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i[[SZ]] {{20|40}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY0:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY0:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 3
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 4, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY0]](i32 [[GTID]], [[TASK_TY0]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target teams device(global + a) depend(in                   \
+                                                   : global) depend(out \
+                                                                    : a, b, cn[4])
   {
   }
 
@@ -122,13 +124,13 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @{{.*}}, i32 [[GTID]], i32 1, i[[SZ]] {{104|60}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1_:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1_:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
 
   // CHECK:       [[ELSE]]:
@@ -139,32 +141,34 @@ int foo(int n) {
   // CHECK:       [[DEV1:%.+]] = load i32, i32* [[DEVICE_CAP]],
   // CHECK:       [[DEV2:%.+]] = sext i32 [[DEV1]] to i64
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
+  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_target_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{56|28}}, i[[SZ]] {{16|12}}, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY1__:@.+]] to i32 (i32, i8*)*), i64 [[DEV2]])
   // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY1__:%.+]]*
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 1
   // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 2
   // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
+  // CHECK:       call i32 @__kmpc_omp_task_with_deps(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]], i32 3, i8* [[DEP]], i32 0, i8* null)
   // CHECK:       br label %[[EXIT:.+]]
   // CHECK:       [[EXIT]]:
 
-  #pragma omp target teams device(global + a) nowait depend(inout: global, a, bn) if(a)
+#pragma omp target teams device(global + a) nowait depend(inout \
+                                                          : global, a, bn) if (a)
   {
     static int local1;
     *plocal = global;
     local1 = global;
   }
 
-  // CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
-  // CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
-  // CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
-  // CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
-  // CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @0, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
-  // CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  // CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
-  // CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK]])
-  #pragma omp target teams if(0) firstprivate(global) depend(out:global)
+// CHECK:       [[TASK:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID]], i32 1, i[[SZ]] {{48|24}}, i[[SZ]] 4, i32 (i32, i8*)* bitcast (i32 (i32, %{{.+}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[BC_TASK:%.+]] = bitcast i8* [[TASK]] to [[TASK_TY2:%.+]]*
+// CHECK:       getelementptr %struct.kmp_depend_info, %struct.kmp_depend_info* %{{.+}}, i[[SZ]] 0
+// CHECK:       [[DEP:%.+]] = bitcast %struct.kmp_depend_info* %{{.+}} to i8*
+// CHECK:       call void @__kmpc_omp_wait_deps(%struct.ident_t* @1, i32 [[GTID]], i32 1, i8* [[DEP]], i32 0, i8* null)
+// CHECK:       call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+// CHECK:       call i32 [[TASK_ENTRY2]](i32 [[GTID]], [[TASK_TY2]]* [[BC_TASK]])
+// CHECK:       call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK]])
+#pragma omp target teams if (0) firstprivate(global) depend(out \
+                                                            : global)
   {
     global += 1;
   }
diff --git a/clang/test/OpenMP/target_teams_distribute_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
index 547e45f6d3e7..7f34ce07597e 100644
--- a/clang/test/OpenMP/target_teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
 // CHECK-DAG: [[S1:%.+]] = type { double }
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
index a7242c911245..9107c218a436 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -17,7 +17,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
index 9d10c2e3dc7c..40bf51f34028 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -17,7 +17,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
index 4912352e17ca..2c3fcf08d16e 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -68,7 +68,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[TT:%.+]] = type { i64, i8 }
 // CHECK-DAG: [[S1:%.+]] = type { double }
diff --git a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
index 93b28f8c4364..35763c946a13 100644
--- a/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_num_teams_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[S1:%.+]] = type { double }
 // CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
diff --git a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
index 2432d6b3ad6e..34ffbf6efa22 100644
--- a/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_thread_limit_codegen.cpp
@@ -40,7 +40,7 @@
 
 // CHECK-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 // CHECK-DAG: [[S1:%.+]] = type { double }
 // CHECK-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i[[SZ:32|64]], i32, i32 }
diff --git a/clang/test/OpenMP/task_in_reduction_codegen.cpp b/clang/test/OpenMP/task_in_reduction_codegen.cpp
index 8679daded948..64477211ca1f 100644
--- a/clang/test/OpenMP/task_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/task_in_reduction_codegen.cpp
@@ -51,18 +51,18 @@ int main(int argc, char **argv) {
 }
 
 // CHECK-LABEL: @main
-// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID:%.+]])
+// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID:%.+]])
 // CHECK:       [[TD1:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 3, i8* %
 // CHECK-NEXT:  store i8* [[TD1]], i8** [[TD1_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 // CHECK:       [[TD2:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 2, i8* %
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 
 // CHECK:       define internal void [[OMP_PARALLEL]](
-// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i64 56, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i64 56, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
 // CHECK-NEXT:  [[TASK_T_WITH_PRIVS:%.+]] = bitcast i8* [[TASK_T]] to [[T]]*
 // CHECK:       [[PRIVS:%.+]] = getelementptr inbounds [[T]], [[T]]* [[TASK_T_WITH_PRIVS]], i32 0, i32 1
 // CHECK:       [[TD1_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 0
@@ -71,7 +71,7 @@ int main(int argc, char **argv) {
 // CHECK-NEXT:  [[TD2_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 1
 // CHECK-NEXT:  [[TD2:%.+]] = load i8*, i8** %{{.+}},
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_REF]],
-// CHECK-NEXT:  call i32 @__kmpc_omp_task(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK_T]])
+// CHECK-NEXT:  call i32 @__kmpc_omp_task(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK_T]])
 // CHECK-NEXT:  ret void
 // CHECK-NEXT:  }
 
diff --git a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
index e648c2df50b8..3a150eeedd1f 100644
--- a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
@@ -39,18 +39,18 @@ int main(int argc, char **argv) {
 }
 
 // CHECK-LABEL: @main
-// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID:%.+]])
+// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID:%.+]])
 // CHECK:       [[TD1:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 3, i8* %
 // CHECK-NEXT:  store i8* [[TD1]], i8** [[TD1_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 // CHECK:       [[TD2:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 2, i8* %
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 
 // CHECK:       define internal void [[OMP_PARALLEL]](
-// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
 // CHECK-NEXT:  [[TASK_T_WITH_PRIVS:%.+]] = bitcast i8* [[TASK_T]] to [[T]]*
 // CHECK:       [[PRIVS:%.+]] = getelementptr inbounds [[T]], [[T]]* [[TASK_T_WITH_PRIVS]], i32 0, i32 1
 // CHECK:       [[TD1_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 0
@@ -59,7 +59,7 @@ int main(int argc, char **argv) {
 // CHECK-NEXT:  [[TD2_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 1
 // CHECK-NEXT:  [[TD2:%.+]] = load i8*, i8** %{{.+}},
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_REF]],
-// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK_T]], i32 1,
+// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK_T]], i32 1,
 // CHECK:       ret void
 // CHECK-NEXT:  }
 
diff --git a/clang/test/OpenMP/taskloop_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_reduction_codegen.cpp
index 7d143f33353a..f50c1318bf22 100644
--- a/clang/test/OpenMP/taskloop_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_reduction_codegen.cpp
@@ -160,8 +160,8 @@ sum = 0.0;
 // CHECK:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]],
-// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
-// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* {{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @[[TASK:.+]] to i32 (i32, i8*)*))
+// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* {{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
 // CHECK:    call void @__kmpc_end_taskgroup(%struct.ident_t*
 
 // CHECK:    ret i32
diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
index 4640b44cbe93..e0fd21d8e937 100644
--- a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
@@ -39,18 +39,18 @@ int main(int argc, char **argv) {
 }
 
 // CHECK-LABEL: @main
-// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID:%.+]])
+// CHECK:       void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID:%.+]])
 // CHECK:       [[TD1:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 3, i8* %
 // CHECK-NEXT:  store i8* [[TD1]], i8** [[TD1_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 // CHECK:       [[TD2:%.+]] = call i8* @__kmpc_taskred_init(i32 [[GTID]], i32 2, i8* %
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_ADDR:%[^,]+]],
-// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @0, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
-// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @0, i32 [[GTID]])
+// CHECK-NEXT:  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @1, i32 5, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i16*, i8**, i8**)* [[OMP_PARALLEL:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i64 %{{.+}}, i16* %{{.+}}, i8** [[TD1_ADDR]], i8** [[TD2_ADDR]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
+// CHECK-NEXT:  call void @__kmpc_end_taskgroup(%struct.ident_t* @1, i32 [[GTID]])
 
 // CHECK:       define internal void [[OMP_PARALLEL]](
-// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @0, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
+// CHECK:       [[TASK_T:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 [[GTID:%.+]], i32 1, i64 96, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[T:%.+]]*)* [[OMP_TASK:@.+]] to i32 (i32, i8*)*))
 // CHECK-NEXT:  [[TASK_T_WITH_PRIVS:%.+]] = bitcast i8* [[TASK_T]] to [[T]]*
 // CHECK:       [[PRIVS:%.+]] = getelementptr inbounds [[T]], [[T]]* [[TASK_T_WITH_PRIVS]], i32 0, i32 1
 // CHECK:       [[TD1_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 0
@@ -59,7 +59,7 @@ int main(int argc, char **argv) {
 // CHECK-NEXT:  [[TD2_REF:%.+]] = getelementptr inbounds [[PRIVATES]], [[PRIVATES]]* [[PRIVS]], i32 0, i32 1
 // CHECK-NEXT:  [[TD2:%.+]] = load i8*, i8** %{{.+}},
 // CHECK-NEXT:  store i8* [[TD2]], i8** [[TD2_REF]],
-// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @0, i32 [[GTID]], i8* [[TASK_T]], i32 1,
+// CHECK:       call void @__kmpc_taskloop(%struct.ident_t* @1, i32 [[GTID]], i8* [[TASK_T]], i32 1,
 // CHECK:       ret void
 // CHECK-NEXT:  }
 
diff --git a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp
index 16d42ec8e15e..e7d235b3fc96 100644
--- a/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_reduction_codegen.cpp
@@ -157,8 +157,8 @@ sum = 0.0;
 // CHECK:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
 // CHECK:    [[SUB12:%.*]] = sub nsw i32 [[DIV]], 1
 // CHECK:    store i32 [[SUB12]], i32* [[DOTCAPTURE_EXPR_9]],
-// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @{{.+}} to i32 (i32, i8*)*))
-// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* %{{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
+// CHECK:    [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* {{.+}}, i32 [[TMP0]], i32 1, i64 888, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @{{.+}} to i32 (i32, i8*)*))
+// CHECK:    call void @__kmpc_taskloop(%struct.ident_t* {{.+}}, i32 [[TMP0]], i8* [[TMP65]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 1, i32 0, i64 0, i8* null)
 // CHECK:    call void @__kmpc_end_taskgroup(%struct.ident_t*
 
 // CHECK:    ret i32
diff --git a/clang/test/OpenMP/teams_codegen.cpp b/clang/test/OpenMP/teams_codegen.cpp
index 54e0f6ea29eb..67031105fce4 100644
--- a/clang/test/OpenMP/teams_codegen.cpp
+++ b/clang/test/OpenMP/teams_codegen.cpp
@@ -266,7 +266,7 @@ int teams_template_struct(void) {
 
 // CK4-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK4-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK4-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK4-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CK4-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00"
 // CK4-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
 
@@ -327,7 +327,7 @@ int main (int argc, char **argv) {
 
 // CK5-DAG: %struct.ident_t = type { i32, i32, i32, i32, i8* }
 // CK5-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CK5-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK5-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 // CK5-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00"
 // CK5-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
 
@@ -414,7 +414,7 @@ int main (int argc, char **argv) {
 
 // CK6-LABEL: foo
 void foo() {
-  // CK6: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @0, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @{{.+}} to void (i32*, i32*, ...)*))
+  // CK6: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @1, i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @{{.+}} to void (i32*, i32*, ...)*))
 #pragma omp teams
   ;
 }
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
index 0f93fe219aae..ea4afa1418cb 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp
@@ -15,7 +15,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
index 0b7f3b2d8c62..0c242f851b7d 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp
@@ -16,7 +16,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
index 4faa99e2ee36..8c0c8208ab80 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp
@@ -15,7 +15,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[S_TY:%.+]] = type { [[INTPTR_T_TY:i[0-9]+]], [[INTPTR_T_TY]], [[INTPTR_T_TY]] }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
index 447a1a60109c..08b3cd3a47ba 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp
@@ -16,7 +16,7 @@ typedef __INTPTR_TYPE__ intptr_t;
 
 // CHECK-DAG: [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
-// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr global [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CHECK-DAG: [[DEF_LOC_2:@.+]] = private unnamed_addr constant [[IDENT_T_TY]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
 
 void foo();
 
diff --git a/clang/test/OpenMP/threadprivate_codegen.cpp b/clang/test/OpenMP/threadprivate_codegen.cpp
index c24d1ea78745..a46bb6907015 100644
--- a/clang/test/OpenMP/threadprivate_codegen.cpp
+++ b/clang/test/OpenMP/threadprivate_codegen.cpp
@@ -133,7 +133,7 @@ struct S5 {
 
 // CHECK-DAG:  [[GS1:@.+]] = internal global [[S1]] zeroinitializer
 // CHECK-DAG:  [[GS1]].cache. = common global i8** null
-// CHECK-DAG:  [[DEFAULT_LOC:@.+]] = private unnamed_addr global [[IDENT]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* {{@.+}}, i32 0, i32 0) }
+// CHECK-DAG:  [[DEFAULT_LOC:@.+]] = private unnamed_addr constant [[IDENT]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* {{@.+}}, i32 0, i32 0) }
 // CHECK-DAG:  [[GS2:@.+]] = internal global [[S2]] zeroinitializer
 // CHECK-DAG:  [[ARR_X:@.+]] = global [2 x [3 x [[S1]]]] zeroinitializer
 // CHECK-DAG:  [[ARR_X]].cache. = common global i8** null
@@ -164,7 +164,9 @@ struct S5 {
 // CHECK-DEBUG-DAG: [[ST_S4_ST:@.+]] = linkonce_odr global %struct.S4 zeroinitializer
 
 // CHECK-DEBUG-DAG: [[LOC1:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;249;1;;\00"
+// CHECK-DEBUG-DAG: [[ID1:@.*]] = private unnamed_addr constant %struct.ident_t { {{.*}} [[LOC1]]
 // CHECK-DEBUG-DAG: [[LOC2:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;304;1;;\00"
+// CHECK-DEBUG-DAG: [[ID2:@.*]] = private unnamed_addr constant %struct.ident_t { {{.*}} [[LOC2]]
 // CHECK-DEBUG-DAG: [[LOC3:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;422;19;;\00"
 // CHECK-DEBUG-DAG: [[LOC4:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;459;1;;\00"
 // CHECK-DEBUG-DAG: [[LOC5:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;476;9;;\00"
@@ -199,10 +201,8 @@ struct S5 {
 // CHECK-TLS-DAG:  @__dso_handle = external hidden global i8
 // CHECK-TLS-DAG:  [[GS1_TLS_INIT:@_ZTHL3gs1]] = internal alias void (), void ()* @__tls_init
 // CHECK-TLS-DAG:  [[ARR_X_TLS_INIT:@_ZTH5arr_x]] = alias void (), void ()* @__tls_init
-
 // CHECK-TLS-DAG:  [[ST_S4_ST_TLS_INIT:@_ZTHN2STI2S4E2stE]] = linkonce_odr alias void (), void ()* [[ST_S4_ST_CXX_INIT:@[^, ]*]]
 
-
 // OMP50-TLS: define internal void [[GS1_CXX_INIT:@.*]]()
 // OMP50-TLS: call void [[GS1_CTOR1:@.*]]([[S1]]* [[GS1]], i32 5)
 // OMP50-TLS: call i32 @__cxa_thread_atexit(void (i8*)* bitcast (void ([[S1]]*)* [[GS1_DTOR1:.*]] to void (i8*)*), i8* bitcast ([[S1]]* [[GS1]] to i8*)
@@ -269,11 +269,11 @@ static S1 gs1(5);
 // CHECK:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i8* (i8*)* [[GS1_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[GS1_DTOR]])
 // CHECK-NEXT: ret void
 // CHECK-NEXT: }
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC1]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+
+
+
 // CHECK-DEBUG:      @__kmpc_global_thread_num
-// CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i8* (i8*)* [[GS1_CTOR:@\.__kmpc_global_ctor_\..*]], i8* (i8*, i8*)* null, void (i8*)* [[GS1_DTOR:@\.__kmpc_global_dtor_\..*]])
+// CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[ID1]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i8* (i8*)* [[GS1_CTOR:@\.__kmpc_global_ctor_\..*]], i8* (i8*, i8*)* null, void (i8*)* [[GS1_DTOR:@\.__kmpc_global_dtor_\..*]])
 // CHECK-DEBUG:      define internal {{.*}}i8* [[GS1_CTOR]](i8* %0)
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
 // CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
@@ -342,11 +342,11 @@ S1 arr_x[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
 // CHECK:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i8* (i8*)* [[ARR_X_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[ARR_X_DTOR]])
 // CHECK-NEXT: ret void
 // CHECK-NEXT: }
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC2]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+
+
+
 // CHECK-DEBUG:      @__kmpc_global_thread_num
-// CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i8* (i8*)* [[ARR_X_CTOR:@\.__kmpc_global_ctor_\..*]], i8* (i8*, i8*)* null, void (i8*)* [[ARR_X_DTOR:@\.__kmpc_global_dtor_\..*]])
+// CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[ID2]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i8* (i8*)* [[ARR_X_CTOR:@\.__kmpc_global_ctor_\..*]], i8* (i8*, i8*)* null, void (i8*)* [[ARR_X_DTOR:@\.__kmpc_global_dtor_\..*]])
 // CHECK-DEBUG:      define internal {{.*}}i8* [[ARR_X_CTOR]](i8* %0)
 // CHECK-DEBUG:      }
 // CHECK-DEBUG:      define internal {{.*}}void [[ARR_X_DTOR]](i8* %0)
@@ -364,11 +364,11 @@ struct ST {
 };
 
 
-// OMP50-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// OMP50-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// OMP50-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC20]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+
+
+
 // OMP50-DEBUG:      @__kmpc_global_thread_num
-// OMP50-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..+]])
+// OMP50-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* {{.*}}, i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..+]])
 // OMP50-DEBUG:      define internal {{.*}}i8* [[ST_S4_ST_CTOR]](i8* %0)
 // OMP50-DEBUG:      }
 // OMP50-DEBUG:      define {{.*}} [[S4_CTOR:@.*]]([[S4]]* {{.*}},
@@ -400,7 +400,7 @@ T ST::st(23);
 // CHECK-LABEL:  @main()
 // CHECK-DEBUG-LABEL: @main()
 int main() {
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+
   int Res;
   struct Smain {
     int a;
@@ -424,21 +424,21 @@ int main() {
 // CHECK:      call {{.*}}i{{.*}} @__cxa_guard_acquire
 // CHECK:      call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[DEFAULT_LOC]])
 // CHECK:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR:@\.__kmpc_global_dtor_\..+]])
-// CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
+// CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
 // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
 // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
 // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
 // CHECK-NEXT: invoke {{.*}} [[SMAIN_CTOR:.*]]([[SMAIN]]* [[SM]], [[INT]] {{.*}}[[GS1_A]])
 // CHECK:      call {{.*}}void @__cxa_guard_release
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-// CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
+
+
+
 // CHECK-DEBUG:      call {{.*}}i{{.*}} @__cxa_guard_acquire
-// CHECK-DEBUG:      call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
-// CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR:@\.__kmpc_global_dtor_\..+]])
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-// CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+// CHECK-DEBUG:      call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC:@.+]])
+// CHECK-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR:@\.__kmpc_global_dtor_\..+]])
+// CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
 // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
 // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
 // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
@@ -457,14 +457,14 @@ int main() {
 // CHECK-TLS-NEXT: br label %[[INIT_DONE]]
 // CHECK-TLS:      [[INIT_DONE]]
 #pragma omp threadprivate(sm)
-  // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[STATIC_S]].cache.)
+  // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[STATIC_S]].cache.)
   // CHECK-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
   // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
   // CHECK-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC5]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG:[[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
   // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
@@ -474,16 +474,16 @@ int main() {
   // CHECK-TLS-NEXT: [[STATIC_S_A:%.*]] = load i32, i32* [[STATIC_S_A_ADDR]]
   // CHECK-TLS-NEXT: store i32 [[STATIC_S_A]], i32* [[RES_ADDR:[^,]+]]
   Res = Static::s.a;
-  // CHECK:      [[SM_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[SM]].cache.)
+  // CHECK:      [[SM_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[SM]].cache.)
   // CHECK-NEXT: [[SM_ADDR:%.*]] = bitcast i8* [[SM_TEMP_ADDR]] to [[SMAIN]]*
   // CHECK-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]], [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[SM_A:%.*]] = load [[INT]], [[INT]]* [[SM_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[SM_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC6]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[SM_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[SM_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[SM_ADDR:%.*]] = bitcast i8* [[SM_TEMP_ADDR]] to [[SMAIN]]*
   // CHECK-DEBUG-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]], [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[SM_A:%.*]] = load [[INT]], [[INT]]* [[SM_A_ADDR]]
@@ -496,16 +496,16 @@ int main() {
   // CHECK-TLS-NEXT: [[ADD:%.*]] = add {{.*}} i32 [[RES]], [[SM_A]]
   // CHECK-TLS-NEXT: store i32 [[ADD]], i32* [[RES_ADDR]]
   Res += sm.a;
-  // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
+  // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
   // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
   // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC7]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
   // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
@@ -532,16 +532,16 @@ int main() {
   // CHECK-TLS-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
   // CHECK-TLS-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += gs2.a;
-  // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS3]].cache.)
+  // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS3]].cache.)
   // CHECK-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
   // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC8]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
   // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
@@ -555,7 +555,7 @@ int main() {
   // CHECK-TLS-NEXT: [[ADD:%.*]] = add nsw i32 [[RES]], [[GS3_A]]
   // CHECK-TLS-NEXT: store i32 [[ADD]], i32* [[RES_ADDR]]
   Res += gs3.a;
-  // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ARR_X]].cache.)
+  // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ARR_X]].cache.)
   // CHECK-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
   // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
   // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
@@ -564,9 +564,9 @@ int main() {
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC9]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
   // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
   // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
@@ -584,15 +584,15 @@ int main() {
   // CHECK-TLS-NEXT:  [[ADD:%.*]] = add {{.*}} i32 [[RES]], [[ARR_X_1_1_A]]
   // CHECK-TLS-NEXT:  store i32 [[ADD]], i32* [[RES_ADDR]]
   Res += arr_x[1][1].a;
-  // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_INT_ST]].cache.)
+  // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_INT_ST]].cache.)
   // CHECK-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
   // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC10]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
   // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
@@ -604,16 +604,16 @@ int main() {
   // CHECK-TLS-NEXT:  [[ADD:%.*]] = add {{.*}} i32 [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-TLS-NEXT:  store i32 [[ADD]], i32* [[RES_ADDR]]
   Res += ST::st;
-  // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_FLOAT_ST]].cache.)
+  // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_FLOAT_ST]].cache.)
   // CHECK-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
   // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC11]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-DEBUG-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
@@ -627,16 +627,16 @@ int main() {
   // CHECK-TLS-NEXT: [[ADD:%.*]] = add {{.*}} i32 [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-TLS-NEXT: store i32 [[ADD]], i32* [[RES_ADDR]]
   Res += static_cast(ST::st);
-  // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_S4_ST]].cache.)
+  // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_S4_ST]].cache.)
   // CHECK-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
   // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC12]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
@@ -665,7 +665,7 @@ int main() {
 // CHECK:      store i8* %0, i8** [[ARG_ADDR:%.*]],
 // CHECK:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[SMAIN]]*
-// CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
+// CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
 // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
 // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
 // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
@@ -683,16 +683,16 @@ int main() {
 // CHECK-NEXT: }
 // CHECK:      define {{.*}} [[SMAIN_DTOR]]([[SMAIN]]* {{.*}})
 // CHECK-DEBUG:      define internal {{.*}}i8* [[SM_CTOR]](i8* %0)
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-// CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
+// CHECK-DEBUG:      [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* {{.*}})
+
+
+
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
 // CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[SMAIN]]*
-// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-// CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+// CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
 // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
 // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
 // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
@@ -749,20 +749,20 @@ int main() {
 // CHECK-DEBUG-LABEL: @{{.*}}foobar{{.*}}()
 // CHECK-TLS-LABEL: @{{.*}}foobar{{.*}}()
 int foobar() {
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+
   int Res;
   // CHECK:      [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[DEFAULT_LOC]])
-  // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[STATIC_S]].cache.)
+  // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[STATIC_S]].cache.)
   // CHECK-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
   // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
   // CHECK-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG:      [[THREAD_NUM:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT]]* {{.*}})
+  // CHECK-DEBUG:      [[STATIC_S_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
+
+
   // CHECK-DEBUG-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
   // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]], [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]], [[INT]]* [[STATIC_S_A_ADDR]]
@@ -772,16 +772,16 @@ int foobar() {
   // CHECK-TLS-NEXT: [[STATIC_S_A:%.*]] = load i32, i32* [[STATIC_S_A_ADDR]]
   // CHECK-TLS-NEXT: store i32 [[STATIC_S_A]], i32* [[RES_ADDR:[^,]+]]
   Res = Static::s.a;
-  // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
+  // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS1]].cache.)
   // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
   // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC14]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
   // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]], [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]], [[INT]]* [[GS1_A_ADDR]]
@@ -808,16 +808,16 @@ int foobar() {
   // CHECK-TLS-NEXT: [[ADD:%.*]] = add {{.*}} i32 [[RES]], [[GS2_A]]
   // CHECK-TLS-NEXT: store i32 [[ADD]], i32* [[RES:.+]]
   Res += gs2.a;
-  // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS3]].cache.)
+  // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[GS3]].cache.)
   // CHECK-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
   // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC15]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
   // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]], [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]], [[INT]]* [[GS3_A_ADDR]]
@@ -831,7 +831,7 @@ int foobar() {
   // CHECK-TLS-DEBUG: [[ADD:%.*]]= add nsw i32 [[RES]], [[GS3_A]]
   // CHECK-TLS-DEBUG: store i32 [[ADD]], i32* [[RES_ADDR]]
   Res += gs3.a;
-  // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ARR_X]].cache.)
+  // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ARR_X]].cache.)
   // CHECK-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
   // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
   // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
@@ -840,9 +840,9 @@ int foobar() {
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC16]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
   // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]], [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
   // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]], [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
@@ -860,15 +860,15 @@ int foobar() {
   // CHECK-TLS-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
   // CHECK-TLS-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += arr_x[1][1].a;
-  // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_INT_ST]].cache.)
+  // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_INT_ST]].cache.)
   // CHECK-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
   // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC17]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
   // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]], [[INT]]* [[ST_INT_ST_ADDR]]
   // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
@@ -880,16 +880,16 @@ int foobar() {
   // OMP45-TLS-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
   // OMP45-TLS-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += ST::st;
-  // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_FLOAT_ST]].cache.)
+  // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_FLOAT_ST]].cache.)
   // CHECK-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
   // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC18]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
   // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float, float* [[ST_FLOAT_ST_ADDR]]
   // CHECK-DEBUG-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
@@ -903,16 +903,16 @@ int foobar() {
   // OMP45-TLS-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
   // OMP45-TLS-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
   Res += static_cast(ST::st);
-  // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_S4_ST]].cache.)
+  // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 {{.*}}, i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[ST_S4_ST]].cache.)
   // CHECK-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
   // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
   // CHECK-NEXT: [[RES:%.*]] = load [[INT]], [[INT]]* [[RES_ADDR]]
   // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
   // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
-  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC19]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
-  // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call {{.*}}i8* @__kmpc_threadprivate_cached([[IDENT]]* {{.*}}, i32 {{.*}}, i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+
+
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]], [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
   // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]], [[INT]]* [[ST_S4_ST_A_ADDR]]
@@ -955,11 +955,11 @@ int foobar() {
 // OMP45-NEXT: }
 // OMP45:      define {{.*}} [[S4_DTOR]]([[S4]]* {{.*}})
 
-// OMP45-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
-// OMP45-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]], [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
-// OMP45-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[LOC20]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+
+
+
 // OMP45-DEBUG:      @__kmpc_global_thread_num
-// OMP45-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..+]])
+// OMP45-DEBUG:      call {{.*}}void @__kmpc_threadprivate_register([[IDENT]]* {{.*}}, i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..+]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..+]])
 // OMP45-DEBUG:      define internal {{.*}}i8* [[ST_S4_ST_CTOR]](i8* %0)
 // OMP45-DEBUG:      }
 // OMP45-DEBUG:      define {{.*}} [[S4_CTOR:@.*]]([[S4]]* {{.*}},
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 95eed59f1b3d..a2a440d65fd8 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -210,12 +210,19 @@ class OpenMPIRBuilder {
   /// Return the (LLVM-IR) string describing the default source location.
   Constant *getOrCreateDefaultSrcLocStr();
 
+  /// Return the (LLVM-IR) string describing the source location identified by
+  /// the arguments.
+  Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
+                                 unsigned Line, unsigned Column);
+
   /// Return the (LLVM-IR) string describing the source location \p Loc.
   Constant *getOrCreateSrcLocStr(const LocationDescription &Loc);
 
   /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
+  /// TODO: Create a enum class for the Reserve2Flags
   Value *getOrCreateIdent(Constant *SrcLocStr,
-                          omp::IdentFlag Flags = omp::IdentFlag(0));
+                          omp::IdentFlag Flags = omp::IdentFlag(0),
+                          unsigned Reserve2Flags = 0);
 
   /// Generate control flow and cleanup for cancellation.
   ///
@@ -280,7 +287,7 @@ class OpenMPIRBuilder {
   StringMap SrcLocStrMap;
 
   /// Map to remember existing ident_t*.
-  DenseMap, GlobalVariable *> IdentMap;
+  DenseMap, Value *> IdentMap;
 
   /// Helper that contains information about regions we need to outline
   /// during finalization.
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index ffec4ff64ca6..b90480ebc59e 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -386,8 +386,12 @@ class IRBuilderBase {
   /// filled in with the null terminated string value specified.  The new global
   /// variable will be marked mergable with any others of the same contents.  If
   /// Name is specified, it is the name of the global variable created.
+  ///
+  /// If no module is given via \p M, it is take from the insertion point basic
+  /// block.
   GlobalVariable *CreateGlobalString(StringRef Str, const Twine &Name = "",
-                                     unsigned AddressSpace = 0);
+                                     unsigned AddressSpace = 0,
+                                     Module *M = nullptr);
 
   /// Get a constant value representing either true or false.
   ConstantInt *getInt1(bool V) {
@@ -1934,9 +1938,13 @@ class IRBuilderBase {
 
   /// Same as CreateGlobalString, but return a pointer with "i8*" type
   /// instead of a pointer to array of i8.
+  ///
+  /// If no module is given via \p M, it is take from the insertion point basic
+  /// block.
   Constant *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "",
-                                  unsigned AddressSpace = 0) {
-    GlobalVariable *GV = CreateGlobalString(Str, Name, AddressSpace);
+                                  unsigned AddressSpace = 0,
+                                  Module *M = nullptr) {
+    GlobalVariable *GV = CreateGlobalString(Str, Name, AddressSpace, M);
     Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
     Constant *Indices[] = {Zero, Zero};
     return ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 9468a3aa3c8d..6c72cd01ce6e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -185,16 +185,18 @@ void OpenMPIRBuilder::finalize() {
 }
 
 Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
-                                         IdentFlag LocFlags) {
+                                         IdentFlag LocFlags,
+                                         unsigned Reserve2Flags) {
   // Enable "C-mode".
   LocFlags |= OMP_IDENT_FLAG_KMPC;
 
-  GlobalVariable *&DefaultIdent = IdentMap[{SrcLocStr, uint64_t(LocFlags)}];
-  if (!DefaultIdent) {
+  Value *&Ident =
+      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
+  if (!Ident) {
     Constant *I32Null = ConstantInt::getNullValue(Int32);
-    Constant *IdentData[] = {I32Null,
-                             ConstantInt::get(Int32, uint64_t(LocFlags)),
-                             I32Null, I32Null, SrcLocStr};
+    Constant *IdentData[] = {
+        I32Null, ConstantInt::get(Int32, uint32_t(LocFlags)),
+        ConstantInt::get(Int32, Reserve2Flags), I32Null, SrcLocStr};
     Constant *Initializer = ConstantStruct::get(
         cast(IdentPtr->getPointerElementType()), IdentData);
 
@@ -203,15 +205,16 @@ Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
     for (GlobalVariable &GV : M.getGlobalList())
       if (GV.getType() == IdentPtr && GV.hasInitializer())
         if (GV.getInitializer() == Initializer)
-          return DefaultIdent = &GV;
-
-    DefaultIdent = new GlobalVariable(M, IdentPtr->getPointerElementType(),
-                                      /* isConstant = */ false,
-                                      GlobalValue::PrivateLinkage, Initializer);
-    DefaultIdent->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-    DefaultIdent->setAlignment(Align(8));
+          return Ident = &GV;
+
+    auto *GV = new GlobalVariable(M, IdentPtr->getPointerElementType(),
+                                  /* isConstant = */ true,
+                                  GlobalValue::PrivateLinkage, Initializer);
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+    GV->setAlignment(Align(8));
+    Ident = GV;
   }
-  return DefaultIdent;
+  return Ident;
 }
 
 Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
@@ -227,11 +230,30 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
           GV.getInitializer() == Initializer)
         return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
 
-    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr);
+    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
+                                              /* AddressSpace */ 0, &M);
   }
   return SrcLocStr;
 }
 
+Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
+                                                StringRef FileName,
+                                                unsigned Line,
+                                                unsigned Column) {
+  SmallString<128> Buffer;
+  Buffer.push_back(';');
+  Buffer.append(FileName);
+  Buffer.push_back(';');
+  Buffer.append(FunctionName);
+  Buffer.push_back(';');
+  Buffer.append(std::to_string(Line));
+  Buffer.push_back(';');
+  Buffer.append(std::to_string(Column));
+  Buffer.push_back(';');
+  Buffer.push_back(';');
+  return getOrCreateSrcLocStr(Buffer.str());
+}
+
 Constant *OpenMPIRBuilder::getOrCreateDefaultSrcLocStr() {
   return getOrCreateSrcLocStr(";unknown;unknown;0;0;;");
 }
@@ -241,17 +263,13 @@ OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) {
   DILocation *DIL = Loc.DL.get();
   if (!DIL)
     return getOrCreateDefaultSrcLocStr();
-  StringRef Filename =
+  StringRef FileName =
       !DIL->getFilename().empty() ? DIL->getFilename() : M.getName();
   StringRef Function = DIL->getScope()->getSubprogram()->getName();
   Function =
       !Function.empty() ? Function : Loc.IP.getBlock()->getParent()->getName();
-  std::string LineStr = std::to_string(DIL->getLine());
-  std::string ColumnStr = std::to_string(DIL->getColumn());
-  std::stringstream SrcLocStr;
-  SrcLocStr << ";" << Filename.data() << ";" << Function.data() << ";"
-            << LineStr << ";" << ColumnStr << ";;";
-  return getOrCreateSrcLocStr(SrcLocStr.str());
+  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
+                              DIL->getColumn());
 }
 
 Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 1fffce015f70..a82f15895782 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -42,13 +42,14 @@ using namespace llvm;
 /// created.
 GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str,
                                                   const Twine &Name,
-                                                  unsigned AddressSpace) {
+                                                  unsigned AddressSpace,
+                                                  Module *M) {
   Constant *StrConstant = ConstantDataArray::getString(Context, Str);
-  Module &M = *BB->getParent()->getParent();
-  auto *GV = new GlobalVariable(M, StrConstant->getType(), true,
-                                GlobalValue::PrivateLinkage, StrConstant, Name,
-                                nullptr, GlobalVariable::NotThreadLocal,
-                                AddressSpace);
+  if (!M)
+    M = BB->getParent()->getParent();
+  auto *GV = new GlobalVariable(
+      *M, StrConstant->getType(), true, GlobalValue::PrivateLinkage,
+      StrConstant, Name, nullptr, GlobalVariable::NotThreadLocal, AddressSpace);
   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
   GV->setAlignment(Align(1));
   return GV;
diff --git a/llvm/test/Transforms/OpenMP/deduplication.ll b/llvm/test/Transforms/OpenMP/deduplication.ll
index a25d980b1806..9074b948cc3f 100644
--- a/llvm/test/Transforms/OpenMP/deduplication.ll
+++ b/llvm/test/Transforms/OpenMP/deduplication.ll
@@ -5,21 +5,21 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 
-@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8
-@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str1, i32 0, i32 0) }, align 8
-@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str2, i32 0, i32 0) }, align 8
+@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str1, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str2, i32 0, i32 0) }, align 8
 @.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @.str1 = private unnamed_addr constant [23 x i8] c";file001;loc0001;0;0;;\00", align 1
 @.str2 = private unnamed_addr constant [23 x i8] c";file002;loc0002;0;0;;\00", align 1
 
 ; UTC_ARGS: --disable
-; CHECK-DAG: @0 = private unnamed_addr global %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8
-; CHECK-DAG: @1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str1, i32 0, i32 0) }, align 8
-; CHECK-DAG: @2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str2, i32 0, i32 0) }, align 8
+; CHECK-DAG: @0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 34, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8
+; CHECK-DAG: @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str1, i32 0, i32 0) }, align 8
+; CHECK-DAG: @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str2, i32 0, i32 0) }, align 8
 ; CHECK-DAG: @.str0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 ; CHECK-DAG: @.str1 = private unnamed_addr constant [23 x i8] c";file001;loc0001;0;0;;\00", align 1
 ; CHECK-DAG: @.str2 = private unnamed_addr constant [23 x i8] c";file002;loc0002;0;0;;\00", align 1
-; CHECK-DAG: @3 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8
+; CHECK-DAG: @3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str0, i32 0, i32 0) }, align 8
 ; UTC_ARGS: --enable
 
 

From 1274d83482b950fa31a34a5fdc3a0575c8d1b6a4 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert 
Date: Sat, 15 Aug 2020 17:27:14 -0500
Subject: [PATCH 169/363] Do not use TBAA in type punning reduction GPU code
 PR46156

When we implement OpenMP GPU reductions, we use type punning a lot during
the shuffle and reduce operations. This is not always compatible with the
language rules on aliasing. So far we generated TBAA metadata that later
allowed the optimizer to remove some of the reduce code, because the
accesses and the initialization were "known to not alias". With this patch
we avoid emitting TBAA in this step, hopefully for all accesses that need it.

Verified on the reproducer of PR46156 and QMCPack.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D86037
---
 clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp    | 61 +++++++++++++------
 ...arallel_reduction_codegen_tbaa_PR46146.cpp | 38 ++++++++++++
 2 files changed, 80 insertions(+), 19 deletions(-)
 create mode 100644 clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 1f79b33772f3..de78926755df 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -2857,8 +2857,12 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
   Address CastItem = CGF.CreateMemTemp(CastTy);
   Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
       CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
-  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
-  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
+  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
+                        LValueBaseInfo(AlignmentSource::Type),
+                        TBAAAccessInfo());
+  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
+                              LValueBaseInfo(AlignmentSource::Type),
+                              TBAAAccessInfo());
 }
 
 /// This function creates calls to one of two shuffle functions to copy
@@ -2945,9 +2949,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                        ThenBB, ExitBB);
       CGF.EmitBlock(ThenBB);
       llvm::Value *Res = createRuntimeShuffleFunction(
-          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
+          CGF,
+          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
+                               LValueBaseInfo(AlignmentSource::Type),
+                               TBAAAccessInfo()),
           IntType, Offset, Loc);
-      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
+      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
       Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
       Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
       PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
@@ -2956,9 +2965,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
       CGF.EmitBlock(ExitBB);
     } else {
       llvm::Value *Res = createRuntimeShuffleFunction(
-          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
+          CGF,
+          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
+                               LValueBaseInfo(AlignmentSource::Type),
+                               TBAAAccessInfo()),
           IntType, Offset, Loc);
-      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
+      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
       Ptr = Bld.CreateConstGEP(Ptr, 1);
       ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
     }
@@ -3112,12 +3126,14 @@ static void emitReductionListCopy(
     } else {
       switch (CGF.getEvaluationKind(Private->getType())) {
       case TEK_Scalar: {
-        llvm::Value *Elem =
-            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
-                                 Private->getType(), Private->getExprLoc());
+        llvm::Value *Elem = CGF.EmitLoadOfScalar(
+            SrcElementAddr, /*Volatile=*/false, Private->getType(),
+            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
+            TBAAAccessInfo());
         // Store the source element value to the dest element address.
-        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
-                              Private->getType());
+        CGF.EmitStoreOfScalar(
+            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
+            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
         break;
       }
       case TEK_Complex: {
@@ -3260,8 +3276,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
   Address LocalReduceList(
       Bld.CreatePointerBitCastOrAddrSpaceCast(
-          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, Loc),
+          CGF.EmitLoadOfScalar(
+              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
+              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
       CGF.getPointerAlign());
 
@@ -3339,10 +3356,13 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
 
       // elem = *elemptr
       //*MediumPtr = elem
-      llvm::Value *Elem =
-          CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
+      llvm::Value *Elem = CGF.EmitLoadOfScalar(
+          ElemPtr, /*Volatile=*/false, CType, Loc,
+          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
       // Store the source element value to the dest element address.
-      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
+      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
 
       Bld.CreateBr(MergeBB);
 
@@ -3722,8 +3742,9 @@ static llvm::Value *emitListToGlobalCopyFunction(
     GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
     switch (CGF.getEvaluationKind(Private->getType())) {
     case TEK_Scalar: {
-      llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
-                                            Private->getType(), Loc);
+      llvm::Value *V = CGF.EmitLoadOfScalar(
+          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
+          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
       CGF.EmitStoreOfScalar(V, GlobLVal);
       break;
     }
@@ -3926,7 +3947,9 @@ static llvm::Value *emitGlobalToListCopyFunction(
     switch (CGF.getEvaluationKind(Private->getType())) {
     case TEK_Scalar: {
       llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
-      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
+      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
       break;
     }
     case TEK_Complex: {
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
new file mode 100644
index 000000000000..8f814de05b70
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown  -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#include <complex>
+
+// Verify we do not add tbaa metadata to type punned memory operations:
+
+// CHECK:      call i64 @__kmpc_shuffle_int64(
+// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}}
+
+// CHECK:      call i64 @__kmpc_shuffle_int64(
+// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}}
+
+template <typename T>
+void complex_reduction() {
+#pragma omp target teams distribute
+  for (int ib = 0; ib < 100; ib++) {
+    std::complex<T> partial_sum;
+    const int istart = ib * 4;
+    const int iend = (ib + 1) * 4;
+#pragma omp parallel for reduction(+ \
+                                   : partial_sum)
+    for (int i = istart; i < iend; i++)
+      partial_sum += std::complex<T>(i, i);
+  }
+}
+
+void test() {
+  complex_reduction<float>();
+  complex_reduction<double>();
+}
+#endif

From 45574524c3a15f1e34c7d181e3bc17e9e7d90210 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Tue, 25 Aug 2020 17:41:59 +0200
Subject: [PATCH 170/363] OpenMP: Fix for PR46868: Incorrect target map

https://bugs.llvm.org/attachment.cgi?id=23891 by Alexey Bataev.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 93 ++++++++++++++-----
 .../target_data_use_device_ptr_codegen.cpp    | 14 +--
 clang/test/OpenMP/target_map_codegen.cpp      | 21 ++---
 clang/test/OpenMP/target_update_codegen.cpp   |  3 +-
 openmp/libomptarget/src/omptarget.cpp         |  9 +-
 .../test/env/base_ptr_ref_count.c             | 47 ++++++++++
 6 files changed, 137 insertions(+), 50 deletions(-)
 create mode 100644 openmp/libomptarget/test/env/base_ptr_ref_count.c

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index b221deab0174..14e0cba62b23 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -7265,6 +7265,8 @@ class MappableExprsHandler {
     // &p, &p, sizeof(float*), TARGET_PARAM | TO | FROM
     //
     // map(p[1:24])
+    // &p, &p[1], 24*sizeof(float), TARGET_PARAM | TO | FROM | PTR_AND_OBJ
+    // in unified shared memory mode or for local pointers
     // p, &p[1], 24*sizeof(float), TARGET_PARAM | TO | FROM
     //
     // map(s)
@@ -7400,6 +7402,7 @@ class MappableExprsHandler {
     // Track if the map information being generated is the first for a list of
     // components.
     bool IsExpressionFirstInfo = true;
+    bool FirstPointerInComplexData = false;
     Address BP = Address::invalid();
     const Expr *AssocExpr = I->getAssociatedExpression();
     const auto *AE = dyn_cast<ArraySubscriptExpr>(AssocExpr);
@@ -7442,10 +7445,15 @@ class MappableExprsHandler {
       QualType Ty =
           I->getAssociatedDeclaration()->getType().getNonReferenceType();
       if (Ty->isAnyPointerType() && std::next(I) != CE) {
-        BP = CGF.EmitLoadOfPointer(BP, Ty->castAs<PointerType>());
-
-        // We do not need to generate individual map information for the
-        // pointer, it can be associated with the combined storage.
+        // No need to generate individual map information for the pointer, it
+        // can be associated with the combined storage if shared memory mode is
+        // active or the base declaration is not global variable.
+        const auto *VD = dyn_cast<VarDecl>(I->getAssociatedDeclaration());
+         if (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() ||
+            !VD || VD->hasLocalStorage())
+          BP = CGF.EmitLoadOfPointer(BP, Ty->castAs<PointerType>());
+        else
+          FirstPointerInComplexData = IsCaptureFirstInfo;
         ++I;
       }
     }
@@ -7481,8 +7489,19 @@ class MappableExprsHandler {
         EncounteredME = dyn_cast<MemberExpr>(I->getAssociatedExpression());
         // If we encounter a PTR_AND_OBJ entry from now on it should be marked
         // as MEMBER_OF the parent struct.
-        if (EncounteredME)
+        if (EncounteredME) {
           ShouldBeMemberOf = true;
+          // Do not emit as complex pointer if this is actually not array-like
+          // expression.
+          if (FirstPointerInComplexData) {
+            QualType Ty = std::prev(I)
+                              ->getAssociatedDeclaration()
+                              ->getType()
+                              .getNonReferenceType();
+            BP = CGF.EmitLoadOfPointer(BP, Ty->castAs<PointerType>());
+            FirstPointerInComplexData = false;
+          }
+        }
       }
 
       auto Next = std::next(I);
@@ -7615,10 +7634,11 @@ class MappableExprsHandler {
           // same expression except for the first one. We also need to signal
           // this map is the first one that relates with the current capture
           // (there is a set of entries for each capture).
-          OpenMPOffloadMappingFlags Flags = getMapTypeBits(
-              MapType, MapModifiers, IsImplicit,
-              !IsExpressionFirstInfo || RequiresReference,
-              IsCaptureFirstInfo && !RequiresReference);
+          OpenMPOffloadMappingFlags Flags =
+              getMapTypeBits(MapType, MapModifiers, IsImplicit,
+                             !IsExpressionFirstInfo || RequiresReference ||
+                                 FirstPointerInComplexData,
+                             IsCaptureFirstInfo && !RequiresReference);
 
           if (!IsExpressionFirstInfo) {
             // If we have a PTR_AND_OBJ pair where the OBJ is a pointer as well,
@@ -7676,6 +7696,7 @@ class MappableExprsHandler {
 
         IsExpressionFirstInfo = false;
         IsCaptureFirstInfo = false;
+        FirstPointerInComplexData = false;
       }
     }
   }
@@ -7906,6 +7927,10 @@ class MappableExprsHandler {
     // emission of that entry until the whole struct has been processed.
     llvm::MapVector<const ValueDecl *, SmallVector<DeferredDevicePtrEntryTy, 4>>
         DeferredInfo;
+    MapBaseValuesArrayTy UseDevicePtrBasePointers;
+    MapValuesArrayTy UseDevicePtrPointers;
+    MapValuesArrayTy UseDevicePtrSizes;
+    MapFlagsArrayTy UseDevicePtrTypes;
 
     for (const auto *C :
          CurExecDir->getClausesOfKind<OMPUseDevicePtrClause>()) {
@@ -7922,15 +7947,27 @@ class MappableExprsHandler {
         // We potentially have map information for this declaration already.
         // Look for the first set of components that refer to it.
         if (It != Info.end()) {
-          auto CI = std::find_if(
-              It->second.begin(), It->second.end(), [VD](const MapInfo &MI) {
-                return MI.Components.back().getAssociatedDeclaration() == VD;
-              });
+          auto *CI = llvm::find_if(It->second, [VD](const MapInfo &MI) {
+            return MI.Components.back().getAssociatedDeclaration() == VD;
+          });
           // If we found a map entry, signal that the pointer has to be returned
           // and move on to the next declaration.
+          // Exclude cases where the base pointer is mapped as array subscript,
+          // array section or array shaping. The base address is passed as a
+          // pointer to base in this case and cannot be used as a base for
+          // use_device_ptr list item.
           if (CI != It->second.end()) {
-            CI->ReturnDevicePointer = true;
-            continue;
+            auto PrevCI = std::next(CI->Components.rbegin());
+            const auto *VarD = dyn_cast<VarDecl>(VD);
+            if (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() ||
+                isa<MemberExpr>(IE) ||
+                !VD->getType().getNonReferenceType()->isPointerType() ||
+                PrevCI == CI->Components.rend() ||
+                isa<MemberExpr>(PrevCI->getAssociatedExpression()) || !VarD ||
+                VarD->hasLocalStorage()) {
+              CI->ReturnDevicePointer = true;
+              continue;
+            }
           }
         }
 
@@ -7951,10 +7988,12 @@ class MappableExprsHandler {
         } else {
           llvm::Value *Ptr =
               CGF.EmitLoadOfScalar(CGF.EmitLValue(IE), IE->getExprLoc());
-          BasePointers.emplace_back(Ptr, VD);
-          Pointers.push_back(Ptr);
-          Sizes.push_back(llvm::Constant::getNullValue(CGF.Int64Ty));
-          Types.push_back(OMP_MAP_RETURN_PARAM | OMP_MAP_TARGET_PARAM);
+          UseDevicePtrBasePointers.emplace_back(Ptr, VD);
+          UseDevicePtrPointers.push_back(Ptr);
+          UseDevicePtrSizes.push_back(
+              llvm::Constant::getNullValue(CGF.Int64Ty));
+          UseDevicePtrTypes.push_back(OMP_MAP_RETURN_PARAM |
+                                      OMP_MAP_TARGET_PARAM);
         }
       }
     }
@@ -8015,10 +8054,12 @@ class MappableExprsHandler {
             Ptr = CGF.EmitLValue(IE).getPointer(CGF);
           else
             Ptr = CGF.EmitScalarExpr(IE);
-          BasePointers.emplace_back(Ptr, VD);
-          Pointers.push_back(Ptr);
-          Sizes.push_back(llvm::Constant::getNullValue(CGF.Int64Ty));
-          Types.push_back(OMP_MAP_RETURN_PARAM | OMP_MAP_TARGET_PARAM);
+          UseDevicePtrBasePointers.emplace_back(Ptr, VD);
+          UseDevicePtrPointers.push_back(Ptr);
+          UseDevicePtrSizes.push_back(
+              llvm::Constant::getNullValue(CGF.Int64Ty));
+          UseDevicePtrTypes.push_back(OMP_MAP_RETURN_PARAM |
+                                      OMP_MAP_TARGET_PARAM);
         }
       }
     }
@@ -8108,6 +8149,12 @@ class MappableExprsHandler {
       Sizes.append(CurSizes.begin(), CurSizes.end());
       Types.append(CurTypes.begin(), CurTypes.end());
     }
+    // Append data for use_device_ptr clauses.
+    BasePointers.append(UseDevicePtrBasePointers.begin(),
+                        UseDevicePtrBasePointers.end());
+    Pointers.append(UseDevicePtrPointers.begin(), UseDevicePtrPointers.end());
+    Sizes.append(UseDevicePtrSizes.begin(), UseDevicePtrSizes.end());
+    Types.append(UseDevicePtrTypes.begin(), UseDevicePtrTypes.end());
   }
 
   /// Generate all the base pointers, section pointers, sizes and map types for
diff --git a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp
index a3d8043b6b4e..fa53cc4aa8f7 100644
--- a/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp
+++ b/clang/test/OpenMP/target_data_use_device_ptr_codegen.cpp
@@ -22,18 +22,18 @@
 double *g;
 
 // CK1: @g = global double*
-// CK1: [[MTYPE00:@.+]] = {{.*}}constant [1 x i64] [i64 99]
+// CK1: [[MTYPE00:@.+]] = {{.*}}constant [2 x i64] [i64 51, i64 96]
 // CK1: [[MTYPE01:@.+]] = {{.*}}constant [1 x i64] [i64 99]
 // CK1: [[MTYPE03:@.+]] = {{.*}}constant [1 x i64] [i64 99]
 // CK1: [[MTYPE04:@.+]] = {{.*}}constant [1 x i64] [i64 99]
 // CK1: [[MTYPE05:@.+]] = {{.*}}constant [1 x i64] [i64 99]
 // CK1: [[MTYPE06:@.+]] = {{.*}}constant [1 x i64] [i64 99]
 // CK1: [[MTYPE07:@.+]] = {{.*}}constant [1 x i64] [i64 99]
-// CK1: [[MTYPE08:@.+]] = {{.*}}constant [2 x i64] [{{i64 35, i64 99|i64 99, i64 35}}]
+// CK1: [[MTYPE08:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 35]
 // CK1: [[MTYPE09:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 99]
 // CK1: [[MTYPE10:@.+]] = {{.*}}constant [2 x i64] [i64 99, i64 99]
-// CK1: [[MTYPE11:@.+]] = {{.*}}constant [2 x i64] [i64 96, i64 35]
-// CK1: [[MTYPE12:@.+]] = {{.*}}constant [2 x i64] [i64 96, i64 35]
+// CK1: [[MTYPE11:@.+]] = {{.*}}constant [2 x i64] [i64 35, i64 96]
+// CK1: [[MTYPE12:@.+]] = {{.*}}constant [2 x i64] [i64 35, i64 96]
 
 // CK1-LABEL: @_Z3foo
 template
@@ -42,7 +42,7 @@ void foo(float *&lr, T *&tr) {
   T *t;
 
   // CK1:     [[T:%.+]] = load double*, double** [[DECL:@g]],
-  // CK1:     [[BP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1
   // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to double**
   // CK1:     store double* [[T]], double** [[CBP]],
   // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE00]]
@@ -280,7 +280,7 @@ void foo(float *&lr, T *&tr) {
   ++l; ++t;
 
   // CK1:     [[T1:%.+]] = load i32*, i32** [[DECL:%.+]],
-  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1
   // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to i32**
   // CK1:     store i32* [[T1]], i32** [[CBP]],
   // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE11]]
@@ -300,7 +300,7 @@ void foo(float *&lr, T *&tr) {
 
   // CK1:     [[T2:%.+]] = load i32**, i32*** [[DECL:%.+]],
   // CK1:     [[T1:%.+]] = load i32*, i32** [[T2]],
-  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 0
+  // CK1:     [[BP:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* %{{.+}}, i32 0, i32 1
   // CK1:     [[CBP:%.+]] = bitcast i8** [[BP]] to i32**
   // CK1:     store i32* [[T1]], i32** [[CBP]],
   // CK1:     call void @__tgt_target_data_begin{{.+}}[[MTYPE12]]
diff --git a/clang/test/OpenMP/target_map_codegen.cpp b/clang/test/OpenMP/target_map_codegen.cpp
index 92e0224a2de3..ad54b560889b 100644
--- a/clang/test/OpenMP/target_map_codegen.cpp
+++ b/clang/test/OpenMP/target_map_codegen.cpp
@@ -3195,7 +3195,7 @@ int explicit_maps_template_args_and_members(int a){
 
 // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0
 // CK22: [[SIZE04:@.+]] = private {{.*}}constant [1 x i64] [i64 20]
-// CK22: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i64] [i64 35]
+// CK22: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i64] [i64 51]
 
 // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0
 // CK22: [[SIZE05:@.+]] = private {{.*}}constant [1 x i64] [i64 4]
@@ -3215,7 +3215,7 @@ int explicit_maps_template_args_and_members(int a){
 
 // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0
 // CK22: [[SIZE09:@.+]] = private {{.*}}constant [1 x i64] [i64 20]
-// CK22: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i64] [i64 35]
+// CK22: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i64] [i64 51]
 
 // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0
 // CK22: [[SIZE10:@.+]] = private {{.*}}constant [1 x i64] [i64 4]
@@ -3235,7 +3235,7 @@ int explicit_maps_template_args_and_members(int a){
 
 // CK22-LABEL: @.__omp_offloading_{{.*}}explicit_maps_globals{{.*}}_l{{[0-9]+}}.region_id = weak constant i8 0
 // CK22: [[SIZE14:@.+]] = private {{.*}}constant [1 x i64] [i64 20]
-// CK22: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i64] [i64 35]
+// CK22: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i64] [i64 51]
 
 int a;
 int c[100];
@@ -3331,11 +3331,10 @@ int explicit_maps_globals(void){
 
   // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
   // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
-  // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to i32**
+  // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to i32***
   // CK22-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to i32**
-  // CK22-DAG: store i32* [[RVAR0:%.+]], i32** [[CBP0]]
+  // CK22-DAG: store i32** @d, i32*** [[CBP0]]
   // CK22-DAG: store i32* [[SEC0:%.+]], i32** [[CP0]]
-  // CK22-DAG: [[RVAR0]] = load i32*, i32** @d
   // CK22-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 2
   // CK22-DAG: [[RVAR00]] = load i32*, i32** @d
 
@@ -3414,11 +3413,10 @@ int explicit_maps_globals(void){
 
   // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
   // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
-  // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[ST]]**
+  // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[ST]]***
   // CK22-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to [[ST]]**
-  // CK22-DAG: store [[ST]]* [[RVAR0:%.+]], [[ST]]** [[CBP0]]
+  // CK22-DAG: store [[ST]]** @sd, [[ST]]*** [[CBP0]]
   // CK22-DAG: store [[ST]]* [[SEC0:%.+]], [[ST]]** [[CP0]]
-  // CK22-DAG: [[RVAR0]] = load [[ST]]*, [[ST]]** @sd
   // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[RVAR00:%.+]], i{{.+}} 2
   // CK22-DAG: [[RVAR00]] = load [[ST]]*, [[ST]]** @sd
 
@@ -3497,11 +3495,10 @@ int explicit_maps_globals(void){
 
   // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
   // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
-  // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[STT]]**
+  // CK22-DAG: [[CBP0:%.+]] = bitcast i8** [[BP0]] to [[STT]]***
   // CK22-DAG: [[CP0:%.+]] = bitcast i8** [[P0]] to [[STT]]**
-  // CK22-DAG: store [[STT]]* [[RVAR0:%.+]], [[STT]]** [[CBP0]]
+  // CK22-DAG: store [[STT]]** @std, [[STT]]*** [[CBP0]]
   // CK22-DAG: store [[STT]]* [[SEC0:%.+]], [[STT]]** [[CP0]]
-  // CK22-DAG: [[RVAR0]] = load [[STT]]*, [[STT]]** @std
   // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[STT]]* [[RVAR00:%.+]], i{{.+}} 2
   // CK22-DAG: [[RVAR00]] = load [[STT]]*, [[STT]]** @std
 
diff --git a/clang/test/OpenMP/target_update_codegen.cpp b/clang/test/OpenMP/target_update_codegen.cpp
index fd5a62a8067c..a308b9ed6deb 100644
--- a/clang/test/OpenMP/target_update_codegen.cpp
+++ b/clang/test/OpenMP/target_update_codegen.cpp
@@ -737,7 +737,7 @@ void lvalue(int **BB, int a, int b) {
   // CK13-64-DAG: [[ADD_PTR]] = getelementptr inbounds i32*, i32** [[B_VAL:%.+]], i64 [[IDX_EXT:%.+]]
   // CK13-32-DAG: [[ADD_PTR]] = getelementptr inbounds i32*, i32** [[B_VAL:%.+]], i32 [[A_ADDR:%.+]]
   // CK13-64-DAG: [[IDX_EXT]] = sext i32 [[TWO:%.+]] to i64
-  // CK13-DAG: [[B_VAL]] = load i32**, i32*** [[BB_ADDR]]
+  // CK13-DAG: [[B_VAL]] = load i32**, i32*** [[BB_ADDR:%.+]]
   #pragma omp target update to(*(*(BB+a)+b))
   *(*(BB+a)+b) = 1;
   #pragma omp target update from(*(*(BB+a)+b))
@@ -978,6 +978,7 @@ void lvalue_find_base(float **f, SSA *sa) {
   // CK17-DAG: [[FIVE]] = load i32, i32* [[I_2:%.+]],
   // CK17-DAG: [[I_2]] = getelementptr inbounds [[SSA:%.+]], [[SSA]]* [[FOUR:%.+]], i32 0, i32 0
   // CK17-DAG: [[FOUR]] = load [[SSA]]*, [[SSA]]** [[SSA_ADDR:%.+]],
+  // CK17-DAG: [[F]] = load float**, float*** [[F_ADDR:%.+]],
 
   #pragma omp target update to(*(sa->sa->i+*(1+sa->i+f)))
   *(sa->sa->i+*(1+sa->i+f)) = 1;
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index cce9dbd2fe15..15712323d43e 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -746,14 +746,9 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
           return OFFLOAD_FAIL;
         }
       }
-    } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
-      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
-          false, IsHostPtr);
-      TgtBaseOffset = 0; // no offset for ptrs.
-      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
-         "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
-         DPxPTR(HstPtrBase));
     } else {
+      if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
+        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
           false, IsHostPtr);
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
diff --git a/openmp/libomptarget/test/env/base_ptr_ref_count.c b/openmp/libomptarget/test/env/base_ptr_ref_count.c
new file mode 100644
index 000000000000..5b62f5eb8ac3
--- /dev/null
+++ b/openmp/libomptarget/test/env/base_ptr_ref_count.c
@@ -0,0 +1,47 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: libomptarget-debug
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int *allocate(size_t n) {
+  int *ptr = malloc(sizeof(int) * n);
+#pragma omp target enter data map(to : ptr[:n])
+  return ptr;
+}
+
+void deallocate(int *ptr, size_t n) {
+#pragma omp target exit data map(delete : ptr[:n])
+  free(ptr);
+}
+
+#pragma omp declare target
+int *cnt;
+void foo() {
+  ++(*cnt);
+}
+#pragma omp end declare target
+
+int main(void) {
+  int *A = allocate(10);
+  int *V = allocate(10);
+  deallocate(A, 10);
+  deallocate(V, 10);
+// CHECK-NOT: RefCount=2
+  cnt = malloc(sizeof(int));
+  *cnt = 0;
+#pragma omp target data map(cnt[:1])
+#pragma omp target
+  foo();
+  printf("Cnt = %d.\n", *cnt);
+// CHECK: Cnt = 1.
+  free(cnt);
+
+  return 0;
+}
+
+

From 4d16d8dfe50eb45545e844c3c9acafd363637dad Mon Sep 17 00:00:00 2001
From: QingShan Zhang <qshanz@cn.ibm.com>
Date: Mon, 24 Aug 2020 02:50:58 +0000
Subject: [PATCH 171/363] [DAGCombine] Remove dead node when it is created by
 getNegatedExpression

We hit the compiling time reported by https://bugs.llvm.org/show_bug.cgi?id=46877
and the reason is the same as D77319. So we need to remove the dead node we created
to avoid increasing the problem size of DAGCombiner.

Reviewed By: Spatel

Differential Revision: https://reviews.llvm.org/D86183

(cherry picked from commit 960cbc53ca170c8c605bf83fa63b49ab27a56f65)
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  11 +
 llvm/test/CodeGen/X86/pr46877.ll              | 416 ++++++++++++++++++
 2 files changed, 427 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/pr46877.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 96df20039b15..94cb6da3d69e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5726,6 +5726,11 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
       return SDValue();
   }
 
+  auto RemoveDeadNode = [&](SDValue N) {
+    if (N && N.getNode()->use_empty())
+      DAG.RemoveDeadNode(N.getNode());
+  };
+
   SDLoc DL(Op);
 
   switch (Opcode) {
@@ -5804,12 +5809,14 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     // Negate the X if its cost is less or equal than Y.
     if (NegX && (CostX <= CostY)) {
       Cost = CostX;
+      RemoveDeadNode(NegY);
       return DAG.getNode(ISD::FSUB, DL, VT, NegX, Y, Flags);
     }
 
     // Negate the Y if it is not expensive.
     if (NegY) {
       Cost = CostY;
+      RemoveDeadNode(NegX);
       return DAG.getNode(ISD::FSUB, DL, VT, NegY, X, Flags);
     }
     break;
@@ -5847,6 +5854,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     // Negate the X if its cost is less or equal than Y.
     if (NegX && (CostX <= CostY)) {
       Cost = CostX;
+      RemoveDeadNode(NegY);
       return DAG.getNode(Opcode, DL, VT, NegX, Y, Flags);
     }
 
@@ -5858,6 +5866,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     // Negate the Y if it is not expensive.
     if (NegY) {
       Cost = CostY;
+      RemoveDeadNode(NegX);
       return DAG.getNode(Opcode, DL, VT, X, NegY, Flags);
     }
     break;
@@ -5887,12 +5896,14 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     // Negate the X if its cost is less or equal than Y.
     if (NegX && (CostX <= CostY)) {
       Cost = std::min(CostX, CostZ);
+      RemoveDeadNode(NegY);
       return DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags);
     }
 
     // Negate the Y if it is not expensive.
     if (NegY) {
       Cost = std::min(CostY, CostZ);
+      RemoveDeadNode(NegX);
       return DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags);
     }
     break;
diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll
new file mode 100644
index 000000000000..581b2d586fa0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr46877.ll
@@ -0,0 +1,416 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 < %s -mcpu=haswell -mtriple=x86_64 | FileCheck %s
+
+; Verify that we are not exponentially increasing compilation time.
+define void @tester(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16, float %17, float %18, float %19, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, float %28, float %29, float %30, float %31, float %32, float %33, float %34, float %35, float %36, float %37, float %38, float %39, float %40, float %41, float %42, float %43, float %44, float %45, float %46, float %47, float %48, float %49, float %50, float %51, float %52, float %53, float %54, float %55, float %56, float %57, float %58, float %59, float %60, float %61, float %62, float %63, float %64, float %65, float %66, float %67, float %68, float %69, float %70, float %71, float %72, float %73, float %74, float %75, float %76, float %77, float %78, float %79, float* %80) {
+; CHECK-LABEL: tester:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovaps %xmm3, %xmm15
+; CHECK-NEXT:    vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsubss %xmm1, %xmm0, %xmm12
+; CHECK-NEXT:    vmulss %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    vfmsub213ss {{.*#+}} xmm3 = (xmm15 * xmm3) - xmm0
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm4, %xmm2
+; CHECK-NEXT:    vmulss %xmm2, %xmm3, %xmm3
+; CHECK-NEXT:    vmulss %xmm6, %xmm12, %xmm2
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm7 * xmm2) + xmm0
+; CHECK-NEXT:    vmulss %xmm3, %xmm2, %xmm5
+; CHECK-NEXT:    vmulss %xmm0, %xmm13, %xmm2
+; CHECK-NEXT:    vmovss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmulss %xmm2, %xmm10, %xmm2
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm3
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm4
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm4 = -(xmm14 * xmm4) + xmm0
+; CHECK-NEXT:    vmulss %xmm4, %xmm5, %xmm4
+; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm7, %xmm5
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm10 * xmm5) + xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm4, %xmm4
+; CHECK-NEXT:    vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss %xmm0, %xmm9, %xmm6
+; CHECK-NEXT:    vmovss %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmulss %xmm6, %xmm14, %xmm5
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm13 * xmm5) + xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm4, %xmm4
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm11
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm11 * xmm3) + xmm0
+; CHECK-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vmulss %xmm2, %xmm4, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm15 * xmm3) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm4
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm8, %xmm6
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm6, %xmm4, %xmm4
+; CHECK-NEXT:    vmulss %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm4
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm4, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm6 = -(xmm6 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm6, %xmm4, %xmm4
+; CHECK-NEXT:    vmulss %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1
+; CHECK-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm4 = -(xmm1 * xmm4) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm4, %xmm10
+; CHECK-NEXT:    vmulss %xmm0, %xmm12, %xmm6
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm4
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm13, %xmm5
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm7 * xmm5) + xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm4, %xmm4
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm5
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm5
+; CHECK-NEXT:    vmulss %xmm4, %xmm5, %xmm12
+; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm7 * xmm5) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss %xmm6, %xmm3, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm10 * xmm2) + xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm9
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm1
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm5, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm5
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm5 = -(xmm5 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vmulss %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm2
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmulss %xmm1, %xmm2, %xmm4
+; CHECK-NEXT:    vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm3
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm13 * xmm3) + xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * mem) + xmm0
+; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT:    vmulss %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0
+; CHECK-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm4, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; CHECK-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 4-byte Reload
+; CHECK-NEXT:    # xmm12 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm7 = -(xmm7 * mem) + xmm0
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm13 * xmm2) + xmm0
+; CHECK-NEXT:    vmulss %xmm7, %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm8, %xmm2
+; CHECK-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0
+; CHECK-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vmulss %xmm0, %xmm5, %xmm2
+; CHECK-NEXT:    vmulss %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm10 * xmm2) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm5 * xmm3) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm8
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm9, %xmm4
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; CHECK-NEXT:    vmulss %xmm2, %xmm1, %xmm10
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm11 = -(xmm5 * xmm11) + xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm2
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm15 * xmm2) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm1, %xmm4
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm4 = -(xmm4 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm11, %xmm2
+; CHECK-NEXT:    vmulss %xmm4, %xmm2, %xmm2
+; CHECK-NEXT:    vfnmadd132ss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 4-byte Folded Reload
+; CHECK-NEXT:    # xmm14 = -(xmm14 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm2, %xmm14, %xmm9
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm11
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm11 = -(xmm11 * mem) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm5, %xmm7
+; CHECK-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 4-byte Folded Reload
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm6, %xmm1
+; CHECK-NEXT:    vmulss %xmm6, %xmm15, %xmm6
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm6 = -(xmm3 * xmm6) + xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm2, %xmm4
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm4 = -(xmm3 * xmm4) + xmm0
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm7 = -(xmm3 * xmm7) + xmm0
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm5 = -(xmm3 * xmm5) + xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm12, %xmm2
+; CHECK-NEXT:    vmulss %xmm0, %xmm13, %xmm3
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm3, %xmm3
+; CHECK-NEXT:    vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm12 * xmm3) + xmm0
+; CHECK-NEXT:    vfnmadd213ss {{.*#+}} xmm2 = -(xmm12 * xmm2) + xmm0
+; CHECK-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm15 * xmm1) - xmm0
+; CHECK-NEXT:    vfnmadd132ss {{.*#+}} xmm8 = -(xmm8 * mem) + xmm0
+; CHECK-NEXT:    vmulss %xmm8, %xmm9, %xmm0
+; CHECK-NEXT:    vmulss %xmm6, %xmm0, %xmm0
+; CHECK-NEXT:    vmulss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT:    vmulss %xmm7, %xmm0, %xmm0
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm10, %xmm4
+; CHECK-NEXT:    vmulss %xmm0, %xmm4, %xmm0
+; CHECK-NEXT:    vmulss %xmm5, %xmm11, %xmm4
+; CHECK-NEXT:    vmulss %xmm3, %xmm4, %xmm3
+; CHECK-NEXT:    vmulss %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vmulss {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; CHECK-NEXT:    vmulss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovss %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+  %81 = fsub reassoc nsz contract float %0, %1
+  %82 = fmul reassoc nsz contract float %1, %2
+  %83 = fmul reassoc nsz contract float %3, %82
+  %84 = fsub reassoc nsz contract float %0, %83
+  %85 = fmul reassoc nsz contract float %84, %4
+  %86 = fmul reassoc nsz contract float %81, %5
+  %87 = fsub reassoc nsz contract float %0, %86
+  %88 = fmul reassoc nsz contract float %87, %85
+  %89 = fmul reassoc nsz contract float %81, %6
+  %90 = fmul reassoc nsz contract float %89, %7
+  %91 = fsub reassoc nsz contract float %0, %90
+  %92 = fmul reassoc nsz contract float %91, %88
+  %93 = fmul reassoc nsz contract float %8, %0
+  %94 = fmul reassoc nsz contract float %93, %9
+  %95 = fmul reassoc nsz contract float %94, %10
+  %96 = fsub reassoc nsz contract float %0, %95
+  %97 = fmul reassoc nsz contract float %96, %92
+  %98 = fmul reassoc nsz contract float %11, %7
+  %99 = fmul reassoc nsz contract float %98, %12
+  %100 = fsub reassoc nsz contract float %0, %99
+  %101 = fmul reassoc nsz contract float %100, %97
+  %102 = fmul reassoc nsz contract float %13, %0
+  %103 = fmul reassoc nsz contract float %102, %14
+  %104 = fmul reassoc nsz contract float %103, %15
+  %105 = fsub reassoc nsz contract float %0, %104
+  %106 = fmul reassoc nsz contract float %105, %101
+  %107 = fmul reassoc nsz contract float %16, %17
+  %108 = fsub reassoc nsz contract float %0, %107
+  %109 = fmul reassoc nsz contract float %108, %106
+  %110 = fmul reassoc nsz contract float %18, %19
+  %111 = fmul reassoc nsz contract float %110, %9
+  %112 = fsub reassoc nsz contract float %0, %111
+  %113 = fmul reassoc nsz contract float %112, %109
+  %114 = fmul reassoc nsz contract float %20, %0
+  %115 = fmul reassoc nsz contract float %114, %15
+  %116 = fmul reassoc nsz contract float %81, %115
+  %117 = fsub reassoc nsz contract float %0, %116
+  %118 = fmul reassoc nsz contract float %117, %113
+  %119 = fmul reassoc nsz contract float %8, %21
+  %120 = fsub reassoc nsz contract float %0, %119
+  %121 = fmul reassoc nsz contract float %120, %118
+  %122 = fmul reassoc nsz contract float %102, %22
+  %123 = fmul reassoc nsz contract float %122, %23
+  %124 = fsub reassoc nsz contract float %0, %123
+  %125 = fmul reassoc nsz contract float %124, %121
+  %126 = fmul reassoc nsz contract float %125, %24
+  %127 = fmul reassoc nsz contract float %3, %25
+  %128 = fsub reassoc nsz contract float %0, %127
+  %129 = fmul reassoc nsz contract float %128, %126
+  %130 = fmul reassoc nsz contract float %129, %26
+  %131 = fmul reassoc nsz contract float %27, %1
+  %132 = fmul reassoc nsz contract float %131, %28
+  %133 = fsub reassoc nsz contract float %0, %132
+  %134 = fmul reassoc nsz contract float %133, %130
+  %135 = fmul reassoc nsz contract float %29, %30
+  %136 = fmul reassoc nsz contract float %135, %31
+  %137 = fsub reassoc nsz contract float %0, %136
+  %138 = fmul reassoc nsz contract float %137, %134
+  %139 = fmul reassoc nsz contract float %138, %32
+  %140 = fmul reassoc nsz contract float %139, %33
+  %141 = fmul reassoc nsz contract float %140, %34
+  %142 = fmul reassoc nsz contract float %35, %9
+  %143 = fmul reassoc nsz contract float %142, %36
+  %144 = fsub reassoc nsz contract float %0, %143
+  %145 = fmul reassoc nsz contract float %144, %141
+  %146 = fmul reassoc nsz contract float %145, %37
+  %147 = fmul reassoc nsz contract float %1, %38
+  %148 = fsub reassoc nsz contract float %0, %147
+  %149 = fmul reassoc nsz contract float %148, %146
+  %150 = fmul reassoc nsz contract float %39, %40
+  %151 = fsub reassoc nsz contract float %0, %150
+  %152 = fmul reassoc nsz contract float %151, %149
+  %153 = fmul reassoc nsz contract float %152, %41
+  %154 = fmul reassoc nsz contract float %20, %42
+  %155 = fmul reassoc nsz contract float %154, %43
+  %156 = fsub reassoc nsz contract float %0, %155
+  %157 = fmul reassoc nsz contract float %156, %153
+  %158 = fmul reassoc nsz contract float %157, %44
+  %159 = fmul reassoc nsz contract float %158, %45
+  %160 = fmul reassoc nsz contract float %81, %0
+  %161 = fmul reassoc nsz contract float %160, %46
+  %162 = fmul reassoc nsz contract float %161, %14
+  %163 = fsub reassoc nsz contract float %0, %162
+  %164 = fmul reassoc nsz contract float %163, %159
+  %165 = fmul reassoc nsz contract float %8, %47
+  %166 = fmul reassoc nsz contract float %18, %165
+  %167 = fsub reassoc nsz contract float %0, %166
+  %168 = fmul reassoc nsz contract float %167, %164
+  %169 = fmul reassoc nsz contract float %168, %48
+  %170 = fmul reassoc nsz contract float %169, %49
+  %171 = fmul reassoc nsz contract float %18, %50
+  %172 = fsub reassoc nsz contract float %0, %171
+  %173 = fmul reassoc nsz contract float %172, %170
+  %174 = fmul reassoc nsz contract float %16, %160
+  %175 = fmul reassoc nsz contract float %174, %12
+  %176 = fsub reassoc nsz contract float %0, %175
+  %177 = fmul reassoc nsz contract float %176, %173
+  %178 = fmul reassoc nsz contract float %51, %0
+  %179 = fmul reassoc nsz contract float %178, %22
+  %180 = fmul reassoc nsz contract float %179, %52
+  %181 = fsub reassoc nsz contract float %0, %180
+  %182 = fmul reassoc nsz contract float %181, %177
+  %183 = fmul reassoc nsz contract float %27, %16
+  %184 = fmul reassoc nsz contract float %183, %53
+  %185 = fsub reassoc nsz contract float %0, %184
+  %186 = fmul reassoc nsz contract float %185, %182
+  %187 = fmul reassoc nsz contract float %16, %54
+  %188 = fmul reassoc nsz contract float %8, %187
+  %189 = fsub reassoc nsz contract float %0, %188
+  %190 = fmul reassoc nsz contract float %189, %186
+  %191 = fmul reassoc nsz contract float %190, %55
+  %192 = fmul reassoc nsz contract float %191, %56
+  %193 = fmul reassoc nsz contract float %57, %58
+  %194 = fmul reassoc nsz contract float %193, %59
+  %195 = fsub reassoc nsz contract float %0, %194
+  %196 = fmul reassoc nsz contract float %195, %192
+  %197 = fmul reassoc nsz contract float %13, %160
+  %198 = fmul reassoc nsz contract float %197, %36
+  %199 = fsub reassoc nsz contract float %0, %198
+  %200 = fmul reassoc nsz contract float %199, %196
+  %201 = fmul reassoc nsz contract float %93, %60
+  %202 = fmul reassoc nsz contract float %201, %61
+  %203 = fsub reassoc nsz contract float %0, %202
+  %204 = fmul reassoc nsz contract float %203, %200
+  %205 = fmul reassoc nsz contract float %204, %62
+  %206 = fmul reassoc nsz contract float %205, %63
+  %207 = fmul reassoc nsz contract float %114, %9
+  %208 = fmul reassoc nsz contract float %207, %59
+  %209 = fsub reassoc nsz contract float %0, %208
+  %210 = fmul reassoc nsz contract float %209, %206
+  %211 = fmul reassoc nsz contract float %18, %64
+  %212 = fsub reassoc nsz contract float %0, %211
+  %213 = fmul reassoc nsz contract float %212, %210
+  %214 = fmul reassoc nsz contract float %29, %65
+  %215 = fsub reassoc nsz contract float %0, %214
+  %216 = fmul reassoc nsz contract float %215, %213
+  %217 = fmul reassoc nsz contract float %216, %66
+  %218 = fmul reassoc nsz contract float %3, %67
+  %219 = fsub reassoc nsz contract float %0, %218
+  %220 = fmul reassoc nsz contract float %219, %217
+  %221 = fmul reassoc nsz contract float %220, %68
+  %222 = fmul reassoc nsz contract float %57, %69
+  %223 = fsub reassoc nsz contract float %0, %222
+  %224 = fmul reassoc nsz contract float %223, %221
+  %225 = fmul reassoc nsz contract float %57, %0
+  %226 = fmul reassoc nsz contract float %225, %61
+  %227 = fmul reassoc nsz contract float %226, %12
+  %228 = fsub reassoc nsz contract float %0, %227
+  %229 = fmul reassoc nsz contract float %228, %224
+  %230 = fmul reassoc nsz contract float %178, %70
+  %231 = fmul reassoc nsz contract float %230, %46
+  %232 = fsub reassoc nsz contract float %0, %231
+  %233 = fmul reassoc nsz contract float %232, %229
+  %234 = fmul reassoc nsz contract float %233, %71
+  %235 = fmul reassoc nsz contract float %57, %122
+  %236 = fsub reassoc nsz contract float %0, %235
+  %237 = fmul reassoc nsz contract float %236, %234
+  %238 = fmul reassoc nsz contract float %20, %160
+  %239 = fmul reassoc nsz contract float %3, %238
+  %240 = fsub reassoc nsz contract float %0, %239
+  %241 = fmul reassoc nsz contract float %240, %237
+  %242 = fmul reassoc nsz contract float %16, %72
+  %243 = fmul reassoc nsz contract float %242, %73
+  %244 = fsub reassoc nsz contract float %0, %243
+  %245 = fmul reassoc nsz contract float %244, %241
+  %246 = fmul reassoc nsz contract float %154, %15
+  %247 = fsub reassoc nsz contract float %0, %246
+  %248 = fmul reassoc nsz contract float %247, %245
+  %249 = fmul reassoc nsz contract float %178, %23
+  %250 = fmul reassoc nsz contract float %249, %74
+  %251 = fsub reassoc nsz contract float %0, %250
+  %252 = fmul reassoc nsz contract float %251, %248
+  %253 = fmul reassoc nsz contract float %3, %160
+  %254 = fmul reassoc nsz contract float %51, %253
+  %255 = fsub reassoc nsz contract float %0, %254
+  %256 = fmul reassoc nsz contract float %255, %252
+  %257 = fmul reassoc nsz contract float %13, %75
+  %258 = fmul reassoc nsz contract float %257, %51
+  %259 = fsub reassoc nsz contract float %0, %258
+  %260 = fmul reassoc nsz contract float %259, %256
+  %261 = fmul reassoc nsz contract float %8, %76
+  %262 = fmul reassoc nsz contract float %51, %261
+  %263 = fsub reassoc nsz contract float %0, %262
+  %264 = fmul reassoc nsz contract float %263, %260
+  %265 = fmul reassoc nsz contract float %264, %77
+  %266 = fmul reassoc nsz contract float %39, %0
+  %267 = fmul reassoc nsz contract float %266, %78
+  %268 = fmul reassoc nsz contract float %267, %14
+  %269 = fsub reassoc nsz contract float %0, %268
+  %270 = fmul reassoc nsz contract float %269, %265
+  %271 = fmul reassoc nsz contract float %1, %76
+  %272 = fmul reassoc nsz contract float %51, %271
+  %273 = fsub reassoc nsz contract float %0, %272
+  %274 = fmul reassoc nsz contract float %273, %270
+  %275 = fmul reassoc nsz contract float %0, %59
+  %276 = fmul reassoc nsz contract float %275, %79
+  %277 = fmul reassoc nsz contract float %276, %36
+  %278 = fsub reassoc nsz contract float %0, %277
+  %279 = fmul reassoc nsz contract float %278, %274
+  %280 = fmul reassoc nsz contract float %114, %22
+  %281 = fmul reassoc nsz contract float %280, %36
+  %282 = fsub reassoc nsz contract float %0, %281
+  %283 = fmul reassoc nsz contract float %282, %279
+  %284 = fmul reassoc nsz contract float %0, %43
+  %285 = fmul reassoc nsz contract float %284, %81
+  %286 = fmul reassoc nsz contract float %3, %285
+  %287 = fsub reassoc nsz contract float %0, %286
+  %288 = fmul reassoc nsz contract float %287, %283
+  store float %288, float* %80, align 4
+  ret void
+}

From 03c8e1cc7efabd122294e1cd670fba6d544f2831 Mon Sep 17 00:00:00 2001
From: Hans Wennborg 
Date: Wed, 26 Aug 2020 15:16:02 +0200
Subject: [PATCH 172/363] ReleaseNotes: removal of llgo

---
 llvm/docs/ReleaseNotes.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index aea1550960e8..6c92c1224238 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -44,6 +44,9 @@ Non-comprehensive list of changes in this release
    functionality, or simply have a lot to talk about), see the `NOTE` below
    for adding a new subsection.
 
+* The llgo frontend has been removed for now, but may be resurrected in the
+  future.
+
 * ...
 
 

From 21d01a67c9613932053dd89c9957782f86e0c93f Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Wed, 12 Aug 2020 20:50:59 -0700
Subject: [PATCH 173/363] [ELF] --gdb-index: skip SHF_GROUP .debug_info

-gdwarf-5 -fdebug-types-section may produce multiple .debug_info sections.  All
except one are type units (.debug_types before DWARF v5). When constructing
.gdb_index, we should ignore these type units. We use a simple heuristic: the
compile unit does not have the SHF_GROUP flag. (This needs to be revisited if
people place compile unit .debug_info in COMDAT groups.)

This issue manifests as a data race: because an object file may have multiple
.debug_info sections, we may concurrently construct `LLDDwarfObj` for the same
file in multiple threads. The threads may access `InputSectionBase::data()`
concurrently on the same input section. `InputSectionBase::data()` does a lazy
uncompress() and rewrites the member variable `rawData`. A thread running zlib
`inflate()` (transitively called by uncompress()) on a buffer with `rawData`
tampered by another thread may fail with `uncompress failed: zlib error: Z_DATA_ERROR`.

Even if no data race occurred in an optimistic run, if there are N .debug_info,
one CU entry and its address ranges will be replicated N times. The result
.gdb_index can be much larger than a correct one.

The new test gdb-index-dwarf5-type-unit.s actually has two compile units. This
cannot be produced with regular approaches (it can be produced with -r
--unique). This is used to demonstrate that the .gdb_index construction code
only considers the last non-SHF_GROUP .debug_info.

Reviewed By: grimar

Differential Revision: https://reviews.llvm.org/D85579

(cherry picked from commit fb141292f4411448af41fc454c07f3903acb84dd)
---
 lld/ELF/DWARF.cpp                         | 22 +++++-
 lld/ELF/DWARF.h                           |  4 +
 lld/ELF/SyntheticSections.cpp             | 50 ++++++------
 lld/test/ELF/gdb-index-dwarf5-type-unit.s | 93 +++++++++++++++++++++++
 4 files changed, 143 insertions(+), 26 deletions(-)
 create mode 100644 lld/test/ELF/gdb-index-dwarf5-type-unit.s

diff --git a/lld/ELF/DWARF.cpp b/lld/ELF/DWARF.cpp
index 24c44730bf64..5767f6020f93 100644
--- a/lld/ELF/DWARF.cpp
+++ b/lld/ELF/DWARF.cpp
@@ -26,7 +26,12 @@ using namespace lld;
 using namespace lld::elf;
 
 template  LLDDwarfObj::LLDDwarfObj(ObjFile *obj) {
-  for (InputSectionBase *sec : obj->getSections()) {
+  // Get the ELF sections to retrieve sh_flags. See the SHF_GROUP comment below.
+  ArrayRef objSections =
+      CHECK(obj->getObj().sections(), obj);
+  assert(objSections.size() == obj->getSections().size());
+  for (auto it : llvm::enumerate(obj->getSections())) {
+    InputSectionBase *sec = it.value();
     if (!sec)
       continue;
 
@@ -35,7 +40,6 @@ template  LLDDwarfObj::LLDDwarfObj(ObjFile *obj) {
                 .Case(".debug_addr", &addrSection)
                 .Case(".debug_gnu_pubnames", &gnuPubnamesSection)
                 .Case(".debug_gnu_pubtypes", &gnuPubtypesSection)
-                .Case(".debug_info", &infoSection)
                 .Case(".debug_loclists", &loclistsSection)
                 .Case(".debug_ranges", &rangesSection)
                 .Case(".debug_rnglists", &rnglistsSection)
@@ -53,6 +57,20 @@ template  LLDDwarfObj::LLDDwarfObj(ObjFile *obj) {
       strSection = toStringRef(sec->data());
     else if (sec->name == ".debug_line_str")
       lineStrSection = toStringRef(sec->data());
+    else if (sec->name == ".debug_info" &&
+             !(objSections[it.index()].sh_flags & ELF::SHF_GROUP)) {
+      // In DWARF v5, -fdebug-types-section places type units in .debug_info
+      // sections in COMDAT groups. They are not compile units and thus should
+      // be ignored for .gdb_index/diagnostics purposes.
+      //
+      // We use a simple heuristic: the compile unit does not have the SHF_GROUP
+      // flag. If we place compile units in COMDAT groups in the future, we may
+      // need to perform a lightweight parsing. We drop the SHF_GROUP flag when
+      // the InputSection was created, so we need to retrieve sh_flags from the
+      // associated ELF section header.
+      infoSection.Data = toStringRef(sec->data());
+      infoSection.sec = sec;
+    }
   }
 }
 
diff --git a/lld/ELF/DWARF.h b/lld/ELF/DWARF.h
index a12dae6e9960..900c63de26ff 100644
--- a/lld/ELF/DWARF.h
+++ b/lld/ELF/DWARF.h
@@ -32,6 +32,10 @@ template  class LLDDwarfObj final : public llvm::DWARFObject {
     f(infoSection);
   }
 
+  InputSection *getInfoSection() const {
+    return cast(infoSection.sec);
+  }
+
   const llvm::DWARFSection &getLoclistsSection() const override {
     return loclistsSection;
   }
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 731b9f658060..09f771d12359 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -28,6 +28,7 @@
 #include "lld/Common/Strings.h"
 #include "lld/Common/Version.h"
 #include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
@@ -2653,15 +2654,6 @@ void GdbIndexSection::initOutputSize() {
   }
 }
 
-static std::vector getDebugInfoSections() {
-  std::vector ret;
-  for (InputSectionBase *s : inputSections)
-    if (InputSection *isec = dyn_cast(s))
-      if (isec->name == ".debug_info")
-        ret.push_back(isec);
-  return ret;
-}
-
 static std::vector readCuList(DWARFContext &dwarf) {
   std::vector ret;
   for (std::unique_ptr &cu : dwarf.compile_units())
@@ -2815,30 +2807,40 @@ createSymbols(ArrayRef> nameAttrs,
 
 // Returns a newly-created .gdb_index section.
 template  GdbIndexSection *GdbIndexSection::create() {
-  std::vector sections = getDebugInfoSections();
-
-  // .debug_gnu_pub{names,types} are useless in executables.
-  // They are present in input object files solely for creating
-  // a .gdb_index. So we can remove them from the output.
-  for (InputSectionBase *s : inputSections)
+  // Collect InputFiles with .debug_info. See the comment in
+  // LLDDwarfObj::LLDDwarfObj. If we do lightweight parsing in the future,
+  // note that isec->data() may uncompress the full content, which should be
+  // parallelized.
+  SetVector files;
+  for (InputSectionBase *s : inputSections) {
+    InputSection *isec = dyn_cast(s);
+    if (!isec)
+      continue;
+    // .debug_gnu_pub{names,types} are useless in executables.
+    // They are present in input object files solely for creating
+    // a .gdb_index. So we can remove them from the output.
     if (s->name == ".debug_gnu_pubnames" || s->name == ".debug_gnu_pubtypes")
       s->markDead();
+    else if (isec->name == ".debug_info")
+      files.insert(isec->file);
+  }
 
-  std::vector chunks(sections.size());
-  std::vector> nameAttrs(sections.size());
+  std::vector chunks(files.size());
+  std::vector> nameAttrs(files.size());
 
-  parallelForEachN(0, sections.size(), [&](size_t i) {
+  parallelForEachN(0, files.size(), [&](size_t i) {
     // To keep memory usage low, we don't want to keep cached DWARFContext, so
     // avoid getDwarf() here.
-    ObjFile *file = sections[i]->getFile();
+    ObjFile *file = cast>(files[i]);
     DWARFContext dwarf(std::make_unique>(file));
+    auto &dobj = static_cast &>(dwarf.getDWARFObj());
 
-    chunks[i].sec = sections[i];
+    // If there are multiple compile unit .debug_info (very rare ld -r --unique),
+    // this only picks the last one. Other address ranges are lost.
+    chunks[i].sec = dobj.getInfoSection();
     chunks[i].compilationUnits = readCuList(dwarf);
-    chunks[i].addressAreas = readAddressAreas(dwarf, sections[i]);
-    nameAttrs[i] = readPubNamesAndTypes(
-        static_cast &>(dwarf.getDWARFObj()),
-        chunks[i].compilationUnits);
+    chunks[i].addressAreas = readAddressAreas(dwarf, chunks[i].sec);
+    nameAttrs[i] = readPubNamesAndTypes(dobj, chunks[i].compilationUnits);
   });
 
   auto *ret = make();
diff --git a/lld/test/ELF/gdb-index-dwarf5-type-unit.s b/lld/test/ELF/gdb-index-dwarf5-type-unit.s
new file mode 100644
index 000000000000..5cd6778fe7e4
--- /dev/null
+++ b/lld/test/ELF/gdb-index-dwarf5-type-unit.s
@@ -0,0 +1,93 @@
+# REQUIRES: x86, zlib
+## -gdwarf-5 -fdebug-types-section may produce multiple .debug_info sections.
+## All except one are type units. Test we can locate the compile unit, add it to
+## the index, and not erroneously duplicate it (which would happen if we
+## consider every .debug_info a compile unit).
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld --gdb-index -Ttext=0x1000 %t.o -o %t
+# RUN: llvm-dwarfdump --gdb-index %t | FileCheck %s
+
+## Test we don't uncompress a section while another thread is concurrently
+## accessing it. This would be detected by tsan as a data race.
+# RUN: llvm-objcopy --compress-debug-sections %t.o
+# RUN: ld.lld --gdb-index -Ttext=0x1000 %t.o -o %t1
+# RUN: llvm-dwarfdump --gdb-index %t1 | FileCheck %s
+
+## In this test, there are actually two compile unit .debug_info (very uncommon;
+## -r --unique). Currently we only handle the last compile unit.
+# CHECK:      CU list offset = 0x18, has 1 entries:
+# CHECK-NEXT:   0: Offset = 0x32, Length = 0x19
+
+# CHECK:      Address area offset = 0x28, has 1 entries:
+# CHECK-NEXT:   Low/High address = [0x1001, 0x1002) (Size: 0x1), CU id = 0
+
+.Lfunc_begin0:
+  ret
+.Lfunc_end0:
+.Lfunc_begin1:
+  ret
+.Lfunc_end1:
+
+.section  .debug_abbrev,"",@progbits
+  .byte  1                         # Abbreviation Code
+  .byte  65                        # DW_TAG_type_unit
+  .byte  0                         # DW_CHILDREN_no
+  .byte  0                         # EOM(1)
+  .byte  0                         # EOM(2)
+
+  .byte  2                         # Abbreviation Code
+  .byte  17                        # DW_TAG_compile_unit
+  .byte  0                         # DW_CHILDREN_no
+  .byte  17                        # DW_AT_low_pc
+  .byte  1                         # DW_FORM_addr
+  .byte  18                        # DW_AT_high_pc
+  .byte  6                         # DW_FORM_data4
+  .byte  0                         # EOM(1)
+  .byte  0                         # EOM(2)
+
+  .byte  0                         # EOM(3)
+
+.macro TYPE_UNIT id signature
+.section  .debug_info,"G",@progbits,\signature
+  .long  .Ldebug_info_end\id-.Ldebug_info_start\id # Length of Unit
+.Ldebug_info_start\id:
+  .short 5                         # DWARF version number
+  .byte  2                         # DWARF Unit Type
+  .byte  8                         # Address Size
+  .long  .debug_abbrev             # Offset Into Abbrev. Section
+  .quad  \signature                # Type Signature
+  .long  .Ldebug_info_end\id       # Type DIE Offset
+  .byte  1                         # Abbrev [1] DW_TAG_type_unit
+.Ldebug_info_end\id:
+.endm
+
+## We place compile units between two type units (rare). A naive approach will
+## take either the first or the last .debug_info
+TYPE_UNIT 0, 123
+
+.section  .debug_info,"",@progbits,unique,0
+.Lcu_begin0:
+  .long .Lcu_end0-.Lcu_begin0-4    # Length of Unit
+  .short 5                         # DWARF version number
+  .byte  1                         # DWARF Unit Type
+  .byte  8                         # Address Size
+  .long  .debug_abbrev             # Offset Into Abbrev. Section
+  .byte  2                         # Abbrev [2] DW_TAG_compile_unit
+  .quad  .Lfunc_begin0             # DW_AT_low_pc
+  .long  .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+.Lcu_end0:
+
+.section  .debug_info,"",@progbits,unique,1
+.Lcu_begin1:
+  .long .Lcu_end1-.Lcu_begin1-4    # Length of Unit
+  .short 5                         # DWARF version number
+  .byte  1                         # DWARF Unit Type
+  .byte  8                         # Address Size
+  .long  .debug_abbrev             # Offset Into Abbrev. Section
+  .byte  2                         # Abbrev [2] DW_TAG_compile_unit
+  .quad  .Lfunc_begin1             # DW_AT_low_pc
+  .long  .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+.Lcu_end1:
+
+TYPE_UNIT 1, 456

From ce9f3f19f512f78c5d9ffb2753bae7bcb203161b Mon Sep 17 00:00:00 2001
From: Teresa Johnson 
Date: Wed, 4 Mar 2020 15:38:45 -0800
Subject: [PATCH 174/363] [Docs] Document --lto-whole-program-visibility

Summary:
Documents interaction of linker option added in D71913 with LTO
visibility.

Reviewers: pcc

Subscribers: inglorion, hiraditya, steven_wu, dexonsmith, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D75655

(cherry picked from commit 72bdb41a06a27b5453bf966a0ffecfa6f5fae1a6)
---
 clang/docs/LTOVisibility.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/clang/docs/LTOVisibility.rst b/clang/docs/LTOVisibility.rst
index 3a60f54e1b90..cdc0b9cc0e19 100644
--- a/clang/docs/LTOVisibility.rst
+++ b/clang/docs/LTOVisibility.rst
@@ -35,6 +35,16 @@ other classes receive hidden LTO visibility. Classes with internal linkage
 (e.g. classes declared in unnamed namespaces) also receive hidden LTO
 visibility.
 
+During the LTO link, all classes with public LTO visibility will be refined
+to hidden LTO visibility when the ``--lto-whole-program-visibility`` lld linker
+option is applied (``-plugin-opt=whole-program-visibility`` for gold). This flag
+can be used to defer specifying whether classes have hidden LTO visibility until
+link time, to allow bitcode objects to be shared by different LTO links.
+Due to an implementation limitation, symbols associated with classes with hidden
+LTO visibility may still be exported from the binary when using this flag. It is
+unsafe to refer to these symbols, and their visibility may be relaxed to hidden
+in a future compiler release.
+
 A class defined in a translation unit built without LTO receives public
 LTO visibility regardless of its object file visibility, linkage or other
 attributes.

From 6f2ba83779c8055a58f1cc9ee33686a8109ff33a Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli 
Date: Wed, 26 Aug 2020 15:16:15 +0100
Subject: [PATCH 175/363] [release][SVE] Move notes for SVE ACLE to the release
 notes of clang.

---
 clang/docs/ReleaseNotes.rst | 54 +++++++++++++++++++++++++++++++++++
 llvm/docs/ReleaseNotes.rst  | 56 +++----------------------------------
 2 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6f336088750f..a8fde6b452d0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -90,6 +90,60 @@ Non-comprehensive list of changes in this release
   a fixed hashing algorithm that prevents some collision when loading
   out-of-date profile informations. Clang can still read old profile files.
 
+- Clang adds support for the following macros that enable the
+  C-intrinsics from the `Arm C language extensions for SVE
+  `_ (version
+  ``00bet5``, see section 2.1 for the list of intrinsics associated to
+  each macro):
+
+
+      =================================  =================
+      Preprocessor macro                 Target feature
+      =================================  =================
+      ``__ARM_FEATURE_SVE``              ``+sve``
+      ``__ARM_FEATURE_SVE_BF16``         ``+sve+bf16``
+      ``__ARM_FEATURE_SVE_MATMUL_FP32``  ``+sve+f32mm``
+      ``__ARM_FEATURE_SVE_MATMUL_FP64``  ``+sve+f64mm``
+      ``__ARM_FEATURE_SVE_MATMUL_INT8``  ``+sve+i8mm``
+      ``__ARM_FEATURE_SVE2``             ``+sve2``
+      ``__ARM_FEATURE_SVE2_AES``         ``+sve2-aes``
+      ``__ARM_FEATURE_SVE2_BITPERM``     ``+sve2-bitperm``
+      ``__ARM_FEATURE_SVE2_SHA3``        ``+sve2-sha3``
+      ``__ARM_FEATURE_SVE2_SM4``         ``+sve2-sm4``
+      =================================  =================
+
+  The macros enable users to write C/C++ `Vector Length Agnostic
+  (VLA)` loops, that can be executed on any CPU that implements the
+  underlying instructions supported by the C intrinsics, independently
+  of the hardware vector register size.
+
+  For example, the ``__ARM_FEATURE_SVE`` macro is enabled when
+  targeting AArch64 code generation by setting ``-march=armv8-a+sve``
+  on the command line.
+
+  .. code-block:: c
+     :caption: Example of VLA addition of two arrays with SVE ACLE.
+
+     // Compile with:
+     // `clang++ -march=armv8-a+sve ...` (for c++)
+     // `clang -std=c11 -march=armv8-a+sve ...` (for c)
+     #include <arm_sve.h>
+
+     void VLA_add_arrays(double *x, double *y, double *out, unsigned N) {
+       for (unsigned i = 0; i < N; i += svcntd()) {
+         svbool_t Pg = svwhilelt_b64(i, N);
+         svfloat64_t vx = svld1(Pg, &x[i]);
+         svfloat64_t vy = svld1(Pg, &y[i]);
+         svfloat64_t vout = svadd_x(Pg, vx, vy);
+         svst1(Pg, &out[i], vout);
+       }
+     }
+
+  Please note that support for lazy binding of SVE function calls is
+  incomplete. When you interface user code with SVE functions that are
+  provided through shared libraries, avoid using lazy binding. If you
+  use lazy binding, the results could be corrupted.
+
 New Compiler Flags
 ------------------
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 6c92c1224238..5bbdea65c3e7 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -106,59 +106,11 @@ Changes to the AArch64 Backend
 * Clearly error out on unsupported relocations when targeting COFF, instead
   of silently accepting some (without being able to do what was requested).
 
-* Clang adds support for the following macros that enable the
-  C-intrinsics from the `Arm C language extensions for SVE
+* Implemented codegen support for the SVE C-language intrinsics
+  documented in `Arm C Language Extensions (ACLE) for SVE
   `_ (version
-  ``00bet5``, see section 2.1 for the list of intrinsics associated to
-  each macro):
-
-
-      =================================  =================
-      Preprocessor macro                 Target feature
-      =================================  =================
-      ``__ARM_FEATURE_SVE``              ``+sve``
-      ``__ARM_FEATURE_SVE_BF16``         ``+sve+bf16``
-      ``__ARM_FEATURE_SVE_MATMUL_FP32``  ``+sve+f32mm``
-      ``__ARM_FEATURE_SVE_MATMUL_FP64``  ``+sve+f64mm``
-      ``__ARM_FEATURE_SVE_MATMUL_INT8``  ``+sve+i8mm``
-      ``__ARM_FEATURE_SVE2``             ``+sve2``
-      ``__ARM_FEATURE_SVE2_AES``         ``+sve2-aes``
-      ``__ARM_FEATURE_SVE2_BITPERM``     ``+sve2-bitperm``
-      ``__ARM_FEATURE_SVE2_SHA3``        ``+sve2-sha3``
-      ``__ARM_FEATURE_SVE2_SM4``         ``+sve2-sm4``
-      =================================  =================
-
-  The macros enable users to write C/C++ `Vector Length Agnostic
-  (VLA)` loops, that can be executed on any CPU that implements the
-  underlying instructions supported by the C intrinsics, independently
-  of the hardware vector register size.
-
-  For example, the ``__ARM_FEATURE_SVE`` macro is enabled when
-  targeting AArch64 code generation by setting ``-march=armv8-a+sve``
-  on the command line.
-
-  .. code-block:: c
-     :caption: Example of VLA addition of two arrays with SVE ACLE.
-
-     // Compile with:
-     // `clang++ -march=armv8a+sve ...` (for c++)
-     // `clang -stc=c11 -march=armv8a+sve ...` (for c)
-     #include 
-
-     void VLA_add_arrays(double *x, double *y, double *out, unsigned N) {
-       for (unsigned i = 0; i < N; i += svcntd()) {
-         svbool_t Pg = svwhilelt_b64(i, N);
-         svfloat64_t vx = svld1(Pg, &x[i]);
-         svfloat64_t vy = svld1(Pg, &y[i]);
-         svfloat64_t vout = svadd_x(Pg, vx, vy);
-         svst1(Pg, &out[i], vout);
-       }
-     }
-
-  Please note that support for lazy binding of SVE function calls is
-  incomplete. When you interface user code with SVE functions that are
-  provided through shared libraries, avoid using lazy binding. If you
-  use lazy binding, the results could be corrupted.
+  ``00bet5``). For more information, see the ``clang`` 11 release
+  notes.
 
 Changes to the ARM Backend
 --------------------------

From 46f3aed198a5530b5115881628e1fcfb3e7541c9 Mon Sep 17 00:00:00 2001
From: Hans Wennborg 
Date: Wed, 26 Aug 2020 16:44:55 +0200
Subject: [PATCH 176/363] Bump -len_control value in fuzzer-custommutator.test
 (PR47286)

to make the test more stable, as suggested by mmoroz.

(cherry picked from commit 8421503300c6145480710761983f089ccbe0bb56)
---
 compiler-rt/test/fuzzer/fuzzer-custommutator.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/fuzzer/fuzzer-custommutator.test b/compiler-rt/test/fuzzer/fuzzer-custommutator.test
index 87e69a0d8cf3..25f5fe697b43 100644
--- a/compiler-rt/test/fuzzer/fuzzer-custommutator.test
+++ b/compiler-rt/test/fuzzer/fuzzer-custommutator.test
@@ -6,7 +6,7 @@ LLVMFuzzerCustomMutator: {{.*}} lim: 4096 {{.*}}
 LLVMFuzzerCustomMutator: BINGO
 
 # len_control is disabled for custom mutators by default, test that it can be enabled.
-RUN: not %run %t-CustomMutatorTest -len_control=100 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutatorWithLenControl
+RUN: not %run %t-CustomMutatorTest -len_control=1000 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomMutatorWithLenControl
 LLVMFuzzerCustomMutatorWithLenControl: INFO: found LLVMFuzzerCustomMutator
 LLVMFuzzerCustomMutatorWithLenControl: In LLVMFuzzerCustomMutator
 LLVMFuzzerCustomMutatorWithLenControl: {{.*}} lim: {{[1-9][0-9]?}} {{.*}}

From 04d70cd0f07dab371abf586627ce9ac09e04362c Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Wed, 12 Aug 2020 20:14:00 -0700
Subject: [PATCH 177/363] [ELF] -r: allow SHT_X86_64_UNWIND to be merged into
 SHT_PROGBITS

* For .cfi_*, GCC/GNU as emits SHT_PROGBITS type .eh_frame sections.
* Since rL252300, clang emits SHT_X86_64_UNWIND type .eh_frame sections
  (originated from Solaris, documented in the x86-64 psABI).
* Some assembly use `.section .eh_frame,"a",@unwind` to generate
  SHT_X86_64_UNWIND .eh_frame sections.

In a non-relocatable link, input .eh_frame are combined and there is
only one SyntheticSection .eh_frame in the output section, so the
"section type mismatch" diagnostic does not fire.

In a relocatable link, there is no SyntheticSection .eh_frame. .eh_frame of
mixed types can trigger the diagnostic. This patch fixes it by adding another
special case 0x70000001 (= SHT_X86_64_UNWIND) to canMergeToProgbits().

    ld.lld -r gcc.o clang.o => error: section type mismatch for .eh_frame

There was a discussion "RFC: Usefulness of SHT_X86_64_UNWIND" on the x86-64-abi
mailing list. Folks are not wild about making the psABI value 0x70000001 into
gABI, but a few think defining 0x70000001 for .eh_frame may be a good idea for a
new architecture.

Reviewed By: grimar

Differential Revision: https://reviews.llvm.org/D85785

(cherry picked from commit 88498f44dfe7d9b886f2622335cdeae4dbf2b02a)
---
 lld/ELF/OutputSections.cpp      | 6 +++++-
 lld/test/ELF/eh-frame-type.test | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index 7e9e76b070ec..881c375a1159 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -77,10 +77,14 @@ OutputSection::OutputSection(StringRef name, uint32_t type, uint64_t flags)
 // to be allocated for nobits sections. Other ones don't require
 // any special treatment on top of progbits, so there doesn't
 // seem to be a harm in merging them.
+//
+// NOTE: clang since rL252300 emits SHT_X86_64_UNWIND .eh_frame sections. Allow
+// them to be merged into SHT_PROGBITS .eh_frame (GNU as .cfi_*).
 static bool canMergeToProgbits(unsigned type) {
   return type == SHT_NOBITS || type == SHT_PROGBITS || type == SHT_INIT_ARRAY ||
          type == SHT_PREINIT_ARRAY || type == SHT_FINI_ARRAY ||
-         type == SHT_NOTE;
+         type == SHT_NOTE ||
+         (type == SHT_X86_64_UNWIND && config->emachine == EM_X86_64);
 }
 
 // Record that isec will be placed in the OutputSection. isec does not become
diff --git a/lld/test/ELF/eh-frame-type.test b/lld/test/ELF/eh-frame-type.test
index 22b802a9e528..4105491372b8 100644
--- a/lld/test/ELF/eh-frame-type.test
+++ b/lld/test/ELF/eh-frame-type.test
@@ -11,6 +11,9 @@
 # RUN: ld.lld %t1.o %t2.o -o %tboth
 # RUN: llvm-readobj -S %tboth | FileCheck %s
 
+# RUN: ld.lld -r %t1.o %t2.o -o %tboth.ro
+# RUN: llvm-readobj -S %tboth.ro | FileCheck %s
+
 # CHECK:      Name: .eh_frame
 # CHECK-NEXT: Type: SHT_PROGBITS
 

From 761cd1ce23769b459d8f111e1448ff1e9807b90e Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Thu, 20 Aug 2020 23:45:34 -0700
Subject: [PATCH 178/363] [X86] Correct the implementation of the testFeature
 macro in getIntelProcessorTypeAndSubtype to do a proper bit test.

Instead of ANDing with a one hot mask representing the bit to
be tested, we were ANDing with just the bit number. This tests
multiple bits none of them the correct one.

This caused skylake-avx512, cascadelake and cooperlake to all
be misdetected. Based on experiments with the Intel SDE, it seems
that all of these CPUs are being detected as being cooperlake.
This is bad since its the newest CPU of the 3.

(cherry picked from commit df9a9bb7beb7bc04ca4188fe0e527baac2900ff1)
---
 compiler-rt/lib/builtins/cpu_model.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/lib/builtins/cpu_model.c b/compiler-rt/lib/builtins/cpu_model.c
index 8346bb62dcfb..468bcc84cbcb 100644
--- a/compiler-rt/lib/builtins/cpu_model.c
+++ b/compiler-rt/lib/builtins/cpu_model.c
@@ -277,7 +277,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
                                 const unsigned *Features,
                                 unsigned *Type, unsigned *Subtype) {
 #define testFeature(F)                                                         \
-  (Features[F / 32] & (F % 32)) != 0
+  (Features[F / 32] & (1 << (F % 32))) != 0
 
   // We select CPU strings to match the code in Host.cpp, but we don't use them
   // in compiler-rt.

From 124e8259abe1dc7d0e8ad9d238f698bcfc31562e Mon Sep 17 00:00:00 2001
From: Hans Wennborg 
Date: Wed, 26 Aug 2020 19:29:56 +0200
Subject: [PATCH 179/363] ReleaseNotes: mention the build preferring python 3

Text by Saleem!
---
 llvm/docs/ReleaseNotes.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 5bbdea65c3e7..49b1a040a393 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -44,6 +44,13 @@ Non-comprehensive list of changes in this release
    functionality, or simply have a lot to talk about), see the `NOTE` below
    for adding a new subsection.
 
+* The LLVM project has started the migration towards Python 3, and the build
+  system now prefers Python 3 whenever available.  If the Python 3 interpreter
+  (or libraries) are not found, the build system will, for the time being, fall
+  back to Python 2.  It is recommended that downstream projects migrate to
+  Python 3 as Python 2 has been end-of-life'd by the Python Software
+  Foundation.
+
 * The llgo frontend has been removed for now, but may be resurrected in the
   future.
 

From 29e94ddb3930e3d7b54afb3753a6a40d6ef57898 Mon Sep 17 00:00:00 2001
From: Francesco Petrogalli 
Date: Wed, 26 Aug 2020 15:43:56 +0000
Subject: [PATCH 180/363] [MC][SVE] Fix data operand for instruction alias of
 `st1d`.

The version of `st1d` that operates with vector plus immediate
addressing mode uses the alias `st1d { <Zt>.d }, <Pg>, [<Zn>.d]` for
rendering `st1d { <Zt>.d }, <Pg>, [<Zn>.d, #0]`. The disassembler was
generating `<Zt>.s` instead of `<Zt>.d`.

Differential Revision: https://reviews.llvm.org/D86633
---
 llvm/lib/Target/AArch64/SVEInstrFormats.td |  2 +-
 llvm/test/MC/AArch64/SVE/st1b.s            | 24 ++++++++++++++++++++++
 llvm/test/MC/AArch64/SVE/st1d.s            | 12 +++++++++++
 llvm/test/MC/AArch64/SVE/st1h.s            | 24 ++++++++++++++++++++++
 llvm/test/MC/AArch64/SVE/st1w.s            | 24 ++++++++++++++++++++++
 5 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index c56a65b9e212..e86f2a6ebde4 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5416,7 +5416,7 @@ multiclass sve_mem_64b_sst_vi_ptrs opc, string asm,
   def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
   def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+                  (!cast(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
 
   def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt),
             (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
diff --git a/llvm/test/MC/AArch64/SVE/st1b.s b/llvm/test/MC/AArch64/SVE/st1b.s
index a6f766bdfd7c..40b830709ead 100644
--- a/llvm/test/MC/AArch64/SVE/st1b.s
+++ b/llvm/test/MC/AArch64/SVE/st1b.s
@@ -168,3 +168,27 @@ st1b    { z31.d }, p7, [z31.d, #31]
 // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe4]
 // CHECK-ERROR: instruction requires: sve
 // CHECK-UNKNOWN: ff bf 5f e4 
+
+st1b    { z0.s }, p7, [z0.s, #0]
+// CHECK-INST: st1b    { z0.s }, p7, [z0.s]
+// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 60 e4 
+
+st1b    { z0.s }, p7, [z0.s]
+// CHECK-INST: st1b    { z0.s }, p7, [z0.s]
+// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 60 e4 
+
+st1b    { z0.d }, p7, [z0.d, #0]
+// CHECK-INST: st1b    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 40 e4 
+
+st1b    { z0.d }, p7, [z0.d]
+// CHECK-INST: st1b    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 40 e4 
diff --git a/llvm/test/MC/AArch64/SVE/st1d.s b/llvm/test/MC/AArch64/SVE/st1d.s
index ba4a0e5be114..a5a19e772b52 100644
--- a/llvm/test/MC/AArch64/SVE/st1d.s
+++ b/llvm/test/MC/AArch64/SVE/st1d.s
@@ -78,3 +78,15 @@ st1d    { z31.d }, p7, [z31.d, #248]
 // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe5]
 // CHECK-ERROR: instruction requires: sve
 // CHECK-UNKNOWN: ff bf df e5 
+
+st1d    { z0.d }, p7, [z0.d, #0]
+// CHECK-INST: st1d    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc c0 e5 
+
+st1d    { z0.d }, p7, [z0.d]
+// CHECK-INST: st1d    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc c0 e5 
diff --git a/llvm/test/MC/AArch64/SVE/st1h.s b/llvm/test/MC/AArch64/SVE/st1h.s
index cd6c20d83482..fe22c52bb9be 100644
--- a/llvm/test/MC/AArch64/SVE/st1h.s
+++ b/llvm/test/MC/AArch64/SVE/st1h.s
@@ -168,3 +168,27 @@ st1h    { z31.d }, p7, [z31.d, #62]
 // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe4]
 // CHECK-ERROR: instruction requires: sve
 // CHECK-UNKNOWN: ff bf df e4 
+
+st1h    { z0.s }, p7, [z0.s, #0]
+// CHECK-INST: st1h    { z0.s }, p7, [z0.s]
+// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc e0 e4 
+
+st1h    { z0.s }, p7, [z0.s]
+// CHECK-INST: st1h    { z0.s }, p7, [z0.s]
+// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc e0 e4 
+
+st1h    { z0.d }, p7, [z0.d, #0]
+// CHECK-INST: st1h    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc c0 e4 
+
+st1h    { z0.d }, p7, [z0.d]
+// CHECK-INST: st1h    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc c0 e4 
diff --git a/llvm/test/MC/AArch64/SVE/st1w.s b/llvm/test/MC/AArch64/SVE/st1w.s
index e20194f5747e..5bbcd2e1ea0f 100644
--- a/llvm/test/MC/AArch64/SVE/st1w.s
+++ b/llvm/test/MC/AArch64/SVE/st1w.s
@@ -138,3 +138,27 @@ st1w    { z31.d }, p7, [z31.d, #124]
 // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe5]
 // CHECK-ERROR: instruction requires: sve
 // CHECK-UNKNOWN: ff bf 5f e5 
+
+st1w    { z0.s }, p7, [z0.s, #0]
+// CHECK-INST: st1w    { z0.s }, p7, [z0.s]
+// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 60 e5 
+
+st1w    { z0.s }, p7, [z0.s]
+// CHECK-INST: st1w    { z0.s }, p7, [z0.s]
+// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 60 e5 
+
+st1w    { z0.d }, p7, [z0.d, #0]
+// CHECK-INST: st1w    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 40 e5 
+
+st1w    { z0.d }, p7, [z0.d]
+// CHECK-INST: st1w    { z0.d }, p7, [z0.d]
+// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5]
+// CHECK-ERROR: instruction requires: sve
+// CHECK-UNKNOWN: 00 bc 40 e5 

From 63255250c2d76c31282493b8368c74869c5657cc Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Thu, 27 Aug 2020 01:12:16 -0400
Subject: [PATCH 181/363] Default to -fuse-init-array on OpenBSD.

(cherry picked from commit a45ccc983b51330fd49c8526fe4770e40eeab708)
---
 clang/lib/Driver/ToolChains/OpenBSD.cpp | 9 ---------
 clang/lib/Driver/ToolChains/OpenBSD.h   | 5 -----
 clang/test/Driver/openbsd.c             | 5 -----
 3 files changed, 19 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index 4f2d04058d24..1177fba96562 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -313,15 +313,6 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args,
   return std::string(Path.str());
 }
 
-void OpenBSD::addClangTargetOptions(const ArgList &DriverArgs,
-                                    ArgStringList &CC1Args,
-                                    Action::OffloadKind) const {
-  // Support for .init_array is still new (Aug 2016).
-  if (!DriverArgs.hasFlag(options::OPT_fuse_init_array,
-                          options::OPT_fno_use_init_array, false))
-    CC1Args.push_back("-fno-use-init-array");
-}
-
 Tool *OpenBSD::buildAssembler() const {
   return new tools::openbsd::Assembler(*this);
 }
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.h b/clang/lib/Driver/ToolChains/OpenBSD.h
index 09595faf9d6b..5f9b259bf861 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.h
+++ b/clang/lib/Driver/ToolChains/OpenBSD.h
@@ -86,11 +86,6 @@ class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF {
 
   SanitizerMask getSupportedSanitizers() const override;
 
-  void
-  addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
-                        llvm::opt::ArgStringList &CC1Args,
-                        Action::OffloadKind DeviceOffloadKind) const override;
-
 protected:
   Tool *buildAssembler() const override;
   Tool *buildLinker() const override;
diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c
index e17d05dc76da..cee4539eaca2 100644
--- a/clang/test/Driver/openbsd.c
+++ b/clang/test/Driver/openbsd.c
@@ -122,8 +122,3 @@
 // RUN: %clang -target powerpc-unknown-openbsd -### -c %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-POWERPC-SECUREPLT %s
 // CHECK-POWERPC-SECUREPLT: "-target-feature" "+secure-plt"
-
-// Check -fno-init-array
-// RUN: %clang -no-canonical-prefixes -target i686-pc-openbsd %s -### 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-CTORS %s
-// CHECK-CTORS: "-fno-use-init-array"

From 522d80ab553b42e2feadfd4178932069dfc51d3f Mon Sep 17 00:00:00 2001
From: AndreyChurbanov 
Date: Wed, 26 Aug 2020 21:56:01 +0300
Subject: [PATCH 182/363] [OpenMP] Fix import library installation with MinGW

Patch by mati865@gmail.com

Differential Revision: https://reviews.llvm.org/D86552

(cherry picked from commit 1596ea80fdf3410f94ef9a2548701d26cc81c2f5)
---
 openmp/runtime/src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 81275c0483dd..19423f58c6c4 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -195,7 +195,7 @@ if(WIN32)
   # the import library is "re-linked" to include kmp_import.cpp which prevents
   # linking of both Visual Studio OpenMP and newly built OpenMP
   set_source_files_properties(kmp_import.cpp PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CXXFLAGS}")
-  set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_IMPORT_LIBRARY_SUFFIX})
   set(LIBOMP_GENERATED_IMP_LIB_FILENAME ${LIBOMP_LIB_FILE}${CMAKE_STATIC_LIBRARY_SUFFIX})
   set_target_properties(omp PROPERTIES
     VERSION ${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR} # uses /version flag

From 2eab0b4f20aa192ba5ca8492c20aeae85b44e70b Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Thu, 27 Aug 2020 21:19:58 -0700
Subject: [PATCH 183/363] [X86] Update release notes.

---
 llvm/docs/ReleaseNotes.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 49b1a040a393..c7ca861dbc34 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -167,6 +167,16 @@ During this release ...
   avx512bw otherwise they would split into multiple YMM registers. This means
   vXi16/vXi8 vectors are consistently treated the same as
   vXi32/vXi64/vXf64/vXf32 vectors of the same total width.
+* Support was added for Intel AMX instructions.
+* Support was added for TSXLDTRK instructions.
+* A pass was added for mitigating the Load Value Injection vulnerability.
+* The Speculative Execution Side Effect Suppression pass was added which can
+  be used as a last-resort mitigation for speculative execution related
+  CPU vulnerabilities.
+* Improved recognition of boolean vector reductions with better MOVMSKB/PTEST
+  handling
+* Extended recognition of rotation patterns to handle funnel shift as well,
+  allowing us to remove the existing x86-specific SHLD/SHRD combine.
 
 Changes to the AMDGPU Target
 -----------------------------

From f81c61748dd80b92a638bf16eebddc1a7ccfcf8e Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Thu, 27 Aug 2020 23:15:21 -0700
Subject: [PATCH 184/363] ReleaseNotes: add some clang items

---
 clang/docs/ReleaseNotes.rst | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a8fde6b452d0..7b1df2ed9c3f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -58,6 +58,10 @@ Improvements to Clang's diagnostics
   -Wuninitialized. It warns on cases where uninitialized variables are passed
   as const reference arguments to a function.
 
+- ``-Wimplicit-const-int-float-conversion`` (enabled by default) is a new
+  option controlled by ``-Wimplicit-int-float-conversion``.  It warns on
+  implicit conversion from a floating constant to an integer type.
+
 Non-comprehensive list of changes in this release
 -------------------------------------------------
 
@@ -144,6 +148,21 @@ Non-comprehensive list of changes in this release
   provided through shared libraries, avoid using lazy binding. If you
   use lazy binding, the results could be corrupted.
 
+- ``-O`` maps to ``-O1`` instead of ``-O2``.
+  (`D79916 `_)
+
+- In a ``-flto={full,thin}`` link, ``-Os``, ``-Oz`` and ``-Og`` can be used
+  now. ``-Os`` and ``-Oz`` map to the -O2 pipeline while ``-Og`` maps to the
+  -O1 pipeline.
+  (`D79919 `_)
+
+- ``--coverage`` (gcov) defaults to gcov [4.8,8) compatible format now.
+
+- On x86, ``-fpic/-fPIC -fno-semantic-interposition`` assumes a global
+  definition of default visibility non-interposable and allows interprocedural
+  optimizations. In produced assembly ``-Lfunc$local`` local aliases are created
+  for global symbols of default visibility.
+
 New Compiler Flags
 ------------------
 
@@ -195,6 +214,8 @@ New Compiler Flags
     adding -fdata-sections -ffunction-sections to the command generating
     the shared object).
 
+- ``-fsanitize-coverage-allowlist`` and ``-fsanitize-coverage-blocklist`` are added.
+
 Deprecated Compiler Flags
 -------------------------
 

From 5d21aedfdbf0b85d65bad08b7b89913205de4b33 Mon Sep 17 00:00:00 2001
From: Haojian Wu 
Date: Fri, 28 Aug 2020 09:56:52 +0200
Subject: [PATCH 185/363] Add release note for RecoveryExpr.

---
 clang/docs/ReleaseNotes.rst | 40 +++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7b1df2ed9c3f..83877d0d95a2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -48,6 +48,46 @@ Major New Features
 
 - ...
 
+Recovery AST
+^^^^^^^^^^^^
+
+clang's AST now improves support for representing broken C++ code. This improves
+the quality of subsequent diagnostics after an error is encountered. It also
+exposes more information to tools like clang-tidy and clangd that consume
+clang’s AST, allowing them to be more accurate on broken code.
+
+A RecoveryExpr is introduced in clang's AST, marking an expression containing
+semantic errors. This preserves the source range and subexpressions of the
+broken expression in the AST (rather than discarding the whole expression).
+
+For the following invalid code:
+
+  .. code-block:: c++
+
+     int NoArg(); // Line 1
+     int x = NoArg(42); // oops!
+
+clang-10 produces the minimal placeholder:
+
+  .. code-block:: c++
+
+     // VarDecl  col:5 x 'int'
+
+clang-11 produces a richer AST:
+
+  .. code-block:: c++
+
+     // VarDecl  col:5 x 'int' cinit
+     // `-RecoveryExpr  '' contains-errors lvalue
+     //    `-UnresolvedLookupExpr  '' lvalue (ADL) = 'NoArg'
+     //    `-IntegerLiteral  'int' 42
+
+Note that error-dependent types and values may now occur outside a template
+context. Tools may need to adjust assumptions about dependent code.
+
+This feature is on by default for C++ code, and can be explicitly controlled
+with `-Xclang -f[no-]recovery-ast`.
+
 Improvements to Clang's diagnostics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

From b931e22c954374acf75c4f1d1f2666f3f8e67470 Mon Sep 17 00:00:00 2001
From: Kai Luo 
Date: Fri, 28 Aug 2020 01:56:12 +0000
Subject: [PATCH 186/363] [PowerPC] PPCBoolRetToInt: Don't translate Constant's
 operands

When collecting `i1` values via `findAllDefs`, ignore Constant's
operands, since Constant's operands might not be `i1`.

Fixes https://bugs.llvm.org/show_bug.cgi?id=46923 which causes ICE
```
llvm-project/llvm/lib/IR/Constants.cpp:1924: static llvm::Constant *llvm::ConstantExpr::getZExt(llvm::Constant *, llvm::Type *, bool): Assertion `C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&& "SrcTy must be smaller than DestTy for ZExt!"' failed.
```

Differential Revision: https://reviews.llvm.org/D85007

(cherry picked from commit cbea17568f4301582c1d5d43990f089ca6cff522)
---
 llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp | 15 ++++++-----
 llvm/test/CodeGen/PowerPC/pr46923.ll        | 29 +++++++++++++++++++++
 2 files changed, 38 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr46923.ll

diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 2259a29f838a..f125ca011cd2 100644
--- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -78,9 +78,9 @@ class PPCBoolRetToInt : public FunctionPass {
       Value *Curr = WorkList.back();
       WorkList.pop_back();
       auto *CurrUser = dyn_cast<User>(Curr);
-      // Operands of CallInst are skipped because they may not be Bool type,
-      // and their positions are defined by ABI.
-      if (CurrUser && !isa<CallInst>(Curr))
+      // Operands of CallInst/Constant are skipped because they may not be Bool
+      // type. For CallInst, their positions are defined by ABI.
+      if (CurrUser && !isa<CallInst>(Curr) && !isa<Constant>(Curr))
         for (auto &Op : CurrUser->operands())
           if (Defs.insert(Op).second)
             WorkList.push_back(Op);
@@ -90,6 +90,9 @@ class PPCBoolRetToInt : public FunctionPass {
 
   // Translate a i1 value to an equivalent i32/i64 value:
   Value *translate(Value *V) {
+    assert(V->getType() == Type::getInt1Ty(V->getContext()) &&
+           "Expect an i1 value");
+
     Type *IntTy = ST->isPPC64() ? Type::getInt64Ty(V->getContext())
                                 : Type::getInt32Ty(V->getContext());
 
@@ -252,9 +255,9 @@ class PPCBoolRetToInt : public FunctionPass {
       auto *First = dyn_cast<User>(Pair.first);
       auto *Second = dyn_cast<User>(Pair.second);
       assert((!First || Second) && "translated from user to non-user!?");
-      // Operands of CallInst are skipped because they may not be Bool type,
-      // and their positions are defined by ABI.
-      if (First && !isa<CallInst>(First))
+      // Operands of CallInst/Constant are skipped because they may not be Bool
+      // type. For CallInst, their positions are defined by ABI.
+      if (First && !isa<CallInst>(First) && !isa<Constant>(First))
         for (unsigned i = 0; i < First->getNumOperands(); ++i)
           Second->setOperand(i, BoolToIntMap[First->getOperand(i)]);
     }
diff --git a/llvm/test/CodeGen/PowerPC/pr46923.ll b/llvm/test/CodeGen/PowerPC/pr46923.ll
new file mode 100644
index 000000000000..3e9faa60422a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr46923.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names < %s | FileCheck %s
+
+@bar = external constant i64, align 8
+
+define i1 @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    li r3, 0
+; CHECK-NEXT:    isel r3, 0, r3, 4*cr5+lt
+; CHECK-NEXT:    blr
+entry:
+  br label %next
+
+next:
+  br i1 undef, label %true, label %false
+
+true:
+  br label %end
+
+false:
+  br label %end
+
+end:
+  %a = phi i1 [ icmp ugt (i64 0, i64 ptrtoint (i64* @bar to i64)), %true ],
+              [ icmp ugt (i64 0, i64 2), %false ]
+  ret i1 %a
+}

From aa0dcfb1179b0916e0315f2125fd35af6d6869d3 Mon Sep 17 00:00:00 2001
From: KAWASHIMA Takahiro 
Date: Fri, 28 Aug 2020 18:00:59 +0900
Subject: [PATCH 187/363] [release][docs] Add -mtls-size= option to the release
 note

---
 clang/docs/ReleaseNotes.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 83877d0d95a2..c32952cce51d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -256,6 +256,12 @@ New Compiler Flags
 
 - ``-fsanitize-coverage-allowlist`` and ``-fsanitize-coverage-blocklist`` are added.
 
+- -mtls-size={12,24,32,48} allows selecting the size of the TLS (thread-local
+  storage) in the local exec TLS model of AArch64, which is the default TLS
+  model for non-PIC objects. Each value represents 4KB, 16MB (default), 4GB,
+  and 256TB (needs -mcmodel=large). This allows large/many thread local
+  variables or a compact/fast code in an executable.
+
 Deprecated Compiler Flags
 -------------------------
 

From ba3413982cbd7a5b5aeaf2ea34e0a91d5561202d Mon Sep 17 00:00:00 2001
From: Lucas Prates 
Date: Thu, 27 Aug 2020 15:31:40 +0100
Subject: [PATCH 188/363] [CodeGen] Properly propagating Calling Convention
 information when lowering vector arguments

When joining the legal parts of vector arguments into its original value
during the lower of Formal Arguments in SelectionDAGBuilder, the Calling
Convention information was not being propagated for the handling of each
individual parts. The same did not happen when lowering calls, causing a
mismatch.

This patch fixes the issue by properly propagating the Calling
Convention details.

This fixes Bugzilla #47001.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D86715

(cherry picked from commit 3d943bcd223e5b97179840c2f5885fe341e51747)
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  4 +-
 llvm/test/CodeGen/ARM/fp16-args.ll            | 89 +++++++++++++++++--
 llvm/test/CodeGen/ARM/fp16-v3.ll              |  3 -
 3 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1d596c89c911..feb949f81eba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -409,7 +409,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
       // as appropriate.
       for (unsigned i = 0; i != NumParts; ++i)
         Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
-                                  PartVT, IntermediateVT, V);
+                                  PartVT, IntermediateVT, V, CallConv);
     } else if (NumParts > 0) {
       // If the intermediate type was expanded, build the intermediate
       // operands from the parts.
@@ -418,7 +418,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
       unsigned Factor = NumParts / NumIntermediates;
       for (unsigned i = 0; i != NumIntermediates; ++i)
         Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
-                                  PartVT, IntermediateVT, V);
+                                  PartVT, IntermediateVT, V, CallConv);
     }
 
     // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
diff --git a/llvm/test/CodeGen/ARM/fp16-args.ll b/llvm/test/CodeGen/ARM/fp16-args.ll
index 7ed1e883eef1..18bbcd12c768 100644
--- a/llvm/test/CodeGen/ARM/fp16-args.ll
+++ b/llvm/test/CodeGen/ARM/fp16-args.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
-; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
-; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT
-; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD
-; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
-; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
-; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-SOFT
-; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FULL-HARD
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=SOFT
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=HARD
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-SOFT --check-prefix=FULL-SOFT-LE
+; RUN: llc -mtriple=armv7a--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-HARD --check-prefix=FULL-HARD-LE
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fp16 < %s | FileCheck %s --check-prefix=SOFT
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fp16 < %s | FileCheck %s --check-prefix=HARD
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi soft -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-SOFT --check-prefix=FULL-SOFT-BE
+; RUN: llc -mtriple=armv7aeb--none-eabi -float-abi hard -mattr=+fullfp16 < %s | FileCheck %s --check-prefix=FULL-HARD --check-prefix=FULL-HARD-BE
 
 define half @foo(half %a, half %b) {
 ; SOFT-LABEL: foo:
@@ -44,3 +44,76 @@ entry:
   %0 = fadd half %a, %b
   ret half %0
 }
+
+define <4 x half> @foo_vec(<4 x half> %a) {
+; SOFT-LABEL: foo_vec:
+; SOFT:       @ %bb.0: @ %entry
+; SOFT-NEXT:    vmov s0, r3
+; SOFT-NEXT:    vmov s2, r1
+; SOFT-NEXT:    vcvtb.f32.f16 s0, s0
+; SOFT-NEXT:    vmov s4, r0
+; SOFT-NEXT:    vcvtb.f32.f16 s2, s2
+; SOFT-NEXT:    vmov s6, r2
+; SOFT-NEXT:    vcvtb.f32.f16 s4, s4
+; SOFT-NEXT:    vcvtb.f32.f16 s6, s6
+; SOFT-NEXT:    vadd.f32 s0, s0, s0
+; SOFT-NEXT:    vadd.f32 s2, s2, s2
+; SOFT-NEXT:    vcvtb.f16.f32 s0, s0
+; SOFT-NEXT:    vadd.f32 s4, s4, s4
+; SOFT-NEXT:    vcvtb.f16.f32 s2, s2
+; SOFT-NEXT:    vadd.f32 s6, s6, s6
+; SOFT-NEXT:    vcvtb.f16.f32 s4, s4
+; SOFT-NEXT:    vcvtb.f16.f32 s6, s6
+; SOFT-NEXT:    vmov r0, s4
+; SOFT-NEXT:    vmov r1, s2
+; SOFT-NEXT:    vmov r2, s6
+; SOFT-NEXT:    vmov r3, s0
+; SOFT-NEXT:    bx lr
+;
+; HARD-LABEL: foo_vec:
+; HARD:       @ %bb.0: @ %entry
+; HARD-NEXT:    vcvtb.f32.f16 s4, s3
+; HARD-NEXT:    vcvtb.f32.f16 s2, s2
+; HARD-NEXT:    vcvtb.f32.f16 s6, s1
+; HARD-NEXT:    vcvtb.f32.f16 s0, s0
+; HARD-NEXT:    vadd.f32 s2, s2, s2
+; HARD-NEXT:    vadd.f32 s0, s0, s0
+; HARD-NEXT:    vcvtb.f16.f32 s2, s2
+; HARD-NEXT:    vadd.f32 s4, s4, s4
+; HARD-NEXT:    vcvtb.f16.f32 s0, s0
+; HARD-NEXT:    vadd.f32 s6, s6, s6
+; HARD-NEXT:    vcvtb.f16.f32 s3, s4
+; HARD-NEXT:    vcvtb.f16.f32 s1, s6
+; HARD-NEXT:    bx lr
+;
+; FULL-SOFT-LE-LABEL: foo_vec:
+; FULL-SOFT-LE:       @ %bb.0: @ %entry
+; FULL-SOFT-LE-NEXT:    vmov d16, r0, r1
+; FULL-SOFT-LE-NEXT:    vadd.f16 d16, d16, d16
+; FULL-SOFT-LE-NEXT:    vmov r0, r1, d16
+; FULL-SOFT-LE-NEXT:    bx lr
+;
+; FULL-HARD-LE-LABEL: foo_vec:
+; FULL-HARD-LE:       @ %bb.0: @ %entry
+; FULL-HARD-LE-NEXT:    vadd.f16 d0, d0, d0
+; FULL-HARD-LE-NEXT:    bx lr
+;
+; FULL-SOFT-BE-LABEL: foo_vec:
+; FULL-SOFT-BE:       @ %bb.0: @ %entry
+; FULL-SOFT-BE-NEXT:    vmov d16, r1, r0
+; FULL-SOFT-BE-NEXT:    vrev64.16 d16, d16
+; FULL-SOFT-BE-NEXT:    vadd.f16 d16, d16, d16
+; FULL-SOFT-BE-NEXT:    vrev64.16 d16, d16
+; FULL-SOFT-BE-NEXT:    vmov r1, r0, d16
+; FULL-SOFT-BE-NEXT:    bx lr
+;
+; FULL-HARD-BE-LABEL: foo_vec:
+; FULL-HARD-BE:       @ %bb.0: @ %entry
+; FULL-HARD-BE-NEXT:    vrev64.16 d16, d0
+; FULL-HARD-BE-NEXT:    vadd.f16 d16, d16, d16
+; FULL-HARD-BE-NEXT:    vrev64.16 d0, d16
+; FULL-HARD-BE-NEXT:    bx lr
+entry:
+  %0 = fadd <4 x half> %a, %a
+  ret <4 x half> %0
+}
diff --git a/llvm/test/CodeGen/ARM/fp16-v3.ll b/llvm/test/CodeGen/ARM/fp16-v3.ll
index e84fee2c2e1b..085503e80c7f 100644
--- a/llvm/test/CodeGen/ARM/fp16-v3.ll
+++ b/llvm/test/CodeGen/ARM/fp16-v3.ll
@@ -28,9 +28,6 @@ define void @test_vec3(<3 x half>* %arr, i32 %i) #0 {
 }
 
 ; CHECK-LABEL: test_bitcast:
-; CHECK: vcvtb.f16.f32
-; CHECK: vcvtb.f16.f32
-; CHECK: vcvtb.f16.f32
 ; CHECK: pkhbt
 ; CHECK: uxth
 define void @test_bitcast(<3 x half> %inp, <3 x i16>* %arr) #0 {

From dae9fe408793def8a49f5e1d10d2a859627785e3 Mon Sep 17 00:00:00 2001
From: Anastasia Stulova 
Date: Fri, 28 Aug 2020 11:10:11 +0200
Subject: [PATCH 189/363] [OpenCL][Docs] 10.x release notes

Summary of major changes for OpenCL support in clang 11.

Differential revision: https://reviews.llvm.org/D86626
---
 clang/docs/ReleaseNotes.rst | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c32952cce51d..9d0ab935063f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -382,10 +382,36 @@ C++1z Feature Support
 Objective-C Language Changes in Clang
 -------------------------------------
 
-OpenCL C Language Changes in Clang
-----------------------------------
+OpenCL Kernel Language Changes in Clang
+---------------------------------------
 
-...
+- Added extensions from `cl_khr_subgroup_extensions` to clang and the internal
+  header.
+
+- Added rocm device libs linking for AMDGPU.
+
+- Added diagnostic for OpenCL 2.0 blocks used in function arguments.
+
+- Fixed MS mangling for OpenCL 2.0 pipe type specifier.
+
+- Improved command line options for fast relaxed math.
+
+- Improved `atomic_fetch_min/max` functions in the internal header
+  (`opencl-c.h`).
+
+- Improved size of builtin function table for `TableGen`-based internal header
+  (enabled by `-fdeclare-opencl-builtins`) and added new functionality for
+  OpenCL 2.0 atomics, pipes, enqueue kernel, `cl_khr_subgroups`,
+  `cl_arm_integer_dot_product`.
+
+Changes related to C++ for OpenCL
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Added `addrspace_cast` operator.
+
+- Improved address space deduction in templates.
+
+- Improved diagnostics of address spaces in nested pointer conversions.
 
 ABI Changes in Clang
 --------------------

From bc1425082be4a01a661fda974a8b90bfbbd14faf Mon Sep 17 00:00:00 2001
From: Sander de Smalen 
Date: Wed, 19 Aug 2020 10:34:25 +0100
Subject: [PATCH 190/363] [Clang][SVE] NFC: Move info about ACLE types into
 separate function.

This function returns a struct `BuiltinVectorTypeInfo` that contains
the builtin vector's element type, element count and number of vectors
(used for vector tuples).

Reviewed By: c-rhodes

Differential Revision: https://reviews.llvm.org/D86100

(cherry picked from commit 0353848cc94f0fc23a953f8f420be7ee3342c8dc)
---
 clang/include/clang/AST/ASTContext.h |  16 ++++
 clang/lib/AST/ASTContext.cpp         | 113 +++++++++++++++++++++++++++
 clang/lib/CodeGen/CodeGenTypes.cpp   |  51 ++----------
 3 files changed, 135 insertions(+), 45 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 2b988be60da9..9020e6629d08 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -60,6 +60,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/TypeSize.h"
 #include 
 #include 
 #include 
@@ -1297,6 +1298,21 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// Returns a vla type where known sizes are replaced with [*].
   QualType getVariableArrayDecayedType(QualType Ty) const;
 
+  // Convenience struct to return information about a builtin vector type.
+  struct BuiltinVectorTypeInfo {
+    QualType ElementType;
+    llvm::ElementCount EC;
+    unsigned NumVectors;
+    BuiltinVectorTypeInfo(QualType ElementType, llvm::ElementCount EC,
+                          unsigned NumVectors)
+        : ElementType(ElementType), EC(EC), NumVectors(NumVectors) {}
+  };
+
+  /// Returns the element type, element count and number of vectors
+  /// (in case of tuple) for a builtin vector type.
+  BuiltinVectorTypeInfo
+  getBuiltinVectorTypeInfo(const BuiltinType *VecTy) const;
+
   /// Return the unique reference to a scalable vector type of the specified
   /// element type and scalable number of elements.
   ///
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index e3798bb46e86..bf51d35d9693 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3634,6 +3634,119 @@ QualType ASTContext::getIncompleteArrayType(QualType elementType,
   return QualType(newType, 0);
 }
 
+ASTContext::BuiltinVectorTypeInfo
+ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const {
+#define SVE_INT_ELTTY(BITS, ELTS, SIGNED, NUMVECTORS)                          \
+  {getIntTypeForBitwidth(BITS, SIGNED), llvm::ElementCount(ELTS, true),        \
+   NUMVECTORS};
+
+#define SVE_ELTTY(ELTTY, ELTS, NUMVECTORS)                                     \
+  {ELTTY, llvm::ElementCount(ELTS, true), NUMVECTORS};
+
+  switch (Ty->getKind()) {
+  default:
+    llvm_unreachable("Unsupported builtin vector type");
+  case BuiltinType::SveInt8:
+    return SVE_INT_ELTTY(8, 16, true, 1);
+  case BuiltinType::SveUint8:
+    return SVE_INT_ELTTY(8, 16, false, 1);
+  case BuiltinType::SveInt8x2:
+    return SVE_INT_ELTTY(8, 16, true, 2);
+  case BuiltinType::SveUint8x2:
+    return SVE_INT_ELTTY(8, 16, false, 2);
+  case BuiltinType::SveInt8x3:
+    return SVE_INT_ELTTY(8, 16, true, 3);
+  case BuiltinType::SveUint8x3:
+    return SVE_INT_ELTTY(8, 16, false, 3);
+  case BuiltinType::SveInt8x4:
+    return SVE_INT_ELTTY(8, 16, true, 4);
+  case BuiltinType::SveUint8x4:
+    return SVE_INT_ELTTY(8, 16, false, 4);
+  case BuiltinType::SveInt16:
+    return SVE_INT_ELTTY(16, 8, true, 1);
+  case BuiltinType::SveUint16:
+    return SVE_INT_ELTTY(16, 8, false, 1);
+  case BuiltinType::SveInt16x2:
+    return SVE_INT_ELTTY(16, 8, true, 2);
+  case BuiltinType::SveUint16x2:
+    return SVE_INT_ELTTY(16, 8, false, 2);
+  case BuiltinType::SveInt16x3:
+    return SVE_INT_ELTTY(16, 8, true, 3);
+  case BuiltinType::SveUint16x3:
+    return SVE_INT_ELTTY(16, 8, false, 3);
+  case BuiltinType::SveInt16x4:
+    return SVE_INT_ELTTY(16, 8, true, 4);
+  case BuiltinType::SveUint16x4:
+    return SVE_INT_ELTTY(16, 8, false, 4);
+  case BuiltinType::SveInt32:
+    return SVE_INT_ELTTY(32, 4, true, 1);
+  case BuiltinType::SveUint32:
+    return SVE_INT_ELTTY(32, 4, false, 1);
+  case BuiltinType::SveInt32x2:
+    return SVE_INT_ELTTY(32, 4, true, 2);
+  case BuiltinType::SveUint32x2:
+    return SVE_INT_ELTTY(32, 4, false, 2);
+  case BuiltinType::SveInt32x3:
+    return SVE_INT_ELTTY(32, 4, true, 3);
+  case BuiltinType::SveUint32x3:
+    return SVE_INT_ELTTY(32, 4, false, 3);
+  case BuiltinType::SveInt32x4:
+    return SVE_INT_ELTTY(32, 4, true, 4);
+  case BuiltinType::SveUint32x4:
+    return SVE_INT_ELTTY(32, 4, false, 4);
+  case BuiltinType::SveInt64:
+    return SVE_INT_ELTTY(64, 2, true, 1);
+  case BuiltinType::SveUint64:
+    return SVE_INT_ELTTY(64, 2, false, 1);
+  case BuiltinType::SveInt64x2:
+    return SVE_INT_ELTTY(64, 2, true, 2);
+  case BuiltinType::SveUint64x2:
+    return SVE_INT_ELTTY(64, 2, false, 2);
+  case BuiltinType::SveInt64x3:
+    return SVE_INT_ELTTY(64, 2, true, 3);
+  case BuiltinType::SveUint64x3:
+    return SVE_INT_ELTTY(64, 2, false, 3);
+  case BuiltinType::SveInt64x4:
+    return SVE_INT_ELTTY(64, 2, true, 4);
+  case BuiltinType::SveUint64x4:
+    return SVE_INT_ELTTY(64, 2, false, 4);
+  case BuiltinType::SveBool:
+    return SVE_ELTTY(BoolTy, 16, 1);
+  case BuiltinType::SveFloat16:
+    return SVE_ELTTY(HalfTy, 8, 1);
+  case BuiltinType::SveFloat16x2:
+    return SVE_ELTTY(HalfTy, 8, 2);
+  case BuiltinType::SveFloat16x3:
+    return SVE_ELTTY(HalfTy, 8, 3);
+  case BuiltinType::SveFloat16x4:
+    return SVE_ELTTY(HalfTy, 8, 4);
+  case BuiltinType::SveFloat32:
+    return SVE_ELTTY(FloatTy, 4, 1);
+  case BuiltinType::SveFloat32x2:
+    return SVE_ELTTY(FloatTy, 4, 2);
+  case BuiltinType::SveFloat32x3:
+    return SVE_ELTTY(FloatTy, 4, 3);
+  case BuiltinType::SveFloat32x4:
+    return SVE_ELTTY(FloatTy, 4, 4);
+  case BuiltinType::SveFloat64:
+    return SVE_ELTTY(DoubleTy, 2, 1);
+  case BuiltinType::SveFloat64x2:
+    return SVE_ELTTY(DoubleTy, 2, 2);
+  case BuiltinType::SveFloat64x3:
+    return SVE_ELTTY(DoubleTy, 2, 3);
+  case BuiltinType::SveFloat64x4:
+    return SVE_ELTTY(DoubleTy, 2, 4);
+  case BuiltinType::SveBFloat16:
+    return SVE_ELTTY(BFloat16Ty, 8, 1);
+  case BuiltinType::SveBFloat16x2:
+    return SVE_ELTTY(BFloat16Ty, 8, 2);
+  case BuiltinType::SveBFloat16x3:
+    return SVE_ELTTY(BFloat16Ty, 8, 3);
+  case BuiltinType::SveBFloat16x4:
+    return SVE_ELTTY(BFloat16Ty, 8, 4);
+  }
+}
+
 /// getScalableVectorType - Return the unique reference to a scalable vector
 /// type of the specified element type and size. VectorType must be a built-in
 /// type.
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index d431c0263666..4792c10ecdae 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -533,99 +533,60 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     case BuiltinType::OCLReserveID:
       ResultType = CGM.getOpenCLRuntime().convertOpenCLSpecificType(Ty);
       break;
-#define GET_SVE_INT_VEC(BITS, ELTS)                                            \
-  llvm::ScalableVectorType::get(                                               \
-      llvm::IntegerType::get(getLLVMContext(), BITS), ELTS);
     case BuiltinType::SveInt8:
     case BuiltinType::SveUint8:
-      return GET_SVE_INT_VEC(8, 16);
     case BuiltinType::SveInt8x2:
     case BuiltinType::SveUint8x2:
-      return GET_SVE_INT_VEC(8, 32);
     case BuiltinType::SveInt8x3:
     case BuiltinType::SveUint8x3:
-      return GET_SVE_INT_VEC(8, 48);
     case BuiltinType::SveInt8x4:
     case BuiltinType::SveUint8x4:
-      return GET_SVE_INT_VEC(8, 64);
     case BuiltinType::SveInt16:
     case BuiltinType::SveUint16:
-      return GET_SVE_INT_VEC(16, 8);
     case BuiltinType::SveInt16x2:
     case BuiltinType::SveUint16x2:
-      return GET_SVE_INT_VEC(16, 16);
     case BuiltinType::SveInt16x3:
     case BuiltinType::SveUint16x3:
-      return GET_SVE_INT_VEC(16, 24);
     case BuiltinType::SveInt16x4:
     case BuiltinType::SveUint16x4:
-      return GET_SVE_INT_VEC(16, 32);
     case BuiltinType::SveInt32:
     case BuiltinType::SveUint32:
-      return GET_SVE_INT_VEC(32, 4);
     case BuiltinType::SveInt32x2:
     case BuiltinType::SveUint32x2:
-      return GET_SVE_INT_VEC(32, 8);
     case BuiltinType::SveInt32x3:
     case BuiltinType::SveUint32x3:
-      return GET_SVE_INT_VEC(32, 12);
     case BuiltinType::SveInt32x4:
     case BuiltinType::SveUint32x4:
-      return GET_SVE_INT_VEC(32, 16);
     case BuiltinType::SveInt64:
     case BuiltinType::SveUint64:
-      return GET_SVE_INT_VEC(64, 2);
     case BuiltinType::SveInt64x2:
     case BuiltinType::SveUint64x2:
-      return GET_SVE_INT_VEC(64, 4);
     case BuiltinType::SveInt64x3:
     case BuiltinType::SveUint64x3:
-      return GET_SVE_INT_VEC(64, 6);
     case BuiltinType::SveInt64x4:
     case BuiltinType::SveUint64x4:
-      return GET_SVE_INT_VEC(64, 8);
     case BuiltinType::SveBool:
-      return GET_SVE_INT_VEC(1, 16);
-#undef GET_SVE_INT_VEC
-#define GET_SVE_FP_VEC(TY, ISFP16, ELTS)                                       \
-  llvm::ScalableVectorType::get(                                               \
-      getTypeForFormat(getLLVMContext(),                                       \
-                       Context.getFloatTypeSemantics(Context.TY),              \
-                       /* UseNativeHalf = */ ISFP16),                          \
-      ELTS);
     case BuiltinType::SveFloat16:
-      return GET_SVE_FP_VEC(HalfTy, true, 8);
     case BuiltinType::SveFloat16x2:
-      return GET_SVE_FP_VEC(HalfTy, true, 16);
     case BuiltinType::SveFloat16x3:
-      return GET_SVE_FP_VEC(HalfTy, true, 24);
     case BuiltinType::SveFloat16x4:
-      return GET_SVE_FP_VEC(HalfTy, true, 32);
     case BuiltinType::SveFloat32:
-      return GET_SVE_FP_VEC(FloatTy, false, 4);
     case BuiltinType::SveFloat32x2:
-      return GET_SVE_FP_VEC(FloatTy, false, 8);
     case BuiltinType::SveFloat32x3:
-      return GET_SVE_FP_VEC(FloatTy, false, 12);
     case BuiltinType::SveFloat32x4:
-      return GET_SVE_FP_VEC(FloatTy, false, 16);
     case BuiltinType::SveFloat64:
-      return GET_SVE_FP_VEC(DoubleTy, false, 2);
     case BuiltinType::SveFloat64x2:
-      return GET_SVE_FP_VEC(DoubleTy, false, 4);
     case BuiltinType::SveFloat64x3:
-      return GET_SVE_FP_VEC(DoubleTy, false, 6);
     case BuiltinType::SveFloat64x4:
-      return GET_SVE_FP_VEC(DoubleTy, false, 8);
     case BuiltinType::SveBFloat16:
-      return GET_SVE_FP_VEC(BFloat16Ty, false, 8);
     case BuiltinType::SveBFloat16x2:
-      return GET_SVE_FP_VEC(BFloat16Ty, false, 16);
     case BuiltinType::SveBFloat16x3:
-      return GET_SVE_FP_VEC(BFloat16Ty, false, 24);
-    case BuiltinType::SveBFloat16x4:
-      return GET_SVE_FP_VEC(BFloat16Ty, false, 32);
-#undef GET_SVE_FP_VEC
+    case BuiltinType::SveBFloat16x4: {
+      ASTContext::BuiltinVectorTypeInfo Info =
+          Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
+      return llvm::ScalableVectorType::get(ConvertType(Info.ElementType),
+                                           Info.EC.Min * Info.NumVectors);
+    }
     case BuiltinType::Dependent:
 #define BUILTIN_TYPE(Id, SingletonId)
 #define PLACEHOLDER_TYPE(Id, SingletonId) \

From 2bde1011ba1a794a0391a37d41d0b461dec89d54 Mon Sep 17 00:00:00 2001
From: Sander de Smalen 
Date: Wed, 19 Aug 2020 11:06:51 +0100
Subject: [PATCH 191/363] [AArch64][SVE] Fix calculation restore point for SVE
 callee saves.

This fixes an issue where the restore point of callee-saves in the
function epilogues was incorrectly calculated when the basic block
consisted of only a RET instruction. This caused dealloc instructions
to be inserted in between the block of callee-save restore instructions,
rather than before it.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D86099

(cherry picked from commit 5f47d4456d192eaea8c56a2b4648023c8743c927)
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  7 ++--
 .../framelayout-sve-calleesaves-fix.mir       | 36 +++++++++++++++++++
 2 files changed, 39 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 83653dcbb8cf..c6cc6e9e8471 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1694,11 +1694,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
   MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
-    RestoreBegin = std::prev(RestoreEnd);;
-    while (IsSVECalleeSave(RestoreBegin) &&
-           RestoreBegin != MBB.begin())
+    RestoreBegin = std::prev(RestoreEnd);
+    while (RestoreBegin != MBB.begin() &&
+           IsSVECalleeSave(std::prev(RestoreBegin)))
       --RestoreBegin;
-    ++RestoreBegin;
 
     assert(IsSVECalleeSave(RestoreBegin) &&
            IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
new file mode 100644
index 000000000000..a3cbd39c6531
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+# RUN: llc -mattr=+sve -mtriple=aarch64-none-linux-gnu -start-before=prologepilog %s -o - | FileCheck %s
+
+--- |
+  define aarch64_sve_vector_pcs void @fix_restorepoint_p4() { entry: unreachable }
+  ; CHECK-LABEL: fix_restorepoint_p4:
+  ; CHECK:       // %bb.0: // %entry
+  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+  ; CHECK-NEXT:    addvl sp, sp, #-2
+  ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+  ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+  ; CHECK-NEXT:    addvl sp, sp, #-1
+  ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+  ; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+  ; CHECK-NEXT:    .cfi_offset w29, -16
+  ; CHECK-NEXT:    // implicit-def: $z8
+  ; CHECK-NEXT:    // implicit-def: $p4
+  ; CHECK-NEXT:    addvl sp, sp, #1
+  ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+  ; CHECK-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+  ; CHECK-NEXT:    addvl sp, sp, #2
+  ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+  ; CHECK-NEXT:    ret
+...
+name: fix_restorepoint_p4
+stack:
+  - { id: 0, stack-id: sve-vec, size: 16, alignment: 16 }
+body:             |
+  bb.0.entry:
+    $z8 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    B %bb.1
+
+  bb.1.entry:
+    RET_ReallyLR
+---

From f5c02e4855d3206e27e151dfdda457b95af33529 Mon Sep 17 00:00:00 2001
From: Sander de Smalen 
Date: Thu, 27 Aug 2020 08:12:43 +0100
Subject: [PATCH 192/363] [AArch64][SVE] Add missing debug info for ACLE types.

This patch adds type information for SVE ACLE vector types,
by describing them as vectors, with a lower bound of 0, and
an upper bound described by a DWARF expression using the
AArch64 Vector Granule register (VG), which contains the
runtime multiple of 64bit granules in an SVE vector.

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D86101

(cherry picked from commit 4e9b66de3f046c1e97b34c938b0920fa6401f40c)
---
 clang/lib/CodeGen/CGDebugInfo.cpp             | 46 ++++++++----
 .../CodeGen/aarch64-debug-sve-vector-types.c  | 71 +++++++++++++++++++
 .../aarch64-debug-sve-vectorx2-types.c        | 67 +++++++++++++++++
 .../aarch64-debug-sve-vectorx3-types.c        | 67 +++++++++++++++++
 .../aarch64-debug-sve-vectorx4-types.c        | 67 +++++++++++++++++
 clang/test/CodeGen/aarch64-sve.c              | 16 -----
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp     |  6 +-
 llvm/test/DebugInfo/AArch64/dbg-sve-types.ll  | 44 ++++++++++++
 8 files changed, 351 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-debug-sve-vector-types.c
 create mode 100644 clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c
 create mode 100644 clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c
 create mode 100644 clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c
 create mode 100644 llvm/test/DebugInfo/AArch64/dbg-sve-types.ll

diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 6965c4a1209c..703f5087370a 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -719,23 +719,39 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
   case BuiltinType::Id: \
     return getOrCreateStructPtrType("opencl_" #ExtType, Id##Ty);
 #include "clang/Basic/OpenCLExtensionTypes.def"
-  // TODO: real support for SVE types requires more infrastructure
-  // to be added first.  The types have a variable length and are
-  // represented in debug info as types whose length depends on a
-  // target-specific pseudo register.
-#define SVE_TYPE(Name, Id, SingletonId) \
-  case BuiltinType::Id:
+
+#define SVE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/AArch64SVEACLETypes.def"
-  {
-    unsigned DiagID = CGM.getDiags().getCustomDiagID(
-        DiagnosticsEngine::Error,
-        "cannot yet generate debug info for SVE type '%0'");
-    auto Name = BT->getName(CGM.getContext().getPrintingPolicy());
-    CGM.getDiags().Report(DiagID) << Name;
-    // Return something safe.
-    return CreateType(cast<BuiltinType>(CGM.getContext().IntTy));
-  }
+    {
+      ASTContext::BuiltinVectorTypeInfo Info =
+          CGM.getContext().getBuiltinVectorTypeInfo(BT);
+      unsigned NumElemsPerVG = (Info.EC.Min * Info.NumVectors) / 2;
+
+      // Debuggers can't extract 1bit from a vector, so will display a
+      // bitpattern for svbool_t instead.
+      if (Info.ElementType == CGM.getContext().BoolTy) {
+        NumElemsPerVG /= 8;
+        Info.ElementType = CGM.getContext().UnsignedCharTy;
+      }
 
+      auto *LowerBound =
+          llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(
+              llvm::Type::getInt64Ty(CGM.getLLVMContext()), 0));
+      SmallVector<uint64_t, 9> Expr(
+          {llvm::dwarf::DW_OP_constu, NumElemsPerVG, llvm::dwarf::DW_OP_bregx,
+           /* AArch64::VG */ 46, 0, llvm::dwarf::DW_OP_mul,
+           llvm::dwarf::DW_OP_constu, 1, llvm::dwarf::DW_OP_minus});
+      auto *UpperBound = DBuilder.createExpression(Expr);
+
+      llvm::Metadata *Subscript = DBuilder.getOrCreateSubrange(
+          /*count*/ nullptr, LowerBound, UpperBound, /*stride*/ nullptr);
+      llvm::DINodeArray SubscriptArray = DBuilder.getOrCreateArray(Subscript);
+      llvm::DIType *ElemTy =
+          getOrCreateType(Info.ElementType, TheCU->getFile());
+      auto Align = getTypeAlignIfRequired(BT, CGM.getContext());
+      return DBuilder.createVectorType(/*Size*/ 0, Align, ElemTy,
+                                       SubscriptArray);
+    }
   case BuiltinType::UChar:
   case BuiltinType::Char_U:
     Encoding = llvm::dwarf::DW_ATE_unsigned_char;
diff --git a/clang/test/CodeGen/aarch64-debug-sve-vector-types.c b/clang/test/CodeGen/aarch64-debug-sve-vector-types.c
new file mode 100644
index 000000000000..4325e3f44747
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-debug-sve-vector-types.c
@@ -0,0 +1,71 @@
+// RUN:  %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
+// RUN:  -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s
+
+void test_locals(void) {
+  // CHECK-DAG: name: "__SVBool_t",{{.*}}, baseType: ![[CT1:[0-9]+]]
+  // CHECK-DAG: ![[CT1]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYU8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64:[0-9]+]])
+  // CHECK-DAG: ![[ELTTYU8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+  // CHECK-DAG: ![[ELTS1_64]] = !{![[REALELTS1_64:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS1_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 1, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __SVBool_t b8;
+
+  // CHECK-DAG: name: "__SVInt8_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYS8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8:[0-9]+]])
+  // CHECK-DAG: ![[ELTTYS8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
+  // CHECK-DAG: ![[ELTS8]] = !{![[REALELTS8:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS8]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __SVInt8_t s8;
+
+  // CHECK-DAG: name: "__SVUint8_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTYU8]], flags: DIFlagVector, elements: ![[ELTS8]])
+  __SVUint8_t u8;
+
+  // CHECK-DAG: name: "__SVInt16_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS16]] = !{![[REALELTS16:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS16]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __SVInt16_t s16;
+
+  // CHECK-DAG: name: "__SVUint16_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned)
+  __SVUint16_t u16;
+
+  // CHECK-DAG: name: "__SVInt32_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS32]] = !{![[REALELTS32:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS32]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 2, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __SVInt32_t s32;
+
+  // CHECK-DAG: name: "__SVUint32_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  __SVUint32_t u32;
+
+  // CHECK-DAG: name: "__SVInt64_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+  __SVInt64_t s64;
+
+  // CHECK-DAG: name: "__SVUint64_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+  __SVUint64_t u64;
+
+  // CHECK:     name: "__SVFloat16_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float)
+  __SVFloat16_t f16;
+
+  // CHECK:     name: "__SVFloat32_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  __SVFloat32_t f32;
+
+  // CHECK:     name: "__SVFloat64_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+  __SVFloat64_t f64;
+}
diff --git a/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c
new file mode 100644
index 000000000000..0d874c0b557c
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx2-types.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
+// RUN:  -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s
+
+void test_locals(void) {
+  // CHECK-DAG: name: "__clang_svint8x2_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x2:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
+  // CHECK-DAG: ![[ELTS8x2]] = !{![[REALELTS8x2:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS8x2]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 16, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint8x2_t s8;
+
+  // CHECK-DAG: name: "__clang_svuint8x2_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x2]])
+  // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+  __clang_svuint8x2_t u8;
+
+  // CHECK-DAG: name: "__clang_svint16x2_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x2:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS16x2]] = !{![[REALELTS16x2:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS16x2]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint16x2_t s16;
+
+  // CHECK-DAG: name: "__clang_svuint16x2_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x2]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned)
+  __clang_svuint16x2_t u16;
+
+  // CHECK-DAG: name: "__clang_svint32x2_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x2:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS32x2]] = !{![[REALELTS32x2:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS32x2]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint32x2_t s32;
+
+  // CHECK-DAG: name: "__clang_svuint32x2_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x2]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  __clang_svuint32x2_t u32;
+
+  // CHECK-DAG: name: "__clang_svint64x2_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS1x2_64]] = !{![[REALELTS1x2_64:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS1x2_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 2, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint64x2_t s64;
+
+  // CHECK-DAG: name: "__clang_svuint64x2_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+  __clang_svuint64x2_t u64;
+
+  // CHECK:     name: "__clang_svfloat16x2_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x2]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float)
+  __clang_svfloat16x2_t f16;
+
+  // CHECK:     name: "__clang_svfloat32x2_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x2]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  __clang_svfloat32x2_t f32;
+
+  // CHECK:     name: "__clang_svfloat64x2_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x2_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+  __clang_svfloat64x2_t f64;
+}
diff --git a/clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c
new file mode 100644
index 000000000000..c5dde7d1295d
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx3-types.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
+// RUN:  -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s
+
+void test_locals(void) {
+  // CHECK-DAG: name: "__clang_svint8x3_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x3:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
+  // CHECK-DAG: ![[ELTS8x3]] = !{![[REALELTS8x3:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS8x3]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 24, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint8x3_t s8;
+
+  // CHECK-DAG: name: "__clang_svuint8x3_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x3]])
+  // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+  __clang_svuint8x3_t u8;
+
+  // CHECK-DAG: name: "__clang_svint16x3_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x3:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS16x3]] = !{![[REALELTS16x3:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS16x3]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 12, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint16x3_t s16;
+
+  // CHECK-DAG: name: "__clang_svuint16x3_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x3]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned)
+  __clang_svuint16x3_t u16;
+
+  // CHECK-DAG: name: "__clang_svint32x3_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x3:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS32x3]] = !{![[REALELTS32x3:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS32x3]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 6, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint32x3_t s32;
+
+  // CHECK-DAG: name: "__clang_svuint32x3_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x3]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  __clang_svuint32x3_t u32;
+
+  // CHECK-DAG: name: "__clang_svint64x3_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x3_64:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS1x3_64]] = !{![[REALELTS1x3_64:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS1x3_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 3, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint64x3_t s64;
+
+  // CHECK-DAG: name: "__clang_svuint64x3_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x3_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+  __clang_svuint64x3_t u64;
+
+  // CHECK:     name: "__clang_svfloat16x3_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x3]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float)
+  __clang_svfloat16x3_t f16;
+
+  // CHECK:     name: "__clang_svfloat32x3_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x3]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  __clang_svfloat32x3_t f32;
+
+  // CHECK:     name: "__clang_svfloat64x3_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x3_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+  __clang_svfloat64x3_t f64;
+}
diff --git a/clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c b/clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c
new file mode 100644
index 000000000000..90a266c53f90
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-debug-sve-vectorx4-types.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
+// RUN:  -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s
+
+void test_locals(void) {
+  // CHECK-DAG: name: "__clang_svint8x4_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x4:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
+  // CHECK-DAG: ![[ELTS8x4]] = !{![[REALELTS8x4:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS8x4]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 32, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint8x4_t s8;
+
+  // CHECK-DAG: name: "__clang_svuint8x4_t",{{.*}}, baseType: ![[CT8:[0-9]+]]
+  // CHECK-DAG: ![[CT8]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY8:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS8x4]])
+  // CHECK-DAG: ![[ELTTY8]] = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char)
+  __clang_svuint8x4_t u8;
+
+  // CHECK-DAG: name: "__clang_svint16x4_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x4:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS16x4]] = !{![[REALELTS16x4:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS16x4]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 16, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint16x4_t s16;
+
+  // CHECK-DAG: name: "__clang_svuint16x4_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x4]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned)
+  __clang_svuint16x4_t u16;
+
+  // CHECK-DAG: name: "__clang_svint32x4_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x4:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS32x4]] = !{![[REALELTS32x4:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS32x4]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint32x4_t s32;
+
+  // CHECK-DAG: name: "__clang_svuint32x4_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x4]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+  __clang_svuint32x4_t u32;
+
+  // CHECK-DAG: name: "__clang_svint64x4_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x4_64:[0-9]+]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+  // CHECK-DAG: ![[ELTS1x4_64]] = !{![[REALELTS1x4_64:[0-9]+]]}
+  // CHECK-DAG: ![[REALELTS1x4_64]] = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+  __clang_svint64x4_t s64;
+
+  // CHECK-DAG: name: "__clang_svuint64x4_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x4_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+  __clang_svuint64x4_t u64;
+
+  // CHECK:     name: "__clang_svfloat16x4_t",{{.*}}, baseType: ![[CT16:[0-9]+]]
+  // CHECK-DAG: ![[CT16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY16:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS16x4]])
+  // CHECK-DAG: ![[ELTTY16]] = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float)
+  __clang_svfloat16x4_t f16;
+
+  // CHECK:     name: "__clang_svfloat32x4_t",{{.*}}, baseType: ![[CT32:[0-9]+]]
+  // CHECK-DAG: ![[CT32]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY32:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS32x4]])
+  // CHECK-DAG: ![[ELTTY32]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  __clang_svfloat32x4_t f32;
+
+  // CHECK:     name: "__clang_svfloat64x4_t",{{.*}}, baseType: ![[CT64:[0-9]+]]
+  // CHECK-DAG: ![[CT64]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[ELTTY64:[0-9]+]], flags: DIFlagVector, elements: ![[ELTS1x4_64]])
+  // CHECK-DAG: ![[ELTTY64]] = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+  __clang_svfloat64x4_t f64;
+}
diff --git a/clang/test/CodeGen/aarch64-sve.c b/clang/test/CodeGen/aarch64-sve.c
index d21af74319f9..ebcf334f11d6 100644
--- a/clang/test/CodeGen/aarch64-sve.c
+++ b/clang/test/CodeGen/aarch64-sve.c
@@ -1,22 +1,6 @@
-// RUN: not %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
-// RUN:  -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s -check-prefix=CHECK-DEBUG
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \
 // RUN:  -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefix=CHECK
 
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt8_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt16_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt32_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVInt64_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint8_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint16_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint32_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVUint64_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVFloat16_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVFloat32_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVFloat64_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVBFloat16_t'
-// CHECK-DEBUG: cannot yet generate debug info for SVE type '__SVBool_t'
-
 // CHECK: @ptr = global <vscale x 16 x i8>* null, align 8
 // CHECK: %s8 = alloca <vscale x 16 x i8>, align 16
 // CHECK: %s16 = alloca <vscale x 8 x i16>, align 16
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index e958f38e486b..ceeae14c1073 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1417,8 +1417,10 @@ static bool hasVectorBeenPadded(const DICompositeType *CTy) {
          Elements[0]->getTag() == dwarf::DW_TAG_subrange_type &&
          "Invalid vector element array, expected one element of type subrange");
   const auto Subrange = cast<DISubrange>(Elements[0]);
-  const auto CI = Subrange->getCount().get<ConstantInt *>();
-  const int32_t NumVecElements = CI->getSExtValue();
+  const auto NumVecElements =
+      Subrange->getCount()
+          ? Subrange->getCount().get<ConstantInt *>()->getSExtValue()
+          : 0;
 
   // Ensure we found the element count and that the actual size is wide
   // enough to contain the requested size.
diff --git a/llvm/test/DebugInfo/AArch64/dbg-sve-types.ll b/llvm/test/DebugInfo/AArch64/dbg-sve-types.ll
new file mode 100644
index 000000000000..62b86f294861
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/dbg-sve-types.ll
@@ -0,0 +1,44 @@
+; Test that the debug info for the vector type is correctly codegenerated
+; when the DISubrange has no count, but only an upperbound.
+; RUN: llc -mtriple aarch64 -mattr=+sve -filetype=obj -o %t %s
+; RUN: llvm-dwarfdump %t | FileCheck %s
+; RUN: rm %t
+
+; CHECK:      {{.*}}: DW_TAG_subrange_type
+; CHECK-NEXT:   DW_AT_type    ({{.*}} "__ARRAY_SIZE_TYPE__")
+; CHECK-NEXT:   DW_AT_upper_bound     (DW_OP_lit8, DW_OP_bregx VG+0, DW_OP_mul, DW_OP_lit1, DW_OP_minus)
+
+define <vscale x 16 x i8> @test_svint8_t(<vscale x 16 x i8> returned %op1) !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata <vscale x 16 x i8> %op1, metadata !19, metadata !DIExpression()), !dbg !20
+  ret <vscale x 16 x i8> %op1, !dbg !21
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "dbg-sve-types.ll", directory: "")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 12.0.0"}
+!7 = distinct !DISubprogram(name: "test_svint8_t", scope: !8, file: !8, line: 5, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !18)
+!8 = !DIFile(filename: "dbg-sve-types.ll", directory: "")
+!9 = !DISubroutineType(types: !10)
+!10 = !{!11, !11}
+!11 = !DIDerivedType(tag: DW_TAG_typedef, name: "svint8_t", file: !12, line: 32, baseType: !13)
+!12 = !DIFile(filename: "lib/clang/12.0.0/include/arm_sve.h", directory: "")
+!13 = !DIDerivedType(tag: DW_TAG_typedef, name: "__SVInt8_t", file: !1, baseType: !14)
+!14 = !DICompositeType(tag: DW_TAG_array_type, baseType: !15, flags: DIFlagVector, elements: !16)
+!15 = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char)
+!16 = !{!17}
+!17 = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus))
+!18 = !{!19}
+!19 = !DILocalVariable(name: "op1", arg: 1, scope: !7, file: !8, line: 5, type: !11)
+!20 = !DILocation(line: 0, scope: !7)
+!21 = !DILocation(line: 5, column: 39, scope: !7)

From e37a52fe921280105f5f291bb01722c391ef1123 Mon Sep 17 00:00:00 2001
From: Adam Czachorowski 
Date: Wed, 26 Aug 2020 16:20:01 +0200
Subject: [PATCH 193/363] [clang] Exclude invalid destructors from lookups.

This fixes a crash when declaring a destructor with a wrong name, then
writing result to pch file and loading it again. The PCH storage uses
DeclarationNameKey as key and it is the same key for both the invalid
destructor and the implicit one that was created because the other one
was invalid. When querying for the Foo::~Foo we end up getting
Foo::~Bar, which is then rejected and we end up with nullptr in
CXXRecordDecl::GetDestructor().

Fixes https://bugs.llvm.org/show_bug.cgi?id=47270

Differential Revision: https://reviews.llvm.org/D86624

(cherry picked from commit eed0af6179ca4fe9e60121e0829ed8d3849b1ce5)
---
 clang/lib/AST/DeclBase.cpp                | 7 +++++++
 clang/test/PCH/cxx-invalid-destructor.cpp | 4 ++++
 clang/test/PCH/cxx-invalid-destructor.h   | 7 +++++++
 3 files changed, 18 insertions(+)
 create mode 100644 clang/test/PCH/cxx-invalid-destructor.cpp
 create mode 100644 clang/test/PCH/cxx-invalid-destructor.h

diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index da1eadd9d931..f4314d0bd961 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1487,6 +1487,13 @@ static bool shouldBeHidden(NamedDecl *D) {
     if (FD->isFunctionTemplateSpecialization())
       return true;
 
+  // Hide destructors that are invalid. There should always be one destructor,
+  // but if it is an invalid decl, another one is created. We need to hide the
+  // invalid one from places that expect exactly one destructor, like the
+  // serialization code.
+  if (isa(D) && D->isInvalidDecl())
+    return true;
+
   return false;
 }
 
diff --git a/clang/test/PCH/cxx-invalid-destructor.cpp b/clang/test/PCH/cxx-invalid-destructor.cpp
new file mode 100644
index 000000000000..fc89cf1f3dfc
--- /dev/null
+++ b/clang/test/PCH/cxx-invalid-destructor.cpp
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -emit-pch -o %t %S/cxx-invalid-destructor.h -fallow-pch-with-compiler-errors
+// RUN: %clang_cc1 -x c++ -std=c++11 -include-pch %t %S/cxx-invalid-destructor.cpp -fsyntax-only -fno-validate-pch
+
+Foo f;
diff --git a/clang/test/PCH/cxx-invalid-destructor.h b/clang/test/PCH/cxx-invalid-destructor.h
new file mode 100644
index 000000000000..59095a37c203
--- /dev/null
+++ b/clang/test/PCH/cxx-invalid-destructor.h
@@ -0,0 +1,7 @@
+struct Base {
+  ~Base();
+};
+
+struct Foo : public Base {
+  ~Base();
+};

From 5b08e498cd35b05a937d6afd5cc20bde90822a29 Mon Sep 17 00:00:00 2001
From: Adam Balogh 
Date: Thu, 27 Aug 2020 08:01:43 -0700
Subject: [PATCH 194/363] [analyzer] NFC: Store the pointee/referenced type for
 dynamic type tracking.

Whether a dynamic cast succeeds depends only on the C++ class, not the pointer or reference. Thus if *A is a *B, then &A is a &B,
const *A is a const *B etc. This patch changes DynamicCastInfo to store
and check the cast between the unqualified pointed/referenced types.
It also removes e.g. SubstTemplateTypeParmType from both the pointer
and the pointed type.

Differential Revision: https://reviews.llvm.org/D85752

(cherry picked from commit 5a9e7789396e7618c1407aafc329e00584437a2f)
---
 .../StaticAnalyzer/Checkers/CastValueChecker.cpp    |  2 +-
 clang/lib/StaticAnalyzer/Core/DynamicType.cpp       | 13 +++++++++++++
 clang/test/Analysis/cast-value-state-dump.cpp       |  4 ++--
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp
index 1ef70b650414..3d1721f04875 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp
@@ -106,7 +106,7 @@ static const NoteTag *getNoteTag(CheckerContext &C,
                                  QualType CastToTy, const Expr *Object,
                                  bool CastSucceeds, bool IsKnownCast) {
   std::string CastToName =
-      CastInfo ? CastInfo->to()->getPointeeCXXRecordDecl()->getNameAsString()
+      CastInfo ? CastInfo->to()->getAsCXXRecordDecl()->getNameAsString()
                : CastToTy->getPointeeCXXRecordDecl()->getNameAsString();
   Object = Object->IgnoreParenImpCasts();
 
diff --git a/clang/lib/StaticAnalyzer/Core/DynamicType.cpp b/clang/lib/StaticAnalyzer/Core/DynamicType.cpp
index e9b64fd79614..9ed915aafcab 100644
--- a/clang/lib/StaticAnalyzer/Core/DynamicType.cpp
+++ b/clang/lib/StaticAnalyzer/Core/DynamicType.cpp
@@ -65,6 +65,13 @@ const DynamicTypeInfo *getRawDynamicTypeInfo(ProgramStateRef State,
   return State->get(MR);
 }
 
+static void unbox(QualType &Ty) {
+  // FIXME: Why are we being fed references to pointers in the first place?
+  while (Ty->isReferenceType() || Ty->isPointerType())
+    Ty = Ty->getPointeeType();
+  Ty = Ty.getCanonicalType().getUnqualifiedType();
+}
+
 const DynamicCastInfo *getDynamicCastInfo(ProgramStateRef State,
                                           const MemRegion *MR,
                                           QualType CastFromTy,
@@ -73,6 +80,9 @@ const DynamicCastInfo *getDynamicCastInfo(ProgramStateRef State,
   if (!Lookup)
     return nullptr;
 
+  unbox(CastFromTy);
+  unbox(CastToTy);
+
   for (const DynamicCastInfo &Cast : *Lookup)
     if (Cast.equals(CastFromTy, CastToTy))
       return &Cast;
@@ -112,6 +122,9 @@ ProgramStateRef setDynamicTypeAndCastInfo(ProgramStateRef State,
     State = State->set(MR, CastToTy);
   }
 
+  unbox(CastFromTy);
+  unbox(CastToTy);
+
   DynamicCastInfo::CastResult ResultKind =
       CastSucceeds ? DynamicCastInfo::CastResult::Success
                    : DynamicCastInfo::CastResult::Failure;
diff --git a/clang/test/Analysis/cast-value-state-dump.cpp b/clang/test/Analysis/cast-value-state-dump.cpp
index 3dffb78767cf..3e6a40cf1319 100644
--- a/clang/test/Analysis/cast-value-state-dump.cpp
+++ b/clang/test/Analysis/cast-value-state-dump.cpp
@@ -35,8 +35,8 @@ void evalNonNullParamNonNullReturn(const Shape *S) {
   // CHECK-NEXT: ],
   // CHECK-NEXT: "dynamic_casts": [
   // CHECK:        { "region": "SymRegion{reg_$0}", "casts": [
-  // CHECK-NEXT:     { "from": "const struct clang::Shape *", "to": "const class clang::Circle *", "kind": "success" },
-  // CHECK-NEXT:     { "from": "const struct clang::Shape *", "to": "const class clang::Square *", "kind": "fail" }
+  // CHECK-NEXT:     { "from": "struct clang::Shape", "to": "class clang::Circle", "kind": "success" },
+  // CHECK-NEXT:     { "from": "struct clang::Shape", "to": "class clang::Square", "kind": "fail" }
   // CHECK-NEXT:   ] }
 
   (void)(1 / !C);

From 22bce848a0b27b5ce7ef1e086054522a39c70651 Mon Sep 17 00:00:00 2001
From: Adam Balogh 
Date: Thu, 27 Aug 2020 08:06:10 -0700
Subject: [PATCH 195/363] [analyzer] pr47037: CastValueChecker: Support for the
 new variadic isa<>.

llvm::isa<>() and llvm::isa_and_nonnull<>() template functions recently became
variadic. Unfortunately this causes crashes in the case of isa_and_nonnull<>()
and incorrect behavior in isa<>(). This patch fixes this issue.

Differential Revision: https://reviews.llvm.org/D85728

(cherry picked from commit 4448affede5100658530aea8793ae7a7bc05a110)
---
 .../Checkers/CastValueChecker.cpp             | 136 ++++++++++++++----
 clang/test/Analysis/Inputs/llvm.h             |  18 ++-
 clang/test/Analysis/cast-value-logic.cpp      |  19 +++
 clang/test/Analysis/cast-value-notes.cpp      |  80 +++++++++--
 4 files changed, 209 insertions(+), 44 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp
index 3d1721f04875..528f68c6c429 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CastValueChecker.cpp
@@ -135,6 +135,47 @@ static const NoteTag *getNoteTag(CheckerContext &C,
       /*IsPrunable=*/true);
 }
 
+static const NoteTag *getNoteTag(CheckerContext &C,
+                                 SmallVector CastToTyVec,
+                                 const Expr *Object,
+                                 bool IsKnownCast) {
+  Object = Object->IgnoreParenImpCasts();
+
+  return C.getNoteTag(
+      [=]() -> std::string {
+        SmallString<128> Msg;
+        llvm::raw_svector_ostream Out(Msg);
+
+        if (!IsKnownCast)
+          Out << "Assuming ";
+
+        if (const auto *DRE = dyn_cast(Object)) {
+          Out << '\'' << DRE->getDecl()->getNameAsString() << '\'';
+        } else if (const auto *ME = dyn_cast(Object)) {
+          Out << (IsKnownCast ? "Field '" : "field '")
+              << ME->getMemberDecl()->getNameAsString() << '\'';
+        } else {
+          Out << (IsKnownCast ? "The object" : "the object");
+        }
+        Out << " is";
+
+        bool First = true;
+        for (QualType CastToTy: CastToTyVec) {
+          std::string CastToName =
+            CastToTy->getAsCXXRecordDecl() ?
+            CastToTy->getAsCXXRecordDecl()->getNameAsString() :
+            CastToTy->getPointeeCXXRecordDecl()->getNameAsString();
+          Out << ' ' << ((CastToTyVec.size() == 1) ? "not" :
+                         (First ? "neither" : "nor")) << " a '" << CastToName
+              << '\'';
+          First = false;
+        }
+
+        return std::string(Out.str());
+      },
+      /*IsPrunable=*/true);
+}
+
 //===----------------------------------------------------------------------===//
 // Main logic to evaluate a cast.
 //===----------------------------------------------------------------------===//
@@ -220,40 +261,76 @@ static void addInstanceOfTransition(const CallEvent &Call,
                                     bool IsInstanceOf) {
   const FunctionDecl *FD = Call.getDecl()->getAsFunction();
   QualType CastFromTy = Call.parameters()[0]->getType();
-  QualType CastToTy = FD->getTemplateSpecializationArgs()->get(0).getAsType();
-  if (CastFromTy->isPointerType())
-    CastToTy = C.getASTContext().getPointerType(CastToTy);
-  else if (CastFromTy->isReferenceType())
-    CastToTy = alignReferenceTypes(CastToTy, CastFromTy, C.getASTContext());
-  else
-    return;
+  SmallVector CastToTyVec;
+  for (unsigned idx = 0; idx < FD->getTemplateSpecializationArgs()->size() - 1;
+       ++idx) {
+    TemplateArgument CastToTempArg =
+      FD->getTemplateSpecializationArgs()->get(idx);
+    switch (CastToTempArg.getKind()) {
+    default:
+      return;
+    case TemplateArgument::Type:
+      CastToTyVec.push_back(CastToTempArg.getAsType());
+      break;
+    case TemplateArgument::Pack:
+      for (TemplateArgument ArgInPack: CastToTempArg.pack_elements())
+        CastToTyVec.push_back(ArgInPack.getAsType());
+      break;
+    }
+  }
 
   const MemRegion *MR = DV.getAsRegion();
-  const DynamicCastInfo *CastInfo =
-      getDynamicCastInfo(State, MR, CastFromTy, CastToTy);
+  if (MR && CastFromTy->isReferenceType())
+    MR = State->getSVal(DV.castAs()).getAsRegion();
+
+  bool Success = false;
+  bool IsAnyKnown = false;
+  for (QualType CastToTy: CastToTyVec) {
+    if (CastFromTy->isPointerType())
+      CastToTy = C.getASTContext().getPointerType(CastToTy);
+    else if (CastFromTy->isReferenceType())
+      CastToTy = alignReferenceTypes(CastToTy, CastFromTy, C.getASTContext());
+    else
+      return;
 
-  bool CastSucceeds;
-  if (CastInfo)
-    CastSucceeds = IsInstanceOf && CastInfo->succeeds();
-  else
-    CastSucceeds = IsInstanceOf || CastFromTy == CastToTy;
+    const DynamicCastInfo *CastInfo =
+      getDynamicCastInfo(State, MR, CastFromTy, CastToTy);
 
-  if (isInfeasibleCast(CastInfo, CastSucceeds)) {
-    C.generateSink(State, C.getPredecessor());
-    return;
+    bool CastSucceeds;
+    if (CastInfo)
+      CastSucceeds = IsInstanceOf && CastInfo->succeeds();
+    else
+      CastSucceeds = IsInstanceOf || CastFromTy == CastToTy;
+
+    // Store the type and the cast information.
+    bool IsKnownCast = CastInfo || CastFromTy == CastToTy;
+    IsAnyKnown = IsAnyKnown || IsKnownCast;
+    ProgramStateRef NewState = State;
+    if (!IsKnownCast)
+      NewState = setDynamicTypeAndCastInfo(State, MR, CastFromTy, CastToTy,
+                                           IsInstanceOf);
+
+    if (CastSucceeds) {
+      Success = true;
+      C.addTransition(
+          NewState->BindExpr(Call.getOriginExpr(), C.getLocationContext(),
+                             C.getSValBuilder().makeTruthVal(true)),
+          getNoteTag(C, CastInfo, CastToTy, Call.getArgExpr(0), true,
+                     IsKnownCast));
+      if (IsKnownCast)
+        return;
+    } else if (CastInfo && CastInfo->succeeds()) {
+      C.generateSink(NewState, C.getPredecessor());
+      return;
+    }
   }
 
-  // Store the type and the cast information.
-  bool IsKnownCast = CastInfo || CastFromTy == CastToTy;
-  if (!IsKnownCast)
-    State = setDynamicTypeAndCastInfo(State, MR, CastFromTy, CastToTy,
-                                      IsInstanceOf);
-
-  C.addTransition(
-      State->BindExpr(Call.getOriginExpr(), C.getLocationContext(),
-                      C.getSValBuilder().makeTruthVal(CastSucceeds)),
-      getNoteTag(C, CastInfo, CastToTy, Call.getArgExpr(0), CastSucceeds,
-                 IsKnownCast));
+  if (!Success) {
+    C.addTransition(
+        State->BindExpr(Call.getOriginExpr(), C.getLocationContext(),
+                        C.getSValBuilder().makeTruthVal(false)),
+        getNoteTag(C, CastToTyVec, Call.getArgExpr(0), IsAnyKnown));
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -402,8 +479,9 @@ bool CastValueChecker::evalCall(const CallEvent &Call,
     QualType ParamT = Call.parameters()[0]->getType();
     QualType ResultT = Call.getResultType();
     if (!(ParamT->isPointerType() && ResultT->isPointerType()) &&
-        !(ParamT->isReferenceType() && ResultT->isReferenceType()))
+        !(ParamT->isReferenceType() && ResultT->isReferenceType())) {
       return false;
+    }
 
     DV = Call.getArgSVal(0).getAs();
     break;
diff --git a/clang/test/Analysis/Inputs/llvm.h b/clang/test/Analysis/Inputs/llvm.h
index c9d66ba2374d..b80567bcb586 100644
--- a/clang/test/Analysis/Inputs/llvm.h
+++ b/clang/test/Analysis/Inputs/llvm.h
@@ -19,11 +19,19 @@ const X *dyn_cast_or_null(Y *Value);
 template 
 const X *dyn_cast_or_null(Y &Value);
 
-template 
-bool isa(Y Value);
-
-template 
-bool isa_and_nonnull(Y Value);
+template  inline bool isa(const Y &Val);
+
+template 
+inline bool isa(const Y &Val) {
+  return isa(Val) || isa(Val);
+}
+
+template 
+inline bool isa_and_nonnull(const Y &Val) {
+  if (!Val)
+    return false;
+  return isa(Val);
+}
 
 template 
 std::unique_ptr cast(std::unique_ptr &&Value);
diff --git a/clang/test/Analysis/cast-value-logic.cpp b/clang/test/Analysis/cast-value-logic.cpp
index 1411ede92e36..52a94f24fba6 100644
--- a/clang/test/Analysis/cast-value-logic.cpp
+++ b/clang/test/Analysis/cast-value-logic.cpp
@@ -19,6 +19,8 @@ struct Shape {
   virtual double area();
 };
 class Triangle : public Shape {};
+class Rectangle : public Shape {};
+class Hexagon : public Shape {};
 class Circle : public Shape {
 public:
   ~Circle();
@@ -39,6 +41,23 @@ void test_regions_isa(const Shape *A, const Shape *B) {
     clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
 }
 
+void test_regions_isa_variadic(const Shape *A, const Shape *B) {
+  if (isa(A) &&
+      !isa(B))
+    clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void test_regions_isa_and_nonnull(const Shape *A, const Shape *B) {
+  if (isa_and_nonnull(A) && !isa_and_nonnull(B))
+    clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+void test_regions_isa_and_nonnull_variadic(const Shape *A, const Shape *B) {
+  if (isa_and_nonnull(A) &&
+      !isa_and_nonnull(B))
+    clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
 namespace test_cast {
 void evalLogic(const Shape *S) {
   const Circle *C = cast(S);
diff --git a/clang/test/Analysis/cast-value-notes.cpp b/clang/test/Analysis/cast-value-notes.cpp
index eb5d1b3d3fe2..a09586309fb4 100644
--- a/clang/test/Analysis/cast-value-notes.cpp
+++ b/clang/test/Analysis/cast-value-notes.cpp
@@ -13,6 +13,8 @@ struct Shape {
   const T *getAs() const;
 };
 class Triangle : public Shape {};
+class Rectangle : public Shape {};
+class Hexagon : public Shape {};
 class Circle : public Shape {};
 } // namespace clang
 
@@ -27,7 +29,6 @@ void evalReferences(const Shape &S) {
 }
 
 void evalNonNullParamNonNullReturnReference(const Shape &S) {
-  // Unmodeled cast from reference to pointer.
   const auto *C = dyn_cast_or_null(S);
   // expected-note@-1 {{'C' initialized here}}
 
@@ -43,13 +44,37 @@ void evalNonNullParamNonNullReturnReference(const Shape &S) {
     return;
   }
 
+  if (dyn_cast_or_null(C)) {
+    // expected-note@-1 {{Assuming 'C' is not a 'Rectangle'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (dyn_cast_or_null(C)) {
+    // expected-note@-1 {{Assuming 'C' is not a 'Hexagon'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
   if (isa(C)) {
     // expected-note@-1 {{'C' is not a 'Triangle'}}
     // expected-note@-2 {{Taking false branch}}
     return;
   }
 
-  if (isa(C)) {
+  if (isa(C)) {
+    // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (isa(C)) {
+    // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle' nor a 'Hexagon'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (isa(C)) {
     // expected-note@-1 {{'C' is a 'Circle'}}
     // expected-note@-2 {{Taking true branch}}
 
@@ -65,22 +90,57 @@ void evalNonNullParamNonNullReturn(const Shape *S) {
   // expected-note@-1 {{'S' is a 'Circle'}}
   // expected-note@-2 {{'C' initialized here}}
 
-  if (!isa(C)) {
-    // expected-note@-1 {{Assuming 'C' is a 'Triangle'}}
+  if (!dyn_cast_or_null(C)) {
+    // expected-note@-1 {{'C' is a 'Circle'}}
     // expected-note@-2 {{Taking false branch}}
     return;
   }
 
-  if (!isa(C)) {
-    // expected-note@-1 {{'C' is a 'Triangle'}}
+  if (dyn_cast_or_null(C)) {
+    // expected-note@-1 {{Assuming 'C' is not a 'Triangle'}}
     // expected-note@-2 {{Taking false branch}}
     return;
   }
 
-  (void)(1 / !C);
-  // expected-note@-1 {{'C' is non-null}}
-  // expected-note@-2 {{Division by zero}}
-  // expected-warning@-3 {{Division by zero}}
+  if (dyn_cast_or_null(C)) {
+    // expected-note@-1 {{Assuming 'C' is not a 'Rectangle'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (dyn_cast_or_null(C)) {
+    // expected-note@-1 {{Assuming 'C' is not a 'Hexagon'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (isa(C)) {
+    // expected-note@-1 {{'C' is not a 'Triangle'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (isa(C)) {
+    // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (isa(C)) {
+    // expected-note@-1 {{'C' is neither a 'Triangle' nor a 'Rectangle' nor a 'Hexagon'}}
+    // expected-note@-2 {{Taking false branch}}
+    return;
+  }
+
+  if (isa(C)) {
+    // expected-note@-1 {{'C' is a 'Circle'}}
+    // expected-note@-2 {{Taking true branch}}
+
+    (void)(1 / !C);
+    // expected-note@-1 {{'C' is non-null}}
+    // expected-note@-2 {{Division by zero}}
+    // expected-warning@-3 {{Division by zero}}
+  }
 }
 
 void evalNonNullParamNullReturn(const Shape *S) {

From 97ac9e82002d6b12831ca2c78f739cca65a4fa05 Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Thu, 27 Aug 2020 17:17:38 -0400
Subject: [PATCH 196/363] [SSP] Restore setting the visibility of __guard_local
 to hidden for better code generation.

Patch by: Philip Guenther

(cherry picked from commit d870e363263835bec96c83f51b20e64722cad742)
---
 llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 2c94c2c62e5f..42c1fa8af0e6 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1827,7 +1827,10 @@ Value *TargetLoweringBase::getIRStackGuard(IRBuilder<> &IRB) const {
   if (getTargetMachine().getTargetTriple().isOSOpenBSD()) {
     Module &M = *IRB.GetInsertBlock()->getParent()->getParent();
     PointerType *PtrTy = Type::getInt8PtrTy(M.getContext());
-    return M.getOrInsertGlobal("__guard_local", PtrTy);
+    Constant *C = M.getOrInsertGlobal("__guard_local", PtrTy);
+    if (GlobalVariable *G = dyn_cast_or_null(C))
+      G->setVisibility(GlobalValue::HiddenVisibility);
+    return C;
   }
   return nullptr;
 }

From 2c6a593b5e186a686fdaf6b6082b0dbcae29c265 Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Sat, 29 Aug 2020 08:51:19 -0700
Subject: [PATCH 197/363] ReleaseNotes: add lld/ELF notes

Differential Revision: https://reviews.llvm.org/D86579
---
 lld/docs/ReleaseNotes.rst | 133 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 127 insertions(+), 6 deletions(-)

diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 513ad37e278e..466a7f707354 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -24,22 +24,143 @@ Non-comprehensive list of changes in this release
 ELF Improvements
 ----------------
 
-* New ``--time-trace`` option records a time trace file that can be viewed in
+* ``--lto-emit-asm`` is added to emit assembly output for debugging purposes.
+  (`D77231 `_)
+* ``--lto-whole-program-visibility`` is added to support LTO whole-program devirtualization.
+  (`D71913 `_)
+* ``--print-archive-stats=`` is added to print the number of members and the number of fetched members for each archive.
+  The feature is similar to GNU gold's ``--print-symbol-counts=``.
+  (`D78983 `_)
+* ``--shuffle-sections=`` is added to introduce randomization in the output to help reduce measurement bias and detect static initialization order fiasco.
+  (`D74791 `_)
+  (`D74887 `_)
+* ``--time-trace`` is added. It records a time trace file that can be viewed in
   chrome://tracing. The file can be specified with ``--time-trace-file``.
   Trace granularity can be specified with ``--time-trace-granularity``.
   (`D71060 `_)
-* For ARM architectures the default max page size was increased to 64k.
-  This increases compatibility with systems where a non standard page
-  size was configured. This also is inline with GNU ld defaults.
-  (`D77330 `_)
-* ...
+* ``--thinlto-single-module`` is added to compile a subset of modules in ThinLTO for debugging purposes.
+  (`D80406 `_)
+* ``--unique`` is added to create separate output sections for orphan sections.
+  (`D75536 `_)
+* ``--warn-backrefs`` has been improved to emulate GNU ld's archive semantics.
+  If a link passes with warnings from ``--warn-backrefs``, it almost assuredly
+  means that the link will fail with GNU ld, or the symbol will get different
+  resolutions in GNU ld and LLD. ``--warn-backrefs-exclude=`` is added to
+  exclude known issues.
+  (`D77522 `_)
+  (`D77630 `_)
+  (`D77512 `_)
+* ``--no-relax`` is accepted but ignored. The Linux kernel's RISC-V port uses this option.
+  (`D81359 `_)
+* ``--rosegment`` (default) is added to complement ``--no-rosegment``.
+  GNU gold from 2.35 onwards support both options.
+* ``--threads=N`` is added. The default uses all threads.
+  (`D76885 `_)
+* ``--wrap`` has better compatibility with GNU ld.
+* ``-z dead-reloc-in-nonalloc=<section_glob>=<value>`` is added to resolve an absolute relocation
+  referencing a discarded symbol.
+  (`D83264 `_)
+* Changed tombstone values to (``.debug_ranges``/``.debug_loc``) 1 and (other ``.debug_*``) 0.
+  A tombstone value is the computed value of a relocation referencing a discarded symbol (``--gc-sections``, ICF or ``/DISCARD/``).
+  (`D84825 `_)
+  In the future many .debug_* may switch to 0xffffffff/0xffffffffffffffff as the tombstone value.
+* ``-z keep-text-section-prefix`` moves ``.text.unknown.*`` input sections to ``.text.unknown``.
+* ``-z rel`` and ``-z rela`` are added to select the REL/RELA format for dynamic relocations.
+  The default is target specific and typically matches the form used in relocatable objects.
+* ``-z start-stop-visibility={default,protected,internal,hidden}`` is added.
+  GNU ld/gold from 2.35 onwards support this option.
+  (`D55682 `_)
+* When ``-r`` or ``--emit-relocs`` is specified, the GNU ld compatible
+  ``--discard-all`` and ``--discard-locals`` semantics are implemented.
+  (`D77807 `_)
+* ``--emit-relocs --strip-debug`` can now be used together.
+  (`D74375 `_)
+* ``--gdb-index`` supports DWARF v5.
+  (`D79061 `_)
+  (`D85579 `_)
+* ``-r`` allows SHT_X86_64_UNWIND to be merged into SHT_PROGBITS.
+  This allows clang/GCC produced object files to be mixed together.
+  (`D85785 `_)
+* Better linker script support related to output section alignments and LMA regions.
+  (`D74286 `_)
+  (`D74297 `_)
+  (`D75724 `_)
+  (`D81986 `_)
+* In an input section description, the filename can be specified in double quotes.
+  ``archive:file`` syntax is added.
+  (`D72517 `_)
+  (`D75100 `_)
+* Linker script specified empty ``(.init|.preinit|.fini)_array`` are allowed with RELRO.
+  (`D76915 `_)
+* ``INSERT AFTER`` and ``INSERT BEFORE`` work for orphan sections now.
+  (`D74375 `_)
+* ``INPUT_SECTION_FLAGS`` is supported in linker scripts.
+  (`D72745 `_)
+* ``DF_1_PIE`` is set for position-independent executables.
+  (`D80872 `_)
+* For a symbol assignment ``alias = aliasee;``, ``alias`` inherits the ``aliasee``'s symbol type.
+  (`D86263 `_)
+* ``SHT_GNU_verneed`` in shared objects are parsed, and versioned undefined symbols in shared objects are respected.
+  (`D80059 `_)
+* SHF_LINK_ORDER and non-SHF_LINK_ORDER sections can be mixed as long as the SHF_LINK_ORDER components are contiguous.
+  (`D77007 `_)
+* An out-of-range relocation diagnostic mentions the referenced symbol now.
+  (`D73518 `_)
+* AArch64: ``R_AARCH64_PLT32`` is supported.
+  (`D81184 `_)
+* ARM: SBREL type relocations are supported.
+  (`D74375 `_)
+* ARM: ``R_ARM_ALU_PC_G0``, ``R_ARM_LDR_PC_G0``, ``R_ARM_THUMB_PC8`` and ``R_ARM_THUMB_PC12`` are supported.
+  (`D75349 `_)
+  (`D77200 `_)
+* ARM: various improvements to .ARM.exidx: ``/DISCARD/`` support for a subset, out-of-range handling, support for non monotonic section order.
+  (`PR44824 `_)
+* AVR: many relocation types are supported.
+  (`D78741 `_)
+* Hexagon: General Dynamic and some other relocation types are supported.
+* PPC: Canonical PLT and range extension thunks with addends are supported.
+  (`D73399 `_)
+  (`D73424 `_)
+  (`D75394 `_)
+* PPC and PPC64: copy relocations.
+  (`D73255 `_)
+* PPC64: ``_savegpr[01]_{14..31}`` and ``_restgpr[01]_{14..31}`` can be synthesized.
+  (`D79977 `_)
+* PPC64: ``R_PPC64_GOT_PCREL34`` and ``R_PPC64_REL24_NOTOC`` are supported. r2 save stub is supported.
+  (`D81948 `_)
+  (`D82950 `_)
+  (`D82816 `_)
+* RISC-V: ``R_RISCV_IRELATIVE`` is supported.
+  (`D74022 `_)
+* RISC-V: ``R_RISCV_ALIGN`` is errored because GNU ld style linker relaxation is not supported.
+  (`D71820 `_)
+* SPARCv9: more relocation types are supported.
+  (`D77672 `_)
 
 Breaking changes
 ----------------
 
 * One-dash form of some long option (``--thinlto-*``, ``--lto-*``, ``--shuffle-sections=``)
   are no longer supported.
+  (`D79371 `_)
 * ``--export-dynamic-symbol`` no longer implies ``-u``.
+  The new behavior matches GNU ld from binutils 2.35 onwards.
+  (`D80487 `_)
+* ARM: the default max page size was increased from 4096 to 65536.
+  This increases compatibility with systems where a non standard page
+  size was configured. This also is inline with GNU ld defaults.
+  (`D77330 `_)
+* ARM: for non-STT_FUNC symbols, Thumb interworking thunks are not added and BL/BLX are not substituted.
+  (`D73474 `_)
+  (`D73542 `_)
+* AArch64: ``--force-bti`` is renamed to ``-z force-bti``. ``--pac-plt`` is renamed to ``-z pac-plt``.
+  This change is compatible with GNU ld.
+* A readonly ``PT_LOAD`` is created in the presence of a ``SECTIONS`` command.
+  The new behavior is consistent with the longstanding behavior in the absence of a SECTIONS command.
+* Orphan section names like ``.rodata.foo`` and ``.text.foo`` are not grouped into ``.rodata`` and ``.text`` in the presence of a ``SECTIONS`` command.
+  The new behavior matches GNU ld.
+  (`D75225 `_)
+* ``--no-threads`` is removed. Use ``--threads=1`` instead. ``--threads`` (no-op) is removed.
 
 COFF Improvements
 -----------------

From db16ab428bbae15b5c157fd518574baecf63f8dc Mon Sep 17 00:00:00 2001
From: sameeran joshi 
Date: Tue, 18 Aug 2020 15:05:51 +0530
Subject: [PATCH 198/363] [Flang] Move markdown files(.MD) from documentation/
 to docs/

Summary:
Other LLVM sub-projects use docs/ folder for documentation files.
Follow LLVM project policy.
Modify `documentation/` references in sources to `docs/`.
This patch doesn't modify files to reStructuredText(.rst) file format.

Reviewed By: DavidTruby, sscalpone

Differential Revision: https://reviews.llvm.org/D85884

(cherry picked from commit eaff200429a3dcf36eebfae39d2e859d6815285e)
---
 flang/README.md                               | 20 +++++++++----------
 .../ArrayComposition.md                       |  2 +-
 .../BijectiveInternalNameUniquing.md          |  0
 flang/{documentation => docs}/C++17.md        |  2 +-
 flang/{documentation => docs}/C++style.md     |  2 +-
 flang/{documentation => docs}/Calls.md        |  2 +-
 flang/{documentation => docs}/Character.md    |  2 +-
 .../ControlFlowGraph.md                       |  2 +-
 flang/{documentation => docs}/Directives.md   |  2 +-
 flang/{documentation => docs}/Extensions.md   |  2 +-
 .../FortranForCProgrammers.md                 |  2 +-
 flang/{documentation => docs}/FortranIR.md    |  2 +-
 .../IORuntimeInternals.md                     |  2 +-
 .../ImplementingASemanticCheck.md             |  2 +-
 flang/{documentation => docs}/Intrinsics.md   |  2 +-
 .../LabelResolution.md                        |  2 +-
 flang/{documentation => docs}/ModFiles.md     |  2 +-
 .../OpenMP-4.5-grammar.txt                    |  2 +-
 .../OpenMP-semantics.md                       |  2 +-
 .../OptionComparison.md                       |  2 +-
 flang/{documentation => docs}/Overview.md     |  2 +-
 .../ParserCombinators.md                      |  2 +-
 flang/{documentation => docs}/Parsing.md      |  2 +-
 .../{documentation => docs}/Preprocessing.md  |  2 +-
 .../PullRequestChecklist.md                   |  2 +-
 .../RuntimeDescriptor.md                      |  2 +-
 flang/{documentation => docs}/Semantics.md    |  2 +-
 .../{documentation => docs}/f2018-grammar.txt |  2 +-
 .../{documentation => docs}/flang-c-style.el  |  2 +-
 flang/lib/Evaluate/intrinsics.cpp             |  2 +-
 30 files changed, 38 insertions(+), 38 deletions(-)
 rename flang/{documentation => docs}/ArrayComposition.md (99%)
 rename flang/{documentation => docs}/BijectiveInternalNameUniquing.md (100%)
 rename flang/{documentation => docs}/C++17.md (99%)
 rename flang/{documentation => docs}/C++style.md (99%)
 rename flang/{documentation => docs}/Calls.md (99%)
 rename flang/{documentation => docs}/Character.md (99%)
 rename flang/{documentation => docs}/ControlFlowGraph.md (99%)
 rename flang/{documentation => docs}/Directives.md (92%)
 rename flang/{documentation => docs}/Extensions.md (99%)
 rename flang/{documentation => docs}/FortranForCProgrammers.md (99%)
 rename flang/{documentation => docs}/FortranIR.md (99%)
 rename flang/{documentation => docs}/IORuntimeInternals.md (99%)
 rename flang/{documentation => docs}/ImplementingASemanticCheck.md (99%)
 rename flang/{documentation => docs}/Intrinsics.md (99%)
 rename flang/{documentation => docs}/LabelResolution.md (99%)
 rename flang/{documentation => docs}/ModFiles.md (99%)
 rename flang/{documentation => docs}/OpenMP-4.5-grammar.txt (99%)
 rename flang/{documentation => docs}/OpenMP-semantics.md (99%)
 rename flang/{documentation => docs}/OptionComparison.md (99%)
 rename flang/{documentation => docs}/Overview.md (98%)
 rename flang/{documentation => docs}/ParserCombinators.md (99%)
 rename flang/{documentation => docs}/Parsing.md (99%)
 rename flang/{documentation => docs}/Preprocessing.md (99%)
 rename flang/{documentation => docs}/PullRequestChecklist.md (98%)
 rename flang/{documentation => docs}/RuntimeDescriptor.md (99%)
 rename flang/{documentation => docs}/Semantics.md (99%)
 rename flang/{documentation => docs}/f2018-grammar.txt (99%)
 rename flang/{documentation => docs}/flang-c-style.el (92%)

diff --git a/flang/README.md b/flang/README.md
index f7797ed55bd3..44573ae4b9b6 100644
--- a/flang/README.md
+++ b/flang/README.md
@@ -8,30 +8,30 @@ F18 was subsequently accepted into the LLVM project and rechristened as Flang.
 
 ## Getting Started
 
-Read more about flang in the [documentation directory](documentation).
-Start with the [compiler overview](documentation/Overview.md).
+Read more about flang in the [docs directory](docs).
+Start with the [compiler overview](docs/Overview.md).
 
 To better understand Fortran as a language
 and the specific grammar accepted by flang,
-read [Fortran For C Programmers](documentation/FortranForCProgrammers.md)
+read [Fortran For C Programmers](docs/FortranForCProgrammers.md)
 and
-flang's specifications of the [Fortran grammar](documentation/f2018-grammar.txt)
+flang's specifications of the [Fortran grammar](docs/f2018-grammar.txt)
 and
-the [OpenMP grammar](documentation/OpenMP-4.5-grammar.txt).
+the [OpenMP grammar](docs/OpenMP-4.5-grammar.txt).
 
 Treatment of language extensions is covered
-in [this document](documentation/Extensions.md).
+in [this document](docs/Extensions.md).
 
 To understand the compilers handling of intrinsics,
-see the [discussion of intrinsics](documentation/Intrinsics.md).
+see the [discussion of intrinsics](docs/Intrinsics.md).
 
 To understand how a flang program communicates with libraries at runtime,
-see the discussion of [runtime descriptors](documentation/RuntimeDescriptor.md).
+see the discussion of [runtime descriptors](docs/RuntimeDescriptor.md).
 
 If you're interested in contributing to the compiler,
-read the [style guide](documentation/C++style.md)
+read the [style guide](docs/C++style.md)
 and
-also review [how flang uses modern C++ features](documentation/C++17.md).
+also review [how flang uses modern C++ features](docs/C++17.md).
 
 ## Supported C++ compilers
 
diff --git a/flang/documentation/ArrayComposition.md b/flang/docs/ArrayComposition.md
similarity index 99%
rename from flang/documentation/ArrayComposition.md
rename to flang/docs/ArrayComposition.md
index 099909c5ef0d..0f30af39f9e4 100644
--- a/flang/documentation/ArrayComposition.md
+++ b/flang/docs/ArrayComposition.md
@@ -1,4 +1,4 @@
-
-
 This note attempts to describe the motivation for and design of an
 implementation of Fortran 90 (and later) array expression evaluation that
 minimizes the use of dynamically allocated temporary storage for
diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md
index 87d5fc01f092..ea8395cfdedc 100644
--- a/flang/docs/C++17.md
+++ b/flang/docs/C++17.md
@@ -1,11 +1,3 @@
-
-
 ## C++14/17 features used in f18
 
 The C++ dialect used in this project constitutes a subset of the
diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md
index 4ab95393d758..77e0a0463823 100644
--- a/flang/docs/C++style.md
+++ b/flang/docs/C++style.md
@@ -1,11 +1,3 @@
-
-
 ## In brief:
 * Use *clang-format*
 from llvm 7
diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md
index d70bc910d73d..8a4d65820d19 100644
--- a/flang/docs/Calls.md
+++ b/flang/docs/Calls.md
@@ -1,11 +1,3 @@
-
-
 ## Procedure reference implementation protocol
 
 Fortran function and subroutine references are complicated.
diff --git a/flang/docs/Character.md b/flang/docs/Character.md
index 700db864f2da..f66b14438945 100644
--- a/flang/docs/Character.md
+++ b/flang/docs/Character.md
@@ -1,11 +1,3 @@
-
-
 ## Implementation of `CHARACTER` types in f18
 
 ### Kinds and Character Sets
diff --git a/flang/docs/ControlFlowGraph.md b/flang/docs/ControlFlowGraph.md
index b2b549845ebb..7d1e514a87ad 100644
--- a/flang/docs/ControlFlowGraph.md
+++ b/flang/docs/ControlFlowGraph.md
@@ -1,11 +1,3 @@
-
-
 ## Concept
 After a Fortran subprogram has been parsed, its names resolved, and all its
 semantic constraints successfully checked, the parse tree of its
diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md
index c2e93c5f3de2..554dc4608dd4 100644
--- a/flang/docs/Directives.md
+++ b/flang/docs/Directives.md
@@ -1,11 +1,3 @@
-
-
 Compiler directives supported by F18
 ====================================
 
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 9010b770cca6..86a4f04de57f 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -1,11 +1,3 @@
-
-
 As a general principle, this compiler will accept by default and
 without complaint many legacy features, extensions to the standard
 language, and features that have been deleted from the standard,
diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md
index 103def2a92ce..542034f3ea83 100644
--- a/flang/docs/FortranForCProgrammers.md
+++ b/flang/docs/FortranForCProgrammers.md
@@ -1,11 +1,3 @@
-
-
 Fortran For C Programmers
 =========================
 
diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md
index 5d83aaa8e34c..83193ff27a35 100644
--- a/flang/docs/FortranIR.md
+++ b/flang/docs/FortranIR.md
@@ -1,11 +1,3 @@
-
-
 # Design: Fortran IR
 
 ## Introduction
diff --git a/flang/docs/IORuntimeInternals.md b/flang/docs/IORuntimeInternals.md
index b4f3092a014e..8ff464ee9c8f 100644
--- a/flang/docs/IORuntimeInternals.md
+++ b/flang/docs/IORuntimeInternals.md
@@ -1,11 +1,3 @@
-
-
 Fortran I/O Runtime Library Internal Design
 ===========================================
 
diff --git a/flang/docs/ImplementingASemanticCheck.md b/flang/docs/ImplementingASemanticCheck.md
index 3bb16915cb88..2406f5bc2a58 100644
--- a/flang/docs/ImplementingASemanticCheck.md
+++ b/flang/docs/ImplementingASemanticCheck.md
@@ -1,11 +1,3 @@
-
-# Introduction
 I recently added a semantic check to the f18 compiler front end.  This document
 describes my thought process and the resulting implementation.
 
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index 7be0bf3e4a9c..6f4dec467823 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -1,11 +1,3 @@
-
-
 # A categorization of standard (2018) and extended Fortran intrinsic procedures
 
 This note attempts to group the intrinsic procedures of Fortran into categories
diff --git a/flang/docs/LabelResolution.md b/flang/docs/LabelResolution.md
index e837b4fa6aec..2dfa5a30bb3c 100644
--- a/flang/docs/LabelResolution.md
+++ b/flang/docs/LabelResolution.md
@@ -1,11 +1,3 @@
-
-
 # Semantics: Resolving Labels and Construct Names
 
 ## Overview
diff --git a/flang/docs/ModFiles.md b/flang/docs/ModFiles.md
index 483341bdd0f4..367cd4cd54f7 100644
--- a/flang/docs/ModFiles.md
+++ b/flang/docs/ModFiles.md
@@ -1,11 +1,3 @@
-
-
 # Module Files
 
 Module files hold information from a module that is necessary to compile 
diff --git a/flang/docs/OpenMP-semantics.md b/flang/docs/OpenMP-semantics.md
index 4e2a81739cf8..22a3ca5614eb 100644
--- a/flang/docs/OpenMP-semantics.md
+++ b/flang/docs/OpenMP-semantics.md
@@ -1,11 +1,3 @@
-
-
 # OpenMP Semantic Analysis
 
 ## OpenMP for F18
diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md
index db5932411cc1..5c04450a7bb3 100644
--- a/flang/docs/OptionComparison.md
+++ b/flang/docs/OptionComparison.md
@@ -1,11 +1,3 @@
-
-
 # Compiler options
 
 This document catalogs the options processed by F18's peers/competitors.  Much of the document is taken up by a set of tables that list the options categorized into different topics.  Some of the table headings link to more information about the contents of the tables.  For example, the table on **Standards conformance** options links to [notes on Standards conformance](#standards).
diff --git a/flang/docs/Overview.md b/flang/docs/Overview.md
index 75a8cd1c4cab..807efda2ed9a 100644
--- a/flang/docs/Overview.md
+++ b/flang/docs/Overview.md
@@ -1,11 +1,3 @@
-
-
 # Overview of Compiler Phases
 
 Each phase produces either correct output or fatal errors.
diff --git a/flang/docs/ParserCombinators.md b/flang/docs/ParserCombinators.md
index 4f3dc6fd07ae..757684dcfda6 100644
--- a/flang/docs/ParserCombinators.md
+++ b/flang/docs/ParserCombinators.md
@@ -1,11 +1,3 @@
-
-
 ## Concept
 The Fortran language recognizer here can be classified as an LL recursive
 descent parser.  It is composed from a *parser combinator* library that
diff --git a/flang/docs/Parsing.md b/flang/docs/Parsing.md
index fad9a4d57278..54a4fd752f6c 100644
--- a/flang/docs/Parsing.md
+++ b/flang/docs/Parsing.md
@@ -1,11 +1,3 @@
-
-
 The F18 Parser
 ==============
 This program source code implements a parser for the Fortran programming
diff --git a/flang/docs/Preprocessing.md b/flang/docs/Preprocessing.md
index 7f6f3951cfd1..9b4d905177b7 100644
--- a/flang/docs/Preprocessing.md
+++ b/flang/docs/Preprocessing.md
@@ -1,11 +1,3 @@
-
-
 Fortran Preprocessing
 =====================
 
diff --git a/flang/docs/PullRequestChecklist.md b/flang/docs/PullRequestChecklist.md
index 12a67be374a2..17b6d64923f5 100644
--- a/flang/docs/PullRequestChecklist.md
+++ b/flang/docs/PullRequestChecklist.md
@@ -1,11 +1,3 @@
-
-
 # Pull request checklist
 Please review the following items before submitting a pull request.  This list
 can also be used when reviewing pull requests.
diff --git a/flang/docs/RuntimeDescriptor.md b/flang/docs/RuntimeDescriptor.md
index d819517fa979..a8eff33f6521 100644
--- a/flang/docs/RuntimeDescriptor.md
+++ b/flang/docs/RuntimeDescriptor.md
@@ -1,11 +1,3 @@
-
-
 ## Concept
 The properties that characterize data values and objects in Fortran
 programs must sometimes be materialized when the program runs.
diff --git a/flang/docs/Semantics.md b/flang/docs/Semantics.md
index 6ea0b292de69..f879671b4f4e 100644
--- a/flang/docs/Semantics.md
+++ b/flang/docs/Semantics.md
@@ -1,11 +1,3 @@
-
-
 # Semantic Analysis
 
 The semantic analysis pass determines if a syntactically correct Fortran

From 2526d8c43499fc5dd6135556ab16ae20d280ddca Mon Sep 17 00:00:00 2001
From: Nikita Popov 
Date: Sat, 29 Aug 2020 10:55:55 +0200
Subject: [PATCH 205/363] [InstSimplify] Protect against more poison in
 SimplifyWithOpReplaced (PR47322)

Replace the check for poison-producing instructions in
SimplifyWithOpReplaced() with the generic helper canCreatePoison()
that properly handles poisonous shifts and thus avoids the problem
from PR47322.

This additionally fixes a bug in IIQ.UseInstrInfo=false mode, which
previously could have caused this code to ignore poison flags.
Setting UseInstrInfo=false should reduce the possible optimizations,
not increase them.

This is not a full solution to the problem, as poison could be
introduced more indirectly. This is just a minimal, easy to backport
fix.

Differential Revision: https://reviews.llvm.org/D86834

(cherry picked from commit a5be86fde5de2c253aa19704bf4e4854f1936f8c)
---
 llvm/lib/Analysis/InstructionSimplify.cpp   |  5 +----
 llvm/test/Transforms/InstSimplify/select.ll | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index d3bdf9d6aafd..9423ff9e3a66 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -3837,10 +3837,7 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
     // TODO: This is an unusual limitation because better analysis results in
     //       worse simplification. InstCombine can do this fold more generally
     //       by dropping the flags. Remove this fold to save compile-time?
-    if (isa(B))
-      if (Q.IIQ.hasNoSignedWrap(B) || Q.IIQ.hasNoUnsignedWrap(B))
-        return nullptr;
-    if (isa(B) && Q.IIQ.isExact(B))
+    if (canCreatePoison(I))
       return nullptr;
 
     if (MaxRecurse) {
diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll
index b1264138a15e..05fa46ca3f49 100644
--- a/llvm/test/Transforms/InstSimplify/select.ll
+++ b/llvm/test/Transforms/InstSimplify/select.ll
@@ -919,3 +919,19 @@ define <2 x i32> @all_constant_true_undef_false_constexpr_vec() {
   %s = select i1 ptrtoint (<2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i1), <2 x i32> undef, <2 x i32> ()* @all_constant_true_undef_false_constexpr_vec to i32)>
   ret <2 x i32> %s
 }
+
+define i32 @pr47322_more_poisonous_replacement(i32 %arg) {
+; CHECK-LABEL: @pr47322_more_poisonous_replacement(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[ARG:%.*]], 0
+; CHECK-NEXT:    [[TRAILING:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 immarg true)
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[ARG]], [[TRAILING]]
+; CHECK-NEXT:    [[R1_SROA_0_1:%.*]] = select i1 [[CMP]], i32 0, i32 [[SHIFTED]]
+; CHECK-NEXT:    ret i32 [[R1_SROA_0_1]]
+;
+  %cmp = icmp eq i32 %arg, 0
+  %trailing = call i32 @llvm.cttz.i32(i32 %arg, i1 immarg true)
+  %shifted = lshr i32 %arg, %trailing
+  %r1.sroa.0.1 = select i1 %cmp, i32 0, i32 %shifted
+  ret i32 %r1.sroa.0.1
+}
+declare i32 @llvm.cttz.i32(i32, i1 immarg)

From 7569e8c696288cd9c9409936b4fe9b846d0bd0b7 Mon Sep 17 00:00:00 2001
From: Kang Zhang 
Date: Wed, 29 Jul 2020 16:39:27 +0000
Subject: [PATCH 206/363] [PowerPC] Set v1i128 to expand for SETCC to avoid
 crash

Summary:
PPC only supports instruction selection for v16i8, v8i16, v4i32,
v2i64, v4f32 and v2f64 for ISD::SETCC; it doesn't support v1i128, so
v1i128 for ISD::SETCC will crash.

This patch sets v1i128 to expand to avoid the crash.

Reviewed By: steven.zhang

Differential Revision: https://reviews.llvm.org/D84238

(cherry picked from commit 802c043078ad653aca131648a130b59f041df0b5)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |  2 +
 llvm/test/CodeGen/PowerPC/setcc-vector.ll   | 49 +++++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/setcc-vector.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5c1a4cb16568..2f50c52c90a1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -920,6 +920,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
         setOperationAction(ISD::SUB, MVT::v2i64, Expand);
       }
 
+      setOperationAction(ISD::SETCC, MVT::v1i128, Expand);
+
       setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
       AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
       setOperationAction(ISD::STORE, MVT::v2i64, Promote);
diff --git a/llvm/test/CodeGen/PowerPC/setcc-vector.ll b/llvm/test/CodeGen/PowerPC/setcc-vector.ll
new file mode 100644
index 000000000000..5917ccabf84e
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/setcc-vector.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names < %s | FileCheck -check-prefixes=CHECK-PWR9 %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names < %s | FileCheck -check-prefixes=CHECK-PWR8  %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple=powerpc64-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names < %s | FileCheck -check-prefixes=CHECK-PWR7 %s
+
+define <1 x i64> @setcc_v1i128(<1 x i128> %a) {
+; CHECK-PWR9-LABEL: setcc_v1i128:
+; CHECK-PWR9:       # %bb.0: # %entry
+; CHECK-PWR9-NEXT:    mfvsrld r3, vs34
+; CHECK-PWR9-NEXT:    cmpldi r3, 35708
+; CHECK-PWR9-NEXT:    mfvsrd r3, vs34
+; CHECK-PWR9-NEXT:    cmpdi cr1, r3, 0
+; CHECK-PWR9-NEXT:    li r3, 1
+; CHECK-PWR9-NEXT:    crnand 4*cr5+lt, 4*cr1+eq, lt
+; CHECK-PWR9-NEXT:    isel r3, 0, r3, 4*cr5+lt
+; CHECK-PWR9-NEXT:    blr
+;
+; CHECK-PWR8-LABEL: setcc_v1i128:
+; CHECK-PWR8:       # %bb.0: # %entry
+; CHECK-PWR8-NEXT:    xxswapd vs0, vs34
+; CHECK-PWR8-NEXT:    mfvsrd r3, vs34
+; CHECK-PWR8-NEXT:    cmpdi r3, 0
+; CHECK-PWR8-NEXT:    li r3, 1
+; CHECK-PWR8-NEXT:    mffprd r4, f0
+; CHECK-PWR8-NEXT:    cmpldi cr1, r4, 35708
+; CHECK-PWR8-NEXT:    crnand 4*cr5+lt, eq, 4*cr1+lt
+; CHECK-PWR8-NEXT:    isel r3, 0, r3, 4*cr5+lt
+; CHECK-PWR8-NEXT:    blr
+;
+; CHECK-PWR7-LABEL: setcc_v1i128:
+; CHECK-PWR7:       # %bb.0: # %entry
+; CHECK-PWR7-NEXT:    li r5, 0
+; CHECK-PWR7-NEXT:    cntlzd r3, r3
+; CHECK-PWR7-NEXT:    ori r5, r5, 35708
+; CHECK-PWR7-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-PWR7-NEXT:    subc r5, r4, r5
+; CHECK-PWR7-NEXT:    subfe r4, r4, r4
+; CHECK-PWR7-NEXT:    neg r4, r4
+; CHECK-PWR7-NEXT:    and r3, r3, r4
+; CHECK-PWR7-NEXT:    blr
+entry:
+  %0 = icmp ult <1 x i128> %a, 
+  %1 = zext <1 x i1> %0 to <1 x i64>
+  ret <1 x i64> %1
+}
+

From f5df584a5077e726ad851ccfe8496deda3e5ef07 Mon Sep 17 00:00:00 2001
From: Camille Coti 
Date: Tue, 25 Aug 2020 09:27:20 +0100
Subject: [PATCH 207/363] [flang] Version information in flang/f18

Fixed some version information in flang/f18:

  - fixed the behavior of the -v switch: this flag enables verbosity when used with arguments, but just displays the version when used alone (related to this bug: https://bugs.llvm.org/show_bug.cgi?id=46017)
 - added __FLANG, __FLANG_MAJOR__, __FLANG_MINOR__ and __FLANG_PATCHLEVEL__ (similar to their __F18* counterparts) for compatibility purposes

Reviewed By: AlexisPerry, richard.barton.arm, tskeith

Differential Revision: https://reviews.llvm.org/D84334

(cherry picked from commit b11c52781635bd871abd6d932cfd5dcd6f311903)
---
 flang/test/Driver/version_test.f90            |  7 ++++--
 .../Preprocessing/compiler_defined_macros.F90 | 11 ++++++++
 flang/tools/f18/CMakeLists.txt                |  7 ++++--
 flang/tools/f18/f18.cpp                       | 25 ++++++++++++++++---
 flang/tools/f18/f18_version.h.in              |  9 +++++++
 5 files changed, 51 insertions(+), 8 deletions(-)
 create mode 100644 flang/test/Preprocessing/compiler_defined_macros.F90
 create mode 100644 flang/tools/f18/f18_version.h.in

diff --git a/flang/test/Driver/version_test.f90 b/flang/test/Driver/version_test.f90
index 08ea35ba49ea..79be3617cf4b 100644
--- a/flang/test/Driver/version_test.f90
+++ b/flang/test/Driver/version_test.f90
@@ -1,7 +1,10 @@
 ! Check that lit configuration works by checking the compiler version
 
-! RUN: %f18 -V 2>&1 | FileCheck  -check-prefix=VERSION %s
 ! VERSION-NOT:{{![[:space:]]}}
 ! VERSION:{{[[:space:]]}}
-! VERSION-SAME:f18 compiler (under development)
+! VERSION-SAME:f18 compiler (under development), version {{[1-9][0-9]*.[0-9]*.[0-9]*}}
 ! VERSION-EMPTY:
+  
+! RUN: %f18 -V 2>&1 | FileCheck  -check-prefix=VERSION %s
+! RUN: %f18 -v 2>&1 | FileCheck  -check-prefix=VERSION %s
+! RUN: %f18 --version 2>&1 | FileCheck  -check-prefix=VERSION %s
diff --git a/flang/test/Preprocessing/compiler_defined_macros.F90 b/flang/test/Preprocessing/compiler_defined_macros.F90
new file mode 100644
index 000000000000..ba20f6d39622
--- /dev/null
+++ b/flang/test/Preprocessing/compiler_defined_macros.F90
@@ -0,0 +1,11 @@
+! Check that the macros that give the version number are set properly
+
+!CHECK: flang_major = {{[1-9][0-9]*$}}
+!CHECK: flang_minor = {{[0-9]+$}}
+!CHECK: flang_patchlevel = {{[0-9]+$}}
+!RUN: %f18 -E %s | FileCheck  --ignore-case %s
+
+  
+integer, parameter :: flang_major = __flang_major__
+integer, parameter :: flang_minor = __flang_minor__
+integer, parameter :: flang_patchlevel = __flang_patchlevel__
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
index 46c38fa43a2e..f3af6e8312fe 100644
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -28,8 +28,10 @@ set(MODULES
 )
 
 set(include ${FLANG_BINARY_DIR}/include/flang)
-
-set(include ${FLANG_BINARY_DIR}/include/flang)
+target_include_directories(f18
+  PRIVATE
+  ${CMAKE_CURRENT_BINARY_DIR}
+)
 
 # Create module files directly from the top-level module source directory
 foreach(filename ${MODULES})
@@ -64,5 +66,6 @@ file(COPY ${CMAKE_BINARY_DIR}/tools/flang/bin/flang DESTINATION ${CMAKE_BINARY_D
 # The flang script to be installed needs a different path to the headers.
 set(FLANG_INTRINSIC_MODULES_DIR ${CMAKE_INSTALL_PREFIX}/include/flang)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/flang.sh.in ${FLANG_BINARY_DIR}/bin/flang-install.sh @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/f18_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/f18_version.h @ONLY)
 
 install(PROGRAMS ${FLANG_BINARY_DIR}/bin/flang-install.sh DESTINATION bin RENAME flang PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE)
diff --git a/flang/tools/f18/f18.cpp b/flang/tools/f18/f18.cpp
index 03c0f7afe810..23b104ee520c 100644
--- a/flang/tools/f18/f18.cpp
+++ b/flang/tools/f18/f18.cpp
@@ -38,6 +38,8 @@
 #include 
 #include 
 
+#include "f18_version.h"
+
 static std::list argList(int argc, char *const argv[]) {
   std::list result;
   for (int j = 0; j < argc; ++j) {
@@ -390,6 +392,13 @@ void Link(std::vector &liblist, std::vector &objects,
   }
 }
 
+int printVersion() {
+  llvm::errs() << "\nf18 compiler (under development), version "
+               << __FLANG_MAJOR__ << "." << __FLANG_MINOR__ << "."
+               << __FLANG_PATCHLEVEL__ << "\n";
+  return exitStatus;
+}
+
 int main(int argc, char *const argv[]) {
 
   atexit(CleanUpAtExit);
@@ -411,6 +420,11 @@ int main(int argc, char *const argv[]) {
   options.predefinitions.emplace_back("__F18_MAJOR__", "1");
   options.predefinitions.emplace_back("__F18_MINOR__", "1");
   options.predefinitions.emplace_back("__F18_PATCHLEVEL__", "1");
+  options.predefinitions.emplace_back("__flang__", __FLANG__);
+  options.predefinitions.emplace_back("__flang_major__", __FLANG_MAJOR__);
+  options.predefinitions.emplace_back("__flang_minor__", __FLANG_MINOR__);
+  options.predefinitions.emplace_back(
+      "__flang_patchlevel__", __FLANG_PATCHLEVEL__);
 #if __x86_64__
   options.predefinitions.emplace_back("__x86_64__", "1");
 #endif
@@ -651,13 +665,16 @@ int main(int argc, char *const argv[]) {
           << "Unrecognised options are passed through to the external compiler\n"
           << "set by F18_FC (see defaults).\n";
       return exitStatus;
-    } else if (arg == "-V") {
-      llvm::errs() << "\nf18 compiler (under development)\n";
-      return exitStatus;
+    } else if (arg == "-V" || arg == "--version") {
+      return printVersion();
     } else {
       driver.F18_FCArgs.push_back(arg);
       if (arg == "-v") {
-        driver.verbose = true;
+        if (args.size() > 1) {
+          driver.verbose = true;
+        } else {
+          return printVersion();
+        }
       } else if (arg == "-I") {
         driver.F18_FCArgs.push_back(args.front());
         driver.searchDirectories.push_back(args.front());
diff --git a/flang/tools/f18/f18_version.h.in b/flang/tools/f18/f18_version.h.in
new file mode 100644
index 000000000000..0c8d5227cd00
--- /dev/null
+++ b/flang/tools/f18/f18_version.h.in
@@ -0,0 +1,9 @@
+#ifndef _F18_H_
+#define _F18_H_
+
+#define __FLANG__ "1"
+#define __FLANG_MAJOR__ "@LLVM_VERSION_MAJOR@"
+#define __FLANG_MINOR__ "@LLVM_VERSION_MINOR@"
+#define __FLANG_PATCHLEVEL__ "@LLVM_VERSION_PATCH@"
+
+#endif // _F18_H_

From 7030fc50d93e5b08bde9743fb54f24c4a44a8e4a Mon Sep 17 00:00:00 2001
From: Alex Bradbury 
Date: Sat, 5 Sep 2020 13:26:44 +0100
Subject: [PATCH 208/363] ReleaseNotes: Add RISC-V updates

---
 clang/docs/ReleaseNotes.rst | 16 ++++++++++++
 llvm/docs/ReleaseNotes.rst  | 50 +++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9d0ab935063f..ba0e15deb389 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -262,6 +262,13 @@ New Compiler Flags
   and 256TB (needs -mcmodel=large). This allows large/many thread local
   variables or a compact/fast code in an executable.
 
+- -menable-experimental-extension` can be used to enable experimental or
+  unratified RISC-V extensions, allowing them to be targeted by specifying the
+  extension name and precise version number in the `-march` string. For these
+  experimental extensions, there is no expectation of ongoing support - the
+  compiler support will continue to change until the specification is
+  finalised.
+
 Deprecated Compiler Flags
 -------------------------
 
@@ -296,6 +303,10 @@ Modified Compiler Flags
   ``char8_t`` as the character type of ``u8`` literals. This restores the
   Clang 8 behavior that regressed in Clang 9 and 10.
 - -print-targets has been added to print the registered targets.
+- -mcpu is now supported for RISC-V, and recognises the generic-rv32,
+  rocket-rv32, sifive-e31, generic-rv64, rocket-rv64, and sifive-u54 target
+  CPUs.
+
 
 New Pragmas in Clang
 --------------------
@@ -416,6 +427,11 @@ Changes related to C++ for OpenCL
 ABI Changes in Clang
 --------------------
 
+- For RISC-V, an ABI bug was fixed when passing complex single-precision
+  floats in RV64 with the hard float ABI. The bug could only be triggered for
+  function calls that exhaust the available FPRs.
+
+
 OpenMP Support in Clang
 -----------------------
 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 8171f9d990c9..cbc8c0859c7b 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -178,6 +178,56 @@ Changes to the PowerPC Target
 
 During this release ...
 
+Changes to the RISC-V Target
+----------------------------
+
+New features:
+* After consultation through an RFC, the RISC-V backend now accepts patches for
+  proposed instruction set extensions that have not yet been ratified.  For these
+  experimental extensions, there is no expectation of ongoing support - the
+  compiler support will continue to change until the specification is finalised.
+  In line with this policy, MC layer and code generation support was added for
+  version 0.92 of the proposed Bit Manipulation Extension and MC layer support
+  was added for version 0.8 of the proposed RISC-V Vector instruction set
+  extension. As these extensions are not yet ratified, compiler support will
+  continue to change to match the specifications until they are finalised.
+* ELF attribute sections are now created, encoding information such as the ISA
+  string.
+* Support for saving/restoring callee-saved registers via libcalls (a code
+  size optimisation).
+* llvm-objdump will now print branch targets as part of disassembly.
+
+Improvements:
+* If an immediate can be generated using a pair of `addi` instructions, that
+  pair will be selected rather than materialising the immediate into a
+  separate register with an `lui` and `addi` pair.
+* Multiplication by a constant was optimised.
+* `addi` instructions are now folded into the offset of a load/store instruction
+  even if the load/store itself has a non-zero offset, when it is safe to do
+  so.
+* Additional target hooks were implemented to minimise generation of
+  unnecessary control flow instruction.
+* The RISC-V backend's load/store peephole optimisation pass now supports
+  constant pools, improving code generation for floating point constants.
+* Debug scratch register names `dscratch0` and `dscratch1` are now recognised in
+  addition to the legacy `dscratch` register name.
+* Codegen for checking isnan was improved, removing a redundant `and`.
+* The `dret` instruction is now supported by the MC layer.
+* `.option pic` and `.option nopic` are now supported in assembly and `.reloc`
+  was extended to support arbitrary relocation types.
+* Scheduling info metadata was improved.
+* The `jump` pseudo instruction is now supported.
+
+Bug fixes:
+* A failure to insert indirect branches in position independent code
+  was fixed.
+* The calculated expanded size of atomic pseudo operations was fixed, avoiding
+  "fixup value out of range" errors during branch relaxation for some inputs.
+* The `mcountinhibit` CSR is now recognised.
+* The correct libcall is now emitted for converting a float/double to a 32-bit
+  signed or unsigned integer on RV64 targets lacking the F or D extensions.
+
+
 Changes to the X86 Target
 -------------------------
 

From 919f9c291508217c697220b87a33406b9b685202 Mon Sep 17 00:00:00 2001
From: Dimitry Andric 
Date: Wed, 2 Sep 2020 18:56:12 +0200
Subject: [PATCH 209/363] Eliminate the sizing template parameter N from
 CoalescingBitVector

Since the parameter is not used anywhere, and the default size of 16
apparently causes PR47359, remove it. This ensures that IntervalMap will
automatically determine the optimal size, using its NodeSizer struct.

Reviewed By: dblaikie

Differential Revision: https://reviews.llvm.org/D87044

(cherry picked from commit f26fc568402f84a94557cbe86e7aac8319d61387)
---
 llvm/include/llvm/ADT/CoalescingBitVector.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/ADT/CoalescingBitVector.h b/llvm/include/llvm/ADT/CoalescingBitVector.h
index f8c8fec0ec9e..0a7dcfe22631 100644
--- a/llvm/include/llvm/ADT/CoalescingBitVector.h
+++ b/llvm/include/llvm/ADT/CoalescingBitVector.h
@@ -34,15 +34,14 @@ namespace llvm {
 /// performance for non-sequential find() operations.
 ///
 /// \tparam IndexT - The type of the index into the bitvector.
-/// \tparam N - The first N coalesced intervals of set bits are stored in-place.
-template  class CoalescingBitVector {
+template  class CoalescingBitVector {
   static_assert(std::is_unsigned::value,
                 "Index must be an unsigned integer.");
 
-  using ThisT = CoalescingBitVector;
+  using ThisT = CoalescingBitVector;
 
   /// An interval map for closed integer ranges. The mapped values are unused.
-  using MapT = IntervalMap;
+  using MapT = IntervalMap;
 
   using UnderlyingIterator = typename MapT::const_iterator;
 

From 8399522c96a94bfb7c1cbf4df2bed0b3d826fbf6 Mon Sep 17 00:00:00 2001
From: Thomas Lively 
Date: Sun, 6 Sep 2020 15:42:21 -0700
Subject: [PATCH 210/363] [WebAssembly] Fix incorrect assumption of simple
 value types

Fixes PR47375, in which an assertion was triggering because
WebAssemblyTargetLowering::isVectorLoadExtDesirable was improperly
assuming the use of simple value types.

Differential Revision: https://reviews.llvm.org/D87110

(cherry picked from commit caee15a0ed52471bd329d01dc253ec9be3936c6d)
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  4 +--
 llvm/test/CodeGen/WebAssembly/pr47375.ll      | 36 +++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/WebAssembly/pr47375.ll

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index a9b9eceb4130..925636c82321 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -601,8 +601,8 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
 }
 
 bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
-  MVT ExtT = ExtVal.getSimpleValueType();
-  MVT MemT = cast(ExtVal->getOperand(0))->getSimpleValueType(0);
+  EVT ExtT = ExtVal.getValueType();
+  EVT MemT = cast(ExtVal->getOperand(0))->getValueType(0);
   return (ExtT == MVT::v8i16 && MemT == MVT::v8i8) ||
          (ExtT == MVT::v4i32 && MemT == MVT::v4i16) ||
          (ExtT == MVT::v2i64 && MemT == MVT::v2i32);
diff --git a/llvm/test/CodeGen/WebAssembly/pr47375.ll b/llvm/test/CodeGen/WebAssembly/pr47375.ll
new file mode 100644
index 000000000000..4c04631f26b1
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/pr47375.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; Regression test for pr47375, in which an assertion was triggering
+; because WebAssemblyTargetLowering::isVectorLoadExtDesirable was
+; improperly assuming the use of simple value types.
+
+define void @sext_vec() {
+; CHECK-LABEL: sext_vec:
+; CHECK:         .functype sext_vec () -> ()
+; CHECK-NEXT:    .local i32
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.load8_u 0
+; CHECK-NEXT:    local.set 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.store8 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 7
+; CHECK-NEXT:    i32.shl
+; CHECK-NEXT:    i32.or
+; CHECK-NEXT:    i32.const 7175
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    i32.store16 0
+; CHECK-NEXT:    # fallthrough-return
+  %L1 = load <2 x i3>, <2 x i3>* undef, align 2
+  %zext = zext <2 x i3> %L1 to <2 x i10>
+  store <2 x i10> %zext, <2 x i10>* undef, align 4
+  ret void
+}

From 56a7fe31adbb352804e4d568ec8b65cdc749a83f Mon Sep 17 00:00:00 2001
From: Fangrui Song 
Date: Wed, 2 Sep 2020 12:21:06 -0700
Subject: [PATCH 211/363] [GCDAProfiling] Suppress -Wprio-ctor-dtor for GCC>=9
 and remove unused write_string/length_of_string

The `__attribute__((destructor(100)))` diagnostic does not have a
warning option in GCC 8 (before r264853) and thus cannot be suppressed.

(cherry picked from commit 1cfde143e82aeb47cffba436ba7b5302d8e14193)
---
 compiler-rt/lib/profile/GCDAProfiling.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/compiler-rt/lib/profile/GCDAProfiling.c b/compiler-rt/lib/profile/GCDAProfiling.c
index 57d8dec423cc..82369357e986 100644
--- a/compiler-rt/lib/profile/GCDAProfiling.c
+++ b/compiler-rt/lib/profile/GCDAProfiling.c
@@ -210,22 +210,6 @@ static void write_64bit_value(uint64_t i) {
   write_32bit_value(hi);
 }
 
-static uint32_t length_of_string(const char *s) {
-  return (strlen(s) / 4) + 1;
-}
-
-// Remove when we support libgcov 9 current_working_directory.
-#if !defined(_MSC_VER) && defined(__clang__)
-__attribute__((unused))
-#endif
-static void
-write_string(const char *s) {
-  uint32_t len = length_of_string(s);
-  write_32bit_value(len);
-  write_bytes(s, strlen(s));
-  write_bytes("\0\0\0\0", 4 - (strlen(s) % 4));
-}
-
 static uint32_t read_32bit_value() {
   uint32_t val;
 
@@ -632,6 +616,9 @@ void llvm_writeout_files(void) {
 // __attribute__((destructor)) and destructors whose priorities are greater than
 // 100 run before this function and can thus be tracked. The priority is
 // compatible with GCC 7 onwards.
+#if __GNUC__ >= 9
+#pragma GCC diagnostic ignored "-Wprio-ctor-dtor"
+#endif
 __attribute__((destructor(100)))
 #endif
 static void llvm_writeout_and_clear(void) {

From 6b98995a44b2ebc94804ab552ed497c6860b6df3 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic 
Date: Wed, 2 Sep 2020 15:30:19 -0500
Subject: [PATCH 212/363] [PowerPC] Do not legalize vector FDIV without VSX

Quite a while ago, we legalized these nodes as we added custom
handling for reciprocal estimates in the back end. We have since
moved to target-independent combines but neglected to turn off
legalization. As a result, we can now get selection failures on
non-VSX subtargets as evidenced in the listed PR.

Fixes: https://bugs.llvm.org/show_bug.cgi?id=47373
(cherry picked from commit 27714075848e7f05a297317ad28ad2570d8e5a43)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp |   8 +-
 llvm/test/CodeGen/PowerPC/pr47373.ll        | 180 ++++++++++++++++++++
 2 files changed, 181 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/pr47373.ll

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2f50c52c90a1..2d0b17115249 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -799,7 +799,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::MUL, MVT::v4f32, Legal);
     setOperationAction(ISD::FMA, MVT::v4f32, Legal);
 
-    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
+    if (Subtarget.hasVSX()) {
       setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
       setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
     }
@@ -1297,12 +1297,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setTargetDAGCombine(ISD::SELECT_CC);
   }
 
-  // Use reciprocal estimates.
-  if (TM.Options.UnsafeFPMath) {
-    setTargetDAGCombine(ISD::FDIV);
-    setTargetDAGCombine(ISD::FSQRT);
-  }
-
   if (Subtarget.hasP9Altivec()) {
     setTargetDAGCombine(ISD::ABS);
     setTargetDAGCombine(ISD::VSELECT);
diff --git a/llvm/test/CodeGen/PowerPC/pr47373.ll b/llvm/test/CodeGen/PowerPC/pr47373.ll
new file mode 100644
index 000000000000..559f4f9a8b4a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/pr47373.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64-unknown-freebsd13.0 \
+; RUN:   -mcpu=ppc64 -ppc-asm-full-reg-names < %s | FileCheck %s
+@a = local_unnamed_addr global float* null, align 8
+
+; Function Attrs: nounwind
+define void @d() local_unnamed_addr #0 {
+; CHECK-LABEL: d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    std r0, 16(r1)
+; CHECK-NEXT:    stdu r1, -208(r1)
+; CHECK-NEXT:    addis r3, r2, .LC0@toc@ha
+; CHECK-NEXT:    std r29, 184(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    ld r3, .LC0@toc@l(r3)
+; CHECK-NEXT:    std r30, 192(r1) # 8-byte Folded Spill
+; CHECK-NEXT:    ld r29, 0(r3)
+; CHECK-NEXT:    bl c
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mr r30, r3
+; CHECK-NEXT:    bl b
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    cmpwi r30, 1
+; CHECK-NEXT:    blt cr0, .LBB0_9
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    cmplwi r30, 4
+; CHECK-NEXT:    clrldi r4, r30, 32
+; CHECK-NEXT:    li r5, 0
+; CHECK-NEXT:    blt cr0, .LBB0_7
+; CHECK-NEXT:  # %bb.2: # %vector.memcheck
+; CHECK-NEXT:    rldic r6, r30, 2, 30
+; CHECK-NEXT:    add r7, r3, r6
+; CHECK-NEXT:    cmpld r29, r7
+; CHECK-NEXT:    add r6, r29, r6
+; CHECK-NEXT:    bc 4, lt, .LBB0_4
+; CHECK-NEXT:  # %bb.3: # %vector.memcheck
+; CHECK-NEXT:    cmpld r3, r6
+; CHECK-NEXT:    bc 12, lt, .LBB0_7
+; CHECK-NEXT:  .LBB0_4: # %vector.ph
+; CHECK-NEXT:    rlwinm r5, r4, 0, 0, 29
+; CHECK-NEXT:    li r7, 15
+; CHECK-NEXT:    addi r6, r5, -4
+; CHECK-NEXT:    addi r8, r1, 144
+; CHECK-NEXT:    rldicl r6, r6, 62, 2
+; CHECK-NEXT:    addi r9, r1, 128
+; CHECK-NEXT:    addi r6, r6, 1
+; CHECK-NEXT:    addi r10, r1, 160
+; CHECK-NEXT:    mtctr r6
+; CHECK-NEXT:    li r6, 0
+; CHECK-NEXT:    addi r11, r1, 112
+; CHECK-NEXT:  .LBB0_5: # %vector.body
+; CHECK-NEXT:    #
+; CHECK-NEXT:    add r12, r3, r6
+; CHECK-NEXT:    lvx v3, r3, r6
+; CHECK-NEXT:    lvx v5, r12, r7
+; CHECK-NEXT:    add r12, r29, r6
+; CHECK-NEXT:    lvsl v2, r3, r6
+; CHECK-NEXT:    vperm v2, v3, v5, v2
+; CHECK-NEXT:    lvx v3, r29, r6
+; CHECK-NEXT:    lvx v5, r12, r7
+; CHECK-NEXT:    lvsl v4, r29, r6
+; CHECK-NEXT:    stvx v2, 0, r8
+; CHECK-NEXT:    vperm v2, v3, v5, v4
+; CHECK-NEXT:    stvx v2, 0, r9
+; CHECK-NEXT:    lfs f0, 156(r1)
+; CHECK-NEXT:    lfs f1, 140(r1)
+; CHECK-NEXT:    fdivs f0, f1, f0
+; CHECK-NEXT:    lfs f1, 136(r1)
+; CHECK-NEXT:    stfs f0, 172(r1)
+; CHECK-NEXT:    lfs f0, 152(r1)
+; CHECK-NEXT:    fdivs f0, f1, f0
+; CHECK-NEXT:    lfs f1, 132(r1)
+; CHECK-NEXT:    stfs f0, 168(r1)
+; CHECK-NEXT:    lfs f0, 148(r1)
+; CHECK-NEXT:    fdivs f0, f1, f0
+; CHECK-NEXT:    lfs f1, 128(r1)
+; CHECK-NEXT:    stfs f0, 164(r1)
+; CHECK-NEXT:    lfs f0, 144(r1)
+; CHECK-NEXT:    fdivs f0, f1, f0
+; CHECK-NEXT:    stfs f0, 160(r1)
+; CHECK-NEXT:    lvx v2, 0, r10
+; CHECK-NEXT:    stvx v2, 0, r11
+; CHECK-NEXT:    ld r0, 112(r1)
+; CHECK-NEXT:    stdx r0, r29, r6
+; CHECK-NEXT:    addi r6, r6, 16
+; CHECK-NEXT:    ld r0, 120(r1)
+; CHECK-NEXT:    std r0, 8(r12)
+; CHECK-NEXT:    bdnz .LBB0_5
+; CHECK-NEXT:  # %bb.6: # %middle.block
+; CHECK-NEXT:    cmpld r5, r4
+; CHECK-NEXT:    beq cr0, .LBB0_9
+; CHECK-NEXT:  .LBB0_7: # %for.body.preheader18
+; CHECK-NEXT:    sldi r6, r5, 2
+; CHECK-NEXT:    sub r5, r4, r5
+; CHECK-NEXT:    addi r6, r6, -4
+; CHECK-NEXT:    add r3, r3, r6
+; CHECK-NEXT:    add r4, r29, r6
+; CHECK-NEXT:    mtctr r5
+; CHECK-NEXT:  .LBB0_8: # %for.body
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lfsu f0, 4(r4)
+; CHECK-NEXT:    lfsu f1, 4(r3)
+; CHECK-NEXT:    fdivs f0, f0, f1
+; CHECK-NEXT:    stfs f0, 0(r4)
+; CHECK-NEXT:    bdnz .LBB0_8
+; CHECK-NEXT:  .LBB0_9: # %for.end
+; CHECK-NEXT:    ld r30, 192(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld r29, 184(r1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi r1, r1, 208
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+entry:
+  %0 = load float*, float** @a, align 8
+  %call = call signext i32 bitcast (i32 (...)* @c to i32 ()*)() #2
+  %call1 = call float* bitcast (float* (...)* @b to float* ()*)() #2
+  %cmp11 = icmp sgt i32 %call, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %call to i64
+  %min.iters.check = icmp ult i32 %call, 4
+  br i1 %min.iters.check, label %for.body.preheader18, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %for.body.preheader
+  %scevgep = getelementptr float, float* %0, i64 %wide.trip.count
+  %scevgep15 = getelementptr float, float* %call1, i64 %wide.trip.count
+  %bound0 = icmp ult float* %0, %scevgep15
+  %bound1 = icmp ult float* %call1, %scevgep
+  %found.conflict = and i1 %bound0, %bound1
+  br i1 %found.conflict, label %for.body.preheader18, label %vector.ph
+
+vector.ph:                                        ; preds = %vector.memcheck
+  %n.vec = and i64 %wide.trip.count, 4294967292
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %1 = getelementptr inbounds float, float* %call1, i64 %index
+  %2 = bitcast float* %1 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %2, align 4
+  %3 = getelementptr inbounds float, float* %0, i64 %index
+  %4 = bitcast float* %3 to <4 x float>*
+  %wide.load17 = load <4 x float>, <4 x float>* %4, align 4
+  %5 = fdiv reassoc nsz arcp afn <4 x float> %wide.load17, %wide.load
+  %6 = bitcast float* %3 to <4 x float>*
+  store <4 x float> %5, <4 x float>* %6, align 4
+  %index.next = add i64 %index, 4
+  %7 = icmp eq i64 %index.next, %n.vec
+  br i1 %7, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.end, label %for.body.preheader18
+
+for.body.preheader18:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
+  %indvars.iv.ph = phi i64 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader18, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader18 ]
+  %arrayidx = getelementptr inbounds float, float* %call1, i64 %indvars.iv
+  %8 = load float, float* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %9 = load float, float* %arrayidx3, align 4
+  %div = fdiv reassoc nsz arcp afn float %9, %8
+  store float %div, float* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+declare signext i32 @c(...) local_unnamed_addr #1
+
+declare float* @b(...) local_unnamed_addr #1
+
+attributes #0 = { nounwind }

From b8fe222400586223c4fac4e98a480ee34cace780 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic 
Date: Wed, 2 Sep 2020 17:04:35 -0500
Subject: [PATCH 213/363] [PowerPC] Fix broken kill flag after MI peephole

The test case in https://bugs.llvm.org/show_bug.cgi?id=47373 exposed
two bugs in the PPC back end. The first one was fixed in commit
27714075848e7f05a297317ad28ad2570d8e5a43 but the test case had to
be added without -verify-machineinstrs due to the second bug.
This commit fixes the use-after-kill that is left behind by the
PPC MI peephole optimization.

(cherry picked from commit 69289cc10ffd1de4d3bf05d33948e6b21b6e68db)
---
 llvm/lib/Target/PowerPC/PPCMIPeephole.cpp                       | 2 ++
 .../PowerPC/jump-tables-collapse-rotate-remove-SrcMI.mir        | 2 +-
 llvm/test/CodeGen/PowerPC/mi-peephole.mir                       | 2 +-
 llvm/test/CodeGen/PowerPC/pr47373.ll                            | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index d2aba6bd6e8d..227c863685ae 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -1555,6 +1555,8 @@ bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) {
   MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
   MI.getOperand(2).setImm(NewSH);
   MI.getOperand(3).setImm(NewMB);
+  MI.getOperand(1).setIsKill(SrcMI->getOperand(1).isKill());
+  SrcMI->getOperand(1).setIsKill(false);
 
   LLVM_DEBUG(dbgs() << "To: ");
   LLVM_DEBUG(MI.dump());
diff --git a/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate-remove-SrcMI.mir b/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate-remove-SrcMI.mir
index 7c14e7750df9..2f7a85a111eb 100644
--- a/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate-remove-SrcMI.mir
+++ b/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate-remove-SrcMI.mir
@@ -51,4 +51,4 @@ body:             |
 #
 # CHECK-PASS-NOT:     %2:g8rc = RLDICL killed %1, 0, 32
 # CHECK-PASS-NOT:     %3:g8rc = RLDICR %2, 2, 61
-# CHECK-PASS:     %3:g8rc = RLDIC %1, 2, 30
+# CHECK-PASS:     %3:g8rc = RLDIC killed %1, 2, 30
diff --git a/llvm/test/CodeGen/PowerPC/mi-peephole.mir b/llvm/test/CodeGen/PowerPC/mi-peephole.mir
index 8bf72461d545..c7f41cd0bc4c 100644
--- a/llvm/test/CodeGen/PowerPC/mi-peephole.mir
+++ b/llvm/test/CodeGen/PowerPC/mi-peephole.mir
@@ -31,7 +31,7 @@ body:             |
   ; CHECK: bb.0.entry:
   ; CHECK:   %1:g8rc = COPY $x4
   ; CHECK:   %0:g8rc = COPY $x3
-  ; CHECK:   %3:g8rc = RLDIC %1, 2, 30
+  ; CHECK:   %3:g8rc = RLDIC killed %1, 2, 30
   ; CHECK:   $x3 = COPY %3
   ; CHECK:   BLR8 implicit $lr8, implicit $rm, implicit $x3
 ...
diff --git a/llvm/test/CodeGen/PowerPC/pr47373.ll b/llvm/test/CodeGen/PowerPC/pr47373.ll
index 559f4f9a8b4a..d09a5fe8fb0b 100644
--- a/llvm/test/CodeGen/PowerPC/pr47373.ll
+++ b/llvm/test/CodeGen/PowerPC/pr47373.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=powerpc64-unknown-freebsd13.0 \
+; RUN: llc -mtriple=powerpc64-unknown-freebsd13.0 -verify-machineinstrs \
 ; RUN:   -mcpu=ppc64 -ppc-asm-full-reg-names < %s | FileCheck %s
 @a = local_unnamed_addr global float* null, align 8
 

From ba6a10d87f57b78303555028f92add22918b3bcb Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Tue, 1 Sep 2020 12:14:32 -0700
Subject: [PATCH 214/363] [MachineCopyPropagation] In isNopCopy, check the
 destination registers match in addition to the source registers.

Previously if the source match we asserted that the destination
matched. But GPR <-> mask register copies on X86 can violate this
since we use the same K-registers for multiple sizes.

Fixes this ISPC issue https://github.com/ispc/ispc/issues/1851

Differential Revision: https://reviews.llvm.org/D86507

(cherry picked from commit 4783e2c9c603ed6aeacc76bb1177056a9d307bd1)
---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   |  4 +-
 llvm/test/CodeGen/X86/machine-cp-mask-reg.mir | 59 +++++++++++++++++++
 2 files changed, 60 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/machine-cp-mask-reg.mir

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 70d6dcc2e3e2..4c4839ca6522 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -336,10 +336,8 @@ static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src,
                       unsigned Def, const TargetRegisterInfo *TRI) {
   Register PreviousSrc = PreviousCopy.getOperand(1).getReg();
   Register PreviousDef = PreviousCopy.getOperand(0).getReg();
-  if (Src == PreviousSrc) {
-    assert(Def == PreviousDef);
+  if (Src == PreviousSrc && Def == PreviousDef)
     return true;
-  }
   if (!TRI->isSubRegister(PreviousSrc, Src))
     return false;
   unsigned SubIdx = TRI->getSubRegIndex(PreviousSrc, Src);
diff --git a/llvm/test/CodeGen/X86/machine-cp-mask-reg.mir b/llvm/test/CodeGen/X86/machine-cp-mask-reg.mir
new file mode 100644
index 000000000000..86a077e64764
--- /dev/null
+++ b/llvm/test/CodeGen/X86/machine-cp-mask-reg.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -run-pass=machine-cp -o - | FileCheck %s
+
+# machine-cp previously asserted trying to determine if the k0->eax copy below
+# could be combined with the k0->rax copy.
+
+--- |
+  ; ModuleID = 'test.ll'
+  source_filename = "test.ll"
+  target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+  define i8 @foo(<64 x i8> %x, i64* %y, i64 %z) #0 {
+    %a = icmp eq <64 x i8> %x, zeroinitializer
+    %b = bitcast <64 x i1> %a to i64
+    %c = add i64 %b, %z
+    store i64 %c, i64* %y, align 8
+    %d = extractelement <64 x i1> %a, i32 0
+    %e = zext i1 %d to i8
+    ret i8 %e
+  }
+
+  attributes #0 = { "target-cpu"="skx" }
+
+...
+---
+name:            foo
+alignment:       16
+tracksRegLiveness: true
+liveins:
+  - { reg: '$zmm0' }
+  - { reg: '$rdi' }
+  - { reg: '$rsi' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $rdi, $rsi, $zmm0
+
+    ; CHECK-LABEL: name: foo
+    ; CHECK: liveins: $rdi, $rsi, $zmm0
+    ; CHECK: renamable $k0 = VPTESTNMBZrr killed renamable $zmm0, renamable $zmm0
+    ; CHECK: renamable $rax = COPY renamable $k0
+    ; CHECK: renamable $rsi = ADD64rr killed renamable $rsi, killed renamable $rax, implicit-def dead $eflags
+    ; CHECK: MOV64mr killed renamable $rdi, 1, $noreg, 0, $noreg, killed renamable $rsi :: (store 8 into %ir.y)
+    ; CHECK: renamable $eax = COPY killed renamable $k0
+    ; CHECK: renamable $al = AND8ri renamable $al, 1, implicit-def dead $eflags, implicit killed $eax, implicit-def $eax
+    ; CHECK: $al = KILL renamable $al, implicit killed $eax
+    ; CHECK: RET 0, $al
+    renamable $k0 = VPTESTNMBZrr killed renamable $zmm0, renamable $zmm0
+    renamable $rax = COPY renamable $k0
+    renamable $rsi = ADD64rr killed renamable $rsi, killed renamable $rax, implicit-def dead $eflags
+    MOV64mr killed renamable $rdi, 1, $noreg, 0, $noreg, killed renamable $rsi :: (store 8 into %ir.y)
+    renamable $eax = COPY killed renamable $k0
+    renamable $al = AND8ri renamable $al, 1, implicit-def dead $eflags, implicit killed $eax, implicit-def $eax
+    $al = KILL renamable $al, implicit killed $eax
+    RET 0, $al
+
+...

From 0d8feb542b99bd9f3b92b7be422e8f0d86b93870 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic 
Date: Fri, 4 Sep 2020 21:44:37 -0400
Subject: [PATCH 215/363] [PowerPC] Provide vec_cmpne on pre-Power9
 architectures in altivec.h

These overloads are listed in appendix A of the ELFv2 ABI specification
without a requirement for ISA 3.0. So these need to be available on
all Altivec-capable architectures. The implementation in altivec.h
erroneously had them guarded for Power9 due to the availability of
the VCMPNE[BHW] instructions. However these need to be implemented
in terms of the VCMPEQ[BHW] instructions on older architectures.

Fixes: https://bugs.llvm.org/show_bug.cgi?id=47423
---
 clang/lib/Headers/altivec.h               | 104 +++++++++++++++++-----
 clang/test/CodeGen/builtins-ppc-altivec.c |  79 ++++++++++++++++
 2 files changed, 159 insertions(+), 24 deletions(-)

diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
index ac5f43836316..c4b90cc3f87c 100644
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -1766,36 +1766,12 @@ vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
                                                     (vector int)__b);
 }
 
-static __inline__ vector bool long long __ATTRS_o_ai
-vec_cmpne(vector bool long long __a, vector bool long long __b) {
-  return (vector bool long long)
-    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
-}
-
-static __inline__ vector bool long long __ATTRS_o_ai
-vec_cmpne(vector signed long long __a, vector signed long long __b) {
-  return (vector bool long long)
-    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
-}
-
-static __inline__ vector bool long long __ATTRS_o_ai
-vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
-  return (vector bool long long)
-    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
-}
-
 static __inline__ vector bool int __ATTRS_o_ai
 vec_cmpne(vector float __a, vector float __b) {
   return (vector bool int)__builtin_altivec_vcmpnew((vector int)__a,
                                                     (vector int)__b);
 }
 
-static __inline__ vector bool long long __ATTRS_o_ai
-vec_cmpne(vector double __a, vector double __b) {
-  return (vector bool long long)
-    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
-}
-
 /* vec_cmpnez */
 
 static __inline__ vector bool char __ATTRS_o_ai
@@ -1900,6 +1876,86 @@ vec_parity_lsbb(vector signed long long __a) {
   return __builtin_altivec_vprtybd(__a);
 }
 
+#else
+/* vec_cmpne */
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector bool char __a, vector bool char __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector signed char __a, vector signed char __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool char __ATTRS_o_ai
+vec_cmpne(vector unsigned char __a, vector unsigned char __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector bool short __a, vector bool short __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector signed short __a, vector signed short __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool short __ATTRS_o_ai
+vec_cmpne(vector unsigned short __a, vector unsigned short __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector bool int __a, vector bool int __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector signed int __a, vector signed int __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector unsigned int __a, vector unsigned int __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+
+static __inline__ vector bool int __ATTRS_o_ai
+vec_cmpne(vector float __a, vector float __b) {
+  return ~(vec_cmpeq(__a, __b));
+}
+#endif
+
+#ifdef __POWER8_VECTOR__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector bool long long __a, vector bool long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector signed long long __a, vector signed long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector unsigned long long __a, vector unsigned long long __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
+#endif
+
+#ifdef __VSX__
+static __inline__ vector bool long long __ATTRS_o_ai
+vec_cmpne(vector double __a, vector double __b) {
+  return (vector bool long long)
+    ~(__builtin_altivec_vcmpequd((vector long long)__a, (vector long long)__b));
+}
 #endif
 
 /* vec_cmpgt */
diff --git a/clang/test/CodeGen/builtins-ppc-altivec.c b/clang/test/CodeGen/builtins-ppc-altivec.c
index d53011b37d41..dc93e7340597 100644
--- a/clang/test/CodeGen/builtins-ppc-altivec.c
+++ b/clang/test/CodeGen/builtins-ppc-altivec.c
@@ -1029,6 +1029,85 @@ void test2() {
 // CHECK: @llvm.ppc.altivec.vcmpeqfp
 // CHECK-LE: @llvm.ppc.altivec.vcmpeqfp
 
+  /* vec_cmpne */
+  res_vbc = vec_cmpne(vsc, vsc);
+// CHECK: @llvm.ppc.altivec.vcmpequb
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequb
+// CHECK-LE: xor
+
+  res_vbc = vec_cmpne(vuc, vuc);
+// CHECK: @llvm.ppc.altivec.vcmpequb
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequb
+// CHECK-LE: xor
+
+  res_vbc = vec_cmpne(vbc, vbc);
+// CHECK: @llvm.ppc.altivec.vcmpequb
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequb
+// CHECK-LE: xor
+
+  res_vbc = vec_cmpne(vbc, vbc);
+// CHECK: @llvm.ppc.altivec.vcmpequb
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequb
+// CHECK-LE: xor
+
+  res_vbs = vec_cmpne(vs, vs);
+// CHECK: @llvm.ppc.altivec.vcmpequh
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequh
+// CHECK-LE: xor
+
+  res_vbs = vec_cmpne(vus, vus);
+// CHECK: @llvm.ppc.altivec.vcmpequh
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequh
+// CHECK-LE: xor
+
+  res_vbs = vec_cmpne(vbs, vbs);
+// CHECK: @llvm.ppc.altivec.vcmpequh
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequh
+// CHECK-LE: xor
+
+  res_vbs = vec_cmpne(vbs, vbs);
+// CHECK: @llvm.ppc.altivec.vcmpequh
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequh
+// CHECK-LE: xor
+
+  res_vbi = vec_cmpne(vi, vi);
+// CHECK: @llvm.ppc.altivec.vcmpequw
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequw
+// CHECK-LE: xor
+
+  res_vbi = vec_cmpne(vui, vui);
+// CHECK: @llvm.ppc.altivec.vcmpequw
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequw
+// CHECK-LE: xor
+
+  res_vbi = vec_cmpne(vbi, vbi);
+// CHECK: @llvm.ppc.altivec.vcmpequw
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequw
+// CHECK-LE: xor
+
+  res_vbi = vec_cmpne(vbi, vbi);
+// CHECK: @llvm.ppc.altivec.vcmpequw
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpequw
+// CHECK-LE: xor
+
+  res_vbi = vec_cmpne(vf, vf);
+// CHECK: @llvm.ppc.altivec.vcmpeqfp
+// CHECK: xor
+// CHECK-LE: @llvm.ppc.altivec.vcmpeqfp
+// CHECK-LE: xor
+
   /* vec_cmpge */
   res_vbc = vec_cmpge(vsc, vsc);
 // CHECK: @llvm.ppc.altivec.vcmpgtsb

From 96b8fd70d1572d3d38abce208e855c49f9eeac1d Mon Sep 17 00:00:00 2001
From: Aaron Puchert 
Date: Sat, 5 Sep 2020 14:23:54 +0200
Subject: [PATCH 216/363] Set InvalidDecl directly when deserializing a Decl

When parsing a C++17 binding declaration, we first create the
BindingDecls in Sema::ActOnDecompositionDeclarator, and then build the
DecompositionDecl in Sema::ActOnVariableDeclarator, so the contained
BindingDecls are never null. But when deserializing, we read the
DecompositionDecl with all properties before filling in the Bindings.
Among other things, reading a declaration reads whether it's invalid,
then calling setInvalidDecl which assumes that all bindings of the
DecompositionDecl are available, but that isn't the case.

Deserialization should just set all properties directly without invoking
subsequent functions, so we just set the flag without using the setter.

Fixes PR34960.

Reviewed By: rsmith

Differential Revision: https://reviews.llvm.org/D86207

(cherry picked from commit 16975a638df3cda95c677055120b23e689d96dcd)
---
 clang/lib/Serialization/ASTReaderDecl.cpp |  2 +-
 clang/test/PCH/cxx1z-decomposition.cpp    | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 117eb598bd5e..c0bf240464f7 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -583,7 +583,7 @@ void ASTDeclReader::VisitDecl(Decl *D) {
                            Reader.getContext());
   }
   D->setLocation(ThisDeclLoc);
-  D->setInvalidDecl(Record.readInt());
+  D->InvalidDecl = Record.readInt();
   if (Record.readInt()) { // hasAttrs
     AttrVec Attrs;
     Record.readAttributes(Attrs);
diff --git a/clang/test/PCH/cxx1z-decomposition.cpp b/clang/test/PCH/cxx1z-decomposition.cpp
index 2f817b4280de..914ce80c550d 100644
--- a/clang/test/PCH/cxx1z-decomposition.cpp
+++ b/clang/test/PCH/cxx1z-decomposition.cpp
@@ -2,11 +2,11 @@
 // RUN: %clang_cc1 -pedantic -std=c++1z -include %s -verify %s
 //
 // With PCH:
-// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch %s -o %t
-// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch -fallow-pch-with-compiler-errors %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -fallow-pch-with-compiler-errors -verify %s
 
-// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch -fpch-instantiate-templates %s -o %t
-// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch -fallow-pch-with-compiler-errors -fpch-instantiate-templates %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -fallow-pch-with-compiler-errors -verify %s
 
 #ifndef HEADER
 #define HEADER
@@ -22,6 +22,8 @@ constexpr int foo(Q &&q) {
   return a * 10 + b;
 }
 
+auto [noinit]; // expected-error{{decomposition declaration '[noinit]' requires an initializer}}
+
 #else
 
 int arr[2];

From 7d4d7a7bf1e8d99b80da66afde7df81b05f77538 Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Sun, 6 Sep 2020 15:54:24 -0400
Subject: [PATCH 217/363] [compiler-rt] Implement __clear_cache() on
 OpenBSD/arm

(cherry picked from commit 8542dab909f895a8b6812428bb5e1acf7ea15305)
---
 compiler-rt/lib/builtins/clear_cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/builtins/clear_cache.c b/compiler-rt/lib/builtins/clear_cache.c
index 72e02e613de5..29e31f55d499 100644
--- a/compiler-rt/lib/builtins/clear_cache.c
+++ b/compiler-rt/lib/builtins/clear_cache.c
@@ -33,7 +33,7 @@ uintptr_t GetCurrentProcess(void);
 #include 
 #endif
 
-#if defined(__OpenBSD__) && defined(__mips__)
+#if defined(__OpenBSD__) && (defined(__arm__) || defined(__mips__))
 // clang-format off
 #include 
 #include 
@@ -58,7 +58,7 @@ void __clear_cache(void *start, void *end) {
 #elif defined(_WIN32) && (defined(__arm__) || defined(__aarch64__))
   FlushInstructionCache(GetCurrentProcess(), start, end - start);
 #elif defined(__arm__) && !defined(__APPLE__)
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
   struct arm_sync_icache_args arg;
 
   arg.addr = (uintptr_t)start;

From 9e6b164239d767bdbc3248006f21dff6e8a41d90 Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Mon, 7 Sep 2020 02:27:11 -0400
Subject: [PATCH 218/363] [Sparc] Select the UltraSPARC instruction set with
 the external assembler

Select the UltraSPARC instruction set with the external assembler on
Linux / FreeBSD / OpenBSD, matches GCC.

(cherry picked from commit 70523ecfaca692bf5d0192e466c34ae7514624ea)
---
 clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 9 ++++++++-
 clang/test/Driver/freebsd.c                | 2 +-
 clang/test/Driver/linux-as.c               | 4 ++--
 clang/test/Driver/openbsd.c                | 2 +-
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
index 043b7f257c01..70ba8eb2a7d0 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
@@ -21,12 +21,19 @@ using namespace llvm::opt;
 const char *sparc::getSparcAsmModeForCPU(StringRef Name,
                                          const llvm::Triple &Triple) {
   if (Triple.getArch() == llvm::Triple::sparcv9) {
+    const char *DefV9CPU;
+
+    if (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD())
+      DefV9CPU = "-Av9a";
+    else
+      DefV9CPU = "-Av9";
+
     return llvm::StringSwitch(Name)
         .Case("niagara", "-Av9b")
         .Case("niagara2", "-Av9b")
         .Case("niagara3", "-Av9d")
         .Case("niagara4", "-Av9d")
-        .Default("-Av9");
+        .Default(DefV9CPU);
   } else {
     return llvm::StringSwitch(Name)
         .Case("v8", "-Av8")
diff --git a/clang/test/Driver/freebsd.c b/clang/test/Driver/freebsd.c
index 5eb00ce65d71..2d276952691c 100644
--- a/clang/test/Driver/freebsd.c
+++ b/clang/test/Driver/freebsd.c
@@ -168,7 +168,7 @@
 // RUN: %clang -mcpu=ultrasparc -target sparc64-unknown-freebsd8 %s -### -no-integrated-as 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARC-CPU %s
 // CHECK-SPARC-CPU: cc1{{.*}}" "-target-cpu" "ultrasparc"
-// CHECK-SPARC-CPU: as{{.*}}" "-Av9
+// CHECK-SPARC-CPU: as{{.*}}" "-Av9a
 
 // Check that -G flags are passed to the linker for mips
 // RUN: %clang -target mips-unknown-freebsd %s -### -G0 2>&1 \
diff --git a/clang/test/Driver/linux-as.c b/clang/test/Driver/linux-as.c
index 77ac05f30942..0959bd7ba0a1 100644
--- a/clang/test/Driver/linux-as.c
+++ b/clang/test/Driver/linux-as.c
@@ -168,7 +168,7 @@
 // RUN:   | FileCheck -check-prefix=CHECK-SPARCV9 %s
 // CHECK-SPARCV9: as
 // CHECK-SPARCV9: -64
-// CHECK-SPARCV9: -Av9
+// CHECK-SPARCV9: -Av9a
 // CHECK-SPARCV9-NOT: -KPIC
 // CHECK-SPARCV9: -o
 //
@@ -177,7 +177,7 @@
 // RUN:   | FileCheck -check-prefix=CHECK-SPARCV9PIC %s
 // CHECK-SPARCV9PIC: as
 // CHECK-SPARCV9PIC: -64
-// CHECK-SPARCV9PIC: -Av9
+// CHECK-SPARCV9PIC: -Av9a
 // CHECK-SPARCV9PIC: -KPIC
 // CHECK-SPARCV9PIC: -o
 //
diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c
index 203b4b4a2ff0..ae1aa6441690 100644
--- a/clang/test/Driver/openbsd.c
+++ b/clang/test/Driver/openbsd.c
@@ -70,7 +70,7 @@
 // RUN:   | FileCheck -check-prefix=CHECK-MIPS64EL-PIC %s
 // CHECK-AMD64-M32: as{{.*}}" "--32"
 // CHECK-POWERPC: as{{.*}}" "-mppc" "-many"
-// CHECK-SPARC64: as{{.*}}" "-64" "-Av9"
+// CHECK-SPARC64: as{{.*}}" "-64" "-Av9a"
 // CHECK-MIPS64: as{{.*}}" "-mabi" "64" "-EB"
 // CHECK-MIPS64-PIC: as{{.*}}" "-mabi" "64" "-EB" "-KPIC"
 // CHECK-MIPS64EL: as{{.*}}" "-mabi" "64" "-EL"

From e7e6335763cafe06988a6c06ed50af0b4ec28d8b Mon Sep 17 00:00:00 2001
From: Juneyoung Lee 
Date: Tue, 8 Sep 2020 11:40:24 +0900
Subject: [PATCH 219/363] ReleaseNotes: Add updates in LangRef related with
 undef/poison

---
 llvm/docs/ReleaseNotes.rst | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index cbc8c0859c7b..0d5e0137bbc4 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -101,6 +101,18 @@ Changes to the LLVM IR
   where ``uint64_t`` was used to denote the size in bits of a IR type
   we have partially migrated the codebase to using ``llvm::TypeSize``.
 
+* Branching on ``undef``/``poison`` is undefined behavior. It is needed for
+  correctly analyzing value ranges based on branch conditions. This is
+  consistent with MSan's behavior as well.
+
+* ``memset``/``memcpy``/``memmove`` can take ``undef``/``poison`` pointer(s)
+  if the size to fill is zero.
+
+* Passing ``undef``/``poison`` to a standard I/O library function call
+  (`printf`/`fputc`/...) is undefined behavior. The new ``noundef`` attribute
+  is attached to the functions' arguments. The full list is available at
+  ``llvm::inferLibFuncAttributes``.
+
 Changes to building LLVM
 ------------------------
 
@@ -305,6 +317,10 @@ Changes to the Go bindings
 Changes to the DAG infrastructure
 ---------------------------------
 
+* A SelDag-level freeze instruction has landed. It is simply lowered as a copy
+  operation to MachineIR, but to make it fully correct either IMPLICIT_DEF
+  should be fixed or the equivalent FREEZE operation should be added to
+  MachineIR.
 
 Changes to the Debug Info
 ---------------------------------

From 6d762fdaa5c42e9f74cca48481e6f55c4472c0d3 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic 
Date: Fri, 4 Sep 2020 13:54:21 -0400
Subject: [PATCH 220/363] [PowerPC] Allow const pointers for load builtins in
 altivec.h

The load builtins in altivec.h do not have const in the signature
for the pointer parameter. This prevents using them for loading
from constant pointers. A notable case for such a use is Eigen.

This patch simply adds the missing const.

Fixes: https://bugs.llvm.org/show_bug.cgi?id=47408
(cherry picked from commit 54205f0bd2377503b818d7f62cc4ed63ef5b1e94)
---
 clang/lib/Headers/altivec.h                 |  74 ++++-----
 clang/test/CodeGen/builtins-ppc-altivec.c   | 171 ++++++++++----------
 clang/test/CodeGen/builtins-ppc-p10vector.c |  10 +-
 clang/test/CodeGen/builtins-ppc-xl-xst.c    | 165 +++++++++++--------
 4 files changed, 230 insertions(+), 190 deletions(-)

diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
index c4b90cc3f87c..c00c5561428a 100644
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -2758,67 +2758,67 @@ vec_insert_exp(vector unsigned int __a, vector unsigned int __b) {
 }
 
 #if defined(__powerpc64__)
-static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(signed char *__a,
+static __inline__ vector signed char __ATTRS_o_ai vec_xl_len(const signed char *__a,
                                                              size_t __b) {
   return (vector signed char)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector unsigned char __ATTRS_o_ai
-vec_xl_len(unsigned char *__a, size_t __b) {
+vec_xl_len(const unsigned char *__a, size_t __b) {
   return (vector unsigned char)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
-static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(signed short *__a,
+static __inline__ vector signed short __ATTRS_o_ai vec_xl_len(const signed short *__a,
                                                               size_t __b) {
   return (vector signed short)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector unsigned short __ATTRS_o_ai
-vec_xl_len(unsigned short *__a, size_t __b) {
+vec_xl_len(const unsigned short *__a, size_t __b) {
   return (vector unsigned short)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
-static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(signed int *__a,
+static __inline__ vector signed int __ATTRS_o_ai vec_xl_len(const signed int *__a,
                                                             size_t __b) {
   return (vector signed int)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
-static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(unsigned int *__a,
+static __inline__ vector unsigned int __ATTRS_o_ai vec_xl_len(const unsigned int *__a,
                                                               size_t __b) {
   return (vector unsigned int)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
-static __inline__ vector float __ATTRS_o_ai vec_xl_len(float *__a, size_t __b) {
+static __inline__ vector float __ATTRS_o_ai vec_xl_len(const float *__a, size_t __b) {
   return (vector float)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector signed __int128 __ATTRS_o_ai
-vec_xl_len(signed __int128 *__a, size_t __b) {
+vec_xl_len(const signed __int128 *__a, size_t __b) {
   return (vector signed __int128)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector unsigned __int128 __ATTRS_o_ai
-vec_xl_len(unsigned __int128 *__a, size_t __b) {
+vec_xl_len(const unsigned __int128 *__a, size_t __b) {
   return (vector unsigned __int128)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector signed long long __ATTRS_o_ai
-vec_xl_len(signed long long *__a, size_t __b) {
+vec_xl_len(const signed long long *__a, size_t __b) {
   return (vector signed long long)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector unsigned long long __ATTRS_o_ai
-vec_xl_len(unsigned long long *__a, size_t __b) {
+vec_xl_len(const unsigned long long *__a, size_t __b) {
   return (vector unsigned long long)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
-static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
+static __inline__ vector double __ATTRS_o_ai vec_xl_len(const double *__a,
                                                         size_t __b) {
   return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
 static __inline__ vector unsigned char __ATTRS_o_ai
-vec_xl_len_r(unsigned char *__a, size_t __b) {
+vec_xl_len_r(const unsigned char *__a, size_t __b) {
   vector unsigned char __res =
       (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
 #ifdef __LITTLE_ENDIAN__
@@ -16409,41 +16409,41 @@ typedef vector unsigned int unaligned_vec_uint __attribute__((aligned(1)));
 typedef vector float unaligned_vec_float __attribute__((aligned(1)));
 
 static inline __ATTRS_o_ai vector signed char vec_xl(signed long long __offset,
-                                                     signed char *__ptr) {
+                                                     const signed char *__ptr) {
   return *(unaligned_vec_schar *)(__ptr + __offset);
 }
 
 static inline __ATTRS_o_ai vector unsigned char
-vec_xl(signed long long __offset, unsigned char *__ptr) {
+vec_xl(signed long long __offset, const unsigned char *__ptr) {
   return *(unaligned_vec_uchar*)(__ptr + __offset);
 }
 
 static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
-                                                      signed short *__ptr) {
+                                                      const signed short *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_sshort *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned short
-vec_xl(signed long long __offset, unsigned short *__ptr) {
+vec_xl(signed long long __offset, const unsigned short *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_ushort *)__addr;
 }
 
 static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
-                                                    signed int *__ptr) {
+                                                    const signed int *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_sint *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
-                                                      unsigned int *__ptr) {
+                                                      const unsigned int *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_uint *)__addr;
 }
 
 static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
-                                               float *__ptr) {
+                                               const float *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_float *)__addr;
 }
@@ -16454,19 +16454,19 @@ typedef vector unsigned long long unaligned_vec_ull __attribute__((aligned(1)));
 typedef vector double unaligned_vec_double __attribute__((aligned(1)));
 
 static inline __ATTRS_o_ai vector signed long long
-vec_xl(signed long long __offset, signed long long *__ptr) {
+vec_xl(signed long long __offset, const signed long long *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_sll *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned long long
-vec_xl(signed long long __offset, unsigned long long *__ptr) {
+vec_xl(signed long long __offset, const unsigned long long *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_ull *)__addr;
 }
 
 static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
-                                                double *__ptr) {
+                                                const double *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_double *)__addr;
 }
@@ -16477,13 +16477,13 @@ typedef vector signed __int128 unaligned_vec_si128 __attribute__((aligned(1)));
 typedef vector unsigned __int128 unaligned_vec_ui128
     __attribute__((aligned(1)));
 static inline __ATTRS_o_ai vector signed __int128
-vec_xl(signed long long __offset, signed __int128 *__ptr) {
+vec_xl(signed long long __offset, const signed __int128 *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_si128 *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned __int128
-vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
+vec_xl(signed long long __offset, const unsigned __int128 *__ptr) {
   signed char *__addr = (signed char *)__ptr + __offset;
   return *(unaligned_vec_ui128 *)__addr;
 }
@@ -16493,71 +16493,71 @@ vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
 
 #ifdef __LITTLE_ENDIAN__
 static __inline__ vector signed char __ATTRS_o_ai
-vec_xl_be(signed long long __offset, signed char *__ptr) {
+vec_xl_be(signed long long __offset, const signed char *__ptr) {
   vector signed char __vec = (vector signed char)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
                                  13, 12, 11, 10, 9, 8);
 }
 
 static __inline__ vector unsigned char __ATTRS_o_ai
-vec_xl_be(signed long long __offset, unsigned char *__ptr) {
+vec_xl_be(signed long long __offset, const unsigned char *__ptr) {
   vector unsigned char __vec = (vector unsigned char)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
                                  13, 12, 11, 10, 9, 8);
 }
 
 static __inline__ vector signed short  __ATTRS_o_ai
-vec_xl_be(signed long long __offset, signed short *__ptr) {
+vec_xl_be(signed long long __offset, const signed short *__ptr) {
   vector signed short __vec = (vector signed short)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
 }
 
 static __inline__ vector unsigned short __ATTRS_o_ai
-vec_xl_be(signed long long __offset, unsigned short *__ptr) {
+vec_xl_be(signed long long __offset, const unsigned short *__ptr) {
   vector unsigned short __vec = (vector unsigned short)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
 }
 
 static __inline__ vector signed int __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, signed int *__ptr) {
+vec_xl_be(signed long long  __offset, const signed int *__ptr) {
   return (vector signed int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
 }
 
 static __inline__ vector unsigned int __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, unsigned int *__ptr) {
+vec_xl_be(signed long long  __offset, const unsigned int *__ptr) {
   return (vector unsigned int)__builtin_vsx_lxvw4x_be(__offset, __ptr);
 }
 
 static __inline__ vector float __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, float *__ptr) {
+vec_xl_be(signed long long  __offset, const float *__ptr) {
   return (vector float)__builtin_vsx_lxvw4x_be(__offset, __ptr);
 }
 
 #ifdef __VSX__
 static __inline__ vector signed long long __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, signed long long *__ptr) {
+vec_xl_be(signed long long  __offset, const signed long long *__ptr) {
   return (vector signed long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
 }
 
 static __inline__ vector unsigned long long __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, unsigned long long *__ptr) {
+vec_xl_be(signed long long  __offset, const unsigned long long *__ptr) {
   return (vector unsigned long long)__builtin_vsx_lxvd2x_be(__offset, __ptr);
 }
 
 static __inline__ vector double __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, double *__ptr) {
+vec_xl_be(signed long long  __offset, const double *__ptr) {
   return (vector double)__builtin_vsx_lxvd2x_be(__offset, __ptr);
 }
 #endif
 
 #if defined(__POWER8_VECTOR__) && defined(__powerpc64__)
 static __inline__ vector signed __int128 __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, signed __int128 *__ptr) {
+vec_xl_be(signed long long  __offset, const signed __int128 *__ptr) {
   return vec_xl(__offset, __ptr);
 }
 
 static __inline__ vector unsigned __int128 __ATTRS_o_ai
-vec_xl_be(signed long long  __offset, unsigned __int128 *__ptr) {
+vec_xl_be(signed long long  __offset, const unsigned __int128 *__ptr) {
   return vec_xl(__offset, __ptr);
 }
 #endif
diff --git a/clang/test/CodeGen/builtins-ppc-altivec.c b/clang/test/CodeGen/builtins-ppc-altivec.c
index dc93e7340597..06f70a901903 100644
--- a/clang/test/CodeGen/builtins-ppc-altivec.c
+++ b/clang/test/CodeGen/builtins-ppc-altivec.c
@@ -38,6 +38,13 @@ vector float res_vf;
 
 // CHECK-NOALTIVEC: error: unknown type name 'vector'
 // CHECK-NOALTIVEC-NOT: '(error)'
+const signed char *param_sc_ld;
+const unsigned char *param_uc_ld;
+const short *param_s_ld;
+const unsigned short *param_us_ld;
+const int *param_i_ld;
+const unsigned int *param_ui_ld;
+const float *param_f_ld;
 
 signed char param_sc;
 unsigned char param_uc;
@@ -1392,7 +1399,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vsc = vec_ld(0, ¶m_sc);
+  res_vsc = vec_ld(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1400,7 +1407,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vuc = vec_ld(0, ¶m_uc);
+  res_vuc = vec_ld(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1412,7 +1419,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vs  = vec_ld(0, ¶m_s);
+  res_vs  = vec_ld(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1420,7 +1427,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vus = vec_ld(0, ¶m_us);
+  res_vus = vec_ld(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1436,7 +1443,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vi  = vec_ld(0, ¶m_i);
+  res_vi  = vec_ld(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1444,7 +1451,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vui = vec_ld(0, ¶m_ui);
+  res_vui = vec_ld(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1456,7 +1463,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vf  = vec_ld(0, ¶m_f);
+  res_vf  = vec_ld(0, param_f_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1464,7 +1471,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vsc = vec_lvx(0, ¶m_sc);
+  res_vsc = vec_lvx(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1472,7 +1479,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vuc = vec_lvx(0, ¶m_uc);
+  res_vuc = vec_lvx(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1484,7 +1491,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vs  = vec_lvx(0, ¶m_s);
+  res_vs  = vec_lvx(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1492,7 +1499,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vus = vec_lvx(0, ¶m_us);
+  res_vus = vec_lvx(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1508,7 +1515,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vi  = vec_lvx(0, ¶m_i);
+  res_vi  = vec_lvx(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1516,7 +1523,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vui = vec_lvx(0, ¶m_ui);
+  res_vui = vec_lvx(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
@@ -1528,64 +1535,64 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
-  res_vf  = vec_lvx(0, ¶m_f);
+  res_vf  = vec_lvx(0, param_f_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK-LE: @llvm.ppc.altivec.lvx
 
   /* vec_lde */
-  res_vsc = vec_lde(0, ¶m_sc);
+  res_vsc = vec_lde(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvebx
 // CHECK-LE: @llvm.ppc.altivec.lvebx
 
-  res_vuc = vec_lde(0, ¶m_uc);
+  res_vuc = vec_lde(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvebx
 // CHECK-LE: @llvm.ppc.altivec.lvebx
 
-  res_vs  = vec_lde(0, ¶m_s);
+  res_vs  = vec_lde(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvehx
 // CHECK-LE: @llvm.ppc.altivec.lvehx
 
-  res_vus = vec_lde(0, ¶m_us);
+  res_vus = vec_lde(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvehx
 // CHECK-LE: @llvm.ppc.altivec.lvehx
 
-  res_vi  = vec_lde(0, ¶m_i);
+  res_vi  = vec_lde(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvewx
 // CHECK-LE: @llvm.ppc.altivec.lvewx
 
-  res_vui = vec_lde(0, ¶m_ui);
+  res_vui = vec_lde(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvewx
 // CHECK-LE: @llvm.ppc.altivec.lvewx
 
-  res_vf  = vec_lde(0, ¶m_f);
+  res_vf  = vec_lde(0, param_f_ld);
 // CHECK: @llvm.ppc.altivec.lvewx
 // CHECK-LE: @llvm.ppc.altivec.lvewx
 
-  res_vsc = vec_lvebx(0, ¶m_sc);
+  res_vsc = vec_lvebx(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvebx
 // CHECK-LE: @llvm.ppc.altivec.lvebx
 
-  res_vuc = vec_lvebx(0, ¶m_uc);
+  res_vuc = vec_lvebx(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvebx
 // CHECK-LE: @llvm.ppc.altivec.lvebx
 
-  res_vs  = vec_lvehx(0, ¶m_s);
+  res_vs  = vec_lvehx(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvehx
 // CHECK-LE: @llvm.ppc.altivec.lvehx
 
-  res_vus = vec_lvehx(0, ¶m_us);
+  res_vus = vec_lvehx(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvehx
 // CHECK-LE: @llvm.ppc.altivec.lvehx
 
-  res_vi  = vec_lvewx(0, ¶m_i);
+  res_vi  = vec_lvewx(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvewx
 // CHECK-LE: @llvm.ppc.altivec.lvewx
 
-  res_vui = vec_lvewx(0, ¶m_ui);
+  res_vui = vec_lvewx(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvewx
 // CHECK-LE: @llvm.ppc.altivec.lvewx
 
-  res_vf  = vec_lvewx(0, ¶m_f);
+  res_vf  = vec_lvewx(0, param_f_ld);
 // CHECK: @llvm.ppc.altivec.lvewx
 // CHECK-LE: @llvm.ppc.altivec.lvewx
 
@@ -1594,7 +1601,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vsc = vec_ldl(0, ¶m_sc);
+  res_vsc = vec_ldl(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1602,7 +1609,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vuc = vec_ldl(0, ¶m_uc);
+  res_vuc = vec_ldl(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1614,7 +1621,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vs  = vec_ldl(0, ¶m_s);
+  res_vs  = vec_ldl(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1622,7 +1629,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vus = vec_ldl(0, ¶m_us);
+  res_vus = vec_ldl(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1638,7 +1645,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vi  = vec_ldl(0, ¶m_i);
+  res_vi  = vec_ldl(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1646,7 +1653,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vui = vec_ldl(0, ¶m_ui);
+  res_vui = vec_ldl(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1658,7 +1665,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vf  = vec_ldl(0, ¶m_f);
+  res_vf  = vec_ldl(0, param_f_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1666,7 +1673,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vsc = vec_lvxl(0, ¶m_sc);
+  res_vsc = vec_lvxl(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1678,7 +1685,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vuc = vec_lvxl(0, ¶m_uc);
+  res_vuc = vec_lvxl(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1686,7 +1693,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vs  = vec_lvxl(0, ¶m_s);
+  res_vs  = vec_lvxl(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1694,7 +1701,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vus = vec_lvxl(0, ¶m_us);
+  res_vus = vec_lvxl(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1710,7 +1717,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vi  = vec_lvxl(0, ¶m_i);
+  res_vi  = vec_lvxl(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1718,7 +1725,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vui = vec_lvxl(0, ¶m_ui);
+  res_vui = vec_lvxl(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1730,7 +1737,7 @@ void test6() {
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
-  res_vf  = vec_lvxl(0, ¶m_f);
+  res_vf  = vec_lvxl(0, param_f_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK-LE: @llvm.ppc.altivec.lvxl
 
@@ -1744,12 +1751,12 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.vlogefp
 
   /* vec_lvsl */
-  res_vuc = vec_lvsl(0, ¶m_i);
+  res_vuc = vec_lvsl(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 
   /* vec_lvsr */
-  res_vuc = vec_lvsr(0, ¶m_i);
+  res_vuc = vec_lvsr(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvsr
 // CHECK-LE: @llvm.ppc.altivec.lvsr
 
@@ -6108,7 +6115,7 @@ void test6() {
 // CHECK-LE: insertelement <4 x float>
 
   /* vec_lvlx */
-  res_vsc = vec_lvlx(0, ¶m_sc);
+  res_vsc = vec_lvlx(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6128,7 +6135,7 @@ void test6() {
 // CHECK-LE: store <16 x i8> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vuc = vec_lvlx(0, ¶m_uc);
+  res_vuc = vec_lvlx(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6158,7 +6165,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vs  = vec_lvlx(0, ¶m_s);
+  res_vs  = vec_lvlx(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6178,7 +6185,7 @@ void test6() {
 // CHECK-LE: store <8 x i16> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vus = vec_lvlx(0, ¶m_us);
+  res_vus = vec_lvlx(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6218,7 +6225,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vi  = vec_lvlx(0, ¶m_i);
+  res_vi  = vec_lvlx(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6238,7 +6245,7 @@ void test6() {
 // CHECK-LE: store <4 x i32> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vui = vec_lvlx(0, ¶m_ui);
+  res_vui = vec_lvlx(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6279,7 +6286,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
   /* vec_lvlxl */
-  res_vsc = vec_lvlxl(0, ¶m_sc);
+  res_vsc = vec_lvlxl(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6299,7 +6306,7 @@ void test6() {
 // CHECK-LE: store <16 x i8> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vuc = vec_lvlxl(0, ¶m_uc);
+  res_vuc = vec_lvlxl(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6329,7 +6336,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vs  = vec_lvlxl(0, ¶m_s);
+  res_vs  = vec_lvlxl(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6349,7 +6356,7 @@ void test6() {
 // CHECK-LE: store <8 x i16> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vus = vec_lvlxl(0, ¶m_us);
+  res_vus = vec_lvlxl(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6389,7 +6396,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vi  = vec_lvlxl(0, ¶m_i);
+  res_vi  = vec_lvlxl(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6409,7 +6416,7 @@ void test6() {
 // CHECK-LE: store <4 x i32> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vui = vec_lvlxl(0, ¶m_ui);
+  res_vui = vec_lvlxl(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6450,7 +6457,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
   /* vec_lvrx */
-  res_vsc = vec_lvrx(0, ¶m_sc);
+  res_vsc = vec_lvrx(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6470,7 +6477,7 @@ void test6() {
 // CHECK-LE: store <16 x i8> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vuc = vec_lvrx(0, ¶m_uc);
+  res_vuc = vec_lvrx(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6500,7 +6507,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vs  = vec_lvrx(0, ¶m_s);
+  res_vs  = vec_lvrx(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6520,7 +6527,7 @@ void test6() {
 // CHECK-LE: store <8 x i16> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vus = vec_lvrx(0, ¶m_us);
+  res_vus = vec_lvrx(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6560,7 +6567,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vi  = vec_lvrx(0, ¶m_i);
+  res_vi  = vec_lvrx(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6580,7 +6587,7 @@ void test6() {
 // CHECK-LE: store <4 x i32> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vui = vec_lvrx(0, ¶m_ui);
+  res_vui = vec_lvrx(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvx
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6621,7 +6628,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
   /* vec_lvrxl */
-  res_vsc = vec_lvrxl(0, ¶m_sc);
+  res_vsc = vec_lvrxl(0, param_sc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6641,7 +6648,7 @@ void test6() {
 // CHECK-LE: store <16 x i8> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vuc = vec_lvrxl(0, ¶m_uc);
+  res_vuc = vec_lvrxl(0, param_uc_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <16 x i8> zeroinitializer
@@ -6671,7 +6678,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vs  = vec_lvrxl(0, ¶m_s);
+  res_vs  = vec_lvrxl(0, param_s_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6691,7 +6698,7 @@ void test6() {
 // CHECK-LE: store <8 x i16> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vus = vec_lvrxl(0, ¶m_us);
+  res_vus = vec_lvrxl(0, param_us_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <8 x i16> zeroinitializer
@@ -6731,7 +6738,7 @@ void test6() {
 // CHECK-LE: @llvm.ppc.altivec.lvsl
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vi  = vec_lvrxl(0, ¶m_i);
+  res_vi  = vec_lvrxl(0, param_i_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -6751,7 +6758,7 @@ void test6() {
 // CHECK-LE: store <4 x i32> zeroinitializer
 // CHECK-LE: @llvm.ppc.altivec.vperm
 
-  res_vui = vec_lvrxl(0, ¶m_ui);
+  res_vui = vec_lvrxl(0, param_ui_ld);
 // CHECK: @llvm.ppc.altivec.lvxl
 // CHECK: @llvm.ppc.altivec.lvsl
 // CHECK: store <4 x i32> zeroinitializer
@@ -9433,31 +9440,31 @@ void test8() {
 void test9() {
   // CHECK-LABEL: define void @test9
   // CHECK-LE-LABEL: define void @test9
-  res_vsc = vec_xl(param_sll, ¶m_sc);
+  res_vsc = vec_xl(param_sll, param_sc_ld);
   // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 1
 
-  res_vuc = vec_xl(param_sll, ¶m_uc);
+  res_vuc = vec_xl(param_sll, param_uc_ld);
   // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 1
 
-  res_vs = vec_xl(param_sll, ¶m_s);
+  res_vs = vec_xl(param_sll, param_s_ld);
   // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 1
 
-  res_vus = vec_xl(param_sll, ¶m_us);
+  res_vus = vec_xl(param_sll, param_us_ld);
   // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 1
 
-  res_vi = vec_xl(param_sll, ¶m_i);
+  res_vi = vec_xl(param_sll, param_i_ld);
   // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 1
 
-  res_vui = vec_xl(param_sll, ¶m_ui);
+  res_vui = vec_xl(param_sll, param_ui_ld);
   // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 1
 
-  res_vf = vec_xl(param_sll, ¶m_f);
+  res_vf = vec_xl(param_sll, param_f_ld);
   // CHECK: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 1
   // CHECK-LE: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 1
 }
@@ -9499,35 +9506,35 @@ void test10() {
 void test11() {
   // CHECK-LABEL: define void @test11
   // CHECK-LE-LABEL: define void @test11
-  res_vsc = vec_xl_be(param_sll, ¶m_sc);
+  res_vsc = vec_xl_be(param_sll, param_sc_ld);
   // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
   // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> 
 
-  res_vuc = vec_xl_be(param_sll, ¶m_uc);
+  res_vuc = vec_xl_be(param_sll, param_uc_ld);
   // CHECK: load <16 x i8>, <16 x i8>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
   // CHECK-LE: shufflevector <16 x i8> %{{[0-9]+}}, <16 x i8> %{{[0-9]+}}, <16 x i32> 
 
-  res_vs = vec_xl_be(param_sll, ¶m_s);
+  res_vs = vec_xl_be(param_sll, param_s_ld);
   // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
   // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> 
 
-  res_vus = vec_xl_be(param_sll, ¶m_us);
+  res_vus = vec_xl_be(param_sll, param_us_ld);
   // CHECK: load <8 x i16>, <8 x i16>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %{{[0-9]+}})
   // CHECK-LE: shufflevector <8 x i16> %{{[0-9]+}}, <8 x i16> %{{[0-9]+}}, <8 x i32> 
 
-  res_vi = vec_xl_be(param_sll, ¶m_i);
+  res_vi = vec_xl_be(param_sll, param_i_ld);
   // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %{{[0-9]+}})
 
-  res_vui = vec_xl_be(param_sll, ¶m_ui);
+  res_vui = vec_xl_be(param_sll, param_ui_ld);
   // CHECK: load <4 x i32>, <4 x i32>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %{{[0-9]+}})
 
-  res_vf = vec_xl_be(param_sll, ¶m_f);
+  res_vf = vec_xl_be(param_sll, param_f_ld);
   // CHECK: load <4 x float>, <4 x float>* %{{[0-9]+}}, align 1
   // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %{{[0-9]+}})
 }
diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c
index c51c24f25986..3512bfc4cf33 100644
--- a/clang/test/CodeGen/builtins-ppc-p10vector.c
+++ b/clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -24,10 +24,18 @@ vector unsigned long long vulla, vullb, vullc;
 vector unsigned __int128 vui128a, vui128b, vui128c;
 vector float vfa, vfb;
 vector double vda, vdb;
-unsigned int uia, uib;
+signed int *iap;
+unsigned int uia, uib, *uiap;
+signed char *cap;
 unsigned char uca;
+const unsigned char *ucap;
+const signed short *sap;
 unsigned short usa;
+const unsigned short *usap;
+const signed long long *llap;
+signed long long llb;
 unsigned long long ulla;
+const unsigned long long *ullap;
 
 vector unsigned long long test_vpdepd(void) {
   // CHECK: @llvm.ppc.altivec.vpdepd(<2 x i64>
diff --git a/clang/test/CodeGen/builtins-ppc-xl-xst.c b/clang/test/CodeGen/builtins-ppc-xl-xst.c
index 8ad45376e977..226e9d8aff4e 100644
--- a/clang/test/CodeGen/builtins-ppc-xl-xst.c
+++ b/clang/test/CodeGen/builtins-ppc-xl-xst.c
@@ -17,10 +17,12 @@
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i16*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i16>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i16*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca i16*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca i16*, align 8
 // CHECK-NEXT:    store <8 x i16>* [[C:%.*]], <8 x i16>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store i16* [[PTR:%.*]], i16** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16*, i16** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store i16* [[ST:%.*]], i16** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store i16* [[LD:%.*]], i16** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16*, i16** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store i16* [[TMP0]], i16** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[__PTR_ADDR_I]], align 8
@@ -35,7 +37,7 @@
 // CHECK-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>*, <8 x i16>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i16*, i16** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16*, i16** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store i16* [[TMP10]], i16** [[__PTR_ADDR_I2]], align 8
@@ -50,9 +52,9 @@
 // CHECK-NEXT:    store <8 x i16> [[TMP14]], <8 x i16>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test1(vector signed short *c, signed short *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test1(vector signed short *c, signed short *st, const signed short *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test2(
@@ -65,10 +67,12 @@ void test1(vector signed short *c, signed short *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i16*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i16>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i16*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca i16*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca i16*, align 8
 // CHECK-NEXT:    store <8 x i16>* [[C:%.*]], <8 x i16>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store i16* [[PTR:%.*]], i16** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i16*, i16** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store i16* [[ST:%.*]], i16** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store i16* [[LD:%.*]], i16** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16*, i16** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store i16* [[TMP0]], i16** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[__PTR_ADDR_I]], align 8
@@ -83,7 +87,7 @@ void test1(vector signed short *c, signed short *ptr) {
 // CHECK-NEXT:    store <8 x i16> [[TMP6]], <8 x i16>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>*, <8 x i16>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i16*, i16** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16*, i16** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <8 x i16> [[TMP9]], <8 x i16>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store i16* [[TMP10]], i16** [[__PTR_ADDR_I2]], align 8
@@ -98,9 +102,10 @@ void test1(vector signed short *c, signed short *ptr) {
 // CHECK-NEXT:    store <8 x i16> [[TMP14]], <8 x i16>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test2(vector unsigned short *c, unsigned short *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test2(vector unsigned short *c, unsigned short *st,
+           const unsigned short *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test3(
@@ -113,10 +118,12 @@ void test2(vector unsigned short *c, unsigned short *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i32*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x i32>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca i32*, align 8
 // CHECK-NEXT:    store <4 x i32>* [[C:%.*]], <4 x i32>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[ST:%.*]], i32** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[LD:%.*]], i32** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store i32* [[TMP0]], i32** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[__PTR_ADDR_I]], align 8
@@ -131,7 +138,7 @@ void test2(vector unsigned short *c, unsigned short *ptr) {
 // CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>*, <4 x i32>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32*, i32** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store i32* [[TMP10]], i32** [[__PTR_ADDR_I2]], align 8
@@ -146,9 +153,9 @@ void test2(vector unsigned short *c, unsigned short *ptr) {
 // CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test3(vector signed int *c, signed int *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test3(vector signed int *c, signed int *st, const signed int *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test4(
@@ -161,10 +168,12 @@ void test3(vector signed int *c, signed int *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i32*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x i32>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca i32*, align 8
 // CHECK-NEXT:    store <4 x i32>* [[C:%.*]], <4 x i32>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store i32* [[PTR:%.*]], i32** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[ST:%.*]], i32** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store i32* [[LD:%.*]], i32** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store i32* [[TMP0]], i32** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32*, i32** [[__PTR_ADDR_I]], align 8
@@ -179,7 +188,7 @@ void test3(vector signed int *c, signed int *ptr) {
 // CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>*, <4 x i32>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i32*, i32** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32*, i32** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store i32* [[TMP10]], i32** [[__PTR_ADDR_I2]], align 8
@@ -194,9 +203,9 @@ void test3(vector signed int *c, signed int *ptr) {
 // CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test4(vector unsigned int *c, unsigned int *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test4(vector unsigned int *c, unsigned int *st, const unsigned int *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test5(
@@ -209,10 +218,12 @@ void test4(vector unsigned int *c, unsigned int *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i64*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x i64>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca i64*, align 8
 // CHECK-NEXT:    store <2 x i64>* [[C:%.*]], <2 x i64>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store i64* [[PTR:%.*]], i64** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64*, i64** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store i64* [[ST:%.*]], i64** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store i64* [[LD:%.*]], i64** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64*, i64** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store i64* [[TMP0]], i64** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i64*, i64** [[__PTR_ADDR_I]], align 8
@@ -227,7 +238,7 @@ void test4(vector unsigned int *c, unsigned int *ptr) {
 // CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>*, <2 x i64>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i64*, i64** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i64*, i64** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store i64* [[TMP10]], i64** [[__PTR_ADDR_I2]], align 8
@@ -242,9 +253,10 @@ void test4(vector unsigned int *c, unsigned int *ptr) {
 // CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test5(vector signed long long *c, signed long long *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test5(vector signed long long *c, signed long long *st,
+           const signed long long *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test6(
@@ -257,10 +269,12 @@ void test5(vector signed long long *c, signed long long *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i64*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x i64>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca i64*, align 8
 // CHECK-NEXT:    store <2 x i64>* [[C:%.*]], <2 x i64>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store i64* [[PTR:%.*]], i64** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64*, i64** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store i64* [[ST:%.*]], i64** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store i64* [[LD:%.*]], i64** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64*, i64** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store i64* [[TMP0]], i64** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load i64*, i64** [[__PTR_ADDR_I]], align 8
@@ -275,7 +289,7 @@ void test5(vector signed long long *c, signed long long *ptr) {
 // CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>*, <2 x i64>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load i64*, i64** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i64*, i64** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store i64* [[TMP10]], i64** [[__PTR_ADDR_I2]], align 8
@@ -290,9 +304,10 @@ void test5(vector signed long long *c, signed long long *ptr) {
 // CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test6(vector unsigned long long *c, unsigned long long *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test6(vector unsigned long long *c, unsigned long long *st,
+           const unsigned long long *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test7(
@@ -305,10 +320,12 @@ void test6(vector unsigned long long *c, unsigned long long *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca float*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca float*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca float*, align 8
 // CHECK-NEXT:    store <4 x float>* [[C:%.*]], <4 x float>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store float* [[PTR:%.*]], float** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load float*, float** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store float* [[ST:%.*]], float** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store float* [[LD:%.*]], float** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load float*, float** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store float* [[TMP0]], float** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load float*, float** [[__PTR_ADDR_I]], align 8
@@ -323,7 +340,7 @@ void test6(vector unsigned long long *c, unsigned long long *ptr) {
 // CHECK-NEXT:    store <4 x float> [[TMP6]], <4 x float>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>*, <4 x float>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load float*, float** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load float*, float** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <4 x float> [[TMP9]], <4 x float>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store float* [[TMP10]], float** [[__PTR_ADDR_I2]], align 8
@@ -338,9 +355,9 @@ void test6(vector unsigned long long *c, unsigned long long *ptr) {
 // CHECK-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test7(vector float *c, float *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test7(vector float *c, float *st, const float *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-LABEL: @test8(
@@ -353,10 +370,12 @@ void test7(vector float *c, float *ptr) {
 // CHECK-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca double*, align 8
 // CHECK-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x double>*, align 8
-// CHECK-NEXT:    [[PTR_ADDR:%.*]] = alloca double*, align 8
+// CHECK-NEXT:    [[ST_ADDR:%.*]] = alloca double*, align 8
+// CHECK-NEXT:    [[LD_ADDR:%.*]] = alloca double*, align 8
 // CHECK-NEXT:    store <2 x double>* [[C:%.*]], <2 x double>** [[C_ADDR]], align 8
-// CHECK-NEXT:    store double* [[PTR:%.*]], double** [[PTR_ADDR]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load double*, double** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    store double* [[ST:%.*]], double** [[ST_ADDR]], align 8
+// CHECK-NEXT:    store double* [[LD:%.*]], double** [[LD_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load double*, double** [[LD_ADDR]], align 8
 // CHECK-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-NEXT:    store double* [[TMP0]], double** [[__PTR_ADDR_I]], align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = load double*, double** [[__PTR_ADDR_I]], align 8
@@ -371,7 +390,7 @@ void test7(vector float *c, float *ptr) {
 // CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 16
 // CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>*, <2 x double>** [[C_ADDR]], align 8
 // CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 16
-// CHECK-NEXT:    [[TMP10:%.*]] = load double*, double** [[PTR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load double*, double** [[ST_ADDR]], align 8
 // CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[__VEC_ADDR_I]], align 16
 // CHECK-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-NEXT:    store double* [[TMP10]], double** [[__PTR_ADDR_I2]], align 8
@@ -386,9 +405,9 @@ void test7(vector float *c, float *ptr) {
 // CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP16]], align 1
 // CHECK-NEXT:    ret void
 //
-void test8(vector double *c, double *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test8(vector double *c, double *st, const double *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 #ifdef __POWER8_VECTOR__
@@ -402,10 +421,12 @@ void test8(vector double *c, double *ptr) {
 // CHECK-P8-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i128*, align 8
 // CHECK-P8-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-P8-NEXT:    [[C_ADDR:%.*]] = alloca <1 x i128>*, align 8
-// CHECK-P8-NEXT:    [[PTR_ADDR:%.*]] = alloca i128*, align 8
+// CHECK-P8-NEXT:    [[ST_ADDR:%.*]] = alloca i128*, align 8
+// CHECK-P8-NEXT:    [[LD_ADDR:%.*]] = alloca i128*, align 8
 // CHECK-P8-NEXT:    store <1 x i128>* [[C:%.*]], <1 x i128>** [[C_ADDR]], align 8
-// CHECK-P8-NEXT:    store i128* [[PTR:%.*]], i128** [[PTR_ADDR]], align 8
-// CHECK-P8-NEXT:    [[TMP0:%.*]] = load i128*, i128** [[PTR_ADDR]], align 8
+// CHECK-P8-NEXT:    store i128* [[ST:%.*]], i128** [[ST_ADDR]], align 8
+// CHECK-P8-NEXT:    store i128* [[LD:%.*]], i128** [[LD_ADDR]], align 8
+// CHECK-P8-NEXT:    [[TMP0:%.*]] = load i128*, i128** [[LD_ADDR]], align 8
 // CHECK-P8-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-P8-NEXT:    store i128* [[TMP0]], i128** [[__PTR_ADDR_I]], align 8
 // CHECK-P8-NEXT:    [[TMP1:%.*]] = load i128*, i128** [[__PTR_ADDR_I]], align 8
@@ -420,7 +441,7 @@ void test8(vector double *c, double *ptr) {
 // CHECK-P8-NEXT:    store <1 x i128> [[TMP6]], <1 x i128>* [[TMP7]], align 16
 // CHECK-P8-NEXT:    [[TMP8:%.*]] = load <1 x i128>*, <1 x i128>** [[C_ADDR]], align 8
 // CHECK-P8-NEXT:    [[TMP9:%.*]] = load <1 x i128>, <1 x i128>* [[TMP8]], align 16
-// CHECK-P8-NEXT:    [[TMP10:%.*]] = load i128*, i128** [[PTR_ADDR]], align 8
+// CHECK-P8-NEXT:    [[TMP10:%.*]] = load i128*, i128** [[ST_ADDR]], align 8
 // CHECK-P8-NEXT:    store <1 x i128> [[TMP9]], <1 x i128>* [[__VEC_ADDR_I]], align 16
 // CHECK-P8-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-P8-NEXT:    store i128* [[TMP10]], i128** [[__PTR_ADDR_I2]], align 8
@@ -435,9 +456,10 @@ void test8(vector double *c, double *ptr) {
 // CHECK-P8-NEXT:    store <1 x i128> [[TMP14]], <1 x i128>* [[TMP16]], align 1
 // CHECK-P8-NEXT:    ret void
 //
-void test9(vector signed __int128 *c, signed __int128 *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test9(vector signed __int128 *c, signed __int128 *st,
+           const signed __int128 *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 
 // CHECK-P8-LABEL: @test10(
@@ -450,10 +472,12 @@ void test9(vector signed __int128 *c, signed __int128 *ptr) {
 // CHECK-P8-NEXT:    [[__PTR_ADDR_I:%.*]] = alloca i128*, align 8
 // CHECK-P8-NEXT:    [[__ADDR_I:%.*]] = alloca i8*, align 8
 // CHECK-P8-NEXT:    [[C_ADDR:%.*]] = alloca <1 x i128>*, align 8
-// CHECK-P8-NEXT:    [[PTR_ADDR:%.*]] = alloca i128*, align 8
+// CHECK-P8-NEXT:    [[ST_ADDR:%.*]] = alloca i128*, align 8
+// CHECK-P8-NEXT:    [[LD_ADDR:%.*]] = alloca i128*, align 8
 // CHECK-P8-NEXT:    store <1 x i128>* [[C:%.*]], <1 x i128>** [[C_ADDR]], align 8
-// CHECK-P8-NEXT:    store i128* [[PTR:%.*]], i128** [[PTR_ADDR]], align 8
-// CHECK-P8-NEXT:    [[TMP0:%.*]] = load i128*, i128** [[PTR_ADDR]], align 8
+// CHECK-P8-NEXT:    store i128* [[ST:%.*]], i128** [[ST_ADDR]], align 8
+// CHECK-P8-NEXT:    store i128* [[LD:%.*]], i128** [[LD_ADDR]], align 8
+// CHECK-P8-NEXT:    [[TMP0:%.*]] = load i128*, i128** [[LD_ADDR]], align 8
 // CHECK-P8-NEXT:    store i64 3, i64* [[__OFFSET_ADDR_I]], align 8
 // CHECK-P8-NEXT:    store i128* [[TMP0]], i128** [[__PTR_ADDR_I]], align 8
 // CHECK-P8-NEXT:    [[TMP1:%.*]] = load i128*, i128** [[__PTR_ADDR_I]], align 8
@@ -468,7 +492,7 @@ void test9(vector signed __int128 *c, signed __int128 *ptr) {
 // CHECK-P8-NEXT:    store <1 x i128> [[TMP6]], <1 x i128>* [[TMP7]], align 16
 // CHECK-P8-NEXT:    [[TMP8:%.*]] = load <1 x i128>*, <1 x i128>** [[C_ADDR]], align 8
 // CHECK-P8-NEXT:    [[TMP9:%.*]] = load <1 x i128>, <1 x i128>* [[TMP8]], align 16
-// CHECK-P8-NEXT:    [[TMP10:%.*]] = load i128*, i128** [[PTR_ADDR]], align 8
+// CHECK-P8-NEXT:    [[TMP10:%.*]] = load i128*, i128** [[ST_ADDR]], align 8
 // CHECK-P8-NEXT:    store <1 x i128> [[TMP9]], <1 x i128>* [[__VEC_ADDR_I]], align 16
 // CHECK-P8-NEXT:    store i64 7, i64* [[__OFFSET_ADDR_I1]], align 8
 // CHECK-P8-NEXT:    store i128* [[TMP10]], i128** [[__PTR_ADDR_I2]], align 8
@@ -483,8 +507,9 @@ void test9(vector signed __int128 *c, signed __int128 *ptr) {
 // CHECK-P8-NEXT:    store <1 x i128> [[TMP14]], <1 x i128>* [[TMP16]], align 1
 // CHECK-P8-NEXT:    ret void
 //
-void test10(vector unsigned __int128 *c, unsigned __int128 *ptr) {
-    *c = vec_xl(3ll, ptr);
-    vec_xst(*c, 7ll, ptr);
+void test10(vector unsigned __int128 *c, unsigned __int128 *st,
+            const unsigned __int128 *ld) {
+    *c = vec_xl(3ll, ld);
+    vec_xst(*c, 7ll, st);
 }
 #endif

From 2ffe0eed51af296a4cf6be73c1b514c91e722114 Mon Sep 17 00:00:00 2001
From: Serge Guelton 
Date: Thu, 25 Jun 2020 05:57:01 -0400
Subject: [PATCH 221/363] Provide anchor for compiler extensions

This patch is cherry-picked from 04b0a4e22e3b4549f9d241f8a9f37eebecb62a31, and
amended to prevent an undefined reference to `llvm::EnableABIBreakingChecks'

(cherry picked from commit 38778e1087b2825e91b07ce4570c70815b49dcdc)
---
 llvm/lib/Extensions/Extensions.cpp | 15 +++++++++++++++
 llvm/lib/Extensions/LLVMBuild.txt  |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Extensions/Extensions.cpp b/llvm/lib/Extensions/Extensions.cpp
index e69de29bb2d1..2fe537f91876 100644
--- a/llvm/lib/Extensions/Extensions.cpp
+++ b/llvm/lib/Extensions/Extensions.cpp
@@ -0,0 +1,15 @@
+#include "llvm/Passes/PassPlugin.h"
+#define HANDLE_EXTENSION(Ext)                                                  \
+		llvm::PassPluginLibraryInfo get##Ext##PluginInfo();
+#include "llvm/Support/Extension.def"
+
+
+namespace llvm {
+	namespace details {
+		void extensions_anchor() {
+#define HANDLE_EXTENSION(Ext)                                                  \
+			static auto Ext = get##Ext##PluginInfo();
+#include "llvm/Support/Extension.def"
+		}
+	}
+}
diff --git a/llvm/lib/Extensions/LLVMBuild.txt b/llvm/lib/Extensions/LLVMBuild.txt
index 2005830a4dd7..7a98c8f68051 100644
--- a/llvm/lib/Extensions/LLVMBuild.txt
+++ b/llvm/lib/Extensions/LLVMBuild.txt
@@ -18,4 +18,4 @@
 type = Library
 name = Extensions
 parent = Libraries
-required_libraries =
+required_libraries = Support

From 42ee33ca2bee10faedb6a02031c88bd6f70193f0 Mon Sep 17 00:00:00 2001
From: mydeveloperday 
Date: Tue, 8 Sep 2020 16:39:11 +0100
Subject: [PATCH 222/363] [clang-format] Handle shifts within conditions

In some situations shifts can be treated as a template, and are thus formatted as one. Doing a couple of extra checks to ensure that the condition does not contain a template, and is in fact a bit shift, should solve this problem.

This is a fix for [[ https://bugs.llvm.org/show_bug.cgi?id=46969 | bug 46969 ]]

Reviewed By: MyDeveloperDay

Patch By: Saldivarcher

Differential Revision: https://reviews.llvm.org/D86581

(cherry picked from commit c81dd3d159ab03d46e4280c458d3c29e56648218)
---
 clang/lib/Format/TokenAnnotator.cpp   | 20 +++++++++++++-------
 clang/unittests/Format/FormatTest.cpp | 15 +++++++++++++++
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 7f8e35126512..914c05f72aec 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -56,6 +56,13 @@ static bool isLambdaParameterList(const FormatToken *Left) {
          Left->Previous->MatchingParen->is(TT_LambdaLSquare);
 }
 
+/// Returns \c true if the token is followed by a boolean condition, \c false
+/// otherwise.
+static bool isKeywordWithCondition(const FormatToken &Tok) {
+  return Tok.isOneOf(tok::kw_if, tok::kw_for, tok::kw_while, tok::kw_switch,
+                     tok::kw_constexpr, tok::kw_catch);
+}
+
 /// A parser that gathers additional information about tokens.
 ///
 /// The \c TokenAnnotator tries to match parenthesis and square brakets and
@@ -108,6 +115,12 @@ class AnnotatingParser {
 
     while (CurrentToken) {
       if (CurrentToken->is(tok::greater)) {
+        // Try to do a better job at looking for ">>" within the condition of
+        // a statement.
+        if (CurrentToken->Next && CurrentToken->Next->is(tok::greater) &&
+            Left->ParentBracket != tok::less &&
+            isKeywordWithCondition(*Line.First))
+          return false;
         Left->MatchingParen = CurrentToken;
         CurrentToken->MatchingParen = Left;
         // In TT_Proto, we must distignuish between:
@@ -2733,13 +2746,6 @@ bool TokenAnnotator::spaceRequiredBeforeParens(const FormatToken &Right) const {
           Right.ParameterCount > 0);
 }
 
-/// Returns \c true if the token is followed by a boolean condition, \c false
-/// otherwise.
-static bool isKeywordWithCondition(const FormatToken &Tok) {
-  return Tok.isOneOf(tok::kw_if, tok::kw_for, tok::kw_while, tok::kw_switch,
-                     tok::kw_constexpr, tok::kw_catch);
-}
-
 bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
                                           const FormatToken &Left,
                                           const FormatToken &Right) {
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 6ac3ffbffd1c..17d302f0b659 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -7508,6 +7508,21 @@ TEST_F(FormatTest, UnderstandsTemplateParameters) {
   verifyFormat("static_assert(is_convertible::value, \"AAA\");");
   verifyFormat("Constructor(A... a) : a_(X{std::forward(a)}...) {}");
   verifyFormat("< < < < < < < < < < < < < < < < < < < < < < < < < < < < < <");
+  verifyFormat("some_templated_type");
+}
+
+TEST_F(FormatTest, UnderstandsShiftOperators) {
+  verifyFormat("if (i < x >> 1)");
+  verifyFormat("while (i < x >> 1)");
+  verifyFormat("for (unsigned i = 0; i < i; ++i, v = v >> 1)");
+  verifyFormat("for (unsigned i = 0; i < x >> 1; ++i, v = v >> 1)");
+  verifyFormat(
+      "for (std::vector::iterator i = 0; i < x >> 1; ++i, v = v >> 1)");
+  verifyFormat("Foo.call>()");
+  verifyFormat("if (Foo.call>() == 0)");
+  verifyFormat("for (std::vector>::iterator i = 0; i < x >> 1; "
+               "++i, v = v >> 1)");
+  verifyFormat("if (w>, 1>::t)");
 }
 
 TEST_F(FormatTest, BitshiftOperatorWidth) {

From 6f1dbbc17c03206040eeaaee71e5db961f2cac30 Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Tue, 8 Sep 2020 10:49:32 -0700
Subject: [PATCH 223/363] [X86] SSE4_A should only imply SSE3 not SSSE3 in the
 frontend.

SSE4_1 and SSE4_2 do imply SSSE3. So I guess I got confused when
switching the code to being table based in D83273.

Fixes PR47464

(cherry picked from commit e6bb4c8e7b3e27f214c9665763a2dd09aa96a5ac)
---
 clang/test/Preprocessor/predefined-arch-macros.c | 2 ++
 llvm/lib/Support/X86TargetParser.cpp             | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index abab9274ffbb..4dc9a800956e 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2386,6 +2386,7 @@
 // CHECK_AMDFAM10_M32: #define __SSE4A__ 1
 // CHECK_AMDFAM10_M32: #define __SSE_MATH__ 1
 // CHECK_AMDFAM10_M32: #define __SSE__ 1
+// CHECK_AMDFAM10_M32-NOT: #define __SSSE3__ 1
 // CHECK_AMDFAM10_M32: #define __amdfam10 1
 // CHECK_AMDFAM10_M32: #define __amdfam10__ 1
 // CHECK_AMDFAM10_M32: #define __i386 1
@@ -2408,6 +2409,7 @@
 // CHECK_AMDFAM10_M64: #define __SSE4A__ 1
 // CHECK_AMDFAM10_M64: #define __SSE_MATH__ 1
 // CHECK_AMDFAM10_M64: #define __SSE__ 1
+// CHECK_AMDFAM10_M64-NOT: #define __SSSE3__ 1
 // CHECK_AMDFAM10_M64: #define __amd64 1
 // CHECK_AMDFAM10_M64: #define __amd64__ 1
 // CHECK_AMDFAM10_M64: #define __amdfam10 1
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
index c629f872df12..4c2d4efbfca8 100644
--- a/llvm/lib/Support/X86TargetParser.cpp
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -522,7 +522,7 @@ static constexpr FeatureBitset ImpliedFeaturesAVX5124FMAPS = {};
 static constexpr FeatureBitset ImpliedFeaturesAVX5124VNNIW = {};
 
 // SSE4_A->FMA4->XOP chain.
-static constexpr FeatureBitset ImpliedFeaturesSSE4_A = FeatureSSSE3;
+static constexpr FeatureBitset ImpliedFeaturesSSE4_A = FeatureSSE3;
 static constexpr FeatureBitset ImpliedFeaturesFMA4 = FeatureAVX | FeatureSSE4_A;
 static constexpr FeatureBitset ImpliedFeaturesXOP = FeatureFMA4;
 

From d1cdc6da27a5937c239791c056eb2a754d7f4747 Mon Sep 17 00:00:00 2001
From: Brad Smith 
Date: Tue, 8 Sep 2020 21:21:14 -0400
Subject: [PATCH 224/363] [PowerPC] Set setMaxAtomicSizeInBitsSupported
 appropriately for 32-bit PowerPC in PPCTargetLowering

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D86165

(cherry picked from commit 88b368a1c47bca536f03041f7464235b94ea98a1)
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp  |   3 +
 llvm/test/CodeGen/PowerPC/atomics-indexed.ll | 140 ++++--
 llvm/test/CodeGen/PowerPC/atomics.ll         | 437 ++++++++++++++++---
 3 files changed, 503 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2d0b17115249..f54f1673526d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1260,6 +1260,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setLibcallName(RTLIB::SRA_I128, nullptr);
   }
 
+  if (!isPPC64)
+    setMaxAtomicSizeInBitsSupported(32);
+
   setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
 
   // We have target-specific dag combine patterns for the following nodes:
diff --git a/llvm/test/CodeGen/PowerPC/atomics-indexed.ll b/llvm/test/CodeGen/PowerPC/atomics-indexed.ll
index b4790adfd908..cf7225a5fc20 100644
--- a/llvm/test/CodeGen/PowerPC/atomics-indexed.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-indexed.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
 ; FIXME: -verify-machineinstrs currently fail on ppc64 (mismatched register/instruction).
 ; This is already checked for in Atomics-64.ll
@@ -8,9 +9,25 @@
 
 ; Indexed version of loads
 define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
-; CHECK-LABEL: load_x_i8_seq_cst
-; CHECK: sync
-; CHECK: lbzx [[VAL:r[0-9]+]]
+; PPC32-LABEL: load_x_i8_seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    lis r4, 1
+; PPC32-NEXT:    sync
+; PPC32-NEXT:    ori r4, r4, 24464
+; PPC32-NEXT:    lbzx r3, r3, r4
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_x_i8_seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    lis r4, 1
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    ori r4, r4, 24464
+; PPC64-NEXT:    lbzx r3, r3, r4
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    blr
 ; CHECK-PPC32: lwsync
 ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
 ; CHECK-PPC64: bne- [[CR]], .+4
@@ -20,8 +37,23 @@ define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
   ret i8 %val
 }
 define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
-; CHECK-LABEL: load_x_i16_acquire
-; CHECK: lhzx [[VAL:r[0-9]+]]
+; PPC32-LABEL: load_x_i16_acquire:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    lis r4, 2
+; PPC32-NEXT:    ori r4, r4, 48928
+; PPC32-NEXT:    lhzx r3, r3, r4
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_x_i16_acquire:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    lis r4, 2
+; PPC64-NEXT:    ori r4, r4, 48928
+; PPC64-NEXT:    lhzx r3, r3, r4
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    blr
 ; CHECK-PPC32: lwsync
 ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
 ; CHECK-PPC64: bne- [[CR]], .+4
@@ -31,19 +63,39 @@ define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
   ret i16 %val
 }
 define i32 @load_x_i32_monotonic([100000 x i32]* %mem) {
-; CHECK-LABEL: load_x_i32_monotonic
-; CHECK: lwzx
-; CHECK-NOT: sync
+; CHECK-LABEL: load_x_i32_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lis r4, 5
+; CHECK-NEXT:    ori r4, r4, 32320
+; CHECK-NEXT:    lwzx r3, r3, r4
+; CHECK-NEXT:    blr
   %ptr = getelementptr inbounds [100000 x i32], [100000 x i32]* %mem, i64 0, i64 90000
   %val = load atomic i32, i32* %ptr monotonic, align 4
   ret i32 %val
 }
 define i64 @load_x_i64_unordered([100000 x i64]* %mem) {
-; CHECK-LABEL: load_x_i64_unordered
-; PPC32: __sync_
-; PPC64-NOT: __sync_
-; PPC64: ldx
-; CHECK-NOT: sync
+; PPC32-LABEL: load_x_i64_unordered:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stw r0, 4(r1)
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    addi r3, r3, -896
+; PPC32-NEXT:    addis r3, r3, 11
+; PPC32-NEXT:    li r4, 0
+; PPC32-NEXT:    bl __atomic_load_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_x_i64_unordered:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    lis r4, 10
+; PPC64-NEXT:    ori r4, r4, 64640
+; PPC64-NEXT:    ldx r3, r3, r4
+; PPC64-NEXT:    blr
   %ptr = getelementptr inbounds [100000 x i64], [100000 x i64]* %mem, i64 0, i64 90000
   %val = load atomic i64, i64* %ptr unordered, align 8
   ret i64 %val
@@ -51,35 +103,69 @@ define i64 @load_x_i64_unordered([100000 x i64]* %mem) {
 
 ; Indexed version of stores
 define void @store_x_i8_seq_cst([100000 x i8]* %mem) {
-; CHECK-LABEL: store_x_i8_seq_cst
-; CHECK: sync
-; CHECK: stbx
+; CHECK-LABEL: store_x_i8_seq_cst:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lis r4, 1
+; CHECK-NEXT:    ori r4, r4, 24464
+; CHECK-NEXT:    li r5, 42
+; CHECK-NEXT:    sync
+; CHECK-NEXT:    stbx r5, r3, r4
+; CHECK-NEXT:    blr
   %ptr = getelementptr inbounds [100000 x i8], [100000 x i8]* %mem, i64 0, i64 90000
   store atomic i8 42, i8* %ptr seq_cst, align 1
   ret void
 }
 define void @store_x_i16_release([100000 x i16]* %mem) {
-; CHECK-LABEL: store_x_i16_release
-; CHECK: lwsync
-; CHECK: sthx
+; CHECK-LABEL: store_x_i16_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lis r4, 2
+; CHECK-NEXT:    ori r4, r4, 48928
+; CHECK-NEXT:    li r5, 42
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    sthx r5, r3, r4
+; CHECK-NEXT:    blr
   %ptr = getelementptr inbounds [100000 x i16], [100000 x i16]* %mem, i64 0, i64 90000
   store atomic i16 42, i16* %ptr release, align 2
   ret void
 }
 define void @store_x_i32_monotonic([100000 x i32]* %mem) {
-; CHECK-LABEL: store_x_i32_monotonic
-; CHECK-NOT: sync
-; CHECK: stwx
+; CHECK-LABEL: store_x_i32_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lis r4, 5
+; CHECK-NEXT:    ori r4, r4, 32320
+; CHECK-NEXT:    li r5, 42
+; CHECK-NEXT:    stwx r5, r3, r4
+; CHECK-NEXT:    blr
   %ptr = getelementptr inbounds [100000 x i32], [100000 x i32]* %mem, i64 0, i64 90000
   store atomic i32 42, i32* %ptr monotonic, align 4
   ret void
 }
 define void @store_x_i64_unordered([100000 x i64]* %mem) {
-; CHECK-LABEL: store_x_i64_unordered
-; CHECK-NOT: sync
-; PPC32: __sync_
-; PPC64-NOT: __sync_
-; PPC64: stdx
+; PPC32-LABEL: store_x_i64_unordered:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stw r0, 4(r1)
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    addi r3, r3, -896
+; PPC32-NEXT:    addis r3, r3, 11
+; PPC32-NEXT:    li r5, 0
+; PPC32-NEXT:    li r6, 42
+; PPC32-NEXT:    li r7, 0
+; PPC32-NEXT:    bl __atomic_store_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_x_i64_unordered:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    lis r4, 10
+; PPC64-NEXT:    ori r4, r4, 64640
+; PPC64-NEXT:    li r5, 42
+; PPC64-NEXT:    stdx r5, r3, r4
+; PPC64-NEXT:    blr
   %ptr = getelementptr inbounds [100000 x i64], [100000 x i64]* %mem, i64 0, i64 90000
   store atomic i64 42, i64* %ptr unordered, align 8
   ret void
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index c964218cb60b..008cd4c7157c 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs  -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
 ; This is already checked for in Atomics-64.ll
 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu  -ppc-asm-full-reg-names | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
@@ -9,22 +10,35 @@
 ; We first check loads, for all sizes from i8 to i64.
 ; We also vary orderings to check for barriers.
 define i8 @load_i8_unordered(i8* %mem) {
-; CHECK-LABEL: load_i8_unordered
-; CHECK: lbz
-; CHECK-NOT: sync
+; CHECK-LABEL: load_i8_unordered:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lbz r3, 0(r3)
+; CHECK-NEXT:    blr
   %val = load atomic i8, i8* %mem unordered, align 1
   ret i8 %val
 }
 define i16 @load_i16_monotonic(i16* %mem) {
-; CHECK-LABEL: load_i16_monotonic
-; CHECK: lhz
-; CHECK-NOT: sync
+; CHECK-LABEL: load_i16_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lhz r3, 0(r3)
+; CHECK-NEXT:    blr
   %val = load atomic i16, i16* %mem monotonic, align 2
   ret i16 %val
 }
 define i32 @load_i32_acquire(i32* %mem) {
-; CHECK-LABEL: load_i32_acquire
-; CHECK: lwz [[VAL:r[0-9]+]]
+; PPC32-LABEL: load_i32_acquire:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    lwz r3, 0(r3)
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_i32_acquire:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    lwz r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    blr
   %val = load atomic i32, i32* %mem acquire, align 4
 ; CHECK-PPC32: lwsync
 ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
@@ -33,11 +47,28 @@ define i32 @load_i32_acquire(i32* %mem) {
   ret i32 %val
 }
 define i64 @load_i64_seq_cst(i64* %mem) {
-; CHECK-LABEL: load_i64_seq_cst
-; CHECK: sync
-; PPC32: __sync_
-; PPC64-NOT: __sync_
-; PPC64: ld [[VAL:r[0-9]+]]
+; PPC32-LABEL: load_i64_seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stw r0, 4(r1)
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    li r4, 5
+; PPC32-NEXT:    bl __atomic_load_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: load_i64_seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    ld r3, 0(r3)
+; PPC64-NEXT:    cmpd cr7, r3, r3
+; PPC64-NEXT:    bne- cr7, .+4
+; PPC64-NEXT:    isync
+; PPC64-NEXT:    blr
   %val = load atomic i64, i64* %mem seq_cst, align 8
 ; CHECK-PPC32: lwsync
 ; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
@@ -48,95 +79,401 @@ define i64 @load_i64_seq_cst(i64* %mem) {
 
 ; Stores
 define void @store_i8_unordered(i8* %mem) {
-; CHECK-LABEL: store_i8_unordered
-; CHECK-NOT: sync
-; CHECK: stb
+; CHECK-LABEL: store_i8_unordered:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li r4, 42
+; CHECK-NEXT:    stb r4, 0(r3)
+; CHECK-NEXT:    blr
   store atomic i8 42, i8* %mem unordered, align 1
   ret void
 }
 define void @store_i16_monotonic(i16* %mem) {
-; CHECK-LABEL: store_i16_monotonic
-; CHECK-NOT: sync
-; CHECK: sth
+; CHECK-LABEL: store_i16_monotonic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li r4, 42
+; CHECK-NEXT:    sth r4, 0(r3)
+; CHECK-NEXT:    blr
   store atomic i16 42, i16* %mem monotonic, align 2
   ret void
 }
 define void @store_i32_release(i32* %mem) {
-; CHECK-LABEL: store_i32_release
-; CHECK: lwsync
-; CHECK: stw
+; CHECK-LABEL: store_i32_release:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li r4, 42
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    stw r4, 0(r3)
+; CHECK-NEXT:    blr
   store atomic i32 42, i32* %mem release, align 4
   ret void
 }
 define void @store_i64_seq_cst(i64* %mem) {
-; CHECK-LABEL: store_i64_seq_cst
-; CHECK: sync
-; PPC32: __sync_
-; PPC64-NOT: __sync_
-; PPC64: std
+; PPC32-LABEL: store_i64_seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stw r0, 4(r1)
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    li r5, 0
+; PPC32-NEXT:    li r6, 42
+; PPC32-NEXT:    li r7, 5
+; PPC32-NEXT:    bl __atomic_store_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: store_i64_seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    li r4, 42
+; PPC64-NEXT:    sync
+; PPC64-NEXT:    std r4, 0(r3)
+; PPC64-NEXT:    blr
   store atomic i64 42, i64* %mem seq_cst, align 8
   ret void
 }
 
 ; Atomic CmpXchg
 define i8 @cas_strong_i8_sc_sc(i8* %mem) {
-; CHECK-LABEL: cas_strong_i8_sc_sc
-; CHECK: sync
+; PPC32-LABEL: cas_strong_i8_sc_sc:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    rlwinm r8, r3, 3, 27, 28
+; PPC32-NEXT:    li r5, 1
+; PPC32-NEXT:    li r6, 0
+; PPC32-NEXT:    li r7, 255
+; PPC32-NEXT:    rlwinm r4, r3, 0, 0, 29
+; PPC32-NEXT:    xori r3, r8, 24
+; PPC32-NEXT:    slw r5, r5, r3
+; PPC32-NEXT:    slw r8, r6, r3
+; PPC32-NEXT:    slw r6, r7, r3
+; PPC32-NEXT:    and r7, r5, r6
+; PPC32-NEXT:    and r8, r8, r6
+; PPC32-NEXT:    sync
+; PPC32-NEXT:  .LBB8_1:
+; PPC32-NEXT:    lwarx r9, 0, r4
+; PPC32-NEXT:    and r5, r9, r6
+; PPC32-NEXT:    cmpw r5, r8
+; PPC32-NEXT:    bne cr0, .LBB8_3
+; PPC32-NEXT:  # %bb.2:
+; PPC32-NEXT:    andc r9, r9, r6
+; PPC32-NEXT:    or r9, r9, r7
+; PPC32-NEXT:    stwcx. r9, 0, r4
+; PPC32-NEXT:    bne cr0, .LBB8_1
+; PPC32-NEXT:    b .LBB8_4
+; PPC32-NEXT:  .LBB8_3:
+; PPC32-NEXT:    stwcx. r9, 0, r4
+; PPC32-NEXT:  .LBB8_4:
+; PPC32-NEXT:    srw r3, r5, r3
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: cas_strong_i8_sc_sc:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    rlwinm r8, r3, 3, 27, 28
+; PPC64-NEXT:    li r5, 1
+; PPC64-NEXT:    li r6, 0
+; PPC64-NEXT:    li r7, 255
+; PPC64-NEXT:    rldicr r4, r3, 0, 61
+; PPC64-NEXT:    xori r3, r8, 24
+; PPC64-NEXT:    slw r5, r5, r3
+; PPC64-NEXT:    slw r8, r6, r3
+; PPC64-NEXT:    slw r6, r7, r3
+; PPC64-NEXT:    and r7, r5, r6
+; PPC64-NEXT:    and r8, r8, r6
+; PPC64-NEXT:    sync
+; PPC64-NEXT:  .LBB8_1:
+; PPC64-NEXT:    lwarx r9, 0, r4
+; PPC64-NEXT:    and r5, r9, r6
+; PPC64-NEXT:    cmpw r5, r8
+; PPC64-NEXT:    bne cr0, .LBB8_3
+; PPC64-NEXT:  # %bb.2:
+; PPC64-NEXT:    andc r9, r9, r6
+; PPC64-NEXT:    or r9, r9, r7
+; PPC64-NEXT:    stwcx. r9, 0, r4
+; PPC64-NEXT:    bne cr0, .LBB8_1
+; PPC64-NEXT:    b .LBB8_4
+; PPC64-NEXT:  .LBB8_3:
+; PPC64-NEXT:    stwcx. r9, 0, r4
+; PPC64-NEXT:  .LBB8_4:
+; PPC64-NEXT:    srw r3, r5, r3
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:    blr
   %val = cmpxchg i8* %mem, i8 0, i8 1 seq_cst seq_cst
-; CHECK: lwsync
   %loaded = extractvalue { i8, i1} %val, 0
   ret i8 %loaded
 }
 define i16 @cas_weak_i16_acquire_acquire(i16* %mem) {
-; CHECK-LABEL: cas_weak_i16_acquire_acquire
-;CHECK-NOT: sync
+; PPC32-LABEL: cas_weak_i16_acquire_acquire:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    li r6, 0
+; PPC32-NEXT:    rlwinm r4, r3, 3, 27, 27
+; PPC32-NEXT:    li r5, 1
+; PPC32-NEXT:    ori r7, r6, 65535
+; PPC32-NEXT:    xori r4, r4, 16
+; PPC32-NEXT:    slw r8, r5, r4
+; PPC32-NEXT:    slw r9, r6, r4
+; PPC32-NEXT:    slw r5, r7, r4
+; PPC32-NEXT:    rlwinm r3, r3, 0, 0, 29
+; PPC32-NEXT:    and r6, r8, r5
+; PPC32-NEXT:    and r8, r9, r5
+; PPC32-NEXT:  .LBB9_1:
+; PPC32-NEXT:    lwarx r9, 0, r3
+; PPC32-NEXT:    and r7, r9, r5
+; PPC32-NEXT:    cmpw r7, r8
+; PPC32-NEXT:    bne cr0, .LBB9_3
+; PPC32-NEXT:  # %bb.2:
+; PPC32-NEXT:    andc r9, r9, r5
+; PPC32-NEXT:    or r9, r9, r6
+; PPC32-NEXT:    stwcx. r9, 0, r3
+; PPC32-NEXT:    bne cr0, .LBB9_1
+; PPC32-NEXT:    b .LBB9_4
+; PPC32-NEXT:  .LBB9_3:
+; PPC32-NEXT:    stwcx. r9, 0, r3
+; PPC32-NEXT:  .LBB9_4:
+; PPC32-NEXT:    srw r3, r7, r4
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: cas_weak_i16_acquire_acquire:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    li r6, 0
+; PPC64-NEXT:    rlwinm r4, r3, 3, 27, 27
+; PPC64-NEXT:    li r5, 1
+; PPC64-NEXT:    ori r7, r6, 65535
+; PPC64-NEXT:    xori r4, r4, 16
+; PPC64-NEXT:    slw r8, r5, r4
+; PPC64-NEXT:    slw r9, r6, r4
+; PPC64-NEXT:    slw r5, r7, r4
+; PPC64-NEXT:    rldicr r3, r3, 0, 61
+; PPC64-NEXT:    and r6, r8, r5
+; PPC64-NEXT:    and r8, r9, r5
+; PPC64-NEXT:  .LBB9_1:
+; PPC64-NEXT:    lwarx r9, 0, r3
+; PPC64-NEXT:    and r7, r9, r5
+; PPC64-NEXT:    cmpw r7, r8
+; PPC64-NEXT:    bne cr0, .LBB9_3
+; PPC64-NEXT:  # %bb.2:
+; PPC64-NEXT:    andc r9, r9, r5
+; PPC64-NEXT:    or r9, r9, r6
+; PPC64-NEXT:    stwcx. r9, 0, r3
+; PPC64-NEXT:    bne cr0, .LBB9_1
+; PPC64-NEXT:    b .LBB9_4
+; PPC64-NEXT:  .LBB9_3:
+; PPC64-NEXT:    stwcx. r9, 0, r3
+; PPC64-NEXT:  .LBB9_4:
+; PPC64-NEXT:    srw r3, r7, r4
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:    blr
   %val = cmpxchg weak i16* %mem, i16 0, i16 1 acquire acquire
-; CHECK: lwsync
   %loaded = extractvalue { i16, i1} %val, 0
   ret i16 %loaded
 }
 define i32 @cas_strong_i32_acqrel_acquire(i32* %mem) {
-; CHECK-LABEL: cas_strong_i32_acqrel_acquire
-; CHECK: lwsync
+; CHECK-LABEL: cas_strong_i32_acqrel_acquire:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li r5, 1
+; CHECK-NEXT:    li r6, 0
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:  .LBB10_1:
+; CHECK-NEXT:    lwarx r4, 0, r3
+; CHECK-NEXT:    cmpw r6, r4
+; CHECK-NEXT:    bne cr0, .LBB10_3
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    stwcx. r5, 0, r3
+; CHECK-NEXT:    bne cr0, .LBB10_1
+; CHECK-NEXT:    b .LBB10_4
+; CHECK-NEXT:  .LBB10_3:
+; CHECK-NEXT:    stwcx. r4, 0, r3
+; CHECK-NEXT:  .LBB10_4:
+; CHECK-NEXT:    mr r3, r4
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    blr
   %val = cmpxchg i32* %mem, i32 0, i32 1 acq_rel acquire
-; CHECK: lwsync
   %loaded = extractvalue { i32, i1} %val, 0
   ret i32 %loaded
 }
 define i64 @cas_weak_i64_release_monotonic(i64* %mem) {
-; CHECK-LABEL: cas_weak_i64_release_monotonic
-; CHECK: lwsync
+; PPC32-LABEL: cas_weak_i64_release_monotonic:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stw r0, 4(r1)
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    li r4, 0
+; PPC32-NEXT:    stw r4, 12(r1)
+; PPC32-NEXT:    li r5, 0
+; PPC32-NEXT:    stw r4, 8(r1)
+; PPC32-NEXT:    addi r4, r1, 8
+; PPC32-NEXT:    li r6, 1
+; PPC32-NEXT:    li r7, 3
+; PPC32-NEXT:    li r8, 0
+; PPC32-NEXT:    bl __atomic_compare_exchange_8
+; PPC32-NEXT:    lwz r4, 12(r1)
+; PPC32-NEXT:    lwz r3, 8(r1)
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: cas_weak_i64_release_monotonic:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    li r5, 1
+; PPC64-NEXT:    li r6, 0
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:  .LBB11_1:
+; PPC64-NEXT:    ldarx r4, 0, r3
+; PPC64-NEXT:    cmpd r6, r4
+; PPC64-NEXT:    bne cr0, .LBB11_4
+; PPC64-NEXT:  # %bb.2:
+; PPC64-NEXT:    stdcx. r5, 0, r3
+; PPC64-NEXT:    bne cr0, .LBB11_1
+; PPC64-NEXT:  # %bb.3:
+; PPC64-NEXT:    mr r3, r4
+; PPC64-NEXT:    blr
+; PPC64-NEXT:  .LBB11_4:
+; PPC64-NEXT:    stdcx. r4, 0, r3
+; PPC64-NEXT:    mr r3, r4
+; PPC64-NEXT:    blr
   %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic
-; CHECK-NOT: [sync ]
   %loaded = extractvalue { i64, i1} %val, 0
   ret i64 %loaded
 }
 
 ; AtomicRMW
 define i8 @add_i8_monotonic(i8* %mem, i8 %operand) {
-; CHECK-LABEL: add_i8_monotonic
-; CHECK-NOT: sync
+; PPC32-LABEL: add_i8_monotonic:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    rlwinm r7, r3, 3, 27, 28
+; PPC32-NEXT:    li r6, 255
+; PPC32-NEXT:    rlwinm r5, r3, 0, 0, 29
+; PPC32-NEXT:    xori r3, r7, 24
+; PPC32-NEXT:    slw r4, r4, r3
+; PPC32-NEXT:    slw r6, r6, r3
+; PPC32-NEXT:  .LBB12_1:
+; PPC32-NEXT:    lwarx r7, 0, r5
+; PPC32-NEXT:    add r8, r4, r7
+; PPC32-NEXT:    andc r9, r7, r6
+; PPC32-NEXT:    and r8, r8, r6
+; PPC32-NEXT:    or r8, r8, r9
+; PPC32-NEXT:    stwcx. r8, 0, r5
+; PPC32-NEXT:    bne cr0, .LBB12_1
+; PPC32-NEXT:  # %bb.2:
+; PPC32-NEXT:    srw r3, r7, r3
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: add_i8_monotonic:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    rlwinm r7, r3, 3, 27, 28
+; PPC64-NEXT:    li r6, 255
+; PPC64-NEXT:    rldicr r5, r3, 0, 61
+; PPC64-NEXT:    xori r3, r7, 24
+; PPC64-NEXT:    slw r4, r4, r3
+; PPC64-NEXT:    slw r6, r6, r3
+; PPC64-NEXT:  .LBB12_1:
+; PPC64-NEXT:    lwarx r7, 0, r5
+; PPC64-NEXT:    add r8, r4, r7
+; PPC64-NEXT:    andc r9, r7, r6
+; PPC64-NEXT:    and r8, r8, r6
+; PPC64-NEXT:    or r8, r8, r9
+; PPC64-NEXT:    stwcx. r8, 0, r5
+; PPC64-NEXT:    bne cr0, .LBB12_1
+; PPC64-NEXT:  # %bb.2:
+; PPC64-NEXT:    srw r3, r7, r3
+; PPC64-NEXT:    blr
   %val = atomicrmw add i8* %mem, i8 %operand monotonic
   ret i8 %val
 }
 define i16 @xor_i16_seq_cst(i16* %mem, i16 %operand) {
-; CHECK-LABEL: xor_i16_seq_cst
-; CHECK: sync
+; PPC32-LABEL: xor_i16_seq_cst:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    li r6, 0
+; PPC32-NEXT:    rlwinm r7, r3, 3, 27, 27
+; PPC32-NEXT:    rlwinm r5, r3, 0, 0, 29
+; PPC32-NEXT:    ori r6, r6, 65535
+; PPC32-NEXT:    xori r3, r7, 16
+; PPC32-NEXT:    slw r4, r4, r3
+; PPC32-NEXT:    slw r6, r6, r3
+; PPC32-NEXT:    sync
+; PPC32-NEXT:  .LBB13_1:
+; PPC32-NEXT:    lwarx r7, 0, r5
+; PPC32-NEXT:    xor r8, r4, r7
+; PPC32-NEXT:    andc r9, r7, r6
+; PPC32-NEXT:    and r8, r8, r6
+; PPC32-NEXT:    or r8, r8, r9
+; PPC32-NEXT:    stwcx. r8, 0, r5
+; PPC32-NEXT:    bne cr0, .LBB13_1
+; PPC32-NEXT:  # %bb.2:
+; PPC32-NEXT:    srw r3, r7, r3
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: xor_i16_seq_cst:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    li r6, 0
+; PPC64-NEXT:    rlwinm r7, r3, 3, 27, 27
+; PPC64-NEXT:    rldicr r5, r3, 0, 61
+; PPC64-NEXT:    ori r6, r6, 65535
+; PPC64-NEXT:    xori r3, r7, 16
+; PPC64-NEXT:    slw r4, r4, r3
+; PPC64-NEXT:    slw r6, r6, r3
+; PPC64-NEXT:    sync
+; PPC64-NEXT:  .LBB13_1:
+; PPC64-NEXT:    lwarx r7, 0, r5
+; PPC64-NEXT:    xor r8, r4, r7
+; PPC64-NEXT:    andc r9, r7, r6
+; PPC64-NEXT:    and r8, r8, r6
+; PPC64-NEXT:    or r8, r8, r9
+; PPC64-NEXT:    stwcx. r8, 0, r5
+; PPC64-NEXT:    bne cr0, .LBB13_1
+; PPC64-NEXT:  # %bb.2:
+; PPC64-NEXT:    srw r3, r7, r3
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:    blr
   %val = atomicrmw xor i16* %mem, i16 %operand seq_cst
-; CHECK: lwsync
   ret i16 %val
 }
 define i32 @xchg_i32_acq_rel(i32* %mem, i32 %operand) {
-; CHECK-LABEL: xchg_i32_acq_rel
-; CHECK: lwsync
+; CHECK-LABEL: xchg_i32_acq_rel:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:  .LBB14_1:
+; CHECK-NEXT:    lwarx r5, 0, r3
+; CHECK-NEXT:    stwcx. r4, 0, r3
+; CHECK-NEXT:    bne cr0, .LBB14_1
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    mr r3, r5
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    blr
   %val = atomicrmw xchg i32* %mem, i32 %operand acq_rel
-; CHECK: lwsync
   ret i32 %val
 }
 define i64 @and_i64_release(i64* %mem, i64 %operand) {
-; CHECK-LABEL: and_i64_release
-; CHECK: lwsync
+; PPC32-LABEL: and_i64_release:
+; PPC32:       # %bb.0:
+; PPC32-NEXT:    mflr r0
+; PPC32-NEXT:    stw r0, 4(r1)
+; PPC32-NEXT:    stwu r1, -16(r1)
+; PPC32-NEXT:    .cfi_def_cfa_offset 16
+; PPC32-NEXT:    .cfi_offset lr, 4
+; PPC32-NEXT:    li r7, 3
+; PPC32-NEXT:    bl __atomic_fetch_and_8
+; PPC32-NEXT:    lwz r0, 20(r1)
+; PPC32-NEXT:    addi r1, r1, 16
+; PPC32-NEXT:    mtlr r0
+; PPC32-NEXT:    blr
+;
+; PPC64-LABEL: and_i64_release:
+; PPC64:       # %bb.0:
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:  .LBB15_1:
+; PPC64-NEXT:    ldarx r5, 0, r3
+; PPC64-NEXT:    and r6, r4, r5
+; PPC64-NEXT:    stdcx. r6, 0, r3
+; PPC64-NEXT:    bne cr0, .LBB15_1
+; PPC64-NEXT:  # %bb.2:
+; PPC64-NEXT:    mr r3, r5
+; PPC64-NEXT:    blr
   %val = atomicrmw and i64* %mem, i64 %operand release
-; CHECK-NOT: [sync ]
   ret i64 %val
 }

From 8ae3293030d9691ebe0006b79cec1b06bb8015cf Mon Sep 17 00:00:00 2001
From: Sourabh Singh Tomar 
Date: Thu, 10 Sep 2020 23:04:37 +0530
Subject: [PATCH 225/363] Revert D86875 "[Flang][NFC] Remove license comments
 from files in docs/ folder."

This reverts commit f787c9a90c69f; it was causing some build issues.

(cherry picked from commit 932aae77e92b08e63c0225b6eb37dfa80b310313)
---
 flang/docs/ArrayComposition.md           | 8 ++++++++
 flang/docs/C++17.md                      | 8 ++++++++
 flang/docs/C++style.md                   | 8 ++++++++
 flang/docs/Calls.md                      | 8 ++++++++
 flang/docs/Character.md                  | 8 ++++++++
 flang/docs/ControlFlowGraph.md           | 8 ++++++++
 flang/docs/Directives.md                 | 8 ++++++++
 flang/docs/Extensions.md                 | 8 ++++++++
 flang/docs/FortranForCProgrammers.md     | 8 ++++++++
 flang/docs/FortranIR.md                  | 8 ++++++++
 flang/docs/IORuntimeInternals.md         | 8 ++++++++
 flang/docs/ImplementingASemanticCheck.md | 8 ++++++++
 flang/docs/Intrinsics.md                 | 8 ++++++++
 flang/docs/LabelResolution.md            | 8 ++++++++
 flang/docs/ModFiles.md                   | 8 ++++++++
 flang/docs/OpenMP-semantics.md           | 8 ++++++++
 flang/docs/OptionComparison.md           | 8 ++++++++
 flang/docs/Overview.md                   | 8 ++++++++
 flang/docs/ParserCombinators.md          | 8 ++++++++
 flang/docs/Parsing.md                    | 8 ++++++++
 flang/docs/Preprocessing.md              | 8 ++++++++
 flang/docs/PullRequestChecklist.md       | 8 ++++++++
 flang/docs/RuntimeDescriptor.md          | 8 ++++++++
 flang/docs/Semantics.md                  | 8 ++++++++
 24 files changed, 192 insertions(+)

diff --git a/flang/docs/ArrayComposition.md b/flang/docs/ArrayComposition.md
index 18194caadf09..0f30af39f9e4 100644
--- a/flang/docs/ArrayComposition.md
+++ b/flang/docs/ArrayComposition.md
@@ -1,3 +1,11 @@
+
+
 This note attempts to describe the motivation for and design of an
 implementation of Fortran 90 (and later) array expression evaluation that
 minimizes the use of dynamically allocated temporary storage for
diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md
index ea8395cfdedc..87d5fc01f092 100644
--- a/flang/docs/C++17.md
+++ b/flang/docs/C++17.md
@@ -1,3 +1,11 @@
+
+
 ## C++14/17 features used in f18
 
 The C++ dialect used in this project constitutes a subset of the
diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md
index 77e0a0463823..4ab95393d758 100644
--- a/flang/docs/C++style.md
+++ b/flang/docs/C++style.md
@@ -1,3 +1,11 @@
+
+
 ## In brief:
 * Use *clang-format*
 from llvm 7
diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md
index 8a4d65820d19..d70bc910d73d 100644
--- a/flang/docs/Calls.md
+++ b/flang/docs/Calls.md
@@ -1,3 +1,11 @@
+
+
 ## Procedure reference implementation protocol
 
 Fortran function and subroutine references are complicated.
diff --git a/flang/docs/Character.md b/flang/docs/Character.md
index f66b14438945..700db864f2da 100644
--- a/flang/docs/Character.md
+++ b/flang/docs/Character.md
@@ -1,3 +1,11 @@
+
+
 ## Implementation of `CHARACTER` types in f18
 
 ### Kinds and Character Sets
diff --git a/flang/docs/ControlFlowGraph.md b/flang/docs/ControlFlowGraph.md
index 7d1e514a87ad..b2b549845ebb 100644
--- a/flang/docs/ControlFlowGraph.md
+++ b/flang/docs/ControlFlowGraph.md
@@ -1,3 +1,11 @@
+
+
 ## Concept
 After a Fortran subprogram has been parsed, its names resolved, and all its
 semantic constraints successfully checked, the parse tree of its
diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md
index 554dc4608dd4..c2e93c5f3de2 100644
--- a/flang/docs/Directives.md
+++ b/flang/docs/Directives.md
@@ -1,3 +1,11 @@
+
+
 Compiler directives supported by F18
 ====================================
 
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 86a4f04de57f..9010b770cca6 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -1,3 +1,11 @@
+
+
 As a general principle, this compiler will accept by default and
 without complaint many legacy features, extensions to the standard
 language, and features that have been deleted from the standard,
diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md
index 542034f3ea83..103def2a92ce 100644
--- a/flang/docs/FortranForCProgrammers.md
+++ b/flang/docs/FortranForCProgrammers.md
@@ -1,3 +1,11 @@
+
+
 Fortran For C Programmers
 =========================
 
diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md
index 83193ff27a35..5d83aaa8e34c 100644
--- a/flang/docs/FortranIR.md
+++ b/flang/docs/FortranIR.md
@@ -1,3 +1,11 @@
+
+
 # Design: Fortran IR
 
 ## Introduction
diff --git a/flang/docs/IORuntimeInternals.md b/flang/docs/IORuntimeInternals.md
index 8ff464ee9c8f..b4f3092a014e 100644
--- a/flang/docs/IORuntimeInternals.md
+++ b/flang/docs/IORuntimeInternals.md
@@ -1,3 +1,11 @@
+
+
 Fortran I/O Runtime Library Internal Design
 ===========================================
 
diff --git a/flang/docs/ImplementingASemanticCheck.md b/flang/docs/ImplementingASemanticCheck.md
index 2406f5bc2a58..3bb16915cb88 100644
--- a/flang/docs/ImplementingASemanticCheck.md
+++ b/flang/docs/ImplementingASemanticCheck.md
@@ -1,3 +1,11 @@
+
+# Introduction
 I recently added a semantic check to the f18 compiler front end.  This document
 describes my thought process and the resulting implementation.
 
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index 6f4dec467823..7be0bf3e4a9c 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -1,3 +1,11 @@
+
+
 # A categorization of standard (2018) and extended Fortran intrinsic procedures
 
 This note attempts to group the intrinsic procedures of Fortran into categories
diff --git a/flang/docs/LabelResolution.md b/flang/docs/LabelResolution.md
index 2dfa5a30bb3c..e837b4fa6aec 100644
--- a/flang/docs/LabelResolution.md
+++ b/flang/docs/LabelResolution.md
@@ -1,3 +1,11 @@
+
+
 # Semantics: Resolving Labels and Construct Names
 
 ## Overview
diff --git a/flang/docs/ModFiles.md b/flang/docs/ModFiles.md
index 367cd4cd54f7..483341bdd0f4 100644
--- a/flang/docs/ModFiles.md
+++ b/flang/docs/ModFiles.md
@@ -1,3 +1,11 @@
+
+
 # Module Files
 
 Module files hold information from a module that is necessary to compile 
diff --git a/flang/docs/OpenMP-semantics.md b/flang/docs/OpenMP-semantics.md
index 22a3ca5614eb..4e2a81739cf8 100644
--- a/flang/docs/OpenMP-semantics.md
+++ b/flang/docs/OpenMP-semantics.md
@@ -1,3 +1,11 @@
+
+
 # OpenMP Semantic Analysis
 
 ## OpenMP for F18
diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md
index 5c04450a7bb3..db5932411cc1 100644
--- a/flang/docs/OptionComparison.md
+++ b/flang/docs/OptionComparison.md
@@ -1,3 +1,11 @@
+
+
 # Compiler options
 
 This document catalogs the options processed by F18's peers/competitors.  Much of the document is taken up by a set of tables that list the options categorized into different topics.  Some of the table headings link to more information about the contents of the tables.  For example, the table on **Standards conformance** options links to [notes on Standards conformance](#standards).
diff --git a/flang/docs/Overview.md b/flang/docs/Overview.md
index 807efda2ed9a..75a8cd1c4cab 100644
--- a/flang/docs/Overview.md
+++ b/flang/docs/Overview.md
@@ -1,3 +1,11 @@
+
+
 # Overview of Compiler Phases
 
 Each phase produces either correct output or fatal errors.
diff --git a/flang/docs/ParserCombinators.md b/flang/docs/ParserCombinators.md
index 757684dcfda6..4f3dc6fd07ae 100644
--- a/flang/docs/ParserCombinators.md
+++ b/flang/docs/ParserCombinators.md
@@ -1,3 +1,11 @@
+
+
 ## Concept
 The Fortran language recognizer here can be classified as an LL recursive
 descent parser.  It is composed from a *parser combinator* library that
diff --git a/flang/docs/Parsing.md b/flang/docs/Parsing.md
index 54a4fd752f6c..fad9a4d57278 100644
--- a/flang/docs/Parsing.md
+++ b/flang/docs/Parsing.md
@@ -1,3 +1,11 @@
+
+
 The F18 Parser
 ==============
 This program source code implements a parser for the Fortran programming
diff --git a/flang/docs/Preprocessing.md b/flang/docs/Preprocessing.md
index 9b4d905177b7..7f6f3951cfd1 100644
--- a/flang/docs/Preprocessing.md
+++ b/flang/docs/Preprocessing.md
@@ -1,3 +1,11 @@
+
+
 Fortran Preprocessing
 =====================
 
diff --git a/flang/docs/PullRequestChecklist.md b/flang/docs/PullRequestChecklist.md
index 17b6d64923f5..12a67be374a2 100644
--- a/flang/docs/PullRequestChecklist.md
+++ b/flang/docs/PullRequestChecklist.md
@@ -1,3 +1,11 @@
+
+
 # Pull request checklist
 Please review the following items before submitting a pull request.  This list
 can also be used when reviewing pull requests.
diff --git a/flang/docs/RuntimeDescriptor.md b/flang/docs/RuntimeDescriptor.md
index a8eff33f6521..d819517fa979 100644
--- a/flang/docs/RuntimeDescriptor.md
+++ b/flang/docs/RuntimeDescriptor.md
@@ -1,3 +1,11 @@
+
+
 ## Concept
 The properties that characterize data values and objects in Fortran
 programs must sometimes be materialized when the program runs.
diff --git a/flang/docs/Semantics.md b/flang/docs/Semantics.md
index f879671b4f4e..6ea0b292de69 100644
--- a/flang/docs/Semantics.md
+++ b/flang/docs/Semantics.md
@@ -1,3 +1,11 @@
+
+
 # Semantic Analysis
 
 The semantic analysis pass determines if a syntactically correct Fortran

From 9ad89a8cc3c7de0f0b5a306932671a4c78644d03 Mon Sep 17 00:00:00 2001
From: Richard Barton 
Date: Thu, 3 Sep 2020 11:44:03 +0100
Subject: [PATCH 226/363] [flang] Convert release notes to markdown

Switch ReleaseNotes from .rst to .md to match the other docs.

At the same time, fix the version number for master.
---
 flang/docs/ReleaseNotes.md  | 87 +++++++++++++++++++++++++++++++++
 flang/docs/ReleaseNotes.rst | 96 -------------------------------------
 2 files changed, 87 insertions(+), 96 deletions(-)
 create mode 100644 flang/docs/ReleaseNotes.md
 delete mode 100644 flang/docs/ReleaseNotes.rst

diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md
new file mode 100644
index 000000000000..b891ab904a04
--- /dev/null
+++ b/flang/docs/ReleaseNotes.md
@@ -0,0 +1,87 @@
+# Flang 11.0.0 (In-Progress) Release Notes
+
+> **warning**
+>
+> These are in-progress notes for the upcoming LLVM 11.0.0 release.
+> Release notes for previous releases can be found on [the Download
+> Page](https://releases.llvm.org/download.html).
+
+## Introduction
+
+This document contains the release notes for the Flang Fortran frontend,
+part of the LLVM Compiler Infrastructure, release 11.0.0. Here we
+describe the status of Flang in some detail, including major
+improvements from the previous release and new feature work. For the
+general LLVM release notes, see [the LLVM
+documentation](https://llvm.org/docs/ReleaseNotes.html). All LLVM
+releases may be downloaded from the [LLVM releases web
+site](https://llvm.org/releases/).
+
+Note that if you are reading this file from a Git checkout, this
+document applies to the *next* release, not the current one. To see the
+release notes for a specific release, please see the [releases
+page](https://llvm.org/releases/).
+
+## Known Issues
+
+These are issues that couldn't be fixed before the release. See the bug
+reports for the latest status.
+
+ *   ...
+
+## Introducing Flang
+
+Flang is LLVM's Fortran front end and is new for the LLVM 11 release.
+
+Flang is still a work in progress for this release and is included for
+experimentation and feedback.
+
+Flang is able to parse a comprehensive subset of the Fortran language
+and check it for correctness. Flang is not yet able to generate LLVM IR
+for the source code and thus is unable to compile a running binary.
+
+Flang is able to unparse the input source code into a canonical form and
+emit it to allow testing. Flang can also invoke an external Fortran
+compiler on this canonical input.
+
+Flang's parser has comprehensive support for:
+ * Fortran 2018
+ * OpenMP 4.5
+ * OpenACC 3.0
+
+Interested users are invited to try to compile their Fortran codes with
+flang in and report any issues in parsing or semantic checking in
+[bugzilla](https://bugs.llvm.org/enter_bug.cgi?product=flang).
+
+### Major missing features
+
+ * Flang is not supported on Windows platforms.
+
+## Using Flang
+
+Usage: `flang hello.f90 -o hello.bin`
+
+By default, Flang will parse the Fortran file `hello.f90` then unparse it to a
+canonical Fortran source file. Flang will then invoke an external
+Fortran compiler to compile this source file and link it, placing the
+resulting executable in `hello.bin`.
+
+To specify the external Fortran compiler, set the `F18_FC` environment
+variable to the name of the compiler binary and ensure that it is on your
+`PATH`. The default value for `F18_FC` is `gfortran`.
+
+When invoked with no source input, Flang will wait for input on stdin.
+When invoked in this way, Flang performs the same actions as if
+called with `-fdebug-measure-parse-tree -funparse` and does not invoke
+`F18_FC`.
+
+For a full list of options that Flang supports, run `flang --help`.
+
+## Additional Information
+
+Flang's documentation is located in the `flang/docs/` directory in the
+LLVM monorepo.
+
+If you have any questions or comments about Flang, please feel free to
+contact us via the [mailing
+list](https://lists.llvm.org/mailman/listinfo/flang-dev).
diff --git a/flang/docs/ReleaseNotes.rst b/flang/docs/ReleaseNotes.rst
deleted file mode 100644
index bbc7377412d6..000000000000
--- a/flang/docs/ReleaseNotes.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-========================================
-Flang 11.0.0 (In-Progress) Release Notes
-========================================
-
-.. contents::
-   :local:
-   :depth: 2
-
-.. warning::
-
-   These are in-progress notes for the upcoming LLVM 11.0.0 release.
-   Release notes for previous releases can be found on
-   `the Download Page <https://releases.llvm.org/download.html>`_.
-
-Introduction
-============
-
-This document contains the release notes for the Flang Fortran
-frontend, part of the LLVM Compiler Infrastructure, release 11.0.0. Here we
-describe the status of Flang in some detail, including major
-improvements from the previous release and new feature work. For the
-general LLVM release notes, see `the LLVM
-documentation <https://llvm.org/docs/ReleaseNotes.html>`_. All LLVM
-releases may be downloaded from the `LLVM releases web
-site <https://llvm.org/releases/>`_.
-
-Note that if you are reading this file from a Git checkout, this document
-applies to the *next* release, not
-the current one. To see the release notes for a specific release, please
-see the `releases page <https://llvm.org/releases/>`_.
-
-Known Issues
-============
-
-These are issues that couldn't be fixed before the release. See the bug reports for the latest status.
-
-- ...
-
-Introducing Flang
-=================
-
-Flang is LLVM's Fortran front end and is new for the LLVM 11 release.
-
-Flang is still a work in progress for this release and is included for
-experimentation and feedback.
-
-Flang status
-------------
-
-Flang is able to parse a comprehensive subset of the Fortran language
-and check it for correctness. Flang is not yet able to generate LLVM IR for
-the source code and thus is unable to compile a running binary. 
-
-Flang is able to unparse the input source code into a canonical form and emit
-it to allow testing. Flang can also invoke an external Fortran compiler on this
-canonical input.
-
-Flang's parser has comprehensive support for:
-- Fortran 2018
-- OpenMP 4.5
-- OpenACC 3.0
-
-Major missing features
-----------------------
-
-- Flang is not supported on Windows platforms.
-
-Using Flang
-===========
-
-Usage: ``flang hello.f90 -o hello.bin``
-
-Flang will parse the Fortran file ``hello.f90`` then unparse it to a canonical
-Fortran source file. Flang will then invoke an external Fortran compiler to
-compile this source file and link it, placing the resulting executable
-in ``hello.bin``.
-
-To specify the external Fortran compiler, set the ``F18_FC`` environment
-variable to the name of the compiler binary and ensure it is on your ``PATH``.
-The default value for ``F18_FC`` is ``gfortran``.
-
-When invoked with no source input, Flang will wait for input on standard in.
-When invoked in this way, Flang performs the same actions as if called with
-``-fdebug-measure-parse-tree -funparse`` and does not invoke ``F18_FC``.
-
-For a full list of options that Flang supports, run ``flang --help``.
-
-Additional Information
-======================
-
-Flang's documentation is located in the ``flang/docs/`` directory in
-the LLVM monorepo.
-
-If you have any questions or comments about Flang, please feel free to
-contact us via the `mailing
-list <https://lists.llvm.org/mailman/listinfo/flang-dev>`_.

From d024df40a192407eb54c5f8c17a7eb3b49e3f6da Mon Sep 17 00:00:00 2001
From: Kamil Rytarowski 
Date: Thu, 10 Sep 2020 11:44:12 +0200
Subject: [PATCH 227/363] [compiler-rt] [netbsd] Reintroduce
 __sanitizer_protoent

Partial revert of https://reviews.llvm.org/D82424

(cherry picked from commit f51e55e09eefbbc57fdd802f5f17e34749ba03ec)
---
 .../lib/sanitizer_common/sanitizer_platform_limits_netbsd.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h
index ae54a8cf105e..d80280d9bf8c 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.h
@@ -129,6 +129,12 @@ struct __sanitizer_shmid_ds {
   void *_shm_internal;
 };
 
+struct __sanitizer_protoent {
+  char *p_name;
+  char **p_aliases;
+  int p_proto;
+};
+
 struct __sanitizer_netent {
   char *n_name;
   char **n_aliases;

From 29d700a8132088ee6320702b601c0479a710a3ec Mon Sep 17 00:00:00 2001
From: Alok Kumar Sharma 
Date: Fri, 11 Sep 2020 11:11:39 +0530
Subject: [PATCH 228/363] [DebugInfo] Fixing CodeView assert related to
 lowerBound field of DISubrange.

    This is to fix CodeView build failure https://bugs.llvm.org/show_bug.cgi?id=47287
    after DISubrange upgrade D80197

    Assert condition is now removed and Count is calculated in case LowerBound
    is absent or zero and Count or UpperBound is constant. If Count is unknown
    it is later handled as VLA (currently Count is set to zero).

Reviewed By: rnk

Differential Revision: https://reviews.llvm.org/D87406

(cherry picked from commit e45b0708ae81ace27de53f12b32a80601cb12bf3)
---
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 3f053c7a38c7..39069e24e061 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1592,11 +1592,16 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
     assert(Element->getTag() == dwarf::DW_TAG_subrange_type);
 
     const DISubrange *Subrange = cast<DISubrange>(Element);
-    assert(!Subrange->getRawLowerBound() &&
-           "codeview doesn't support subranges with lower bounds");
     int64_t Count = -1;
-    if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt *>())
-      Count = CI->getSExtValue();
+    // Calculate the count if either LowerBound is absent or is zero and
+    // either of Count or UpperBound are constant.
+    auto *LI = Subrange->getLowerBound().dyn_cast<ConstantInt *>();
+    if (!Subrange->getRawLowerBound() || (LI && (LI->getSExtValue() == 0))) {
+      if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt *>())
+        Count = CI->getSExtValue();
+      else if (auto *UI = Subrange->getUpperBound().dyn_cast<ConstantInt *>())
+        Count = UI->getSExtValue() + 1; // LowerBound is zero
+    }
 
     // Forward declarations of arrays without a size and VLAs use a count of -1.
     // Emit a count of zero in these cases to match what MSVC does for arrays

From bff8d98129e8512ce9dcaed04e49c4f32f3a7e71 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes 
Date: Thu, 10 Sep 2020 15:41:36 +0000
Subject: [PATCH 229/363] [clang][aarch64] Fix mangling of bfloat16 neon
 vectors

The AAPCS64 specifies the internal type is used for c++ mangling. For
bfloat16 it was defined as `BFloat16` when it should be `Bfloat16`, i.e.
lowercase 'f'.

For more information, see:

https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#appendix-support-for-advanced-simd-extensions

Reviewed By: stuij

Differential Revision: https://reviews.llvm.org/D87463

(cherry picked from commit cabd60c26b5df34f096cccca5a915bde3b1d8ee1)
---
 clang/lib/AST/ItaniumMangle.cpp               |  2 +-
 clang/test/CodeGenCXX/mangle-neon-vectors.cpp | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index ddfbe9f86499..8b1419074df5 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3248,7 +3248,7 @@ static StringRef mangleAArch64VectorBase(const BuiltinType *EltType) {
   case BuiltinType::Double:
     return "Float64";
   case BuiltinType::BFloat16:
-    return "BFloat16";
+    return "Bfloat16";
   default:
     llvm_unreachable("Unexpected vector element base type");
   }
diff --git a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
index 6faf6226efd2..cb5e40be6a6d 100644
--- a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
+++ b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -triple armv7-apple-ios -target-feature +neon  %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-AARCH64
+// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -target-feature +bf16 %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-AARCH64-BF16
 
 typedef float float32_t;
 typedef double float64_t;
@@ -14,6 +15,10 @@ typedef short poly16_t;
 #endif
 typedef unsigned __INT64_TYPE__ uint64_t;
 
+#if defined(__ARM_FEATURE_BF16)
+typedef __bf16 bfloat16_t;
+#endif
+
 typedef __attribute__((neon_vector_type(2))) int int32x2_t;
 typedef __attribute__((neon_vector_type(4))) int int32x4_t;
 typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
@@ -28,6 +33,10 @@ typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
 typedef __attribute__((neon_polyvector_type(16))) poly8_t  poly8x16_t;
 typedef __attribute__((neon_polyvector_type(8)))  poly16_t poly16x8_t;
 
+#if defined(__ARM_FEATURE_BF16)
+typedef __attribute__((neon_vector_type(4))) __bf16 bfloat16x4_t;
+#endif
+
 // CHECK: 16__simd64_int32_t
 // CHECK-AARCH64: 11__Int32x2_t
 void f1(int32x2_t v) { }
@@ -72,3 +81,8 @@ void f10(poly16x8_t v) {}
 // CHECK-AARCH64: 13__Float64x2_t
 void f11(float64x2_t v) { }
 #endif
+
+#if defined(__ARM_FEATURE_BF16)
+// CHECK-AARCH64-BF16: 14__Bfloat16x4_t
+void f12(bfloat16x4_t v) {}
+#endif

From 01be54e257d9f09c6bbc7fe98f8f7449b30b37da Mon Sep 17 00:00:00 2001
From: Richard Barton 
Date: Fri, 11 Sep 2020 14:17:19 +0100
Subject: [PATCH 230/363] [flang] Add new documentation main page

Add a new index page to be the Flang documentation mainpage instead of
Overview.md, which jumps straight into the compiler Design. The index file
needs to be in .rst format to use the toctree directive to create table of
contents.

Also use the sphinx_markdown_tables extension to generate html tables form
markdown.

A number of additional style changes to the existing docs were needed to make
this work well:
 * Convert all headings to the # style, which works better with toctree's
   titlesonly option. Ensure that there is only one top-level heading per
   document.
 * Add a title to documents that don't have one for rendering on the index.
 * Convert the grammar docs from .txt to .md. for better rendering
 * Fixed broken link to a section in another document - sphinx does not seem to
   support anchor links in markdown files.

Depends on D87226

Reviewed By: sameeranjoshi

Differential Revision: https://reviews.llvm.org/D87242
---
 flang/docs/ArrayComposition.md                | 31 +++++----
 flang/docs/BijectiveInternalNameUniquing.md   | 21 +++---
 flang/docs/C++17.md                           | 13 ++--
 flang/docs/C++style.md                        |  9 +++
 flang/docs/Calls.md                           |  7 ++
 flang/docs/Character.md                       | 17 +++--
 flang/docs/ControlFlowGraph.md                |  7 ++
 flang/docs/Directives.md                      |  5 +-
 flang/docs/Extensions.md                      | 27 +++++---
 flang/docs/FortranForCProgrammers.md          | 68 ++++++++++---------
 flang/docs/FortranIR.md                       |  5 ++
 flang/docs/IORuntimeInternals.md              | 63 +++++++++--------
 flang/docs/ImplementingASemanticCheck.md      | 42 +++++++-----
 flang/docs/Intrinsics.md                      | 57 +++++++++-------
 flang/docs/LabelResolution.md                 |  5 ++
 flang/docs/ModFiles.md                        |  5 ++
 ...-4.5-grammar.txt => OpenMP-4.5-grammar.md} | 17 +++--
 flang/docs/OpenMP-semantics.md                |  5 ++
 flang/docs/OptionComparison.md                | 15 ++--
 flang/docs/Overview.md                        |  5 ++
 flang/docs/ParserCombinators.md               |  9 +++
 flang/docs/Parsing.md                         | 33 +++++----
 flang/docs/Preprocessing.md                   | 32 +++++----
 flang/docs/PullRequestChecklist.md            |  2 +-
 flang/docs/RuntimeDescriptor.md               |  7 ++
 flang/docs/Semantics.md                       |  5 ++
 flang/docs/conf.py                            | 13 +++-
 .../{f2018-grammar.txt => f2018-grammar.md}   | 12 ++--
 flang/docs/index.md                           | 61 +++++++++++++++++
 29 files changed, 399 insertions(+), 199 deletions(-)
 rename flang/docs/{OpenMP-4.5-grammar.txt => OpenMP-4.5-grammar.md} (97%)
 rename flang/docs/{f2018-grammar.txt => f2018-grammar.md} (99%)
 create mode 100644 flang/docs/index.md

diff --git a/flang/docs/ArrayComposition.md b/flang/docs/ArrayComposition.md
index 0f30af39f9e4..9e61abe5670f 100644
--- a/flang/docs/ArrayComposition.md
+++ b/flang/docs/ArrayComposition.md
@@ -6,6 +6,13 @@
   
 -->
 
+# Array Composition
+
+```eval_rst
+.. contents::
+   :local:
+```
+
 This note attempts to describe the motivation for and design of an
 implementation of Fortran 90 (and later) array expression evaluation that
 minimizes the use of dynamically allocated temporary storage for
@@ -34,8 +41,8 @@ Other Fortran intrinsic functions are technically transformational (e.g.,
 `COMMAND_ARGUMENT_COUNT`) but not of interest for this note.
 The generic `REDUCE` is also not considered here.
 
-Arrays as functions
-===================
+## Arrays as functions
+
 A whole array can be viewed as a function that maps its indices to the values
 of its elements.
 Specifically, it is a map from a tuple of integers to its element type.
@@ -45,8 +52,8 @@ and the shape of the array delimits the domain of the map.
 `REAL :: A(N,M)` can be seen as a function mapping ordered pairs of integers
 `(J,K)` with `1<=J<=N` and `1<=J<=M` to real values.
 
-Array expressions as functions
-==============================
+## Array expressions as functions
+
 The same perspective can be taken of an array expression comprising
 intrinsic operators and elemental functions.
 Fortran doesn't allow one to apply subscripts directly to an expression,
@@ -83,8 +90,8 @@ side variable as an operand of the right-hand side expression, and any
 function calls on the right-hand side are elemental or scalar-valued,
 we can avoid the use of a temporary.
 
-Transformational intrinsic functions as function composition
-============================================================
+## Transformational intrinsic functions as function composition
+
 Many of the transformational intrinsic functions listed above
 can, when their array arguments are viewed as functions over their
 index tuples, be seen as compositions of those functions with
@@ -127,8 +134,8 @@ More completely:
 * `SPREAD(A,DIM=d,NCOPIES=n)` for compile-time `d` simply
   applies `A` to a reduced index tuple.
 
-Determination of rank and shape
-===============================
+## Determination of rank and shape
+
 An important part of evaluating array expressions without the use of
 temporary storage is determining the shape of the result prior to,
 or without, evaluating the elements of the result.
@@ -173,8 +180,8 @@ In cases where the analyzed shape is known at compile time, we should
 be able to have the opportunity to avoid heap allocation in favor of
 stack storage, if the scope of the variable is local.
 
-Automatic reallocation of allocatables
-======================================
+## Automatic reallocation of allocatables
+
 Fortran 2003 introduced the ability to assign non-conforming array expressions
 to ALLOCATABLE arrays with the implied semantics of reallocation to the
 new shape.
@@ -182,8 +189,8 @@ The implementation of this feature also becomes more straightforward if
 our implementation of array expressions has decoupled calculation of shapes
 from the evaluation of the elements of the result.
 
-Rewriting rules
-===============
+## Rewriting rules
+
 Let `{...}` denote an ordered tuple of 1-based indices, e.g. `{j,k}`, into
 the result of an array expression or subexpression.
 
diff --git a/flang/docs/BijectiveInternalNameUniquing.md b/flang/docs/BijectiveInternalNameUniquing.md
index b302d389c664..7a6e8a4f4e64 100644
--- a/flang/docs/BijectiveInternalNameUniquing.md
+++ b/flang/docs/BijectiveInternalNameUniquing.md
@@ -1,4 +1,9 @@
-## Bijective Internal Name Uniquing
+# Bijective Internal Name Uniquing
+
+```eval_rst
+.. contents::
+   :local:
+```
 
 FIR has a flat namespace.  No two objects may have the same name at
 the module level.  (These would be functions, globals, etc.)
@@ -13,14 +18,14 @@ Fortran is case insensitive, which allows the compiler to convert the
 user's identifiers to all lower case.  Such a universal conversion implies
 that all upper case letters are available for use in uniquing.
 
-### Prefix `_Q`
+## Prefix `_Q`
 
 All uniqued names have the prefix sequence `_Q` to indicate the name has
 been uniqued.  (Q is chosen because it is a
 [low frequency letter](http://pi.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html)
 in English.)
 
-### Scope Building
+## Scope Building
 
 Symbols can be scoped by the module, submodule, or procedure that contains
 that symbol.  After the `_Q` sigil, names are constructed from outermost to
@@ -45,7 +50,7 @@ The uniqued name of `fun` becomes:
     _QMmodSs1modSs2modFsubPfun
 ```
 
-### Common blocks
+## Common blocks
 
    * A common block name will be prefixed with `B`
 
@@ -69,7 +74,7 @@ The uniqued name in case of `blank common block` becomes:
     _QB
 ```
 
-### Module scope global data
+## Module scope global data
 
    * A global data entity is prefixed with `E`
    * A global entity that is constant (parameter) will be prefixed with `EC`
@@ -92,7 +97,7 @@ The uniqued name of `pi` becomes:
     _QMmodECpi
 ```
 
-### Procedures/Subprograms
+## Procedures/Subprograms
 
    * A procedure/subprogram is prefixed with `P`
 
@@ -105,7 +110,7 @@ The uniqued name of `sub` becomes:
     _QPsub
 ```
 
-### Derived types and related
+## Derived types and related
 
    * A derived type is prefixed with `T`
    * If a derived type has KIND parameters, they are listed in a consistent
@@ -148,7 +153,7 @@ The uniqued name of `yourtype` where `k1=4` and `k2=-6` (at compile-time):
      type `yourtype` above would be `_QCTyourtypeK4KN6`.  The type
      descriptor for `REAL(4)` would be `_QCrealK4`.
 
-### Compiler generated names
+## Compiler generated names
 
 Compiler generated names do not have to be mapped back to Fortran.  These
 names will be prefixed with `_QQ` and followed by a unique compiler
diff --git a/flang/docs/C++17.md b/flang/docs/C++17.md
index 87d5fc01f092..9e0120d2e4c5 100644
--- a/flang/docs/C++17.md
+++ b/flang/docs/C++17.md
@@ -6,7 +6,12 @@
   
 -->
 
-## C++14/17 features used in f18
+# C++14/17 features used in f18
+
+```eval_rst
+.. contents::
+   :local:
+```
 
 The C++ dialect used in this project constitutes a subset of the
 standard C++ programming language and library features.
@@ -32,7 +37,7 @@ The most important of these are:
 (`std::tuple` is actually a C++11 feature, but I include it
 in this list because it's not particularly well known.)
 
-### Sum types
+## Sum types
 
 First, some background information to explain the need for sum types
 in f18.
@@ -111,7 +116,7 @@ would be to:
   functions (or the forbidden `dynamic_cast`) to identify alternatives
   during analysis
 
-### Product types
+## Product types
 
 Many productions in the Fortran grammar describe a sequence of various
 sub-parses.
@@ -133,7 +138,7 @@ So we use `std::tuple` for such things.
 It has also been handy for template metaprogramming that needs to work
 with lists of types.
 
-### `std::optional`
+## `std::optional`
 
 This simple little type is used wherever a value might or might not be
 present.
diff --git a/flang/docs/C++style.md b/flang/docs/C++style.md
index 4ab95393d758..fb11e6411614 100644
--- a/flang/docs/C++style.md
+++ b/flang/docs/C++style.md
@@ -6,6 +6,15 @@
   
 -->
 
+# Flang C++ Style Guide
+
+```eval_rst
+.. contents::
+   :local:
+```
+
+This document captures the style guide rules that are followed in the Flang codebase.
+
 ## In brief:
 * Use *clang-format*
 from llvm 7
diff --git a/flang/docs/Calls.md b/flang/docs/Calls.md
index d70bc910d73d..440d0bd147c2 100644
--- a/flang/docs/Calls.md
+++ b/flang/docs/Calls.md
@@ -6,6 +6,13 @@
 
 -->
 
+# Representation of Fortran function calls
+
+```eval_rst
+.. contents::
+   :local:
+```
+
 ## Procedure reference implementation protocol
 
 Fortran function and subroutine references are complicated.
diff --git a/flang/docs/Character.md b/flang/docs/Character.md
index 700db864f2da..603dd8848ba1 100644
--- a/flang/docs/Character.md
+++ b/flang/docs/Character.md
@@ -6,9 +6,14 @@
 
 -->
 
-## Implementation of `CHARACTER` types in f18
+# Implementation of `CHARACTER` types in f18
 
-### Kinds and Character Sets
+```eval_rst
+.. contents::
+   :local:
+```
+
+## Kinds and Character Sets
 
 The f18 compiler and runtime support three kinds of the intrinsic
 `CHARACTER` type of Fortran 2018.
@@ -48,7 +53,7 @@ We might want to support one or more environment variables to change these
 assumptions, especially for `KIND=1` users of ISO-8859 character sets
 besides Latin-1.
 
-### Lengths
+## Lengths
 
 Allocatable `CHARACTER` objects in Fortran may defer the specification
 of their lengths until the time of their allocation or whole (non-substring)
@@ -76,7 +81,7 @@ Fortran substrings are rather like subscript triplets into a hidden
 "zero" dimension of a scalar `CHARACTER` value, but they cannot have
 strides.
 
-### Concatenation
+## Concatenation
 
 Fortran has one `CHARACTER`-valued intrinsic operator, `//`, which
 concatenates its operands (10.1.5.3).
@@ -105,7 +110,7 @@ The result of `//` may be used
 The f18 compiler has a general (but slow) means of implementing concatenation
 and a specialized (fast) option to optimize the most common case.
 
-#### General concatenation
+### General concatenation
 
 In the most general case, the f18 compiler's generated code and
 runtime support library represent the result as a deferred-length allocatable
@@ -130,7 +135,7 @@ When the left-hand side of a `CHARACTER` assignment is a deferred-length
 allocatable and the right-hand side is a temporary, use of the runtime's
 `MoveAlloc()` subroutine instead can save an allocation and a copy.
 
-#### Optimized concatenation
+### Optimized concatenation
 
 Scalar `CHARACTER(KIND=1)` expressions evaluated as the right-hand sides of
 assignments to independent substrings or whole variables that are not
diff --git a/flang/docs/ControlFlowGraph.md b/flang/docs/ControlFlowGraph.md
index b2b549845ebb..dcdecf1b77f6 100644
--- a/flang/docs/ControlFlowGraph.md
+++ b/flang/docs/ControlFlowGraph.md
@@ -6,6 +6,13 @@
   
 -->
 
+# Control Flow Graph
+
+```eval_rst
+.. contents::
+   :local:
+```
+
 ## Concept
 After a Fortran subprogram has been parsed, its names resolved, and all its
 semantic constraints successfully checked, the parse tree of its
diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md
index c2e93c5f3de2..a1a99b674cef 100644
--- a/flang/docs/Directives.md
+++ b/flang/docs/Directives.md
@@ -6,8 +6,9 @@
   
 -->
 
-Compiler directives supported by F18
-====================================
+# Compiler directives supported by Flang
+
+A list of non-standard directives supported by Flang
 
 * `!dir$ fixed` and `!dir$ free` select Fortran source forms.  Their effect
   persists to the end of the current source file.
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 9010b770cca6..e16c55e97673 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -6,6 +6,13 @@
   
 -->
 
+# Fortran Extensions supported by Flang
+
+```eval_rst
+.. contents::
+   :local:
+```
+
 As a general principle, this compiler will accept by default and
 without complaint many legacy features, extensions to the standard
 language, and features that have been deleted from the standard,
@@ -16,8 +23,8 @@ Other non-standard features, which do conflict with the current
 standard specification of the Fortran programming language, are
 accepted if enabled by command-line options.
 
-Intentional violations of the standard
-======================================
+## Intentional violations of the standard
+
 * Scalar `INTEGER` actual argument expressions (not variables!)
   are converted to the kinds of scalar `INTEGER` dummy arguments
   when the interface is explicit and the kinds differ.
@@ -29,8 +36,8 @@ Intentional violations of the standard
   so long as they contain no executable code, no internal subprograms,
   and allocate no storage outside a named `COMMON` block.  (C1415)
 
-Extensions, deletions, and legacy features supported by default
-===============================================================
+## Extensions, deletions, and legacy features supported by default
+
 * Tabs in source
 * `<>` as synonym for `.NE.` and `/=`
 * `$` and `@` as legal characters in names
@@ -122,8 +129,8 @@ Extensions, deletions, and legacy features supported by default
 * DATA statement initialization is allowed for procedure pointers outside
   structure constructors.
 
-Extensions supported when enabled by options
---------------------------------------------
+### Extensions supported when enabled by options
+
 * C-style backslash escape sequences in quoted CHARACTER literals
   (but not Hollerith) [-fbackslash]
 * Logical abbreviations `.T.`, `.F.`, `.N.`, `.A.`, `.O.`, and `.X.`
@@ -140,8 +147,8 @@ Extensions supported when enabled by options
   `KIND=` actual argument.  We return `INTEGER(KIND=8)` by default in
   these cases when the `-flarge-sizes` option is enabled.
 
-Extensions and legacy features deliberately not supported
----------------------------------------------------------
+### Extensions and legacy features deliberately not supported
+
 * `.LG.` as synonym for `.NE.`
 * `REDIMENSION`
 * Allocatable `COMMON`
@@ -184,8 +191,8 @@ Extensions and legacy features deliberately not supported
   PGI, Intel, and XLF support this in ways that are not numerically equivalent.
   PGI converts the arguments while Intel and XLF replace the specific by the related generic.
 
-Preprocessing behavior
-======================
+## Preprocessing behavior
+
 * The preprocessor is always run, whatever the filename extension may be.
 * We respect Fortran comments in macro actual arguments (like GNU, Intel, NAG;
   unlike PGI and XLF) on the principle that macro calls should be treated
diff --git a/flang/docs/FortranForCProgrammers.md b/flang/docs/FortranForCProgrammers.md
index 103def2a92ce..572433ab7c15 100644
--- a/flang/docs/FortranForCProgrammers.md
+++ b/flang/docs/FortranForCProgrammers.md
@@ -6,8 +6,12 @@
 
 -->
 
-Fortran For C Programmers
-=========================
+# Fortran For C Programmers
+
+```eval_rst
+.. contents::
+   :local:
+```
 
 This note is limited to essential information about Fortran so that
 a C or C++ programmer can get started more quickly with the language,
@@ -16,8 +20,8 @@ to write or modify Fortran code.
 Please see other sources to learn about Fortran's rich history,
 current applications, and modern best practices in new code.
 
-Know This At Least
-------------------
+## Know This At Least
+
 * There have been many implementations of Fortran, often from competing
   vendors, and the standard language has been defined by U.S. and
   international standards organizations.  The various editions of
@@ -53,8 +57,8 @@ Know This At Least
   interfaces in compiled "modules", as well as legacy mechanisms for
   sharing data and interconnecting subprograms.
 
-A Rosetta Stone
----------------
+## A Rosetta Stone
+
 Fortran's language standard and other documentation uses some terminology
 in particular ways that might be unfamiliar.
 
@@ -81,8 +85,8 @@ in particular ways that might be unfamiliar.
 | Type-bound procedure | Kind of a C++ member function but not really |
 | Unformatted | Raw binary |
 
-Data Types
-----------
+## Data Types
+
 There are five built-in ("intrinsic") types: `INTEGER`, `REAL`, `COMPLEX`,
 `LOGICAL`, and `CHARACTER`.
 They are parameterized with "kind" values, which should be treated as
@@ -117,8 +121,8 @@ Last, there are "typeless" binary constants that can be used in a few
 situations, like static data initialization or immediate conversion,
 where type is not necessary.
 
-Arrays
-------
+## Arrays
+
 Arrays are not types in Fortran.
 Being an array is a property of an object or function, not of a type.
 Unlike C, one cannot have an array of arrays or an array of pointers,
@@ -133,8 +137,8 @@ And yes, the default lower bound on each dimension is 1, not 0.
 Expressions can manipulate arrays as multidimensional values, and
 the compiler will create the necessary loops.
 
-Allocatables
-------------
+## Allocatables
+
 Modern Fortran programs use `ALLOCATABLE` data extensively.
 Such variables and derived type components are allocated dynamically.
 They are automatically deallocated when they go out of scope, much
@@ -147,8 +151,8 @@ and follow up all the references that are made in the documentation
 from the description of `ALLOCATABLE` to other topics; it's a feature
 that interacts with much of the rest of the language.)
 
-I/O
----
+## I/O
+
 Fortran's input/output features are built into the syntax of the language,
 rather than being defined by library interfaces as in C and C++.
 There are means for raw binary I/O and for "formatted" transfers to
@@ -173,8 +177,8 @@ One can also use compiler-generated formatting in "list-directed" I/O,
 in which the compiler derives reasonable default formats based on
 data types.
 
-Subprograms
------------
+## Subprograms
+
 Fortran has both `FUNCTION` and `SUBROUTINE` subprograms.
 They share the same name space, but functions cannot be called as
 subroutines or vice versa.
@@ -188,8 +192,8 @@ their own internal procedures.
 As is the case with C++ lambda expressions, internal procedures can
 reference names from their host subprograms.
 
-Modules
--------
+## Modules
+
 Modern Fortran has good support for separate compilation and namespace
 management.
 The *module* is the basic unit of compilation, although independent
@@ -204,8 +208,8 @@ All references to objects in modules are done with direct names or
 aliases that have been added to the local scope, as Fortran has no means
 of qualifying references with module names.
 
-Arguments
----------
+## Arguments
+
 Functions and subroutines have "dummy" arguments that are dynamically
 associated with actual arguments during calls.
 Essentially, all argument passing in Fortran is by reference, not value.
@@ -236,8 +240,8 @@ scope.
 This is the opposite of the assumptions under which a C or C++ compiler must
 labor when trying to optimize code with pointers.
 
-Overloading
------------
+## Overloading
+
 Fortran supports a form of overloading via its interface feature.
 By default, an interface is a means for specifying prototypes for a
 set of subroutines and functions.
@@ -250,8 +254,8 @@ A similar feature can be used for generic type-bound procedures.
 This feature can be used to overload the built-in operators and some
 I/O statements, too.
 
-Polymorphism
-------------
+## Polymorphism
+
 Fortran code can be written to accept data of some derived type or
 any extension thereof using `CLASS`, deferring the actual type to
 execution, rather than the usual `TYPE` syntax.
@@ -261,8 +265,8 @@ Fortran's `SELECT TYPE` construct is used to distinguish between
 possible specific types dynamically, when necessary.  It's a
 little like C++17's `std::visit()` on a discriminated union.
 
-Pointers
---------
+## Pointers
+
 Pointers are objects in Fortran, not data types.
 Pointers can point to data, arrays, and subprograms.
 A pointer can only point to data that has the `TARGET` attribute.
@@ -287,8 +291,8 @@ out of scope.
 A legacy feature, "Cray pointers", implements dynamic base addressing of
 one variable using an address stored in another.
 
-Preprocessing
--------------
+## Preprocessing
+
 There is no standard preprocessing feature, but every real Fortran implementation
 has some support for passing Fortran source code through a variant of
 the standard C source preprocessor.
@@ -302,8 +306,8 @@ suffix (e.g., "foo.F90") or a compiler command line option.
 (Since the F18 compiler always runs its built-in preprocessing stage,
 no special option or filename suffix is required.)
 
-"Object Oriented" Programming
------------------------------
+## "Object Oriented" Programming
+
 Fortran doesn't have member functions (or subroutines) in the sense
 that C++ does, in which a function has immediate access to the members
 of a specific instance of a derived type.
@@ -325,8 +329,8 @@ There's a lot more that can be said about type-bound procedures (e.g., how they
 support overloading) but this should be enough to get you started with
 the most common usage.
 
-Pitfalls
---------
+## Pitfalls
+
 Variable initializers, e.g. `INTEGER :: J=123`, are _static_ initializers!
 They imply that the variable is stored in static storage, not on the stack,
 and the initialized value lasts only until the variable is assigned.
diff --git a/flang/docs/FortranIR.md b/flang/docs/FortranIR.md
index 5d83aaa8e34c..f1f643a1d17d 100644
--- a/flang/docs/FortranIR.md
+++ b/flang/docs/FortranIR.md
@@ -8,6 +8,11 @@
 
 # Design: Fortran IR
 
+```eval_rst
+.. contents::
+   :local:
+```
+
 ## Introduction
 
 After semantic analysis is complete and it has been determined that the compiler has a legal Fortran program as input, the parse tree will be lowered to an intermediate representation for the purposes of high-level analysis and optimization.  In this document, that intermediate representation will be called Fortran IR or FIR.  The pass that converts from the parse tree and other data structures of the front-end to FIR will be called the "Burnside bridge".
diff --git a/flang/docs/IORuntimeInternals.md b/flang/docs/IORuntimeInternals.md
index b4f3092a014e..2748fcf16fa3 100644
--- a/flang/docs/IORuntimeInternals.md
+++ b/flang/docs/IORuntimeInternals.md
@@ -6,8 +6,12 @@
 
 -->
 
-Fortran I/O Runtime Library Internal Design
-===========================================
+# Fortran I/O Runtime Library Internal Design
+
+```eval_rst
+.. contents::
+   :local:
+```
 
 This note is meant to be an overview of the design of the *implementation*
 of the f18 Fortran compiler's runtime support library for I/O statements.
@@ -66,8 +70,7 @@ template library of fast conversion algorithms used to interpret
 floating-point values in Fortran source programs and to emit them
 to module files.
 
-Overview of Classes
-===================
+## Overview of Classes
 
 A suite of C++ classes and class templates are composed to construct
 the Fortran I/O runtime support library.
@@ -79,16 +82,16 @@ classes are in the process of being vigorously rearranged and
 modified; use `grep` or an IDE to discover these classes in
 the source for now.  (Sorry!)
 
-`Terminator`
-----------
+### `Terminator`
+
 A general facility for the entire library, `Terminator` latches a
 source program statement location in terms of an unowned pointer to
 its source file path name and line number and uses them to construct
 a fatal error message if needed.
 It is used for both user program errors and internal runtime library crashes.
 
-`IoErrorHandler`
---------------
+### `IoErrorHandler`
+
 When I/O error conditions arise at runtime that the Fortran program
 might have the privilege to handle itself via `ERR=`, `END=`, or
 `EOR=` labels and/or by an `IOSTAT=` variable, this subclass of
@@ -96,8 +99,8 @@ might have the privilege to handle itself via `ERR=`, `END=`, or
 It sorts out priorities in the case of multiple errors and determines
 the final `IOSTAT=` value at the end of an I/O statement.
 
-`MutableModes`
-------------
+### `MutableModes`
+
 Fortran's formatted I/O statements are affected by a suite of
 modes that can be configured by `OPEN` statements, overridden by
 data transfer I/O statement control lists, and further overridden
@@ -108,8 +111,8 @@ order to properly isolate their modifications.
 The modes in force at the time each data item is processed constitute
 a member of each `DataEdit`.
 
-`DataEdit`
---------
+### `DataEdit`
+
 Represents a single data edit descriptor from a `FORMAT` statement
 or `FMT=` character value, with some hidden extensions to also
 support formatting of list-directed transfers.
@@ -119,8 +122,8 @@ For simplicity and efficiency, each data edit descriptor is
 encoded in the `DataEdit` as a simple capitalized character
 (or two) and some optional field widths.
 
-`FormatControl<>`
----------------
+### `FormatControl<>`
+
 This class template traverses a `FORMAT` statement's contents (or `FMT=`
 character value) to extract data edit descriptors like `E20.14` to
 serve each item in an I/O data transfer statement's *io-list*,
@@ -142,32 +145,32 @@ output strings or record positionings at the end of the *io-list*.
 The `DefaultFormatControlCallbacks` structure summarizes the API
 expected by `FormatControl` from its class template actual arguments.
 
-`OpenFile`
---------
+### `OpenFile`
+
 This class encapsulates all (I hope) the operating system interfaces
 used to interact with the host's filesystems for operations on
 external units.
 Asynchronous I/O interfaces are faked for now with synchronous
 operations and deferred results.
 
-`ConnectionState`
----------------
+### `ConnectionState`
+
 An active connection to an external or internal unit maintains
 the common parts of its state in this subclass of `ConnectionAttributes`.
 The base class holds state that should not change during the
 lifetime of the connection, while the subclass maintains state
 that may change during I/O statement execution.
 
-`InternalDescriptorUnit`
-----------------------
+### `InternalDescriptorUnit`
+
 When I/O is being performed from/to a Fortran `CHARACTER` array
 rather than an external file, this class manages the standard
 interoperable descriptor used to access its elements as records.
 It has the necessary interfaces to serve as an actual argument
 to the `FormatControl` class template.
 
-`FileFrame<>`
------------
+### `FileFrame<>`
+
 This CRTP class template isolates all of the complexity involved between
 an external unit's `OpenFile` and the buffering requirements
 imposed by the capabilities of Fortran `FORMAT` control edit
@@ -192,8 +195,8 @@ a frame may come up short.
 As a CRTP class template, `FileFrame` accesses the raw filesystem
 facilities it needs from `*this`.
 
-`ExternalFileUnit`
-----------------
+### `ExternalFileUnit`
+
 This class mixes in `ConnectionState`, `OpenFile`, and
 `FileFrame` to represent the state of an open
 (or soon to be opened) external file descriptor as a Fortran
@@ -210,8 +213,8 @@ Static member functions `LookUp()`, `LookUpOrCrash()`, and `LookUpOrCreate()`
 probe the map to convert Fortran `UNIT=` numbers from I/O statements
 into references to active units.
 
-`IoStatementBase`
----------------
+### `IoStatementBase`
+
 The subclasses of `IoStatementBase` each encapsulate and maintain
 the state of one active Fortran I/O statement across the several
 I/O runtime library API function calls it may comprise.
@@ -239,8 +242,8 @@ the I/O API supports a means whereby the code generated for the Fortran
 program may supply stack space to the I/O runtime support library
 for this purpose.
 
-`IoStatementState`
-----------------
+### `IoStatementState`
+
 F18's Fortran I/O runtime support library defines and implements an API
 that uses a sequence of function calls to implement each Fortran I/O
 statement.
@@ -269,8 +272,8 @@ unit, the library has to treat that (expected to be rare) situation
 as a weird variation of internal I/O since there's no `ExternalFileUnit`
 available to hold its `IoStatementBase` subclass or `IoStatementState`.
 
-A Narrative Overview Of `PRINT *, 'HELLO, WORLD'`
-=================================================
+## A Narrative Overview Of `PRINT *, 'HELLO, WORLD'`
+
 1. When the compiled Fortran program begins execution at the `main()`
 entry point exported from its main program, it calls `ProgramStart()`
 with its arguments and environment.
diff --git a/flang/docs/ImplementingASemanticCheck.md b/flang/docs/ImplementingASemanticCheck.md
index 3bb16915cb88..35b107e4988e 100644
--- a/flang/docs/ImplementingASemanticCheck.md
+++ b/flang/docs/ImplementingASemanticCheck.md
@@ -5,14 +5,20 @@
    SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   
 -->
-# Introduction
+# How to implement a Semantic Check in Flang
+
+```eval_rst
+.. contents::
+   :local:
+```
+
 I recently added a semantic check to the f18 compiler front end.  This document
 describes my thought process and the resulting implementation.
 
 For more information about the compiler, start with the 
 [compiler overview](Overview.md).
 
-# Problem definition
+## Problem definition
 
 In the 2018 Fortran standard, section 11.1.7.4.3, paragraph 2, states that:
 
@@ -29,7 +35,7 @@ emit a warning if an active DO variable was passed to a dummy argument with
 INTENT(INOUT).  Previously, I had implemented similar checks for SUBROUTINE
 calls.
 
-# Creating a test
+## Creating a test
 
 My first step was to create a test case to cause the problem.  I called it testfun.f90 and used it to check the behavior of other Fortran compilers.  Here's the initial version:
 
@@ -94,14 +100,14 @@ constant 216 in the statement:
 ```fortran
       dummyArg = 216
 ```
-# Analysis and implementation planning
+## Analysis and implementation planning
 
 I then considered what I needed to do.  I needed to detect situations where an
 active DO variable was passed to a dummy argument with `INTENT(OUT)` or
 `INTENT(INOUT)`.  Once I detected such a situation, I needed to produce a
 message that highlighted the erroneous source code.  
 
-## Deciding where to add the code to the compiler
+### Deciding where to add the code to the compiler
 This new semantic check would depend on several types of information -- the
 parse tree, source code location information, symbols, and expressions.  Thus I
 needed to put my new code in a place in the compiler after the parse tree had
@@ -151,7 +157,7 @@ Since my semantic check was focused on DO CONCURRENT statements, I added it to
 the file `lib/Semantics/check-do.cpp` where most of the semantic checking for
 DO statements already lived.
 
-## Taking advantage of prior work
+### Taking advantage of prior work
 When implementing a similar check for SUBROUTINE calls, I created a utility
 functions in `lib/Semantics/semantics.cpp` to emit messages if
 a symbol corresponding to an active DO variable was being potentially modified:
@@ -173,7 +179,7 @@ information --
 The first and third are needed since they're required to call the utility
 functions.  The second is needed to determine whether to call them.
 
-## Finding the source location
+### Finding the source location
 The source code location information that I'd need for the error message must
 come from the parse tree.  I looked in the file
 `include/flang/Parser/parse-tree.h` and determined that a `struct Expr`
@@ -181,7 +187,7 @@ contained source location information since it had the field `CharBlock
 source`.  Thus, if I visited a `parser::Expr` node, I could get the source
 location information for the associated expression.
 
-## Determining the `INTENT`
+### Determining the `INTENT`
 I knew that I could find the `INTENT` of the dummy argument associated with the
 actual argument from the function called `dummyIntent()` in the class
 `evaluate::ActualArgument` in the file `include/flang/Evaluate/call.h`.  So
@@ -248,7 +254,7 @@ This combination of the traversal framework and `dummyIntent()` would give
 me the `INTENT` of all of the dummy arguments in a FUNCTION call.  Thus, I
 would have the second piece of information I needed.
 
-## Determining if the actual argument is a variable
+### Determining if the actual argument is a variable
 I also guessed that I could determine if the `evaluate::ActualArgument`
 consisted of a variable.  
 
@@ -264,9 +270,9 @@ needed -- the source location of the erroneous text, the `INTENT` of the dummy
 argument, and a symbol that I could use to determine whether the actual
 argument was an active DO variable.
 
-# Implementation
+## Implementation
 
-## Adding a parse tree visitor
+### Adding a parse tree visitor
 I started my implementation by adding a visitor for `parser::Expr` nodes.
 Since this analysis is part of DO construct checking, I did this in
 `lib/Semantics/check-do.cpp`.  I added a print statement to the visitor to
@@ -308,7 +314,7 @@ source position of the associated expression (`CharBlock source`).  So I
 now had one of the three pieces of information needed to detect and report
 errors.
 
-## Collecting the actual arguments
+### Collecting the actual arguments
 To get the `INTENT` of the dummy arguments and the `semantics::Symbol` associated with the
 actual argument, I needed to find all of the actual arguments embedded in an
 expression that contained a FUNCTION call.  So my next step was to write the
@@ -474,7 +480,7 @@ node.
 
 So far, so good.
 
-## Finding the `INTENT` of the dummy argument
+### Finding the `INTENT` of the dummy argument
 I now wanted to find the `INTENT` of the dummy argument associated with the
 arguments in the set.  As mentioned earlier, the type
 `evaluate::ActualArgument` has a member function called `dummyIntent()`
@@ -518,7 +524,7 @@ I then modified my test case to convince myself that I was getting the correct
 
 So far, so good.
 
-## Finding the symbols for arguments that are variables
+### Finding the symbols for arguments that are variables
 The third and last piece of information I needed was to determine if a variable
 was being passed as an actual argument.  In such cases, I wanted to get the
 symbol table node (`semantics::Symbol`) for the variable.  My starting point was the
@@ -638,7 +644,7 @@ Here's the result of running the modified compiler on my Fortran test case:
 
 Sweet.
 
-## Emitting the messages
+### Emitting the messages
 At this point, using the source location information from the original
 `parser::Expr`, I had enough information to plug into the exiting
 interfaces for emitting messages for active DO variables.  I modified the
@@ -701,7 +707,7 @@ output:
 
 Even sweeter.
 
-# Improving the test case
+## Improving the test case
 At this point, my implementation seemed to be working.  But I was concerned
 about the limitations of my test case.  So I augmented it to include arguments
 other than `INTENT(OUT)` and more complex expressions.  Luckily, my
@@ -762,7 +768,7 @@ Here's the test I ended up with:
   end subroutine s
 ```
 
-# Submitting the pull request
+## Submitting the pull request
 At this point, my implementation seemed functionally complete, so I stripped out all of the debug statements, ran `clang-format` on it and reviewed it
 to make sure that the names were clear.  Here's what I ended up with:
 
@@ -790,7 +796,7 @@ to make sure that the names were clear.  Here's what I ended up with:
 
 I then created a pull request to get review comments.  
 
-# Responding to pull request comments
+## Responding to pull request comments
 I got feedback suggesting that I use an `if` statement rather than a
 `case` statement.  Another comment reminded me that I should look at the
 code I'd previously writted to do a similar check for SUBROUTINE calls to see
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index 7be0bf3e4a9c..f9e47e5893bf 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -8,6 +8,11 @@
 
 # A categorization of standard (2018) and extended Fortran intrinsic procedures
 
+```eval_rst
+.. contents::
+   :local:
+```
+
 This note attempts to group the intrinsic procedures of Fortran into categories
 of functions or subroutines with similar interfaces as an aid to
 comprehension beyond that which might be gained from the standard's
@@ -53,14 +58,14 @@ Intrinsic modules are not covered here.
    may appear within the brackets to preserve the order of arguments
    (e.g., `COUNT`).
 
-# Elemental intrinsic functions
+## Elemental intrinsic functions
 
 Pure elemental semantics apply to these functions, to wit: when one or more of
 the actual arguments are arrays, the arguments must be conformable, and
 the result is also an array.
 Scalar arguments are expanded when the arguments are not all scalars.
 
-## Elemental intrinsic functions that may have unrestricted specific procedures
+### Elemental intrinsic functions that may have unrestricted specific procedures
 
 When an elemental intrinsic function is documented here as having an
 _unrestricted specific name_, that name may be passed as an actual
@@ -349,7 +354,7 @@ that is present in `SET`, or zero if none is.
 `VERIFY` is essentially the opposite: it returns the index of the first (or last) character
 in `STRING` that is *not* present in `SET`, or zero if all are.
 
-# Transformational intrinsic functions
+## Transformational intrinsic functions
 
 This category comprises a large collection of intrinsic functions that
 are collected together because they somehow transform their arguments
@@ -372,7 +377,7 @@ Some general rules apply to the transformational intrinsic functions:
 1. The type `any` here denotes any intrinsic or derived type.
 1. The notation `(..)` denotes an array of any rank (but not an assumed-rank array).
 
-## Logical reduction transformational intrinsic functions
+### Logical reduction transformational intrinsic functions
 ```
 ALL(LOGICAL(k) MASK(..) [, DIM ]) -> LOGICAL(k)
 ANY(LOGICAL(k) MASK(..) [, DIM ]) -> LOGICAL(k)
@@ -380,7 +385,7 @@ COUNT(LOGICAL(any) MASK(..) [, DIM, KIND=KIND(0) ]) -> INTEGER(KIND)
 PARITY(LOGICAL(k) MASK(..) [, DIM ]) -> LOGICAL(k)
 ```
 
-## Numeric reduction transformational intrinsic functions
+### Numeric reduction transformational intrinsic functions
 ```
 IALL(INTEGER(k) ARRAY(..) [, DIM, MASK ]) -> INTEGER(k)
 IANY(INTEGER(k) ARRAY(..) [, DIM, MASK ]) -> INTEGER(k)
@@ -392,7 +397,7 @@ SUM(numeric ARRAY(..) [, DIM, MASK ]) -> numeric
 
 `NORM2` generalizes `HYPOT` by computing `SQRT(SUM(X*X))` while avoiding spurious overflows.
 
-## Extrema reduction transformational intrinsic functions
+### Extrema reduction transformational intrinsic functions
 ```
 MAXVAL(relational(k) ARRAY(..) [, DIM, MASK ]) -> relational(k)
 MINVAL(relational(k) ARRAY(..) [, DIM, MASK ]) -> relational(k)
@@ -419,7 +424,7 @@ MAXLOC(relational ARRAY(..) [, DIM, MASK, KIND=KIND(0), BACK=.FALSE. ])
 MINLOC(relational ARRAY(..) [, DIM, MASK, KIND=KIND(0), BACK=.FALSE. ])
 ```
 
-## Data rearrangement transformational intrinsic functions
+### Data rearrangement transformational intrinsic functions
 The optional `DIM` argument to these functions must be a scalar integer of
 any kind, and it takes a default value of 1 when absent.
 
@@ -475,7 +480,7 @@ UNPACK(any VECTOR(n), LOGICAL(any) MASK(..), FIELD) -> type and kind of VECTOR,
 ```
 `FIELD` has same type and kind as `VECTOR` and is conformable with `MASK`.
 
-## Other transformational intrinsic functions
+### Other transformational intrinsic functions
 ```
 BESSEL_JN(INTEGER(n1) N1, INTEGER(n2) N2, REAL(k) X) -> REAL(k) vector (MAX(N2-N1+1,0))
 BESSEL_YN(INTEGER(n1) N1, INTEGER(n2) N2, REAL(k) X) -> REAL(k) vector (MAX(N2-N1+1,0))
@@ -517,7 +522,7 @@ At least one argument must be present in a call to `SELECTED_REAL_KIND`.
 An assumed-rank array may be passed to `SHAPE`, and if it is associated with an assumed-size array,
 the last element of the result will be -1.
 
-## Coarray transformational intrinsic functions
+### Coarray transformational intrinsic functions
 ```
 FAILED_IMAGES([scalar TEAM_TYPE TEAM, KIND=KIND(0)]) -> INTEGER(KIND) vector
 GET_TEAM([scalar INTEGER(?) LEVEL]) -> scalar TEAM_TYPE
@@ -532,10 +537,10 @@ THIS_IMAGE([COARRAY, DIM, scalar TEAM_TYPE TEAM]) -> default INTEGER
 The result of `THIS_IMAGE` is a scalar if `DIM` is present or if `COARRAY` is absent,
 and a vector whose length is the corank of `COARRAY` otherwise.
 
-# Inquiry intrinsic functions
+## Inquiry intrinsic functions
 These are neither elemental nor transformational; all are pure.
 
-## Type inquiry intrinsic functions
+### Type inquiry intrinsic functions
 All of these functions return constants.
 The value of the argument is not used, and may well be undefined.
 ```
@@ -554,7 +559,7 @@ RANGE(INTEGER(k) or REAL(k) or COMPLEX(k) X(..)) -> scalar default INTEGER
 TINY(REAL(k) X(..)) -> scalar REAL(k)
 ```
 
-## Bound and size inquiry intrinsic functions
+### Bound and size inquiry intrinsic functions
 The results are scalar when `DIM` is present, and a vector of length=(co)rank(`(CO)ARRAY`)
 when `DIM` is absent.
 ```
@@ -567,7 +572,7 @@ UCOBOUND(any COARRAY [, DIM, KIND=KIND(0) ]) -> INTEGER(KIND)
 
 Assumed-rank arrays may be used with `LBOUND`, `SIZE`, and `UBOUND`.
 
-## Object characteristic inquiry intrinsic functions
+### Object characteristic inquiry intrinsic functions
 ```
 ALLOCATED(any type ALLOCATABLE ARRAY) -> scalar default LOGICAL
 ALLOCATED(any type ALLOCATABLE SCALAR) -> scalar default LOGICAL
@@ -584,11 +589,11 @@ The arguments to `EXTENDS_TYPE_OF` must be of extensible derived types or be unl
 
 An assumed-rank array may be used with `IS_CONTIGUOUS` and `RANK`.
 
-# Intrinsic subroutines
+## Intrinsic subroutines
 
 (*TODO*: complete these descriptions)
 
-## One elemental intrinsic subroutine
+### One elemental intrinsic subroutine
 ```
 INTERFACE
   SUBROUTINE MVBITS(FROM, FROMPOS, LEN, TO, TOPOS)
@@ -602,7 +607,7 @@ INTERFACE
 END INTERFACE
 ```
 
-## Non-elemental intrinsic subroutines
+### Non-elemental intrinsic subroutines
 ```
 CALL CPU_TIME(REAL INTENT(OUT) TIME)
 ```
@@ -627,7 +632,7 @@ CALL RANDOM_SEED([SIZE, PUT, GET])
 CALL SYSTEM_CLOCK([COUNT, COUNT_RATE, COUNT_MAX])
 ```
 
-## Atomic intrinsic subroutines
+### Atomic intrinsic subroutines
 ```
 CALL ATOMIC_ADD(ATOM, VALUE [, STAT=])
 CALL ATOMIC_AND(ATOM, VALUE [, STAT=])
@@ -642,7 +647,7 @@ CALL ATOMIC_REF(VALUE, ATOM [, STAT=])
 CALL ATOMIC_XOR(ATOM, VALUE [, STAT=])
 ```
 
-## Collective intrinsic subroutines
+### Collective intrinsic subroutines
 ```
 CALL CO_BROADCAST
 CALL CO_MAX
@@ -651,8 +656,8 @@ CALL CO_REDUCE
 CALL CO_SUM
 ```
 
-# Non-standard intrinsics
-## PGI
+## Non-standard intrinsics
+### PGI
 ```
 AND, OR, XOR
 LSHIFT, RSHIFT, SHIFT
@@ -666,7 +671,7 @@ JINT, JNINT, KNINT
 LOC
 ```
 
-## Intel
+### Intel
 ```
 DCMPLX(X,Y), QCMPLX(X,Y)
 DREAL(DOUBLE COMPLEX A) -> DOUBLE PRECISION
@@ -689,12 +694,12 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC
 MALLOC
 ```
 
-# Intrinsic Procedure Support in f18
+## Intrinsic Procedure Support in f18
 This section gives an overview of the support inside f18 libraries for the
 intrinsic procedures listed above.
 It may be outdated, refer to f18 code base for the actual support status.
 
-## Semantic Analysis
+### Semantic Analysis
 F18 semantic expression analysis phase detects intrinsic procedure references,
 validates the argument types and deduces the return types.
 This phase currently supports all the intrinsic procedures listed above but the ones in the table below.
@@ -710,7 +715,7 @@ This phase currently supports all the intrinsic procedures listed above but the
 | Collective intrinsic subroutines | CO_BROADCAST &al. |
 
 
-## Intrinsic Function Folding
+### Intrinsic Function Folding
 Fortran Constant Expressions can contain references to a certain number of
 intrinsic functions (see Fortran 2018 standard section 10.1.12 for more details).
 Constant Expressions may be used to define kind arguments. Therefore, the semantic
@@ -724,7 +729,7 @@ arrays when an implementation is provided for the scalars (regardless of whether
 it is using host hardware types or not).
 The status of intrinsic function folding support is given in the sub-sections below.
 
-### Intrinsic Functions with Host Independent Folding Support
+#### Intrinsic Functions with Host Independent Folding Support
 Implementations using f18 scalar types enables folding intrinsic functions
 on any host and with any possible type kind supported by f18. The intrinsic functions
 listed below are folded using host independent implementations.
@@ -736,7 +741,7 @@ listed below are folded using host independent implementations.
 | COMPLEX | CMPLX, CONJG |
 | LOGICAL | BGE, BGT, BLE, BLT |
 
-### Intrinsic Functions with Host Dependent Folding Support
+#### Intrinsic Functions with Host Dependent Folding Support
 Implementations using the host runtime may not be available for all supported
 f18 types depending on the host hardware types and the libraries available on the host.
 The actual support on a host depends on what the host hardware types are.
diff --git a/flang/docs/LabelResolution.md b/flang/docs/LabelResolution.md
index e837b4fa6aec..c1227a8bc35a 100644
--- a/flang/docs/LabelResolution.md
+++ b/flang/docs/LabelResolution.md
@@ -8,6 +8,11 @@
 
 # Semantics: Resolving Labels and Construct Names
 
+```eval_rst
+.. contents::
+   :local:
+```
+
 ## Overview
 
 After the Fortran input file(s) has been parsed into a syntax tree, the compiler must check that the program checks semantically.  Target labels must be checked and violations of legal semantics should be reported to the user.
diff --git a/flang/docs/ModFiles.md b/flang/docs/ModFiles.md
index 483341bdd0f4..ccb849ab0dec 100644
--- a/flang/docs/ModFiles.md
+++ b/flang/docs/ModFiles.md
@@ -8,6 +8,11 @@
 
 # Module Files
 
+```eval_rst
+.. contents::
+   :local:
+```
+
 Module files hold information from a module that is necessary to compile 
 program units that depend on the module.
 
diff --git a/flang/docs/OpenMP-4.5-grammar.txt b/flang/docs/OpenMP-4.5-grammar.md
similarity index 97%
rename from flang/docs/OpenMP-4.5-grammar.txt
rename to flang/docs/OpenMP-4.5-grammar.md
index c74072ba1ef2..9044e305f060 100644
--- a/flang/docs/OpenMP-4.5-grammar.txt
+++ b/flang/docs/OpenMP-4.5-grammar.md
@@ -1,18 +1,16 @@
-#===-- docs/OpenMP-4.5-grammar.txt --------------------------------===#
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-#===------------------------------------------------------------------------===#
+# OpenMP 4.5 Grammar
 
-# OpenMP 4.5 Specifications
+Grammar used by Flang to parse OpenMP 4.5.
 
+## OpenMP 4.5 Specifications
+```
 2 omp-directive -> sentinel directive-name [clause[ [,] clause]...]
 2.1.1 sentinel -> !$omp | c$omp | *$omp
 2.1.2 sentinel -> !$omp
+```
 
-# directive-name
+## directive-name
+```
 2.5 parallel -> PARALLEL [parallel-clause[ [,] parallel-clause]...]
     parallel-clause -> if-clause |
                        num-threads-clause |
@@ -462,3 +460,4 @@
                      ALLOC | RELEASE | DELETE
 
 2.15.5.2 defaultmap -> DEFAULTMAP (TOFROM:SCALAR)
+```
diff --git a/flang/docs/OpenMP-semantics.md b/flang/docs/OpenMP-semantics.md
index 4e2a81739cf8..1511bc9e7b3b 100644
--- a/flang/docs/OpenMP-semantics.md
+++ b/flang/docs/OpenMP-semantics.md
@@ -8,6 +8,11 @@
 
 # OpenMP Semantic Analysis
 
+```eval_rst
+.. contents::
+   :local:
+```
+
 ## OpenMP for F18
 
 1. Define and document the parse tree representation for
diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md
index db5932411cc1..347a1d6000ee 100644
--- a/flang/docs/OptionComparison.md
+++ b/flang/docs/OptionComparison.md
@@ -6,14 +6,21 @@
   
 -->
 
-# Compiler options
+# Compiler options comparison
+
+```eval_rst
+.. contents::
+   :local:
+```
 
 This document catalogs the options processed by F18's peers/competitors.  Much of the document is taken up by a set of tables that list the options categorized into different topics.  Some of the table headings link to more information about the contents of the tables.  For example, the table on **Standards conformance** options links to [notes on Standards conformance](#standards).
 
-**There's also important information in the ___[Notes section](#notes)___ near the end of the document on how this data was gathered and what ___is___ and ___is not___ included in this document.**  
+**There's also important information in the ___[Appendix section](#appendix)___ near the end of the document on how this data was gathered and what ___is___ and ___is not___ included in this document.**  
 
 Note that compilers may support language features without having an option for them.  Such cases are frequently, but not always noted in this document.
 
+## Categorisation of Options
+
 
Standards conformance @@ -1183,7 +1190,7 @@ Mcuda -## Notes +## Notes **Standards conformance:** @@ -1290,7 +1297,7 @@ GNU is the only compiler with options governing the use of non-standard intrinsi **Warn for bad call checking**: This Cray option ("-eb") issues a warning message rather than an error message when the compiler detects a call to a procedure with one or more dummy arguments having the TARGET, VOLATILE or ASYNCHRONOUS attribute and there is not an explicit interface definition. -## Notes +## Appendix ### What is and is not included diff --git a/flang/docs/Overview.md b/flang/docs/Overview.md index 75a8cd1c4cab..987858943845 100644 --- a/flang/docs/Overview.md +++ b/flang/docs/Overview.md @@ -8,6 +8,11 @@ # Overview of Compiler Phases +```eval_rst +.. contents:: + :local: +``` + Each phase produces either correct output or fatal errors. ## Prescan and Preprocess diff --git a/flang/docs/ParserCombinators.md b/flang/docs/ParserCombinators.md index 4f3dc6fd07ae..ff94d341c150 100644 --- a/flang/docs/ParserCombinators.md +++ b/flang/docs/ParserCombinators.md @@ -6,6 +6,15 @@ --> +# Parser Combinators + +```eval_rst +.. contents:: + :local: +``` + +This document is a primer on Parser Combinators and their use in Flang. + ## Concept The Fortran language recognizer here can be classified as an LL recursive descent parser. It is composed from a *parser combinator* library that diff --git a/flang/docs/Parsing.md b/flang/docs/Parsing.md index fad9a4d57278..dec63e6fbdab 100644 --- a/flang/docs/Parsing.md +++ b/flang/docs/Parsing.md @@ -6,8 +6,13 @@ --> -The F18 Parser -============== +# The F18 Parser + +```eval_rst +.. contents:: + :local: +``` + This program source code implements a parser for the Fortran programming language. @@ -42,8 +47,8 @@ source file and receive its parse tree and error messages. The interfaces of the Parsing class correspond to the two major passes of the parser, which are described below. 
-Prescanning and Preprocessing ------------------------------ +## Prescanning and Preprocessing + The first pass is performed by an instance of the Prescanner class, with help from an instance of Preprocessor. @@ -100,8 +105,8 @@ The content of the cooked character stream is available and useful for debugging, being as it is a simple value forwarded from the first major pass of the compiler to the second. -Source Provenance ------------------ +## Source Provenance + The prescanner constructs a chronicle of every file that is read by the parser, viz. the original source file and all others that it directly or indirectly includes. One copy of the content of each of these files @@ -124,8 +129,8 @@ Simple `const char *` pointers to characters in the cooked character stream, or to contiguous ranges thereof, are used as source position indicators within the parser and in the parse tree. -Messages --------- +## Messages + Message texts, and snprintf-like formatting strings for constructing messages, are instantiated in the various components of the parser with C++ user defined character literals tagged with `_err_en_US` and `_en_US` @@ -134,8 +139,8 @@ English used in the United States) so that they may be easily identified for localization. As described above, messages are associated with source code positions by means of provenance values. -The Parse Tree --------------- +## The Parse Tree + Each of the ca. 450 numbered requirement productions in the standard Fortran language grammar, as well as the productions implied by legacy extensions and preserved obsolescent features, maps to a distinct class @@ -174,8 +179,8 @@ stability of pointers into these lists. There is a general purpose library by means of which parse trees may be traversed. -Parsing -------- +## Parsing + This compiler attempts to recognize the entire cooked character stream (see above) as a Fortran program. It records the reductions made during a successful recognition as a parse tree value. 
The recognized grammar @@ -203,8 +208,8 @@ of "parser combinator" template functions that compose them to form more complicated recognizers and their correspondences to the construction of parse tree values. -Unparsing ---------- +## Unparsing + Parse trees can be converted back into free form Fortran source code. This formatter is not really a classical "pretty printer", but is more of a data structure dump whose output is suitable for compilation diff --git a/flang/docs/Preprocessing.md b/flang/docs/Preprocessing.md index 7f6f3951cfd1..3c6984cfa2fd 100644 --- a/flang/docs/Preprocessing.md +++ b/flang/docs/Preprocessing.md @@ -6,11 +6,15 @@ --> -Fortran Preprocessing -===================== +# Fortran Preprocessing + +```eval_rst +.. contents:: + :local: +``` + +## Behavior common to (nearly) all compilers: -Behavior common to (nearly) all compilers: ------------------------------------------- * Macro and argument names are sensitive to case. * Fixed form right margin clipping after column 72 (or 132) has precedence over macro name recognition, and also over @@ -39,9 +43,8 @@ Behavior common to (nearly) all compilers: * A `#define` directive intermixed with continuation lines can't define a macro that's invoked earlier in the same continued statement. -Behavior that is not consistent over all extant compilers but which -probably should be uncontroversial: ------------------------------------ +## Behavior that is not consistent over all extant compilers but which probably should be uncontroversial: + * Invoked macro names can straddle a Fortran line continuation. * ... unless implicit fixed form card padding intervenes; i.e., in fixed form, a continued macro name has to be split at column @@ -65,8 +68,8 @@ probably should be uncontroversial: directive indicator. * `#define KWM !` allows KWM to signal a comment. 
-Judgement calls, where precedents are unclear: ----------------------------------------------- +## Judgement calls, where precedents are unclear: + * Expressions in `#if` and `#elif` should support both Fortran and C operators; e.g., `#if 2 .LT. 3` should work. * If a function-like macro does not close its parentheses, line @@ -84,16 +87,16 @@ Judgement calls, where precedents are unclear: lines, it may or may not affect text in the continued statement that appeared before the directive. -Behavior that few compilers properly support (or none), but should: -------------------------------------------------------------------- +## Behavior that few compilers properly support (or none), but should: + * A macro invocation can straddle free form continuation lines in all of their forms, with continuation allowed in the name, before the arguments, and within the arguments. * Directives can be capitalized in free form, too. * `__VA_ARGS__` and `__VA_OPT__` work in variadic function-like macros. -In short, a Fortran preprocessor should work as if: ---------------------------------------------------- +## In short, a Fortran preprocessor should work as if: + 1. Fixed form lines are padded up to column 72 (or 132) and clipped thereafter. 2. Fortran comments are removed. 3. C-style line continuations are processed in preprocessing directives. @@ -125,8 +128,7 @@ text. OpenMP-style directives that look like comments are not addressed by this scheme but are obvious extensions. -Appendix -======== +## Appendix `N` in the table below means "not supported"; this doesn't mean a bug, it just means that a particular behavior was not observed. diff --git a/flang/docs/PullRequestChecklist.md b/flang/docs/PullRequestChecklist.md index 12a67be374a2..b253c153f61e 100644 --- a/flang/docs/PullRequestChecklist.md +++ b/flang/docs/PullRequestChecklist.md @@ -36,7 +36,7 @@ even though I've read the style guide, they regularly trip me up. clang-format will do this for most code. 
But you may need to break up long strings. * Review declarations for proper use of `constexpr` and `const`. -* Follow the C++ [naming guidelines](C++style.md#naming). +* Follow the C++ [naming guidelines](C++style.html#naming) * Ensure that the names evoke their purpose and are consistent with existing code. * Used braced initializers. * Review pointer and reference types to make sure that you're using them diff --git a/flang/docs/RuntimeDescriptor.md b/flang/docs/RuntimeDescriptor.md index d819517fa979..f0bbd2e3feda 100644 --- a/flang/docs/RuntimeDescriptor.md +++ b/flang/docs/RuntimeDescriptor.md @@ -6,6 +6,13 @@ --> +# Runtime Descriptors + +```eval_rst +.. contents:: + :local: +``` + ## Concept The properties that characterize data values and objects in Fortran programs must sometimes be materialized when the program runs. diff --git a/flang/docs/Semantics.md b/flang/docs/Semantics.md index 6ea0b292de69..361426c936c2 100644 --- a/flang/docs/Semantics.md +++ b/flang/docs/Semantics.md @@ -8,6 +8,11 @@ # Semantic Analysis +```eval_rst +.. contents:: + :local: +``` + The semantic analysis pass determines if a syntactically correct Fortran program is is legal by enforcing the constraints of the language. diff --git a/flang/docs/conf.py b/flang/docs/conf.py index 045d0a2c4167..21362fc3449e 100644 --- a/flang/docs/conf.py +++ b/flang/docs/conf.py @@ -46,12 +46,23 @@ else: source_parsers = {'.md': 'recommonmark.parser.CommonMarkParser'} source_suffix['.md'] = 'markdown' + extensions.append('sphinx_markdown_tables') + + # Setup AutoStructify for inline .rst toctrees in index.md + from recommonmark.transform import AutoStructify + def setup(app): + # Disable inline math to avoid + # https://github.com/readthedocs/recommonmark/issues/120 in Extensions.md + app.add_config_value('recommonmark_config', { + 'enable_inline_math': False + }, True) + app.add_transform(AutoStructify) # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. 
-master_doc = 'Overview' +master_doc = 'index' # General information about the project. project = u'Flang' diff --git a/flang/docs/f2018-grammar.txt b/flang/docs/f2018-grammar.md similarity index 99% rename from flang/docs/f2018-grammar.txt rename to flang/docs/f2018-grammar.md index 2de8cdfc1b8f..b47eced4857e 100644 --- a/flang/docs/f2018-grammar.txt +++ b/flang/docs/f2018-grammar.md @@ -1,11 +1,8 @@ -#===-- docs/f2018-grammar.txt -------------------------------------===# -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===------------------------------------------------------------------------===# +# Fortran 2018 Grammar +Grammar used by Flang to parse Fortran 2018. + +``` R0001 digit -> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 R0002 letter -> A | B | C | D | E | F | G | H | I | J | K | L | M | @@ -799,3 +796,4 @@ R1542 return-stmt -> RETURN [scalar-int-expr] R1543 contains-stmt -> CONTAINS R1544 stmt-function-stmt -> function-name ( [dummy-arg-name-list] ) = scalar-expr +``` diff --git a/flang/docs/index.md b/flang/docs/index.md new file mode 100644 index 000000000000..4c0717056522 --- /dev/null +++ b/flang/docs/index.md @@ -0,0 +1,61 @@ +# Welcome to Flang's documentation + +Flang is LLVM's Fortran frontend + +```eval_rst +.. toctree:: + :titlesonly: + + ReleaseNotes +``` + +# Contributing to Flang + +```eval_rst +.. toctree:: + :titlesonly: + + FortranForCProgrammers + C++style + C++17 + PullRequestChecklist + ImplementingASemanticCheck +``` + +# Design Documents + +```eval_rst +.. 
toctree:: + :titlesonly: + + Overview + Preprocessing + Parsing + LabelResolution + ModFiles + Semantics + OpenMP-semantics + ControlFlowGraph + FortranIR + IORuntimeInternals + f2018-grammar.md + OpenMP-4.5-grammar.md + Directives + Extensions + Intrinsics + OptionComparison + ParserCombinators + RuntimeDescriptor + Calls + Character + ArrayComposition + BijectiveInternalNameUniquing +``` + +# Indices and tables + +```eval_rst +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` +``` From b3fb40b3a3c1fb7ac094eda50762624baad37552 Mon Sep 17 00:00:00 2001 From: dfukalov Date: Fri, 4 Sep 2020 22:44:01 +0300 Subject: [PATCH 231/363] [AMDGPU] Fix for folding v2.16 literals. It was found some packed immediate operands (e.g. ``) are incorrectly processed so one of two packed values were lost. Introduced new function to check immediate 32-bit operand can be folded. Converted condition about current op_sel flags value to fall-through. Fixes: SWDEV-247595 Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D87158 (cherry picked from commit d03c4034dc80c944ec4a5833ba8f87d60183f866) --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 44 +++++++++---------- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 13 ++++++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 ++ .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 4 +- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index ffcf4c30bc70..92980d2406cf 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -192,8 +192,8 @@ static bool updateOperand(FoldCandidate &Fold, if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && - AMDGPU::isInlinableLiteralV216(static_cast(Fold.ImmToFold), - ST.hasInv2PiInlineImm())) { + AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, + ST.hasInv2PiInlineImm())) { // Set 
op_sel/op_sel_hi on this operand or bail out if op_sel is // already set. unsigned Opcode = MI->getOpcode(); @@ -209,30 +209,30 @@ static bool updateOperand(FoldCandidate &Fold, ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) - return false; - // Only apply the following transformation if that operand requries - // a packed immediate. - switch (TII.get(Opcode).OpInfo[OpNo].OperandType) { - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - // If upper part is all zero we do not need op_sel_hi. - if (!isUInt<16>(Fold.ImmToFold)) { - if (!(Fold.ImmToFold & 0xffff)) { - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) { + // Only apply the following transformation if that operand requries + // a packed immediate. + switch (TII.get(Opcode).OpInfo[OpNo].OperandType) { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + // If upper part is all zero we do not need op_sel_hi. 
+ if (!isUInt<16>(Fold.ImmToFold)) { + if (!(Fold.ImmToFold & 0xffff)) { + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); return true; } - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + break; + default: + break; } - break; - default: - break; } } } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 00e6d517bde5..3df2157fc402 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1282,6 +1282,19 @@ bool isInlinableIntLiteralV216(int32_t Literal) { return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); } +bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + int16_t Lo16 = static_cast(Literal); + if (isInt<16>(Literal) || isUInt<16>(Literal)) + return true; + + int16_t Hi16 = static_cast(Literal >> 16); + if (!(Literal & 0xffff)) + return true; + return Lo16 == Hi16; +} + bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index e71554575f6a..26bb77f4b4c7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -660,6 +660,9 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableIntLiteralV216(int32_t Literal); +LLVM_READNONE +bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); + bool isArgPassedInSGPR(const Argument *Arg); LLVM_READONLY diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll 
b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index ff4a8296d8dd..bf437cc5bb58 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1166,7 +1166,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out, ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v2, v3, 0x400007 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1250,7 +1250,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou ; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v2, v3, 0x7b0040 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() From 2d61b5ea8079fb28db6a7b25cfc844fa6c21f8c4 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 29 Aug 2020 22:31:06 +0200 Subject: [PATCH 232/363] Reduce code duplication in simplifySelectWithICmpCond (NFC) Canonicalize icmp ne to icmp eq and implement all the folds only once. 
--- llvm/lib/Analysis/InstructionSimplify.cpp | 34 ++++++----------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 9423ff9e3a66..0a76979f93e2 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3965,12 +3965,18 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, if (!match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) return nullptr; - if (ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero())) { + // Canonicalize ne to eq predicate. + if (Pred == ICmpInst::ICMP_NE) { + Pred = ICmpInst::ICMP_EQ; + std::swap(TrueVal, FalseVal); + } + + if (Pred == ICmpInst::ICMP_EQ && match(CmpRHS, m_Zero())) { Value *X; const APInt *Y; if (match(CmpLHS, m_And(m_Value(X), m_APInt(Y)))) if (Value *V = simplifySelectBitTest(TrueVal, FalseVal, X, Y, - Pred == ICmpInst::ICMP_EQ)) + /*TrueWhenUnset=*/true)) return V; // Test for a bogus zero-shift-guard-op around funnel-shift or rotate. @@ -3981,13 +3987,7 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, m_Value(ShAmt))); // (ShAmt == 0) ? fshl(X, *, ShAmt) : X --> X // (ShAmt == 0) ? fshr(*, X, ShAmt) : X --> X - if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt && - Pred == ICmpInst::ICMP_EQ) - return X; - // (ShAmt != 0) ? X : fshl(X, *, ShAmt) --> X - // (ShAmt != 0) ? X : fshr(*, X, ShAmt) --> X - if (match(FalseVal, isFsh) && TrueVal == X && CmpLHS == ShAmt && - Pred == ICmpInst::ICMP_NE) + if (match(TrueVal, isFsh) && FalseVal == X && CmpLHS == ShAmt) return X; // Test for a zero-shift-guard-op around rotates. These are used to @@ -4001,11 +4001,6 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, m_Intrinsic(m_Value(X), m_Deferred(X), m_Value(ShAmt))); - // (ShAmt != 0) ? fshl(X, X, ShAmt) : X --> fshl(X, X, ShAmt) - // (ShAmt != 0) ? 
fshr(X, X, ShAmt) : X --> fshr(X, X, ShAmt) - if (match(TrueVal, isRotate) && FalseVal == X && CmpLHS == ShAmt && - Pred == ICmpInst::ICMP_NE) - return TrueVal; // (ShAmt == 0) ? X : fshl(X, X, ShAmt) --> fshl(X, X, ShAmt) // (ShAmt == 0) ? X : fshr(X, X, ShAmt) --> fshr(X, X, ShAmt) if (match(FalseVal, isRotate) && TrueVal == X && CmpLHS == ShAmt && @@ -4032,17 +4027,6 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == FalseVal) return FalseVal; - } else if (Pred == ICmpInst::ICMP_NE) { - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == - FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == - FalseVal) - return TrueVal; - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == - TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == - TrueVal) - return TrueVal; } return nullptr; From d720e5855dcf57b5b88ee6a4147ccd762115278a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 10 Sep 2020 18:53:08 +0200 Subject: [PATCH 233/363] Add test for PR47322 (NFC) --- llvm/test/Transforms/InstCombine/select.ll | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 185ff838b819..abdb36ab7bd4 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2487,3 +2487,19 @@ define <2 x i32> @true_undef_vec(i1 %cond, <2 x i32> %x) { %s = select i1 %cond, <2 x i32> undef, <2 x i32> %x ret <2 x i32> %s } + +; FIXME: This is a miscompile! 
+define i32 @pr47322_more_poisonous_replacement(i32 %arg) { +; CHECK-LABEL: @pr47322_more_poisonous_replacement( +; CHECK-NEXT: [[TRAILING:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG:%.*]], i1 immarg true), [[RNG0:!range !.*]] +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[ARG]], [[TRAILING]] +; CHECK-NEXT: ret i32 [[SHIFTED]] +; + %cmp = icmp eq i32 %arg, 0 + %trailing = call i32 @llvm.cttz.i32(i32 %arg, i1 immarg true) + %shifted = lshr i32 %arg, %trailing + %r1.sroa.0.1 = select i1 %cmp, i32 0, i32 %shifted + ret i32 %r1.sroa.0.1 +} + +declare i32 @llvm.cttz.i32(i32, i1 immarg) From be318969e245db0cd5471bff2a7cbfa3fad2b075 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 10 Sep 2020 12:19:16 +0200 Subject: [PATCH 234/363] Fix incorrect SimplifyWithOpReplaced transform (PR47322) This is a followup to D86834, which partially fixed this issue in InstSimplify. However, InstCombine repeats the same transform while dropping poison flags -- which does not cover cases where poison is introduced in some other way. The fix here is a bit more comprehensive, because things are quite entangled, and it's hard to only partially address it without regressing optimization. There are really two changes here: * Export the SimplifyWithOpReplaced API from InstSimplify, with an added AllowRefinement flag. For replacements inside the TrueVal we don't actually care whether refinement occurs or not, the replacement is always legal. This part of the transform is now done in InstSimplify only. (It should be noted that the current AllowRefinement check is not sufficient -- that's an issue we need to address separately.) * Change the InstCombine fold to work by temporarily dropping poison generating flags, running the fold and then restoring the flags if it didn't work out. This will ensure that the InstCombine fold is correct as long as the InstSimplify fold is correct. 
Differential Revision: https://reviews.llvm.org/D87445 --- .../llvm/Analysis/InstructionSimplify.h | 6 ++ llvm/lib/Analysis/InstructionSimplify.cpp | 50 ++++++++++------- .../InstCombine/InstCombineSelect.cpp | 55 +++++++++++-------- llvm/test/Transforms/InstCombine/select.ll | 6 +- 4 files changed, 72 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/Analysis/InstructionSimplify.h b/llvm/include/llvm/Analysis/InstructionSimplify.h index 2a39a4e09087..b5ae54fb98bc 100644 --- a/llvm/include/llvm/Analysis/InstructionSimplify.h +++ b/llvm/include/llvm/Analysis/InstructionSimplify.h @@ -268,6 +268,12 @@ Value *SimplifyFreezeInst(Value *Op, const SimplifyQuery &Q); Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE = nullptr); +/// See if V simplifies when its operand Op is replaced with RepOp. +/// AllowRefinement specifies whether the simplification can be a refinement, +/// or whether it needs to be strictly identical. +Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, bool AllowRefinement); + /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. /// /// This first performs a normal RAUW of I with SimpleV. It then recursively diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 0a76979f93e2..e744a966a104 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3810,10 +3810,10 @@ Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit); } -/// See if V simplifies when its operand Op is replaced with RepOp. 
-static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, - const SimplifyQuery &Q, - unsigned MaxRecurse) { +static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, + bool AllowRefinement, + unsigned MaxRecurse) { // Trivial replacement. if (V == Op) return RepOp; @@ -3826,20 +3826,19 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (!I) return nullptr; + // Consider: + // %cmp = icmp eq i32 %x, 2147483647 + // %add = add nsw i32 %x, 1 + // %sel = select i1 %cmp, i32 -2147483648, i32 %add + // + // We can't replace %sel with %add unless we strip away the flags (which will + // be done in InstCombine). + // TODO: This is unsound, because it only catches some forms of refinement. + if (!AllowRefinement && canCreatePoison(I)) + return nullptr; + // If this is a binary operator, try to simplify it with the replaced op. if (auto *B = dyn_cast(I)) { - // Consider: - // %cmp = icmp eq i32 %x, 2147483647 - // %add = add nsw i32 %x, 1 - // %sel = select i1 %cmp, i32 -2147483648, i32 %add - // - // We can't replace %sel with %add unless we strip away the flags. - // TODO: This is an unusual limitation because better analysis results in - // worse simplification. InstCombine can do this fold more generally - // by dropping the flags. Remove this fold to save compile-time? 
- if (canCreatePoison(I)) - return nullptr; - if (MaxRecurse) { if (B->getOperand(0) == Op) return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), Q, @@ -3906,6 +3905,13 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, return nullptr; } +Value *llvm::SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, + const SimplifyQuery &Q, + bool AllowRefinement) { + return ::SimplifyWithOpReplaced(V, Op, RepOp, Q, AllowRefinement, + RecursionLimit); +} + /// Try to simplify a select instruction when its condition operand is an /// integer comparison where one operand of the compare is a constant. static Value *simplifySelectBitTest(Value *TrueVal, Value *FalseVal, Value *X, @@ -4017,14 +4023,18 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal, // arms of the select. See if substituting this value into the arm and // simplifying the result yields the same value as the other arm. if (Pred == ICmpInst::ICMP_EQ) { - if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, MaxRecurse) == + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ false, MaxRecurse) == TrueVal || - SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, MaxRecurse) == + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ false, MaxRecurse) == TrueVal) return FalseVal; - if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, MaxRecurse) == + if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ true, MaxRecurse) == FalseVal || - SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, MaxRecurse) == + SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ true, MaxRecurse) == FalseVal) return FalseVal; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index db27711f29b1..fa695c39cd1e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1148,22 +1148,6 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, return &Sel; } -static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, - const SimplifyQuery &Q) { - // If this is a binary operator, try to simplify it with the replaced op - // because we know Op and ReplaceOp are equivalant. - // For example: V = X + 1, Op = X, ReplaceOp = 42 - // Simplifies as: add(42, 1) --> 43 - if (auto *BO = dyn_cast(V)) { - if (BO->getOperand(0) == Op) - return SimplifyBinOp(BO->getOpcode(), ReplaceOp, BO->getOperand(1), Q); - if (BO->getOperand(1) == Op) - return SimplifyBinOp(BO->getOpcode(), BO->getOperand(0), ReplaceOp, Q); - } - - return nullptr; -} - /// If we have a select with an equality comparison, then we know the value in /// one of the arms of the select. See if substituting this value into an arm /// and simplifying the result yields the same value as the other arm. @@ -1190,20 +1174,45 @@ static Value *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, if (Cmp.getPredicate() == ICmpInst::ICMP_NE) std::swap(TrueVal, FalseVal); + auto *FalseInst = dyn_cast(FalseVal); + if (!FalseInst) + return nullptr; + + // InstSimplify already performed this fold if it was possible subject to + // current poison-generating flags. Try the transform again with + // poison-generating flags temporarily dropped. + bool WasNUW = false, WasNSW = false, WasExact = false; + if (auto *OBO = dyn_cast(FalseVal)) { + WasNUW = OBO->hasNoUnsignedWrap(); + WasNSW = OBO->hasNoSignedWrap(); + FalseInst->setHasNoUnsignedWrap(false); + FalseInst->setHasNoSignedWrap(false); + } + if (auto *PEO = dyn_cast(FalseVal)) { + WasExact = PEO->isExact(); + FalseInst->setIsExact(false); + } + // Try each equivalence substitution possibility. // We have an 'EQ' comparison, so the select's false value will propagate. // Example: // (X == 42) ? 43 : (X + 1) --> (X == 42) ? 
(X + 1) : (X + 1) --> X + 1 - // (X == 42) ? (X + 1) : 43 --> (X == 42) ? (42 + 1) : 43 --> 43 Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); - if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q) == TrueVal || - simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q) == TrueVal || - simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q) == FalseVal || - simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q) == FalseVal) { - if (auto *FalseInst = dyn_cast(FalseVal)) - FalseInst->dropPoisonGeneratingFlags(); + if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q, + /* AllowRefinement */ false) == TrueVal || + SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q, + /* AllowRefinement */ false) == TrueVal) { return FalseVal; } + + // Restore poison-generating flags if the transform did not apply. + if (WasNUW) + FalseInst->setHasNoUnsignedWrap(); + if (WasNSW) + FalseInst->setHasNoSignedWrap(); + if (WasExact) + FalseInst->setIsExact(); + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index abdb36ab7bd4..c23587b606ce 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2491,9 +2491,11 @@ define <2 x i32> @true_undef_vec(i1 %cond, <2 x i32> %x) { ; FIXME: This is a miscompile! 
define i32 @pr47322_more_poisonous_replacement(i32 %arg) { ; CHECK-LABEL: @pr47322_more_poisonous_replacement( -; CHECK-NEXT: [[TRAILING:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG:%.*]], i1 immarg true), [[RNG0:!range !.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[ARG:%.*]], 0 +; CHECK-NEXT: [[TRAILING:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 immarg true), [[RNG0:!range !.*]] ; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[ARG]], [[TRAILING]] -; CHECK-NEXT: ret i32 [[SHIFTED]] +; CHECK-NEXT: [[R1_SROA_0_1:%.*]] = select i1 [[CMP]], i32 0, i32 [[SHIFTED]] +; CHECK-NEXT: ret i32 [[R1_SROA_0_1]] ; %cmp = icmp eq i32 %arg, 0 %trailing = call i32 @llvm.cttz.i32(i32 %arg, i1 immarg true) From 88e17a8e9b49bfbcfc1fa70f867b5b56a7a64fc7 Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Tue, 15 Sep 2020 17:59:10 +0800 Subject: [PATCH 235/363] [SelectionDAG] Remove unused FP constant in getNegatedExpression 960cbc53 immediately removes nodes that won't be used to avoid compilation time explosion. This patch adds the removal to constants to fix PR47517. Reviewed By: RKSimon, steven.zhang Differential Revision: https://reviews.llvm.org/D87614 (cherry picked from commit 2508ef014e8b01006de4e5ee6fd451d1f68d550f) --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 4 ++- llvm/test/CodeGen/X86/pr47517.ll | 28 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/pr47517.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 819e608c6896..4ebb99c97841 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5751,8 +5751,10 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // If we already have the use of the negated floating constant, it is free // to negate it even it has multiple uses. 
- if (!Op.hasOneUse() && CFP.use_empty()) + if (!Op.hasOneUse() && CFP.use_empty()) { + RemoveDeadNode(CFP); break; + } Cost = NegatibleCost::Neutral; return CFP; } diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll new file mode 100644 index 000000000000..6b508acf15dd --- /dev/null +++ b/llvm/test/CodeGen/X86/pr47517.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple x86_64 < %s | FileCheck %s + +; To ensure unused floating point constant is removed in negation +define float @test(float %src, float* %p) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %a0 = getelementptr inbounds float, float* %p, i32 0 + %a1 = getelementptr inbounds float, float* %p, i32 1 + store float 0.000000e+00, float* %a0 + store float 0.000000e+00, float* %a1 + %zero = load float, float* %a0 + %fmul1 = fmul fast float %zero, %src + %fadd1 = fadd fast float %fmul1, %zero + %fmul2 = fmul fast float %fadd1, 2.000000e+00 + %fmul3 = fmul fast float %fmul2, %fmul2 + %fmul4 = fmul fast float %fmul2, 2.000000e+00 + %fadd2 = fadd fast float %fmul4, -3.000000e+00 + %fmul5 = fmul fast float %fadd2, %fmul2 + %fadd3 = fadd fast float %fmul2, %src + %fadd4 = fadd fast float %fadd3, %fmul5 + %fmul6 = fmul fast float %fmul3, %fadd4 + ret float %fmul6 +} From d754173a98309b25562b5624dc108a3b46e990fe Mon Sep 17 00:00:00 2001 From: Kirill Bobyrev Date: Wed, 26 Aug 2020 17:08:00 +0200 Subject: [PATCH 236/363] [clangd] Use string[] for allCommitCharacters As per LSP specification, allCommitCharacters should be string[] instead of string: https://microsoft.github.io/language-server-protocol/specification#textDocument_completion Reviewed By: sammccall Differential Revision: https://reviews.llvm.org/D86604 (cherry picked from commit 9d11e6789c477ce6104e29745ca70e13c9fafeb0) --- 
clang-tools-extra/clangd/ClangdLSPServer.cpp | 5 +++- .../clangd/test/initialize-params.test | 30 ++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 0408b0498488..15ef89cb34fa 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -592,7 +592,10 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, {"codeActionProvider", std::move(CodeActionProvider)}, {"completionProvider", llvm::json::Object{ - {"allCommitCharacters", " \t()[]{}<>:;,+-/*%^&#?.=\"'|"}, + {"allCommitCharacters", + {" ", "\t", "(", ")", "[", "]", "{", "}", "<", + ">", ":", ";", ",", "+", "-", "/", "*", "%", + "^", "&", "#", "?", ".", "=", "\"", "'", "|"}}, {"resolveProvider", false}, // We do extra checks, e.g. that > is part of ->. {"triggerCharacters", {".", "<", ">", ":", "\"", "/"}}, diff --git a/clang-tools-extra/clangd/test/initialize-params.test b/clang-tools-extra/clangd/test/initialize-params.test index f0a0f791c2f6..4125c27e4e35 100644 --- a/clang-tools-extra/clangd/test/initialize-params.test +++ b/clang-tools-extra/clangd/test/initialize-params.test @@ -7,7 +7,35 @@ # CHECK-NEXT: "capabilities": { # CHECK-NEXT: "codeActionProvider": true, # CHECK-NEXT: "completionProvider": { -# CHECK-NEXT: "allCommitCharacters": " \t()[]{}<>:;,+-/*%^&#?.=\"'|", +# CHECK-NEXT: "allCommitCharacters": [ +# CHECK-NEXT: " ", +# CHECK-NEXT: "\t", +# CHECK-NEXT: "(", +# CHECK-NEXT: ")", +# CHECK-NEXT: "[", +# CHECK-NEXT: "]", +# CHECK-NEXT: "{", +# CHECK-NEXT: "}", +# CHECK-NEXT: "<", +# CHECK-NEXT: ">", +# CHECK-NEXT: ":", +# CHECK-NEXT: ";", +# CHECK-NEXT: ",", +# CHECK-NEXT: "+", +# CHECK-NEXT: "-", +# CHECK-NEXT: "/", +# CHECK-NEXT: "*", +# CHECK-NEXT: "%", +# CHECK-NEXT: "^", +# CHECK-NEXT: "&", +# CHECK-NEXT: "#", +# CHECK-NEXT: "?", +# CHECK-NEXT: ".", +# CHECK-NEXT: "=", +# CHECK-NEXT: "\"", +# 
CHECK-NEXT: "'", +# CHECK-NEXT: "|" +# CHECK-NEXT: ], # CHECK-NEXT: "resolveProvider": false, # CHECK-NEXT: "triggerCharacters": [ # CHECK-NEXT: ".", From 2ec773995076236110d4ffb1db7e6723c22519fc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 14 Sep 2020 12:52:54 -0700 Subject: [PATCH 237/363] [FastISel] Bail out of selectGetElementPtr for vector GEPs. The code that decomposes the GEP into ADD/MUL doesn't work properly for vector GEPs. It can create bad COPY instructions or possibly assert. For now just bail out to SelectionDAG. Fixes PR45906 (cherry picked from commit 4208ea3e19f8e3e8cd35e6f5a6c43f4aa066c6ec) --- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 6 +++ .../test/CodeGen/X86/masked_gather_scatter.ll | 48 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index fc6c3a145f13..f5948d2a20dc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -690,6 +690,12 @@ bool FastISel::selectGetElementPtr(const User *I) { Register N = getRegForValue(I->getOperand(0)); if (!N) // Unhandled operand. Halt "fast" selection and bail. return false; + + // FIXME: The code below does not handle vector GEPs. Halt "fast" selection + // and bail. 
+ if (isa(I->getType())) + return false; + bool NIsKill = hasTrivialKill(I->getOperand(0)); // Keep a running tab of the total offset to coalesce multiple N = N + Offset diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index df3af4c24659..b654b2a579fc 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -3319,3 +3319,51 @@ define void @scatter_16i64_constant_indices(i32* %ptr, <16 x i1> %mask, <16 x i3 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %gep, i32 4, <16 x i1> %mask) ret void } + +%struct.foo = type { i8*, i64, i16, i16, i32 } + +; This used to cause fast-isel to generate bad copy instructions that would +; cause an error in copyPhysReg. +define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) { +; KNL_64-LABEL: pr45906: +; KNL_64: # %bb.0: # %bb +; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; KNL_64-NEXT: kxnorw %k0, %k0, %k1 +; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: pr45906: +; KNL_32: # %bb.0: # %bb +; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; KNL_32-NEXT: kxnorw %k0, %k0, %k1 +; KNL_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX_SMALL-LABEL: pr45906: +; SKX_SMALL: # %bb.0: # %bb +; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 +; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 +; SKX_SMALL-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1} +; SKX_SMALL-NEXT: retq +; +; SKX_LARGE-LABEL: pr45906: +; SKX_LARGE: # %bb.0: # %bb +; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax +; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1 +; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 +; SKX_LARGE-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1} +; SKX_LARGE-NEXT: retq +; +; SKX_32-LABEL: pr45906: +; SKX_32: # %bb.0: # %bb +; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1 +; SKX_32-NEXT: kxnorw %k0, 
%k0, %k1 +; SKX_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1} +; SKX_32-NEXT: retl +bb: + %tmp = getelementptr inbounds %struct.foo, <8 x %struct.foo*> %ptr, i64 0, i32 1 + %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %tmp, i32 8, <8 x i1> , <8 x i64> undef) + ret <8 x i64> %tmp1 +} +declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>) From 274bb3fdddf8fe692fa13f4b3ccb06df8a72b388 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 15 Sep 2020 15:45:33 +0200 Subject: [PATCH 238/363] Clang release notes: mention the max_tokens_here pragma --- clang/docs/ReleaseNotes.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ba0e15deb389..c39be709d86c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -311,7 +311,10 @@ Modified Compiler Flags New Pragmas in Clang -------------------- -- ... +- The ``clang max_tokens_here`` pragma can be used together with + `-Wmax-tokens `_ to emit a warning when + the number of preprocessor tokens exceeds a limit. Such limits can be helpful + in limiting code growth and slow compiles due to large header files. Attribute Changes in Clang -------------------------- From 1a8e4505d860ad0faa898526fc6fdc861c981516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Tue, 25 Aug 2020 13:49:41 +0200 Subject: [PATCH 239/363] [analyzer] Add documentation for alpha.fuchsia.Lock and alpha.core.C11Lock --- clang/docs/analyzer/checkers.rst | 37 +++++++++++++++++++ .../user-docs/CrossTranslationUnit.rst | 2 + 2 files changed, 39 insertions(+) diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 1583da7aff09..ca5aec677178 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -1472,6 +1472,23 @@ Warn about assigning non-{0,1} values to boolean variables. alpha.core ^^^^^^^^^^ +.. 
_alpha-core-C11Lock: + +alpha.core.C11Lock +"""""""""""""""""" +Similarly to :ref:`alpha.unix.PthreadLock `, checks for +the locking/unlocking of ``mtx_t`` mutexes. + +.. code-block:: cpp + + mtx_t mtx1; + + void bad1(void) + { + mtx_lock(&mtx1); + mtx_lock(&mtx1); // warn: This lock has already been acquired + } + .. _alpha-core-CallAndMessageUnInitRefArg: alpha.core.CallAndMessageUnInitRefArg (C,C++, ObjC) @@ -1849,6 +1866,26 @@ Check for dereference of null smart pointers. *P; // warn: dereference of a default constructed smart unique_ptr } +alpha.fuchsia +^^^^^^^^^^^^^ + +.. _alpha-fuchsia-lock: + +alpha.fuchsia.Lock +"""""""""""""""""" +Similarly to :ref:`alpha.unix.PthreadLock `, checks for +the locking/unlocking of fuchsia mutexes. + +.. code-block:: cpp + + spin_lock_t mtx1; + + void bad1(void) + { + spin_lock(&mtx1); + spin_lock(&mtx1); // warn: This lock has already been acquired + } + alpha.llvm ^^^^^^^^^^ diff --git a/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst b/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst index 36be82f209ef..0606185f39e6 100644 --- a/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst +++ b/clang/docs/analyzer/user-docs/CrossTranslationUnit.rst @@ -201,6 +201,8 @@ Example usage of scan-build-py: ^C $ +.. _ctu-on-demand: + On-demand analysis __________________ The analysis produces the necessary AST structure of external TUs during analysis. This requires the From e62452bb3e1e163daf75914cdf2e86deb4debf50 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 15 Sep 2020 16:42:07 +0200 Subject: [PATCH 240/363] Revert "Double check that passes correctly set their Modified status" This check fires during self-host. > The approach is simple: if a pass reports that it's not modifying a > Function/Module, compute a loose hash of that Function/Module and compare it > with the original one. If we report no change but there's a hash change, then we > have an error. 
> > This approach misses a lot of change but it's not super intrusive and can > detect most of the simple mistakes. > > Differential Revision: https://reviews.llvm.org/D80916 This reverts commit 3667d87a33d3c8d4072a41fd84bb880c59347dc0. --- llvm/lib/IR/LegacyPassManager.cpp | 87 --------------------- llvm/unittests/IR/LegacyPassManagerTest.cpp | 2 +- 2 files changed, 1 insertion(+), 88 deletions(-) diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 74869fa62c66..4189aea46294 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -1475,74 +1475,6 @@ void FPPassManager::dumpPassStructure(unsigned Offset) { } } -#ifdef EXPENSIVE_CHECKS -namespace { -namespace details { - -// Basic hashing mechanism to detect structural change to the IR, used to verify -// pass return status consistency with actual change. Loosely copied from -// llvm/lib/Transforms/Utils/FunctionComparator.cpp - -class StructuralHash { - uint64_t Hash = 0x6acaa36bef8325c5ULL; - - void update(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); } - -public: - StructuralHash() = default; - - void update(Function &F) { - if (F.empty()) - return; - - update(F.isVarArg()); - update(F.arg_size()); - - SmallVector BBs; - SmallPtrSet VisitedBBs; - - BBs.push_back(&F.getEntryBlock()); - VisitedBBs.insert(BBs[0]); - while (!BBs.empty()) { - const BasicBlock *BB = BBs.pop_back_val(); - update(45798); // Block header - for (auto &Inst : *BB) - update(Inst.getOpcode()); - - const Instruction *Term = BB->getTerminator(); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - if (!VisitedBBs.insert(Term->getSuccessor(i)).second) - continue; - BBs.push_back(Term->getSuccessor(i)); - } - } - } - - void update(Module &M) { - for (Function &F : M) - update(F); - } - - uint64_t getHash() const { return Hash; } -}; - -} // namespace details - -uint64_t StructuralHash(Function &F) { - details::StructuralHash H; - H.update(F); - 
return H.getHash(); -} - -uint64_t StructuralHash(Module &M) { - details::StructuralHash H; - H.update(M); - return H.getHash(); -} - -} // end anonymous namespace - -#endif /// Execute all of the passes scheduled for execution by invoking /// runOnFunction method. Keep track of whether any of the passes modifies @@ -1581,16 +1513,7 @@ bool FPPassManager::runOnFunction(Function &F) { { PassManagerPrettyStackEntry X(FP, F); TimeRegion PassTimer(getPassTimer(FP)); -#ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(F); -#endif LocalChanged |= FP->runOnFunction(F); - -#ifdef EXPENSIVE_CHECKS - assert((LocalChanged || (RefHash == StructuralHash(F))) && - "Pass modifies its input and doesn't report it."); -#endif - if (EmitICRemark) { unsigned NewSize = F.getInstructionCount(); @@ -1691,17 +1614,7 @@ MPPassManager::runOnModule(Module &M) { PassManagerPrettyStackEntry X(MP, M); TimeRegion PassTimer(getPassTimer(MP)); -#ifdef EXPENSIVE_CHECKS - uint64_t RefHash = StructuralHash(M); -#endif - LocalChanged |= MP->runOnModule(M); - -#ifdef EXPENSIVE_CHECKS - assert((LocalChanged || (RefHash == StructuralHash(M))) && - "Pass modifies its input and doesn't report it."); -#endif - if (EmitICRemark) { // Update the size of the module. 
unsigned ModuleCount = M.getInstructionCount(); diff --git a/llvm/unittests/IR/LegacyPassManagerTest.cpp b/llvm/unittests/IR/LegacyPassManagerTest.cpp index 8dda94b1b032..b7801b52481d 100644 --- a/llvm/unittests/IR/LegacyPassManagerTest.cpp +++ b/llvm/unittests/IR/LegacyPassManagerTest.cpp @@ -680,7 +680,7 @@ namespace llvm { ASSERT_EQ(M->getFunctionList().size(), 4U); Function *F = M->getFunction("test2"); Function *SF = splitSimpleFunction(*F); - CallInst::Create(F, "", &*SF->getEntryBlock().getFirstInsertionPt()); + CallInst::Create(F, "", &SF->getEntryBlock()); ASSERT_EQ(M->getFunctionList().size(), 5U); CGModifierPass *P = new CGModifierPass(); legacy::PassManager Passes; From 791b7e9f73e0064153a7c3db8045a7333a8c390c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Tue, 25 Aug 2020 13:48:04 +0200 Subject: [PATCH 241/363] [release][docs] Add 11.0.0. release notes for the Clang Static Analyzer Differential Revision: https://reviews.llvm.org/D86533 --- clang/docs/ReleaseNotes.rst | 69 ++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c39be709d86c..ee257194d57f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -603,10 +603,77 @@ libclang - ... +.. _release-notes-clang-static-analyzer: + Static Analyzer --------------- -- ... +- Improved the analyzer's understanding of inherited C++ constructors. + +- Improved the analyzer's understanding of dynamic class method dispatching in + Objective-C. + +- Greatly improved the analyzer's constraint solver by better understanding + when constraints are imposed on multiple symbolic values that are known to be + equal or known to be non-equal. It will now also efficiently reject impossible + if-branches between known comparison expressions. + +- Added :ref:`on-demand parsing ` capability to Cross Translation + Unit (CTU) analysis. 
+
+- Numerous fixes and improvements for the HTML output.
+
+- New checker: :ref:`alpha.core.C11Lock ` and
+  :ref:`alpha.fuchsia.Lock ` checks for their respective
+  locking APIs.
+
+- New checker: :ref:`alpha.security.cert.pos.34c `
+  finds calls to ``putenv`` where a pointer to an automatic variable is passed
+  as an argument.
+
+- New checker: :ref:`webkit.NoUncountedMemberChecker
+  ` checks that only ref-counted types
+  are used as class members, not raw pointers and references to uncounted
+  types.
+
+- New checker: :ref:`webkit.RefCntblBaseVirtualDtor
+  ` to enforce the existence of virtual
+  destructors for all uncounted types used as base classes.
+
+- New checker: :ref:`alpha.cplusplus.SmartPtr ` check
+  for dereference of null smart pointers.
+
+- Moved ``PlacementNewChecker`` out of alpha, making it enabled by default.
+
+- Added support for multi-dimensional variadic arrays in ``core.VLASize``.
+
+- Added a check for insufficient storage in array placement new calls, as well
+  as support for alignment variants to ``cplusplus.PlacementNew``.
+
+- While still in alpha, ``alpha.unix.PthreadLock``, the iterator and container
+  modeling infrastructure, ``alpha.unix.Stream``, and taint analysis were
+  improved greatly. An ongoing, currently off-by-default improvement was made on
+  the pre-condition modeling of several functions defined in the POSIX standard.
+
+- Improved the warning messages of several checkers.
+
+- Fixed a few remaining cases of checkers emitting reports under incorrect
+  checker names, and employed a few restrictions to more easily identify and
+  avoid such errors.
+
+- Moved several non-reporting checkers (those that model, among other things,
+  standard functions, but emit no diagnostics) to be listed under
+  ``-analyzer-checker-help-developer`` instead of ``-analyzer-checker-help``.
+  Manually enabling or disabling checkers found on this list is not supported
+  in production.
+ +- Numerous fixes for crashes, false positives and false negatives in + ``unix.Malloc``, ``osx.cocoa.NSError``, and several other checkers. + +- Implemented a dockerized testing system to more easily determine the + correctness and performance impact of a change to the static analyzer itself. + The currently beta-version tool is found in + ``(llvm-project repository)/clang/utils/analyzer/SATest.py``. .. _release-notes-ubsan: From 22dab218407e159631fd0689cb4412646b51515a Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Tue, 15 Sep 2020 22:03:50 +0800 Subject: [PATCH 242/363] Revert "[SelectionDAG] Remove unused FP constant in getNegatedExpression" 2508ef01 doesn't totally fix the issue since we did not handle the case when unused temporary negated result is the same with the result, which is found by address sanitizer. (cherry picked from commit e1669843f2aaf1e4929afdd8f125c14536d27664) --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +-- llvm/test/CodeGen/X86/pr47517.ll | 28 ------------------- 2 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/pr47517.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4ebb99c97841..819e608c6896 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5751,10 +5751,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // If we already have the use of the negated floating constant, it is free // to negate it even it has multiple uses. 
- if (!Op.hasOneUse() && CFP.use_empty()) { - RemoveDeadNode(CFP); + if (!Op.hasOneUse() && CFP.use_empty()) break; - } Cost = NegatibleCost::Neutral; return CFP; } diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll deleted file mode 100644 index 6b508acf15dd..000000000000 --- a/llvm/test/CodeGen/X86/pr47517.ll +++ /dev/null @@ -1,28 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple x86_64 < %s | FileCheck %s - -; To ensure unused floating point constant is removed in negation -define float @test(float %src, float* %p) { -; CHECK-LABEL: test: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movq $0, (%rdi) -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: retq -entry: - %a0 = getelementptr inbounds float, float* %p, i32 0 - %a1 = getelementptr inbounds float, float* %p, i32 1 - store float 0.000000e+00, float* %a0 - store float 0.000000e+00, float* %a1 - %zero = load float, float* %a0 - %fmul1 = fmul fast float %zero, %src - %fadd1 = fadd fast float %fmul1, %zero - %fmul2 = fmul fast float %fadd1, 2.000000e+00 - %fmul3 = fmul fast float %fmul2, %fmul2 - %fmul4 = fmul fast float %fmul2, 2.000000e+00 - %fadd2 = fadd fast float %fmul4, -3.000000e+00 - %fmul5 = fmul fast float %fadd2, %fmul2 - %fadd3 = fadd fast float %fmul2, %src - %fadd4 = fadd fast float %fadd3, %fmul5 - %fmul6 = fmul fast float %fmul3, %fadd4 - ret float %fmul6 -} From d3f1f588f902a968f102d6cdaf052674efc257aa Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 1 Sep 2020 09:43:11 -0700 Subject: [PATCH 243/363] [Docs] Add/update release notes for D71913 (LTO WPD changes) This adds documentation for the options added / changed by D71913, which enabled aggressive WPD under LTO. The lld release notes already mentioned it, but I expanded the note. 
Differential Revision: https://reviews.llvm.org/D86958 --- clang/docs/ReleaseNotes.rst | 3 ++- lld/docs/ReleaseNotes.rst | 2 +- llvm/docs/ReleaseNotes.rst | 6 ++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ee257194d57f..d90b8f182ef9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -306,7 +306,8 @@ Modified Compiler Flags - -mcpu is now supported for RISC-V, and recognises the generic-rv32, rocket-rv32, sifive-e31, generic-rv64, rocket-rv64, and sifive-u54 target CPUs. - +- ``-fwhole-program-vtables`` (along with ``-flto*``) now prepares all classes for possible whole program visibility if specified during the LTO link. + (`D71913 `_) New Pragmas in Clang -------------------- diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 466a7f707354..880f933e51be 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -26,7 +26,7 @@ ELF Improvements * ``--lto-emit-asm`` is added to emit assembly output for debugging purposes. (`D77231 `_) -* ``--lto-whole-program-visibility`` is added to support LTO whole-program devirtualization. +* ``--lto-whole-program-visibility`` is added to specify that classes have hidden LTO visibility in LTO and ThinLTO links of source files compiled with ``-fwhole-program-vtables``. See `LTOVisibility `_ for details. (`D71913 `_) * ``--print-archive-stats=`` is added to print the number of members and the number of fetched members for each archive. The feature is similar to GNU gold's ``--print-symbol-counts=``. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 0d5e0137bbc4..e87bf3d146f5 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -333,6 +333,12 @@ Changes to the Debug Info passed to the callee. The feature improves the debugging user experience when debugging optimized code. 
+Changes to the Gold Plugin +-------------------------- + +* ``--plugin-opt=whole-program-visibility`` is added to specify that classes have hidden LTO visibility in LTO and ThinLTO links of source files compiled with ``-fwhole-program-vtables``. See `LTOVisibility `_ for details. + (`D71913 `_) + Changes to the LLVM tools --------------------------------- From 1596c2dfd548b21cf33ad3353882ac465d78c1bb Mon Sep 17 00:00:00 2001 From: Joachim Priesner Date: Thu, 20 Aug 2020 09:15:29 -0400 Subject: [PATCH 244/363] Fix -allow-enabling-analyzer-alpha-checkers always being passed to run-clang-tidy.py The action='store_true' option of argparse.add_argument implicitly generates a default value of False if the argument is not specified. Thus, the allow_enabling_alpha_checkers argument of get_tidy_invocation is never None. --- clang-tools-extra/clang-tidy/tool/run-clang-tidy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index 4272ae0957fe..7e23419cd916 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -84,7 +84,7 @@ def get_tidy_invocation(f, clang_tidy_binary, checks, tmpdir, build_path, extra_arg, extra_arg_before, quiet, config): """Gets a command line for clang-tidy.""" start = [clang_tidy_binary] - if allow_enabling_alpha_checkers is not None: + if allow_enabling_alpha_checkers: start.append('-allow-enabling-analyzer-alpha-checkers') if header_filter is not None: start.append('-header-filter=' + header_filter) From 4b23932e230dd48a7bfc6fadb461d0ef81aeba94 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 15 Sep 2020 18:38:48 +0200 Subject: [PATCH 245/363] [OPENMP][NFC]Release notes for OpenMP in clang (11.x). By Alexey Bataev! 
Differential revision: https://reviews.llvm.org/D86562 --- clang/docs/ReleaseNotes.rst | 47 ++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d90b8f182ef9..1c02c478be68 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -439,7 +439,52 @@ ABI Changes in Clang OpenMP Support in Clang ----------------------- -- ... +New features for OpenMP 5.0 were implemented. + +- OpenMP 5.0 is the default version supported by the compiler. User can switch + to OpenMP 4.5 using ``-fopenmp-version=45`` option. + +- Added support for declare variant directive. + +- Improved support of math functions and complex types for NVPTX target. + +- Added support for parallel execution of target regions for NVPTX target. + +- Added support for ``scan`` directives and ``inscan`` modifier in ``reduction`` + clauses. + +- Added support for ``iterator`` construct. + +- Added support for ``depobj`` construct. + +- Added support for ``detach`` clauses in task-based directives. + +- Added support for array shaping operations. + +- Added support for cancellation constructs in ``taskloop`` directives. + +- Nonmonotonic modifier is allowed with all schedule kinds. + +- Added support for ``task`` and ``default`` modifiers in ``reduction`` clauses. + +- Added support for strides in array sections. + +- Added support for ``use_device_addr`` clause. + +- Added support for ``uses_allocators`` clause. + +- Added support for ``defaultmap`` clause. + +- Added basic support for ``hint`` clause in ``atomic`` directives. + +- Added basic support for ``affinity`` clause. + +- Added basic support for ``ancestor`` modifier in ``device`` clause. + +- Added support for ``default(firstprivate)`` clause. This clause is the part of + upcoming OpenMP 5.1 and can be enabled using ``-fopenmp-version=51`` option. + +- Bug fixes and optimizations. 
CUDA Support in Clang
---------------------

From 6e042866c307c0ebe35094e6590dc1a1372f13c9 Mon Sep 17 00:00:00 2001
From: Hans Wennborg 
Date: Tue, 15 Sep 2020 10:47:02 +0200
Subject: [PATCH 246/363] Revert "RegAllocFast: Record internal state based on
 register units"

This seems to have caused incorrect register allocation in some cases,
breaking tests in the Zig standard library (PR47278).

As discussed on the bug, revert back to green for now.

> Record internal state based on register units. This is often more
> efficient as there are typically fewer register units to update
> compared to iterating over all the aliases of a register.
>
> Original patch by Matthias Braun, but I've been rebasing and fixing it
> for almost 2 years and fixed a few bugs causing intermediate failures
> to make this patch independent of the changes in
> https://reviews.llvm.org/D52010.

This reverts commit 66251f7e1de79a7c1620659b7f58352b8c8e892e, and
follow-ups 931a68f26b9a3de853807ffad7b2cd0a2dd30922
and 0671a4c5087d40450603d9d26cf239f1a8b1367e. It also adjusts some test
expectations.
(cherry picked from commit a21387c65470417c58021f8d3194a4510bb64f46) --- llvm/lib/CodeGen/RegAllocFast.cpp | 217 +-- .../arm64-fast-isel-conversion-fallback.ll | 8 +- .../AArch64/arm64-fast-isel-conversion.ll | 8 +- llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll | 8 +- .../CodeGen/AArch64/fast-isel-sp-adjust.ll | 3 +- llvm/test/CodeGen/AArch64/popcount.ll | 37 +- .../AMDGPU/indirect-addressing-term.ll | 12 +- .../AMDGPU/partial-sgpr-to-vgpr-spills.ll | 1260 ++++++++--------- llvm/test/CodeGen/AMDGPU/spill-m0.ll | 95 -- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 8 +- llvm/test/CodeGen/ARM/legalize-bitcast.ll | 6 +- .../GlobalISel/llvm-ir/fptosi_and_fptoui.ll | 72 +- llvm/test/CodeGen/Mips/atomic-min-max.ll | 960 ++++++------- llvm/test/CodeGen/Mips/atomic.ll | 282 ++-- llvm/test/CodeGen/Mips/implicit-sret.ll | 14 +- llvm/test/CodeGen/PowerPC/addegluecrash.ll | 10 +- llvm/test/CodeGen/PowerPC/popcount.ll | 14 +- llvm/test/CodeGen/PowerPC/vsx.ll | 54 +- llvm/test/CodeGen/SPARC/fp16-promote.ll | 10 +- .../CodeGen/X86/2009-04-14-IllegalRegs.ll | 29 +- llvm/test/CodeGen/X86/atomic-unordered.ll | 58 +- llvm/test/CodeGen/X86/atomic32.ll | 122 +- llvm/test/CodeGen/X86/atomic64.ll | 40 +- llvm/test/CodeGen/X86/avx-load-store.ll | 22 +- .../CodeGen/X86/avx512-mask-zext-bugfix.ll | 22 +- llvm/test/CodeGen/X86/crash-O0.ll | 9 +- .../CodeGen/X86/extend-set-cc-uses-dbg.ll | 4 +- .../test/CodeGen/X86/fast-isel-nontemporal.ll | 60 +- llvm/test/CodeGen/X86/lvi-hardening-loads.ll | 4 +- llvm/test/CodeGen/X86/mixed-ptr-sizes.ll | 102 +- llvm/test/CodeGen/X86/pr1489.ll | 24 +- llvm/test/CodeGen/X86/pr27591.ll | 14 +- llvm/test/CodeGen/X86/pr30430.ll | 34 +- llvm/test/CodeGen/X86/pr30813.ll | 5 +- llvm/test/CodeGen/X86/pr32241.ll | 18 +- llvm/test/CodeGen/X86/pr32284.ll | 274 ++-- llvm/test/CodeGen/X86/pr32340.ll | 54 +- llvm/test/CodeGen/X86/pr32345.ll | 63 +- llvm/test/CodeGen/X86/pr32451.ll | 23 +- llvm/test/CodeGen/X86/pr34592.ll | 25 +- llvm/test/CodeGen/X86/pr39733.ll | 4 +- 
llvm/test/CodeGen/X86/pr44749.ll | 24 +- llvm/test/CodeGen/X86/pr47000.ll | 135 +- .../regalloc-fast-missing-live-out-spill.mir | 8 +- llvm/test/CodeGen/X86/swift-return.ll | 41 +- llvm/test/CodeGen/X86/swifterror.ll | 4 +- llvm/test/DebugInfo/X86/op_deref.ll | 8 +- 47 files changed, 2155 insertions(+), 2153 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 5396f9f3a143..cf3eaba23bee 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -106,8 +106,13 @@ namespace { /// that it is alive across blocks. BitVector MayLiveAcrossBlocks; - /// State of a register unit. - enum RegUnitState { + /// State of a physical register. + enum RegState { + /// A disabled register is not available for allocation, but an alias may + /// be in use. A register can only be moved out of the disabled state if + /// all aliases are disabled. + regDisabled, + /// A free register is not currently in use and can be allocated /// immediately without checking aliases. regFree, @@ -121,8 +126,8 @@ namespace { /// register. In that case, LiveVirtRegs contains the inverse mapping. }; - /// Maps each physical register to a RegUnitState enum or virtual register. - std::vector RegUnitStates; + /// Maps each physical register to a RegState enum or a virtual register. 
+ std::vector PhysRegState; SmallVector VirtDead; SmallVector Coalesced; @@ -184,10 +189,6 @@ namespace { bool isLastUseOfLocalReg(const MachineOperand &MO) const; void addKillFlag(const LiveReg &LRI); -#ifndef NDEBUG - bool verifyRegStateMapping(const LiveReg &LR) const; -#endif - void killVirtReg(LiveReg &LR); void killVirtReg(Register VirtReg); void spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR); @@ -195,7 +196,7 @@ namespace { void usePhysReg(MachineOperand &MO); void definePhysReg(MachineBasicBlock::iterator MI, MCPhysReg PhysReg, - unsigned NewState); + RegState NewState); unsigned calcSpillCost(MCPhysReg PhysReg) const; void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg); @@ -228,7 +229,7 @@ namespace { bool mayLiveOut(Register VirtReg); bool mayLiveIn(Register VirtReg); - void dumpState() const; + void dumpState(); }; } // end anonymous namespace @@ -239,8 +240,7 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false, false) void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) - RegUnitStates[*UI] = NewState; + PhysRegState[PhysReg] = NewState; } /// This allocates space for the specified virtual register to be held on the @@ -384,23 +384,12 @@ void RegAllocFast::addKillFlag(const LiveReg &LR) { } } -#ifndef NDEBUG -bool RegAllocFast::verifyRegStateMapping(const LiveReg &LR) const { - for (MCRegUnitIterator UI(LR.PhysReg, TRI); UI.isValid(); ++UI) { - if (RegUnitStates[*UI] != LR.VirtReg) - return false; - } - - return true; -} -#endif - /// Mark virtreg as no longer available. 
void RegAllocFast::killVirtReg(LiveReg &LR) { - assert(verifyRegStateMapping(LR) && "Broken RegState mapping"); addKillFlag(LR); - MCPhysReg PhysReg = LR.PhysReg; - setPhysRegState(PhysReg, regFree); + assert(PhysRegState[LR.PhysReg] == LR.VirtReg && + "Broken RegState mapping"); + setPhysRegState(LR.PhysReg, regFree); LR.PhysReg = 0; } @@ -427,9 +416,7 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, /// Do the actual work of spilling. void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) { - assert(verifyRegStateMapping(LR) && "Broken RegState mapping"); - - MCPhysReg PhysReg = LR.PhysReg; + assert(PhysRegState[LR.PhysReg] == LR.VirtReg && "Broken RegState mapping"); if (LR.Dirty) { // If this physreg is used by the instruction, we want to kill it on the @@ -437,7 +424,7 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) { bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI; LR.Dirty = false; - spill(MI, LR.VirtReg, PhysReg, SpillKill); + spill(MI, LR.VirtReg, LR.PhysReg, SpillKill); if (SpillKill) LR.LastUse = nullptr; // Don't kill register again @@ -473,16 +460,53 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) { assert(PhysReg.isPhysical() && "Bad usePhysReg operand"); markRegUsedInInstr(PhysReg); + switch (PhysRegState[PhysReg]) { + case regDisabled: + break; + case regReserved: + PhysRegState[PhysReg] = regFree; + LLVM_FALLTHROUGH; + case regFree: + MO.setIsKill(); + return; + default: + // The physreg was allocated to a virtual register. That means the value we + // wanted has been clobbered. + llvm_unreachable("Instruction uses an allocated register"); + } - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (RegUnitStates[*UI]) { + // Maybe a superregister is reserved? 
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + MCPhysReg Alias = *AI; + switch (PhysRegState[Alias]) { + case regDisabled: + break; case regReserved: - RegUnitStates[*UI] = regFree; + // Either PhysReg is a subregister of Alias and we mark the + // whole register as free, or PhysReg is the superregister of + // Alias and we mark all the aliases as disabled before freeing + // PhysReg. + // In the latter case, since PhysReg was disabled, this means that + // its value is defined only by physical sub-registers. This check + // is performed by the assert of the default case in this loop. + // Note: The value of the superregister may only be partial + // defined, that is why regDisabled is a valid state for aliases. + assert((TRI->isSuperRegister(PhysReg, Alias) || + TRI->isSuperRegister(Alias, PhysReg)) && + "Instruction is not using a subregister of a reserved register"); LLVM_FALLTHROUGH; case regFree: + if (TRI->isSuperRegister(PhysReg, Alias)) { + // Leave the superregister in the working set. + setPhysRegState(Alias, regFree); + MO.getParent()->addRegisterKilled(Alias, TRI, true); + return; + } + // Some other alias was in the working set - clear it. + setPhysRegState(Alias, regDisabled); break; default: - llvm_unreachable("Unexpected reg unit state"); + llvm_unreachable("Instruction uses an alias of an allocated register"); } } @@ -495,20 +519,38 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) { /// similar to defineVirtReg except the physreg is reserved instead of /// allocated. 
void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI, - MCPhysReg PhysReg, unsigned NewState) { - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (unsigned VirtReg = RegUnitStates[*UI]) { + MCPhysReg PhysReg, RegState NewState) { + markRegUsedInInstr(PhysReg); + switch (Register VirtReg = PhysRegState[PhysReg]) { + case regDisabled: + break; + default: + spillVirtReg(MI, VirtReg); + LLVM_FALLTHROUGH; + case regFree: + case regReserved: + setPhysRegState(PhysReg, NewState); + return; + } + + // This is a disabled register, disable all aliases. + setPhysRegState(PhysReg, NewState); + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + MCPhysReg Alias = *AI; + switch (Register VirtReg = PhysRegState[Alias]) { + case regDisabled: + break; default: spillVirtReg(MI, VirtReg); - break; + LLVM_FALLTHROUGH; case regFree: case regReserved: + setPhysRegState(Alias, regDisabled); + if (TRI->isSuperRegister(PhysReg, Alias)) + return; break; } } - - markRegUsedInInstr(PhysReg); - setPhysRegState(PhysReg, NewState); } /// Return the cost of spilling clearing out PhysReg and aliases so it is free @@ -521,24 +563,46 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const { << " is already used in instr.\n"); return spillImpossible; } + switch (Register VirtReg = PhysRegState[PhysReg]) { + case regDisabled: + break; + case regFree: + return 0; + case regReserved: + LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding " + << printReg(PhysReg, TRI) << " is reserved already.\n"); + return spillImpossible; + default: { + LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); + assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && + "Missing VirtReg entry"); + return LRI->Dirty ? spillDirty : spillClean; + } + } - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (unsigned VirtReg = RegUnitStates[*UI]) { + // This is a disabled register, add up cost of aliases. 
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is disabled.\n"); + unsigned Cost = 0; + for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { + MCPhysReg Alias = *AI; + switch (Register VirtReg = PhysRegState[Alias]) { + case regDisabled: + break; case regFree: + ++Cost; break; case regReserved: - LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding " - << printReg(PhysReg, TRI) << " is reserved already.\n"); return spillImpossible; default: { LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg); assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && "Missing VirtReg entry"); - return LRI->Dirty ? spillDirty : spillClean; + Cost += LRI->Dirty ? spillDirty : spillClean; + break; } } } - return 0; + return Cost; } /// This method updates local state so that we know that PhysReg is the @@ -845,17 +909,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, if (!Reg || !Reg.isPhysical()) continue; markRegUsedInInstr(Reg); - - for (MCRegUnitIterator UI(Reg, TRI); UI.isValid(); ++UI) { - if (!ThroughRegs.count(RegUnitStates[*UI])) - continue; - - // Need to spill any aliasing registers. 
- for (MCRegUnitRootIterator RI(*UI, TRI); RI.isValid(); ++RI) { - for (MCSuperRegIterator SI(*RI, TRI, true); SI.isValid(); ++SI) { - definePhysReg(MI, *SI, regFree); - } - } + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { + if (ThroughRegs.count(PhysRegState[*AI])) + definePhysReg(MI, *AI, regFree); } } @@ -919,40 +975,37 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI, } #ifndef NDEBUG - -void RegAllocFast::dumpState() const { - for (unsigned Unit = 1, UnitE = TRI->getNumRegUnits(); Unit != UnitE; - ++Unit) { - switch (unsigned VirtReg = RegUnitStates[Unit]) { +void RegAllocFast::dumpState() { + for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) { + if (PhysRegState[Reg] == regDisabled) continue; + dbgs() << " " << printReg(Reg, TRI); + switch(PhysRegState[Reg]) { case regFree: break; case regReserved: - dbgs() << " " << printRegUnit(Unit, TRI) << "[P]"; + dbgs() << "*"; break; default: { - dbgs() << ' ' << printRegUnit(Unit, TRI) << '=' << printReg(VirtReg); - LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg); - assert(I != LiveVirtRegs.end() && "have LiveVirtRegs entry"); - if (I->Dirty) - dbgs() << "[D]"; - assert(TRI->hasRegUnit(I->PhysReg, Unit) && "inverse mapping present"); + dbgs() << '=' << printReg(PhysRegState[Reg]); + LiveRegMap::iterator LRI = findLiveVirtReg(PhysRegState[Reg]); + assert(LRI != LiveVirtRegs.end() && LRI->PhysReg && + "Missing VirtReg entry"); + if (LRI->Dirty) + dbgs() << "*"; + assert(LRI->PhysReg == Reg && "Bad inverse map"); break; } } } dbgs() << '\n'; // Check that LiveVirtRegs is the inverse. 
- for (const LiveReg &LR : LiveVirtRegs) { - Register VirtReg = LR.VirtReg; - assert(VirtReg.isVirtual() && "Bad map key"); - MCPhysReg PhysReg = LR.PhysReg; - if (PhysReg != 0) { - assert(Register::isPhysicalRegister(PhysReg) && - "mapped to physreg"); - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - assert(RegUnitStates[*UI] == VirtReg && "inverse map valid"); - } - } + for (LiveRegMap::iterator i = LiveVirtRegs.begin(), + e = LiveVirtRegs.end(); i != e; ++i) { + if (!i->PhysReg) + continue; + assert(i->VirtReg.isVirtual() && "Bad map key"); + assert(Register::isPhysicalRegister(i->PhysReg) && "Bad map value"); + assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map"); } } #endif @@ -1194,7 +1247,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; LLVM_DEBUG(dbgs() << "\nAllocating " << MBB); - RegUnitStates.assign(TRI->getNumRegUnits(), regFree); + PhysRegState.assign(TRI->getNumRegs(), regDisabled); assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?"); MachineBasicBlock::iterator MII = MBB.begin(); diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll index 7c546936ba27..392af063eb8a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll @@ -4,8 +4,8 @@ define i32 @fptosi_wh(half %a) nounwind ssp { entry: ; CHECK-LABEL: fptosi_wh -; CHECK: fcvt s0, h0 -; CHECK: fcvtzs [[REG:w[0-9]+]], s0 +; CHECK: fcvt s1, h0 +; CHECK: fcvtzs [[REG:w[0-9]+]], s1 ; CHECK: mov w0, [[REG]] %conv = fptosi half %a to i32 ret i32 %conv @@ -15,8 +15,8 @@ entry: define i32 @fptoui_swh(half %a) nounwind ssp { entry: ; CHECK-LABEL: fptoui_swh -; CHECK: fcvt s0, h0 -; CHECK: fcvtzu [[REG:w[0-9]+]], s0 +; CHECK: fcvt s1, h0 +; CHECK: fcvtzu [[REG:w[0-9]+]], s1 ; CHECK: mov w0, [[REG]] %conv = fptoui half %a to i32 ret i32 
%conv diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll index d8abf14c1366..ed03aec07e7d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll @@ -54,8 +54,8 @@ entry: ; CHECK: ldrh w8, [sp, #12] ; CHECK: str w8, [sp, #8] ; CHECK: ldr w8, [sp, #8] -; CHECK: ; kill: def $x8 killed $w8 -; CHECK: str x8, [sp] +; CHECK: mov x9, x8 +; CHECK: str x9, [sp] ; CHECK: ldr x0, [sp] ; CHECK: ret %a.addr = alloca i8, align 1 @@ -109,8 +109,8 @@ entry: ; CHECK: strh w8, [sp, #12] ; CHECK: ldrsh w8, [sp, #12] ; CHECK: str w8, [sp, #8] -; CHECK: ldrsw x8, [sp, #8] -; CHECK: str x8, [sp] +; CHECK: ldrsw x9, [sp, #8] +; CHECK: str x9, [sp] ; CHECK: ldr x0, [sp] ; CHECK: ret %a.addr = alloca i8, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index e1e889b906c0..6b3e8d747d43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -285,11 +285,11 @@ define i16 @to_half(float %in) { ; FAST: // %bb.0: ; FAST-NEXT: sub sp, sp, #16 // =16 ; FAST-NEXT: .cfi_def_cfa_offset 16 -; FAST-NEXT: fcvt h0, s0 +; FAST-NEXT: fcvt h1, s0 ; FAST-NEXT: // implicit-def: $w0 -; FAST-NEXT: fmov s1, w0 -; FAST-NEXT: mov.16b v1, v0 -; FAST-NEXT: fmov w8, s1 +; FAST-NEXT: fmov s0, w0 +; FAST-NEXT: mov.16b v0, v1 +; FAST-NEXT: fmov w8, s0 ; FAST-NEXT: mov w0, w8 ; FAST-NEXT: str w0, [sp, #12] // 4-byte Folded Spill ; FAST-NEXT: mov w0, w8 diff --git a/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll b/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll index 22e3ccf2b120..8d62fb355666 100644 --- a/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-sp-adjust.ll @@ -15,7 +15,8 @@ ; CHECK-LABEL: foo: ; CHECK: sub ; CHECK-DAG: mov x[[SP:[0-9]+]], sp -; CHECK-DAG: mov w[[OFFSET:[0-9]+]], #4104 +; CHECK-DAG: mov 
[[TMP:w[0-9]+]], #4104 +; CHECK: mov w[[OFFSET:[0-9]+]], [[TMP]] ; CHECK: strb w0, [x[[SP]], x[[OFFSET]]] define void @foo(i8 %in) { diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index 105969717e46..1e796fff710c 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -10,11 +10,12 @@ define i8 @popcount128(i128* nocapture nonnull readonly %0) { ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: cnt v0.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b ; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: fmov w1, s1 +; CHECK-NEXT: mov w0, w1 ; CHECK-NEXT: ret Entry: %1 = load i128, i128* %0, align 16 @@ -36,21 +37,21 @@ define i16 @popcount256(i256* nocapture nonnull readonly %0) { ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.d[1], x9 -; CHECK-NEXT: cnt v0.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b ; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: cnt v0.16b, v1.16b -; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: cnt v1.16b, v1.16b +; CHECK-NEXT: uaddlv h2, v1.16b ; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: add w0, w11, w10 ; CHECK-NEXT: ret Entry: %1 = load i256, i256* %0, align 16 @@ -69,11 +70,11 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; 
CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: // kill: def $x0 killed $w0 +; CHECK-NEXT: uaddlv h1, v0.16b +; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmov w2, s0 +; CHECK-NEXT: mov w0, w2 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov x1, v0.d[1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index e26b1c947104..40ef3b00da6d 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -69,15 +69,15 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: renamable $vgpr30 = COPY killed renamable $vgpr14 ; GCN: renamable $vgpr31 = COPY killed renamable $vgpr15 ; GCN: renamable $vgpr32 = COPY killed renamable $vgpr16 - ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GCN: renamable $sgpr20_sgpr21 = S_MOV_B64 $exec ; GCN: renamable $vgpr1 = IMPLICIT_DEF - ; GCN: renamable $sgpr2_sgpr3 = IMPLICIT_DEF + ; GCN: renamable $sgpr22_sgpr23 = IMPLICIT_DEF ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; GCN: SI_SPILL_S128_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) ; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.3, implicit $exec, implicit 
$sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr20_sgpr21, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5) ; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000) ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.5, align 4, addrspace 5) @@ -91,8 +91,8 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0 ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GCN: renamable $vgpr19 = COPY renamable $vgpr18 - ; GCN: renamable $sgpr2_sgpr3 = COPY renamable $sgpr4_sgpr5 - ; GCN: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) + ; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5 + ; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit 
$sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.6, align 4, addrspace 5) ; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index b119ffd303e0..e991c550c6be 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND @@ -42,354 +42,352 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[84:91] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 
v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:11] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-NEXT: v_writelane_b32 v0, s4, 1 -; GCN-NEXT: v_writelane_b32 v0, s5, 2 -; GCN-NEXT: v_writelane_b32 v0, s6, 3 -; GCN-NEXT: v_writelane_b32 v0, s7, 4 -; GCN-NEXT: v_writelane_b32 v0, s8, 5 -; GCN-NEXT: v_writelane_b32 v0, s9, 6 -; GCN-NEXT: v_writelane_b32 v0, s10, 7 -; GCN-NEXT: v_writelane_b32 v0, s11, 8 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 9 -; GCN-NEXT: v_writelane_b32 v0, s1, 10 -; GCN-NEXT: v_writelane_b32 v0, s2, 11 -; GCN-NEXT: v_writelane_b32 v0, s3, 12 -; GCN-NEXT: v_writelane_b32 v0, s4, 13 -; GCN-NEXT: v_writelane_b32 v0, s5, 14 -; GCN-NEXT: v_writelane_b32 v0, s6, 15 -; GCN-NEXT: v_writelane_b32 v0, s7, 16 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 17 -; GCN-NEXT: v_writelane_b32 v0, s1, 18 -; GCN-NEXT: v_writelane_b32 v0, s2, 19 -; GCN-NEXT: v_writelane_b32 v0, s3, 20 -; GCN-NEXT: v_writelane_b32 v0, s4, 21 -; GCN-NEXT: v_writelane_b32 v0, s5, 22 -; GCN-NEXT: v_writelane_b32 v0, s6, 23 -; GCN-NEXT: v_writelane_b32 v0, s7, 24 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 25 -; GCN-NEXT: v_writelane_b32 v0, s1, 26 -; GCN-NEXT: v_writelane_b32 v0, s2, 27 -; GCN-NEXT: v_writelane_b32 v0, s3, 28 -; GCN-NEXT: v_writelane_b32 v0, s4, 29 -; GCN-NEXT: v_writelane_b32 v0, s5, 30 -; GCN-NEXT: v_writelane_b32 v0, s6, 31 -; GCN-NEXT: v_writelane_b32 v0, s7, 32 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 33 -; GCN-NEXT: v_writelane_b32 v0, s1, 34 -; GCN-NEXT: v_writelane_b32 v0, s2, 35 -; GCN-NEXT: v_writelane_b32 v0, s3, 36 -; GCN-NEXT: v_writelane_b32 v0, s4, 37 -; GCN-NEXT: v_writelane_b32 v0, s5, 38 -; GCN-NEXT: v_writelane_b32 v0, s6, 39 -; GCN-NEXT: v_writelane_b32 v0, s7, 40 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] 
-; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 41 -; GCN-NEXT: v_writelane_b32 v0, s1, 42 -; GCN-NEXT: v_writelane_b32 v0, s2, 43 -; GCN-NEXT: v_writelane_b32 v0, s3, 44 -; GCN-NEXT: v_writelane_b32 v0, s4, 45 -; GCN-NEXT: v_writelane_b32 v0, s5, 46 -; GCN-NEXT: v_writelane_b32 v0, s6, 47 -; GCN-NEXT: v_writelane_b32 v0, s7, 48 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 49 -; GCN-NEXT: v_writelane_b32 v0, s1, 50 -; GCN-NEXT: v_writelane_b32 v0, s2, 51 -; GCN-NEXT: v_writelane_b32 v0, s3, 52 -; GCN-NEXT: v_writelane_b32 v0, s4, 53 -; GCN-NEXT: v_writelane_b32 v0, s5, 54 -; GCN-NEXT: v_writelane_b32 v0, s6, 55 -; GCN-NEXT: v_writelane_b32 v0, s7, 56 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b32 s8, 0 -; GCN-NEXT: v_readlane_b32 s9, v0, 0 -; GCN-NEXT: s_cmp_lg_u32 s9, s8 -; GCN-NEXT: v_writelane_b32 v0, s12, 57 -; GCN-NEXT: v_writelane_b32 v0, s13, 58 -; GCN-NEXT: v_writelane_b32 v0, s14, 59 -; GCN-NEXT: v_writelane_b32 v0, s15, 60 -; GCN-NEXT: v_writelane_b32 v0, s16, 61 -; GCN-NEXT: v_writelane_b32 v0, s17, 62 -; GCN-NEXT: v_writelane_b32 v0, s18, 63 -; GCN-NEXT: v_writelane_b32 v1, s19, 0 -; GCN-NEXT: v_writelane_b32 v1, s20, 1 -; GCN-NEXT: v_writelane_b32 v1, s21, 2 -; GCN-NEXT: v_writelane_b32 v1, s22, 3 -; GCN-NEXT: v_writelane_b32 v1, s23, 4 -; GCN-NEXT: v_writelane_b32 v1, s24, 5 -; GCN-NEXT: v_writelane_b32 v1, s25, 6 -; GCN-NEXT: v_writelane_b32 v1, s26, 7 -; GCN-NEXT: v_writelane_b32 v1, s27, 8 -; GCN-NEXT: v_writelane_b32 v1, s36, 9 -; GCN-NEXT: v_writelane_b32 v1, s37, 10 -; GCN-NEXT: v_writelane_b32 v1, s38, 11 -; GCN-NEXT: v_writelane_b32 v1, s39, 12 -; GCN-NEXT: v_writelane_b32 v1, s40, 13 -; GCN-NEXT: v_writelane_b32 v1, s41, 14 -; GCN-NEXT: v_writelane_b32 v1, s42, 15 -; GCN-NEXT: v_writelane_b32 v1, s43, 16 -; GCN-NEXT: v_writelane_b32 v1, s44, 17 -; GCN-NEXT: v_writelane_b32 v1, s45, 18 -; GCN-NEXT: v_writelane_b32 v1, 
s46, 19 -; GCN-NEXT: v_writelane_b32 v1, s47, 20 -; GCN-NEXT: v_writelane_b32 v1, s48, 21 -; GCN-NEXT: v_writelane_b32 v1, s49, 22 -; GCN-NEXT: v_writelane_b32 v1, s50, 23 -; GCN-NEXT: v_writelane_b32 v1, s51, 24 -; GCN-NEXT: v_writelane_b32 v1, s52, 25 -; GCN-NEXT: v_writelane_b32 v1, s53, 26 -; GCN-NEXT: v_writelane_b32 v1, s54, 27 -; GCN-NEXT: v_writelane_b32 v1, s55, 28 -; GCN-NEXT: v_writelane_b32 v1, s56, 29 -; GCN-NEXT: v_writelane_b32 v1, s57, 30 -; GCN-NEXT: v_writelane_b32 v1, s58, 31 -; GCN-NEXT: v_writelane_b32 v1, s59, 32 -; GCN-NEXT: v_writelane_b32 v1, s60, 33 -; GCN-NEXT: v_writelane_b32 v1, s61, 34 -; GCN-NEXT: v_writelane_b32 v1, s62, 35 -; GCN-NEXT: v_writelane_b32 v1, s63, 36 -; GCN-NEXT: v_writelane_b32 v1, s64, 37 -; GCN-NEXT: v_writelane_b32 v1, s65, 38 -; GCN-NEXT: v_writelane_b32 v1, s66, 39 -; GCN-NEXT: v_writelane_b32 v1, s67, 40 -; GCN-NEXT: v_writelane_b32 v1, s68, 41 -; GCN-NEXT: v_writelane_b32 v1, s69, 42 -; GCN-NEXT: v_writelane_b32 v1, s70, 43 -; GCN-NEXT: v_writelane_b32 v1, s71, 44 -; GCN-NEXT: v_writelane_b32 v1, s72, 45 -; GCN-NEXT: v_writelane_b32 v1, s73, 46 -; GCN-NEXT: v_writelane_b32 v1, s74, 47 -; GCN-NEXT: v_writelane_b32 v1, s75, 48 -; GCN-NEXT: v_writelane_b32 v1, s76, 49 -; GCN-NEXT: v_writelane_b32 v1, s77, 50 -; GCN-NEXT: v_writelane_b32 v1, s78, 51 -; GCN-NEXT: v_writelane_b32 v1, s79, 52 -; GCN-NEXT: v_writelane_b32 v1, s80, 53 -; GCN-NEXT: v_writelane_b32 v1, s81, 54 -; GCN-NEXT: v_writelane_b32 v1, s82, 55 -; GCN-NEXT: v_writelane_b32 v1, s83, 56 -; GCN-NEXT: v_writelane_b32 v1, s84, 57 -; GCN-NEXT: v_writelane_b32 v1, s85, 58 -; GCN-NEXT: v_writelane_b32 v1, s86, 59 -; GCN-NEXT: v_writelane_b32 v1, s87, 60 -; GCN-NEXT: v_writelane_b32 v1, s88, 61 -; GCN-NEXT: v_writelane_b32 v1, s89, 62 -; GCN-NEXT: v_writelane_b32 v1, s90, 63 -; GCN-NEXT: v_writelane_b32 v2, s91, 0 -; GCN-NEXT: v_writelane_b32 v2, s0, 1 -; GCN-NEXT: v_writelane_b32 v2, s1, 2 -; GCN-NEXT: v_writelane_b32 v2, s2, 3 -; GCN-NEXT: v_writelane_b32 
v2, s3, 4 -; GCN-NEXT: v_writelane_b32 v2, s4, 5 -; GCN-NEXT: v_writelane_b32 v2, s5, 6 -; GCN-NEXT: v_writelane_b32 v2, s6, 7 -; GCN-NEXT: v_writelane_b32 v2, s7, 8 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s20, 0 +; GCN-NEXT: v_writelane_b32 v1, s21, 1 +; GCN-NEXT: v_writelane_b32 v1, s22, 2 +; GCN-NEXT: v_writelane_b32 v1, s23, 3 +; GCN-NEXT: v_writelane_b32 v1, s24, 4 +; GCN-NEXT: v_writelane_b32 v1, s25, 5 +; GCN-NEXT: v_writelane_b32 v1, s26, 6 +; GCN-NEXT: v_writelane_b32 v1, s27, 7 +; GCN-NEXT: v_writelane_b32 v1, s36, 8 +; GCN-NEXT: v_writelane_b32 v1, s37, 9 +; GCN-NEXT: v_writelane_b32 v1, s38, 10 +; GCN-NEXT: v_writelane_b32 v1, s39, 11 +; GCN-NEXT: v_writelane_b32 v1, s40, 12 +; GCN-NEXT: v_writelane_b32 v1, s41, 13 +; GCN-NEXT: v_writelane_b32 v1, s42, 14 +; GCN-NEXT: v_writelane_b32 v1, s43, 15 +; GCN-NEXT: v_writelane_b32 v1, s44, 16 +; GCN-NEXT: v_writelane_b32 v1, s45, 17 +; GCN-NEXT: v_writelane_b32 v1, s46, 18 +; GCN-NEXT: v_writelane_b32 v1, s47, 19 +; GCN-NEXT: v_writelane_b32 v1, s48, 20 +; GCN-NEXT: v_writelane_b32 v1, s49, 21 +; GCN-NEXT: v_writelane_b32 v1, s50, 22 +; GCN-NEXT: v_writelane_b32 v1, s51, 23 +; GCN-NEXT: v_writelane_b32 v1, s52, 24 +; GCN-NEXT: v_writelane_b32 v1, s53, 25 +; GCN-NEXT: v_writelane_b32 v1, s54, 26 +; GCN-NEXT: v_writelane_b32 v1, s55, 27 +; GCN-NEXT: v_writelane_b32 v1, s56, 28 +; GCN-NEXT: v_writelane_b32 v1, s57, 29 +; GCN-NEXT: v_writelane_b32 v1, s58, 30 +; GCN-NEXT: v_writelane_b32 v1, s59, 31 +; GCN-NEXT: v_writelane_b32 v1, s60, 32 +; GCN-NEXT: v_writelane_b32 v1, s61, 33 +; GCN-NEXT: v_writelane_b32 v1, s62, 34 +; GCN-NEXT: v_writelane_b32 v1, s63, 35 +; 
GCN-NEXT: v_writelane_b32 v1, s64, 36 +; GCN-NEXT: v_writelane_b32 v1, s65, 37 +; GCN-NEXT: v_writelane_b32 v1, s66, 38 +; GCN-NEXT: v_writelane_b32 v1, s67, 39 +; GCN-NEXT: v_writelane_b32 v1, s68, 40 +; GCN-NEXT: v_writelane_b32 v1, s69, 41 +; GCN-NEXT: v_writelane_b32 v1, s70, 42 +; GCN-NEXT: v_writelane_b32 v1, s71, 43 +; GCN-NEXT: v_writelane_b32 v1, s72, 44 +; GCN-NEXT: v_writelane_b32 v1, s73, 45 +; GCN-NEXT: v_writelane_b32 v1, s74, 46 +; GCN-NEXT: v_writelane_b32 v1, s75, 47 +; GCN-NEXT: v_writelane_b32 v1, s76, 48 +; GCN-NEXT: v_writelane_b32 v1, s77, 49 +; GCN-NEXT: v_writelane_b32 v1, s78, 50 +; GCN-NEXT: v_writelane_b32 v1, s79, 51 +; GCN-NEXT: v_writelane_b32 v1, s80, 52 +; GCN-NEXT: v_writelane_b32 v1, s81, 53 +; GCN-NEXT: v_writelane_b32 v1, s82, 54 +; GCN-NEXT: v_writelane_b32 v1, s83, 55 +; GCN-NEXT: v_writelane_b32 v1, s84, 56 +; GCN-NEXT: v_writelane_b32 v1, s85, 57 +; GCN-NEXT: v_writelane_b32 v1, s86, 58 +; GCN-NEXT: v_writelane_b32 v1, s87, 59 +; GCN-NEXT: v_writelane_b32 v1, s88, 60 +; GCN-NEXT: v_writelane_b32 v1, s89, 61 +; GCN-NEXT: v_writelane_b32 v1, s90, 62 +; GCN-NEXT: v_writelane_b32 v1, s91, 63 +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: s_cbranch_scc1 BB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s0, v0, 1 -; GCN-NEXT: v_readlane_b32 s1, v0, 2 -; GCN-NEXT: v_readlane_b32 s2, v0, 3 -; GCN-NEXT: v_readlane_b32 s3, v0, 4 -; GCN-NEXT: v_readlane_b32 s4, v0, 5 -; GCN-NEXT: v_readlane_b32 s5, v0, 6 -; GCN-NEXT: v_readlane_b32 s6, v0, 7 -; GCN-NEXT: v_readlane_b32 s7, v0, 8 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: 
v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 57 -; GCN-NEXT: v_readlane_b32 s1, v0, 58 -; GCN-NEXT: v_readlane_b32 s2, v0, 59 -; GCN-NEXT: v_readlane_b32 s3, v0, 60 -; GCN-NEXT: v_readlane_b32 s4, v0, 61 -; GCN-NEXT: v_readlane_b32 s5, v0, 62 -; GCN-NEXT: v_readlane_b32 s6, v0, 63 -; GCN-NEXT: v_readlane_b32 s7, v1, 0 +; GCN-NEXT: v_readlane_b32 s0, v0, 56 +; GCN-NEXT: v_readlane_b32 s1, v0, 57 +; GCN-NEXT: v_readlane_b32 s2, v0, 58 +; GCN-NEXT: v_readlane_b32 s3, v0, 59 +; GCN-NEXT: v_readlane_b32 s4, v0, 60 +; GCN-NEXT: v_readlane_b32 s5, v0, 61 +; GCN-NEXT: v_readlane_b32 s6, v0, 62 +; GCN-NEXT: v_readlane_b32 s7, v0, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 1 -; GCN-NEXT: v_readlane_b32 s1, v1, 2 -; GCN-NEXT: v_readlane_b32 s2, v1, 3 -; GCN-NEXT: v_readlane_b32 s3, v1, 4 -; GCN-NEXT: v_readlane_b32 s4, v1, 5 -; GCN-NEXT: v_readlane_b32 s5, v1, 6 -; GCN-NEXT: v_readlane_b32 s6, v1, 7 -; GCN-NEXT: v_readlane_b32 s7, v1, 8 +; GCN-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s2, v1, 2 +; GCN-NEXT: v_readlane_b32 s3, v1, 3 +; GCN-NEXT: v_readlane_b32 s4, v1, 4 +; GCN-NEXT: v_readlane_b32 s5, v1, 5 +; GCN-NEXT: v_readlane_b32 s6, v1, 6 +; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 9 -; GCN-NEXT: v_readlane_b32 s1, v1, 10 -; GCN-NEXT: v_readlane_b32 s2, v1, 11 -; GCN-NEXT: v_readlane_b32 s3, v1, 12 -; GCN-NEXT: v_readlane_b32 s4, v1, 13 -; GCN-NEXT: v_readlane_b32 s5, v1, 14 -; GCN-NEXT: v_readlane_b32 s6, v1, 15 -; GCN-NEXT: v_readlane_b32 s7, v1, 16 +; GCN-NEXT: v_readlane_b32 s0, v1, 8 +; GCN-NEXT: v_readlane_b32 s1, v1, 9 +; GCN-NEXT: v_readlane_b32 
s2, v1, 10 +; GCN-NEXT: v_readlane_b32 s3, v1, 11 +; GCN-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-NEXT: v_readlane_b32 s6, v1, 14 +; GCN-NEXT: v_readlane_b32 s7, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 17 -; GCN-NEXT: v_readlane_b32 s1, v1, 18 -; GCN-NEXT: v_readlane_b32 s2, v1, 19 -; GCN-NEXT: v_readlane_b32 s3, v1, 20 -; GCN-NEXT: v_readlane_b32 s4, v1, 21 -; GCN-NEXT: v_readlane_b32 s5, v1, 22 -; GCN-NEXT: v_readlane_b32 s6, v1, 23 -; GCN-NEXT: v_readlane_b32 s7, v1, 24 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 25 -; GCN-NEXT: v_readlane_b32 s1, v1, 26 -; GCN-NEXT: v_readlane_b32 s2, v1, 27 -; GCN-NEXT: v_readlane_b32 s3, v1, 28 -; GCN-NEXT: v_readlane_b32 s4, v1, 29 -; GCN-NEXT: v_readlane_b32 s5, v1, 30 -; GCN-NEXT: v_readlane_b32 s6, v1, 31 -; GCN-NEXT: v_readlane_b32 s7, v1, 32 +; GCN-NEXT: v_readlane_b32 s0, v1, 24 +; GCN-NEXT: v_readlane_b32 s1, v1, 25 +; GCN-NEXT: v_readlane_b32 s2, v1, 26 +; GCN-NEXT: v_readlane_b32 s3, v1, 27 +; GCN-NEXT: v_readlane_b32 s4, v1, 28 +; GCN-NEXT: v_readlane_b32 s5, v1, 29 +; GCN-NEXT: v_readlane_b32 s6, v1, 30 +; GCN-NEXT: v_readlane_b32 s7, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 33 -; GCN-NEXT: v_readlane_b32 s1, v1, 34 -; GCN-NEXT: v_readlane_b32 s2, v1, 35 -; GCN-NEXT: v_readlane_b32 s3, v1, 36 -; GCN-NEXT: v_readlane_b32 s4, v1, 37 -; GCN-NEXT: v_readlane_b32 s5, v1, 38 -; GCN-NEXT: v_readlane_b32 s6, v1, 39 -; GCN-NEXT: v_readlane_b32 s7, v1, 40 +; GCN-NEXT: 
v_readlane_b32 s0, v1, 32 +; GCN-NEXT: v_readlane_b32 s1, v1, 33 +; GCN-NEXT: v_readlane_b32 s2, v1, 34 +; GCN-NEXT: v_readlane_b32 s3, v1, 35 +; GCN-NEXT: v_readlane_b32 s4, v1, 36 +; GCN-NEXT: v_readlane_b32 s5, v1, 37 +; GCN-NEXT: v_readlane_b32 s6, v1, 38 +; GCN-NEXT: v_readlane_b32 s7, v1, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 41 -; GCN-NEXT: v_readlane_b32 s1, v1, 42 -; GCN-NEXT: v_readlane_b32 s2, v1, 43 -; GCN-NEXT: v_readlane_b32 s3, v1, 44 -; GCN-NEXT: v_readlane_b32 s4, v1, 45 -; GCN-NEXT: v_readlane_b32 s5, v1, 46 -; GCN-NEXT: v_readlane_b32 s6, v1, 47 -; GCN-NEXT: v_readlane_b32 s7, v1, 48 +; GCN-NEXT: v_readlane_b32 s0, v1, 40 +; GCN-NEXT: v_readlane_b32 s1, v1, 41 +; GCN-NEXT: v_readlane_b32 s2, v1, 42 +; GCN-NEXT: v_readlane_b32 s3, v1, 43 +; GCN-NEXT: v_readlane_b32 s4, v1, 44 +; GCN-NEXT: v_readlane_b32 s5, v1, 45 +; GCN-NEXT: v_readlane_b32 s6, v1, 46 +; GCN-NEXT: v_readlane_b32 s7, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 49 -; GCN-NEXT: v_readlane_b32 s1, v1, 50 -; GCN-NEXT: v_readlane_b32 s2, v1, 51 -; GCN-NEXT: v_readlane_b32 s3, v1, 52 -; GCN-NEXT: v_readlane_b32 s4, v1, 53 -; GCN-NEXT: v_readlane_b32 s5, v1, 54 -; GCN-NEXT: v_readlane_b32 s6, v1, 55 -; GCN-NEXT: v_readlane_b32 s7, v1, 56 +; GCN-NEXT: v_readlane_b32 s0, v1, 48 +; GCN-NEXT: v_readlane_b32 s1, v1, 49 +; GCN-NEXT: v_readlane_b32 s2, v1, 50 +; GCN-NEXT: v_readlane_b32 s3, v1, 51 +; GCN-NEXT: v_readlane_b32 s4, v1, 52 +; GCN-NEXT: v_readlane_b32 s5, v1, 53 +; GCN-NEXT: v_readlane_b32 s6, v1, 54 +; GCN-NEXT: v_readlane_b32 s7, v1, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 57 -; GCN-NEXT: v_readlane_b32 s1, v1, 58 -; GCN-NEXT: v_readlane_b32 s2, v1, 59 -; GCN-NEXT: v_readlane_b32 s3, v1, 60 -; GCN-NEXT: v_readlane_b32 s4, v1, 61 -; GCN-NEXT: v_readlane_b32 s5, v1, 62 -; 
GCN-NEXT: v_readlane_b32 s6, v1, 63 -; GCN-NEXT: v_readlane_b32 s7, v2, 0 +; GCN-NEXT: v_readlane_b32 s0, v1, 56 +; GCN-NEXT: v_readlane_b32 s1, v1, 57 +; GCN-NEXT: v_readlane_b32 s2, v1, 58 +; GCN-NEXT: v_readlane_b32 s3, v1, 59 +; GCN-NEXT: v_readlane_b32 s4, v1, 60 +; GCN-NEXT: v_readlane_b32 s5, v1, 61 +; GCN-NEXT: v_readlane_b32 s6, v1, 62 +; GCN-NEXT: v_readlane_b32 s7, v1, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 9 -; GCN-NEXT: v_readlane_b32 s1, v0, 10 -; GCN-NEXT: v_readlane_b32 s2, v0, 11 -; GCN-NEXT: v_readlane_b32 s3, v0, 12 -; GCN-NEXT: v_readlane_b32 s4, v0, 13 -; GCN-NEXT: v_readlane_b32 s5, v0, 14 -; GCN-NEXT: v_readlane_b32 s6, v0, 15 -; GCN-NEXT: v_readlane_b32 s7, v0, 16 +; GCN-NEXT: v_readlane_b32 s0, v0, 8 +; GCN-NEXT: v_readlane_b32 s1, v0, 9 +; GCN-NEXT: v_readlane_b32 s2, v0, 10 +; GCN-NEXT: v_readlane_b32 s3, v0, 11 +; GCN-NEXT: v_readlane_b32 s4, v0, 12 +; GCN-NEXT: v_readlane_b32 s5, v0, 13 +; GCN-NEXT: v_readlane_b32 s6, v0, 14 +; GCN-NEXT: v_readlane_b32 s7, v0, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 17 -; GCN-NEXT: v_readlane_b32 s1, v0, 18 -; GCN-NEXT: v_readlane_b32 s2, v0, 19 -; GCN-NEXT: v_readlane_b32 s3, v0, 20 -; GCN-NEXT: v_readlane_b32 s4, v0, 21 -; GCN-NEXT: v_readlane_b32 s5, v0, 22 -; GCN-NEXT: v_readlane_b32 s6, v0, 23 -; GCN-NEXT: v_readlane_b32 s7, v0, 24 +; GCN-NEXT: v_readlane_b32 s0, v0, 16 +; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s2, v0, 18 +; GCN-NEXT: v_readlane_b32 s3, v0, 19 +; GCN-NEXT: v_readlane_b32 s4, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 21 +; GCN-NEXT: v_readlane_b32 s6, v0, 22 +; GCN-NEXT: v_readlane_b32 s7, v0, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 25 -; GCN-NEXT: v_readlane_b32 s1, v0, 26 -; GCN-NEXT: v_readlane_b32 s2, v0, 27 -; GCN-NEXT: v_readlane_b32 s3, v0, 28 
-; GCN-NEXT: v_readlane_b32 s4, v0, 29 -; GCN-NEXT: v_readlane_b32 s5, v0, 30 -; GCN-NEXT: v_readlane_b32 s6, v0, 31 -; GCN-NEXT: v_readlane_b32 s7, v0, 32 +; GCN-NEXT: v_readlane_b32 s0, v0, 24 +; GCN-NEXT: v_readlane_b32 s1, v0, 25 +; GCN-NEXT: v_readlane_b32 s2, v0, 26 +; GCN-NEXT: v_readlane_b32 s3, v0, 27 +; GCN-NEXT: v_readlane_b32 s4, v0, 28 +; GCN-NEXT: v_readlane_b32 s5, v0, 29 +; GCN-NEXT: v_readlane_b32 s6, v0, 30 +; GCN-NEXT: v_readlane_b32 s7, v0, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 33 -; GCN-NEXT: v_readlane_b32 s1, v0, 34 -; GCN-NEXT: v_readlane_b32 s2, v0, 35 -; GCN-NEXT: v_readlane_b32 s3, v0, 36 -; GCN-NEXT: v_readlane_b32 s4, v0, 37 -; GCN-NEXT: v_readlane_b32 s5, v0, 38 -; GCN-NEXT: v_readlane_b32 s6, v0, 39 -; GCN-NEXT: v_readlane_b32 s7, v0, 40 +; GCN-NEXT: v_readlane_b32 s0, v0, 32 +; GCN-NEXT: v_readlane_b32 s1, v0, 33 +; GCN-NEXT: v_readlane_b32 s2, v0, 34 +; GCN-NEXT: v_readlane_b32 s3, v0, 35 +; GCN-NEXT: v_readlane_b32 s4, v0, 36 +; GCN-NEXT: v_readlane_b32 s5, v0, 37 +; GCN-NEXT: v_readlane_b32 s6, v0, 38 +; GCN-NEXT: v_readlane_b32 s7, v0, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 41 -; GCN-NEXT: v_readlane_b32 s1, v0, 42 -; GCN-NEXT: v_readlane_b32 s2, v0, 43 -; GCN-NEXT: v_readlane_b32 s3, v0, 44 -; GCN-NEXT: v_readlane_b32 s4, v0, 45 -; GCN-NEXT: v_readlane_b32 s5, v0, 46 -; GCN-NEXT: v_readlane_b32 s6, v0, 47 -; GCN-NEXT: v_readlane_b32 s7, v0, 48 +; GCN-NEXT: v_readlane_b32 s0, v0, 40 +; GCN-NEXT: v_readlane_b32 s1, v0, 41 +; GCN-NEXT: v_readlane_b32 s2, v0, 42 +; GCN-NEXT: v_readlane_b32 s3, v0, 43 +; GCN-NEXT: v_readlane_b32 s4, v0, 44 +; GCN-NEXT: v_readlane_b32 s5, v0, 45 +; GCN-NEXT: v_readlane_b32 s6, v0, 46 +; GCN-NEXT: v_readlane_b32 s7, v0, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 49 -; GCN-NEXT: v_readlane_b32 s1, 
v0, 50 -; GCN-NEXT: v_readlane_b32 s2, v0, 51 -; GCN-NEXT: v_readlane_b32 s3, v0, 52 -; GCN-NEXT: v_readlane_b32 s4, v0, 53 -; GCN-NEXT: v_readlane_b32 s5, v0, 54 -; GCN-NEXT: v_readlane_b32 s6, v0, 55 -; GCN-NEXT: v_readlane_b32 s7, v0, 56 +; GCN-NEXT: v_readlane_b32 s0, v0, 48 +; GCN-NEXT: v_readlane_b32 s1, v0, 49 +; GCN-NEXT: v_readlane_b32 s2, v0, 50 +; GCN-NEXT: v_readlane_b32 s3, v0, 51 +; GCN-NEXT: v_readlane_b32 s4, v0, 52 +; GCN-NEXT: v_readlane_b32 s5, v0, 53 +; GCN-NEXT: v_readlane_b32 s6, v0, 54 +; GCN-NEXT: v_readlane_b32 s7, v0, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v2, 1 -; GCN-NEXT: v_readlane_b32 s1, v2, 2 -; GCN-NEXT: v_readlane_b32 s2, v2, 3 -; GCN-NEXT: v_readlane_b32 s3, v2, 4 -; GCN-NEXT: v_readlane_b32 s4, v2, 5 -; GCN-NEXT: v_readlane_b32 s5, v2, 6 -; GCN-NEXT: v_readlane_b32 s6, v2, 7 -; GCN-NEXT: v_readlane_b32 s7, v2, 8 +; GCN-NEXT: v_readlane_b32 s0, v2, 0 +; GCN-NEXT: v_readlane_b32 s1, v2, 1 +; GCN-NEXT: v_readlane_b32 s2, v2, 2 +; GCN-NEXT: v_readlane_b32 s3, v2, 3 +; GCN-NEXT: v_readlane_b32 s4, v2, 4 +; GCN-NEXT: v_readlane_b32 s5, v2, 5 +; GCN-NEXT: v_readlane_b32 s6, v2, 6 +; GCN-NEXT: v_readlane_b32 s7, v2, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND @@ -444,195 +442,193 @@ ret: define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[36:51] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: 
v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[20:27] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[0:1] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-NEXT: v_writelane_b32 v0, s4, 1 -; GCN-NEXT: v_writelane_b32 v0, s5, 2 -; GCN-NEXT: v_writelane_b32 v0, s6, 3 -; GCN-NEXT: v_writelane_b32 v0, s7, 4 -; GCN-NEXT: v_writelane_b32 v0, s8, 5 -; GCN-NEXT: v_writelane_b32 v0, s9, 6 -; GCN-NEXT: v_writelane_b32 v0, s10, 7 -; GCN-NEXT: v_writelane_b32 v0, s11, 8 -; GCN-NEXT: v_writelane_b32 v0, s12, 9 -; GCN-NEXT: v_writelane_b32 v0, s13, 10 -; GCN-NEXT: v_writelane_b32 v0, s14, 11 -; GCN-NEXT: v_writelane_b32 v0, s15, 12 -; GCN-NEXT: v_writelane_b32 v0, s16, 13 -; GCN-NEXT: v_writelane_b32 v0, s17, 14 -; GCN-NEXT: v_writelane_b32 v0, 
s18, 15 -; GCN-NEXT: v_writelane_b32 v0, s19, 16 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:15] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[16:31] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s0, 17 -; GCN-NEXT: v_writelane_b32 v0, s1, 18 -; GCN-NEXT: v_writelane_b32 v0, s2, 19 -; GCN-NEXT: v_writelane_b32 v0, s3, 20 -; GCN-NEXT: v_writelane_b32 v0, s4, 21 -; GCN-NEXT: v_writelane_b32 v0, s5, 22 -; GCN-NEXT: v_writelane_b32 v0, s6, 23 -; GCN-NEXT: v_writelane_b32 v0, s7, 24 -; GCN-NEXT: v_writelane_b32 v0, s8, 25 -; GCN-NEXT: v_writelane_b32 v0, s9, 26 -; GCN-NEXT: v_writelane_b32 v0, s10, 27 -; GCN-NEXT: v_writelane_b32 v0, s11, 28 -; GCN-NEXT: v_writelane_b32 v0, s12, 29 -; GCN-NEXT: v_writelane_b32 v0, s13, 30 -; GCN-NEXT: v_writelane_b32 v0, s14, 31 -; GCN-NEXT: v_writelane_b32 v0, s15, 32 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[8:9] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b32 s10, 0 -; GCN-NEXT: v_readlane_b32 s11, v0, 0 -; GCN-NEXT: s_cmp_lg_u32 s11, s10 -; GCN-NEXT: v_writelane_b32 v0, s36, 33 -; GCN-NEXT: v_writelane_b32 v0, s37, 34 -; GCN-NEXT: v_writelane_b32 v0, s38, 35 -; GCN-NEXT: v_writelane_b32 v0, s39, 36 -; GCN-NEXT: v_writelane_b32 v0, s40, 37 -; GCN-NEXT: v_writelane_b32 v0, s41, 38 -; GCN-NEXT: v_writelane_b32 v0, s42, 39 -; GCN-NEXT: v_writelane_b32 v0, s43, 40 -; GCN-NEXT: v_writelane_b32 v0, s44, 41 -; GCN-NEXT: v_writelane_b32 v0, s45, 42 -; GCN-NEXT: v_writelane_b32 v0, s46, 43 -; GCN-NEXT: v_writelane_b32 v0, s47, 44 -; GCN-NEXT: v_writelane_b32 v0, s48, 45 -; GCN-NEXT: v_writelane_b32 v0, s49, 46 -; GCN-NEXT: v_writelane_b32 v0, s50, 47 -; GCN-NEXT: v_writelane_b32 v0, s51, 48 -; GCN-NEXT: v_writelane_b32 v0, s16, 49 -; GCN-NEXT: v_writelane_b32 v0, s17, 50 -; GCN-NEXT: v_writelane_b32 v0, s18, 51 -; GCN-NEXT: v_writelane_b32 v0, s19, 52 -; GCN-NEXT: v_writelane_b32 v0, s20, 53 -; GCN-NEXT: v_writelane_b32 v0, s21, 54 
-; GCN-NEXT: v_writelane_b32 v0, s22, 55 -; GCN-NEXT: v_writelane_b32 v0, s23, 56 -; GCN-NEXT: v_writelane_b32 v0, s24, 57 -; GCN-NEXT: v_writelane_b32 v0, s25, 58 -; GCN-NEXT: v_writelane_b32 v0, s26, 59 -; GCN-NEXT: v_writelane_b32 v0, s27, 60 -; GCN-NEXT: v_writelane_b32 v0, s28, 61 -; GCN-NEXT: v_writelane_b32 v0, s29, 62 -; GCN-NEXT: v_writelane_b32 v0, s30, 63 -; GCN-NEXT: v_writelane_b32 v1, s31, 0 -; GCN-NEXT: v_writelane_b32 v1, s0, 1 -; GCN-NEXT: v_writelane_b32 v1, s1, 2 -; GCN-NEXT: v_writelane_b32 v1, s2, 3 -; GCN-NEXT: v_writelane_b32 v1, s3, 4 -; GCN-NEXT: v_writelane_b32 v1, s4, 5 -; GCN-NEXT: v_writelane_b32 v1, s5, 6 -; GCN-NEXT: v_writelane_b32 v1, s6, 7 -; GCN-NEXT: v_writelane_b32 v1, s7, 8 -; GCN-NEXT: v_writelane_b32 v1, s8, 9 -; GCN-NEXT: v_writelane_b32 v1, s9, 10 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: v_writelane_b32 v0, s36, 32 +; GCN-NEXT: v_writelane_b32 v0, s37, 33 +; GCN-NEXT: v_writelane_b32 v0, s38, 34 +; GCN-NEXT: v_writelane_b32 v0, s39, 35 +; GCN-NEXT: v_writelane_b32 v0, s40, 36 +; GCN-NEXT: v_writelane_b32 v0, s41, 37 +; GCN-NEXT: v_writelane_b32 v0, s42, 38 +; GCN-NEXT: v_writelane_b32 v0, s43, 39 +; GCN-NEXT: v_writelane_b32 v0, s44, 40 +; GCN-NEXT: v_writelane_b32 v0, s45, 41 +; GCN-NEXT: v_writelane_b32 v0, s46, 42 +; GCN-NEXT: v_writelane_b32 v0, s47, 43 +; GCN-NEXT: v_writelane_b32 v0, s48, 44 +; GCN-NEXT: v_writelane_b32 v0, s49, 45 +; GCN-NEXT: v_writelane_b32 v0, s50, 46 +; GCN-NEXT: v_writelane_b32 v0, s51, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: 
v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s20, 0 +; GCN-NEXT: v_writelane_b32 v1, s21, 1 +; GCN-NEXT: v_writelane_b32 v1, s22, 2 +; GCN-NEXT: v_writelane_b32 v1, s23, 3 +; GCN-NEXT: v_writelane_b32 v1, s24, 4 +; GCN-NEXT: v_writelane_b32 v1, s25, 5 +; GCN-NEXT: v_writelane_b32 v1, s26, 6 +; GCN-NEXT: v_writelane_b32 v1, s27, 7 +; GCN-NEXT: v_writelane_b32 v1, s0, 8 +; GCN-NEXT: v_writelane_b32 v1, s1, 9 ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s0, v0, 1 -; GCN-NEXT: v_readlane_b32 s1, v0, 2 -; GCN-NEXT: v_readlane_b32 s2, v0, 3 -; GCN-NEXT: v_readlane_b32 s3, v0, 4 -; GCN-NEXT: v_readlane_b32 s4, v0, 5 -; GCN-NEXT: v_readlane_b32 s5, v0, 6 -; GCN-NEXT: v_readlane_b32 s6, v0, 7 -; GCN-NEXT: v_readlane_b32 s7, v0, 8 -; GCN-NEXT: v_readlane_b32 s8, v0, 9 -; GCN-NEXT: v_readlane_b32 s9, v0, 10 -; GCN-NEXT: v_readlane_b32 s10, v0, 11 -; GCN-NEXT: v_readlane_b32 s11, v0, 12 -; GCN-NEXT: v_readlane_b32 s12, v0, 13 -; GCN-NEXT: v_readlane_b32 s13, v0, 14 -; GCN-NEXT: v_readlane_b32 s14, v0, 15 -; GCN-NEXT: v_readlane_b32 s15, v0, 16 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: v_readlane_b32 s8, v0, 8 +; GCN-NEXT: v_readlane_b32 s9, v0, 9 +; GCN-NEXT: v_readlane_b32 s10, v0, 10 +; GCN-NEXT: v_readlane_b32 s11, v0, 11 +; GCN-NEXT: v_readlane_b32 s12, v0, 12 +; GCN-NEXT: v_readlane_b32 s13, v0, 13 +; GCN-NEXT: v_readlane_b32 s14, v0, 14 +; GCN-NEXT: v_readlane_b32 s15, v0, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 33 -; GCN-NEXT: v_readlane_b32 s1, v0, 
34 -; GCN-NEXT: v_readlane_b32 s2, v0, 35 -; GCN-NEXT: v_readlane_b32 s3, v0, 36 -; GCN-NEXT: v_readlane_b32 s4, v0, 37 -; GCN-NEXT: v_readlane_b32 s5, v0, 38 -; GCN-NEXT: v_readlane_b32 s6, v0, 39 -; GCN-NEXT: v_readlane_b32 s7, v0, 40 -; GCN-NEXT: v_readlane_b32 s8, v0, 41 -; GCN-NEXT: v_readlane_b32 s9, v0, 42 -; GCN-NEXT: v_readlane_b32 s10, v0, 43 -; GCN-NEXT: v_readlane_b32 s11, v0, 44 -; GCN-NEXT: v_readlane_b32 s12, v0, 45 -; GCN-NEXT: v_readlane_b32 s13, v0, 46 -; GCN-NEXT: v_readlane_b32 s14, v0, 47 -; GCN-NEXT: v_readlane_b32 s15, v0, 48 +; GCN-NEXT: v_readlane_b32 s0, v0, 32 +; GCN-NEXT: v_readlane_b32 s1, v0, 33 +; GCN-NEXT: v_readlane_b32 s2, v0, 34 +; GCN-NEXT: v_readlane_b32 s3, v0, 35 +; GCN-NEXT: v_readlane_b32 s4, v0, 36 +; GCN-NEXT: v_readlane_b32 s5, v0, 37 +; GCN-NEXT: v_readlane_b32 s6, v0, 38 +; GCN-NEXT: v_readlane_b32 s7, v0, 39 +; GCN-NEXT: v_readlane_b32 s8, v0, 40 +; GCN-NEXT: v_readlane_b32 s9, v0, 41 +; GCN-NEXT: v_readlane_b32 s10, v0, 42 +; GCN-NEXT: v_readlane_b32 s11, v0, 43 +; GCN-NEXT: v_readlane_b32 s12, v0, 44 +; GCN-NEXT: v_readlane_b32 s13, v0, 45 +; GCN-NEXT: v_readlane_b32 s14, v0, 46 +; GCN-NEXT: v_readlane_b32 s15, v0, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 17 -; GCN-NEXT: v_readlane_b32 s1, v0, 18 -; GCN-NEXT: v_readlane_b32 s2, v0, 19 -; GCN-NEXT: v_readlane_b32 s3, v0, 20 -; GCN-NEXT: v_readlane_b32 s4, v0, 21 -; GCN-NEXT: v_readlane_b32 s5, v0, 22 -; GCN-NEXT: v_readlane_b32 s6, v0, 23 -; GCN-NEXT: v_readlane_b32 s7, v0, 24 -; GCN-NEXT: v_readlane_b32 s8, v0, 25 -; GCN-NEXT: v_readlane_b32 s9, v0, 26 -; GCN-NEXT: v_readlane_b32 s10, v0, 27 -; GCN-NEXT: v_readlane_b32 s11, v0, 28 -; GCN-NEXT: v_readlane_b32 s12, v0, 29 -; GCN-NEXT: v_readlane_b32 s13, v0, 30 -; GCN-NEXT: v_readlane_b32 s14, v0, 31 -; GCN-NEXT: v_readlane_b32 s15, v0, 32 +; GCN-NEXT: v_readlane_b32 s0, v0, 16 +; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s2, 
v0, 18 +; GCN-NEXT: v_readlane_b32 s3, v0, 19 +; GCN-NEXT: v_readlane_b32 s4, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 21 +; GCN-NEXT: v_readlane_b32 s6, v0, 22 +; GCN-NEXT: v_readlane_b32 s7, v0, 23 +; GCN-NEXT: v_readlane_b32 s8, v0, 24 +; GCN-NEXT: v_readlane_b32 s9, v0, 25 +; GCN-NEXT: v_readlane_b32 s10, v0, 26 +; GCN-NEXT: v_readlane_b32 s11, v0, 27 +; GCN-NEXT: v_readlane_b32 s12, v0, 28 +; GCN-NEXT: v_readlane_b32 s13, v0, 29 +; GCN-NEXT: v_readlane_b32 s14, v0, 30 +; GCN-NEXT: v_readlane_b32 s15, v0, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 1 -; GCN-NEXT: v_readlane_b32 s1, v1, 2 -; GCN-NEXT: v_readlane_b32 s2, v1, 3 -; GCN-NEXT: v_readlane_b32 s3, v1, 4 -; GCN-NEXT: v_readlane_b32 s4, v1, 5 -; GCN-NEXT: v_readlane_b32 s5, v1, 6 -; GCN-NEXT: v_readlane_b32 s6, v1, 7 -; GCN-NEXT: v_readlane_b32 s7, v1, 8 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s[0:7] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 9 -; GCN-NEXT: v_readlane_b32 s1, v1, 10 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s[0:1] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 49 -; GCN-NEXT: v_readlane_b32 s1, v0, 50 -; GCN-NEXT: v_readlane_b32 s2, v0, 51 -; GCN-NEXT: v_readlane_b32 s3, v0, 52 -; GCN-NEXT: v_readlane_b32 s4, v0, 53 -; GCN-NEXT: v_readlane_b32 s5, v0, 54 -; GCN-NEXT: v_readlane_b32 s6, v0, 55 -; GCN-NEXT: v_readlane_b32 s7, v0, 56 -; GCN-NEXT: v_readlane_b32 s8, v0, 57 -; GCN-NEXT: v_readlane_b32 s9, v0, 58 -; GCN-NEXT: v_readlane_b32 s10, v0, 59 -; GCN-NEXT: v_readlane_b32 s11, v0, 60 -; GCN-NEXT: v_readlane_b32 s12, v0, 61 -; GCN-NEXT: v_readlane_b32 s13, v0, 62 -; GCN-NEXT: v_readlane_b32 s14, v0, 63 -; GCN-NEXT: v_readlane_b32 s15, v1, 0 +; GCN-NEXT: v_readlane_b32 s16, v1, 0 +; GCN-NEXT: v_readlane_b32 s17, v1, 1 +; GCN-NEXT: v_readlane_b32 s18, v1, 2 +; GCN-NEXT: v_readlane_b32 s19, v1, 3 +; GCN-NEXT: v_readlane_b32 s20, v1, 4 +; GCN-NEXT: v_readlane_b32 s21, v1, 5 +; GCN-NEXT: 
v_readlane_b32 s22, v1, 6 +; GCN-NEXT: v_readlane_b32 s23, v1, 7 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[16:23] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s24, v1, 8 +; GCN-NEXT: v_readlane_b32 s25, v1, 9 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s[24:25] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_readlane_b32 s0, v0, 48 +; GCN-NEXT: v_readlane_b32 s1, v0, 49 +; GCN-NEXT: v_readlane_b32 s2, v0, 50 +; GCN-NEXT: v_readlane_b32 s3, v0, 51 +; GCN-NEXT: v_readlane_b32 s4, v0, 52 +; GCN-NEXT: v_readlane_b32 s5, v0, 53 +; GCN-NEXT: v_readlane_b32 s6, v0, 54 +; GCN-NEXT: v_readlane_b32 s7, v0, 55 +; GCN-NEXT: v_readlane_b32 s8, v0, 56 +; GCN-NEXT: v_readlane_b32 s9, v0, 57 +; GCN-NEXT: v_readlane_b32 s10, v0, 58 +; GCN-NEXT: v_readlane_b32 s11, v0, 59 +; GCN-NEXT: v_readlane_b32 s12, v0, 60 +; GCN-NEXT: v_readlane_b32 s13, v0, 61 +; GCN-NEXT: v_readlane_b32 s14, v0, 62 +; GCN-NEXT: v_readlane_b32 s15, v0, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND @@ -667,13 +663,13 @@ ret: define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s56, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s57, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s58, -1 -; GCN-NEXT: s_mov_b32 s59, 0xe8f000 -; GCN-NEXT: s_add_u32 s56, s56, s3 -; GCN-NEXT: s_addc_u32 s57, s57, 0 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s22, -1 +; GCN-NEXT: s_mov_b32 s23, 0xe8f000 +; GCN-NEXT: s_add_u32 s20, s20, s3 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -692,179 +688,177 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[36:51] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: 
v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[4:19] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s[0:1] +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_writelane_b32 v31, s0, 0 -; GCN-NEXT: v_writelane_b32 v31, s4, 1 -; GCN-NEXT: v_writelane_b32 v31, s5, 2 -; GCN-NEXT: v_writelane_b32 v31, s6, 3 -; GCN-NEXT: v_writelane_b32 v31, s7, 4 -; GCN-NEXT: v_writelane_b32 v31, s8, 5 -; GCN-NEXT: v_writelane_b32 v31, s9, 6 -; GCN-NEXT: v_writelane_b32 v31, s10, 7 -; GCN-NEXT: v_writelane_b32 v31, s11, 8 -; GCN-NEXT: v_writelane_b32 v31, s12, 9 
-; GCN-NEXT: v_writelane_b32 v31, s13, 10 -; GCN-NEXT: v_writelane_b32 v31, s14, 11 -; GCN-NEXT: v_writelane_b32 v31, s15, 12 -; GCN-NEXT: v_writelane_b32 v31, s16, 13 -; GCN-NEXT: v_writelane_b32 v31, s17, 14 -; GCN-NEXT: v_writelane_b32 v31, s18, 15 -; GCN-NEXT: v_writelane_b32 v31, s19, 16 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[0:15] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[16:31] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s[34:35] -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: v_readlane_b32 s52, v31, 0 -; GCN-NEXT: s_cmp_lg_u32 s52, s33 -; GCN-NEXT: v_writelane_b32 v31, s36, 17 -; GCN-NEXT: v_writelane_b32 v31, s37, 18 -; GCN-NEXT: v_writelane_b32 v31, s38, 19 -; GCN-NEXT: v_writelane_b32 v31, s39, 20 -; GCN-NEXT: v_writelane_b32 v31, s40, 21 -; GCN-NEXT: v_writelane_b32 v31, s41, 22 -; GCN-NEXT: v_writelane_b32 v31, s42, 23 -; GCN-NEXT: v_writelane_b32 v31, s43, 24 -; GCN-NEXT: v_writelane_b32 v31, s44, 25 -; GCN-NEXT: v_writelane_b32 v31, s45, 26 -; GCN-NEXT: v_writelane_b32 v31, s46, 27 -; GCN-NEXT: v_writelane_b32 v31, s47, 28 -; GCN-NEXT: v_writelane_b32 v31, s48, 29 -; GCN-NEXT: v_writelane_b32 v31, s49, 30 -; GCN-NEXT: v_writelane_b32 v31, s50, 31 -; GCN-NEXT: v_writelane_b32 v31, s51, 32 -; GCN-NEXT: v_writelane_b32 v31, s0, 33 -; GCN-NEXT: v_writelane_b32 v31, s1, 34 -; GCN-NEXT: v_writelane_b32 v31, s2, 35 -; GCN-NEXT: v_writelane_b32 v31, s3, 36 -; GCN-NEXT: v_writelane_b32 v31, s4, 37 -; GCN-NEXT: v_writelane_b32 v31, s5, 38 -; GCN-NEXT: v_writelane_b32 v31, s6, 39 -; GCN-NEXT: v_writelane_b32 v31, s7, 40 -; GCN-NEXT: v_writelane_b32 v31, s8, 41 -; GCN-NEXT: v_writelane_b32 v31, s9, 42 -; GCN-NEXT: v_writelane_b32 v31, s10, 43 -; GCN-NEXT: v_writelane_b32 v31, s11, 44 -; GCN-NEXT: v_writelane_b32 v31, s12, 45 -; GCN-NEXT: v_writelane_b32 v31, s13, 46 -; GCN-NEXT: v_writelane_b32 v31, s14, 47 -; GCN-NEXT: v_writelane_b32 v31, s15, 48 -; GCN-NEXT: buffer_store_dword v0, 
off, s[56:59], 0 -; GCN-NEXT: v_writelane_b32 v0, s16, 0 -; GCN-NEXT: v_writelane_b32 v0, s17, 1 -; GCN-NEXT: v_writelane_b32 v0, s18, 2 -; GCN-NEXT: v_writelane_b32 v0, s19, 3 -; GCN-NEXT: v_writelane_b32 v0, s20, 4 -; GCN-NEXT: v_writelane_b32 v0, s21, 5 -; GCN-NEXT: v_writelane_b32 v0, s22, 6 -; GCN-NEXT: v_writelane_b32 v0, s23, 7 -; GCN-NEXT: v_writelane_b32 v0, s24, 8 -; GCN-NEXT: v_writelane_b32 v0, s25, 9 -; GCN-NEXT: v_writelane_b32 v0, s26, 10 -; GCN-NEXT: v_writelane_b32 v0, s27, 11 -; GCN-NEXT: v_writelane_b32 v0, s28, 12 -; GCN-NEXT: v_writelane_b32 v0, s29, 13 -; GCN-NEXT: v_writelane_b32 v0, s30, 14 -; GCN-NEXT: v_writelane_b32 v0, s31, 15 -; GCN-NEXT: s_mov_b64 s[16:17], exec -; GCN-NEXT: s_mov_b64 exec, 0xffff -; GCN-NEXT: buffer_store_dword v0, off, s[56:59], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v31, s34, 49 -; GCN-NEXT: v_writelane_b32 v31, s35, 50 -; GCN-NEXT: buffer_load_dword v0, off, s[56:59], 0 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: v_writelane_b32 v31, s36, 32 +; GCN-NEXT: v_writelane_b32 v31, s37, 33 +; GCN-NEXT: v_writelane_b32 v31, s38, 34 +; GCN-NEXT: v_writelane_b32 v31, s39, 35 +; GCN-NEXT: v_writelane_b32 v31, s40, 36 +; GCN-NEXT: v_writelane_b32 v31, s41, 37 +; GCN-NEXT: v_writelane_b32 v31, s42, 38 +; GCN-NEXT: v_writelane_b32 v31, s43, 39 +; GCN-NEXT: v_writelane_b32 v31, s44, 40 +; GCN-NEXT: v_writelane_b32 v31, s45, 41 +; GCN-NEXT: v_writelane_b32 v31, s46, 42 +; GCN-NEXT: v_writelane_b32 v31, s47, 43 +; GCN-NEXT: v_writelane_b32 v31, s48, 44 +; GCN-NEXT: v_writelane_b32 v31, s49, 45 +; GCN-NEXT: v_writelane_b32 v31, s50, 46 +; GCN-NEXT: v_writelane_b32 v31, s51, 47 +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; 
GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: buffer_store_dword v0, off, s[20:23], 0 +; GCN-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; GCN-NEXT: s_cbranch_scc1 BB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s0, v31, 1 -; GCN-NEXT: v_readlane_b32 s1, v31, 2 -; GCN-NEXT: v_readlane_b32 s2, v31, 3 -; GCN-NEXT: v_readlane_b32 s3, v31, 4 -; GCN-NEXT: v_readlane_b32 s4, v31, 5 -; GCN-NEXT: v_readlane_b32 s5, v31, 6 -; GCN-NEXT: v_readlane_b32 s6, v31, 7 -; GCN-NEXT: v_readlane_b32 s7, v31, 8 -; GCN-NEXT: v_readlane_b32 s8, v31, 9 -; GCN-NEXT: v_readlane_b32 s9, v31, 10 -; GCN-NEXT: v_readlane_b32 s10, v31, 11 -; GCN-NEXT: v_readlane_b32 s11, v31, 12 -; GCN-NEXT: v_readlane_b32 s12, v31, 13 -; GCN-NEXT: v_readlane_b32 s13, v31, 14 -; GCN-NEXT: v_readlane_b32 s14, v31, 15 -; GCN-NEXT: v_readlane_b32 s15, v31, 16 +; GCN-NEXT: v_readlane_b32 s0, v31, 0 +; GCN-NEXT: v_readlane_b32 s1, v31, 1 +; GCN-NEXT: v_readlane_b32 s2, v31, 2 +; GCN-NEXT: v_readlane_b32 s3, v31, 3 +; GCN-NEXT: v_readlane_b32 s4, v31, 4 +; GCN-NEXT: v_readlane_b32 s5, v31, 5 +; GCN-NEXT: v_readlane_b32 s6, v31, 6 +; GCN-NEXT: v_readlane_b32 s7, v31, 7 +; GCN-NEXT: v_readlane_b32 s8, v31, 8 +; GCN-NEXT: v_readlane_b32 s9, v31, 9 +; GCN-NEXT: v_readlane_b32 s10, v31, 10 +; GCN-NEXT: v_readlane_b32 s11, v31, 11 +; GCN-NEXT: v_readlane_b32 s12, v31, 12 +; GCN-NEXT: v_readlane_b32 s13, v31, 13 +; 
GCN-NEXT: v_readlane_b32 s14, v31, 14 +; GCN-NEXT: v_readlane_b32 s15, v31, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v31, 17 -; GCN-NEXT: v_readlane_b32 s1, v31, 18 -; GCN-NEXT: v_readlane_b32 s2, v31, 19 -; GCN-NEXT: v_readlane_b32 s3, v31, 20 -; GCN-NEXT: v_readlane_b32 s4, v31, 21 -; GCN-NEXT: v_readlane_b32 s5, v31, 22 -; GCN-NEXT: v_readlane_b32 s6, v31, 23 -; GCN-NEXT: v_readlane_b32 s7, v31, 24 -; GCN-NEXT: v_readlane_b32 s8, v31, 25 -; GCN-NEXT: v_readlane_b32 s9, v31, 26 -; GCN-NEXT: v_readlane_b32 s10, v31, 27 -; GCN-NEXT: v_readlane_b32 s11, v31, 28 -; GCN-NEXT: v_readlane_b32 s12, v31, 29 -; GCN-NEXT: v_readlane_b32 s13, v31, 30 -; GCN-NEXT: v_readlane_b32 s14, v31, 31 -; GCN-NEXT: v_readlane_b32 s15, v31, 32 +; GCN-NEXT: v_readlane_b32 s0, v31, 32 +; GCN-NEXT: v_readlane_b32 s1, v31, 33 +; GCN-NEXT: v_readlane_b32 s2, v31, 34 +; GCN-NEXT: v_readlane_b32 s3, v31, 35 +; GCN-NEXT: v_readlane_b32 s4, v31, 36 +; GCN-NEXT: v_readlane_b32 s5, v31, 37 +; GCN-NEXT: v_readlane_b32 s6, v31, 38 +; GCN-NEXT: v_readlane_b32 s7, v31, 39 +; GCN-NEXT: v_readlane_b32 s8, v31, 40 +; GCN-NEXT: v_readlane_b32 s9, v31, 41 +; GCN-NEXT: v_readlane_b32 s10, v31, 42 +; GCN-NEXT: v_readlane_b32 s11, v31, 43 +; GCN-NEXT: v_readlane_b32 s12, v31, 44 +; GCN-NEXT: v_readlane_b32 s13, v31, 45 +; GCN-NEXT: v_readlane_b32 s14, v31, 46 +; GCN-NEXT: v_readlane_b32 s15, v31, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v31, 33 -; GCN-NEXT: v_readlane_b32 s1, v31, 34 -; GCN-NEXT: v_readlane_b32 s2, v31, 35 -; GCN-NEXT: v_readlane_b32 s3, v31, 36 -; GCN-NEXT: v_readlane_b32 s4, v31, 37 -; GCN-NEXT: v_readlane_b32 s5, v31, 38 -; GCN-NEXT: v_readlane_b32 s6, v31, 39 -; GCN-NEXT: v_readlane_b32 s7, v31, 40 -; GCN-NEXT: v_readlane_b32 s8, v31, 41 -; GCN-NEXT: v_readlane_b32 s9, v31, 42 -; GCN-NEXT: v_readlane_b32 s10, v31, 43 -; GCN-NEXT: v_readlane_b32 s11, v31, 44 -; 
GCN-NEXT: v_readlane_b32 s12, v31, 45 -; GCN-NEXT: v_readlane_b32 s13, v31, 46 -; GCN-NEXT: v_readlane_b32 s14, v31, 47 -; GCN-NEXT: v_readlane_b32 s15, v31, 48 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[0:1], exec -; GCN-NEXT: s_mov_b64 exec, 0xffff -; GCN-NEXT: buffer_load_dword v0, off, s[56:59], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 -; GCN-NEXT: v_readlane_b32 s8, v0, 8 -; GCN-NEXT: v_readlane_b32 s9, v0, 9 -; GCN-NEXT: v_readlane_b32 s10, v0, 10 -; GCN-NEXT: v_readlane_b32 s11, v0, 11 -; GCN-NEXT: v_readlane_b32 s12, v0, 12 -; GCN-NEXT: v_readlane_b32 s13, v0, 13 -; GCN-NEXT: v_readlane_b32 s14, v0, 14 -; GCN-NEXT: v_readlane_b32 s15, v0, 15 +; GCN-NEXT: v_readlane_b32 s0, v31, 48 +; GCN-NEXT: v_readlane_b32 s1, v31, 49 +; GCN-NEXT: v_readlane_b32 s2, v31, 50 +; GCN-NEXT: v_readlane_b32 s3, v31, 51 +; GCN-NEXT: v_readlane_b32 s4, v31, 52 +; GCN-NEXT: v_readlane_b32 s5, v31, 53 +; GCN-NEXT: v_readlane_b32 s6, v31, 
54 +; GCN-NEXT: v_readlane_b32 s7, v31, 55 +; GCN-NEXT: v_readlane_b32 s8, v31, 56 +; GCN-NEXT: v_readlane_b32 s9, v31, 57 +; GCN-NEXT: v_readlane_b32 s10, v31, 58 +; GCN-NEXT: v_readlane_b32 s11, v31, 59 +; GCN-NEXT: v_readlane_b32 s12, v31, 60 +; GCN-NEXT: v_readlane_b32 s13, v31, 61 +; GCN-NEXT: v_readlane_b32 s14, v31, 62 +; GCN-NEXT: v_readlane_b32 s15, v31, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v31, 49 -; GCN-NEXT: v_readlane_b32 s1, v31, 50 +; GCN-NEXT: s_mov_b64 s[16:17], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[16:17] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s16, v0, 0 +; GCN-NEXT: v_readlane_b32 s17, v0, 1 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s[0:1] +; GCN-NEXT: ; use s[16:17] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: BB2_2: ; %ret ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 9b629a5f9111..a03318ead716 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -77,101 +77,6 @@ endif: ; preds = %else, %if ret void } -; Force save and restore of m0 during SMEM spill -; GCN-LABEL: {{^}}m0_unavailable_spill: - -; GCN: ; def m0, 1 - -; GCN: s_mov_b32 m0, s0 -; GCN: v_interp_mov_f32 - -; GCN: ; clobber m0 - -; TOSMEM: s_mov_b32 s2, m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, s2 - -; TOSMEM: s_mov_b64 exec, -; TOSMEM: s_cbranch_execz -; TOSMEM: s_branch - -; TOSMEM: BB{{[0-9]+_[0-9]+}}: -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload - -; GCN-NOT: v_readlane_b32 m0 -; GCN-NOT: s_buffer_store_dword m0 -; GCN-NOT: s_buffer_load_dword m0 
-define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 { -main_body: - %m0 = call i32 asm sideeffect "; def $0, 1", "={m0}"() #0 - %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg) - call void asm sideeffect "; clobber $0", "~{m0}"() #0 - %cmp = fcmp ueq float 0.000000e+00, %tmp - br i1 %cmp, label %if, label %else - -if: ; preds = %main_body - store volatile i32 8, i32 addrspace(1)* undef - br label %endif - -else: ; preds = %main_body - store volatile i32 11, i32 addrspace(1)* undef - br label %endif - -endif: - ret void -} - -; GCN-LABEL: {{^}}restore_m0_lds: -; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]] -; TOSMEM: s_cmp_eq_u32 -; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_cbranch_scc1 - -; TOSMEM: s_mov_b32 m0, -1 - -; TOSMEM: s_mov_b32 s2, m0 -; TOSMEM: s_add_u32 m0, s3, 0x200 -; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, s2 -; TOSMEM: s_waitcnt lgkmcnt(0) - -; TOSMEM: ds_write_b64 - -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_add_u32 m0, s3, 0x100 -; TOSMEM: s_buffer_load_dword s2, s[88:91], m0 ; 4-byte Folded Reload -; FIXME-TOSMEM-NOT: m0 -; TOSMEM: s_waitcnt lgkmcnt(0) -; TOSMEM-NOT: m0 -; TOSMEM: s_mov_b32 m0, s2 -; TOSMEM: ; use m0 - -; TOSMEM: s_dcache_wb -; TOSMEM: s_endpgm -define amdgpu_kernel void @restore_m0_lds(i32 %arg) { - %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={m0}"() #0 - %sval = load volatile i64, i64 addrspace(4)* undef - %cmp = icmp eq i32 %arg, 0 - br i1 %cmp, label %ret, label %bb - -bb: - store volatile i64 %sval, i64 addrspace(3)* undef - call void 
asm sideeffect "; use $0", "{m0}"(i32 %m0) #0 - br label %ret - -ret: - ret void -} - declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 1a48e76a241b..e4beac77e1be 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -94,10 +94,10 @@ define i32 @called(i32 %a) noinline { ; GFX9-LABEL: {{^}}call: define amdgpu_kernel void @call(<4 x i32> inreg %tmp14, i32 inreg %arg) { -; GFX9-O0: v_mov_b32_e32 v0, s0 +; GFX9-O0: v_mov_b32_e32 v0, s2 ; GFX9-O3: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0) @@ -142,8 +142,8 @@ define amdgpu_kernel void @call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { ; GFX9-O0: buffer_store_dword v1 ; GFX9: s_swappc_b64 %tmp134 = call i64 @called_i64(i64 %tmp107) -; GFX9-O0: buffer_load_dword v4 -; GFX9-O0: buffer_load_dword v5 +; GFX9-O0: buffer_load_dword v6 +; GFX9-O0: buffer_load_dword v7 %tmp136 = add i64 %tmp134, %tmp107 %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136) %tmp138 = bitcast i64 %tmp137 to <2 x i32> diff --git a/llvm/test/CodeGen/ARM/legalize-bitcast.ll b/llvm/test/CodeGen/ARM/legalize-bitcast.ll index 529775df5fd7..478ff985bf47 100644 --- a/llvm/test/CodeGen/ARM/legalize-bitcast.ll +++ b/llvm/test/CodeGen/ARM/legalize-bitcast.ll @@ -49,9 +49,9 @@ define i16 @int_to_vec(i80 %in) { ; CHECK-NEXT: vmov.32 d16[0], r0 ; CHECK-NEXT: @ implicit-def: $q9 ; CHECK-NEXT: vmov.f64 d18, d16 -; CHECK-NEXT: vrev32.16 q8, q9 -; CHECK-NEXT: @ kill: def $d16 killed $d16 killed $q8 -; 
CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vrev32.16 q9, q9 +; CHECK-NEXT: @ kill: def $d18 killed $d18 killed $q9 +; CHECK-NEXT: vmov.u16 r0, d18[0] ; CHECK-NEXT: bx lr %vec = bitcast i80 %in to <5 x i16> %e0 = extractelement <5 x i16> %vec, i32 0 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll index a98c6eb9fd6c..c63f24ea692c 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fptosi_and_fptoui.ll @@ -235,15 +235,15 @@ define i32 @f64tou32(double %a) { ; FP32-NEXT: mfc1 $1, $f0 ; FP32-NEXT: lui $2, 16864 ; FP32-NEXT: ori $3, $zero, 0 -; FP32-NEXT: mtc1 $3, $f0 -; FP32-NEXT: mtc1 $2, $f1 -; FP32-NEXT: sub.d $f2, $f12, $f0 -; FP32-NEXT: trunc.w.d $f2, $f2 -; FP32-NEXT: mfc1 $2, $f2 +; FP32-NEXT: mtc1 $3, $f2 +; FP32-NEXT: mtc1 $2, $f3 +; FP32-NEXT: sub.d $f4, $f12, $f2 +; FP32-NEXT: trunc.w.d $f0, $f4 +; FP32-NEXT: mfc1 $2, $f0 ; FP32-NEXT: lui $3, 32768 ; FP32-NEXT: xor $2, $2, $3 ; FP32-NEXT: addiu $3, $zero, 1 -; FP32-NEXT: c.ult.d $f12, $f0 +; FP32-NEXT: c.ult.d $f12, $f2 ; FP32-NEXT: movf $3, $zero, $fcc0 ; FP32-NEXT: andi $3, $3, 1 ; FP32-NEXT: movn $2, $1, $3 @@ -256,15 +256,15 @@ define i32 @f64tou32(double %a) { ; FP64-NEXT: mfc1 $1, $f0 ; FP64-NEXT: lui $2, 16864 ; FP64-NEXT: ori $3, $zero, 0 -; FP64-NEXT: mtc1 $3, $f0 -; FP64-NEXT: mthc1 $2, $f0 -; FP64-NEXT: sub.d $f1, $f12, $f0 -; FP64-NEXT: trunc.w.d $f1, $f1 -; FP64-NEXT: mfc1 $2, $f1 +; FP64-NEXT: mtc1 $3, $f1 +; FP64-NEXT: mthc1 $2, $f1 +; FP64-NEXT: sub.d $f2, $f12, $f1 +; FP64-NEXT: trunc.w.d $f0, $f2 +; FP64-NEXT: mfc1 $2, $f0 ; FP64-NEXT: lui $3, 32768 ; FP64-NEXT: xor $2, $2, $3 ; FP64-NEXT: addiu $3, $zero, 1 -; FP64-NEXT: c.ult.d $f12, $f0 +; FP64-NEXT: c.ult.d $f12, $f1 ; FP64-NEXT: movf $3, $zero, $fcc0 ; FP64-NEXT: andi $3, $3, 1 ; FP64-NEXT: movn $2, $1, $3 @@ -282,15 +282,15 @@ define zeroext i16 @f64tou16(double %a) { ; 
FP32-NEXT: mfc1 $1, $f0 ; FP32-NEXT: lui $2, 16864 ; FP32-NEXT: ori $3, $zero, 0 -; FP32-NEXT: mtc1 $3, $f0 -; FP32-NEXT: mtc1 $2, $f1 -; FP32-NEXT: sub.d $f2, $f12, $f0 -; FP32-NEXT: trunc.w.d $f2, $f2 -; FP32-NEXT: mfc1 $2, $f2 +; FP32-NEXT: mtc1 $3, $f2 +; FP32-NEXT: mtc1 $2, $f3 +; FP32-NEXT: sub.d $f4, $f12, $f2 +; FP32-NEXT: trunc.w.d $f0, $f4 +; FP32-NEXT: mfc1 $2, $f0 ; FP32-NEXT: lui $3, 32768 ; FP32-NEXT: xor $2, $2, $3 ; FP32-NEXT: addiu $3, $zero, 1 -; FP32-NEXT: c.ult.d $f12, $f0 +; FP32-NEXT: c.ult.d $f12, $f2 ; FP32-NEXT: movf $3, $zero, $fcc0 ; FP32-NEXT: andi $3, $3, 1 ; FP32-NEXT: movn $2, $1, $3 @@ -304,15 +304,15 @@ define zeroext i16 @f64tou16(double %a) { ; FP64-NEXT: mfc1 $1, $f0 ; FP64-NEXT: lui $2, 16864 ; FP64-NEXT: ori $3, $zero, 0 -; FP64-NEXT: mtc1 $3, $f0 -; FP64-NEXT: mthc1 $2, $f0 -; FP64-NEXT: sub.d $f1, $f12, $f0 -; FP64-NEXT: trunc.w.d $f1, $f1 -; FP64-NEXT: mfc1 $2, $f1 +; FP64-NEXT: mtc1 $3, $f1 +; FP64-NEXT: mthc1 $2, $f1 +; FP64-NEXT: sub.d $f2, $f12, $f1 +; FP64-NEXT: trunc.w.d $f0, $f2 +; FP64-NEXT: mfc1 $2, $f0 ; FP64-NEXT: lui $3, 32768 ; FP64-NEXT: xor $2, $2, $3 ; FP64-NEXT: addiu $3, $zero, 1 -; FP64-NEXT: c.ult.d $f12, $f0 +; FP64-NEXT: c.ult.d $f12, $f1 ; FP64-NEXT: movf $3, $zero, $fcc0 ; FP64-NEXT: andi $3, $3, 1 ; FP64-NEXT: movn $2, $1, $3 @@ -331,15 +331,15 @@ define zeroext i8 @f64tou8(double %a) { ; FP32-NEXT: mfc1 $1, $f0 ; FP32-NEXT: lui $2, 16864 ; FP32-NEXT: ori $3, $zero, 0 -; FP32-NEXT: mtc1 $3, $f0 -; FP32-NEXT: mtc1 $2, $f1 -; FP32-NEXT: sub.d $f2, $f12, $f0 -; FP32-NEXT: trunc.w.d $f2, $f2 -; FP32-NEXT: mfc1 $2, $f2 +; FP32-NEXT: mtc1 $3, $f2 +; FP32-NEXT: mtc1 $2, $f3 +; FP32-NEXT: sub.d $f4, $f12, $f2 +; FP32-NEXT: trunc.w.d $f0, $f4 +; FP32-NEXT: mfc1 $2, $f0 ; FP32-NEXT: lui $3, 32768 ; FP32-NEXT: xor $2, $2, $3 ; FP32-NEXT: addiu $3, $zero, 1 -; FP32-NEXT: c.ult.d $f12, $f0 +; FP32-NEXT: c.ult.d $f12, $f2 ; FP32-NEXT: movf $3, $zero, $fcc0 ; FP32-NEXT: andi $3, $3, 1 ; FP32-NEXT: movn $2, $1, $3 
@@ -353,15 +353,15 @@ define zeroext i8 @f64tou8(double %a) { ; FP64-NEXT: mfc1 $1, $f0 ; FP64-NEXT: lui $2, 16864 ; FP64-NEXT: ori $3, $zero, 0 -; FP64-NEXT: mtc1 $3, $f0 -; FP64-NEXT: mthc1 $2, $f0 -; FP64-NEXT: sub.d $f1, $f12, $f0 -; FP64-NEXT: trunc.w.d $f1, $f1 -; FP64-NEXT: mfc1 $2, $f1 +; FP64-NEXT: mtc1 $3, $f1 +; FP64-NEXT: mthc1 $2, $f1 +; FP64-NEXT: sub.d $f2, $f12, $f1 +; FP64-NEXT: trunc.w.d $f0, $f2 +; FP64-NEXT: mfc1 $2, $f0 ; FP64-NEXT: lui $3, 32768 ; FP64-NEXT: xor $2, $2, $3 ; FP64-NEXT: addiu $3, $zero, 1 -; FP64-NEXT: c.ult.d $f12, $f0 +; FP64-NEXT: c.ult.d $f12, $f1 ; FP64-NEXT: movf $3, $zero, $fcc0 ; FP64-NEXT: andi $3, $3, 1 ; FP64-NEXT: movn $2, $1, $3 diff --git a/llvm/test/CodeGen/Mips/atomic-min-max.ll b/llvm/test/CodeGen/Mips/atomic-min-max.ll index 646af650c00e..a6200851940c 100644 --- a/llvm/test/CodeGen/Mips/atomic-min-max.ll +++ b/llvm/test/CodeGen/Mips/atomic-min-max.ll @@ -1154,26 +1154,26 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB4_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB4_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB4_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; 
MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1194,26 +1194,26 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB4_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB4_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB4_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1232,28 +1232,28 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 
3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB4_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB4_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB4_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1273,28 +1273,28 @@ define i16 @test_max_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB4_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, 
$5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB4_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB4_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1635,26 +1635,26 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB5_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB5_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; 
MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB5_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1675,26 +1675,26 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB5_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB5_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB5_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; 
MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1713,28 +1713,28 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB5_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB5_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB5_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -1754,28 +1754,28 @@ define i16 @test_min_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: 
nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB5_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB5_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB5_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2116,26 +2116,26 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB6_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, 
$3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB6_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB6_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2156,26 +2156,26 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB6_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB6_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB6_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; 
MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2194,28 +2194,28 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB6_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB6_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB6_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 
12($sp) # 4-byte Folded Reload @@ -2235,28 +2235,28 @@ define i16 @test_umax_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB6_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB6_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB6_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2597,26 +2597,26 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 65535 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; 
MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB7_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB7_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB7_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2637,26 +2637,26 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 65535 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB7_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB7_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: selnez 
$9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB7_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2675,28 +2675,28 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 65535 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB7_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB7_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB7_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 
+; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -2716,28 +2716,28 @@ define i16 @test_umin_16(i16* nocapture %ptr, i16 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 65535 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB7_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB7_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB7_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: 
sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3079,26 +3079,26 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB8_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB8_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB8_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3119,26 +3119,26 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB8_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or 
$8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB8_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB8_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3157,28 +3157,28 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB8_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB8_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, 
$10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB8_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3198,28 +3198,28 @@ define i8 @test_max_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB8_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB8_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB8_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv 
$7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3560,26 +3560,26 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB9_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: slt $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB9_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: slt $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB9_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3600,26 +3600,26 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; 
MIPS64R6-NEXT: .LBB9_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: slt $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB9_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: slt $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB9_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3638,28 +3638,28 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB9_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: slt $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; 
MIPS64EL-NEXT: beqz $9, .LBB9_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: slt $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB9_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -3679,28 +3679,28 @@ define i8 @test_min_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB9_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: slt $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB9_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: slt $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; 
MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB9_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4041,26 +4041,26 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB10_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movn $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB10_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movn $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB10_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4081,26 
+4081,26 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB10_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: seleqz $8, $7, $10 -; MIPS64R6-NEXT: selnez $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB10_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: seleqz $9, $8, $11 +; MIPS64R6-NEXT: selnez $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB10_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4119,28 +4119,28 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB10_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: 
and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movn $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB10_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movn $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB10_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4160,28 +4160,28 @@ define i8 @test_umax_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB10_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: seleqz $8, $7, $10 -; MIPS64ELR6-NEXT: selnez $10, $5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB10_1 +; MIPS64ELR6-NEXT: and 
$5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: seleqz $9, $8, $11 +; MIPS64ELR6-NEXT: selnez $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB10_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4522,26 +4522,26 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64-NEXT: sll $2, $2, 3 ; MIPS64-NEXT: ori $3, $zero, 255 ; MIPS64-NEXT: sllv $3, $3, $2 -; MIPS64-NEXT: nor $4, $zero, $3 +; MIPS64-NEXT: nor $6, $zero, $3 ; MIPS64-NEXT: sllv $5, $5, $2 ; MIPS64-NEXT: .LBB11_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64-NEXT: ll $7, 0($1) -; MIPS64-NEXT: sltu $10, $7, $5 -; MIPS64-NEXT: move $8, $7 -; MIPS64-NEXT: movz $8, $5, $10 -; MIPS64-NEXT: and $8, $8, $3 -; MIPS64-NEXT: and $9, $7, $4 -; MIPS64-NEXT: or $9, $9, $8 -; MIPS64-NEXT: sc $9, 0($1) -; MIPS64-NEXT: beqz $9, .LBB11_1 +; MIPS64-NEXT: ll $8, 0($1) +; MIPS64-NEXT: sltu $11, $8, $5 +; MIPS64-NEXT: move $9, $8 +; MIPS64-NEXT: movz $9, $5, $11 +; MIPS64-NEXT: and $9, $9, $3 +; MIPS64-NEXT: and $10, $8, $6 +; MIPS64-NEXT: or $10, $10, $9 +; MIPS64-NEXT: sc $10, 0($1) +; MIPS64-NEXT: beqz $10, .LBB11_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry -; MIPS64-NEXT: and $6, $7, $3 -; MIPS64-NEXT: srlv $6, $6, $2 -; MIPS64-NEXT: seh $6, $6 +; MIPS64-NEXT: and $7, $8, $3 +; MIPS64-NEXT: srlv $7, $7, $2 +; MIPS64-NEXT: 
seh $7, $7 ; MIPS64-NEXT: # %bb.3: # %entry -; MIPS64-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64-NEXT: # %bb.4: # %entry ; MIPS64-NEXT: sync ; MIPS64-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4562,26 +4562,26 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64R6-NEXT: sll $2, $2, 3 ; MIPS64R6-NEXT: ori $3, $zero, 255 ; MIPS64R6-NEXT: sllv $3, $3, $2 -; MIPS64R6-NEXT: nor $4, $zero, $3 +; MIPS64R6-NEXT: nor $6, $zero, $3 ; MIPS64R6-NEXT: sllv $5, $5, $2 ; MIPS64R6-NEXT: .LBB11_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6-NEXT: ll $7, 0($1) -; MIPS64R6-NEXT: sltu $10, $7, $5 -; MIPS64R6-NEXT: selnez $8, $7, $10 -; MIPS64R6-NEXT: seleqz $10, $5, $10 -; MIPS64R6-NEXT: or $8, $8, $10 -; MIPS64R6-NEXT: and $8, $8, $3 -; MIPS64R6-NEXT: and $9, $7, $4 -; MIPS64R6-NEXT: or $9, $9, $8 -; MIPS64R6-NEXT: sc $9, 0($1) -; MIPS64R6-NEXT: beqzc $9, .LBB11_1 +; MIPS64R6-NEXT: ll $8, 0($1) +; MIPS64R6-NEXT: sltu $11, $8, $5 +; MIPS64R6-NEXT: selnez $9, $8, $11 +; MIPS64R6-NEXT: seleqz $11, $5, $11 +; MIPS64R6-NEXT: or $9, $9, $11 +; MIPS64R6-NEXT: and $9, $9, $3 +; MIPS64R6-NEXT: and $10, $8, $6 +; MIPS64R6-NEXT: or $10, $10, $9 +; MIPS64R6-NEXT: sc $10, 0($1) +; MIPS64R6-NEXT: beqzc $10, .LBB11_1 ; MIPS64R6-NEXT: # %bb.2: # %entry -; MIPS64R6-NEXT: and $6, $7, $3 -; MIPS64R6-NEXT: srlv $6, $6, $2 -; MIPS64R6-NEXT: seh $6, $6 +; MIPS64R6-NEXT: and $7, $8, $3 +; MIPS64R6-NEXT: srlv $7, $7, $2 +; MIPS64R6-NEXT: seh $7, $7 ; MIPS64R6-NEXT: # %bb.3: # %entry -; MIPS64R6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6-NEXT: # %bb.4: # %entry ; MIPS64R6-NEXT: sync ; MIPS64R6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4600,28 +4600,28 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64EL-NEXT: sll $2, $2, 3 ; MIPS64EL-NEXT: ori $3, $zero, 255 ; MIPS64EL-NEXT: sllv $3, $3, $2 -; 
MIPS64EL-NEXT: nor $4, $zero, $3 +; MIPS64EL-NEXT: nor $6, $zero, $3 ; MIPS64EL-NEXT: sllv $5, $5, $2 ; MIPS64EL-NEXT: .LBB11_1: # %entry ; MIPS64EL-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64EL-NEXT: ll $7, 0($1) -; MIPS64EL-NEXT: and $7, $7, $3 -; MIPS64EL-NEXT: and $5, $5, $3 -; MIPS64EL-NEXT: sltu $10, $7, $5 -; MIPS64EL-NEXT: move $8, $7 -; MIPS64EL-NEXT: movz $8, $5, $10 +; MIPS64EL-NEXT: ll $8, 0($1) ; MIPS64EL-NEXT: and $8, $8, $3 -; MIPS64EL-NEXT: and $9, $7, $4 -; MIPS64EL-NEXT: or $9, $9, $8 -; MIPS64EL-NEXT: sc $9, 0($1) -; MIPS64EL-NEXT: beqz $9, .LBB11_1 +; MIPS64EL-NEXT: and $5, $5, $3 +; MIPS64EL-NEXT: sltu $11, $8, $5 +; MIPS64EL-NEXT: move $9, $8 +; MIPS64EL-NEXT: movz $9, $5, $11 +; MIPS64EL-NEXT: and $9, $9, $3 +; MIPS64EL-NEXT: and $10, $8, $6 +; MIPS64EL-NEXT: or $10, $10, $9 +; MIPS64EL-NEXT: sc $10, 0($1) +; MIPS64EL-NEXT: beqz $10, .LBB11_1 ; MIPS64EL-NEXT: nop ; MIPS64EL-NEXT: # %bb.2: # %entry -; MIPS64EL-NEXT: and $6, $7, $3 -; MIPS64EL-NEXT: srlv $6, $6, $2 -; MIPS64EL-NEXT: seh $6, $6 +; MIPS64EL-NEXT: and $7, $8, $3 +; MIPS64EL-NEXT: srlv $7, $7, $2 +; MIPS64EL-NEXT: seh $7, $7 ; MIPS64EL-NEXT: # %bb.3: # %entry -; MIPS64EL-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64EL-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64EL-NEXT: # %bb.4: # %entry ; MIPS64EL-NEXT: sync ; MIPS64EL-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -4641,28 +4641,28 @@ define i8 @test_umin_8(i8* nocapture %ptr, i8 signext %val) { ; MIPS64ELR6-NEXT: sll $2, $2, 3 ; MIPS64ELR6-NEXT: ori $3, $zero, 255 ; MIPS64ELR6-NEXT: sllv $3, $3, $2 -; MIPS64ELR6-NEXT: nor $4, $zero, $3 +; MIPS64ELR6-NEXT: nor $6, $zero, $3 ; MIPS64ELR6-NEXT: sllv $5, $5, $2 ; MIPS64ELR6-NEXT: .LBB11_1: # %entry ; MIPS64ELR6-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64ELR6-NEXT: ll $7, 0($1) -; MIPS64ELR6-NEXT: and $7, $7, $3 -; MIPS64ELR6-NEXT: and $5, $5, $3 -; MIPS64ELR6-NEXT: sltu $10, $7, $5 -; MIPS64ELR6-NEXT: selnez $8, $7, $10 -; MIPS64ELR6-NEXT: seleqz $10, 
$5, $10 -; MIPS64ELR6-NEXT: or $8, $8, $10 +; MIPS64ELR6-NEXT: ll $8, 0($1) ; MIPS64ELR6-NEXT: and $8, $8, $3 -; MIPS64ELR6-NEXT: and $9, $7, $4 -; MIPS64ELR6-NEXT: or $9, $9, $8 -; MIPS64ELR6-NEXT: sc $9, 0($1) -; MIPS64ELR6-NEXT: beqzc $9, .LBB11_1 +; MIPS64ELR6-NEXT: and $5, $5, $3 +; MIPS64ELR6-NEXT: sltu $11, $8, $5 +; MIPS64ELR6-NEXT: selnez $9, $8, $11 +; MIPS64ELR6-NEXT: seleqz $11, $5, $11 +; MIPS64ELR6-NEXT: or $9, $9, $11 +; MIPS64ELR6-NEXT: and $9, $9, $3 +; MIPS64ELR6-NEXT: and $10, $8, $6 +; MIPS64ELR6-NEXT: or $10, $10, $9 +; MIPS64ELR6-NEXT: sc $10, 0($1) +; MIPS64ELR6-NEXT: beqzc $10, .LBB11_1 ; MIPS64ELR6-NEXT: # %bb.2: # %entry -; MIPS64ELR6-NEXT: and $6, $7, $3 -; MIPS64ELR6-NEXT: srlv $6, $6, $2 -; MIPS64ELR6-NEXT: seh $6, $6 +; MIPS64ELR6-NEXT: and $7, $8, $3 +; MIPS64ELR6-NEXT: srlv $7, $7, $2 +; MIPS64ELR6-NEXT: seh $7, $7 ; MIPS64ELR6-NEXT: # %bb.3: # %entry -; MIPS64ELR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64ELR6-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64ELR6-NEXT: # %bb.4: # %entry ; MIPS64ELR6-NEXT: sync ; MIPS64ELR6-NEXT: lw $2, 12($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/atomic.ll b/llvm/test/CodeGen/Mips/atomic.ll index 59ff83e4969c..3846fda47b13 100644 --- a/llvm/test/CodeGen/Mips/atomic.ll +++ b/llvm/test/CodeGen/Mips/atomic.ll @@ -2559,28 +2559,28 @@ define signext i8 @AtomicLoadAdd8(i8 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, 
$5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB8_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: addu $8, $7, $4 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB8_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: addu $9, $8, $4 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB8_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -3075,28 +3075,28 @@ define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB9_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: subu 
$8, $7, $4 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB9_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: subu $9, $8, $4 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB9_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -3601,29 +3601,29 @@ define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB10_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: and $8, $7, $4 -; MIPS64R6O0-NEXT: nor $8, $zero, $8 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) 
-; MIPS64R6O0-NEXT: beqzc $9, .LBB10_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: and $9, $8, $4 +; MIPS64R6O0-NEXT: nor $9, $zero, $9 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB10_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -4115,27 +4115,27 @@ define signext i8 @AtomicSwap8(i8 signext %newval) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 255 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB11_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: and $8, $4, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB11_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: and $9, $4, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; 
MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB11_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seb $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seb $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seb $2, $1 @@ -4666,32 +4666,32 @@ define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwi ; MIPS64R6O0-NEXT: ld $1, %got_disp(y)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 3 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 255 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $6, $zero, $3 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 3 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $6, $zero, 255 +; MIPS64R6O0-NEXT: sllv $6, $6, $3 +; MIPS64R6O0-NEXT: nor $7, $zero, $6 ; MIPS64R6O0-NEXT: andi $4, $4, 255 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: andi $5, $5, 255 -; MIPS64R6O0-NEXT: sllv $5, $5, $1 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 ; MIPS64R6O0-NEXT: .LBB12_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $8, 0($2) -; MIPS64R6O0-NEXT: and $9, $8, $3 -; MIPS64R6O0-NEXT: bnec $9, $4, .LBB12_3 +; MIPS64R6O0-NEXT: ll $9, 0($2) +; MIPS64R6O0-NEXT: and $10, $9, $6 +; MIPS64R6O0-NEXT: bnec $10, $4, .LBB12_3 ; MIPS64R6O0-NEXT: # %bb.2: # %entry ; MIPS64R6O0-NEXT: # in Loop: Header=BB12_1 Depth=1 -; MIPS64R6O0-NEXT: and $8, $8, $6 -; MIPS64R6O0-NEXT: or $8, $8, $5 -; MIPS64R6O0-NEXT: sc $8, 0($2) -; MIPS64R6O0-NEXT: beqzc $8, 
.LBB12_1 +; MIPS64R6O0-NEXT: and $9, $9, $7 +; MIPS64R6O0-NEXT: or $9, $9, $5 +; MIPS64R6O0-NEXT: sc $9, 0($2) +; MIPS64R6O0-NEXT: beqzc $9, .LBB12_1 ; MIPS64R6O0-NEXT: .LBB12_3: # %entry -; MIPS64R6O0-NEXT: srlv $7, $9, $1 -; MIPS64R6O0-NEXT: seb $7, $7 +; MIPS64R6O0-NEXT: srlv $8, $10, $3 +; MIPS64R6O0-NEXT: seb $8, $8 ; MIPS64R6O0-NEXT: # %bb.4: # %entry -; MIPS64R6O0-NEXT: sw $7, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $8, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.5: # %entry ; MIPS64R6O0-NEXT: lw $2, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: daddiu $sp, $sp, 16 @@ -5236,28 +5236,28 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n ; MIPS64R6O0-NEXT: sll $2, $2, 3 ; MIPS64R6O0-NEXT: ori $3, $zero, 255 ; MIPS64R6O0-NEXT: sllv $3, $3, $2 -; MIPS64R6O0-NEXT: nor $4, $zero, $3 -; MIPS64R6O0-NEXT: andi $7, $5, 255 -; MIPS64R6O0-NEXT: sllv $7, $7, $2 +; MIPS64R6O0-NEXT: nor $7, $zero, $3 +; MIPS64R6O0-NEXT: andi $8, $5, 255 +; MIPS64R6O0-NEXT: sllv $8, $8, $2 ; MIPS64R6O0-NEXT: andi $6, $6, 255 ; MIPS64R6O0-NEXT: sllv $6, $6, $2 ; MIPS64R6O0-NEXT: .LBB13_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $9, 0($1) -; MIPS64R6O0-NEXT: and $10, $9, $3 -; MIPS64R6O0-NEXT: bnec $10, $7, .LBB13_3 +; MIPS64R6O0-NEXT: ll $10, 0($1) +; MIPS64R6O0-NEXT: and $11, $10, $3 +; MIPS64R6O0-NEXT: bnec $11, $8, .LBB13_3 ; MIPS64R6O0-NEXT: # %bb.2: # %entry ; MIPS64R6O0-NEXT: # in Loop: Header=BB13_1 Depth=1 -; MIPS64R6O0-NEXT: and $9, $9, $4 -; MIPS64R6O0-NEXT: or $9, $9, $6 -; MIPS64R6O0-NEXT: sc $9, 0($1) -; MIPS64R6O0-NEXT: beqzc $9, .LBB13_1 +; MIPS64R6O0-NEXT: and $10, $10, $7 +; MIPS64R6O0-NEXT: or $10, $10, $6 +; MIPS64R6O0-NEXT: sc $10, 0($1) +; MIPS64R6O0-NEXT: beqzc $10, .LBB13_1 ; MIPS64R6O0-NEXT: .LBB13_3: # %entry -; MIPS64R6O0-NEXT: srlv $8, $10, $2 -; MIPS64R6O0-NEXT: seb $8, $8 +; MIPS64R6O0-NEXT: srlv $9, $11, $2 +; MIPS64R6O0-NEXT: seb $9, $9 ; MIPS64R6O0-NEXT: 
# %bb.4: # %entry ; MIPS64R6O0-NEXT: sw $5, 12($sp) # 4-byte Folded Spill -; MIPS64R6O0-NEXT: sw $8, 8($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw $9, 8($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.5: # %entry ; MIPS64R6O0-NEXT: lw $1, 8($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: lw $2, 12($sp) # 4-byte Folded Reload @@ -5775,28 +5775,28 @@ define signext i16 @AtomicLoadAdd16(i16 signext %incr) nounwind { ; MIPS64R6O0-NEXT: ld $1, %got_disp(z)($1) ; MIPS64R6O0-NEXT: daddiu $2, $zero, -4 ; MIPS64R6O0-NEXT: and $2, $1, $2 -; MIPS64R6O0-NEXT: andi $1, $1, 3 -; MIPS64R6O0-NEXT: xori $1, $1, 2 -; MIPS64R6O0-NEXT: sll $1, $1, 3 -; MIPS64R6O0-NEXT: ori $3, $zero, 65535 -; MIPS64R6O0-NEXT: sllv $3, $3, $1 -; MIPS64R6O0-NEXT: nor $5, $zero, $3 -; MIPS64R6O0-NEXT: sllv $4, $4, $1 +; MIPS64R6O0-NEXT: andi $3, $1, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 2 +; MIPS64R6O0-NEXT: sll $3, $3, 3 +; MIPS64R6O0-NEXT: ori $5, $zero, 65535 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 +; MIPS64R6O0-NEXT: nor $6, $zero, $5 +; MIPS64R6O0-NEXT: sllv $4, $4, $3 ; MIPS64R6O0-NEXT: .LBB14_1: # %entry ; MIPS64R6O0-NEXT: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $7, 0($2) -; MIPS64R6O0-NEXT: addu $8, $7, $4 -; MIPS64R6O0-NEXT: and $8, $8, $3 -; MIPS64R6O0-NEXT: and $9, $7, $5 -; MIPS64R6O0-NEXT: or $9, $9, $8 -; MIPS64R6O0-NEXT: sc $9, 0($2) -; MIPS64R6O0-NEXT: beqzc $9, .LBB14_1 +; MIPS64R6O0-NEXT: ll $8, 0($2) +; MIPS64R6O0-NEXT: addu $9, $8, $4 +; MIPS64R6O0-NEXT: and $9, $9, $5 +; MIPS64R6O0-NEXT: and $10, $8, $6 +; MIPS64R6O0-NEXT: or $10, $10, $9 +; MIPS64R6O0-NEXT: sc $10, 0($2) +; MIPS64R6O0-NEXT: beqzc $10, .LBB14_1 ; MIPS64R6O0-NEXT: # %bb.2: # %entry -; MIPS64R6O0-NEXT: and $6, $7, $3 -; MIPS64R6O0-NEXT: srlv $6, $6, $1 -; MIPS64R6O0-NEXT: seh $6, $6 +; MIPS64R6O0-NEXT: and $7, $8, $5 +; MIPS64R6O0-NEXT: srlv $7, $7, $3 +; MIPS64R6O0-NEXT: seh $7, $7 ; MIPS64R6O0-NEXT: # %bb.3: # %entry -; MIPS64R6O0-NEXT: sw $6, 12($sp) # 4-byte Folded Spill +; MIPS64R6O0-NEXT: sw 
$7, 12($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.4: # %entry ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seh $2, $1 @@ -6359,33 +6359,33 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) { ; MIPS64R6O0-NEXT: sll $3, $5, 0 ; MIPS64R6O0-NEXT: addu $2, $3, $2 ; MIPS64R6O0-NEXT: sync -; MIPS64R6O0-NEXT: daddiu $3, $zero, -4 -; MIPS64R6O0-NEXT: and $3, $4, $3 -; MIPS64R6O0-NEXT: andi $4, $4, 3 -; MIPS64R6O0-NEXT: xori $4, $4, 2 -; MIPS64R6O0-NEXT: sll $4, $4, 3 +; MIPS64R6O0-NEXT: daddiu $8, $zero, -4 +; MIPS64R6O0-NEXT: and $8, $4, $8 +; MIPS64R6O0-NEXT: andi $3, $4, 3 +; MIPS64R6O0-NEXT: xori $3, $3, 2 +; MIPS64R6O0-NEXT: sll $3, $3, 3 ; MIPS64R6O0-NEXT: ori $5, $zero, 65535 -; MIPS64R6O0-NEXT: sllv $5, $5, $4 +; MIPS64R6O0-NEXT: sllv $5, $5, $3 ; MIPS64R6O0-NEXT: nor $6, $zero, $5 ; MIPS64R6O0-NEXT: andi $7, $2, 65535 -; MIPS64R6O0-NEXT: sllv $7, $7, $4 +; MIPS64R6O0-NEXT: sllv $7, $7, $3 ; MIPS64R6O0-NEXT: andi $1, $1, 65535 -; MIPS64R6O0-NEXT: sllv $1, $1, $4 +; MIPS64R6O0-NEXT: sllv $1, $1, $3 ; MIPS64R6O0-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 -; MIPS64R6O0-NEXT: ll $9, 0($3) -; MIPS64R6O0-NEXT: and $10, $9, $5 -; MIPS64R6O0-NEXT: bnec $10, $7, .LBB15_3 +; MIPS64R6O0-NEXT: ll $10, 0($8) +; MIPS64R6O0-NEXT: and $11, $10, $5 +; MIPS64R6O0-NEXT: bnec $11, $7, .LBB15_3 ; MIPS64R6O0-NEXT: # %bb.2: # in Loop: Header=BB15_1 Depth=1 -; MIPS64R6O0-NEXT: and $9, $9, $6 -; MIPS64R6O0-NEXT: or $9, $9, $1 -; MIPS64R6O0-NEXT: sc $9, 0($3) -; MIPS64R6O0-NEXT: beqzc $9, .LBB15_1 +; MIPS64R6O0-NEXT: and $10, $10, $6 +; MIPS64R6O0-NEXT: or $10, $10, $1 +; MIPS64R6O0-NEXT: sc $10, 0($8) +; MIPS64R6O0-NEXT: beqzc $10, .LBB15_1 ; MIPS64R6O0-NEXT: .LBB15_3: -; MIPS64R6O0-NEXT: srlv $8, $10, $4 -; MIPS64R6O0-NEXT: seh $8, $8 +; MIPS64R6O0-NEXT: srlv $9, $11, $3 +; MIPS64R6O0-NEXT: seh $9, $9 ; MIPS64R6O0-NEXT: # %bb.4: ; MIPS64R6O0-NEXT: sw $2, 12($sp) # 4-byte Folded Spill -; MIPS64R6O0-NEXT: sw $8, 8($sp) # 4-byte 
Folded Spill +; MIPS64R6O0-NEXT: sw $9, 8($sp) # 4-byte Folded Spill ; MIPS64R6O0-NEXT: # %bb.5: ; MIPS64R6O0-NEXT: lw $1, 12($sp) # 4-byte Folded Reload ; MIPS64R6O0-NEXT: seh $2, $1 @@ -7145,8 +7145,8 @@ define i32 @zeroreg() nounwind { ; MIPS64R6O0-NEXT: sc $6, 0($1) ; MIPS64R6O0-NEXT: beqzc $6, .LBB17_1 ; MIPS64R6O0-NEXT: .LBB17_3: # %entry -; MIPS64R6O0-NEXT: xor $1, $5, $3 -; MIPS64R6O0-NEXT: sltiu $2, $1, 1 +; MIPS64R6O0-NEXT: xor $2, $5, $3 +; MIPS64R6O0-NEXT: sltiu $2, $2, 1 ; MIPS64R6O0-NEXT: sync ; MIPS64R6O0-NEXT: jrc $ra ; diff --git a/llvm/test/CodeGen/Mips/implicit-sret.ll b/llvm/test/CodeGen/Mips/implicit-sret.ll index b9f6568e40c9..e86cec37d510 100644 --- a/llvm/test/CodeGen/Mips/implicit-sret.ll +++ b/llvm/test/CodeGen/Mips/implicit-sret.ll @@ -48,8 +48,8 @@ define internal { i32, i128, i64 } @implicit_sret_impl() unnamed_addr nounwind { ; CHECK-NEXT: sd $zero, 8($4) ; CHECK-NEXT: daddiu $3, $zero, 30 ; CHECK-NEXT: sd $3, 24($4) -; CHECK-NEXT: addiu $3, $zero, 10 -; CHECK-NEXT: sw $3, 0($4) +; CHECK-NEXT: addiu $5, $zero, 10 +; CHECK-NEXT: sw $5, 0($4) ; CHECK-NEXT: jr $ra ; CHECK-NEXT: nop ret { i32, i128, i64 } { i32 10, i128 20, i64 30 } @@ -70,12 +70,10 @@ define internal void @test2() unnamed_addr nounwind { ; CHECK-NEXT: lw $3, 4($sp) ; CHECK-NEXT: # implicit-def: $a0_64 ; CHECK-NEXT: move $4, $3 -; CHECK-NEXT: # implicit-def: $v1_64 -; CHECK-NEXT: move $3, $2 -; CHECK-NEXT: # implicit-def: $v0_64 -; CHECK-NEXT: move $2, $1 -; CHECK-NEXT: move $5, $3 -; CHECK-NEXT: move $6, $2 +; CHECK-NEXT: # implicit-def: $a1_64 +; CHECK-NEXT: move $5, $2 +; CHECK-NEXT: # implicit-def: $a2_64 +; CHECK-NEXT: move $6, $1 ; CHECK-NEXT: jal use_sret2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/addegluecrash.ll b/llvm/test/CodeGen/PowerPC/addegluecrash.ll index c38f377869f8..a1d980545836 100644 --- a/llvm/test/CodeGen/PowerPC/addegluecrash.ll +++ b/llvm/test/CodeGen/PowerPC/addegluecrash.ll @@ 
-21,11 +21,11 @@ define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* n ; CHECK-NEXT: addze 5, 5 ; CHECK-NEXT: add 4, 5, 4 ; CHECK-NEXT: cmpld 7, 4, 5 -; CHECK-NEXT: mfocrf 4, 1 -; CHECK-NEXT: rlwinm 4, 4, 29, 31, 31 -; CHECK-NEXT: # implicit-def: $x5 -; CHECK-NEXT: mr 5, 4 -; CHECK-NEXT: clrldi 4, 5, 32 +; CHECK-NEXT: mfocrf 10, 1 +; CHECK-NEXT: rlwinm 10, 10, 29, 31, 31 +; CHECK-NEXT: # implicit-def: $x4 +; CHECK-NEXT: mr 4, 10 +; CHECK-NEXT: clrldi 4, 4, 32 ; CHECK-NEXT: std 4, 0(3) ; CHECK-NEXT: blr %1 = load i64, i64* %a, align 8 diff --git a/llvm/test/CodeGen/PowerPC/popcount.ll b/llvm/test/CodeGen/PowerPC/popcount.ll index fb20f1d3ee43..170d3d77d088 100644 --- a/llvm/test/CodeGen/PowerPC/popcount.ll +++ b/llvm/test/CodeGen/PowerPC/popcount.ll @@ -58,17 +58,17 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 ; CHECK-NEXT: mffprd 3, 0 ; CHECK-NEXT: popcntd 3, 3 -; CHECK-NEXT: xxswapd 0, 34 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 -; CHECK-NEXT: mffprd 4, 0 +; CHECK-NEXT: xxswapd 1, 34 +; CHECK-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; CHECK-NEXT: mffprd 4, 1 ; CHECK-NEXT: popcntd 4, 4 ; CHECK-NEXT: add 3, 4, 3 ; CHECK-NEXT: mtfprd 0, 3 -; CHECK-NEXT: # kill: def $vsl0 killed $f0 +; CHECK-NEXT: fmr 2, 0 ; CHECK-NEXT: li 3, 0 -; CHECK-NEXT: mtfprd 1, 3 -; CHECK-NEXT: # kill: def $vsl1 killed $f1 -; CHECK-NEXT: xxmrghd 34, 1, 0 +; CHECK-NEXT: mtfprd 0, 3 +; CHECK-NEXT: fmr 3, 0 +; CHECK-NEXT: xxmrghd 34, 3, 2 ; CHECK-NEXT: blr Entry: %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0) diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll index 4a78218262ca..39469d63b907 100644 --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -1548,8 +1548,8 @@ define <2 x i64> @test46(<2 x float> %a) { ; CHECK-FISL-NEXT: ld r3, -24(r1) ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 -; 
CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 -; CHECK-FISL-NEXT: xxlor v2, vs0, vs0 +; CHECK-FISL-NEXT: lxvd2x vs1, 0, r3 +; CHECK-FISL-NEXT: xxlor v2, vs1, vs1 ; CHECK-FISL-NEXT: blr ; ; CHECK-LE-LABEL: test46: @@ -1616,8 +1616,8 @@ define <2 x i64> @test47(<2 x float> %a) { ; CHECK-FISL-NEXT: ld r3, -24(r1) ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 -; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 -; CHECK-FISL-NEXT: xxlor v2, vs0, vs0 +; CHECK-FISL-NEXT: lxvd2x vs1, 0, r3 +; CHECK-FISL-NEXT: xxlor v2, vs1, vs1 ; CHECK-FISL-NEXT: blr ; ; CHECK-LE-LABEL: test47: @@ -1859,13 +1859,13 @@ define <2 x i64> @test60(<2 x i64> %a, <2 x i64> %b) { ; CHECK-FISL-NEXT: stxvd2x v3, 0, r3 ; CHECK-FISL-NEXT: addi r3, r1, -48 ; CHECK-FISL-NEXT: stxvd2x v2, 0, r3 -; CHECK-FISL-NEXT: lwz r3, -20(r1) -; CHECK-FISL-NEXT: ld r4, -40(r1) -; CHECK-FISL-NEXT: sld r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -20(r1) +; CHECK-FISL-NEXT: ld r3, -40(r1) +; CHECK-FISL-NEXT: sld r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -8(r1) -; CHECK-FISL-NEXT: lwz r3, -28(r1) -; CHECK-FISL-NEXT: ld r4, -48(r1) -; CHECK-FISL-NEXT: sld r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -28(r1) +; CHECK-FISL-NEXT: ld r3, -48(r1) +; CHECK-FISL-NEXT: sld r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 ; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 @@ -1925,13 +1925,13 @@ define <2 x i64> @test61(<2 x i64> %a, <2 x i64> %b) { ; CHECK-FISL-NEXT: stxvd2x v3, 0, r3 ; CHECK-FISL-NEXT: addi r3, r1, -48 ; CHECK-FISL-NEXT: stxvd2x v2, 0, r3 -; CHECK-FISL-NEXT: lwz r3, -20(r1) -; CHECK-FISL-NEXT: ld r4, -40(r1) -; CHECK-FISL-NEXT: srd r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -20(r1) +; CHECK-FISL-NEXT: ld r3, -40(r1) +; CHECK-FISL-NEXT: srd r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -8(r1) -; CHECK-FISL-NEXT: lwz r3, -28(r1) -; CHECK-FISL-NEXT: ld r4, -48(r1) -; CHECK-FISL-NEXT: srd r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -28(r1) +; CHECK-FISL-NEXT: ld r3, -48(r1) +; CHECK-FISL-NEXT: srd r3, r3, r4 ; CHECK-FISL-NEXT: 
std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 ; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 @@ -1991,13 +1991,13 @@ define <2 x i64> @test62(<2 x i64> %a, <2 x i64> %b) { ; CHECK-FISL-NEXT: stxvd2x v3, 0, r3 ; CHECK-FISL-NEXT: addi r3, r1, -48 ; CHECK-FISL-NEXT: stxvd2x v2, 0, r3 -; CHECK-FISL-NEXT: lwz r3, -20(r1) -; CHECK-FISL-NEXT: ld r4, -40(r1) -; CHECK-FISL-NEXT: srad r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -20(r1) +; CHECK-FISL-NEXT: ld r3, -40(r1) +; CHECK-FISL-NEXT: srad r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -8(r1) -; CHECK-FISL-NEXT: lwz r3, -28(r1) -; CHECK-FISL-NEXT: ld r4, -48(r1) -; CHECK-FISL-NEXT: srad r3, r4, r3 +; CHECK-FISL-NEXT: lwz r4, -28(r1) +; CHECK-FISL-NEXT: ld r3, -48(r1) +; CHECK-FISL-NEXT: srad r3, r3, r4 ; CHECK-FISL-NEXT: std r3, -16(r1) ; CHECK-FISL-NEXT: addi r3, r1, -16 ; CHECK-FISL-NEXT: lxvd2x vs0, 0, r3 @@ -2426,12 +2426,12 @@ define <2 x i32> @test80(i32 %v) { ; CHECK-FISL: # %bb.0: ; CHECK-FISL-NEXT: # kill: def $r3 killed $r3 killed $x3 ; CHECK-FISL-NEXT: stw r3, -16(r1) -; CHECK-FISL-NEXT: addi r3, r1, -16 -; CHECK-FISL-NEXT: lxvw4x vs0, 0, r3 +; CHECK-FISL-NEXT: addi r4, r1, -16 +; CHECK-FISL-NEXT: lxvw4x vs0, 0, r4 ; CHECK-FISL-NEXT: xxspltw v2, vs0, 0 -; CHECK-FISL-NEXT: addis r3, r2, .LCPI65_0@toc@ha -; CHECK-FISL-NEXT: addi r3, r3, .LCPI65_0@toc@l -; CHECK-FISL-NEXT: lxvw4x v3, 0, r3 +; CHECK-FISL-NEXT: addis r4, r2, .LCPI65_0@toc@ha +; CHECK-FISL-NEXT: addi r4, r4, .LCPI65_0@toc@l +; CHECK-FISL-NEXT: lxvw4x v3, 0, r4 ; CHECK-FISL-NEXT: vadduwm v2, v2, v3 ; CHECK-FISL-NEXT: blr ; diff --git a/llvm/test/CodeGen/SPARC/fp16-promote.ll b/llvm/test/CodeGen/SPARC/fp16-promote.ll index 0c402430dadc..9709322f48a5 100644 --- a/llvm/test/CodeGen/SPARC/fp16-promote.ll +++ b/llvm/test/CodeGen/SPARC/fp16-promote.ll @@ -182,11 +182,11 @@ define void @test_fptrunc_double(double %d, half* %p) nounwind { ; V8-UNOPT-NEXT: std %i4, [%fp+-8] ; V8-UNOPT-NEXT: ldd [%fp+-8], %f0 ; V8-UNOPT-NEXT: std %f0, [%fp+-16] -; V8-UNOPT-NEXT: ldd [%fp+-16], 
%i0 -; V8-UNOPT-NEXT: mov %i0, %i3 -; V8-UNOPT-NEXT: ! kill: def $i1 killed $i1 killed $i0_i1 -; V8-UNOPT-NEXT: mov %i3, %o0 -; V8-UNOPT-NEXT: mov %i1, %o1 +; V8-UNOPT-NEXT: ldd [%fp+-16], %i4 +; V8-UNOPT-NEXT: mov %i4, %i0 +; V8-UNOPT-NEXT: ! kill: def $i5 killed $i5 killed $i4_i5 +; V8-UNOPT-NEXT: mov %i0, %o0 +; V8-UNOPT-NEXT: mov %i5, %o1 ; V8-UNOPT-NEXT: call __truncdfhf2 ; V8-UNOPT-NEXT: st %i2, [%fp+-20] ; V8-UNOPT-NEXT: ld [%fp+-20], %i0 ! 4-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll b/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll index b5635c7e0f06..48ad2a2c0777 100644 --- a/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll +++ b/llvm/test/CodeGen/X86/2009-04-14-IllegalRegs.ll @@ -8,34 +8,34 @@ define i32 @z() nounwind ssp { ; CHECK-LABEL: z: ; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $148, %esp +; CHECK-NEXT: subl $144, %esp ; CHECK-NEXT: movl L___stack_chk_guard$non_lazy_ptr, %eax ; CHECK-NEXT: movl (%eax), %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movb $48, {{[0-9]+}}(%esp) -; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al -; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl +; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) ; CHECK-NEXT: movb $15, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl $8, %ecx -; CHECK-NEXT: leal {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl $8, %edx +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: addl $36, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: 
movl %esi, %ecx +; CHECK-NEXT: movl %edx, %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; CHECK-NEXT: rep;movsl (%esi), %es:(%edi) -; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl -; CHECK-NEXT: movb %cl, 32(%eax) -; CHECK-NEXT: movb %cl, 68(%eax) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %bl +; CHECK-NEXT: movb %bl, 32(%eax) +; CHECK-NEXT: movb %bl, 68(%eax) ; CHECK-NEXT: calll _f ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -50,9 +50,10 @@ define i32 @z() nounwind ssp { ; CHECK-NEXT: jne LBB0_3 ; CHECK-NEXT: ## %bb.2: ## %SP_return ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: addl $148, %esp +; CHECK-NEXT: addl $144, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_3: ## %CallStackCheckFailBlk ; CHECK-NEXT: calll ___stack_chk_fail diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 7a1f34c65c18..16fde4074ea0 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -126,8 +126,8 @@ define void @narrow_writeback_and(i64* %ptr) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O0-NEXT: andl $-256, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, (%rdi) +; CHECK-O0-NEXT: movl %eax, %ecx +; CHECK-O0-NEXT: movq %rcx, (%rdi) ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: narrow_writeback_and: @@ -231,10 +231,10 @@ define i128 @load_i128(i128* %ptr) { ; CHECK-O0-NEXT: .cfi_def_cfa_offset 16 ; CHECK-O0-NEXT: .cfi_offset %rbx, -16 ; CHECK-O0-NEXT: xorl %eax, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte 
Reload -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-O0-NEXT: movl %eax, %ecx +; CHECK-O0-NEXT: movq %rcx, %rax +; CHECK-O0-NEXT: movq %rcx, %rdx +; CHECK-O0-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; CHECK-O0-NEXT: lock cmpxchg16b (%rdi) ; CHECK-O0-NEXT: popq %rbx @@ -326,14 +326,14 @@ define i256 @load_i256(i256* %ptr) { ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-O0-NEXT: callq __atomic_load ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; CHECK-O0-NEXT: movq %rsi, 24(%rdi) -; CHECK-O0-NEXT: movq %rdx, 16(%rdi) -; CHECK-O0-NEXT: movq %rcx, 8(%rdi) -; CHECK-O0-NEXT: movq %rax, (%rdi) +; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; CHECK-O0-NEXT: movq %rdi, 24(%r9) +; CHECK-O0-NEXT: movq %rsi, 16(%r9) +; CHECK-O0-NEXT: movq %rdx, 8(%r9) +; CHECK-O0-NEXT: movq %rax, (%r9) ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-O0-NEXT: addq $56, %rsp ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 @@ -831,8 +831,8 @@ define i64 @load_fold_udiv1(i64* %p) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: xorl %ecx, %ecx ; CHECK-O0-NEXT: movl %ecx, %edx -; CHECK-O0-NEXT: movl $15, %ecx -; CHECK-O0-NEXT: divq %rcx +; CHECK-O0-NEXT: movl $15, %esi +; CHECK-O0-NEXT: divq %rsi ; CHECK-O0-NEXT: retq ; ; CHECK-O3-CUR-LABEL: load_fold_udiv1: @@ -1024,8 +1024,8 @@ define i64 @load_fold_urem1(i64* %p) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: xorl %ecx, %ecx ; CHECK-O0-NEXT: movl %ecx, %edx -; CHECK-O0-NEXT: movl $15, %ecx -; CHECK-O0-NEXT: divq %rcx +; CHECK-O0-NEXT: movl $15, %esi +; CHECK-O0-NEXT: divq %rsi ; CHECK-O0-NEXT: movq 
%rdx, %rax ; CHECK-O0-NEXT: retq ; @@ -1475,9 +1475,9 @@ define i1 @load_fold_icmp3(i64* %p1, i64* %p2) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: movq (%rsi), %rcx ; CHECK-O0-NEXT: subq %rcx, %rax -; CHECK-O0-NEXT: sete %cl +; CHECK-O0-NEXT: sete %dl ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-O0-NEXT: movb %cl, %al +; CHECK-O0-NEXT: movb %dl, %al ; CHECK-O0-NEXT: retq ; ; CHECK-O3-CUR-LABEL: load_fold_icmp3: @@ -2076,8 +2076,8 @@ define void @rmw_fold_and1(i64* %p, i64 %v) { ; CHECK-O0-NEXT: movq (%rdi), %rax ; CHECK-O0-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-O0-NEXT: andl $15, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, (%rdi) +; CHECK-O0-NEXT: movl %eax, %ecx +; CHECK-O0-NEXT: movq %rcx, (%rdi) ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: rmw_fold_and1: @@ -2541,8 +2541,9 @@ define i16 @load_i8_anyext_i16(i8* %ptr) { ; CHECK-O0-CUR-LABEL: load_i8_anyext_i16: ; CHECK-O0-CUR: # %bb.0: ; CHECK-O0-CUR-NEXT: movb (%rdi), %al -; CHECK-O0-CUR-NEXT: movzbl %al, %eax -; CHECK-O0-CUR-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-O0-CUR-NEXT: movzbl %al, %ecx +; CHECK-O0-CUR-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-O0-CUR-NEXT: movw %cx, %ax ; CHECK-O0-CUR-NEXT: retq ; ; CHECK-O3-CUR-LABEL: load_i8_anyext_i16: @@ -2670,12 +2671,13 @@ define i16 @load_combine(i8* %p) { ; CHECK-O0: # %bb.0: ; CHECK-O0-NEXT: movb (%rdi), %al ; CHECK-O0-NEXT: movb 1(%rdi), %cl -; CHECK-O0-NEXT: movzbl %al, %eax -; CHECK-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-O0-NEXT: movzbl %cl, %ecx -; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-O0-NEXT: shlw $8, %cx -; CHECK-O0-NEXT: orw %cx, %ax +; CHECK-O0-NEXT: movzbl %al, %edx +; CHECK-O0-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-O0-NEXT: movzbl %cl, %esi +; CHECK-O0-NEXT: # kill: def $si killed $si killed $esi +; CHECK-O0-NEXT: shlw $8, %si +; CHECK-O0-NEXT: orw %si, %dx +; CHECK-O0-NEXT: 
movw %dx, %ax ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: load_combine: diff --git a/llvm/test/CodeGen/X86/atomic32.ll b/llvm/test/CodeGen/X86/atomic32.ll index 3fe5ef8311ce..4fb03356f99f 100644 --- a/llvm/test/CodeGen/X86/atomic32.ll +++ b/llvm/test/CodeGen/X86/atomic32.ll @@ -70,8 +70,8 @@ define void @atomic_fetch_and32() nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -94,8 +94,8 @@ define void @atomic_fetch_and32() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %dl +; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -124,8 +124,8 @@ define void @atomic_fetch_or32() nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: orl $5, %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -148,8 +148,8 @@ define void @atomic_fetch_or32() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: orl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %dl +; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -178,8 +178,8 @@ define void @atomic_fetch_xor32() nounwind { ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: xorl $5, %ecx ; 
X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill @@ -202,8 +202,8 @@ define void @atomic_fetch_xor32() nounwind { ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: xorl $5, %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %dl +; X86-NEXT: testb $1, %dl ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill @@ -234,8 +234,8 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; X64-NEXT: andl %edx, %ecx ; X64-NEXT: notl %ecx ; X64-NEXT: lock cmpxchgl %ecx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 @@ -244,6 +244,7 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; ; X86-LABEL: atomic_fetch_nand32: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl sc32, %ecx @@ -257,13 +258,14 @@ define void @atomic_fetch_nand32(i32 %x) nounwind { ; X86-NEXT: andl %edx, %ecx ; X86-NEXT: notl %ecx ; X86-NEXT: lock cmpxchgl %ecx, sc32 -; X86-NEXT: sete %cl -; X86-NEXT: testb $1, %cl +; X86-NEXT: sete %bl +; X86-NEXT: testb $1, %bl ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: jne .LBB5_2 ; X86-NEXT: jmp .LBB5_1 ; X86-NEXT: .LBB5_2: # %atomicrmw.end ; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %ebx ; X86-NEXT: retl %t1 = atomicrmw nand i32* @sc32, i32 %x acquire ret void @@ -283,8 +285,8 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovgel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; 
X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB6_2 @@ -294,6 +296,7 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_max32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -307,18 +310,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovgel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB6_2 ; X86-CMOV-NEXT: jmp .LBB6_1 ; X86-CMOV-NEXT: .LBB6_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_max32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -347,18 +352,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB6_2 ; X86-NOCMOV-NEXT: jmp .LBB6_1 ; X86-NOCMOV-NEXT: .LBB6_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_max32: ; 
X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -387,14 +394,15 @@ define void @atomic_fetch_max32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB6_2 ; X86-NOX87-NEXT: jmp .LBB6_1 ; X86-NOX87-NEXT: .LBB6_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw max i32* @sc32, i32 %x acquire ret void @@ -414,8 +422,8 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovlel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB7_2 @@ -425,6 +433,7 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_min32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -438,18 +447,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovlel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB7_2 ; X86-CMOV-NEXT: jmp .LBB7_1 ; 
X86-CMOV-NEXT: .LBB7_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_min32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -478,18 +489,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB7_2 ; X86-NOCMOV-NEXT: jmp .LBB7_1 ; X86-NOCMOV-NEXT: .LBB7_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_min32: ; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -518,14 +531,15 @@ define void @atomic_fetch_min32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB7_2 ; X86-NOX87-NEXT: jmp .LBB7_1 ; X86-NOX87-NEXT: .LBB7_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw min i32* @sc32, i32 %x acquire ret void @@ -545,8 +559,8 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmoval %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; 
X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB8_2 @@ -556,6 +570,7 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_umax32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -569,18 +584,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmoval %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB8_2 ; X86-CMOV-NEXT: jmp .LBB8_1 ; X86-CMOV-NEXT: .LBB8_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_umax32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -609,18 +626,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB8_2 ; X86-NOCMOV-NEXT: jmp .LBB8_1 ; X86-NOCMOV-NEXT: .LBB8_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_umax32: 
; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -649,14 +668,15 @@ define void @atomic_fetch_umax32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB8_2 ; X86-NOX87-NEXT: jmp .LBB8_1 ; X86-NOX87-NEXT: .LBB8_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw umax i32* @sc32, i32 %x acquire ret void @@ -676,8 +696,8 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X64-NEXT: subl %edx, %ecx ; X64-NEXT: cmovbel %eax, %edx ; X64-NEXT: lock cmpxchgl %edx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: jne .LBB9_2 @@ -687,6 +707,7 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; ; X86-CMOV-LABEL: atomic_fetch_umin32: ; X86-CMOV: # %bb.0: +; X86-CMOV-NEXT: pushl %ebx ; X86-CMOV-NEXT: subl $12, %esp ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: movl sc32, %ecx @@ -700,18 +721,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-CMOV-NEXT: subl %edx, %ecx ; X86-CMOV-NEXT: cmovbel %eax, %edx ; X86-CMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-CMOV-NEXT: sete %dl -; X86-CMOV-NEXT: testb $1, %dl +; X86-CMOV-NEXT: sete %bl +; X86-CMOV-NEXT: testb $1, %bl ; X86-CMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-CMOV-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-CMOV-NEXT: jne .LBB9_2 ; X86-CMOV-NEXT: jmp 
.LBB9_1 ; X86-CMOV-NEXT: .LBB9_2: # %atomicrmw.end ; X86-CMOV-NEXT: addl $12, %esp +; X86-CMOV-NEXT: popl %ebx ; X86-CMOV-NEXT: retl ; ; X86-NOCMOV-LABEL: atomic_fetch_umin32: ; X86-NOCMOV: # %bb.0: +; X86-NOCMOV-NEXT: pushl %ebx ; X86-NOCMOV-NEXT: pushl %esi ; X86-NOCMOV-NEXT: subl $20, %esp ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -740,18 +763,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-NOCMOV-NEXT: movl %ecx, %eax ; X86-NOCMOV-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOCMOV-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOCMOV-NEXT: sete %dl -; X86-NOCMOV-NEXT: testb $1, %dl +; X86-NOCMOV-NEXT: sete %bl +; X86-NOCMOV-NEXT: testb $1, %bl ; X86-NOCMOV-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOCMOV-NEXT: jne .LBB9_2 ; X86-NOCMOV-NEXT: jmp .LBB9_1 ; X86-NOCMOV-NEXT: .LBB9_2: # %atomicrmw.end ; X86-NOCMOV-NEXT: addl $20, %esp ; X86-NOCMOV-NEXT: popl %esi +; X86-NOCMOV-NEXT: popl %ebx ; X86-NOCMOV-NEXT: retl ; ; X86-NOX87-LABEL: atomic_fetch_umin32: ; X86-NOX87: # %bb.0: +; X86-NOX87-NEXT: pushl %ebx ; X86-NOX87-NEXT: pushl %esi ; X86-NOX87-NEXT: subl $20, %esp ; X86-NOX87-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -780,14 +805,15 @@ define void @atomic_fetch_umin32(i32 %x) nounwind { ; X86-NOX87-NEXT: movl %ecx, %eax ; X86-NOX87-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NOX87-NEXT: lock cmpxchgl %edx, sc32 -; X86-NOX87-NEXT: sete %dl -; X86-NOX87-NEXT: testb $1, %dl +; X86-NOX87-NEXT: sete %bl +; X86-NOX87-NEXT: testb $1, %bl ; X86-NOX87-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOX87-NEXT: jne .LBB9_2 ; X86-NOX87-NEXT: jmp .LBB9_1 ; X86-NOX87-NEXT: .LBB9_2: # %atomicrmw.end ; X86-NOX87-NEXT: addl $20, %esp ; X86-NOX87-NEXT: popl %esi +; X86-NOX87-NEXT: popl %ebx ; X86-NOX87-NEXT: retl %t1 = atomicrmw umin i32* @sc32, i32 %x acquire ret void diff --git a/llvm/test/CodeGen/X86/atomic64.ll b/llvm/test/CodeGen/X86/atomic64.ll index fe7635bdc3ff..0149851ea467 100644 --- a/llvm/test/CodeGen/X86/atomic64.ll 
+++ b/llvm/test/CodeGen/X86/atomic64.ll @@ -137,12 +137,12 @@ define void @atomic_fetch_and64() nounwind { ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $5, %ecx -; X64-NEXT: # kill: def $rcx killed $ecx -; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movl %ecx, %edx +; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB2_2 ; X64-NEXT: jmp .LBB2_1 @@ -202,8 +202,8 @@ define void @atomic_fetch_or64() nounwind { ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: orq $5, %rcx ; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -265,8 +265,8 @@ define void @atomic_fetch_xor64() nounwind { ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: xorq $5, %rcx ; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %dl +; X64-NEXT: testb $1, %dl ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -330,8 +330,8 @@ define void @atomic_fetch_nand64(i64 %x) nounwind { ; X64-NEXT: andq %rdx, %rcx ; X64-NEXT: notq %rcx ; X64-NEXT: lock cmpxchgq %rcx, {{.*}}(%rip) -; X64-NEXT: sete %cl -; X64-NEXT: testb $1, %cl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB5_2 ; X64-NEXT: jmp .LBB5_1 @@ 
-373,8 +373,8 @@ define void @atomic_fetch_max64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovgeq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB6_2 @@ -473,8 +473,8 @@ define void @atomic_fetch_min64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovleq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB7_2 @@ -571,8 +571,8 @@ define void @atomic_fetch_umax64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovaq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB8_2 @@ -669,8 +669,8 @@ define void @atomic_fetch_umin64(i64 %x) nounwind { ; X64-NEXT: subq %rdx, %rcx ; X64-NEXT: cmovbeq %rax, %rdx ; X64-NEXT: lock cmpxchgq %rdx, {{.*}}(%rip) -; X64-NEXT: sete %dl -; X64-NEXT: testb $1, %dl +; X64-NEXT: sete %sil +; X64-NEXT: testb $1, %sil ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: jne .LBB9_2 diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll index f448bfec2ec9..718449d7a771 100644 --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -175,8 +175,8 @@ define void @double_save(<4 x i32> %A, <4 x i32> %B, 
<8 x i32>* %P) nounwind ssp ; CHECK_O0: # %bb.0: ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> @@ -197,8 +197,8 @@ define void @double_save_volatile(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nou ; CHECK_O0: # %bb.0: ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> @@ -239,10 +239,10 @@ define void @f_f() nounwind { ; CHECK_O0-NEXT: .LBB9_3: # %cif_mixed_test_all ; CHECK_O0-NEXT: vmovdqa {{.*#+}} xmm0 = [4294967295,0,0,0] ; CHECK_O0-NEXT: vmovdqa %xmm0, %xmm0 -; CHECK_O0-NEXT: # kill: def $ymm0 killed $xmm0 +; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1 ; CHECK_O0-NEXT: # implicit-def: $rax -; CHECK_O0-NEXT: # implicit-def: $ymm1 -; CHECK_O0-NEXT: vmaskmovps %ymm1, %ymm0, (%rax) +; CHECK_O0-NEXT: # implicit-def: $ymm2 +; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rax) ; CHECK_O0-NEXT: .LBB9_4: # %cif_mixed_test_any_check allocas: br i1 undef, label %cif_mask_all, label %cif_mask_mixed @@ -276,8 +276,8 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind { ; CHECK_O0-NEXT: vmovdqu 16(%rsi), %xmm1 ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %b = load <8 x i32>, <8 
x i32>* %bp, align 1 @@ -321,8 +321,8 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { ; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1 ; CHECK_O0-NEXT: # implicit-def: $ymm2 ; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2 -; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 -; CHECK_O0-NEXT: vmovdqu %ymm0, (%rdi) +; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi) ; CHECK_O0-NEXT: vzeroupper ; CHECK_O0-NEXT: retq %b = load <4 x i64>, <4 x i64>* %bp, align 16 diff --git a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll index 186370ca675c..c4e009d54ec7 100755 --- a/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-zext-bugfix.ll @@ -40,20 +40,22 @@ define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,i8* %arraydecay,i8* %f ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; CHECK-NEXT: vpmovd2m %xmm0, %k0 ; CHECK-NEXT: kmovq %k0, %k1 -; CHECK-NEXT: kmovd %k0, %ecx -; CHECK-NEXT: ## kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: ## kill: def $cx killed $cx killed $ecx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; CHECK-NEXT: movl $4, %edx -; CHECK-NEXT: movl %edx, %esi +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: ## kill: def $sil killed $sil killed $esi +; CHECK-NEXT: movzbl %sil, %edi +; CHECK-NEXT: ## kill: def $di killed $di killed $edi +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: movl $4, %r8d +; CHECK-NEXT: movl %r8d, %esi +; CHECK-NEXT: movl %r8d, %edx ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; CHECK-NEXT: callq _calc_expected_mask_val ; CHECK-NEXT: 
## kill: def $ax killed $ax killed $rax -; CHECK-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx ## 2-byte Reload -; CHECK-NEXT: movzwl %cx, %edi +; CHECK-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %r9w ## 2-byte Reload +; CHECK-NEXT: movzwl %r9w, %edi ; CHECK-NEXT: movzwl %ax, %esi ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload diff --git a/llvm/test/CodeGen/X86/crash-O0.ll b/llvm/test/CodeGen/X86/crash-O0.ll index 9f9e5584d6f2..a93d3dd267b5 100644 --- a/llvm/test/CodeGen/X86/crash-O0.ll +++ b/llvm/test/CodeGen/X86/crash-O0.ll @@ -79,12 +79,11 @@ define i64 @addressModeWith32bitIndex(i32 %V) { ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: ## kill: def $rax killed $eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: cqto -; CHECK-NEXT: movslq %edi, %rcx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload -; CHECK-NEXT: idivq (%rsi,%rcx,8) +; CHECK-NEXT: movslq %edi, %rsi +; CHECK-NEXT: idivq (%rcx,%rsi,8) ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %gep = getelementptr i64, i64* null, i32 %V diff --git a/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll b/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll index 664d9ded1e0e..7d05a869be89 100644 --- a/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll +++ b/llvm/test/CodeGen/X86/extend-set-cc-uses-dbg.ll @@ -7,8 +7,8 @@ define void @foo(i32* %p) !dbg !4 { bb: %tmp = load i32, i32* %p, align 4, !dbg !7 ; CHECK: $eax = MOV32rm killed {{.*}} $rdi, {{.*}} debug-location !7 :: (load 4 from %ir.p) - ; CHECK-NEXT: $rax = KILL killed renamable $eax, debug-location !7 - ; CHECK-NEXT: $rcx = MOV64rr $rax, debug-location !7 + ; CHECK-NEXT: $ecx = MOV32rr killed $eax, implicit-def $rcx, debug-location !7 + ; CHECK-NEXT: $rdx = MOV64rr $rcx, debug-location !7 switch i32 %tmp, label %bb7 
[ i32 0, label %bb1 diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll index 7fffa21f0d24..5d7c83fa19d4 100644 --- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -1013,11 +1013,11 @@ define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt16xfloat: @@ -1067,11 +1067,11 @@ define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt8xdouble: @@ -1121,11 +1121,11 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt64xi8: @@ -1175,11 +1175,11 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt32xi16: @@ -1229,11 +1229,11 @@ define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt16xi32: @@ -1283,11 +1283,11 @@ define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) { ; AVX1-NEXT: vmovaps %xmm0, %xmm1 ; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1 -; AVX1-NEXT: # implicit-def: $ymm2 -; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), 
%xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: # implicit-def: $ymm1 +; AVX1-NEXT: vmovaps %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_load_nt8xi64: diff --git a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll index ff8276f6f1c2..e660f306ef75 100644 --- a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll +++ b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll @@ -117,9 +117,9 @@ if.then: ; preds = %for.body ; X64-NOOPT-NEXT: lfence ; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax ; X64-NOOPT-NEXT: lfence -; X64-NOOPT-NEXT: movl (%rax), %eax +; X64-NOOPT-NEXT: movl (%rax), %edx ; X64-NOOPT-NEXT: lfence -; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NOOPT-NEXT: movl %edx, -{{[0-9]+}}(%rsp) if.end: ; preds = %if.then, %for.body br label %for.inc diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll index ac55e1a1fc65..a1ad7f3c0f53 100644 --- a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll +++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll @@ -69,8 +69,8 @@ define dso_local void @test_zero_ext(%struct.Foo* %f, i32 addrspace(271)* %i) { ; CHECK-O0-LABEL: test_zero_ext: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %edx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, 8(%rcx) +; CHECK-O0-NEXT: movl %eax, %r8d +; CHECK-O0-NEXT: movq %r8, 8(%rcx) ; CHECK-O0-NEXT: jmp use_foo # TAILCALL entry: %0 = addrspacecast i32 addrspace(271)* %i to i32* @@ -125,23 +125,19 @@ entry: ; Test that null can be passed as a 32-bit pointer. 
define dso_local void @test_null_arg(%struct.Foo* %f) { -; CHECK-LABEL: test_null_arg: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $40, %rsp -; CHECK: xorl %edx, %edx -; CHECK-NEXT: callq test_noop1 -; CHECK-NEXT: nop -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: retq -; -; CHECK-O0-LABEL: test_null_arg: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: subq $40, %rsp -; CHECK-O0: xorl %edx, %edx -; CHECK-O0-NEXT: callq test_noop1 -; CHECK-O0-NEXT: nop -; CHECK-O0-NEXT: addq $40, %rsp -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_null_arg: +; ALL: # %bb.0: # %entry +; ALL-NEXT: subq $40, %rsp +; ALL-NEXT: .seh_stackalloc 40 +; ALL-NEXT: .seh_endprologue +; ALL-NEXT: xorl %edx, %edx +; ALL-NEXT: callq test_noop1 +; ALL-NEXT: nop +; ALL-NEXT: addq $40, %rsp +; ALL-NEXT: retq +; ALL-NEXT: .seh_handlerdata +; ALL-NEXT: .text +; ALL-NEXT: .seh_endproc entry: call void @test_noop1(%struct.Foo* %f, i32 addrspace(270)* null) ret void @@ -177,8 +173,8 @@ define void @test_unrecognized2(%struct.Foo* %f, i32 addrspace(271)* %i) { ; CHECK-O0-LABEL: test_unrecognized2: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %edx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movq %rax, 16(%rcx) +; CHECK-O0-NEXT: movl %eax, %r8d +; CHECK-O0-NEXT: movq %r8, 16(%rcx) ; CHECK-O0-NEXT: jmp use_foo # TAILCALL entry: %0 = addrspacecast i32 addrspace(271)* %i to i32 addrspace(9)* @@ -189,16 +185,11 @@ entry: } define i32 @test_load_sptr32(i32 addrspace(270)* %i) { -; CHECK-LABEL: test_load_sptr32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: movl (%rax), %eax -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_load_sptr32: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movslq %ecx, %rax -; CHECK-O0-NEXT: movl (%rax), %eax -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_load_sptr32: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movslq %ecx, %rax +; ALL-NEXT: movl (%rax), %eax +; ALL-NEXT: retq entry: %0 = load i32, i32 addrspace(270)* %i, align 4 ret i32 %0 
@@ -210,11 +201,12 @@ define i32 @test_load_uptr32(i32 addrspace(271)* %i) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl (%rax), %eax ; CHECK-NEXT: retq +; ; CHECK-O0-LABEL: test_load_uptr32: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %ecx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movl (%rax), %eax +; CHECK-O0-NEXT: movl %eax, %edx +; CHECK-O0-NEXT: movl (%rdx), %eax ; CHECK-O0-NEXT: retq entry: %0 = load i32, i32 addrspace(271)* %i, align 4 @@ -222,30 +214,21 @@ entry: } define i32 @test_load_ptr64(i32 addrspace(272)* %i) { -; CHECK-LABEL: test_load_ptr64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl (%rcx), %eax -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_load_ptr64: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movl (%rcx), %eax -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_load_ptr64: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movl (%rcx), %eax +; ALL-NEXT: retq entry: %0 = load i32, i32 addrspace(272)* %i, align 8 ret i32 %0 } define void @test_store_sptr32(i32 addrspace(270)* %s, i32 %i) { -; CHECK-LABEL: test_store_sptr32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: movl %edx, (%rax) -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_store_sptr32: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movslq %ecx, %rax -; CHECK-O0-NEXT: movl %edx, (%rax) -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_store_sptr32: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movslq %ecx, %rax +; ALL-NEXT: movl %edx, (%rax) +; ALL-NEXT: retq entry: store i32 %i, i32 addrspace(270)* %s, align 4 ret void @@ -257,11 +240,12 @@ define void @test_store_uptr32(i32 addrspace(271)* %s, i32 %i) { ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, (%rax) ; CHECK-NEXT: retq +; ; CHECK-O0-LABEL: test_store_uptr32: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl %ecx, %eax -; CHECK-O0-NEXT: # kill: def $rax killed $eax -; CHECK-O0-NEXT: movl %edx, (%rax) +; CHECK-O0-NEXT: movl %eax, %r8d +; CHECK-O0-NEXT: movl %edx, (%r8) ; 
CHECK-O0-NEXT: retq entry: store i32 %i, i32 addrspace(271)* %s, align 4 @@ -269,14 +253,10 @@ entry: } define void @test_store_ptr64(i32 addrspace(272)* %s, i32 %i) { -; CHECK-LABEL: test_store_ptr64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, (%rcx) -; CHECK-NEXT: retq -; CHECK-O0-LABEL: test_store_ptr64: -; CHECK-O0: # %bb.0: # %entry -; CHECK-O0-NEXT: movl %edx, (%rcx) -; CHECK-O0-NEXT: retq +; ALL-LABEL: test_store_ptr64: +; ALL: # %bb.0: # %entry +; ALL-NEXT: movl %edx, (%rcx) +; ALL-NEXT: retq entry: store i32 %i, i32 addrspace(272)* %s, align 8 ret void diff --git a/llvm/test/CodeGen/X86/pr1489.ll b/llvm/test/CodeGen/X86/pr1489.ll index d1148eecb0da..6226ea6caf90 100644 --- a/llvm/test/CodeGen/X86/pr1489.ll +++ b/llvm/test/CodeGen/X86/pr1489.ll @@ -16,9 +16,9 @@ define i32 @quux() nounwind { ; CHECK-NEXT: movl $1082126238, (%eax) ## imm = 0x407FEF9E ; CHECK-NEXT: calll _lrintf ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl @@ -42,9 +42,9 @@ define i32 @foo() nounwind { ; CHECK-NEXT: movl $-1236950581, (%eax) ## imm = 0xB645A1CB ; CHECK-NEXT: calll _lrint ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl @@ -67,9 +67,9 @@ define i32 @bar() nounwind { ; CHECK-NEXT: movl $1082126238, (%eax) ## imm = 0x407FEF9E ; CHECK-NEXT: calll _lrintf ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl @@ -90,9 +90,9 @@ 
define i32 @baz() nounwind { ; CHECK-NEXT: movl $1082126238, (%eax) ## imm = 0x407FEF9E ; CHECK-NEXT: calll _lrintf ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setl %al -; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: setl %cl +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: addl $8, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr27591.ll b/llvm/test/CodeGen/X86/pr27591.ll index 7455584ac698..97ad6814f192 100644 --- a/llvm/test/CodeGen/X86/pr27591.ll +++ b/llvm/test/CodeGen/X86/pr27591.ll @@ -9,9 +9,9 @@ define void @test1(i32 %x) #0 { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: cmpl $0, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: movl %ecx, %edi ; CHECK-NEXT: callq callee1 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq @@ -27,10 +27,10 @@ define void @test2(i32 %x) #0 { ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: cmpl $0, %edi ; CHECK-NEXT: setne %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: negl %ecx +; CHECK-NEXT: movl %ecx, %edi ; CHECK-NEXT: callq callee2 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr30430.ll b/llvm/test/CodeGen/X86/pr30430.ll index e524245daa11..4d40aa09eeab 100644 --- a/llvm/test/CodeGen/X86/pr30430.ll +++ b/llvm/test/CodeGen/X86/pr30430.ll @@ -75,28 +75,28 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: vmovaps %xmm1, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; 
CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] ; CHECK-NEXT: # implicit-def: $ymm3 -; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; CHECK-NEXT: # implicit-def: $zmm2 -; CHECK-NEXT: vmovaps %ymm1, %ymm2 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 -; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %xmm1, %xmm3 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; CHECK-NEXT: # implicit-def: $zmm24 +; CHECK-NEXT: vmovaps %zmm3, %zmm24 +; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm24, %zmm24 +; CHECK-NEXT: vmovaps %zmm24, {{[0-9]+}}(%rsp) ; 
CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/pr30813.ll b/llvm/test/CodeGen/X86/pr30813.ll index 7266c5bd8d01..e3e096bda6c2 100644 --- a/llvm/test/CodeGen/X86/pr30813.ll +++ b/llvm/test/CodeGen/X86/pr30813.ll @@ -1,8 +1,9 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -O0 %s -o - | FileCheck %s ; CHECK: patatino: ; CHECK: .cfi_startproc -; CHECK: movzwl (%rax), %e[[REG0:[abcd]x]] -; CHECK: movq %r[[REG0]], ({{%r[abcd]x}}) +; CHECK: movzwl (%rax), [[REG0:%e[abcd]x]] +; CHECK: movl [[REG0]], %e[[REG1C:[abcd]]]x +; CHECK: movq %r[[REG1C]]x, ({{%r[abcd]x}}) ; CHECK: retq define void @patatino() { diff --git a/llvm/test/CodeGen/X86/pr32241.ll b/llvm/test/CodeGen/X86/pr32241.ll index 1f3d273dfc41..6d628e6962ed 100644 --- a/llvm/test/CodeGen/X86/pr32241.ll +++ b/llvm/test/CodeGen/X86/pr32241.ll @@ -23,14 +23,14 @@ define i32 @_Z3foov() { ; CHECK-NEXT: .LBB0_2: # %lor.end ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload ; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: cmpl %eax, %ecx +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: cmpl %ecx, %edx ; CHECK-NEXT: setl %al ; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: xorl $-1, %eax -; CHECK-NEXT: cmpl $0, %eax +; CHECK-NEXT: movzbl %al, %ecx +; CHECK-NEXT: xorl $-1, %ecx +; CHECK-NEXT: cmpl $0, %ecx ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; CHECK-NEXT: jne .LBB0_4 @@ -42,9 +42,9 @@ define i32 @_Z3foov() { ; CHECK-NEXT: .LBB0_4: # %lor.end5 ; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload ; CHECK-NEXT: andb $1, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movzbl %al, %ecx +; 
CHECK-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: addl $16, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll index 533473663d73..a1041ab889c2 100644 --- a/llvm/test/CodeGen/X86/pr32284.ll +++ b/llvm/test/CodeGen/X86/pr32284.ll @@ -10,28 +10,28 @@ define void @foo() { ; X86-O0-LABEL: foo: ; X86-O0: # %bb.0: # %entry ; X86-O0-NEXT: xorl %eax, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax -; X86-O0-NEXT: xorl %ecx, %ecx +; X86-O0-NEXT: movl %eax, %ecx +; X86-O0-NEXT: xorl %eax, %eax ; X86-O0-NEXT: movzbl c, %edx -; X86-O0-NEXT: subl %edx, %ecx -; X86-O0-NEXT: movslq %ecx, %rcx -; X86-O0-NEXT: subq %rcx, %rax -; X86-O0-NEXT: # kill: def $al killed $al killed $rax -; X86-O0-NEXT: cmpb $0, %al -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: subl %edx, %eax +; X86-O0-NEXT: movslq %eax, %rsi +; X86-O0-NEXT: subq %rsi, %rcx +; X86-O0-NEXT: # kill: def $cl killed $cl killed $rcx +; X86-O0-NEXT: cmpb $0, %cl +; X86-O0-NEXT: setne %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movb %cl, -{{[0-9]+}}(%rsp) ; X86-O0-NEXT: cmpb $0, c -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: movzbl c, %ecx -; X86-O0-NEXT: cmpl %ecx, %eax -; X86-O0-NEXT: setle %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax +; X86-O0-NEXT: setne %cl +; X86-O0-NEXT: xorb $-1, %cl +; X86-O0-NEXT: xorb $-1, %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %eax +; X86-O0-NEXT: movzbl c, %edx +; X86-O0-NEXT: cmpl %edx, %eax +; X86-O0-NEXT: setle %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %eax ; X86-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X86-O0-NEXT: retq ; @@ -63,13 +63,13 @@ define void @foo() { ; 
686-O0-NEXT: xorb $-1, %al ; 686-O0-NEXT: xorb $-1, %al ; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movzbl c, %ecx -; 686-O0-NEXT: cmpl %ecx, %eax +; 686-O0-NEXT: movzbl %al, %ecx +; 686-O0-NEXT: movzbl c, %edx +; 686-O0-NEXT: cmpl %edx, %ecx ; 686-O0-NEXT: setle %al ; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movl %eax, (%esp) +; 686-O0-NEXT: movzbl %al, %ecx +; 686-O0-NEXT: movl %ecx, (%esp) ; 686-O0-NEXT: addl $8, %esp ; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl @@ -126,33 +126,33 @@ define void @f1() { ; X86-O0-NEXT: movabsq $8381627093, %rcx # imm = 0x1F3957AD5 ; X86-O0-NEXT: addq %rcx, %rax ; X86-O0-NEXT: cmpq $0, %rax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movb %al, -{{[0-9]+}}(%rsp) -; X86-O0-NEXT: movl var_5, %eax -; X86-O0-NEXT: xorl $-1, %eax -; X86-O0-NEXT: cmpl $0, %eax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movb %dl, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: movl var_5, %esi +; X86-O0-NEXT: xorl $-1, %esi +; X86-O0-NEXT: cmpl $0, %esi +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %esi +; X86-O0-NEXT: movl %esi, %eax ; X86-O0-NEXT: movslq var_5, %rcx ; X86-O0-NEXT: addq $7093, %rcx # imm = 0x1BB5 ; X86-O0-NEXT: cmpq %rcx, %rax -; X86-O0-NEXT: setg %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: setg %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %esi +; X86-O0-NEXT: movl %esi, %eax ; X86-O0-NEXT: movq %rax, var_57 -; X86-O0-NEXT: movl var_5, %eax -; X86-O0-NEXT: xorl $-1, %eax -; X86-O0-NEXT: cmpl $0, %eax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; 
X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: movl var_5, %esi +; X86-O0-NEXT: xorl $-1, %esi +; X86-O0-NEXT: cmpl $0, %esi +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %esi +; X86-O0-NEXT: movl %esi, %eax ; X86-O0-NEXT: movq %rax, _ZN8struct_210member_2_0E ; X86-O0-NEXT: retq ; @@ -178,17 +178,20 @@ define void @f1() { ; ; 686-O0-LABEL: f1: ; 686-O0: # %bb.0: # %entry -; 686-O0-NEXT: pushl %ebx +; 686-O0-NEXT: pushl %ebp ; 686-O0-NEXT: .cfi_def_cfa_offset 8 -; 686-O0-NEXT: pushl %edi +; 686-O0-NEXT: pushl %ebx ; 686-O0-NEXT: .cfi_def_cfa_offset 12 -; 686-O0-NEXT: pushl %esi +; 686-O0-NEXT: pushl %edi ; 686-O0-NEXT: .cfi_def_cfa_offset 16 +; 686-O0-NEXT: pushl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 20 ; 686-O0-NEXT: subl $1, %esp -; 686-O0-NEXT: .cfi_def_cfa_offset 17 -; 686-O0-NEXT: .cfi_offset %esi, -16 -; 686-O0-NEXT: .cfi_offset %edi, -12 -; 686-O0-NEXT: .cfi_offset %ebx, -8 +; 686-O0-NEXT: .cfi_def_cfa_offset 21 +; 686-O0-NEXT: .cfi_offset %esi, -20 +; 686-O0-NEXT: .cfi_offset %edi, -16 +; 686-O0-NEXT: .cfi_offset %ebx, -12 +; 686-O0-NEXT: .cfi_offset %ebp, -8 ; 686-O0-NEXT: movl var_5, %eax ; 686-O0-NEXT: movl %eax, %ecx ; 686-O0-NEXT: sarl $31, %ecx @@ -214,16 +217,18 @@ define void @f1() { ; 686-O0-NEXT: movl var_5, %edi ; 686-O0-NEXT: subl $-1, %edi ; 686-O0-NEXT: sete %bl -; 686-O0-NEXT: movzbl %bl, %ebx -; 686-O0-NEXT: movl %ebx, _ZN8struct_210member_2_0E +; 686-O0-NEXT: movzbl %bl, %ebp +; 686-O0-NEXT: movl %ebp, _ZN8struct_210member_2_0E ; 686-O0-NEXT: movl $0, _ZN8struct_210member_2_0E+4 ; 686-O0-NEXT: addl $1, %esp -; 686-O0-NEXT: .cfi_def_cfa_offset 16 +; 686-O0-NEXT: .cfi_def_cfa_offset 20 ; 686-O0-NEXT: popl %esi -; 686-O0-NEXT: .cfi_def_cfa_offset 12 +; 686-O0-NEXT: .cfi_def_cfa_offset 16 ; 686-O0-NEXT: popl %edi -; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: .cfi_def_cfa_offset 12 ; 686-O0-NEXT: popl %ebx +; 686-O0-NEXT: 
.cfi_def_cfa_offset 8 +; 686-O0-NEXT: popl %ebp ; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; @@ -305,25 +310,25 @@ define void @f2() { ; X86-O0-NEXT: setne %cl ; X86-O0-NEXT: xorb $-1, %cl ; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %ecx -; X86-O0-NEXT: xorl %ecx, %eax +; X86-O0-NEXT: movzbl %cl, %edx +; X86-O0-NEXT: xorl %edx, %eax ; X86-O0-NEXT: # kill: def $ax killed $ax killed $eax ; X86-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; X86-O0-NEXT: movzbl var_7, %eax -; X86-O0-NEXT: # kill: def $ax killed $ax killed $eax -; X86-O0-NEXT: cmpw $0, %ax -; X86-O0-NEXT: setne %al -; X86-O0-NEXT: xorb $-1, %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: movzbl var_7, %ecx -; X86-O0-NEXT: cmpl %ecx, %eax -; X86-O0-NEXT: sete %al -; X86-O0-NEXT: andb $1, %al -; X86-O0-NEXT: movzbl %al, %eax -; X86-O0-NEXT: # kill: def $ax killed $ax killed $eax -; X86-O0-NEXT: # implicit-def: $rcx -; X86-O0-NEXT: movw %ax, (%rcx) +; X86-O0-NEXT: movzbl var_7, %edx +; X86-O0-NEXT: # kill: def $dx killed $dx killed $edx +; X86-O0-NEXT: cmpw $0, %dx +; X86-O0-NEXT: setne %cl +; X86-O0-NEXT: xorb $-1, %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %esi +; X86-O0-NEXT: movzbl var_7, %edi +; X86-O0-NEXT: cmpl %edi, %esi +; X86-O0-NEXT: sete %cl +; X86-O0-NEXT: andb $1, %cl +; X86-O0-NEXT: movzbl %cl, %esi +; X86-O0-NEXT: # kill: def $si killed $si killed $esi +; X86-O0-NEXT: # implicit-def: $r8 +; X86-O0-NEXT: movw %si, (%r8) ; X86-O0-NEXT: retq ; ; X64-LABEL: f2: @@ -345,33 +350,43 @@ define void @f2() { ; ; 686-O0-LABEL: f2: ; 686-O0: # %bb.0: # %entry +; 686-O0-NEXT: pushl %edi +; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: pushl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 12 ; 686-O0-NEXT: subl $2, %esp -; 686-O0-NEXT: .cfi_def_cfa_offset 6 +; 686-O0-NEXT: .cfi_def_cfa_offset 14 +; 686-O0-NEXT: .cfi_offset %esi, -12 +; 686-O0-NEXT: .cfi_offset %edi, -8 ; 686-O0-NEXT: movzbl var_7, %eax ; 686-O0-NEXT: cmpb $0, var_7 ; 
686-O0-NEXT: setne %cl ; 686-O0-NEXT: xorb $-1, %cl ; 686-O0-NEXT: andb $1, %cl -; 686-O0-NEXT: movzbl %cl, %ecx -; 686-O0-NEXT: xorl %ecx, %eax +; 686-O0-NEXT: movzbl %cl, %edx +; 686-O0-NEXT: xorl %edx, %eax ; 686-O0-NEXT: # kill: def $ax killed $ax killed $eax ; 686-O0-NEXT: movw %ax, (%esp) -; 686-O0-NEXT: movzbl var_7, %eax -; 686-O0-NEXT: # kill: def $ax killed $ax killed $eax -; 686-O0-NEXT: cmpw $0, %ax -; 686-O0-NEXT: setne %al -; 686-O0-NEXT: xorb $-1, %al -; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movzbl var_7, %ecx -; 686-O0-NEXT: cmpl %ecx, %eax -; 686-O0-NEXT: sete %al -; 686-O0-NEXT: andb $1, %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: # kill: def $ax killed $ax killed $eax -; 686-O0-NEXT: # implicit-def: $ecx -; 686-O0-NEXT: movw %ax, (%ecx) +; 686-O0-NEXT: movzbl var_7, %edx +; 686-O0-NEXT: # kill: def $dx killed $dx killed $edx +; 686-O0-NEXT: cmpw $0, %dx +; 686-O0-NEXT: setne %cl +; 686-O0-NEXT: xorb $-1, %cl +; 686-O0-NEXT: andb $1, %cl +; 686-O0-NEXT: movzbl %cl, %esi +; 686-O0-NEXT: movzbl var_7, %edi +; 686-O0-NEXT: cmpl %edi, %esi +; 686-O0-NEXT: sete %cl +; 686-O0-NEXT: andb $1, %cl +; 686-O0-NEXT: movzbl %cl, %esi +; 686-O0-NEXT: # kill: def $si killed $si killed $esi +; 686-O0-NEXT: # implicit-def: $edi +; 686-O0-NEXT: movw %si, (%edi) ; 686-O0-NEXT: addl $2, %esp +; 686-O0-NEXT: .cfi_def_cfa_offset 12 +; 686-O0-NEXT: popl %esi +; 686-O0-NEXT: .cfi_def_cfa_offset 8 +; 686-O0-NEXT: popl %edi ; 686-O0-NEXT: .cfi_def_cfa_offset 4 ; 686-O0-NEXT: retl ; @@ -431,35 +446,35 @@ define void @f3() #0 { ; X86-O0-NEXT: movl var_13, %eax ; X86-O0-NEXT: xorl $-1, %eax ; X86-O0-NEXT: movl %eax, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: movl %eax, %ecx ; X86-O0-NEXT: cmpl $0, var_13 -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: xorb $-1, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %ecx -; X86-O0-NEXT: # kill: def $rcx killed $ecx -; X86-O0-NEXT: movl var_13, %edx -; 
X86-O0-NEXT: xorl $-1, %edx -; X86-O0-NEXT: xorl var_16, %edx -; X86-O0-NEXT: movl %edx, %edx -; X86-O0-NEXT: # kill: def $rdx killed $edx -; X86-O0-NEXT: andq %rdx, %rcx -; X86-O0-NEXT: orq %rcx, %rax -; X86-O0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %eax +; X86-O0-NEXT: movl %eax, %esi ; X86-O0-NEXT: movl var_13, %eax ; X86-O0-NEXT: xorl $-1, %eax +; X86-O0-NEXT: xorl var_16, %eax ; X86-O0-NEXT: movl %eax, %eax -; X86-O0-NEXT: # kill: def $rax killed $eax +; X86-O0-NEXT: movl %eax, %edi +; X86-O0-NEXT: andq %rdi, %rsi +; X86-O0-NEXT: orq %rsi, %rcx +; X86-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: movl var_13, %eax +; X86-O0-NEXT: xorl $-1, %eax +; X86-O0-NEXT: movl %eax, %eax +; X86-O0-NEXT: movl %eax, %ecx ; X86-O0-NEXT: cmpl $0, var_13 -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: xorb $-1, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %ecx -; X86-O0-NEXT: # kill: def $rcx killed $ecx -; X86-O0-NEXT: andq $0, %rcx -; X86-O0-NEXT: orq %rcx, %rax -; X86-O0-NEXT: # kill: def $eax killed $eax killed $rax -; X86-O0-NEXT: movl %eax, var_46 +; X86-O0-NEXT: setne %dl +; X86-O0-NEXT: xorb $-1, %dl +; X86-O0-NEXT: andb $1, %dl +; X86-O0-NEXT: movzbl %dl, %eax +; X86-O0-NEXT: movl %eax, %esi +; X86-O0-NEXT: andq $0, %rsi +; X86-O0-NEXT: orq %rsi, %rcx +; X86-O0-NEXT: # kill: def $ecx killed $ecx killed $rcx +; X86-O0-NEXT: movl %ecx, var_46 ; X86-O0-NEXT: retq ; ; X64-LABEL: f3: @@ -484,28 +499,31 @@ define void @f3() #0 { ; 686-O0-NEXT: .cfi_offset %ebp, -8 ; 686-O0-NEXT: movl %esp, %ebp ; 686-O0-NEXT: .cfi_def_cfa_register %ebp +; 686-O0-NEXT: pushl %edi ; 686-O0-NEXT: pushl %esi ; 686-O0-NEXT: andl $-8, %esp -; 686-O0-NEXT: subl $16, %esp -; 686-O0-NEXT: .cfi_offset %esi, -12 +; 686-O0-NEXT: subl $8, %esp +; 686-O0-NEXT: .cfi_offset %esi, -16 +; 686-O0-NEXT: .cfi_offset %edi, -12 ; 686-O0-NEXT: movl var_13, %eax ; 686-O0-NEXT: movl %eax, 
%ecx ; 686-O0-NEXT: notl %ecx ; 686-O0-NEXT: testl %eax, %eax -; 686-O0-NEXT: sete %al -; 686-O0-NEXT: movzbl %al, %eax -; 686-O0-NEXT: movl var_16, %edx -; 686-O0-NEXT: movl %ecx, %esi -; 686-O0-NEXT: xorl %edx, %esi -; 686-O0-NEXT: andl %esi, %eax +; 686-O0-NEXT: sete %dl +; 686-O0-NEXT: movzbl %dl, %eax +; 686-O0-NEXT: movl var_16, %esi +; 686-O0-NEXT: movl %ecx, %edi +; 686-O0-NEXT: xorl %esi, %edi +; 686-O0-NEXT: andl %edi, %eax ; 686-O0-NEXT: orl %eax, %ecx ; 686-O0-NEXT: movl %ecx, (%esp) ; 686-O0-NEXT: movl $0, {{[0-9]+}}(%esp) ; 686-O0-NEXT: movl var_13, %eax ; 686-O0-NEXT: notl %eax ; 686-O0-NEXT: movl %eax, var_46 -; 686-O0-NEXT: leal -4(%ebp), %esp +; 686-O0-NEXT: leal -8(%ebp), %esp ; 686-O0-NEXT: popl %esi +; 686-O0-NEXT: popl %edi ; 686-O0-NEXT: popl %ebp ; 686-O0-NEXT: .cfi_def_cfa %esp, 4 ; 686-O0-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr32340.ll b/llvm/test/CodeGen/X86/pr32340.ll index 98685b959f64..1e428ac7d83a 100644 --- a/llvm/test/CodeGen/X86/pr32340.ll +++ b/llvm/test/CodeGen/X86/pr32340.ll @@ -14,37 +14,37 @@ define void @foo() { ; X64-LABEL: foo: ; X64: # %bb.0: # %entry ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: # kill: def $rax killed $eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movw $0, var_825 -; X64-NEXT: movzwl var_32, %ecx +; X64-NEXT: movzwl var_32, %eax ; X64-NEXT: movzwl var_901, %edx -; X64-NEXT: movl %ecx, %esi +; X64-NEXT: movl %eax, %esi ; X64-NEXT: xorl %edx, %esi -; X64-NEXT: movl %ecx, %edx +; X64-NEXT: movl %eax, %edx ; X64-NEXT: xorl %esi, %edx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movslq %edx, %rcx -; X64-NEXT: movq %rcx, var_826 -; X64-NEXT: movzwl var_32, %ecx -; X64-NEXT: # kill: def $rcx killed $ecx -; X64-NEXT: movzwl var_901, %edx -; X64-NEXT: xorl $51981, %edx # imm = 0xCB0D -; X64-NEXT: movslq %edx, %rdx -; X64-NEXT: movabsq $-1142377792914660288, %rsi # imm = 0xF02575732E06E440 -; X64-NEXT: xorq %rsi, %rdx -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: xorq %rdx, %rsi -; X64-NEXT: xorq $-1, %rsi -; 
X64-NEXT: xorq %rsi, %rcx -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: orq var_57, %rdx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: # kill: def $cx killed $cx killed $rcx -; X64-NEXT: movw %cx, var_900 -; X64-NEXT: cmpq var_28, %rax -; X64-NEXT: setne %al -; X64-NEXT: andb $1, %al -; X64-NEXT: movzbl %al, %eax +; X64-NEXT: addl %eax, %edx +; X64-NEXT: movslq %edx, %rdi +; X64-NEXT: movq %rdi, var_826 +; X64-NEXT: movzwl var_32, %eax +; X64-NEXT: movl %eax, %edi +; X64-NEXT: movzwl var_901, %eax +; X64-NEXT: xorl $51981, %eax # imm = 0xCB0D +; X64-NEXT: movslq %eax, %r8 +; X64-NEXT: movabsq $-1142377792914660288, %r9 # imm = 0xF02575732E06E440 +; X64-NEXT: xorq %r9, %r8 +; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: xorq %r8, %r9 +; X64-NEXT: xorq $-1, %r9 +; X64-NEXT: xorq %r9, %rdi +; X64-NEXT: movq %rdi, %r8 +; X64-NEXT: orq var_57, %r8 +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: # kill: def $di killed $di killed $rdi +; X64-NEXT: movw %di, var_900 +; X64-NEXT: cmpq var_28, %rcx +; X64-NEXT: setne %r10b +; X64-NEXT: andb $1, %r10b +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: movw %ax, var_827 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll index 165e0292d464..d5f7fde77f6d 100644 --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -15,23 +15,23 @@ define void @foo() { ; X640-NEXT: xorl %ecx, %eax ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X640-NEXT: movslq %eax, %rdx +; X640-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; X640-NEXT: movzwl var_22, %eax ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: xorl %ecx, %eax ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: subl $16610, %ecx # imm = 0x40E2 -; X640-NEXT: movl %ecx, %ecx -; X640-NEXT: # kill: def $rcx killed $ecx +; X640-NEXT: movslq %eax, %rdx +; 
X640-NEXT: movzwl var_27, %eax +; X640-NEXT: subl $16610, %eax # imm = 0x40E2 +; X640-NEXT: movl %eax, %eax +; X640-NEXT: movl %eax, %ecx ; X640-NEXT: # kill: def $cl killed $rcx -; X640-NEXT: sarq %cl, %rax -; X640-NEXT: # kill: def $al killed $al killed $rax -; X640-NEXT: # implicit-def: $rcx -; X640-NEXT: movb %al, (%rcx) +; X640-NEXT: sarq %cl, %rdx +; X640-NEXT: # kill: def $dl killed $dl killed $rdx +; X640-NEXT: # implicit-def: $rsi +; X640-NEXT: movb %dl, (%rsi) ; X640-NEXT: retq ; ; 6860-LABEL: foo: @@ -41,37 +41,43 @@ define void @foo() { ; 6860-NEXT: .cfi_offset %ebp, -8 ; 6860-NEXT: movl %esp, %ebp ; 6860-NEXT: .cfi_def_cfa_register %ebp +; 6860-NEXT: pushl %ebx +; 6860-NEXT: pushl %edi +; 6860-NEXT: pushl %esi ; 6860-NEXT: andl $-8, %esp -; 6860-NEXT: subl $24, %esp +; 6860-NEXT: subl $32, %esp +; 6860-NEXT: .cfi_offset %esi, -20 +; 6860-NEXT: .cfi_offset %edi, -16 +; 6860-NEXT: .cfi_offset %ebx, -12 ; 6860-NEXT: movw var_22, %ax ; 6860-NEXT: movzwl var_27, %ecx ; 6860-NEXT: movw %cx, %dx ; 6860-NEXT: xorw %dx, %ax -; 6860-NEXT: # implicit-def: $edx -; 6860-NEXT: movw %ax, %dx -; 6860-NEXT: xorl %ecx, %edx -; 6860-NEXT: # kill: def $dx killed $dx killed $edx -; 6860-NEXT: movzwl %dx, %eax -; 6860-NEXT: movl %eax, {{[0-9]+}}(%esp) +; 6860-NEXT: # implicit-def: $esi +; 6860-NEXT: movw %ax, %si +; 6860-NEXT: xorl %ecx, %esi +; 6860-NEXT: # kill: def $si killed $si killed $esi +; 6860-NEXT: movzwl %si, %ecx +; 6860-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; 6860-NEXT: movl $0, {{[0-9]+}}(%esp) ; 6860-NEXT: movw var_22, %ax ; 6860-NEXT: movzwl var_27, %ecx ; 6860-NEXT: movw %cx, %dx ; 6860-NEXT: xorw %dx, %ax -; 6860-NEXT: # implicit-def: $edx -; 6860-NEXT: movw %ax, %dx -; 6860-NEXT: xorl %ecx, %edx -; 6860-NEXT: # kill: def $dx killed $dx killed $edx -; 6860-NEXT: movzwl %dx, %eax +; 6860-NEXT: # implicit-def: $edi +; 6860-NEXT: movw %ax, %di +; 6860-NEXT: xorl %ecx, %edi +; 6860-NEXT: # kill: def $di killed $di killed $edi +; 6860-NEXT: movzwl %di, %ebx ; 
6860-NEXT: # kill: def $cl killed $cl killed $ecx ; 6860-NEXT: addb $30, %cl -; 6860-NEXT: xorl %edx, %edx +; 6860-NEXT: xorl %eax, %eax ; 6860-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; 6860-NEXT: shrdl %cl, %edx, %eax +; 6860-NEXT: shrdl %cl, %eax, %ebx ; 6860-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload ; 6860-NEXT: testb $32, %cl +; 6860-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 6860-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 6860-NEXT: jne .LBB0_2 ; 6860-NEXT: # %bb.1: # %bb ; 6860-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -81,7 +87,10 @@ define void @foo() { ; 6860-NEXT: # kill: def $al killed $al killed $eax ; 6860-NEXT: # implicit-def: $ecx ; 6860-NEXT: movb %al, (%ecx) -; 6860-NEXT: movl %ebp, %esp +; 6860-NEXT: leal -12(%ebp), %esp +; 6860-NEXT: popl %esi +; 6860-NEXT: popl %edi +; 6860-NEXT: popl %ebx ; 6860-NEXT: popl %ebp ; 6860-NEXT: .cfi_def_cfa %esp, 4 ; 6860-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr32451.ll b/llvm/test/CodeGen/X86/pr32451.ll index 3b1997234ce5..4754d8e4cf6c 100644 --- a/llvm/test/CodeGen/X86/pr32451.ll +++ b/llvm/test/CodeGen/X86/pr32451.ll @@ -9,24 +9,29 @@ target triple = "x86_64-unknown-linux-gnu" define i8** @japi1_convert_690(i8**, i8***, i32) { ; CHECK-LABEL: japi1_convert_690: ; CHECK: # %bb.0: # %top +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: subl $16, %esp -; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %ebx, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill ; CHECK-NEXT: calll julia.gc_root_decl -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill ; CHECK-NEXT: calll jl_get_ptls_states -; 
CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload ; CHECK-NEXT: movl 4(%ecx), %edx -; CHECK-NEXT: movb (%edx), %dl -; CHECK-NEXT: andb $1, %dl -; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: movb (%edx), %bl +; CHECK-NEXT: andb $1, %bl +; CHECK-NEXT: movzbl %bl, %edx ; CHECK-NEXT: movl %edx, (%esp) -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill ; CHECK-NEXT: calll jl_box_int32 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload ; CHECK-NEXT: movl %eax, (%ecx) ; CHECK-NEXT: addl $16, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: popl %ebx ; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl top: diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll index 25b068c8fad6..0f73036a4c6c 100644 --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -10,7 +10,7 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp -; CHECK-NEXT: subq $160, %rsp +; CHECK-NEXT: subq $192, %rsp ; CHECK-NEXT: vmovaps 240(%rbp), %ymm8 ; CHECK-NEXT: vmovaps 208(%rbp), %ymm9 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm10 @@ -27,14 +27,14 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,0] ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; CHECK-NEXT: vmovaps %xmm7, %xmm2 -; CHECK-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; CHECK-NEXT: # implicit-def: $ymm9 -; CHECK-NEXT: vmovaps %xmm2, 
%xmm9 -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; CHECK-NEXT: vmovaps %xmm7, %xmm9 +; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7] +; CHECK-NEXT: # implicit-def: $ymm2 +; CHECK-NEXT: vmovaps %xmm9, %xmm2 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] +; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; CHECK-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,1,3] ; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5] @@ -43,11 +43,14 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-NEXT: vmovq {{.*#+}} xmm7 = xmm7[0],zero ; CHECK-NEXT: # implicit-def: $ymm8 ; CHECK-NEXT: vmovaps %xmm7, %xmm8 -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[0,1],ymm6[0,1] +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[0,1],ymm6[0,1] ; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm5, %ymm1 +; CHECK-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm6, %ymm2 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; CHECK-NEXT: vmovaps %ymm3, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm9, %ymm3 +; CHECK-NEXT: vmovaps %ymm5, %ymm3 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr39733.ll 
b/llvm/test/CodeGen/X86/pr39733.ll index 75f9dc51b85e..4c7153852d22 100644 --- a/llvm/test/CodeGen/X86/pr39733.ll +++ b/llvm/test/CodeGen/X86/pr39733.ll @@ -23,8 +23,8 @@ define void @test55() { ; CHECK-NEXT: vmovaps %xmm1, %xmm2 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rsp) +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; CHECK-NEXT: vmovdqa %ymm2, (%rsp) ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/pr44749.ll b/llvm/test/CodeGen/X86/pr44749.ll index 1012d8c723b1..d465009c7c38 100644 --- a/llvm/test/CodeGen/X86/pr44749.ll +++ b/llvm/test/CodeGen/X86/pr44749.ll @@ -14,22 +14,20 @@ define i32 @a() { ; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _b ; CHECK-NEXT: cvtsi2sd %eax, %xmm0 -; CHECK-NEXT: movq _calloc@{{.*}}(%rip), %rax -; CHECK-NEXT: subq $-1, %rax -; CHECK-NEXT: setne %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: ## kill: def $rcx killed $ecx -; CHECK-NEXT: leaq {{.*}}(%rip), %rdx +; CHECK-NEXT: movq _calloc@{{.*}}(%rip), %rcx +; CHECK-NEXT: subq $-1, %rcx +; CHECK-NEXT: setne %dl +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: leaq {{.*}}(%rip), %rdi ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: ucomisd %xmm1, %xmm0 -; CHECK-NEXT: setae %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: ## kill: def $rcx killed $ecx -; CHECK-NEXT: leaq {{.*}}(%rip), %rdx +; CHECK-NEXT: setae %dl +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: leaq {{.*}}(%rip), %rdi ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: cvttsd2si %xmm0, %ecx -; CHECK-NEXT: movq %rax, (%rsp) ## 8-byte Spill -; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: cvttsd2si %xmm0, %eax ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: retq entry: diff --git 
a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll index 083aa780a07c..922b6403cc4f 100755 --- a/llvm/test/CodeGen/X86/pr47000.ll +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -12,47 +12,51 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $124, %esp -; CHECK-NEXT: movl 144(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movw 176(%esp), %dx -; CHECK-NEXT: movw 172(%esp), %si -; CHECK-NEXT: movw 168(%esp), %di -; CHECK-NEXT: movw 164(%esp), %bx -; CHECK-NEXT: movw 160(%esp), %bp +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %si +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %di +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bx +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bp +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload +; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %bp # 2-byte Reload +; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %si, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %di, {{[0-9]+}}(%esp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movw 156(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 
2-byte Spill -; CHECK-NEXT: movw 152(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw 148(%esp), %ax -; CHECK-NEXT: movw %ax, 112(%esp) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload -; CHECK-NEXT: movw %ax, 114(%esp) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload -; CHECK-NEXT: movw %ax, 116(%esp) -; CHECK-NEXT: movw %bp, 118(%esp) -; CHECK-NEXT: movw %dx, 110(%esp) -; CHECK-NEXT: movw %si, 108(%esp) -; CHECK-NEXT: movw %di, 106(%esp) -; CHECK-NEXT: movw %bx, 104(%esp) -; CHECK-NEXT: movzwl 118(%esp), %edx -; CHECK-NEXT: movzwl 116(%esp), %esi -; CHECK-NEXT: movzwl 114(%esp), %edi -; CHECK-NEXT: movzwl 112(%esp), %ebx -; CHECK-NEXT: movzwl 110(%esp), %ebp -; CHECK-NEXT: movzwl 108(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl 106(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl 104(%esp), %eax +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ebx, (%eax) ; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: calll 
__gnu_h2f_ieee ; CHECK-NEXT: movl %esp, %eax ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -68,58 +72,58 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: fstps (%eax) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl %esp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps 4(%ecx) ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl %esp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: fstpt 
{{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps 4(%ecx) ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl %esp, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: movl %edx, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, (%ecx) ; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill ; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps 4(%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps 4(%ecx) ; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) +; CHECK-NEXT: movl %esp, %ecx +; CHECK-NEXT: fstps (%ecx) ; CHECK-NEXT: calll __gnu_f2h_ieee ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; CHECK-NEXT: movw %ax, 6(%ecx) @@ -127,9 +131,10 @@ define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { ; CHECK-NEXT: movw %ax, 4(%ecx) ; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload ; CHECK-NEXT: movw %dx, 2(%ecx) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %si # 
2-byte Reload -; CHECK-NEXT: movw %si, (%ecx) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %bp # 2-byte Reload +; CHECK-NEXT: movw %bp, (%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: addl $124, %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir b/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir index 2821f00940ec..0fe9f60897fd 100644 --- a/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir +++ b/llvm/test/CodeGen/X86/regalloc-fast-missing-live-out-spill.mir @@ -23,15 +23,15 @@ body: | ; CHECK: successors: %bb.3(0x80000000) ; CHECK: $rax = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load 8 from %stack.1) ; CHECK: renamable $ecx = MOV32r0 implicit-def $eflags - ; CHECK: renamable $rcx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit + ; CHECK: renamable $rdx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit ; CHECK: MOV64mi32 killed renamable $rax, 1, $noreg, 0, $noreg, 0 :: (volatile store 8) - ; CHECK: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed $rcx :: (store 8 into %stack.0) + ; CHECK: MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed $rdx :: (store 8 into %stack.0) ; CHECK: bb.3: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK: $rax = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load 8 from %stack.0) ; CHECK: renamable $ecx = MOV32r0 implicit-def dead $eflags - ; CHECK: renamable $rcx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit - ; CHECK: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed $rcx :: (store 8 into %stack.1) + ; CHECK: renamable $rdx = SUBREG_TO_REG 0, killed renamable $ecx, %subreg.sub_32bit + ; CHECK: MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed $rdx :: (store 8 into %stack.1) ; CHECK: JMP64r killed renamable $rax bb.0: liveins: $edi, $rsi diff --git 
a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll index 4934419055ac..c62e92f2cac5 100644 --- a/llvm/test/CodeGen/X86/swift-return.ll +++ b/llvm/test/CodeGen/X86/swift-return.ll @@ -28,10 +28,11 @@ define i16 @test(i32 %key) { ; CHECK-O0-NEXT: movl %edi, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-O0-NEXT: callq gen -; CHECK-O0-NEXT: cwtl -; CHECK-O0-NEXT: movsbl %dl, %ecx -; CHECK-O0-NEXT: addl %ecx, %eax -; CHECK-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-O0-NEXT: movswl %ax, %ecx +; CHECK-O0-NEXT: movsbl %dl, %esi +; CHECK-O0-NEXT: addl %esi, %ecx +; CHECK-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-O0-NEXT: movw %cx, %ax ; CHECK-O0-NEXT: popq %rcx ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq @@ -79,16 +80,16 @@ define i32 @test2(i32 %key) #0 { ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edi ; CHECK-O0-NEXT: movq %rsp, %rax ; CHECK-O0-NEXT: callq gen2 -; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-O0-NEXT: movl (%rsp), %esi -; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-O0-NEXT: addl %edi, %esi -; CHECK-O0-NEXT: addl %edx, %esi -; CHECK-O0-NEXT: addl %ecx, %esi -; CHECK-O0-NEXT: addl %eax, %esi -; CHECK-O0-NEXT: movl %esi, %eax +; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %esi +; CHECK-O0-NEXT: movl (%rsp), %edi +; CHECK-O0-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-O0-NEXT: addl %r8d, %edi +; CHECK-O0-NEXT: addl %esi, %edi +; CHECK-O0-NEXT: addl %edx, %edi +; CHECK-O0-NEXT: addl %ecx, %edi +; CHECK-O0-NEXT: movl %edi, %eax ; CHECK-O0-NEXT: addq $24, %rsp ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq @@ -263,17 +264,17 @@ define void @consume_i1_ret() { ; CHECK-O0-NEXT: .cfi_def_cfa_offset 16 ; CHECK-O0-NEXT: callq produce_i1_ret ; CHECK-O0-NEXT: andb $1, %al -; CHECK-O0-NEXT: movzbl %al, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl 
%al, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: andb $1, %dl -; CHECK-O0-NEXT: movzbl %dl, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %dl, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: andb $1, %cl -; CHECK-O0-NEXT: movzbl %cl, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %cl, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: andb $1, %r8b -; CHECK-O0-NEXT: movzbl %r8b, %eax -; CHECK-O0-NEXT: movl %eax, var +; CHECK-O0-NEXT: movzbl %r8b, %esi +; CHECK-O0-NEXT: movl %esi, var ; CHECK-O0-NEXT: popq %rax ; CHECK-O0-NEXT: .cfi_def_cfa_offset 8 ; CHECK-O0-NEXT: retq diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index 1afae31b2b8d..1388c61c1898 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -790,8 +790,8 @@ a: ; CHECK-O0-LABEL: testAssign4 ; CHECK-O0: callq _foo2 ; CHECK-O0: xorl %eax, %eax -; CHECK-O0: ## kill: def $rax killed $eax -; CHECK-O0: movq %rax, [[SLOT:[-a-z0-9\(\)\%]*]] +; CHECK-O0: movl %eax, %ecx +; CHECK-O0: movq %rcx, [[SLOT:[-a-z0-9\(\)\%]*]] ; CHECK-O0: movq [[SLOT]], %rax ; CHECK-O0: movq %rax, [[SLOT2:[-a-z0-9\(\)\%]*]] ; CHECK-O0: movq [[SLOT2]], %r12 diff --git a/llvm/test/DebugInfo/X86/op_deref.ll b/llvm/test/DebugInfo/X86/op_deref.ll index 1b49dc554f7e..5de9976d6de2 100644 --- a/llvm/test/DebugInfo/X86/op_deref.ll +++ b/llvm/test/DebugInfo/X86/op_deref.ll @@ -6,10 +6,10 @@ ; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=DWARF3 ; DWARF4: DW_AT_location [DW_FORM_sec_offset] (0x00000000 -; DWARF4-NEXT: {{.*}}: DW_OP_breg2 RCX+0, DW_OP_deref +; DWARF4-NEXT: {{.*}}: DW_OP_breg1 RDX+0, DW_OP_deref ; DWARF3: DW_AT_location [DW_FORM_data4] (0x00000000 -; DWARF3-NEXT: {{.*}}: DW_OP_breg2 RCX+0, DW_OP_deref +; DWARF3-NEXT: {{.*}}: DW_OP_breg1 RDX+0, DW_OP_deref ; CHECK-NOT: DW_TAG ; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla") @@ -17,8 +17,8 @@ ; Check the DEBUG_VALUE comments 
for good measure. ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o - -filetype=asm | FileCheck %s -check-prefix=ASM-CHECK ; vla should have a register-indirect address at one point. -; ASM-CHECK: DEBUG_VALUE: vla <- [DW_OP_deref] [$rcx+0] -; ASM-CHECK: DW_OP_breg2 +; ASM-CHECK: DEBUG_VALUE: vla <- [DW_OP_deref] [$rdx+0] +; ASM-CHECK: DW_OP_breg1 ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s --check-prefix=PRETTY-PRINT ; PRETTY-PRINT: DIExpression(DW_OP_deref) From 158581772fc8f3d6c601ceba14a08285e46cb7e9 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 15 Sep 2020 20:18:00 +0200 Subject: [PATCH 247/363] ReleaseNotes: PowerPC changes By Ahsan Saghir! --- llvm/docs/ReleaseNotes.rst | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index e87bf3d146f5..977ba26f9e23 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -188,7 +188,41 @@ During this release ... Changes to the PowerPC Target ----------------------------- -During this release ... 
+Optimization: + +* Improved Loop Unroll-and-Jam legality checks, allowing it to handle more than two level loop nests +* Improved Loop Unroll to be able to unroll more loops +* Implemented an option to allow loop fusion to work on loops with different constant trip counts + +Codegen: + +* POWER10 support +* Added PC Relative addressing +* Added __int128 vector bool support +* Security enhancement via probe-stack attribute support to protect against stack clash +* Floating point support enhancements +* Improved half precision and quad precision support, including GLIBC +* constrained FP operation support for arithmetic/rounding/max/min +* cleaning up fast math flags checks in DAGCombine, Legalizer, and Lowering +* Performance improvements from instruction exploitation, especially for vector permute on LE +* Scheduling enhancements +* Added MacroFusion for POWER8 +* Added post-ra heuristics for POWER9 +* Target dependent passes tuning +* Updated LoopStrengthReduce to use instruction number as first priority +* Enhanced MachineCombiner to expose more ILP +* Code quality and maintenance enhancements +* Enabled more machine verification passes +* Added ability to parse and emit additional extended mnemonics +* Numerous bug fixes + +AIX Support Improvements: + +* Enabled compile and link such that a simple "Hello World" program works with standard headers +* Added support for the C calling convention for non-vector code +* Implemented correct stack frame layout for functions +* In llvm-objdump, added support for relocations, improved selection of symbol labels, and added the --symbol-description option + Changes to the RISC-V Target ---------------------------- From 8f2c29681ce768afb739b6cf5ccca81dd87d5326 Mon Sep 17 00:00:00 2001 From: Richard Barton Date: Wed, 16 Sep 2020 08:18:08 +0100 Subject: [PATCH 248/363] [flang] Fix docs build Apply a local fix to an issue with recommonmark's AutoStructify extension when used with certain versions of sphinx. 
See https://github.com/readthedocs/recommonmark/issues/93 Reviewed By: hans Differential Revision: https://reviews.llvm.org/D87714 (cherry picked from commit af56be339f8c9660747794cc6755384154602535) --- flang/docs/conf.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/flang/docs/conf.py b/flang/docs/conf.py index 21362fc3449e..f5eb283a186a 100644 --- a/flang/docs/conf.py +++ b/flang/docs/conf.py @@ -50,6 +50,17 @@ # Setup AutoStructify for inline .rst toctrees in index.md from recommonmark.transform import AutoStructify + + # Stolen from https://github.com/readthedocs/recommonmark/issues/93 + # Monkey patch to fix recommonmark 0.4 doc reference issues. + from recommonmark.states import DummyStateMachine + orig_run_role = DummyStateMachine.run_role + def run_role(self, name, options=None, content=None): + if name == 'doc': + name = 'any' + return orig_run_role(self, name, options, content) + DummyStateMachine.run_role = run_role + def setup(app): # Disable inline math to avoid # https://github.com/readthedocs/recommonmark/issues/120 in Extensions.md From 4a26e3b33798424dc5a4843f7b29a617bef81656 Mon Sep 17 00:00:00 2001 From: Adam Czachorowski Date: Tue, 15 Sep 2020 20:13:00 +0200 Subject: [PATCH 249/363] [clangd] Actually parse Index section of the YAML file. This fixes a bug in dbf486c0de92c76df77c1a1f815cf16533ecbb3a, which introduced the Index section of the config, but did not register the parse method, so it didn't work in a YAML file (but did in a test). 
Differential Revision: https://reviews.llvm.org/D87710 (cherry picked from commit 7029e5d4ca20d20982da8efe89de27acd8d7d75b) --- clang-tools-extra/clangd/ConfigYAML.cpp | 1 + .../clangd/unittests/ConfigYAMLTests.cpp | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clangd/ConfigYAML.cpp b/clang-tools-extra/clangd/ConfigYAML.cpp index 16639f6649c2..9988fe376648 100644 --- a/clang-tools-extra/clangd/ConfigYAML.cpp +++ b/clang-tools-extra/clangd/ConfigYAML.cpp @@ -38,6 +38,7 @@ class Parser { DictParser Dict("Config", this); Dict.handle("If", [&](Node &N) { parse(F.If, N); }); Dict.handle("CompileFlags", [&](Node &N) { parse(F.CompileFlags, N); }); + Dict.handle("Index", [&](Node &N) { parse(F.Index, N); }); Dict.parse(N); return !(N.failed() || HadError); } diff --git a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp index a9526ce2367c..27b1c0cfc56d 100644 --- a/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp +++ b/clang-tools-extra/clangd/unittests/ConfigYAMLTests.cpp @@ -47,16 +47,21 @@ CompileFlags: { Add: [foo, bar] } Add: | b az +--- +Index: + Background: Skip )yaml"; auto Results = Fragment::parseYAML(YAML, "config.yaml", Diags.callback()); EXPECT_THAT(Diags.Diagnostics, IsEmpty()); - ASSERT_EQ(Results.size(), 2u); - EXPECT_FALSE(Results.front().If.HasUnrecognizedCondition); - EXPECT_THAT(Results.front().If.PathMatch, ElementsAre(Val("abc"))); - EXPECT_THAT(Results.front().CompileFlags.Add, - ElementsAre(Val("foo"), Val("bar"))); + ASSERT_EQ(Results.size(), 3u); + EXPECT_FALSE(Results[0].If.HasUnrecognizedCondition); + EXPECT_THAT(Results[0].If.PathMatch, ElementsAre(Val("abc"))); + EXPECT_THAT(Results[0].CompileFlags.Add, ElementsAre(Val("foo"), Val("bar"))); + + EXPECT_THAT(Results[1].CompileFlags.Add, ElementsAre(Val("b\naz\n"))); - EXPECT_THAT(Results.back().CompileFlags.Add, ElementsAre(Val("b\naz\n"))); + 
ASSERT_TRUE(Results[2].Index.Background); + EXPECT_EQ("Skip", *Results[2].Index.Background.getValue()); } TEST(ParseYAML, Locations) { From 339a0e2d114ecd15eb7289425851e356af7bb8a7 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:03:14 +0200 Subject: [PATCH 250/363] llvm release notes: drop in-progress warnings; minor cleanups --- llvm/docs/ReleaseNotes.rst | 79 ++++++-------------------------------- 1 file changed, 12 insertions(+), 67 deletions(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 977ba26f9e23..2af813fda1aa 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -5,12 +5,6 @@ LLVM 11.0.0 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 11 release. - Release notes for previous releases can be found on - `the Download Page `_. - - Introduction ============ @@ -26,48 +20,16 @@ have questions or comments, the `LLVM Developer's Mailing List `_ is a good place to send them. -Note that if you are reading this file from a Git checkout or the main -LLVM web page, this document applies to the *next* release, not the current -one. To see the release notes for a specific release, please see the `releases -page `_. - Deprecated and Removed Features/APIs ================================================= * BG/Q support, including QPX, will be removed in the 12.0.0 release. Non-comprehensive list of changes in this release ================================================= -.. NOTE - For small 1-3 sentence descriptions, just add an entry at the end of - this list. If your description won't fit comfortably in one bullet - point (e.g. maybe you would like to give an example of the - functionality, or simply have a lot to talk about), see the `NOTE` below - for adding a new subsection. - -* The LLVM project has started the migration towards Python 3, and the build - system now prefers Python 3 whenever available. 
If the Python 3 interpreter - (or libraries) are not found, the build system will, for the time being, fall - back to Python 2. It is recommended that downstream projects migrate to - Python 3 as Python 2 has been end-of-life'd by the Python Software - Foundation. * The llgo frontend has been removed for now, but may be resurrected in the future. -* ... - - -.. NOTE - If you would like to document a larger change, then you can add a - subsection about it right here. You can copy the following boilerplate - and un-indent it (the indentation causes it to be inside this comment). - - Special New Feature - ------------------- - - Makes programs 10x faster by doing Special New Thing. - - Changes to the LLVM IR ---------------------- @@ -116,6 +78,13 @@ Changes to the LLVM IR Changes to building LLVM ------------------------ +* The LLVM project has started the migration towards Python 3, and the build + system now prefers Python 3 whenever available. If the Python 3 interpreter + (or libraries) are not found, the build system will, for the time being, fall + back to Python 2. It is recommended that downstream projects migrate to + Python 3 as Python 2 has been end-of-life'd by the Python Software + Foundation. + Changes to the AArch64 Backend ------------------------------ @@ -134,6 +103,7 @@ Changes to the AArch64 Backend * Added support for Armv8.6-A: Assembly support for the following extensions: + - Enhanced Counter Virtualization (ARMv8.6-ECV). - Fine Grained Traps (ARMv8.6-FGT). - Activity Monitors virtualization (ARMv8.6-AMU). @@ -179,11 +149,6 @@ Changes to the ARM Backend * Added support for Cortex-M55, Cortex-A77, Cortex-A78 and Cortex-X1 cores. -Changes to the MIPS Target --------------------------- - -During this release ... 
- Changes to the PowerPC Target ----------------------------- @@ -228,6 +193,7 @@ Changes to the RISC-V Target ---------------------------- New features: + * After consultation through an RFC, the RISC-V backend now accepts patches for proposed instruction set extensions that have not yet been ratified. For these experimental extensions, there is no expectation of ongoing support - the @@ -244,6 +210,7 @@ New features: * llvm-objdump will now print branch targets as part of disassembly. Improvements: + * If an immediate can be generated using a pair of `addi` instructions, that pair will be selected rather than materialising the immediate into a separate register with an `lui` and `addi` pair. @@ -265,6 +232,7 @@ Improvements: * The `jump` pseudo instruction is now supported. Bug fixes: + * A failure to insert indirect branches in position independent code was fixed. * The calculated expanded size of atomic pseudo operations was fixed, avoiding @@ -277,9 +245,6 @@ Bug fixes: Changes to the X86 Target ------------------------- -During this release ... - - * Functions with the probe-stack attribute set to "inline-asm" are now protected against stack clash without the need of a third-party probing function and with limited impact on performance. @@ -335,18 +300,6 @@ Changes to the Windows Target * Produce COFF weak external symbols for IR level weak symbols without a comdat (e.g. for `__attribute__((weak))` in C) -Changes to the OCaml bindings ------------------------------ - - - -Changes to the C API --------------------- - - -Changes to the Go bindings --------------------------- - Changes to the DAG infrastructure --------------------------------- @@ -357,7 +310,7 @@ Changes to the DAG infrastructure MachineIR. Changes to the Debug Info ---------------------------------- +------------------------- * LLVM now supports the debug entry values (DW_OP_entry_value) production for the x86, ARM, and AArch64 targets by default. 
Other targets can use @@ -389,14 +342,6 @@ Changes to the LLVM tools * llvm-lib supports adding import library objects in addition to regular object files -Changes to LLDB -=============== - -External Open Source Projects Using LLVM 11 -=========================================== - -* A project... - Additional Information ====================== From 1a51c113148a10a2d7313aae313039ef8e0db0cc Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:10:31 +0200 Subject: [PATCH 251/363] clang release notes: drop in-progress warnings; minor cleanups --- clang/docs/ReleaseNotes.rst | 78 +++---------------------------------- 1 file changed, 6 insertions(+), 72 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1c02c478be68..0ccaa7a82121 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1,6 +1,6 @@ -======================================== -Clang 11.0.0 (In-Progress) Release Notes -======================================== +========================== +Clang 11.0.0 Release Notes +========================== .. contents:: :local: @@ -8,12 +8,6 @@ Clang 11.0.0 (In-Progress) Release Notes Written by the `LLVM Team `_ -.. warning:: - - These are in-progress notes for the upcoming Clang 11 release. - Release notes for previous releases can be found on - `the Download Page `_. - Introduction ============ @@ -30,11 +24,6 @@ For more information about Clang or LLVM, including information about the latest release, please see the `Clang Web Site `_ or the `LLVM Web Site `_. -Note that if you are reading this file from a Git checkout or the -main Clang web page, this document applies to the *next* release, not -the current one. To see the release notes for a specific release, please -see the `releases page `_. - What's New in Clang 11.0.0? =========================== @@ -43,13 +32,9 @@ here. 
Generic improvements to Clang as a whole or to its underlying infrastructure are described first, followed by language-specific sections with improvements to Clang's support for those languages. -Major New Features ------------------- - -- ... Recovery AST -^^^^^^^^^^^^ +------------ clang's AST now improves support for representing broken C++ code. This improves the quality of subsequent diagnostics after an error is encountered. It also @@ -89,7 +74,7 @@ This feature is on by default for C++ code, and can be explicitly controlled with `-Xclang -f[no-]recovery-ast`. Improvements to Clang's diagnostics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +----------------------------------- - -Wpointer-to-int-cast is a new warning group. This group warns about C-style casts of pointers to a integer type too small to hold all possible values. @@ -269,13 +254,6 @@ New Compiler Flags compiler support will continue to change until the specification is finalised. -Deprecated Compiler Flags -------------------------- - -The following options are deprecated and ignored. They will be removed in -future versions of Clang. - -- ... Modified Compiler Flags ----------------------- @@ -346,8 +324,6 @@ C Language Changes in Clang - Clang now supports the GNU C extension `asm inline`; it won't do anything *yet*, but it will be parsed. -- ... - C++ Language Changes in Clang ----------------------------- @@ -389,13 +365,6 @@ C++ Language Changes in Clang int f() { return 0; } } S; -C++1z Feature Support -^^^^^^^^^^^^^^^^^^^^^ - -... - -Objective-C Language Changes in Clang -------------------------------------- OpenCL Kernel Language Changes in Clang --------------------------------------- @@ -420,7 +389,7 @@ OpenCL Kernel Language Changes in Clang `cl_arm_integer_dot_product`. Changes related to C++ for OpenCL -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------- - Added `addrspace_cast` operator. @@ -486,10 +455,6 @@ New features for OpenMP 5.0 were implemented. 
- Bug fixes and optimizations. -CUDA Support in Clang ---------------------- - -- ... Internal API Changes -------------------- @@ -538,11 +503,6 @@ release of Clang. Users of the build system should adjust accordingly. something you need, please reach out to the mailing list to discuss possible ways forward. -AST Matchers ------------- - -- ... - clang-format ------------ @@ -644,10 +604,6 @@ clang-format foo(); } while(1); -libclang --------- - -- ... .. _release-notes-clang-static-analyzer: @@ -723,28 +679,6 @@ Static Analyzer .. _release-notes-ubsan: -Undefined Behavior Sanitizer (UBSan) ------------------------------------- - -Core Analysis Improvements -========================== - -- ... - -New Issues Found -================ - -- ... - -Python Binding Changes ----------------------- - -The following methods have been added: - -- ... - -Significant Known Problems -========================== Additional Information ====================== From 19d7a9fa9d665c75655db70873782e60cad56bb7 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:14:46 +0200 Subject: [PATCH 252/363] clang-tools-extra release notes: drop in-progress warnings; minor cleanups --- clang-tools-extra/docs/ReleaseNotes.rst | 60 ++----------------------- 1 file changed, 3 insertions(+), 57 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 83ae2c6605fd..0471c5e9c4eb 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -1,6 +1,6 @@ -==================================================== -Extra Clang Tools 11.0.0 (In-Progress) Release Notes -==================================================== +====================================== +Extra Clang Tools 11.0.0 Release Notes +====================================== .. contents:: :local: @@ -8,12 +8,6 @@ Extra Clang Tools 11.0.0 (In-Progress) Release Notes Written by the `LLVM Team `_ -.. 
warning:: - - These are in-progress notes for the upcoming Extra Clang Tools 11 release. - Release notes for previous releases can be found on - `the Download Page `_. - Introduction ============ @@ -27,11 +21,6 @@ For more information about Clang or LLVM, including information about the latest release, please see the `Clang Web Site `_ or the `LLVM Web Site `_. -Note that if you are reading this file from a Git checkout or the -main Clang web page, this document applies to the *next* release, not -the current one. To see the release notes for a specific release, please -see the `releases page `_. - What's New in Extra Clang Tools 11.0.0? ======================================= @@ -39,11 +28,6 @@ Some of the major new features and improvements to Extra Clang Tools are listed here. Generic improvements to Extra Clang Tools as a whole or to its underlying infrastructure are described first, followed by tool-specific sections. -Major New Features ------------------- - -... - Improvements to clangd ---------------------- @@ -238,21 +222,6 @@ Miscellaneous - Too many stability and correctness fixes to mention. -Improvements to clang-doc -------------------------- - -The improvements are... - -Improvements to clang-query ---------------------------- - -The improvements are... - -Improvements to clang-rename ----------------------------- - -The improvements are... - Improvements to clang-tidy -------------------------- @@ -439,26 +408,3 @@ Other improvements - For `run-clang-tidy.py` add option to use alpha checkers from `clang-analyzer`. - -Improvements to include-fixer ------------------------------ - -The improvements are... - -Improvements to clang-include-fixer ------------------------------------ - -The improvements are... - -Improvements to modularize --------------------------- - -The improvements are... - -Improvements to pp-trace ------------------------- - -The improvements are... 
- -Clang-tidy visual studio plugin -------------------------------- From 6afefb45dbffc4a8187192f3c70655343a10febf Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:16:34 +0200 Subject: [PATCH 253/363] lld release notes: drop in-progress warnings; minor cleanups --- lld/docs/ReleaseNotes.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 880f933e51be..fcb8eaefa594 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -5,11 +5,6 @@ lld 11.0.0 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 11.0.0 release. - Release notes for previous releases can be found on - `the Download Page `_. - Introduction ============ @@ -176,12 +171,3 @@ MinGW Improvements ``--disable-runtime-pseudo-reloc``), the ``--no-seh`` flag and options for selecting file and section alignment (``--file-alignment`` and ``--section-alignment``). - -MachO Improvements ------------------- - -* Item 1. - -WebAssembly Improvements ------------------------- - From 34c21f8dbe656969c209803012234901a1d4ae19 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:18:40 +0200 Subject: [PATCH 254/363] flang release notes: drop in-progress warnings; minor cleanups --- flang/docs/ReleaseNotes.md | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index b891ab904a04..19939a539c43 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -1,10 +1,4 @@ -# Flang 11.0.0 (In-Progress) Release Notes - -> **warning** -> -> These are in-progress notes for the upcoming LLVM 11.0.0 release. -> Release notes for previous releases can be found on [the Download -> Page](https://releases.llvm.org/download.html). +# Flang 11.0.0 Release Notes ## Introduction @@ -17,18 +11,6 @@ documentation](https://llvm.org/docs/ReleaseNotes.html). 
All LLVM releases may be downloaded from the [LLVM releases web site](https://llvm.org/releases/). -Note that if you are reading this file from a Git checkout, this -document applies to the *next* release, not the current one. To see the -release notes for a specific release, please see the [releases -page](https://llvm.org/releases/). - -## Known Issues - -These are issues that couldn't be fixed before the release. See the bug -reports for the latest status. - - * ... - ## Introducing Flang Flang is LLVM's Fortran front end and is new for the LLVM 11 release. From 952e7c3b81ffa34130b571afe028debc0ef36691 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:20:04 +0200 Subject: [PATCH 255/363] libc++ release notes: drop in-progress warnings; minor cleanups --- libcxx/docs/ReleaseNotes.rst | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index 1db79153ed89..001957570c90 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -1,6 +1,6 @@ -========================================= -Libc++ 11.0.0 (In-Progress) Release Notes -========================================= +=========================== +Libc++ 11.0.0 Release Notes +=========================== .. contents:: :local: @@ -8,12 +8,6 @@ Libc++ 11.0.0 (In-Progress) Release Notes Written by the `Libc++ Team `_ -.. warning:: - - These are in-progress notes for the upcoming libc++ 11 release. - Release notes for previous releases can be found on - `the Download Page `_. - Introduction ============ @@ -27,11 +21,6 @@ be downloaded from the `LLVM releases web site `_. For more information about libc++, please see the `Libc++ Web Site `_ or the `LLVM Web Site `_. -Note that if you are reading this file from a Git checkout or the -main Libc++ web page, this document applies to the *next* release, not -the current one. 
To see the release notes for a specific release, please -see the `releases page `_. - What's New in Libc++ 11.0.0? ============================ @@ -39,7 +28,3 @@ New Features ------------ - ```` - -API Changes ------------ -- ... From f9572abae17c483c861702d440b4eeb0f08879d1 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:21:17 +0200 Subject: [PATCH 256/363] openmp release notes: drop in-progress warnings; minor cleanups --- openmp/docs/ReleaseNotes.rst | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst index b7f2ec42277e..e09ef5f5b638 100644 --- a/openmp/docs/ReleaseNotes.rst +++ b/openmp/docs/ReleaseNotes.rst @@ -5,11 +5,6 @@ openmp 11.0.0 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 11.0.0 release. - Release notes for previous releases can be found on - `the Download Page `_. - Introduction ============ @@ -21,16 +16,6 @@ from the `LLVM releases web site `_. Non-comprehensive list of changes in this release ================================================= -5.0 features ------------- - -* ... - -5.1 features ------------- - -* ... - OMPT Improvements ----------------- From c2f4de353b2e9f6658febe5735c615ee433fd062 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Wed, 16 Sep 2020 17:22:02 +0200 Subject: [PATCH 257/363] polly release notes: drop in-progress warnings; minor cleanups --- polly/docs/ReleaseNotes.rst | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index ab95eae4e57e..8aaa6f0564da 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -1,17 +1,9 @@ -============================= -Release Notes 11.0 (upcoming) -============================= +================== +Release Notes 11.0 +================== In Polly 11 the following important changes have been incorporated. -.. 
warning:: - - These releaes notes are for the next release of Polly and describe - the new features that have recently been committed to our development - branch. - -- Change ... - * The LLVM option -polly-isl-arg was added to pass options to ISL's command line option parser. For instance, -polly-isl-arg=--schedule-algorithm=feautrier switches to the From 80e2fc1e6e68d6ed57dccc03c6a5121e216bfd43 Mon Sep 17 00:00:00 2001 From: Ben Dunbobbin Date: Thu, 13 Aug 2020 23:58:40 +0100 Subject: [PATCH 258/363] [X86][ELF] Prefer lowering MC_GlobalAddress operands to .Lfoo$local for STV_DEFAULT only This patch restricts the behaviour of referencing via .Lfoo$local local aliases, introduced in https://reviews.llvm.org/D73230, to STV_DEFAULT globals only. Hidden symbols via --fvisiblity=hidden (https://gcc.gnu.org/wiki/Visibility) is an important scenario. Benefits: - Improves the size of object files by using fewer STT_SECTION symbols. - The code reads a bit better (it was not obvious to me without going back to the code reviews why the canBenefitFromLocalAlias function currently doesn't consider visibility). - There is also a side benefit in restoring the effectiveness of the --wrap linker option and making the behavior of --wrap consistent between LTO and normal builds for references within a translation-unit. Note: this --wrap behavior (which is specific to LLD) should not be considered reliable. See comments on https://reviews.llvm.org/D73230 for more. 
Differential Revision: https://reviews.llvm.org/D85782 (cherry picked from commit 4cb016cd2d8467c572b2e5c5d34f376ee79e4ac1) --- llvm/lib/IR/Globals.cpp | 3 +- llvm/test/CodeGen/AArch64/emutls.ll | 2 -- llvm/test/CodeGen/ARM/emutls.ll | 2 -- .../X86/2008-03-12-ThreadLocalAlias.ll | 4 +-- llvm/test/CodeGen/X86/linux-preemption.ll | 29 +++++++++++++++++++ .../X86/semantic-interposition-comdat.ll | 2 +- llvm/test/CodeGen/X86/tailcallpic1.ll | 2 +- llvm/test/CodeGen/X86/tailcallpic3.ll | 2 +- llvm/test/CodeGen/X86/tailccpic1.ll | 2 +- 9 files changed, 37 insertions(+), 11 deletions(-) diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index dd8e62164de1..ed946ef3fd12 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -104,7 +104,8 @@ bool GlobalValue::isInterposable() const { bool GlobalValue::canBenefitFromLocalAlias() const { // See AsmPrinter::getSymbolPreferLocal(). - return GlobalObject::isExternalLinkage(getLinkage()) && !isDeclaration() && + return hasDefaultVisibility() && + GlobalObject::isExternalLinkage(getLinkage()) && !isDeclaration() && !isa(this) && !hasComdat(); } diff --git a/llvm/test/CodeGen/AArch64/emutls.ll b/llvm/test/CodeGen/AArch64/emutls.ll index 85d2c1a3b315..25be391bbfaa 100644 --- a/llvm/test/CodeGen/AArch64/emutls.ll +++ b/llvm/test/CodeGen/AArch64/emutls.ll @@ -155,7 +155,6 @@ entry: ; ARM64: .data{{$}} ; ARM64: .globl __emutls_v.i4 ; ARM64-LABEL: __emutls_v.i4: -; ARM64-NEXT: .L__emutls_v.i4$local: ; ARM64-NEXT: .xword 4 ; ARM64-NEXT: .xword 4 ; ARM64-NEXT: .xword 0 @@ -163,7 +162,6 @@ entry: ; ARM64: .section .rodata, ; ARM64-LABEL: __emutls_t.i4: -; ARM64-NEXT: .L__emutls_t.i4$local: ; ARM64-NEXT: .word 15 ; ARM64-NOT: __emutls_v.i5: diff --git a/llvm/test/CodeGen/ARM/emutls.ll b/llvm/test/CodeGen/ARM/emutls.ll index 4327086685e9..92b656d9ba09 100644 --- a/llvm/test/CodeGen/ARM/emutls.ll +++ b/llvm/test/CodeGen/ARM/emutls.ll @@ -238,7 +238,6 @@ entry: ; ARM32: .data{{$}} ; ARM32: .globl __emutls_v.i4 ; 
ARM32-LABEL: __emutls_v.i4: -; ARM32-NEXT: .L__emutls_v.i4$local: ; ARM32-NEXT: .long 4 ; ARM32-NEXT: .long 4 ; ARM32-NEXT: .long 0 @@ -246,7 +245,6 @@ entry: ; ARM32: .section .rodata, ; ARM32-LABEL: __emutls_t.i4: -; ARM32-NEXT: .L__emutls_t.i4$local: ; ARM32-NEXT: .long 15 ; ARM32-NOT: __emutls_v.i5: diff --git a/llvm/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll b/llvm/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll index 89d249c09178..2ca003e052aa 100644 --- a/llvm/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll +++ b/llvm/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll @@ -12,7 +12,7 @@ target triple = "i386-pc-linux-gnu" define i32 @foo() { ; CHECK-LABEL: foo: -; CHECK: leal .L__libc_resp$local@TLSLDM +; CHECK: leal __libc_resp@TLSLD entry: %retval = alloca i32 ; [#uses=1] %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] @@ -27,7 +27,7 @@ return: ; preds = %entry define i32 @bar() { ; CHECK-LABEL: bar: -; CHECK: leal .L__libc_resp$local@TLSLDM +; CHECK: leal __libc_resp@TLSLD entry: %retval = alloca i32 ; [#uses=1] %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] diff --git a/llvm/test/CodeGen/X86/linux-preemption.ll b/llvm/test/CodeGen/X86/linux-preemption.ll index 49a7becf1343..15265f401992 100644 --- a/llvm/test/CodeGen/X86/linux-preemption.ll +++ b/llvm/test/CodeGen/X86/linux-preemption.ll @@ -20,6 +20,14 @@ define i32* @get_strong_default_global() { ; STATIC: movl $strong_default_global, %eax ; CHECK32: movl strong_default_global@GOT(%eax), %eax +@strong_hidden_global = hidden global i32 42 +define i32* @get_hidden_default_global() { + ret i32* @strong_hidden_global +} +; CHECK: leaq strong_hidden_global(%rip), %rax +; STATIC: movl $strong_hidden_global, %eax +; CHECK32: leal strong_hidden_global@GOTOFF(%eax), %eax + @weak_default_global = weak global i32 42 define i32* @get_weak_default_global() { ret i32* @weak_default_global @@ -96,6 +104,14 @@ define i32* @get_strong_default_alias() { ; STATIC: movl $strong_default_alias, %eax ; CHECK32: movl 
strong_default_alias@GOT(%eax), %eax +@strong_hidden_alias = hidden alias i32, i32* @aliasee +define i32* @get_strong_hidden_alias() { + ret i32* @strong_hidden_alias +} +; CHECK: leaq strong_hidden_alias(%rip), %rax +; STATIC: movl $strong_hidden_alias, %eax +; CHECK32: leal strong_hidden_alias@GOTOFF(%eax), %eax + @weak_default_alias = weak alias i32, i32* @aliasee define i32* @get_weak_default_alias() { ret i32* @weak_default_alias @@ -149,6 +165,16 @@ define void()* @get_strong_default_function() { ; STATIC: movl $strong_default_function, %eax ; CHECK32: movl strong_default_function@GOT(%eax), %eax +define hidden void @strong_hidden_function() { + ret void +} +define void()* @get_strong_hidden_function() { + ret void()* @strong_hidden_function +} +; CHECK: leaq strong_hidden_function(%rip), %rax +; STATIC: movl $strong_hidden_function, %eax +; CHECK32: leal strong_hidden_function@GOTOFF(%eax), %eax + define weak void @weak_default_function() { ret void } @@ -234,6 +260,9 @@ define void()* @get_external_preemptable_function() { ; COMMON: .globl strong_default_alias ; COMMON-NEXT: .set strong_default_alias, aliasee +; COMMON-NEXT: .globl strong_hidden_alias +; COMMON-NEXT: .hidden strong_hidden_alias +; COMMON-NEXT: .set strong_hidden_alias, aliasee ; COMMON-NEXT: .weak weak_default_alias ; COMMON-NEXT: .set weak_default_alias, aliasee ; COMMON-NEXT: .globl strong_local_alias diff --git a/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll b/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll index d0efd4d11c95..d11be2d6bd0c 100644 --- a/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll +++ b/llvm/test/CodeGen/X86/semantic-interposition-comdat.ll @@ -3,7 +3,7 @@ $comdat_func = comdat any ; CHECK-LABEL: func2: -; CHECK-NEXT: .Lfunc2$local +; CHECK-NOT: .Lfunc2$local declare void @func() diff --git a/llvm/test/CodeGen/X86/tailcallpic1.ll b/llvm/test/CodeGen/X86/tailcallpic1.ll index 717cc1fddec9..ed101fcccd2d 100644 --- 
a/llvm/test/CodeGen/X86/tailcallpic1.ll +++ b/llvm/test/CodeGen/X86/tailcallpic1.ll @@ -12,5 +12,5 @@ define fastcc i32 @tailcaller(i32 %in1, i32 %in2) { entry: %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; [#uses=1] ret i32 %tmp11 -; CHECK: jmp .Ltailcallee$local +; CHECK: jmp tailcallee } diff --git a/llvm/test/CodeGen/X86/tailcallpic3.ll b/llvm/test/CodeGen/X86/tailcallpic3.ll index 13b160aae2f6..edc58052d82f 100644 --- a/llvm/test/CodeGen/X86/tailcallpic3.ll +++ b/llvm/test/CodeGen/X86/tailcallpic3.ll @@ -16,7 +16,7 @@ entry: ret void } ; CHECK: tailcall_hidden: -; CHECK: jmp .Ltailcallee_hidden$local +; CHECK: jmp tailcallee_hidden define internal void @tailcallee_internal() { entry: diff --git a/llvm/test/CodeGen/X86/tailccpic1.ll b/llvm/test/CodeGen/X86/tailccpic1.ll index dbdc56aa61c7..de8f2219bc2f 100644 --- a/llvm/test/CodeGen/X86/tailccpic1.ll +++ b/llvm/test/CodeGen/X86/tailccpic1.ll @@ -12,5 +12,5 @@ define tailcc i32 @tailcaller(i32 %in1, i32 %in2) { entry: %tmp11 = tail call tailcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; [#uses=1] ret i32 %tmp11 -; CHECK: jmp .Ltailcallee$local +; CHECK: jmp tailcallee } From 4fe4e35452ef17ce3db08b080e3b0642d36c5094 Mon Sep 17 00:00:00 2001 From: sameeran joshi Date: Sun, 13 Sep 2020 17:24:34 +0530 Subject: [PATCH 259/363] [Flang] Add GettingInvolved documentation page and sidebar. Adds a new GettingInvolved page to documentation which provides details about mailing list, chats and calls. Adds a sidebar page which provides common links on all documentation pages. 
The links include: - Getting Started - Getting Involved - Github Repository - Bug Reports - Code Review Depends on https://reviews.llvm.org/D87242 Reviewed By: richard.barton.arm Differential Revision: https://reviews.llvm.org/D87270 (cherry picked from commit fe395aecd9e70b815e6490639098d815385f9932) --- flang/docs/GettingInvolved.md | 72 +++++++++++++++++++++++++ flang/docs/_templates/indexsidebar.html | 26 +++++++++ flang/docs/_templates/layout.html | 14 +++++ flang/docs/conf.py | 8 ++- flang/docs/index.md | 1 + 5 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 flang/docs/GettingInvolved.md create mode 100644 flang/docs/_templates/indexsidebar.html create mode 100644 flang/docs/_templates/layout.html diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md new file mode 100644 index 000000000000..a244fbcee56a --- /dev/null +++ b/flang/docs/GettingInvolved.md @@ -0,0 +1,72 @@ + +# Getting Involved + +```eval_rst +.. contents:: + :local: +``` + +The Flang Project welcomes contributions of all kinds. +Please feel free to join the mailing list or the slack channel for discussions related to development of Flang. +To understand the status of various developments in Flang please join the respective call. + +## Mailing Lists + +[Developer's List (flang-dev)](http://lists.llvm.org/mailman/listinfo/flang-dev) + + This list is for people who want to be included in technical discussions related to Flang. People post to this list when they have questions about writing code + for or using the Flang tools. It is relatively low volume. + + +[Commits Archive (flang-commits)](http://lists.llvm.org/pipermail/flang-commits) + + This list contains all commit messages that are made when Flang developers + commit code changes to the repository. It also serves as a forum for + patch review (i.e. send patches here). It is useful for those who want to + stay on the bleeding edge of Flang development. This list is high + volume. 
+ +## Chat + +### Flang Slack Workspace + +- There is a Slack workspace dedicated to Flang. +- There are a number of topic-oriented channels available (e.g., #driver, #f18-semantics, #fir). +- Add yourself via the *[invitation link](https://join.slack.com/t/flang-compiler/shared_invite/zt-2pcn51lh-VrRQL_YUOkxA_1CEfMGQhw "title")* + +## Calls + +### Flang Community Biweekly Call + +- General updates on the Flang Project, both LLVM Flang and current Flang. +- Join [Flang Community Biweekly Call](https://nvmeet.webex.com/nvmeet/j.php?MTID=mb4edb8c799f69ec2dc0554acc969a162) +- Time: On Wednesdays 8:30 Pacific Time, on the weeks alternating with regular Flang Community Technical Biweekly Call. +- Minutes: They are sent to [flang-dev](http://lists.llvm.org/mailman/listinfo/flang-dev). Search for `Flang Biweekly Sync - Notes`. + +### Flang Community Technical Biweekly Call + +- Technical topics call. +- Join [Flang Community Technical Biweekly Call](https://bluejeans.com/625064848?src=join_info) +- Time: On Mondays 8:30 Pacific Time, on the weeks alternating with regular Flang Community Biweekly Call. +- The agenda is in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/). + +### LLVM Alias Analysis Technical Call + +- For people working on improvements to LLVM alias analysis. +- Join [LLVM Alias Analysis Technical Call](https://bluejeans.com/101176001?src=join_info) +- Time: Tuesdays 10:00 AM Pacific Time, every 4 weeks. +- The agenda is in this [Google Doc](https://docs.google.com/document/d/1ybwEKDVtIbhIhK50qYtwKsL50K-NvB6LfuBsfepBZ9Y/). + +### OpenMP Technical Call + +- Development updates on OpenMP and OpenACC in the Flang Project. +- Join [OpenMP Technical Call](https://bit.ly/39eQW3o) +- Time: Weekly call on every Thursdays 8:00 AM Pacific time. +- Meeting minutes are [here](https://docs.google.com/document/d/1yA-MeJf6RYY-ZXpdol0t7YoDoqtwAyBhFLr5thu5pFI). 
+- Status tracking [page](https://docs.google.com/spreadsheets/d/1FvHPuSkGbl4mQZRAwCIndvQx9dQboffiD-xD0oqxgU0/edit#gid=0). diff --git a/flang/docs/_templates/indexsidebar.html b/flang/docs/_templates/indexsidebar.html new file mode 100644 index 000000000000..3c8f1abdf900 --- /dev/null +++ b/flang/docs/_templates/indexsidebar.html @@ -0,0 +1,26 @@ +{# This template defines sidebar which can be used to provide common links on + all documentation pages. #} + +

Documentation

+ + + +

Getting Involved

+ + + +

Additional Links

+ + diff --git a/flang/docs/_templates/layout.html b/flang/docs/_templates/layout.html new file mode 100644 index 000000000000..12b7731ccca7 --- /dev/null +++ b/flang/docs/_templates/layout.html @@ -0,0 +1,14 @@ +{% extends "!layout.html" %} + +{% block extrahead %} + +{% endblock %} + +{% block rootrellink %} + +
  • Flang Home | 
  • +
  • Documentation»
  • +{% endblock %} diff --git a/flang/docs/conf.py b/flang/docs/conf.py index f5eb283a186a..197721a4e4c8 100644 --- a/flang/docs/conf.py +++ b/flang/docs/conf.py @@ -178,7 +178,13 @@ def setup(app): #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +html_sidebars = { + '**': [ + 'indexsidebar.html', + 'searchbox.html', + ] +} + # Additional templates that should be rendered to pages, maps page names to # template names. diff --git a/flang/docs/index.md b/flang/docs/index.md index 4c0717056522..bd7092a418f3 100644 --- a/flang/docs/index.md +++ b/flang/docs/index.md @@ -15,6 +15,7 @@ Flang is LLVM's Fortran frontend .. toctree:: :titlesonly: + GettingInvolved FortranForCProgrammers C++style C++17 From b78e5de029c26c309f541ab883fa5d6d953b073d Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Thu, 17 Sep 2020 16:00:54 +0800 Subject: [PATCH 260/363] [SelectionDAG] Check any use of negation result before removal 2508ef01 fixed a bug about constant removal in negation. But after sanitizing check I found there's still some issue about it so it's reverted. Temporary nodes will be removed if useless in negation. Before the removal, they'd be checked if any other nodes used it. So the removal was moved after getNode. However in rare cases the node to be removed is the same as result of getNode. We missed that and will be fixed by this patch. 
Reviewed By: steven.zhang Differential Revision: https://reviews.llvm.org/D87614 (cherry picked from commit a2fb5446be960ad164060b3c05fc268f7f72d67a) --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 22 ++++++++++----- llvm/test/CodeGen/X86/pr47517.ll | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/X86/pr47517.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 819e608c6896..64af293caf9e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5751,8 +5751,10 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // If we already have the use of the negated floating constant, it is free // to negate it even it has multiple uses. - if (!Op.hasOneUse() && CFP.use_empty()) + if (!Op.hasOneUse() && CFP.use_empty()) { + RemoveDeadNode(CFP); break; + } Cost = NegatibleCost::Neutral; return CFP; } @@ -5810,7 +5812,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (NegX && (CostX <= CostY)) { Cost = CostX; SDValue N = DAG.getNode(ISD::FSUB, DL, VT, NegX, Y, Flags); - RemoveDeadNode(NegY); + if (NegY != N) + RemoveDeadNode(NegY); return N; } @@ -5818,7 +5821,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (NegY) { Cost = CostY; SDValue N = DAG.getNode(ISD::FSUB, DL, VT, NegY, X, Flags); - RemoveDeadNode(NegX); + if (NegX != N) + RemoveDeadNode(NegX); return N; } break; @@ -5857,7 +5861,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (NegX && (CostX <= CostY)) { Cost = CostX; SDValue N = DAG.getNode(Opcode, DL, VT, NegX, Y, Flags); - RemoveDeadNode(NegY); + if (NegY != N) + RemoveDeadNode(NegY); return N; } @@ -5870,7 +5875,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (NegY) { Cost = CostY; SDValue N = 
DAG.getNode(Opcode, DL, VT, X, NegY, Flags); - RemoveDeadNode(NegX); + if (NegX != N) + RemoveDeadNode(NegX); return N; } break; @@ -5901,7 +5907,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (NegX && (CostX <= CostY)) { Cost = std::min(CostX, CostZ); SDValue N = DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags); - RemoveDeadNode(NegY); + if (NegY != N) + RemoveDeadNode(NegY); return N; } @@ -5909,7 +5916,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, if (NegY) { Cost = std::min(CostY, CostZ); SDValue N = DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags); - RemoveDeadNode(NegX); + if (NegX != N) + RemoveDeadNode(NegX); return N; } break; diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll new file mode 100644 index 000000000000..5672fbc69a41 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr47517.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple x86_64 < %s | FileCheck %s + +; To ensure unused floating point constant is correctly removed +define float @test(float %src, float* %p) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: retq +entry: + %a0 = getelementptr inbounds float, float* %p, i32 0 + %a1 = getelementptr inbounds float, float* %p, i32 1 + store float 0.000000e+00, float* %a0 + store float 0.000000e+00, float* %a1 + %zero = load float, float* %a0 + %fmul1 = fmul fast float %zero, %src + %fadd1 = fadd fast float %fmul1, %zero + %fmul2 = fmul fast float %fadd1, 2.000000e+00 + %fmul3 = fmul fast float %fmul2, %fmul2 + %fmul4 = fmul fast float %fmul2, 2.000000e+00 + %fadd2 = fadd fast float %fmul4, -3.000000e+00 + %fmul5 = fmul fast float %fadd2, %fmul2 + %fadd3 = fadd fast float %fmul2, %src + %fadd4 = fadd fast float %fadd3, %fmul5 + %fmul6 = fmul fast float %fmul3, %fadd4 + ret float %fmul6 +} From 
410b0dc84bbdafabe3a2c3eedd96e50340a6e0d0 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 17 Jul 2020 10:41:35 -0700 Subject: [PATCH 261/363] [llvm] Add contains(KeyType) -> bool methods to SmallPtrSet Matches C++20 API addition. Differential Revision: https://reviews.llvm.org/D83449 (cherry picked from commit a0385bd7acd6e1d16224b4257f4cb50e59f1d75e) --- llvm/include/llvm/ADT/SmallPtrSet.h | 3 +++ llvm/unittests/ADT/SmallPtrSetTest.cpp | 34 +++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index 0ab05cfe611a..57dd8f6b695d 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -378,6 +378,9 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { iterator find(ConstPtrType Ptr) const { return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr))); } + bool contains(ConstPtrType Ptr) const { + return find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)) != EndPointer(); + } template void insert(IterT I, IterT E) { diff --git a/llvm/unittests/ADT/SmallPtrSetTest.cpp b/llvm/unittests/ADT/SmallPtrSetTest.cpp index 3226fe615509..eacd62ffc6ff 100644 --- a/llvm/unittests/ADT/SmallPtrSetTest.cpp +++ b/llvm/unittests/ADT/SmallPtrSetTest.cpp @@ -313,8 +313,8 @@ TEST(SmallPtrSetTest, ConstTest) { IntSet.insert(B); EXPECT_EQ(IntSet.count(B), 1u); EXPECT_EQ(IntSet.count(C), 1u); - EXPECT_NE(IntSet.find(B), IntSet.end()); - EXPECT_NE(IntSet.find(C), IntSet.end()); + EXPECT_TRUE(IntSet.contains(B)); + EXPECT_TRUE(IntSet.contains(C)); } // Verify that we automatically get the const version of PointerLikeTypeTraits @@ -327,7 +327,7 @@ TEST(SmallPtrSetTest, ConstNonPtrTest) { TestPair Pair(&A[0], 1); IntSet.insert(Pair); EXPECT_EQ(IntSet.count(Pair), 1u); - EXPECT_NE(IntSet.find(Pair), IntSet.end()); + EXPECT_TRUE(IntSet.contains(Pair)); } // Test equality comparison. 
@@ -367,3 +367,31 @@ TEST(SmallPtrSetTest, EqualityComparison) { EXPECT_NE(c, e); EXPECT_NE(e, d); } + +TEST(SmallPtrSetTest, Contains) { + SmallPtrSet Set; + int buf[4] = {0, 11, 22, 11}; + EXPECT_FALSE(Set.contains(&buf[0])); + EXPECT_FALSE(Set.contains(&buf[1])); + + Set.insert(&buf[0]); + Set.insert(&buf[1]); + EXPECT_TRUE(Set.contains(&buf[0])); + EXPECT_TRUE(Set.contains(&buf[1])); + EXPECT_FALSE(Set.contains(&buf[3])); + + Set.insert(&buf[1]); + EXPECT_TRUE(Set.contains(&buf[0])); + EXPECT_TRUE(Set.contains(&buf[1])); + EXPECT_FALSE(Set.contains(&buf[3])); + + Set.erase(&buf[1]); + EXPECT_TRUE(Set.contains(&buf[0])); + EXPECT_FALSE(Set.contains(&buf[1])); + + Set.insert(&buf[1]); + Set.insert(&buf[2]); + EXPECT_TRUE(Set.contains(&buf[0])); + EXPECT_TRUE(Set.contains(&buf[1])); + EXPECT_TRUE(Set.contains(&buf[2])); +} From 6250d4944539f67d6a605928e97c087fe306a79e Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Thu, 17 Sep 2020 18:10:19 -0400 Subject: [PATCH 262/363] PR47468: Fix findPHICopyInsertPoint, so that copies aren't incorrectly inserted after an INLINEASM_BR. findPHICopyInsertPoint special cases placement in a block with a callbr or invoke in it. In that case, we must ensure that the copy is placed before the INLINEASM_BR or call instruction, if the register is defined prior to that instruction, because it may jump out of the block. Previously, the code placed it immediately after the last def _or use_. This is wrong, if the use is the instruction which may jump. We could correctly place it immediately after the last def (ignoring uses), but that is non-optimal for register pressure. Instead, place the copy after the last def, or before the call/inlineasm_br, whichever is later. 
Differential Revision: https://reviews.llvm.org/D87865 (cherry picked from commit f7a53d82c0902147909f28a9295a9d00b4b27d38) --- llvm/lib/CodeGen/PHIEliminationUtils.cpp | 44 ++++++++++--------- .../CodeGen/X86/callbr-asm-phi-placement.ll | 44 +++++++++++++++++++ 2 files changed, 68 insertions(+), 20 deletions(-) create mode 100644 llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll diff --git a/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/llvm/lib/CodeGen/PHIEliminationUtils.cpp index bae96eb84521..2a72717e711d 100644 --- a/llvm/lib/CodeGen/PHIEliminationUtils.cpp +++ b/llvm/lib/CodeGen/PHIEliminationUtils.cpp @@ -27,31 +27,35 @@ llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB, // Usually, we just want to insert the copy before the first terminator // instruction. However, for the edge going to a landing pad, we must insert // the copy before the call/invoke instruction. Similarly for an INLINEASM_BR - // going to an indirect target. - if (!SuccMBB->isEHPad() && !SuccMBB->isInlineAsmBrIndirectTarget()) + // going to an indirect target. This is similar to SplitKit.cpp's + // computeLastInsertPoint, and similarly assumes that there cannot be multiple + // instructions that are Calls with EHPad successors or INLINEASM_BR in a + // block. + bool EHPadSuccessor = SuccMBB->isEHPad(); + if (!EHPadSuccessor && !SuccMBB->isInlineAsmBrIndirectTarget()) return MBB->getFirstTerminator(); - // Discover any defs/uses in this basic block. - SmallPtrSet DefUsesInMBB; + // Discover any defs in this basic block. + SmallPtrSet DefsInMBB; MachineRegisterInfo& MRI = MBB->getParent()->getRegInfo(); - for (MachineInstr &RI : MRI.reg_instructions(SrcReg)) { + for (MachineInstr &RI : MRI.def_instructions(SrcReg)) if (RI.getParent() == MBB) - DefUsesInMBB.insert(&RI); - } + DefsInMBB.insert(&RI); - MachineBasicBlock::iterator InsertPoint; - if (DefUsesInMBB.empty()) { - // No defs. Insert the copy at the start of the basic block. 
- InsertPoint = MBB->begin(); - } else if (DefUsesInMBB.size() == 1) { - // Insert the copy immediately after the def/use. - InsertPoint = *DefUsesInMBB.begin(); - ++InsertPoint; - } else { - // Insert the copy immediately after the last def/use. - InsertPoint = MBB->end(); - while (!DefUsesInMBB.count(&*--InsertPoint)) {} - ++InsertPoint; + MachineBasicBlock::iterator InsertPoint = MBB->begin(); + // Insert the copy at the _latest_ point of: + // 1. Immediately AFTER the last def + // 2. Immediately BEFORE a call/inlineasm_br. + for (auto I = MBB->rbegin(), E = MBB->rend(); I != E; ++I) { + if (DefsInMBB.contains(&*I)) { + InsertPoint = std::next(I.getReverse()); + break; + } + if ((EHPadSuccessor && I->isCall()) || + I->getOpcode() == TargetOpcode::INLINEASM_BR) { + InsertPoint = I.getReverse(); + break; + } } // Make sure the copy goes after any phi nodes but before diff --git a/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll b/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll new file mode 100644 index 000000000000..9bad6a7e0892 --- /dev/null +++ b/llvm/test/CodeGen/X86/callbr-asm-phi-placement.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs -O2 < %s | FileCheck %s + +;; https://bugs.llvm.org/PR47468 + +;; PHI elimination should place copies BEFORE the inline asm, not +;; after, even if the inline-asm uses as an input the same value as +;; the PHI. 
+ +declare void @foo(i8*) + +define void @test1(i8* %arg, i8** %mem) nounwind { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: .Ltmp0: # Block address taken +; CHECK-NEXT: .LBB0_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq (%r14), %rbx +; CHECK-NEXT: callq foo +; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # %bb.2: # %end +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: retq +entry: + br label %loop + +loop: + %a = phi i8* [ %arg, %entry ], [ %b, %loop ] + %b = load i8*, i8** %mem, align 8 + call void @foo(i8* %a) + callbr void asm sideeffect "", "*m,X"(i8* %b, i8* blockaddress(@test1, %loop)) + to label %end [label %loop] + +end: + ret void +} From b513e1963f3a7edc897c6c4e675934d0c58f1802 Mon Sep 17 00:00:00 2001 From: Lucas Prates Date: Thu, 17 Sep 2020 18:07:35 +0100 Subject: [PATCH 263/363] [CodeGen] Fixing inconsistent ABI mangling of vlaues in SelectionDAGBuilder SelectionDAGBuilder was inconsistently mangling values based on ABI Calling Conventions when getting them through copyFromRegs in SelectionDAGBuilder, causing duplicate value type convertions for function arguments. The checking for the mangling requirement was based on the value's originating instruction and was performed outside of, and inspite of, the regular Calling Convention Lowering. 
The issue could be observed in a scenario such as: ``` %arg1 = load half, half* %const, align 2 %arg2 = call fastcc half @someFunc() call fastcc void @otherFunc(half %arg1, half %arg2) ; Here, %arg2 was incorrectly mangled twice, as the CallConv data from ; the call to @someFunc() was taken into consideration for the check ; when getting the value for processing the call to @otherFunc(...), ; after the proper convertion had taken place when lowering the return ; value of the first call. ``` This patch fixes the issue by disregarding the Calling Convention information for such copyFromRegs, making sure the ABI mangling is properly contanined in the Calling Convention Lowering. This fixes Bugzilla #47454. Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D87844 (cherry picked from commit 53d238a961d14eae46f6f2b296ce48026c7bd0a1) --- .../SelectionDAG/SelectionDAGBuilder.cpp | 30 +----------- llvm/test/CodeGen/ARM/pr47454.ll | 49 +++++++++++++++++++ 2 files changed, 51 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/pr47454.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index feb949f81eba..d2930391f87a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -169,32 +169,6 @@ static cl::opt SwitchPeelThreshold( // store [4096 x i8] %data, [4096 x i8]* %buffer static const unsigned MaxParallelChains = 64; -// Return the calling convention if the Value passed requires ABI mangling as it -// is a parameter to a function or a return value from a function which is not -// an intrinsic. 
-static Optional getABIRegCopyCC(const Value *V) { - if (auto *R = dyn_cast(V)) - return R->getParent()->getParent()->getCallingConv(); - - if (auto *CI = dyn_cast(V)) { - const bool IsInlineAsm = CI->isInlineAsm(); - const bool IsIndirectFunctionCall = - !IsInlineAsm && !CI->getCalledFunction(); - - // It is possible that the call instruction is an inline asm statement or an - // indirect function call in which case the return value of - // getCalledFunction() would be nullptr. - const bool IsInstrinsicCall = - !IsInlineAsm && !IsIndirectFunctionCall && - CI->getCalledFunction()->getIntrinsicID() != Intrinsic::not_intrinsic; - - if (!IsInlineAsm && !IsInstrinsicCall) - return CI->getCallingConv(); - } - - return None; -} - static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, @@ -1624,7 +1598,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { unsigned InReg = FuncInfo.InitializeRegForValue(Inst); RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg, - Inst->getType(), getABIRegCopyCC(V)); + Inst->getType(), None); SDValue Chain = DAG.getEntryNode(); return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V); } @@ -5555,7 +5529,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (VMI != FuncInfo.ValueMap.end()) { const auto &TLI = DAG.getTargetLoweringInfo(); RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second, - V->getType(), getABIRegCopyCC(V)); + V->getType(), None); if (RFV.occupiesMultipleRegs()) { splitMultiRegDbgValue(RFV.getRegsAndSizes()); return true; diff --git a/llvm/test/CodeGen/ARM/pr47454.ll b/llvm/test/CodeGen/ARM/pr47454.ll new file mode 100644 index 000000000000..d36a29c4e77c --- /dev/null +++ b/llvm/test/CodeGen/ARM/pr47454.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=armv8-unknown-linux-unknown 
-mattr=-fp16 -O0 < %s | FileCheck %s + +declare fastcc half @getConstant() + +declare fastcc i1 @isEqual(half %0, half %1) + +define internal fastcc void @main() { +; CHECK-LABEL: main: +; CHECK: @ %bb.0: @ %Entry +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: mov r0, #31744 +; CHECK-NEXT: strh r0, [r11, #-2] +; CHECK-NEXT: ldrh r0, [r11, #-2] +; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vstr s0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: bl getConstant +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: vldr s0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: uxth r1, r0 +; CHECK-NEXT: vmov s1, r1 +; CHECK-NEXT: bl isEqual +; CHECK-NEXT: mov sp, r11 +; CHECK-NEXT: pop {r11, pc} +Entry: + ; First arg directly from constant + %const = alloca half, align 2 + store half 0xH7C00, half* %const, align 2 + %arg1 = load half, half* %const, align 2 + ; Second arg from fucntion return + %arg2 = call fastcc half @getConstant() + ; Arguments should have equivalent mangling + %result = call fastcc i1 @isEqual(half %arg1, half %arg2) + ret void +} From 8aca41f39c207b6f9efe2e448986d109892072ad Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Thu, 17 Sep 2020 13:44:01 +0100 Subject: [PATCH 264/363] [clang][docs] Fix documentation of -O D79916 changed the behaviour from -O2 to -O1 but the documentation was not updated to reflect this. 
(cherry picked from commit 788c7d2ec11dfc868a5b03478c922dc9699c6d47) --- clang/docs/CommandGuide/clang.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/CommandGuide/clang.rst b/clang/docs/CommandGuide/clang.rst index 2cca04fb31f1..2dfeafd1817a 100644 --- a/clang/docs/CommandGuide/clang.rst +++ b/clang/docs/CommandGuide/clang.rst @@ -385,7 +385,7 @@ Code Generation Options :option:`-Og` Like :option:`-O1`. In future versions, this option might disable different optimizations in order to improve debuggability. - :option:`-O` Equivalent to :option:`-O2`. + :option:`-O` Equivalent to :option:`-O1`. :option:`-O4` and higher From 1b80e741f511df849945dfd0bd31b8d75b58d54b Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 22 Sep 2020 16:53:33 +0200 Subject: [PATCH 265/363] llvm index.rst: Drop in-progress warning --- llvm/docs/index.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/docs/index.rst b/llvm/docs/index.rst index 7e0bc8c4552b..0df1ea79b4f1 100644 --- a/llvm/docs/index.rst +++ b/llvm/docs/index.rst @@ -1,10 +1,5 @@ About -======== - -.. warning:: - - If you are using a released version of LLVM, see `the download page - `_ to find your documentation. 
+===== The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small From 0b56e5490dc33e4e7a4fdd837e642f72a2659189 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 22 Sep 2020 16:55:07 +0200 Subject: [PATCH 266/363] clang-tools-extra: Drop doxygen link from index.rst --- clang-tools-extra/docs/index.rst | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index d5c00b89a155..f1eaeb502ab5 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -24,23 +24,6 @@ Contents clang-doc -Doxygen Documentation -===================== -The Doxygen documentation describes the **internal** software that makes up the -tools of clang-tools-extra, not the **external** use of these tools. The Doxygen -documentation contains no instructions about how to use the tools, only the APIs -that make up the software. For usage instructions, please see the user's guide -or reference manual for each tool. - -* `Doxygen documentation`_ - -.. _`Doxygen documentation`: doxygen/annotated.html - -.. note:: - This documentation is generated directly from the source code with doxygen. - Since the tools of clang-tools-extra are constantly under active - development, what you're about to read is out of date! 
- Indices and tables ================== From e9adcbfad0d31d4d703fe8fcf56cc73b38e59c07 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 24 Sep 2020 16:17:13 +0200 Subject: [PATCH 267/363] release notes: mention zig as an external project --- llvm/docs/ReleaseNotes.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 2af813fda1aa..d724ba09502a 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -342,6 +342,21 @@ Changes to the LLVM tools * llvm-lib supports adding import library objects in addition to regular object files +External Open Source Projects Using LLVM 11 +=========================================== + +Zig Programming Language +------------------------ + +`Zig `_ is a general-purpose programming language and +toolchain for maintaining robust, optimal, and reusable software. In addition +to supporting LLVM as an optional backend, Zig links Clang and LLD to provide +an out-of-the-box cross compilation experience, not only for Zig code but for +C and C++ code as well. Using a sophisticated caching system, Zig lazily builds +from source compiler-rt, mingw-w64, musl, glibc, libcxx, libcxxabi, and +libunwind for the selected target - a "batteries included" drop-in for GCC/Clang +that works the same on every platform. + Additional Information ====================== From 81eb1c1fa75c6407713b5657156d8d9149572bfe Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 24 Sep 2020 15:36:42 -0400 Subject: [PATCH 268/363] AArch64/GlobalISel: Reduced patch for bug 47619 This is the relevant portions of an assert fixed by b98f902f1877c3d679f77645a267edc89ffcd5d6. 
--- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 12 +++++---- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 8 ++++-- .../irtranslator-stack-evt-bug47619.ll | 26 +++++++++++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index a7146515c4c9..1be0ca441205 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -375,13 +375,15 @@ bool CallLowering::handleAssignments(CCState &CCInfo, << "Load/store a split arg to/from the stack not implemented yet"); return false; } - MVT VT = MVT::getVT(Args[i].Ty); - unsigned Size = VT == MVT::iPTR ? DL.getPointerSize() - : alignTo(VT.getSizeInBits(), 8) / 8; + + EVT LocVT = VA.getValVT(); + unsigned MemSize = LocVT == MVT::iPTR ? DL.getPointerSize() + : LocVT.getStoreSize(); + unsigned Offset = VA.getLocMemOffset(); MachinePointerInfo MPO; - Register StackAddr = Handler.getStackAddress(Size, Offset, MPO); - Handler.assignValueToAddress(Args[i], StackAddr, Size, MPO, VA); + Register StackAddr = Handler.getStackAddress(MemSize, Offset, MPO); + Handler.assignValueToAddress(Args[i], StackAddr, MemSize, MPO, VA); } else { // FIXME: Support byvals and other weirdness return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 05a4e3462a26..949dcea3aa18 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -129,13 +129,17 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { } } - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); + // The reported 
memory location may be wider than the value. + const LLT RegTy = MRI.getType(ValVReg); + MemSize = std::min(static_cast(RegTy.getSizeInBytes()), MemSize); + // FIXME: Get alignment auto MMO = MF.getMachineMemOperand( - MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll new file mode 100644 index 000000000000..552997e44f09 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=aarch64-unknown-unknown -stop-after=irtranslator %s -o - | FileCheck %s + +; Make sure the i3 %arg8 value is correctly handled. 
This was trying +; to use MVT for EVT values passed on the stack and asserting before +; b98f902f1877c3d679f77645a267edc89ffcd5d6 +define i3 @bug47619(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i3 %arg8) { + ; CHECK-LABEL: name: bug47619 + ; CHECK: bb.1.bb: + ; CHECK: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 + ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4 + ; CHECK: [[COPY5:%[0-9]+]]:_(s64) = COPY $x5 + ; CHECK: [[COPY6:%[0-9]+]]:_(s64) = COPY $x6 + ; CHECK: [[COPY7:%[0-9]+]]:_(s64) = COPY $x7 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s3) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 4 from %fixed-stack.0, align 16) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s3) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 +bb: + ret i3 %arg8 +} From 184a13d362e041b1fcd14a5e782ba0b17d13dc3c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 25 Sep 2020 10:26:36 -0400 Subject: [PATCH 269/363] AArch64/GlobalISel: Narrow stack passed argument access size This fixes a verifier error in the testcase from bug 47619. The stack passed s3 value was widened to 4-bytes, and producing a 4-byte memory access with a < 1 byte result type. We need to either widen the result type or narrow the access size. This copies the code directly from the AMDGPU handling, which narrows the load size. I don't like that every target has to handle this, but this is currently broken on the 11 release branch and this is the simplest fix. This reverts commit 42bfa7c63b85e76fe16521d1671afcafaf8f64ed. 
(cherry picked from commit 6cb0d23f2ea6fb25106b0380797ccbc2141d71e1) --- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp | 9 +++++++-- .../GlobalISel/irtranslator-stack-evt-bug47619.ll | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 11a8d5def429..4832ae8f415f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -84,11 +84,16 @@ struct IncomingArgHandler : public CallLowering::ValueHandler { } } - void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, + void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize, MachinePointerInfo &MPO, CCValAssign &VA) override { MachineFunction &MF = MIRBuilder.getMF(); + + // The reported memory location may be wider than the value. + const LLT RegTy = MRI.getType(ValVReg); + MemSize = std::min(static_cast(RegTy.getSizeInBytes()), MemSize); + auto MMO = MF.getMachineMemOperand( - MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize, inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll index 552997e44f09..ca36e5da5e5f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-evt-bug47619.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=aarch64-unknown-unknown -stop-after=irtranslator %s -o - | FileCheck %s +; RUN: llc -global-isel -mtriple=aarch64-unknown-unknown -stop-after=irtranslator -verify-machineinstrs %s -o - | FileCheck %s ; Make 
sure the i3 %arg8 value is correctly handled. This was trying ; to use MVT for EVT values passed on the stack and asserting before @@ -17,7 +17,7 @@ define i3 @bug47619(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %a ; CHECK: [[COPY6:%[0-9]+]]:_(s64) = COPY $x6 ; CHECK: [[COPY7:%[0-9]+]]:_(s64) = COPY $x7 ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0 - ; CHECK: [[LOAD:%[0-9]+]]:_(s3) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 4 from %fixed-stack.0, align 16) + ; CHECK: [[LOAD:%[0-9]+]]:_(s3) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load 1 from %fixed-stack.0, align 16) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s3) ; CHECK: $w0 = COPY [[ANYEXT]](s32) ; CHECK: RET_ReallyLR implicit $w0 From 1e4b179bf821bfff8fad7f46423494ed1f62dac0 Mon Sep 17 00:00:00 2001 From: Simon Atanasyan Date: Fri, 25 Sep 2020 00:01:07 +0300 Subject: [PATCH 270/363] [CodeGen] Do not call `emitGlobalConstantLargeInt` for constant requires 8 bytes to store This is a fix for PR47630. The regression is caused by the D78011. After this change the code starts to call the `emitGlobalConstantLargeInt` even for constants which requires eight bytes to store. 
Differential revision: https://reviews.llvm.org/D88261 (cherry picked from commit c6c5629f2fb4ddabd376fbe7c218733283e91d09) --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +- llvm/test/CodeGen/Mips/emit-big-cst.ll | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index f8f7b74baf91..c7eb0257d71b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2779,7 +2779,7 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV, if (const ConstantInt *CI = dyn_cast(CV)) { const uint64_t StoreSize = DL.getTypeStoreSize(CV->getType()); - if (StoreSize < 8) { + if (StoreSize <= 8) { if (AP.isVerbose()) AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n", CI->getZExtValue()); diff --git a/llvm/test/CodeGen/Mips/emit-big-cst.ll b/llvm/test/CodeGen/Mips/emit-big-cst.ll index 67c2f107db19..679824ef047b 100644 --- a/llvm/test/CodeGen/Mips/emit-big-cst.ll +++ b/llvm/test/CodeGen/Mips/emit-big-cst.ll @@ -16,6 +16,14 @@ ; LE-NEXT: .space 5 ; LE-NEXT: .size bigCst, 16 +; BE-LABEL: notSoBigCst: +; BE-NEXT: .8byte 72057594037927935 +; BE-NEXT: .size notSoBigCst, 8 + +; LE-LABEL: notSoBigCst: +; LE-NEXT: .8byte 72057594037927935 +; LE-NEXT: .size notSoBigCst, 8 + ; BE-LABEL: smallCst: ; BE-NEXT: .2byte 4386 ; BE-NEXT: .byte 51 @@ -38,4 +46,14 @@ define void @accessBig(i64* %storage) { ret void } +@notSoBigCst = internal constant i57 72057594037927935 + +define void @accessNotSoBig(i64* %storage) { + %addr = bitcast i64* %storage to i57* + %bigLoadedCst = load volatile i57, i57* @notSoBigCst + %tmp = add i57 %bigLoadedCst, 1 + store i57 %tmp, i57* %addr + ret void +} + @smallCst = internal constant i24 1122867 From 9e367bd69b0d2523237e204b43301e59a5badb29 Mon Sep 17 00:00:00 2001 From: Craig Disselkoen Date: Fri, 25 Sep 2020 14:34:23 -0700 Subject: [PATCH 271/363] C API: functions 
to get mask of a ShuffleVector This commit fixes a regression (from LLVM 10 to LLVM 11 RC3) in the LLVM C API. Previously, commit 1ee6ec2bf removed the mask operand from the ShuffleVector instruction, storing the mask data separately in the instruction instead; this reduced the number of operands of ShuffleVector from 3 to 2. AFAICT, this change unintentionally caused a regression in the LLVM C API. Specifically, it is no longer possible to get the mask of a ShuffleVector instruction through the C API. This patch introduces new functions which together allow a C API user to get the mask of a ShuffleVector instruction, restoring the functionality which was previously available through LLVMGetOperand(). This patch also adds tests for this change to the llvm-c-test executable, which involved adding support for InsertElement, ExtractElement, and ShuffleVector itself (as well as constant vectors) to echo.cpp. Previously, vector operations weren't tested at all in echo.ll. I also fixed some typos in comments and help-text nearby these changes, which I happened to spot while developing this patch. Since the typo fixes are technically unrelated other than being in the same files, I'm happy to take them out if you'd rather they not be included in the patch. 
Differential Revision: https://reviews.llvm.org/D88190 (cherry picked from commit 51cad041e0cb26597c7ccc0fbfaa349b8fffbcda) --- llvm/include/llvm-c/Core.h | 15 +++++- llvm/lib/IR/Core.cpp | 14 ++++++ llvm/test/Bindings/llvm-c/echo.ll | 18 +++++++ llvm/tools/llvm-c-test/echo.cpp | 80 ++++++++++++++++++++++++++----- llvm/tools/llvm-c-test/main.c | 9 ++-- 5 files changed, 119 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 2c7b4c6eff10..34d23146be40 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -3636,7 +3636,7 @@ void LLVMAddDestination(LLVMValueRef IndirectBr, LLVMBasicBlockRef Dest); /* Get the number of clauses on the landingpad instruction */ unsigned LLVMGetNumClauses(LLVMValueRef LandingPad); -/* Get the value of the clause at idnex Idx on the landingpad instruction */ +/* Get the value of the clause at index Idx on the landingpad instruction */ LLVMValueRef LLVMGetClause(LLVMValueRef LandingPad, unsigned Idx); /* Add a catch or filter clause to the landingpad instruction */ @@ -3937,6 +3937,19 @@ LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, LLVMAtomicOrdering FailureOrdering, LLVMBool SingleThread); +/** + * Get the number of elements in the mask of a ShuffleVector instruction. + */ +unsigned LLVMGetNumMaskElements(LLVMValueRef ShuffleVectorInst); + +/** + * Get the mask value at position Elt in the mask of a ShuffleVector + * instruction. Return LLVMUndefMaskElem if the mask value is undef at that + * position. 
+ */ +int LLVMGetMaskValue(LLVMValueRef ShuffleVectorInst, unsigned Elt); +extern const int LLVMUndefMaskElem; + LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst); void LLVMSetAtomicSingleThread(LLVMValueRef AtomicInst, LLVMBool SingleThread); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 6f3bbc80d4fd..9caaea4b1f7a 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3952,6 +3952,20 @@ LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, singleThread ? SyncScope::SingleThread : SyncScope::System)); } +unsigned LLVMGetNumMaskElements(LLVMValueRef SVInst) { + Value *P = unwrap(SVInst); + ShuffleVectorInst *I = cast(P); + return I->getShuffleMask().size(); +} + +int LLVMGetMaskValue(LLVMValueRef SVInst, unsigned Elt) { + Value *P = unwrap(SVInst); + ShuffleVectorInst *I = cast(P); + return I->getMaskValue(Elt); +} +const int LLVMUndefMaskElem = + -1; // not actually accessible as ShuffleVectorInst::UndefMaskElem, so we + // hardcode it here LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst) { Value *P = unwrap(AtomicInst); diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll index 510798592b9d..5494170f3cd6 100644 --- a/llvm/test/Bindings/llvm-c/echo.ll +++ b/llvm/test/Bindings/llvm-c/echo.ll @@ -156,6 +156,24 @@ define void @memops(i8* %ptr) { ret void } +define i32 @vectorops(i32, i32) { + %a = insertelement <4 x i32> undef, i32 %0, i32 0 + %b = insertelement <4 x i32> %a, i32 %1, i32 2 + %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer + %d = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> + %e = add <4 x i32> %d, %a + %f = mul <4 x i32> %e, %b + %g = xor <4 x i32> %f, %d + %h = or <4 x i32> %f, %e + %i = lshr <4 x i32> %h, + %j = shl <4 x i32> %i, + %k = shufflevector <4 x i32> %j, <4 x i32> %i, <4 x i32> + %m = shufflevector <4 x i32> %k, <4 x i32> undef, <1 x i32> + %n = shufflevector <4 x i32> %j, <4 x i32> undef, <8 x i32> + %p = 
extractelement <8 x i32> %n, i32 5 + ret i32 %p +} + declare void @personalityFn() define void @exn() personality void ()* @personalityFn { diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp index b254da28ddc4..b404048749d3 100644 --- a/llvm/tools/llvm-c-test/echo.cpp +++ b/llvm/tools/llvm-c-test/echo.cpp @@ -30,7 +30,7 @@ template struct CAPIDenseMap {}; // The default DenseMapInfo require to know about pointer alignment. -// Because the C API uses opaques pointer types, their alignment is unknown. +// Because the C API uses opaque pointer types, their alignment is unknown. // As a result, we need to roll out our own implementation. template struct CAPIDenseMap { @@ -306,7 +306,7 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) { return LLVMConstArray(LLVMGetElementType(Ty), Elts.data(), EltCount); } - // Try contant data array + // Try constant data array if (LLVMIsAConstantDataArray(Cst)) { check_value_kind(Cst, LLVMConstantDataArrayValueKind); LLVMTypeRef Ty = TypeCloner(M).Clone(Cst); @@ -357,9 +357,32 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) { report_fatal_error("ConstantFP is not supported"); } - // This kind of constant is not supported + // Try ConstantVector + if (LLVMIsAConstantVector(Cst)) { + check_value_kind(Cst, LLVMConstantVectorValueKind); + LLVMTypeRef Ty = TypeCloner(M).Clone(Cst); + unsigned EltCount = LLVMGetVectorSize(Ty); + SmallVector Elts; + for (unsigned i = 0; i < EltCount; i++) + Elts.push_back(clone_constant(LLVMGetOperand(Cst, i), M)); + return LLVMConstVector(Elts.data(), EltCount); + } + + // Try ConstantDataVector + if (LLVMIsAConstantDataVector(Cst)) { + check_value_kind(Cst, LLVMConstantDataVectorValueKind); + LLVMTypeRef Ty = TypeCloner(M).Clone(Cst); + unsigned EltCount = LLVMGetVectorSize(Ty); + SmallVector Elts; + for (unsigned i = 0; i < EltCount; i++) + Elts.push_back(clone_constant(LLVMGetElementAsConstant(Cst, i), M)); + return 
LLVMConstVector(Elts.data(), EltCount); + } + + // At this point, if it's not a constant expression, it's a kind of constant + // which is not supported if (!LLVMIsAConstantExpr(Cst)) - report_fatal_error("Expected a constant expression"); + report_fatal_error("Unsupported constant kind"); // At this point, it must be a constant expression check_value_kind(Cst, LLVMConstantExprValueKind); @@ -370,7 +393,8 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) { return LLVMConstBitCast(clone_constant(LLVMGetOperand(Cst, 0), M), TypeCloner(M).Clone(Cst)); default: - fprintf(stderr, "%d is not a supported opcode\n", Op); + fprintf(stderr, "%d is not a supported opcode for constant expressions\n", + Op); exit(-1); } } @@ -443,7 +467,7 @@ struct FunCloner { auto i = VMap.find(Src); if (i != VMap.end()) { // If we have a hit, it means we already generated the instruction - // as a dependancy to somethign else. We need to make sure + // as a dependency to something else. We need to make sure // it is ordered properly. 
auto I = i->second; LLVMInstructionRemoveFromParent(I); @@ -746,8 +770,10 @@ struct FunCloner { } case LLVMExtractValue: { LLVMValueRef Agg = CloneValue(LLVMGetOperand(Src, 0)); - if (LLVMGetNumIndices(Src) != 1) - report_fatal_error("Expected only one indice"); + if (LLVMGetNumIndices(Src) > 1) + report_fatal_error("ExtractValue: Expected only one index"); + else if (LLVMGetNumIndices(Src) < 1) + report_fatal_error("ExtractValue: Expected an index"); auto I = LLVMGetIndices(Src)[0]; Dst = LLVMBuildExtractValue(Builder, Agg, I, Name); break; @@ -755,12 +781,44 @@ struct FunCloner { case LLVMInsertValue: { LLVMValueRef Agg = CloneValue(LLVMGetOperand(Src, 0)); LLVMValueRef V = CloneValue(LLVMGetOperand(Src, 1)); - if (LLVMGetNumIndices(Src) != 1) - report_fatal_error("Expected only one indice"); + if (LLVMGetNumIndices(Src) > 1) + report_fatal_error("InsertValue: Expected only one index"); + else if (LLVMGetNumIndices(Src) < 1) + report_fatal_error("InsertValue: Expected an index"); auto I = LLVMGetIndices(Src)[0]; Dst = LLVMBuildInsertValue(Builder, Agg, V, I, Name); break; } + case LLVMExtractElement: { + LLVMValueRef Agg = CloneValue(LLVMGetOperand(Src, 0)); + LLVMValueRef Index = CloneValue(LLVMGetOperand(Src, 1)); + Dst = LLVMBuildExtractElement(Builder, Agg, Index, Name); + break; + } + case LLVMInsertElement: { + LLVMValueRef Agg = CloneValue(LLVMGetOperand(Src, 0)); + LLVMValueRef V = CloneValue(LLVMGetOperand(Src, 1)); + LLVMValueRef Index = CloneValue(LLVMGetOperand(Src, 2)); + Dst = LLVMBuildInsertElement(Builder, Agg, V, Index, Name); + break; + } + case LLVMShuffleVector: { + LLVMValueRef Agg0 = CloneValue(LLVMGetOperand(Src, 0)); + LLVMValueRef Agg1 = CloneValue(LLVMGetOperand(Src, 1)); + SmallVector MaskElts; + unsigned NumMaskElts = LLVMGetNumMaskElements(Src); + for (unsigned i = 0; i < NumMaskElts; i++) { + int Val = LLVMGetMaskValue(Src, i); + if (Val == LLVMUndefMaskElem) { + MaskElts.push_back(LLVMGetUndef(LLVMInt64Type())); + } else { + 
MaskElts.push_back(LLVMConstInt(LLVMInt64Type(), Val, true)); + } + } + LLVMValueRef Mask = LLVMConstVector(MaskElts.data(), NumMaskElts); + Dst = LLVMBuildShuffleVector(Builder, Agg0, Agg1, Mask, Name); + break; + } case LLVMFreeze: { LLVMValueRef Arg = CloneValue(LLVMGetOperand(Src, 0)); Dst = LLVMBuildFreeze(Builder, Arg, Name); @@ -1102,7 +1160,7 @@ static void clone_symbols(LLVMModuleRef Src, LLVMModuleRef M) { LLVMGlobalSetMetadata(G, Kind, MD); } LLVMDisposeValueMetadataEntries(AllMetadata); - + LLVMSetGlobalConstant(G, LLVMIsGlobalConstant(Cur)); LLVMSetThreadLocal(G, LLVMIsThreadLocal(Cur)); LLVMSetExternallyInitialized(G, LLVMIsExternallyInitialized(Cur)); diff --git a/llvm/tools/llvm-c-test/main.c b/llvm/tools/llvm-c-test/main.c index 0be48930f0b6..5e8adb45d691 100644 --- a/llvm/tools/llvm-c-test/main.c +++ b/llvm/tools/llvm-c-test/main.c @@ -36,10 +36,10 @@ static void print_usage(void) { fprintf(stderr, " * --targets-list\n"); fprintf(stderr, " List available targets\n\n"); fprintf(stderr, " * --object-list-sections\n"); - fprintf(stderr, " Read object file form stdin - list sections\n\n"); + fprintf(stderr, " Read object file from stdin - list sections\n\n"); fprintf(stderr, " * --object-list-symbols\n"); fprintf(stderr, - " Read object file form stdin - list symbols (like nm)\n\n"); + " Read object file from stdin - list symbols (like nm)\n\n"); fprintf(stderr, " * --disassemble\n"); fprintf(stderr, " Read lines of triple, hex ascii machine code from stdin " "- print disassembly\n\n"); @@ -48,11 +48,10 @@ static void print_usage(void) { stderr, " Read lines of name, rpn from stdin - print generated module\n\n"); fprintf(stderr, " * --echo\n"); - fprintf(stderr, - " Read bitcode file form stdin - print it back out\n\n"); + fprintf(stderr, " Read bitcode file from stdin - print it back out\n\n"); fprintf(stderr, " * --test-diagnostic-handler\n"); fprintf(stderr, - " Read bitcode file form stdin with a diagnostic handler set\n\n"); + " Read bitcode file 
from stdin with a diagnostic handler set\n\n"); fprintf(stderr, " * --test-dibuilder\n"); fprintf(stderr, " Run tests for the DIBuilder C API - print generated module\n\n"); From 293924973057e33fcc63521f582bb9fd41e60cc4 Mon Sep 17 00:00:00 2001 From: Robert Widmann Date: Sat, 26 Sep 2020 17:32:38 -0600 Subject: [PATCH 272/363] [LLVM-C] Turn a ShuffleVector Constant Into a Getter. It is not a good idea to expose raw constants in the LLVM C API. Replace this with an explicit getter. Differential Revision: https://reviews.llvm.org/D88367 (cherry picked from commit 55f727306e727ea9f013d09c9b8aa70dbce6a1bd) --- llvm/include/llvm-c/Core.h | 13 ++++++++++--- llvm/lib/IR/Core.cpp | 5 ++--- llvm/tools/llvm-c-test/echo.cpp | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 34d23146be40..c8a6f970419b 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -3942,13 +3942,20 @@ LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, */ unsigned LLVMGetNumMaskElements(LLVMValueRef ShuffleVectorInst); +/** + * \returns a constant that specifies that the result of a \c ShuffleVectorInst + * is undefined. + */ +int LLVMGetUndefMaskElem(void); + /** * Get the mask value at position Elt in the mask of a ShuffleVector - * instruction. Return LLVMUndefMaskElem if the mask value is undef at that - * position. + * instruction. + * + * \Returns the result of \c LLVMGetUndefMaskElem() if the mask value is undef + * at that position. 
*/ int LLVMGetMaskValue(LLVMValueRef ShuffleVectorInst, unsigned Elt); -extern const int LLVMUndefMaskElem; LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst); void LLVMSetAtomicSingleThread(LLVMValueRef AtomicInst, LLVMBool SingleThread); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 9caaea4b1f7a..c1f7329034e0 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3963,9 +3963,8 @@ int LLVMGetMaskValue(LLVMValueRef SVInst, unsigned Elt) { ShuffleVectorInst *I = cast(P); return I->getMaskValue(Elt); } -const int LLVMUndefMaskElem = - -1; // not actually accessible as ShuffleVectorInst::UndefMaskElem, so we - // hardcode it here + +int LLVMGetUndefMaskElem(void) { return UndefMaskElem; } LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst) { Value *P = unwrap(AtomicInst); diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp index b404048749d3..0b3a10f463dd 100644 --- a/llvm/tools/llvm-c-test/echo.cpp +++ b/llvm/tools/llvm-c-test/echo.cpp @@ -809,7 +809,7 @@ struct FunCloner { unsigned NumMaskElts = LLVMGetNumMaskElements(Src); for (unsigned i = 0; i < NumMaskElts; i++) { int Val = LLVMGetMaskValue(Src, i); - if (Val == LLVMUndefMaskElem) { + if (Val == LLVMGetUndefMaskElem()) { MaskElts.push_back(LLVMGetUndef(LLVMInt64Type())); } else { MaskElts.push_back(LLVMConstInt(LLVMInt64Type(), Val, true)); From eb83b551d3eb08cf472fe6307fe3809a8005b2cc Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 28 Sep 2020 15:35:01 +0200 Subject: [PATCH 273/363] Fix mysterious failure of SupportTests FileCheckTest.Binop The test would fail in no-asserts release builds using MSVC for 64-bit Windows: Unexpected error message: TestBuffer:1:1: error: implicit format conflict between 'FOO' (%u) and '18\0' (%x), need an explicit format specifier Error message(s) not found: {implicit format conflict between 'FOO' (%u) and 'BAZ' (%x), need an explicit format specifier} It seems a string from a previous test case is 
finding its way into the latter one. This doesn't reproduce on master anymore after 998709b7d, so let's just hack around it here for the branch. --- llvm/unittests/Support/FileCheckTest.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/unittests/Support/FileCheckTest.cpp b/llvm/unittests/Support/FileCheckTest.cpp index 92975dcd76b7..a4591bc319bb 100644 --- a/llvm/unittests/Support/FileCheckTest.cpp +++ b/llvm/unittests/Support/FileCheckTest.cpp @@ -714,6 +714,7 @@ TEST_F(FileCheckTest, Binop) { Value = Binop.eval(); expectUndefErrors({"FOO", "BAR"}, Value.takeError()); + { // Literal + Variable has format of variable. ExprStr = bufferize(SM, "FOO+18"); FooStr = ExprStr.take_front(3); @@ -736,6 +737,7 @@ TEST_F(FileCheckTest, Binop) { ImplicitFormat = Binop.getImplicitFormat(SM); ASSERT_THAT_EXPECTED(ImplicitFormat, Succeeded()); EXPECT_EQ(*ImplicitFormat, ExpressionFormat::Kind::Unsigned); + } // Variables with different implicit format conflict. ExprStr = bufferize(SM, "FOO+BAZ"); From dda0a1867cc0c4ace4535f179aec85c3ff8cfa96 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Tue, 29 Sep 2020 14:58:46 +0200 Subject: [PATCH 274/363] [LLVM 11] Add SystemZ changes to release notes Differential Revision: https://reviews.llvm.org/D88479 --- llvm/docs/ReleaseNotes.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index d724ba09502a..db64fa281018 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -241,6 +241,21 @@ Bug fixes: * The correct libcall is now emitted for converting a float/double to a 32-bit signed or unsigned integer on RV64 targets lacking the F or D extensions. +Changes to the SystemZ Target +----------------------------- + +* Added support for the MemorySanitizer and the LeakSanitizer. +* Added support for the ``-fstack-clash-protection`` command line option. 
+* Enhanced the assembler parser to allow using `%r0` even in an address + register context, and to allow specifying registers using plain integer + numbers instead of register names everywhere. +* Fixed wrong code generation violating the platform ABI when passing + a C++ class (not struct) type having only a single member of + floating-point type. +* Fixed wrong code generation when using the `vec_store_len_r` or + `vec_load_len_r` intrinsics with an immediate length argument of + 16 or larger. +* Miscellaneous codegen enhancements, in particular to improve vector code. Changes to the X86 Target ------------------------- From a3aee2678d07e249dca18493d2acd898eefd48dd Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Tue, 29 Sep 2020 14:39:54 -0700 Subject: [PATCH 275/363] [GlobalISel] Fix multiply with overflow intrinsics legalization generating invalid MIR. During lowering of G_UMULO and friends, the previous code moved the builder's insertion point to be after the legalizing instruction. When that happened, if there happened to be a "G_CONSTANT i32 0" immediately after, the CSEMIRBuilder would try to find that constant during the buildConstant(zero) call, and since it dominates itself would return the iterator unchanged, even though the def of the constant was *after* the current insertion point. This resulted in the compare being generated *before* the constant which it was using. There's no need to modify the insertion point before building the mul-hi or constant. Delaying moving the insert point ensures those are built/CSEd before the G_ICMP is built. 
Fixes PR47679 Differential Revision: https://reviews.llvm.org/D88514 (cherry picked from commit 1d54e75cf26a4c60b66659d5d9c62f4bb9452b03) --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 5 +- .../AArch64/GlobalISel/legalize-mul.mir | 68 ++++++++++++++++++- .../CodeGen/Mips/GlobalISel/legalizer/mul.mir | 2 +- .../CodeGen/Mips/GlobalISel/llvm-ir/mul.ll | 12 ++-- 4 files changed, 76 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index da519f99ad7e..244e7a9583d6 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2368,11 +2368,12 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { MI.RemoveOperand(1); Observer.changedInstr(MI); - MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS}); auto Zero = MIRBuilder.buildConstant(Ty, 0); + // Move insert point forward so we can use the Res register if needed. 
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); + // For *signed* multiply, overflow is detected by checking: // (hi != (lo >> bitwidth-1)) if (Opcode == TargetOpcode::G_SMULH) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir index 3260eb6ca6fd..187ddebd9804 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir @@ -28,8 +28,8 @@ body: | ; CHECK-LABEL: name: test_smul_overflow ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 - ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] ; CHECK: [[SMULH:%[0-9]+]]:_(s64) = G_SMULH [[COPY]], [[COPY1]] + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MUL]], [[C]] ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[SMULH]](s64), [[ASHR]] @@ -51,9 +51,9 @@ body: | ; CHECK-LABEL: name: test_umul_overflow ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 - ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] ; CHECK: [[UMULH:%[0-9]+]]:_(s64) = G_UMULH [[COPY]], [[COPY1]] ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]] ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UMULH]](s64), [[C]] ; CHECK: $x0 = COPY [[MUL]](s64) ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32) @@ -66,3 +66,67 @@ body: | $w0 = COPY %4(s32) ... 
+--- +name: test_umulo_overflow_no_invalid_mir +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } + - { reg: '$x1' } + - { reg: '$x2' } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 8, alignment: 8 } + - { id: 1, size: 8, alignment: 8 } + - { id: 2, size: 16, alignment: 16 } + - { id: 3, size: 16, alignment: 8 } +machineFunctionInfo: {} +body: | + bb.1: + liveins: $x0, $x1, $x2 + ; Check that the overflow result doesn't generate incorrect MIR by using a G_CONSTANT 0 + ; before it's been defined. + ; CHECK-LABEL: name: test_umulo_overflow_no_invalid_mir + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1 + ; CHECK: [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.3 + ; CHECK: G_STORE [[COPY2]](s64), [[FRAME_INDEX]](p0) :: (store 8) + ; CHECK: G_STORE [[COPY1]](s64), [[FRAME_INDEX1]](p0) :: (store 8) + ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 8) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX1]](p0) :: (dereferenceable load 8) + ; CHECK: [[UMULH:%[0-9]+]]:_(s64) = G_UMULH [[LOAD]], [[LOAD1]] + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[LOAD]], [[LOAD1]] + ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UMULH]](s64), [[C]] + ; CHECK: G_STORE [[C]](s64), [[FRAME_INDEX2]](p0) :: (store 8, align 1) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ICMP]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C1]] + ; CHECK: $x0 = COPY [[MUL]](s64) + ; CHECK: $x1 = COPY [[AND]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %0:_(p0) = COPY $x0 + %1:_(s64) = COPY $x1 + %2:_(s64) = COPY $x2 + %25:_(s32) = G_CONSTANT i32 0 + 
%3:_(p0) = G_FRAME_INDEX %stack.0 + %4:_(p0) = G_FRAME_INDEX %stack.1 + %6:_(p0) = G_FRAME_INDEX %stack.3 + G_STORE %2(s64), %3(p0) :: (store 8) + G_STORE %1(s64), %4(p0) :: (store 8) + %7:_(s64) = G_LOAD %3(p0) :: (dereferenceable load 8) + %8:_(s64) = G_LOAD %4(p0) :: (dereferenceable load 8) + %9:_(s64), %10:_(s1) = G_UMULO %7, %8 + %31:_(s64) = G_CONSTANT i64 0 + G_STORE %31(s64), %6(p0) :: (store 8, align 1) + %16:_(s64) = G_ZEXT %10(s1) + $x0 = COPY %9(s64) + $x1 = COPY %16(s64) + RET_ReallyLR implicit $x0 + +... diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir index c92a55d0af32..b146aa5ff13d 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir @@ -439,9 +439,9 @@ body: | ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32: [[COPY2:%[0-9]+]]:_(p0) = COPY $a2 ; MIPS32: [[COPY3:%[0-9]+]]:_(p0) = COPY $a3 - ; MIPS32: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] ; MIPS32: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[COPY1]] ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; MIPS32: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]] ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ne), [[UMULH]](s32), [[C]] ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32) diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/mul.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/mul.ll index 659eadf181c0..f7250ccde898 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/mul.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/mul.ll @@ -180,13 +180,13 @@ declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32) define void @umul_with_overflow(i32 %lhs, i32 %rhs, i32* %pmul, i1* %pcarry_flag) { ; MIPS32-LABEL: umul_with_overflow: ; MIPS32: # %bb.0: -; MIPS32-NEXT: mul $1, $4, $5 ; MIPS32-NEXT: multu $4, $5 -; MIPS32-NEXT: mfhi $2 -; MIPS32-NEXT: sltu $2, $zero, $2 -; 
MIPS32-NEXT: andi $2, $2, 1 -; MIPS32-NEXT: sb $2, 0($7) -; MIPS32-NEXT: sw $1, 0($6) +; MIPS32-NEXT: mfhi $1 +; MIPS32-NEXT: mul $2, $4, $5 +; MIPS32-NEXT: sltu $1, $zero, $1 +; MIPS32-NEXT: andi $1, $1, 1 +; MIPS32-NEXT: sb $1, 0($7) +; MIPS32-NEXT: sw $2, 0($6) ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop %res = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %lhs, i32 %rhs) From 60a25202a7dd1e00067fcfce512086ebf3788537 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 24 Sep 2020 13:44:29 -0400 Subject: [PATCH 276/363] [APFloat] prevent NaN morphing into Inf on conversion (PR43907) We shift the significand right on a truncation, but that needs to be made NaN-safe: always set at least 1 bit in the significand. https://llvm.org/PR43907 See D88238 for the likely follow-up (but needs some plumbing fixes before it can proceed). Differential Revision: https://reviews.llvm.org/D87835 (cherry picked from commit e34bd1e0b03d20a506ada156d87e1b3a96d82fa2) --- llvm/lib/Support/APFloat.cpp | 15 +++++++++++++++ llvm/test/Transforms/ConstProp/cast.ll | 23 +++++++++++++++++++++++ llvm/unittests/ADT/APFloatTest.cpp | 15 +++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 569cac790af9..362595d8f8b1 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -2242,6 +2242,21 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, if (!X86SpecialNan && semantics == &semX87DoubleExtended) APInt::tcSetBit(significandParts(), semantics->precision - 1); + // If we are truncating NaN, it is possible that we shifted out all of the + // set bits in a signalling NaN payload. But NaN must remain NaN, so some + // bit in the significand must be set (otherwise it is Inf). + // This can only happen with sNaN. Set the 1st bit after the quiet bit, + // so that we still have an sNaN. + // FIXME: Set quiet and return opInvalidOp (on convert of any sNaN). 
+ // But this requires fixing LLVM to parse 32-bit hex FP or ignoring + // conversions while parsing IR. + if (APInt::tcIsZero(significandParts(), newPartCount)) { + assert(shift < 0 && "Should not lose NaN payload on extend"); + assert(semantics->precision >= 3 && "Unexpectedly narrow significand"); + assert(*losesInfo && "Missing payload should have set lost info"); + APInt::tcSetBit(significandParts(), semantics->precision - 3); + } + // gcc forces the Quiet bit on, which means (float)(double)(float_sNan) // does not give you back the same bits. This is dubious, and we // don't currently do it. You're really supposed to get diff --git a/llvm/test/Transforms/ConstProp/cast.ll b/llvm/test/Transforms/ConstProp/cast.ll index 8377df17b3a8..c07fa1295b42 100644 --- a/llvm/test/Transforms/ConstProp/cast.ll +++ b/llvm/test/Transforms/ConstProp/cast.ll @@ -38,3 +38,26 @@ define float @overflow_sitofp() { ret float %i } +; https://llvm.org/PR43907 - make sure that NaN doesn't morph into Inf. +; SNaN remains SNaN. + +define float @nan_f64_trunc() { +; CHECK-LABEL: @nan_f64_trunc( +; CHECK-NEXT: ret float 0x7FF4000000000000 +; + %f = fptrunc double 0x7FF0000000000001 to float + ret float %f +} + +; Verify again with a vector and different destination type. +; SNaN remains SNaN (first two elements). +; QNaN remains QNaN (third element). +; Lower 42 bits of NaN source payload are lost. 
+ +define <3 x half> @nan_v3f64_trunc() { +; CHECK-LABEL: @nan_v3f64_trunc( +; CHECK-NEXT: ret <3 x half> +; + %f = fptrunc <3 x double> to <3 x half> + ret <3 x half> %f +} diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index b24b43d09a40..c798c95e05f6 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -1840,6 +1840,21 @@ TEST(APFloatTest, convert) { &losesInfo); EXPECT_TRUE(test.bitwiseIsEqual(X87QNaN)); EXPECT_FALSE(losesInfo); + + // The payload is lost in truncation, but we must retain NaN, so we set the bit after the quiet bit. + APInt payload(52, 1); + test = APFloat::getSNaN(APFloat::IEEEdouble(), false, &payload); + APFloat::opStatus status = test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(0x7fa00000, test.bitcastToAPInt()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + // The payload is lost in truncation. QNaN remains QNaN. + test = APFloat::getQNaN(APFloat::IEEEdouble(), false, &payload); + status = test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(0x7fc00000, test.bitcastToAPInt()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); } TEST(APFloatTest, PPCDoubleDouble) { From b6efbd6b5f22d0a251d2fba9a5d24ac21760b1cc Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 1 Oct 2020 09:22:03 +0200 Subject: [PATCH 277/363] LLVM release notes: JIT changes By Lang Hames! --- llvm/docs/ReleaseNotes.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index db64fa281018..bd6bbca75d9e 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -85,6 +85,22 @@ Changes to building LLVM Python 3 as Python 2 has been end-of-life'd by the Python Software Foundation. 
+Changes to the JIT infrastructure +--------------------------------- + +* LLJIT now supports execution of static inits / deinits via the + LLJIT::initialize and LLJIT::deinitialize methods + +* Static libraries can now be added to a JITDylib using the + StaticLibraryDefinitionGenerator class + +* A C API has been added for OrcV2 (llvm-project/llvm/include/llvm-c/Orc.h) + +* Several OrcV2 example projects have been added to + llvm-project/llvm/examples/OrcV2Examples + +* Many bug fixes and API improvements + Changes to the AArch64 Backend ------------------------------ From 636ecdd147911fa9b51b84308734676ef815ca13 Mon Sep 17 00:00:00 2001 From: Ahsan Saghir Date: Thu, 1 Oct 2020 13:28:35 -0500 Subject: [PATCH 278/363] Fix indentation for PowerPC ReleaseNotes --- llvm/docs/ReleaseNotes.rst | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index bd6bbca75d9e..a1f00a1a3b3a 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -178,24 +178,33 @@ Optimization: Codegen: * POWER10 support -* Added PC Relative addressing -* Added __int128 vector bool support + + * Added PC Relative addressing + * Added __int128 vector bool support + * Security enhancement via probe-stack attribute support to protect against stack clash * Floating point support enhancements -* Improved half precision and quad precision support, including GLIBC -* constrained FP operation support for arithmetic/rounding/max/min -* cleaning up fast math flags checks in DAGCombine, Legalizer, and Lowering + + * Improved half precision and quad precision support, including GLIBC + * constrained FP operation support for arithmetic/rounding/max/min + * cleaning up fast math flags checks in DAGCombine, Legalizer, and Lowering + * Performance improvements from instruction exploitation, especially for vector permute on LE * Scheduling enhancements -* Added MacroFusion for POWER8 -* Added 
post-ra heuristics for POWER9 + + * Added MacroFusion for POWER8 + * Added post-ra heuristics for POWER9 + * Target dependent passes tuning -* Updated LoopStrengthReduce to use instruction number as first priority -* Enhanced MachineCombiner to expose more ILP + + * Updated LoopStrengthReduce to use instruction number as first priority + * Enhanced MachineCombiner to expose more ILP + * Code quality and maintenance enhancements -* Enabled more machine verification passes -* Added ability to parse and emit additional extended mnemonics -* Numerous bug fixes + + * Enabled more machine verification passes + * Added ability to parse and emit additional extended mnemonics + * Numerous bug fixes AIX Support Improvements: From f80e6d63423008ca24aa5d5d9939d0e35a2d7d02 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Mon, 5 Oct 2020 13:38:45 +0200 Subject: [PATCH 279/363] ReleaseNotes: mention the machine outliner for ARM As suggested by Yvan. --- llvm/docs/ReleaseNotes.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index a1f00a1a3b3a..f5f50cbf0158 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -165,6 +165,9 @@ Changes to the ARM Backend * Added support for Cortex-M55, Cortex-A77, Cortex-A78 and Cortex-X1 cores. +* The Machine Outliner is now supported for ARM and Thumb2, it is not + turned on by default and can be enabled with the ``-moutline`` clang flag. + Changes to the PowerPC Target ----------------------------- From 121babae56e9f08acedf3d6d44757e35556d0a37 Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Tue, 6 Oct 2020 00:45:24 +0800 Subject: [PATCH 280/363] [SelectionDAG] Don't remove unused negated constant immediately This reverts partial of a2fb5446 (actually, 2508ef01) about removing negated FP constant immediately if it has no uses. 
However, as discussed in bug 47517, there're cases when NegX is folded into constant from other places while NegY is removed by that line of code and NegX is equal to NegY. In these cases, NegX is deleted before used and crash happens. So revert the code and add necessary test case. (cherry picked from commit b326d4ff946d2061a566a3fcce9f33b484759fe0) --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 4 +--- llvm/test/CodeGen/X86/pr47517.ll | 13 +++++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 64af293caf9e..8b3e6189a07f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5751,10 +5751,8 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // If we already have the use of the negated floating constant, it is free // to negate it even it has multiple uses. - if (!Op.hasOneUse() && CFP.use_empty()) { - RemoveDeadNode(CFP); + if (!Op.hasOneUse() && CFP.use_empty()) break; - } Cost = NegatibleCost::Neutral; return CFP; } diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll index 5672fbc69a41..afc27b49ab2a 100644 --- a/llvm/test/CodeGen/X86/pr47517.ll +++ b/llvm/test/CodeGen/X86/pr47517.ll @@ -26,3 +26,16 @@ entry: %fmul6 = fmul fast float %fmul3, %fadd4 ret float %fmul6 } + +; To ensure negated result will not be removed when NegX=NegY and +; NegX is needed +define float @test2(float %x, float %y) { + %add = fadd fast float %x, 750.0 + %sub = fsub fast float %x, %add + %mul = fmul fast float %sub, %sub + %mul2 = fmul fast float %mul, %sub + %add2 = fadd fast float %mul2, 1.0 + %add3 = fadd fast float %mul2, %add2 + %mul3 = fmul fast float %y, %add3 + ret float %mul3 +} From e84852be644d34867a604997fd328bf411b1977d Mon Sep 17 00:00:00 2001 From: Shivanshu Goyal Date: Tue, 6 Oct 2020 16:12:48 +0200 Subject: [PATCH 
281/363] Add ability to turn off -fpch-instantiate-templates in clang-cl A lot of our code building with clang-cl.exe using Clang 11 was failing with the following 2 type of errors: 1. explicit specialization of 'foo' after instantiation 2. no matching function for call to 'bar' Note that we also use -fdelayed-template-parsing in our builds. I tried pretty hard to get a small repro for these failures, but couldn't. So there is some subtle edge case in the -fpch-instantiate-templates feature introduced by this change: https://reviews.llvm.org/D69585 When I tried turning this off using -fno-pch-instantiate-templates, builds would silently fail with the same error without any indication that -fno-pch-instantiate-templates was being ignored by the compiler. Then I realized this "no" option wasn't actually working when I ran Clang under a debugger. Differential revision: https://reviews.llvm.org/D88680 (cherry picked from commit 66e4f07198761bbb4dcd55235024c1081ed15c75) --- clang/include/clang/Driver/Options.td | 4 ++-- clang/lib/Driver/ToolChains/Clang.cpp | 6 +++++- clang/test/Driver/pch-instantiate-templates.c | 13 +++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/pch-instantiate-templates.c diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f818acb39d51..966cb907b7e2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1435,11 +1435,11 @@ def fno_pch_validate_input_files_content: Group, Flags<[DriverOption]>; def fpch_instantiate_templates: Flag <["-"], "fpch-instantiate-templates">, - Group, Flags<[CC1Option]>, + Group, Flags<[CC1Option, CoreOption]>, HelpText<"Instantiate templates already while building a PCH">; def fno_pch_instantiate_templates: Flag <["-"], "fno-pch-instantiate-templates">, - Group, Flags<[CC1Option]>; + Group, Flags<[CC1Option, CoreOption]>; defm pch_codegen: OptInFFlag<"pch-codegen", "Generate ", "Do 
not generate ", "code for uses of this PCH that assumes an explicit object file will be built for the PCH">; defm pch_debuginfo: OptInFFlag<"pch-debuginfo", "Generate ", "Do not generate ", diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f0a5451322aa..af4bcf951e6c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1197,7 +1197,11 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, if (YcArg && JA.getKind() >= Action::PrecompileJobClass && JA.getKind() <= Action::AssembleJobClass) { CmdArgs.push_back(Args.MakeArgString("-building-pch-with-obj")); - CmdArgs.push_back(Args.MakeArgString("-fpch-instantiate-templates")); + // -fpch-instantiate-templates is the default when creating + // precomp using /Yc + if (Args.hasFlag(options::OPT_fpch_instantiate_templates, + options::OPT_fno_pch_instantiate_templates, true)) + CmdArgs.push_back(Args.MakeArgString("-fpch-instantiate-templates")); } if (YcArg || YuArg) { StringRef ThroughHeader = YcArg ? 
YcArg->getValue() : YuArg->getValue(); diff --git a/clang/test/Driver/pch-instantiate-templates.c b/clang/test/Driver/pch-instantiate-templates.c new file mode 100644 index 000000000000..b0f7f3473993 --- /dev/null +++ b/clang/test/Driver/pch-instantiate-templates.c @@ -0,0 +1,13 @@ +// CL driver test cases +// RUN: %clang_cl -### /Yc /Fpfoo.pch /Fofoo.obj -- %s 2>&1 | FileCheck --check-prefix=CLANG_CL_YC %s +// RUN: %clang_cl -### /Yc /Fpfoo.pch /Fofoo.obj -fno-pch-instantiate-templates -- %s 2>&1 | FileCheck --check-prefix=CLANG_CL_YC_DISABLE %s + +// CLANG_CL_YC: "-fpch-instantiate-templates" +// CLANG_CL_YC_DISABLE-NOT: "-fpch-instantiate-templates" + +// GCC driver test cases +// RUN: %clang -### -x c-header %s -o %t/foo.pch 2>&1 | FileCheck -check-prefix=GCC_DEFAULT %s +// RUN: %clang -### -x c-header %s -o %t/foo.pch -fpch-instantiate-templates 2>&1 | FileCheck -check-prefix=GCC_DEFAULT_ENABLE %s + +// GCC_DEFAULT-NOT: "-fpch-instantiate-templates" +// GCC_DEFAULT_ENABLE: "-fpch-instantiate-templates" From 176249bd6732a8044d457092ed932768724a6f06 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Tue, 6 Oct 2020 17:54:36 -0700 Subject: [PATCH 282/363] [CodeGen][TailDuplicator] Don't duplicate blocks with INLINEASM_BR Tail duplication of a block with an INLINEASM_BR may result in a PHI node on the indirect branch. This is okay, but it also introduces a copy for that PHI node *after* the INLINEASM_BR, which is not okay. 
See: https://github.com/ClangBuiltLinux/linux/issues/1125 Differential Revision: https://reviews.llvm.org/D88823 (cherry picked from commit d2c61d2bf9bd1efad49acba2f2751112522686aa) --- llvm/lib/CodeGen/TailDuplicator.cpp | 8 +++ llvm/test/CodeGen/X86/tail-dup-asm-goto.ll | 61 ++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 llvm/test/CodeGen/X86/tail-dup-asm-goto.ll diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index bd554189f12b..f9773f74a7bd 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -627,6 +627,14 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (PreRegAlloc && MI.isCall()) return false; + // TailDuplicator::appendCopies will erroneously place COPYs after + // INLINEASM_BR instructions after 4b0aa5724fea, which demonstrates the same + // bug that was fixed in f7a53d82c090. + // FIXME: Use findPHICopyInsertPoint() to find the correct insertion point + // for the COPY when replacing PHIs. + if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) + return false; + if (MI.isBundle()) InstrCount += MI.getBundleSize(); else if (!MI.isPHI() && !MI.isMetaInstruction()) diff --git a/llvm/test/CodeGen/X86/tail-dup-asm-goto.ll b/llvm/test/CodeGen/X86/tail-dup-asm-goto.ll new file mode 100644 index 000000000000..77aa3adf0fc6 --- /dev/null +++ b/llvm/test/CodeGen/X86/tail-dup-asm-goto.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=x86_64-linux -stop-after=early-tailduplication < %s | FileCheck %s + +; Ensure that we don't duplicate a block with an "INLINEASM_BR" instruction +; during code gen. 
+declare void @foo() + +define i8* @test1(i8** %arg1, i8* %arg2) { + ; CHECK-LABEL: name: test1 + ; CHECK: bb.0.bb: + ; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; CHECK: liveins: $rdi, $rsi + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 0, $noreg :: (load 8 from %ir.arg1) + ; CHECK: [[SUB64rr:%[0-9]+]]:gr64 = SUB64rr [[MOV64rm]], [[COPY]], implicit-def $eflags + ; CHECK: JCC_1 %bb.2, 4, implicit $eflags + ; CHECK: JMP_1 %bb.1 + ; CHECK: bb.1.bb100: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: MOV64mi32 [[COPY1]], 1, $noreg, 0, $noreg, 0 :: (store 8 into %ir.arg1) + ; CHECK: JMP_1 %bb.3 + ; CHECK: bb.2.bb106: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK: CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp + ; CHECK: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK: bb.3.bb110: + ; CHECK: successors: %bb.5(0x80000000), %bb.4(0x00000000) + ; CHECK: [[PHI:%[0-9]+]]:gr64 = PHI [[COPY]], %bb.2, [[MOV64rm]], %bb.1 + ; CHECK: INLINEASM_BR &"#$0 $1 $2", 9 /* sideeffect mayload attdialect */, 13 /* imm */, 42, 13 /* imm */, 0, 13 /* imm */, blockaddress(@test1, %ir-block.bb17.i.i.i), 12 /* clobber */, implicit-def early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def early-clobber $eflags + ; CHECK: JMP_1 %bb.5 + ; CHECK: bb.4.bb17.i.i.i (address-taken): + ; CHECK: successors: %bb.5(0x80000000) + ; CHECK: bb.5.kmem_cache_has_cpu_partial.exit: + ; CHECK: $rax = COPY [[PHI]] + ; CHECK: RET 0, $rax +bb: + %i28.i = load i8*, i8** %arg1, align 8 + %if = icmp ne i8* %i28.i, %arg2 + br i1 %if, label %bb100, label %bb106 + 
+bb100: ; preds = %bb + store i8* null, i8** %arg1, align 8 + br label %bb110 + +bb106: ; preds = %bb + call void @foo() + br label %bb110 + +bb110: ; preds = %bb106, %bb100 + %i10.1 = phi i8* [ %arg2, %bb106 ], [ %i28.i, %bb100 ] + callbr void asm sideeffect "#$0 $1 $2", "i,i,X,~{dirflag},~{fpsr},~{flags}"(i32 42, i1 false, i8* blockaddress(@test1, %bb17.i.i.i)) + to label %kmem_cache_has_cpu_partial.exit [label %bb17.i.i.i] + +bb17.i.i.i: ; preds = %bb110 + br label %kmem_cache_has_cpu_partial.exit + +kmem_cache_has_cpu_partial.exit: ; preds = %bb110 + ret i8* %i10.1 +} From 3039965645c8100f87556a74afe337f4fceeffcd Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 23 Oct 2020 01:40:56 +0000 Subject: [PATCH 283/363] Bump version to 11.0.1 --- libcxx/CMakeLists.txt | 2 +- libcxxabi/CMakeLists.txt | 2 +- libunwind/CMakeLists.txt | 2 +- llvm/CMakeLists.txt | 2 +- llvm/utils/gn/secondary/llvm/version.gni | 2 +- llvm/utils/lit/lit/__init__.py | 2 +- llvm/utils/release/build_llvm_package.bat | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index f37d729a8a15..f145831c75d8 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -32,7 +32,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXX_STANDALONE_BUIL project(libcxx CXX C) set(PACKAGE_NAME libcxx) - set(PACKAGE_VERSION 11.0.0) + set(PACKAGE_VERSION 11.0.1) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index 6cb139b311c0..deff3d5e4ad1 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -25,7 +25,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXXABI_STANDALONE_B project(libcxxabi CXX C) set(PACKAGE_NAME libcxxabi) - set(PACKAGE_VERSION 11.0.0) + set(PACKAGE_VERSION 11.0.1) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT 
"llvm-bugs@lists.llvm.org") diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index bd8176c67925..cdac67e93df1 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -83,7 +83,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBUNWIND_STANDALONE_B endif() set(PACKAGE_NAME libunwind) - set(PACKAGE_VERSION 11.0.0) + set(PACKAGE_VERSION 11.0.1) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 038139a24090..915400af7a83 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -30,7 +30,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR) set(LLVM_VERSION_MINOR 0) endif() if(NOT DEFINED LLVM_VERSION_PATCH) - set(LLVM_VERSION_PATCH 0) + set(LLVM_VERSION_PATCH 1) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) set(LLVM_VERSION_SUFFIX "") diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index 44c8736132e3..e2b6390b66cc 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ llvm_version_major = 11 llvm_version_minor = 0 -llvm_version_patch = 0 +llvm_version_patch = 1 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py index 6bf0132d4252..27629033758f 100644 --- a/llvm/utils/lit/lit/__init__.py +++ b/llvm/utils/lit/lit/__init__.py @@ -2,7 +2,7 @@ __author__ = 'Daniel Dunbar' __email__ = 'daniel@minormatter.com' -__versioninfo__ = (0, 11, 0) +__versioninfo__ = (0, 11, 1) __version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev' __all__ = [] diff --git a/llvm/utils/release/build_llvm_package.bat b/llvm/utils/release/build_llvm_package.bat index 8e487ef6812e..31e237c63565 100755 --- a/llvm/utils/release/build_llvm_package.bat +++ b/llvm/utils/release/build_llvm_package.bat @@ -27,8 +27,8 @@ set 
python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python36 for /f "usebackq" %%i in (`PowerShell ^(Get-Date^).ToString^('yyyyMMdd'^)`) do set datestamp=%%i set revision=%1 -set package_version=11.0.0-%revision:~0,8% -set clang_format_vs_version=11.0.0.%datestamp% +set package_version=11.0.1-%revision:~0,8% +set clang_format_vs_version=11.0.1.%datestamp% set build_dir=llvm_package_%revision:~0,8% echo Revision: %revision% From b59b6b662bb987955f1516c9a47e96c7ab443ac6 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 23 Oct 2020 19:48:46 +0000 Subject: [PATCH 284/363] Import github action definitions from release/10.x branch --- .github/workflows/clang-tests.yml | 43 +++++++++++ .github/workflows/libclc-tests.yml | 53 +++++++++++++ .github/workflows/lld-tests.yml | 43 +++++++++++ .github/workflows/lldb-tests.yml | 48 ++++++++++++ .github/workflows/llvm-tests.yml | 116 +++++++++++++++++++++++++++++ 5 files changed, 303 insertions(+) create mode 100644 .github/workflows/clang-tests.yml create mode 100644 .github/workflows/libclc-tests.yml create mode 100644 .github/workflows/lld-tests.yml create mode 100644 .github/workflows/lldb-tests.yml create mode 100644 .github/workflows/llvm-tests.yml diff --git a/.github/workflows/clang-tests.yml b/.github/workflows/clang-tests.yml new file mode 100644 index 000000000000..f8ca65e10726 --- /dev/null +++ b/.github/workflows/clang-tests.yml @@ -0,0 +1,43 @@ +name: Clang Tests + +on: + push: + branches: + - 'release/**' + paths: + - 'clang/**' + - 'llvm/**' + - '.github/workflows/clang-tests.yml' + pull_request: + paths: + - 'clang/**' + - 'llvm/**' + - '.github/workflows/clang-tests.yml' + +jobs: + build_clang: + name: clang check-all + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macOS-latest + steps: + - name: Setup Windows + if: startsWith(matrix.os, 'windows') + uses: llvm/actions/setup-windows@master + with: + arch: amd64 + - name: Install 
Ninja + uses: llvm/actions/install-ninja@master + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Test clang + uses: llvm/actions/build-test-llvm-project@master + with: + cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release + build_target: check-clang diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml new file mode 100644 index 000000000000..4e8639b1c89a --- /dev/null +++ b/.github/workflows/libclc-tests.yml @@ -0,0 +1,53 @@ +name: libclc Tests + +on: + push: + branches: + - 'release/**' + paths: + - 'clang/**' + - 'llvm/**' + - 'libclc/**' + - '.github/workflows/libclc-tests.yml' + pull_request: + paths: + - 'clang/**' + - 'llvm/**' + - 'libclc/**' + - '.github/workflows/libclc-tests.yml' + +jobs: + build_libclc: + name: libclc test + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + # Disable build on windows, because I can't figure out where llvm-config is. + #- windows-latest + - macOS-latest + steps: + - name: Setup Windows + if: startsWith(matrix.os, 'windows') + uses: llvm/actions/setup-windows@master + with: + arch: amd64 + - name: Install Ninja + uses: llvm/actions/install-ninja@master + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Build clang + uses: llvm/actions/build-test-llvm-project@master + with: + cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release + build_target: "" + - name: Build and test libclc + run: | + mkdir libclc-build + cd libclc-build + cmake -G Ninja ../libclc -DLLVM_CONFIG=../build/bin/llvm-config + ninja + ninja test diff --git a/.github/workflows/lld-tests.yml b/.github/workflows/lld-tests.yml new file mode 100644 index 000000000000..9b4cbe95f231 --- /dev/null +++ b/.github/workflows/lld-tests.yml @@ -0,0 +1,43 @@ +name: LLD Tests + +on: + push: + branches: + - 'release/**' + paths: + - 'lld/**' + - 'llvm/**' + - '.github/workflows/lld-tests.yml' + pull_request: + paths: + - 
'lld/**' + - 'llvm/**' + - '.github/workflows/lld-tests.yml' + +jobs: + build_lld: + name: lld check-all + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macOS-latest + steps: + - name: Setup Windows + if: startsWith(matrix.os, 'windows') + uses: llvm/actions/setup-windows@master + with: + arch: amd64 + - name: Install Ninja + uses: llvm/actions/install-ninja@master + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Test lld + uses: llvm/actions/build-test-llvm-project@master + with: + cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="lld" -DCMAKE_BUILD_TYPE=Release + build_target: check-lld diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml new file mode 100644 index 000000000000..229e6deece6e --- /dev/null +++ b/.github/workflows/lldb-tests.yml @@ -0,0 +1,48 @@ +name: lldb Tests + +on: + push: + branches: + - 'release/**' + paths: + - 'clang/**' + - 'llvm/**' + - 'lldb/**' + - '.github/workflows/lldb-tests.yml' + pull_request: + paths: + - 'clang/**' + - 'llvm/**' + - 'lldb/**' + - '.github/workflows/lldb-tests.yml' + +jobs: + build_lldb: + name: lldb build + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + # macOS build disabled due to: llvm.org/PR46190 + #- macOS-latest + steps: + - name: Setup Windows + if: startsWith(matrix.os, 'windows') + uses: llvm/actions/setup-windows@master + with: + arch: amd64 + - name: Install Ninja + uses: llvm/actions/install-ninja@master + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Build lldb + uses: llvm/actions/build-test-llvm-project@master + with: + # Mac OS requries that libcxx is enabled for lldb tests, so we need to disable them. + cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="clang;lldb" -DCMAKE_BUILD_TYPE=Release -DLLDB_INCLUDE_TESTS=OFF + # check-lldb is not consistent, so we only build lldb. 
+ build_target: "" diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml new file mode 100644 index 000000000000..baefbc08c102 --- /dev/null +++ b/.github/workflows/llvm-tests.yml @@ -0,0 +1,116 @@ +name: LLVM Tests + +env: + release_major: 11 + +on: + push: + branches: + - 'release/**' + paths: + - 'llvm/**' + - '.github/workflows/llvm-tests.yml' + pull_request: + paths: + - 'llvm/**' + - '.github/workflows/llvm-tests.yml' + +jobs: + build_llvm: + name: llvm check-all + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macOS-latest + steps: + - name: Setup Windows + if: startsWith(matrix.os, 'windows') + uses: llvm/actions/setup-windows@master + with: + arch: amd64 + - name: Install Ninja + uses: llvm/actions/install-ninja@master + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Test llvm + uses: llvm/actions/build-test-llvm-project@master + with: + cmake_args: -G Ninja -DCMAKE_BUILD_TYPE=Release + + abi-dump: + runs-on: ubuntu-latest + strategy: + matrix: + name: + - build-baseline + - build-latest + include: + - name: build-baseline + # FIXME: Referencing the env context does not work here + # ref: llvmorg-${{ env.release_major }}.0.0 + ref: llvmorg-11.0.0 + repo: llvm/llvm-project + - name: build-latest + ref: ${{ github.sha }} + repo: ${{ github.repository }} + steps: + - name: Install Ninja + uses: llvm/actions/install-ninja@master + - name: Install abi-compliance-checker + run: | + sudo apt-get install abi-dumper autoconf pkg-config + - name: Install universal-ctags + run: | + git clone https://github.com/universal-ctags/ctags.git + cd ctags + ./autogen.sh + ./configure + sudo make install + - name: Download source code + uses: llvm/actions/get-llvm-project-src@master + with: + ref: ${{ matrix.ref }} + repo: ${{ matrix.repo }} + - name: Configure + run: | + mkdir build + cd build + cmake -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" 
-DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" ../llvm + - name: Build + run: ninja -C build libLLVM-${{ env.release_major }}.so + - name: Dump ABI + run: abi-dumper -lver ${{ matrix.ref }} -skip-cxx -public-headers llvm/include -o ${{ matrix.ref }}.abi.tar.gz build/lib/libLLVM-${{ env.release_major }}.so + - name: Upload ABI file + uses: actions/upload-artifact@v1 + with: + name: ${{ matrix.name }} + path: ${{ matrix.ref }}.abi.tar.gz + + abi-compare: + runs-on: ubuntu-latest + needs: + - abi-dump + steps: + - name: Download baseline + uses: actions/download-artifact@v1 + with: + name: build-baseline + - name: Download latest + uses: actions/download-artifact@v1 + with: + name: build-latest + - name: Install abi-compliance-checker + run: sudo apt-get install abi-compliance-checker + - name: Compare ABI + run: abi-compliance-checker -l libLLVM-${{ env.release_major}}.so -old build-baseline/*.tar.gz -new build-latest/*.tar.gz + - name: Upload ABI Comparison + if: always() + uses: actions/upload-artifact@v1 + with: + name: compat-report-${{ github.sha }} + path: compat_reports/ From 38399ced95bca0aede9baaecd75936952758b1b6 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 3 Sep 2020 20:58:56 -0700 Subject: [PATCH 285/363] [ConstantFold] Make areGlobalsPotentiallyEqual less aggressive. In particular, we shouldn't make assumptions about globals which are unnamed_addr: we can fold them together with other globals. Also while I'm here, use isInterposable() instead of trying to explicitly name all the different kinds of weak linkage. 
Fixes https://bugs.llvm.org/show_bug.cgi?id=47090 Differential Revision: https://reviews.llvm.org/D87123 (cherry picked from commit d751f86189a7f7ef2a6fe06974a5da3349b02f20) --- llvm/lib/IR/ConstantFold.cpp | 2 +- llvm/test/Assembler/ConstantExprNoFold.ll | 6 ++++++ llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index f3c3e9ad9f69..c20d0955f3d8 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -1589,7 +1589,7 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) { static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1, const GlobalValue *GV2) { auto isGlobalUnsafeForEquality = [](const GlobalValue *GV) { - if (GV->hasExternalWeakLinkage() || GV->hasWeakAnyLinkage()) + if (GV->isInterposable() || GV->hasGlobalUnnamedAddr()) return true; if (const auto *GVar = dyn_cast(GV)) { Type *Ty = GVar->getValueType(); diff --git a/llvm/test/Assembler/ConstantExprNoFold.ll b/llvm/test/Assembler/ConstantExprNoFold.ll index 42e558eb3865..d91855925c89 100644 --- a/llvm/test/Assembler/ConstantExprNoFold.ll +++ b/llvm/test/Assembler/ConstantExprNoFold.ll @@ -42,6 +42,12 @@ target datalayout = "p:32:32" @empty.2 = external global [0 x i8], align 1 @empty.cmp = global i1 icmp eq ([0 x i8]* @empty.1, [0 x i8]* @empty.2) +; Two unnamed_addr globals can share an address +; CHECK: @unnamed.cmp = global i1 icmp eq ([5 x i8]* @unnamed.1, [5 x i8]* @unnamed.2) +@unnamed.1 = unnamed_addr constant [5 x i8] c"asdf\00" +@unnamed.2 = unnamed_addr constant [5 x i8] c"asdf\00" +@unnamed.cmp = global i1 icmp eq ([5 x i8]* @unnamed.1, [5 x i8]* @unnamed.2) + @addrspace3 = internal addrspace(3) global i32 undef ; CHECK: @no.fold.addrspace.icmp.eq.gv.null = global i1 icmp eq (i32 addrspace(3)* @addrspace3, i32 addrspace(3)* null) diff --git 
a/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll b/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll index ad0fe5a21783..da9d0469e5e2 100644 --- a/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll +++ b/llvm/test/Transforms/InstCombine/2010-03-03-ExtElim.ll @@ -16,8 +16,8 @@ define i1 @PR6486() nounwind { ; CHECK: ret i1 true } -@d = common global i32 0, align 4 -@a = common global [1 x i32] zeroinitializer, align 4 +@d = global i32 0, align 4 +@a = global [1 x i32] zeroinitializer, align 4 define i1 @PR16462_1() nounwind { ; CHECK-LABEL: @PR16462_1( From 1ff84a04aebcafc65e43dfe13d6f2aa352f72637 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 7 Oct 2020 16:24:33 -0700 Subject: [PATCH 286/363] BPF: fix incorrect DAG2DAG load optimization Currently, bpf backend Instruction section DAG2DAG phase has an optimization to replace loading constant struct memeber or array element with direct values. The reason is that these locally defined struct or array variables may have their initial values stored in a readonly section and early bpf ecosystem is not able to handle such cases. Bpf ecosystem now can not only handle readonly sections, but also global variables. global variable can also have initialized data and global variable may or may not be constant, i.e., global variable data can be put in .data section or .rodata section. This exposed a bug in DAG2DAG Load optimization as it did not check whether the global variable is constant or not. This patch fixed the bug by checking whether global variable, representing the initial data, is constant or not and will not do optimization if it is not a constant. Another bug is also fixed in this patch to check whether the load is simple (not volatile/atomic) or not. If it is not simple, we will not do optimization. To summary for globals: - struct t var = { ... } ; // no load optimization - const struct t var = { ... }; // load optimization is possible - volatile const struct t var = { ... 
}; // no load optimization Differential Revision: https://reviews.llvm.org/D89021 (cherry picked from commit 31611721686760fe59c91a84b025e4dee94d1662) --- llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp | 4 ++-- llvm/test/CodeGen/BPF/rodata_6.ll | 25 +++++++++++++++++++++++++ llvm/test/CodeGen/BPF/rodata_7.ll | 25 +++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/BPF/rodata_6.ll create mode 100644 llvm/test/CodeGen/BPF/rodata_7.ll diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index d407edfbd966..77f565fb5957 100644 --- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -254,7 +254,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node, const LoadSDNode *LD = cast(Node); uint64_t size = LD->getMemOperand()->getSize(); - if (!size || size > 8 || (size & (size - 1))) + if (!size || size > 8 || (size & (size - 1)) || !LD->isSimple()) return; SDNode *LDAddrNode = LD->getOperand(1).getNode(); @@ -342,7 +342,7 @@ bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node, unsigned char *ByteSeq) { const GlobalVariable *V = dyn_cast(Node->getGlobal()); - if (!V || !V->hasInitializer()) + if (!V || !V->hasInitializer() || !V->isConstant()) return false; const Constant *Init = V->getInitializer(); diff --git a/llvm/test/CodeGen/BPF/rodata_6.ll b/llvm/test/CodeGen/BPF/rodata_6.ll new file mode 100644 index 000000000000..1af3d8dc230f --- /dev/null +++ b/llvm/test/CodeGen/BPF/rodata_6.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=bpf < %s | FileCheck %s +; +; Source code: +; struct t1 { int a; }; +; struct t1 data = { .a = 3 }; +; int foo(void) { +; return data.a + 20; +; } +; Compilation flag: +; clang -target bpf -O2 -S -emit-llvm test.c + +%struct.t1 = type { i32 } + +@data = dso_local local_unnamed_addr global %struct.t1 { i32 3 }, align 4 + +; Function Attrs: norecurse nounwind readonly +define dso_local i32 @foo() 
local_unnamed_addr { +entry: + %0 = load i32, i32* getelementptr inbounds (%struct.t1, %struct.t1* @data, i64 0, i32 0), align 4 + %add = add nsw i32 %0, 20 +; CHECK: [[REG1:r[0-9]+]] = data ll +; CHECK: r0 = *(u32 *)([[REG1]] + 0) +; CHECK: r0 += 20 + ret i32 %add +} diff --git a/llvm/test/CodeGen/BPF/rodata_7.ll b/llvm/test/CodeGen/BPF/rodata_7.ll new file mode 100644 index 000000000000..69969a140302 --- /dev/null +++ b/llvm/test/CodeGen/BPF/rodata_7.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=bpf < %s | FileCheck %s +; +; Source code: +; struct t1 { int a; }; +; volatile const struct t1 data = { .a = 3 }; +; int foo(void) { +; return data.a + 20; +; } +; Compilation flag: +; clang -target bpf -O2 -S -emit-llvm test.c + +%struct.t1 = type { i32 } + +@data = dso_local constant %struct.t1 { i32 3 }, align 4 + +; Function Attrs: nofree norecurse nounwind +define dso_local i32 @foo() local_unnamed_addr { +entry: + %0 = load volatile i32, i32* getelementptr inbounds (%struct.t1, %struct.t1* @data, i64 0, i32 0), align 4 + %add = add nsw i32 %0, 20 +; CHECK: [[REG1:r[0-9]+]] = data ll +; CHECK: r0 = *(u32 *)([[REG1]] + 0) +; CHECK: r0 += 20 + ret i32 %add +} From 83716db47f94391771bf469262ab4282679d8981 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 27 Oct 2020 09:30:10 -0700 Subject: [PATCH 287/363] [ELF] -r: don't crash when a non-SHF_LINK_ORDER orphan is added before a SHF_LINK_ORDER orphan Fixes https://github.com/ClangBuiltLinux/linux/issues/1186 If a non-SHF_LINK_ORDER orphan is added first, `firstIsec->flags & SHF_LINK_ORDER` will be zero and we currently assert when calling `getLinkOrderDep`. 
Reviewed By: grimar Differential Revision: https://reviews.llvm.org/D90200 (cherry picked from commit ae73091f30245852817c5c0af050a5a731dee50a) --- lld/ELF/LinkerScript.cpp | 7 +++++-- lld/test/ELF/linkorder-mixed2.s | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 lld/test/ELF/linkorder-mixed2.s diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 6de2cd65b973..7314b27659bb 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -679,8 +679,11 @@ addInputSec(StringMap> &map, auto *firstIsec = cast( cast(sec->sectionCommands[0]) ->sectionBases[0]); - if (firstIsec->getLinkOrderDep()->getOutputSection() != - isec->getLinkOrderDep()->getOutputSection()) + OutputSection *firstIsecOut = + firstIsec->flags & SHF_LINK_ORDER + ? firstIsec->getLinkOrderDep()->getOutputSection() + : nullptr; + if (firstIsecOut != isec->getLinkOrderDep()->getOutputSection()) continue; } diff --git a/lld/test/ELF/linkorder-mixed2.s b/lld/test/ELF/linkorder-mixed2.s new file mode 100644 index 000000000000..26ab970676aa --- /dev/null +++ b/lld/test/ELF/linkorder-mixed2.s @@ -0,0 +1,22 @@ +# REQUIRES: x86 +## In a relocatable link, don't combine SHF_LINK_ORDER and non-SHF_LINK_ORDER +## like we don't combine SHF_LINK_ORDER with different linked-to sections +## (see linkerscript/linkorder-linked-to.s). +## Test we support adding a non-SHF_LINK_ORDER section as an orphan first. + +# RUN: llvm-mc -filetype=obj --triple=x86_64 %s -o %t.o + +# RUN: ld.lld -r %t.o -o %t.ro +# RUN: llvm-readelf -x foo %t.ro | FileCheck %s + +# CHECK: Hex dump of section 'foo': +# CHECK-NEXT: 0x00000000 0100 + +.section foo,"a" +.byte 0 + +.section .text,"ax",@progbits +ret + +.section foo,"ao",@progbits,.text +.byte 1 From 8fc424f26bf1ea1471bd770a1b4eee4545c2bc96 Mon Sep 17 00:00:00 2001 From: Aaron Puchert Date: Mon, 26 Oct 2020 20:32:46 +0100 Subject: [PATCH 288/363] Add release tarballs for libclc Fixes PR47917. 
Reviewed By: tstellar Differential Revision: https://reviews.llvm.org/D90100 (cherry picked from commit 139785dc98ae94717eebaed083eeaad5d775b495) --- llvm/utils/release/export.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh index c3277de38b53..3ffd7e78dd63 100755 --- a/llvm/utils/release/export.sh +++ b/llvm/utils/release/export.sh @@ -13,7 +13,7 @@ set -e -projects="llvm clang test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp libunwind flang" +projects="llvm clang test-suite compiler-rt libcxx libcxxabi libclc clang-tools-extra polly lldb lld openmp libunwind flang" release="" rc="" From 5ad2592b5dc039608eab8a07ce3bd0d8923f0516 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 28 Oct 2020 15:50:32 +0000 Subject: [PATCH 289/363] [X86] Fix cpu name typos As discussed on PR26418 rGea84dc9500df incorrectly set the knl cpuname to tremont (and missed out the tremont cpuname entirely). 
(cherry picked from commit 0d17dc2e75428885e37e53a1524ce7b607501cfa) --- llvm/lib/Support/Host.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 658c1ee74cfe..36cecf9b2a16 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -760,14 +760,15 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, *Type = X86::INTEL_GOLDMONT_PLUS; break; case 0x86: + CPU = "tremont"; *Type = X86::INTEL_TREMONT; break; + // Xeon Phi (Knights Landing + Knights Mill): case 0x57: - CPU = "tremont"; + CPU = "knl"; *Type = X86::INTEL_KNL; break; - case 0x85: CPU = "knm"; *Type = X86::INTEL_KNM; From 701addff1b713ee13d85daa0e3f7a0504d84b1af Mon Sep 17 00:00:00 2001 From: Hubert Tong Date: Thu, 1 Oct 2020 15:46:26 -0400 Subject: [PATCH 290/363] [clang][Sema] Fix PR47676: Handle dependent AltiVec C-style cast Fix premature decision in the presence of type-dependent expression operands on whether AltiVec vector initializations from single expressions are "splat" operations. Verify that the instantiation is able to determine the correct cast semantics for both the scalar type and the vector type case. Note that, because the change only affects the single-expression case (and the target type is an AltiVec-style vector type), the replacement of a parenthesized list with a parenthesized expression does not change the semantics of the program in a program-observable manner. 
Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D88526 (cherry picked from commit 35ecc7fe49ba881a77e8146b51870a60a52b211f) --- clang/lib/Sema/SemaExpr.cpp | 2 +- clang/test/SemaTemplate/pr47676.cpp | 38 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaTemplate/pr47676.cpp diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index ccae79636f32..0b80ee613077 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -7401,7 +7401,7 @@ Sema::ActOnCastExpr(Scope *S, SourceLocation LParenLoc, } if (PE || PLE->getNumExprs() == 1) { Expr *E = (PE ? PE->getSubExpr() : PLE->getExpr(0)); - if (!E->getType()->isVectorType()) + if (!E->isTypeDependent() && !E->getType()->isVectorType()) isVectorLiteral = true; } else diff --git a/clang/test/SemaTemplate/pr47676.cpp b/clang/test/SemaTemplate/pr47676.cpp new file mode 100644 index 000000000000..428607097c96 --- /dev/null +++ b/clang/test/SemaTemplate/pr47676.cpp @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -triple=powerpc64le-unknown-linux-gnu \ +// RUN: -target-feature +altivec -fsyntax-only -ast-dump \ +// RUN: -xc++ < %s 2>&1 \ +// RUN: | FileCheck %s + +// Ensures that casts to AltiVec type with a dependent expression operand does +// not hit the assertion failure reported in PR47676. Further checks that casts +// to AltiVec type with a dependent expression operand is, on instantiation, +// able to correctly differentiate between a splat case and a bitcast case. 
+template void f(T *tp) { + extern void g(int, ...); + g(0, (__vector int)(*tp)); + g(0, (__vector int)*tp); +} + +void g(void) { + f<__vector float>(nullptr); +// CHECK: | |-FunctionDecl {{.*}} f 'void (__vector float *)' + +// CHECK: | | `-CStyleCastExpr {{.*}} '__vector int' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} '__vector int' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}}'__vector float' + +// CHECK: | `-CStyleCastExpr {{.*}} '__vector int' +// CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '__vector int' +// CHECK-NEXT: | `-ImplicitCastExpr {{.*}}'__vector float' + + f(nullptr); +// CHECK: | `-FunctionDecl {{.*}} f 'void (double *)' + +// CHECK: | | `-CStyleCastExpr {{.*}} '__vector int' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | | `-ImplicitCastExpr {{.*}}'double' + +// CHECK: | `-CStyleCastExpr {{.*}} '__vector int' +// CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | `-ImplicitCastExpr {{.*}}:'double' +} From 3c687677678c382e8d13d6583c3f8cdf3fd301dd Mon Sep 17 00:00:00 2001 From: Geoff Levner Date: Thu, 8 Oct 2020 10:59:30 -0400 Subject: [PATCH 291/363] DeferredDiagnosticsEmitter crashes Patch VisitCXXDeleteExpr() in clang::UsedDeclVisitor to avoid it crashing when the expression's destroyed type is null. According to the comments in CXXDeleteExpr::getDestroyedType(), this can happen when the type to delete is a dependent type. Patch by Geoff Levner. 
Differential Revision: https://reviews.llvm.org/D88949 (cherry picked from commit b9225543e844bee5091aa16108e0c54bd2abe485) --- clang/lib/Sema/UsedDeclVisitor.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/UsedDeclVisitor.h b/clang/lib/Sema/UsedDeclVisitor.h index d207e07f451a..c33d30478e2a 100644 --- a/clang/lib/Sema/UsedDeclVisitor.h +++ b/clang/lib/Sema/UsedDeclVisitor.h @@ -67,10 +67,13 @@ class UsedDeclVisitor : public EvaluatedExprVisitor { void VisitCXXDeleteExpr(CXXDeleteExpr *E) { if (E->getOperatorDelete()) asImpl().visitUsedDecl(E->getBeginLoc(), E->getOperatorDelete()); - QualType Destroyed = S.Context.getBaseElementType(E->getDestroyedType()); - if (const RecordType *DestroyedRec = Destroyed->getAs()) { - CXXRecordDecl *Record = cast(DestroyedRec->getDecl()); - asImpl().visitUsedDecl(E->getBeginLoc(), S.LookupDestructor(Record)); + QualType DestroyedOrNull = E->getDestroyedType(); + if (!DestroyedOrNull.isNull()) { + QualType Destroyed = S.Context.getBaseElementType(DestroyedOrNull); + if (const RecordType *DestroyedRec = Destroyed->getAs()) { + CXXRecordDecl *Record = cast(DestroyedRec->getDecl()); + asImpl().visitUsedDecl(E->getBeginLoc(), S.LookupDestructor(Record)); + } } Inherited::VisitCXXDeleteExpr(E); From ef4ffcafbb2deeb30ccc30ebcdf9a5a843a27ec1 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 29 Sep 2020 22:29:26 -0700 Subject: [PATCH 292/363] [DAE] MarkLive in MarkValue(MaybeLive) if any use is live While looping through all args or all return values, we may mark a use of a later iteration as live. Previously when we got to that later value it would ignore that and continue adding to Uses instead of marking it live. For example, when looping through arg#0 and arg#1, MarkValue(arg#0, Live) may cause some use of arg#1 to be live, but MarkValue(arg#1, MaybeLive) will not notice that and continue adding into Uses. Now MarkValue(RA, MaybeLive) will MarkLive(RA) if any use is live. 
Fixes PR47444. Reviewed By: rnk Differential Revision: https://reviews.llvm.org/D88529 (cherry picked from commit 7468afe9ca135228f4c5a48f1b061ca57786fad6) --- .../Transforms/IPO/DeadArgumentElimination.h | 1 + .../IPO/DeadArgumentElimination.cpp | 29 +++++++++++------ .../DeadArgElim/preserve-used-ret.ll | 32 +++++++++++++++++++ 3 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 llvm/test/Transforms/DeadArgElim/preserve-used-ret.ll diff --git a/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h b/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h index 73797bc10017..496ceea12bc9 100644 --- a/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h +++ b/llvm/include/llvm/Transforms/IPO/DeadArgumentElimination.h @@ -128,6 +128,7 @@ class DeadArgumentEliminationPass Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); void SurveyFunction(const Function &F); + bool IsLive(const RetOrArg &RA); void MarkValue(const RetOrArg &RA, Liveness L, const UseVector &MaybeLiveUses); void MarkLive(const RetOrArg &RA); diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 54c51b6e7161..f2588938d964 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -357,7 +357,7 @@ DeadArgumentEliminationPass::Liveness DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) { // We're live if our use or its Function is already marked as live. - if (LiveFunctions.count(Use.F) || LiveValues.count(Use)) + if (IsLive(Use)) return Live; // We're maybe live otherwise, but remember that we must become live if @@ -657,10 +657,18 @@ void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L, MarkLive(RA); break; case MaybeLive: - // Note any uses of this value, so this return value can be - // marked live whenever one of the uses becomes live. 
- for (const auto &MaybeLiveUse : MaybeLiveUses) - Uses.insert(std::make_pair(MaybeLiveUse, RA)); + assert(!IsLive(RA) && "Use is already live!"); + for (const auto &MaybeLiveUse : MaybeLiveUses) { + if (IsLive(MaybeLiveUse)) { + // A use is live, so this value is live. + MarkLive(RA); + break; + } else { + // Note any uses of this value, so this value can be + // marked live whenever one of the uses becomes live. + Uses.insert(std::make_pair(MaybeLiveUse, RA)); + } + } break; } } @@ -686,17 +694,20 @@ void DeadArgumentEliminationPass::MarkLive(const Function &F) { /// mark any values that are used by this value (according to Uses) live as /// well. void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) { - if (LiveFunctions.count(RA.F)) - return; // Function was already marked Live. + if (IsLive(RA)) + return; // Already marked Live. - if (!LiveValues.insert(RA).second) - return; // We were already marked Live. + LiveValues.insert(RA); LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking " << RA.getDescription() << " live\n"); PropagateLiveness(RA); } +bool DeadArgumentEliminationPass::IsLive(const RetOrArg &RA) { + return LiveFunctions.count(RA.F) || LiveValues.count(RA); +} + /// PropagateLiveness - Given that RA is a live value, propagate it's liveness /// to any other values it uses (according to Uses). 
void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) { diff --git a/llvm/test/Transforms/DeadArgElim/preserve-used-ret.ll b/llvm/test/Transforms/DeadArgElim/preserve-used-ret.ll new file mode 100644 index 000000000000..f0c2649fdb39 --- /dev/null +++ b/llvm/test/Transforms/DeadArgElim/preserve-used-ret.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -deadargelim %s | FileCheck %s + +define internal { i64, i64 } @f(i64 %a, i64 %b) { +start: + %0 = insertvalue { i64, i64 } undef, i64 %a, 0 + %1 = insertvalue { i64, i64 } %0, i64 %b, 1 + ret { i64, i64 } %1 +} + +; Check that we don't delete either of g's return values + +; CHECK-LABEL: define internal { i64, i64 } @g(i64 %a, i64 %b) +define internal { i64, i64 } @g(i64 %a, i64 %b) { +start: + %0 = call { i64, i64 } @f(i64 %a, i64 %b) + ret { i64, i64 } %0 +} + +declare dso_local i32 @test(i64, i64) + +define i32 @main(i32 %argc, i8** %argv) { +start: + %x = call { i64, i64 } @g(i64 13, i64 42) + %x.0 = extractvalue { i64, i64 } %x, 0 + %x.1 = extractvalue { i64, i64 } %x, 1 + %z = bitcast i64 %x.0 to i64 + %y = call { i64, i64 } @f(i64 %x.0, i64 %x.1) + %y.1 = extractvalue { i64, i64 } %y, 1 + %0 = call i32 @test(i64 %x.0, i64 %y.1) + ret i32 %0 +} + From 85ce339f1bd4af075aeb08f59a5a1da00993ce40 Mon Sep 17 00:00:00 2001 From: Kristina Bessonova Date: Mon, 7 Sep 2020 10:03:32 +0200 Subject: [PATCH 293/363] [cmake] Fix build of attribute plugin example on Windows Seems '${cmake_2_8_12_PRIVATE}' was removed a long time ago, so it should be just PRIVATE keyword here. 
Reviewed By: john.brawn Differential Revision: https://reviews.llvm.org/D86091 (cherry picked from commit 04ea680a8ccc4f9a4d7333cd712333960348c35b) --- clang/examples/Attribute/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/examples/Attribute/CMakeLists.txt b/clang/examples/Attribute/CMakeLists.txt index ed02f5e5992f..42f04f5039bc 100644 --- a/clang/examples/Attribute/CMakeLists.txt +++ b/clang/examples/Attribute/CMakeLists.txt @@ -1,7 +1,7 @@ add_llvm_library(Attribute MODULE Attribute.cpp PLUGIN_TOOL clang) if(LLVM_ENABLE_PLUGINS AND (WIN32 OR CYGWIN)) - target_link_libraries(Attribute ${cmake_2_8_12_PRIVATE} + target_link_libraries(Attribute PRIVATE clangAST clangBasic clangFrontend From 02004c9e7c9668465585a35a33c6580cc4e3056f Mon Sep 17 00:00:00 2001 From: Adam Balogh Date: Thu, 15 Oct 2020 15:07:48 +0200 Subject: [PATCH 294/363] [ADT] Fix for ImmutableMapRef The `Root` member of `ImmutableMapRef` was changed recently from a plain pointer to `IntrusiveRefCntPtr`. However, the `Profile` member function was not adjusted. This results in comilation error whenever the `Profile` method is used on an `ImmutableMapRef`. This patch fixes this issue and also adds unit tests for `ImmutableMapRef`. Differential Revision: https://reviews.llvm.org/D89486 (cherry picked from commit 184eb4fa4f1cc871692fa390261df8c25ddcc7ec) --- llvm/include/llvm/ADT/ImmutableMap.h | 2 +- llvm/unittests/ADT/ImmutableMapTest.cpp | 41 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/ADT/ImmutableMap.h b/llvm/include/llvm/ADT/ImmutableMap.h index 30689d2274a8..81b21a7319a7 100644 --- a/llvm/include/llvm/ADT/ImmutableMap.h +++ b/llvm/include/llvm/ADT/ImmutableMap.h @@ -355,7 +355,7 @@ class ImmutableMapRef { unsigned getHeight() const { return Root ? 
Root->getHeight() : 0; } static inline void Profile(FoldingSetNodeID &ID, const ImmutableMapRef &M) { - ID.AddPointer(M.Root); + ID.AddPointer(M.Root.get()); } inline void Profile(FoldingSetNodeID &ID) const { return Profile(ID, *this); } diff --git a/llvm/unittests/ADT/ImmutableMapTest.cpp b/llvm/unittests/ADT/ImmutableMapTest.cpp index fa61816d213c..1217718826f7 100644 --- a/llvm/unittests/ADT/ImmutableMapTest.cpp +++ b/llvm/unittests/ADT/ImmutableMapTest.cpp @@ -46,4 +46,45 @@ TEST(ImmutableMapTest, MultiElemIntMapTest) { EXPECT_EQ(3U, S2.getHeight()); } +TEST(ImmutableMapTest, EmptyIntMapRefTest) { + using int_int_map = ImmutableMapRef; + ImmutableMapRef::FactoryTy f; + + EXPECT_TRUE(int_int_map::getEmptyMap(&f) == int_int_map::getEmptyMap(&f)); + EXPECT_FALSE(int_int_map::getEmptyMap(&f) != int_int_map::getEmptyMap(&f)); + EXPECT_TRUE(int_int_map::getEmptyMap(&f).isEmpty()); + + int_int_map S = int_int_map::getEmptyMap(&f); + EXPECT_EQ(0u, S.getHeight()); + EXPECT_TRUE(S.begin() == S.end()); + EXPECT_FALSE(S.begin() != S.end()); +} + +TEST(ImmutableMapTest, MultiElemIntMapRefTest) { + ImmutableMapRef::FactoryTy f; + + ImmutableMapRef S = ImmutableMapRef::getEmptyMap(&f); + + ImmutableMapRef S2 = S.add(3, 10).add(4, 11).add(5, 12); + + EXPECT_TRUE(S.isEmpty()); + EXPECT_FALSE(S2.isEmpty()); + + EXPECT_EQ(nullptr, S.lookup(3)); + EXPECT_EQ(nullptr, S.lookup(9)); + + EXPECT_EQ(10, *S2.lookup(3)); + EXPECT_EQ(11, *S2.lookup(4)); + EXPECT_EQ(12, *S2.lookup(5)); + + EXPECT_EQ(5, S2.getMaxElement()->first); + EXPECT_EQ(3U, S2.getHeight()); +} + + TEST(ImmutableMapTest, MapOfMapRefsTest) { + ImmutableMap>::Factory f; + + EXPECT_TRUE(f.getEmptyMap() == f.getEmptyMap()); + } + } From 0874e7ef66cc82a795fb19e8662e09a10eabaa01 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 23 Sep 2020 15:24:52 -0400 Subject: [PATCH 295/363] Allow init_priority values <= 100 and > 65535 within system headers. 
This also adds some bare-bones documentation for the attribute rather than leaving it undocumented. (cherry picked from commit af1d3e655991e5f0c86df372b8583a60d20268e0) --- clang/include/clang/Basic/Attr.td | 2 +- clang/include/clang/Basic/AttrDocs.td | 26 +++++++++++++++++++++++ clang/lib/Sema/SemaDeclAttr.cpp | 6 +++++- clang/test/SemaCXX/init-priority-attr.cpp | 18 +++++++++++++--- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index bc4a380545af..19eccf7ceadf 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2115,7 +2115,7 @@ def InitPriority : InheritableAttr { let Spellings = [GCC<"init_priority", /*AllowInC*/0>]; let Args = [UnsignedArgument<"Priority">]; let Subjects = SubjectList<[Var], ErrorDiag>; - let Documentation = [Undocumented]; + let Documentation = [InitPriorityDocs]; } def Section : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 3cba3a3d96f9..833127ed44eb 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -57,6 +57,32 @@ global variable or function should be in after translation. let Heading = "section, __declspec(allocate)"; } +def InitPriorityDocs : Documentation { + let Category = DocCatVariable; + let Content = [{ +In C++, the order in which global variables are initialized across translation +units is unspecified, unlike the ordering within a single translation unit. The +``init_priority`` attribute allows you to specify a relative ordering for the +initialization of objects declared at namespace scope in C++. The priority is +given as an integer constant expression between 101 and 65535 (inclusive). +Priorities outside of that range are reserved for use by the implementation. A +lower value indicates a higher priority of initialization. Note that only the +relative ordering of values is important. 
For example: + +.. code-block:: c++ + + struct SomeType { SomeType(); }; + __attribute__((init_priority(200))) SomeType Obj1; + __attribute__((init_priority(101))) SomeType Obj2; + +``Obj1`` will be initialized *before* ``Obj2`` despite the usual order of +initialization being the opposite. + +This attribute is only supported for C++ and Objective-C++ and is ignored in +other language modes. + }]; +} + def InitSegDocs : Documentation { let Category = DocCatVariable; let Content = [{ diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 1a0594512a60..a9a2a19b4797 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3301,7 +3301,11 @@ static void handleInitPriorityAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - if (prioritynum < 101 || prioritynum > 65535) { + // Only perform the priority check if the attribute is outside of a system + // header. Values <= 100 are reserved for the implementation, and libc++ + // benefits from being able to specify values in that range. 
+ if ((prioritynum < 101 || prioritynum > 65535) && + !S.getSourceManager().isInSystemHeader(AL.getLoc())) { S.Diag(AL.getLoc(), diag::err_attribute_argument_out_of_range) << E->getSourceRange() << AL << 101 << 65535; AL.setInvalid(); diff --git a/clang/test/SemaCXX/init-priority-attr.cpp b/clang/test/SemaCXX/init-priority-attr.cpp index 8f31e2fd62d0..5b5e3b9eb940 100644 --- a/clang/test/SemaCXX/init-priority-attr.cpp +++ b/clang/test/SemaCXX/init-priority-attr.cpp @@ -1,4 +1,9 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -DSYSTEM -verify %s + +#if defined(SYSTEM) +#5 "init-priority-attr.cpp" 3 // system header +#endif class Two { private: @@ -21,7 +26,15 @@ Two foo __attribute__((init_priority(101))) ( 5, 6 ); Two goo __attribute__((init_priority(2,3))) ( 5, 6 ); // expected-error {{'init_priority' attribute takes one argument}} -Two coo[2] __attribute__((init_priority(3))); // expected-error {{'init_priority' attribute requires integer constant between 101 and 65535 inclusive}} +Two coo[2] __attribute__((init_priority(100))); +#if !defined(SYSTEM) +// expected-error@-2 {{'init_priority' attribute requires integer constant between 101 and 65535 inclusive}} +#endif + +Two boo[2] __attribute__((init_priority(65536))); +#if !defined(SYSTEM) +// expected-error@-2 {{'init_priority' attribute requires integer constant between 101 and 65535 inclusive}} +#endif Two koo[4] __attribute__((init_priority(1.13))); // expected-error {{'init_priority' attribute requires an integer constant}} @@ -30,6 +43,5 @@ Two func() __attribute__((init_priority(1001))); // expected-error {{'init_prio int i __attribute__((init_priority(1001))); // expected-error {{can only use 'init_priority' attribute on file-scope definitions of objects of class type}} int main() { - Two foo __attribute__((init_priority(1001))); // expected-error {{can only use 'init_priority' attribute on file-scope definitions of objects of class type}} + Two foo 
__attribute__((init_priority(1001))); // expected-error {{can only use 'init_priority' attribute on file-scope definitions of objects of class type}} } - From d50044e809d2c15c56df0ea808f047a2c81d7344 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Mon, 19 Oct 2020 16:41:51 -0700 Subject: [PATCH 296/363] [CUDA] Improve clang's ability to detect recent CUDA versions. CUDA-11.1 does not carry version.txt which causes clang to assume that it's CUDA-7.0, which used to be the only CUDA version w/o version.txt. In order to tell CUDA-7.0 apart from the new versions, clang now probes for the presence of libdevice.10.bc which is not present in the old CUDA versions. This should keep Clang working for CUDA-11.1. PR47332: https://bugs.llvm.org/show_bug.cgi?id=47332 Differential Revision: https://reviews.llvm.org/D89752 (cherry picked from commit 65d206484c54177641d4b11d42cab1f1acc8c0c7) --- clang/lib/Driver/ToolChains/Cuda.cpp | 11 ++++++++--- .../Driver/Inputs/CUDA_111/usr/local/cuda/bin/.keep | 0 .../Inputs/CUDA_111/usr/local/cuda/include/.keep | 0 .../Driver/Inputs/CUDA_111/usr/local/cuda/lib/.keep | 0 .../Driver/Inputs/CUDA_111/usr/local/cuda/lib64/.keep | 0 .../usr/local/cuda/nvvm/libdevice/libdevice.10.bc | 0 clang/test/Driver/cuda-version-check.cu | 7 ++++++- 7 files changed, 14 insertions(+), 4 deletions(-) create mode 100644 clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/bin/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/lib/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/lib64/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/nvvm/libdevice/libdevice.10.bc diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 110a0bca9bc1..cfd9dae0fa91 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -155,9 +155,14 @@ 
CudaInstallationDetector::CudaInstallationDetector( llvm::ErrorOr> VersionFile = FS.getBufferForFile(InstallPath + "/version.txt"); if (!VersionFile) { - // CUDA 7.0 doesn't have a version.txt, so guess that's our version if - // version.txt isn't present. - Version = CudaVersion::CUDA_70; + // CUDA 7.0 and CUDA 11.1+ do not have version.txt file. + // Use libdevice file to distinguish 7.0 from the new versions. + if (FS.exists(LibDevicePath + "/libdevice.10.bc")) { + Version = CudaVersion::LATEST; + DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED; + } else { + Version = CudaVersion::CUDA_70; + } } else { ParseCudaVersionFile((*VersionFile)->getBuffer()); } diff --git a/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/bin/.keep b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/bin/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/.keep b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/lib/.keep b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/lib/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/lib64/.keep b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/lib64/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/nvvm/libdevice/libdevice.10.bc b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/nvvm/libdevice/libdevice.10.bc new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/cuda-version-check.cu b/clang/test/Driver/cuda-version-check.cu index a09b248304f2..1e6af029202f 100644 --- a/clang/test/Driver/cuda-version-check.cu +++ b/clang/test/Driver/cuda-version-check.cu @@ -10,6 +10,11 @@ // RUN: FileCheck %s --check-prefix=OK // RUN: %clang 
--target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA-unknown/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION +// CUDA versions after 11.0 (update 1) do not carry version.txt file. Make sure +// we still detect them as a new version and handle them the same as we handle +// other new CUDA versions. +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda 2>&1 %s | \ +// RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION // Make sure that we don't warn about CUDA version during C++ compilation. // RUN: %clang --target=x86_64-linux -v -### -x c++ --cuda-gpu-arch=sm_60 \ // RUN: --cuda-path=%S/Inputs/CUDA-unknown/usr/local/cuda 2>&1 %s | \ @@ -65,5 +70,5 @@ // ERR_SM61: error: GPU arch sm_61 {{.*}} // ERR_SM61-NOT: error: GPU arch sm_61 -// UNKNOWN_VERSION: Unknown CUDA version 999.999. Assuming the latest supported version +// UNKNOWN_VERSION: Unknown CUDA version {{.*}}. Assuming the latest supported version // UNKNOWN_VERSION_CXX-NOT: Unknown CUDA version From 06f479cba3a09ef47326ea69e719d2aa1c0fba4c Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Tue, 20 Oct 2020 15:11:38 -0700 Subject: [PATCH 297/363] [CUDA] Extract CUDA version from cuda.h if version.txt is not found If CUDA version can not be determined based on version.txt file, attempt to find CUDA_VERSION macro in cuda.h. 
This is a follow-up to D89752, Differntial Revision: https://reviews.llvm.org/D89832 (cherry picked from commit e7fe125b776bf08d95e60ff3354a5c836218a0e6) --- .../clang/Basic/DiagnosticDriverKinds.td | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 118 +++++++++++++----- clang/lib/Driver/ToolChains/Cuda.h | 3 - .../Inputs/CUDA_102/usr/local/cuda/bin/.keep | 0 .../CUDA_102/usr/local/cuda/include/.keep | 0 .../Inputs/CUDA_102/usr/local/cuda/lib/.keep | 0 .../CUDA_102/usr/local/cuda/lib64/.keep | 0 .../local/cuda/nvvm/libdevice/libdevice.10.bc | 0 .../CUDA_102/usr/local/cuda/version.txt | 1 + .../CUDA_111/usr/local/cuda/include/cuda.h | 7 ++ clang/test/Driver/cuda-version-check.cu | 14 ++- 11 files changed, 108 insertions(+), 37 deletions(-) create mode 100644 clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/bin/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/include/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/lib/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/lib64/.keep create mode 100644 clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/nvvm/libdevice/libdevice.10.bc create mode 100644 clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/version.txt create mode 100644 clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/cuda.h diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 558639ecad6a..acdad15cdf6c 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -69,7 +69,7 @@ def err_drv_cuda_version_unsupported : Error< "install, pass a different GPU arch with --cuda-gpu-arch, or pass " "--no-cuda-version-check.">; def warn_drv_unknown_cuda_version: Warning< - "Unknown CUDA version %0. Assuming the latest supported version %1">, + "Unknown CUDA version. 
%0 Assuming the latest supported version %1">, InGroup; def err_drv_cuda_host_arch : Error<"unsupported architecture '%0' for host compilation.">; def err_drv_mix_cuda_hip : Error<"Mixed Cuda and HIP compilation is not supported.">; diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index cfd9dae0fa91..ffc606dd554b 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -16,6 +16,7 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" +#include "llvm/ADT/Optional.h" #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" @@ -32,29 +33,80 @@ using namespace clang::driver::tools; using namespace clang; using namespace llvm::opt; +namespace { +struct CudaVersionInfo { + std::string DetectedVersion; + CudaVersion Version; +}; // Parses the contents of version.txt in an CUDA installation. It should // contain one line of the from e.g. "CUDA Version 7.5.2". -void CudaInstallationDetector::ParseCudaVersionFile(llvm::StringRef V) { - Version = CudaVersion::UNKNOWN; +CudaVersionInfo parseCudaVersionFile(llvm::StringRef V) { + V = V.trim(); if (!V.startswith("CUDA Version ")) - return; + return {V.str(), CudaVersion::UNKNOWN}; V = V.substr(strlen("CUDA Version ")); SmallVector VersionParts; V.split(VersionParts, '.'); - if (VersionParts.size() < 2) - return; - DetectedVersion = join_items(".", VersionParts[0], VersionParts[1]); - Version = CudaStringToVersion(DetectedVersion); - if (Version != CudaVersion::UNKNOWN) { - // TODO(tra): remove the warning once we have all features of 10.2 and 11.0 - // implemented. - DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED; - return; - } + return {"version.txt: " + V.str() + ".", + VersionParts.size() < 2 + ? 
CudaVersion::UNKNOWN + : CudaStringToVersion( + join_items(".", VersionParts[0], VersionParts[1]))}; +} + +CudaVersion getCudaVersion(uint32_t raw_version) { + if (raw_version < 7050) + return CudaVersion::CUDA_70; + if (raw_version < 8000) + return CudaVersion::CUDA_75; + if (raw_version < 9000) + return CudaVersion::CUDA_80; + if (raw_version < 9010) + return CudaVersion::CUDA_90; + if (raw_version < 9020) + return CudaVersion::CUDA_91; + if (raw_version < 10000) + return CudaVersion::CUDA_92; + if (raw_version < 10010) + return CudaVersion::CUDA_100; + if (raw_version < 10020) + return CudaVersion::CUDA_101; + if (raw_version < 11000) + return CudaVersion::CUDA_102; + if (raw_version < 11010) + return CudaVersion::CUDA_110; + return CudaVersion::LATEST; +} - Version = CudaVersion::LATEST_SUPPORTED; - DetectedVersionIsNotSupported = true; +CudaVersionInfo parseCudaHFile(llvm::StringRef Input) { + // Helper lambda which skips the words if the line starts with them or returns + // None otherwise. + auto StartsWithWords = + [](llvm::StringRef Line, + const SmallVector words) -> llvm::Optional { + for (StringRef word : words) { + if (!Line.consume_front(word)) + return {}; + Line = Line.ltrim(); + } + return Line; + }; + + Input = Input.ltrim(); + while (!Input.empty()) { + if (auto Line = + StartsWithWords(Input.ltrim(), {"#", "define", "CUDA_VERSION"})) { + uint32_t RawVersion; + Line->consumeInteger(10, RawVersion); + return {"cuda.h: CUDA_VERSION=" + Twine(RawVersion).str() + ".", + getCudaVersion(RawVersion)}; + } + // Find next non-empty line. 
+ Input = Input.drop_front(Input.find_first_of("\n\r")).ltrim(); + } + return {"cuda.h: CUDA_VERSION not found.", CudaVersion::UNKNOWN}; } +} // namespace void CudaInstallationDetector::WarnIfUnsupportedVersion() { if (DetectedVersionIsNotSupported) @@ -152,21 +204,31 @@ CudaInstallationDetector::CudaInstallationDetector( else continue; - llvm::ErrorOr> VersionFile = - FS.getBufferForFile(InstallPath + "/version.txt"); - if (!VersionFile) { - // CUDA 7.0 and CUDA 11.1+ do not have version.txt file. - // Use libdevice file to distinguish 7.0 from the new versions. - if (FS.exists(LibDevicePath + "/libdevice.10.bc")) { - Version = CudaVersion::LATEST; - DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED; - } else { - Version = CudaVersion::CUDA_70; - } - } else { - ParseCudaVersionFile((*VersionFile)->getBuffer()); + CudaVersionInfo VersionInfo = {"", CudaVersion::UNKNOWN}; + if (auto VersionFile = FS.getBufferForFile(InstallPath + "/version.txt")) + VersionInfo = parseCudaVersionFile((*VersionFile)->getBuffer()); + // If version file didn't give us the version, try to find it in cuda.h + if (VersionInfo.Version == CudaVersion::UNKNOWN) + if (auto CudaHFile = FS.getBufferForFile(InstallPath + "/include/cuda.h")) + VersionInfo = parseCudaHFile((*CudaHFile)->getBuffer()); + // As the last resort, make an educated guess between CUDA-7.0, (which had + // no version.txt file and had old-style libdevice bitcode ) and an unknown + // recent CUDA version (no version.txt, new style bitcode). + if (VersionInfo.Version == CudaVersion::UNKNOWN) { + VersionInfo.Version = (FS.exists(LibDevicePath + "/libdevice.10.bc")) + ? Version = CudaVersion::LATEST + : Version = CudaVersion::CUDA_70; + VersionInfo.DetectedVersion = + "No version found in version.txt or cuda.h."; } + Version = VersionInfo.Version; + DetectedVersion = VersionInfo.DetectedVersion; + + // TODO(tra): remove the warning once we have all features of 10.2 + // and 11.0 implemented. 
+ DetectedVersionIsNotSupported = Version > CudaVersion::LATEST_SUPPORTED; + if (Version >= CudaVersion::CUDA_90) { // CUDA-9+ uses single libdevice file for all GPU variants. std::string FilePath = LibDevicePath + "/libdevice.10.bc"; diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h index 873eb7338a30..bbf272c468a5 100644 --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -78,9 +78,6 @@ class CudaInstallationDetector { return LibDeviceMap.lookup(Gpu); } void WarnIfUnsupportedVersion(); - -private: - void ParseCudaVersionFile(llvm::StringRef V); }; namespace tools { diff --git a/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/bin/.keep b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/bin/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/include/.keep b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/include/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/lib/.keep b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/lib/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/lib64/.keep b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/lib64/.keep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/nvvm/libdevice/libdevice.10.bc b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/nvvm/libdevice/libdevice.10.bc new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/version.txt b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/version.txt new file mode 100644 index 000000000000..cd34d385ddf5 --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA_102/usr/local/cuda/version.txt @@ -0,0 +1 @@ +CUDA Version 10.2.333 diff --git 
a/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/cuda.h b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/cuda.h new file mode 100644 index 000000000000..6ce5b747561d --- /dev/null +++ b/clang/test/Driver/Inputs/CUDA_111/usr/local/cuda/include/cuda.h @@ -0,0 +1,7 @@ +// +// Placeholder file for testing CUDA version detection +// + +#define CUDA_VERSION 11010 + +// diff --git a/clang/test/Driver/cuda-version-check.cu b/clang/test/Driver/cuda-version-check.cu index 1e6af029202f..bc04794375a9 100644 --- a/clang/test/Driver/cuda-version-check.cu +++ b/clang/test/Driver/cuda-version-check.cu @@ -8,13 +8,15 @@ // RUN: FileCheck %s --check-prefix=OK // RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=OK +// Test version guess when no version.txt or cuda.h are found // RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA-unknown/usr/local/cuda 2>&1 %s | \ // RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION -// CUDA versions after 11.0 (update 1) do not carry version.txt file. Make sure -// we still detect them as a new version and handle them the same as we handle -// other new CUDA versions. +// Unknown version with version.txt present +// RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda 2>&1 %s | \ +// RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION_V +// Unknown version with no version.txt but with version info present in cuda.h // RUN: %clang --target=x86_64-linux -v -### --cuda-gpu-arch=sm_60 --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda 2>&1 %s | \ -// RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION +// RUN: FileCheck %s --check-prefix=UNKNOWN_VERSION_H // Make sure that we don't warn about CUDA version during C++ compilation. 
// RUN: %clang --target=x86_64-linux -v -### -x c++ --cuda-gpu-arch=sm_60 \ // RUN: --cuda-path=%S/Inputs/CUDA-unknown/usr/local/cuda 2>&1 %s | \ @@ -70,5 +72,7 @@ // ERR_SM61: error: GPU arch sm_61 {{.*}} // ERR_SM61-NOT: error: GPU arch sm_61 -// UNKNOWN_VERSION: Unknown CUDA version {{.*}}. Assuming the latest supported version +// UNKNOWN_VERSION_V: Unknown CUDA version. version.txt:{{.*}}. Assuming the latest supported version +// UNKNOWN_VERSION_H: Unknown CUDA version. cuda.h: CUDA_VERSION={{.*}}. Assuming the latest supported version +// UNKNOWN_VERSION: Unknown CUDA version. No version found in version.txt or cuda.h. Assuming the latest supported version +// UNKNOWN_VERSION_CXX-NOT: Unknown CUDA version From 973b95e0a8450e701a106896b5fb9aeda46f9071 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Sat, 31 Oct 2020 11:21:05 +0000 Subject: [PATCH 298/363] [MCA][LSUnit] Correctly update the internal group flags on store barrier execution. Fixes PR48024. This is likely to be a regression introduced by my last refactoring of the LSUnit (commit 5578ec32f9c4f). Before this patch, the "CurrentStoreBarrierGroupID" index was not correctly reset on store barrier executions. This was leading to unexpected crashes like the one reported as PR48024. 
(cherry picked from commit 0e20666db3ac280affe82d31b6c144923704e9c4) --- llvm/lib/MCA/HardwareUnits/LSUnit.cpp | 2 + .../llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s | 104 ++++++++++++++++++ .../llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s | 100 +++++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s create mode 100644 llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp index e945e8cecce9..4594368fc0e9 100644 --- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp +++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp @@ -243,6 +243,8 @@ void LSUnit::onInstructionExecuted(const InstRef &IR) { CurrentStoreGroupID = 0; if (GroupID == CurrentLoadBarrierGroupID) CurrentLoadBarrierGroupID = 0; + if (GroupID == CurrentStoreBarrierGroupID) + CurrentStoreBarrierGroupID = 0; } } diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s b/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s new file mode 100644 index 000000000000..52bf97732d95 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/stmxcsr-ldmxcsr.s @@ -0,0 +1,104 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s + +# Code snippet taken from PR48024. 
+ +stmxcsr -4(%rsp) +movl $-24577, %eax # imm = 0x9FFF +andl -4(%rsp), %eax +movl %eax, -8(%rsp) +ldmxcsr -8(%rsp) +retq + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 704 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 0.85 +# CHECK-NEXT: IPC: 0.85 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 1.00 * U stmxcsr -4(%rsp) +# CHECK-NEXT: 1 1 0.50 movl $-24577, %eax +# CHECK-NEXT: 1 4 1.00 * andl -4(%rsp), %eax +# CHECK-NEXT: 1 1 1.00 * movl %eax, -8(%rsp) +# CHECK-NEXT: 1 3 1.00 * U ldmxcsr -8(%rsp) +# CHECK-NEXT: 1 4 1.00 U retq + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 1.50 1.50 - - - - - 3.00 - 2.00 - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - stmxcsr -4(%rsp) +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - movl $-24577, %eax +# CHECK-NEXT: 0.50 0.50 - - - - - 1.00 - - - - - - andl -4(%rsp), %eax +# CHECK-NEXT: - - - - - - - - - 1.00 - - - - movl %eax, -8(%rsp) +# CHECK-NEXT: - - - - - - - 1.00 - - - - - - ldmxcsr -8(%rsp) +# CHECK-NEXT: 0.50 0.50 - - - - - 1.00 - - - - - - retq + +# 
CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeER . . . . . stmxcsr -4(%rsp) +# CHECK-NEXT: [0,1] DeER . . . . . movl $-24577, %eax +# CHECK-NEXT: [0,2] .DeeeeER . . . . andl -4(%rsp), %eax +# CHECK-NEXT: [0,3] .D====eER . . . . movl %eax, -8(%rsp) +# CHECK-NEXT: [0,4] . D===eeeER . . . ldmxcsr -8(%rsp) +# CHECK-NEXT: [0,5] . DeeeeE--R . . . retq +# CHECK-NEXT: [1,0] . D===eE--R . . . stmxcsr -4(%rsp) +# CHECK-NEXT: [1,1] . DeE-----R . . . movl $-24577, %eax +# CHECK-NEXT: [1,2] . D====eeeeER. . . andl -4(%rsp), %eax +# CHECK-NEXT: [1,3] . D========eER . . movl %eax, -8(%rsp) +# CHECK-NEXT: [1,4] . D=======eeeER . . ldmxcsr -8(%rsp) +# CHECK-NEXT: [1,5] . D=eeeeE-----R . . retq +# CHECK-NEXT: [2,0] . .D=======eE--R . . stmxcsr -4(%rsp) +# CHECK-NEXT: [2,1] . .DeE---------R . . movl $-24577, %eax +# CHECK-NEXT: [2,2] . . D========eeeeER . andl -4(%rsp), %eax +# CHECK-NEXT: [2,3] . . D============eER . movl %eax, -8(%rsp) +# CHECK-NEXT: [2,4] . . D===========eeeER ldmxcsr -8(%rsp) +# CHECK-NEXT: [2,5] . . D=eeeeE---------R retq + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 4.3 1.0 1.3 stmxcsr -4(%rsp) +# CHECK-NEXT: 1. 3 1.0 1.0 4.7 movl $-24577, %eax +# CHECK-NEXT: 2. 3 5.0 0.3 0.0 andl -4(%rsp), %eax +# CHECK-NEXT: 3. 3 9.0 0.0 0.0 movl %eax, -8(%rsp) +# CHECK-NEXT: 4. 3 8.0 0.0 0.0 ldmxcsr -8(%rsp) +# CHECK-NEXT: 5. 
3 1.7 1.7 5.3 retq +# CHECK-NEXT: 3 4.8 0.7 1.9 diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s b/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s new file mode 100644 index 000000000000..c976314f0fd2 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/Haswell/stmxcsr-ldmxcsr.s @@ -0,0 +1,100 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -timeline -timeline-max-iterations=3 < %s | FileCheck %s + +# Code snippet taken from PR48024. + +stmxcsr -4(%rsp) +movl $-24577, %eax # imm = 0x9FFF +andl -4(%rsp), %eax +movl %eax, -8(%rsp) +ldmxcsr -8(%rsp) +retq + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1304 +# CHECK-NEXT: Total uOps: 1300 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 1.00 +# CHECK-NEXT: IPC: 0.46 +# CHECK-NEXT: Block RThroughput: 3.3 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 3 2 1.00 * U stmxcsr -4(%rsp) +# CHECK-NEXT: 1 1 0.25 movl $-24577, %eax +# CHECK-NEXT: 2 6 0.50 * andl -4(%rsp), %eax +# CHECK-NEXT: 1 1 1.00 * movl %eax, -8(%rsp) +# CHECK-NEXT: 3 7 1.00 * U ldmxcsr -8(%rsp) +# CHECK-NEXT: 3 7 1.00 U retq + +# CHECK: Resources: +# CHECK-NEXT: [0] - HWDivider +# CHECK-NEXT: [1] - HWFPDivider +# CHECK-NEXT: [2] - HWPort0 +# CHECK-NEXT: [3] - HWPort1 +# CHECK-NEXT: [4] - HWPort2 +# CHECK-NEXT: [5] - HWPort3 +# CHECK-NEXT: [6] - HWPort4 +# CHECK-NEXT: [7] - HWPort5 +# CHECK-NEXT: [8] - HWPort6 +# CHECK-NEXT: [9] - HWPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.75 1.74 1.67 1.68 2.00 1.75 1.76 1.65 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] 
[2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - 0.30 - 1.00 1.00 - 0.70 stmxcsr -4(%rsp) +# CHECK-NEXT: - - 0.08 0.67 - - - 0.04 0.21 - movl $-24577, %eax +# CHECK-NEXT: - - 0.42 0.37 0.35 0.65 - 0.01 0.20 - andl -4(%rsp), %eax +# CHECK-NEXT: - - - - 0.05 - 1.00 - - 0.95 movl %eax, -8(%rsp) +# CHECK-NEXT: - - 1.00 0.23 0.34 0.66 - 0.42 0.35 - ldmxcsr -8(%rsp) +# CHECK-NEXT: - - 0.25 0.47 0.63 0.37 - 0.28 1.00 - retq + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 012 + +# CHECK: [0,0] DeeER. . . . . . . . . stmxcsr -4(%rsp) +# CHECK-NEXT: [0,1] DeE-R. . . . . . . . . movl $-24577, %eax +# CHECK-NEXT: [0,2] .DeeeeeeER. . . . . . . . andl -4(%rsp), %eax +# CHECK-NEXT: [0,3] .D======eER . . . . . . . movl %eax, -8(%rsp) +# CHECK-NEXT: [0,4] . D=====eeeeeeeER . . . . . . ldmxcsr -8(%rsp) +# CHECK-NEXT: [0,5] . DeeeeeeeE----R . . . . . . retq +# CHECK-NEXT: [1,0] . D====eeE----R . . . . . . stmxcsr -4(%rsp) +# CHECK-NEXT: [1,1] . DeE---------R . . . . . . movl $-24577, %eax +# CHECK-NEXT: [1,2] . D=========eeeeeeER . . . . . andl -4(%rsp), %eax +# CHECK-NEXT: [1,3] . D===============eER . . . . . movl %eax, -8(%rsp) +# CHECK-NEXT: [1,4] . .D==============eeeeeeeER. . . . ldmxcsr -8(%rsp) +# CHECK-NEXT: [1,5] . . DeeeeeeeE-------------R. . . . retq +# CHECK-NEXT: [2,0] . . D=============eeE----R. . . . stmxcsr -4(%rsp) +# CHECK-NEXT: [2,1] . . DeE------------------R. . . . movl $-24577, %eax +# CHECK-NEXT: [2,2] . . D==================eeeeeeER . . andl -4(%rsp), %eax +# CHECK-NEXT: [2,3] . . D========================eER . . movl %eax, -8(%rsp) +# CHECK-NEXT: [2,4] . . D=======================eeeeeeeER ldmxcsr -8(%rsp) +# CHECK-NEXT: [2,5] . . 
.DeeeeeeeE----------------------R retq + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 6.7 1.0 2.7 stmxcsr -4(%rsp) +# CHECK-NEXT: 1. 3 1.0 1.0 9.3 movl $-24577, %eax +# CHECK-NEXT: 2. 3 10.0 0.3 0.0 andl -4(%rsp), %eax +# CHECK-NEXT: 3. 3 16.0 0.0 0.0 movl %eax, -8(%rsp) +# CHECK-NEXT: 4. 3 15.0 0.0 0.0 ldmxcsr -8(%rsp) +# CHECK-NEXT: 5. 3 1.0 1.0 13.0 retq +# CHECK-NEXT: 3 8.3 0.6 4.2 From 3e8d9807d663d3180ba5093879f8f570f8c280bb Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Fri, 7 Aug 2020 16:38:15 -0400 Subject: [PATCH 299/363] [X86] Don't produce bad x86andp nodes for i1 vectors In D85499, I attempted to fix this same issue by canonicalizing andnp for i1 vectors, but since there was some opposition to such a change, this commit just fixes the bug by using two different forms depending on which kind of vector type is in use. We can then always decide to switch the canonical forms later. Description of the original bug: We have a DAG combine that tries to fold (vselect cond, 0000..., X) -> (andnp cond, x). However, it does so by attempting to create an i64 vector with the number of elements obtained by truncating division by 64 from the bitwidth. This is bad for mask vectors like v8i1, since that division is just zero. Besides, we don't want i64 vectors anyway. For i1 vectors, switch the pattern to (andnp (not cond), x), which is the canonical form for `kandn` on mask registers. Fixes https://github.com/JuliaLang/julia/issues/36955. 
Differential Revision: https://reviews.llvm.org/D85553 (cherry picked from commit c58674df147ac0e2777208376bfd2b0d9acbef48) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +++-- llvm/test/CodeGen/X86/avx512-select.ll | 61 +++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1671917157f4..fd1e6517dfac 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39588,10 +39588,14 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // vselect Cond, 000..., X -> andn Cond, X if (TValIsAllZeros) { - MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64); - SDValue CastCond = DAG.getBitcast(AndNVT, Cond); - SDValue CastRHS = DAG.getBitcast(AndNVT, RHS); - SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS); + SDValue CastRHS = DAG.getBitcast(CondVT, RHS); + SDValue AndN; + // The canonical form differs for i1 vectors - x86andnp is not used + if (CondVT.getScalarType() == MVT::i1) + AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT), + CastRHS); + else + AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS); return DAG.getBitcast(VT, AndN); } diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 634757ddbf9d..a60f6ee06e73 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -705,3 +705,64 @@ define void @select_v1i1(<1 x i1>* %w, <1 x i1>* %x, <1 x i1>* %y, i1 %z) nounwi store <1 x i1> %c, <1 x i1>* %x ret void } + +; Regression test from https://github.com/JuliaLang/julia/issues/36955 +define i8 @julia_issue36955(<8 x i1> %mask, <8 x double> %a) { +; X86-AVX512F-LABEL: julia_issue36955: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; X86-AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vxorpd %xmm2, %xmm2, 
%xmm2 +; X86-AVX512F-NEXT: vcmplepd %zmm2, %zmm1, %k1 +; X86-AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; X86-AVX512F-NEXT: korw %k0, %k1, %k0 +; X86-AVX512F-NEXT: kmovw %k0, %eax +; X86-AVX512F-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX512F-NEXT: vzeroupper +; X86-AVX512F-NEXT: retl +; +; X64-AVX512F-LABEL: julia_issue36955: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 +; X64-AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; X64-AVX512F-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vcmplepd %zmm2, %zmm1, %k1 +; X64-AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; X64-AVX512F-NEXT: korw %k0, %k1, %k0 +; X64-AVX512F-NEXT: kmovw %k0, %eax +; X64-AVX512F-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX512F-NEXT: vzeroupper +; X64-AVX512F-NEXT: retq +; +; X86-AVX512BW-LABEL: julia_issue36955: +; X86-AVX512BW: # %bb.0: +; X86-AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-AVX512BW-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; X86-AVX512BW-NEXT: vcmplepd %zmm3, %zmm1, %k1 +; X86-AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm2, %k0 {%k1} +; X86-AVX512BW-NEXT: korw %k0, %k1, %k0 +; X86-AVX512BW-NEXT: kmovd %k0, %eax +; X86-AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; X86-AVX512BW-NEXT: vzeroupper +; X86-AVX512BW-NEXT: retl +; +; X64-AVX512BW-LABEL: julia_issue36955: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vcmplepd %zmm3, %zmm1, %k1 +; X64-AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm2, %k0 {%k1} +; X64-AVX512BW-NEXT: korw %k0, %k1, %k0 +; X64-AVX512BW-NEXT: kmovd %k0, %eax +; X64-AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; X64-AVX512BW-NEXT: vzeroupper +; X64-AVX512BW-NEXT: retq + %fcmp = fcmp ugt <8 x double> %a, zeroinitializer + %xor = xor <8 x i1> %fcmp, + %select1 = select <8 x i1> %fcmp, <8 x i1> zeroinitializer, <8 x i1> %mask 
+ %select2 = select <8 x i1> %xor, <8 x i1> , <8 x i1> %select1 + %ret = bitcast <8 x i1> %select2 to i8 + ret i8 %ret +} From a1e0363c7402f7aa58e24e0e6dfa447ebabc1910 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 30 Sep 2020 11:35:00 +0200 Subject: [PATCH 300/363] Fix limit behavior of dynamic alloca When the allocation size is 0, we shouldn't probe. Within [1, PAGE_SIZE], we should probe once etc. This fixes https://bugs.llvm.org/show_bug.cgi?id=47657 Differential Revision: https://reviews.llvm.org/D88548 (cherry picked from commit 9573c9f2a363da71b2c07a3add4e52721e6028a0) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index fd1e6517dfac..f68ae4461fe3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31876,7 +31876,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, BuildMI(testMBB, DL, TII->get(X86::JCC_1)) .addMBB(tailMBB) - .addImm(X86::COND_L); + .addImm(X86::COND_LE); testMBB->addSuccessor(blockMBB); testMBB->addSuccessor(tailMBB); diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll index bc4678564083..82fd67842c8a 100644 --- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll @@ -24,12 +24,12 @@ attributes #0 = {"probe-stack"="inline-asm"} ; CHECK-X86-64-NEXT: andq $-16, %rcx ; CHECK-X86-64-NEXT: subq %rcx, %rax ; CHECK-X86-64-NEXT: cmpq %rsp, %rax -; CHECK-X86-64-NEXT: jl .LBB0_3 +; CHECK-X86-64-NEXT: jle .LBB0_3 ; CHECK-X86-64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 ; CHECK-X86-64-NEXT: movq $0, (%rsp) ; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 ; CHECK-X86-64-NEXT: cmpq %rsp, %rax -; CHECK-X86-64-NEXT: jge .LBB0_2 +; 
CHECK-X86-64-NEXT: jg .LBB0_2 ; CHECK-X86-64-NEXT: .LBB0_3: ; CHECK-X86-64-NEXT: movq %rax, %rsp ; CHECK-X86-64-NEXT: movl $1, 4792(%rax) @@ -54,12 +54,12 @@ attributes #0 = {"probe-stack"="inline-asm"} ; CHECK-X86-32-NEXT: andl $-16, %ecx ; CHECK-X86-32-NEXT: subl %ecx, %eax ; CHECK-X86-32-NEXT: cmpl %esp, %eax -; CHECK-X86-32-NEXT: jl .LBB0_3 +; CHECK-X86-32-NEXT: jle .LBB0_3 ; CHECK-X86-32-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 ; CHECK-X86-32-NEXT: movl $0, (%esp) ; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 ; CHECK-X86-32-NEXT: cmpl %esp, %eax -; CHECK-X86-32-NEXT: jge .LBB0_2 +; CHECK-X86-32-NEXT: jg .LBB0_2 ; CHECK-X86-32-NEXT: .LBB0_3: ; CHECK-X86-32-NEXT: movl %eax, %esp ; CHECK-X86-32-NEXT: movl $1, 4792(%eax) From aac36687f7978f33751daf2870b5c812124ebfaf Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 23 Jul 2020 16:22:48 +0200 Subject: [PATCH 301/363] Fix interaction between stack alignment and inline-asm stack clash protection As reported in https://github.com/rust-lang/rust/issues/70143 alignment is not taken into account when doing the probing. Fix that by adjusting the first probe if the stack align is small, or by extending the dynamic probing if the alignment is large. 
Differential Revision: https://reviews.llvm.org/D84419 (cherry picked from commit f2c6bfa350de142e4d63808d03335f69bd136d6a) --- llvm/lib/Target/X86/X86FrameLowering.cpp | 222 ++++++++++++++++-- llvm/lib/Target/X86/X86FrameLowering.h | 8 +- .../X86/stack-clash-large-large-align.ll | 88 +++++++ .../CodeGen/X86/stack-clash-no-free-probe.ll | 27 --- .../stack-clash-small-alloc-medium-align.ll | 135 +++++++++++ .../X86/stack-clash-small-large-align.ll | 83 +++++++ 6 files changed, 512 insertions(+), 51 deletions(-) create mode 100644 llvm/test/CodeGen/X86/stack-clash-large-large-align.ll delete mode 100644 llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll create mode 100644 llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll create mode 100644 llvm/test/CodeGen/X86/stack-clash-small-large-align.ll diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index c7ca6fb2a4fc..db6b68659493 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -586,29 +586,55 @@ void X86FrameLowering::emitStackProbeInlineGeneric( const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); uint64_t ProbeChunk = StackProbeSize * 8; + uint64_t MaxAlign = + TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0; + // Synthesize a loop or unroll it, depending on the number of iterations. + // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bits left + // between the unaligned rsp and current rsp. 
if (Offset > ProbeChunk) { - emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset); + emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset, + MaxAlign % StackProbeSize); } else { - emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset); + emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset, + MaxAlign % StackProbeSize); } } void X86FrameLowering::emitStackProbeInlineGenericBlock( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - uint64_t Offset) const { + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, + uint64_t AlignOffset) const { const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + uint64_t CurrentOffset = 0; - // 0 Thanks to return address being saved on the stack - uint64_t CurrentProbeOffset = 0; - // For the first N - 1 pages, just probe. I tried to take advantage of + assert(AlignOffset < StackProbeSize); + + // If the offset is so small it fits within a page, there's nothing to do. + if (StackProbeSize < Offset + AlignOffset) { + + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize - AlignOffset) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + NumFrameExtraProbe++; + CurrentOffset = StackProbeSize - AlignOffset; + } + + // For the next N - 1 pages, just probe. I tried to take advantage of // natural probes but it implies much more logic and there was very few // interesting natural probes to interleave. 
while (CurrentOffset + StackProbeSize < Offset) { @@ -626,9 +652,9 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock( .setMIFlag(MachineInstr::FrameSetup); NumFrameExtraProbe++; CurrentOffset += StackProbeSize; - CurrentProbeOffset += StackProbeSize; } + // No need to probe the tail, it is smaller than a Page. uint64_t ChunkSize = Offset - CurrentOffset; MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) @@ -639,8 +665,8 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock( void X86FrameLowering::emitStackProbeInlineGenericLoop( MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - uint64_t Offset) const { + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset, + uint64_t AlignOffset) const { assert(Offset && "null offset"); const X86Subtarget &STI = MF.getSubtarget(); @@ -648,6 +674,26 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + if (AlignOffset) { + if (AlignOffset < StackProbeSize) { + // Perform a first smaller allocation followed by a probe. + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(AlignOffset) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. 
+ + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + NumFrameExtraProbe++; + Offset -= AlignOffset; + } + } + // Synthesize a loop NumFrameLoopProbe++; const BasicBlock *LLVM_BB = MBB.getBasicBlock(); @@ -666,8 +712,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( // save loop bound { - const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); - BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed) + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed) .addReg(FinalStackProbed) .addImm(Offset / StackProbeSize * StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); @@ -675,8 +721,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( // allocate a page { - const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); - BuildMI(testMBB, DL, TII.get(Opc), StackPtr) + const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr) .addReg(StackPtr) .addImm(StackProbeSize) .setMIFlag(MachineInstr::FrameSetup); @@ -1052,13 +1098,149 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) - .addReg(Reg) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); - // The EFLAGS implicit def is dead. 
- MI->getOperand(3).setIsDead(); + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &STI = MF.getSubtarget(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); + + // We want to make sure that (in worst case) less than StackProbeSize bytes + // are not probed after the AND. This assumption is used in + // emitStackProbeInlineGeneric. + if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) { + { + NumFrameLoopProbe++; + MachineBasicBlock *entryMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MachineBasicBlock *headMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MachineBasicBlock *bodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MachineBasicBlock *footMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + + MachineFunction::iterator MBBIter = MBB.getIterator(); + MF.insert(MBBIter, entryMBB); + MF.insert(MBBIter, headMBB); + MF.insert(MBBIter, bodyMBB); + MF.insert(MBBIter, footMBB); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D; + + // Setup entry block + { + + entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI); + BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + MachineInstr *MI = + BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed) + .addReg(FinalStackProbed) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + + BuildMI(entryMBB, DL, + TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(entryMBB, DL, TII.get(X86::JCC_1)) + .addMBB(&MBB) + .addImm(X86::COND_E) + .setMIFlag(MachineInstr::FrameSetup); + entryMBB->addSuccessor(headMBB); + entryMBB->addSuccessor(&MBB); + } + + // Loop entry block + + { + const unsigned SUBOpc = + getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + + BuildMI(headMBB, DL, + TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // jump + BuildMI(headMBB, DL, TII.get(X86::JCC_1)) + .addMBB(footMBB) + .addImm(X86::COND_B) + .setMIFlag(MachineInstr::FrameSetup); + + headMBB->addSuccessor(bodyMBB); + headMBB->addSuccessor(footMBB); + } + + // setup loop body + { + addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + const unsigned SUBOpc = + getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + + // cmp with stack pointer bound + BuildMI(bodyMBB, DL, + TII.get(Uses64BitFramePtr ? 
X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // jump + BuildMI(bodyMBB, DL, TII.get(X86::JCC_1)) + .addMBB(bodyMBB) + .addImm(X86::COND_B) + .setMIFlag(MachineInstr::FrameSetup); + bodyMBB->addSuccessor(bodyMBB); + bodyMBB->addSuccessor(footMBB); + } + + // setup loop footer + { + BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr) + .addReg(FinalStackProbed) + .setMIFlag(MachineInstr::FrameSetup); + addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + footMBB->addSuccessor(&MBB); + } + + recomputeLiveIns(*headMBB); + recomputeLiveIns(*bodyMBB); + recomputeLiveIns(*footMBB); + recomputeLiveIns(MBB); + } + } else { + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + } } bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index c0b4be95f88d..bb2e83205e71 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -213,14 +213,14 @@ class X86FrameLowering : public TargetFrameLowering { void emitStackProbeInlineGenericBlock(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - uint64_t Offset) const; + const DebugLoc &DL, uint64_t Offset, + uint64_t Align) const; void emitStackProbeInlineGenericLoop(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, - uint64_t Offset) const; + const DebugLoc &DL, uint64_t Offset, + uint64_t Align) const; /// Emit a stub to later inline the target stack probe. 
MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, diff --git a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll new file mode 100644 index 000000000000..6c981cb4ac91 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo_noprotect() local_unnamed_addr { +; CHECK-LABEL: foo_noprotect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000 +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl $1, 28792(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i64 18000, align 4096 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +define i32 @foo_protect() local_unnamed_addr #0 { +; CHECK-LABEL: foo_protect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT:# %bb.1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_3 +; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 
0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_2 +; CHECK-NEXT:.LBB1_3: +; CHECK-NEXT: movq %r11, %rsp +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT:.LBB1_4: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000 +; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT:# %bb.6: +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl $1, 28792(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + + + %a = alloca i32, i64 18000, align 4096 + %b0 = getelementptr inbounds i32, i32* %a, i64 98 + %b1 = getelementptr inbounds i32, i32* %a, i64 7198 + store volatile i32 1, i32* %b0 + store volatile i32 1, i32* %b1 + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll b/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll deleted file mode 100644 index 652acbdf00ba..000000000000 --- a/llvm/test/CodeGen/X86/stack-clash-no-free-probe.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc < %s | FileCheck %s - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @foo(i64 %i) local_unnamed_addr #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 -; CHECK-NEXT: .cfi_def_cfa_offset 7888 -; CHECK-NEXT: movl $1, -128(%rsp,%rdi,4) -; CHECK-NEXT: movl -128(%rsp), %eax -; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq - - %a = alloca i32, i32 2000, align 16 - %b = getelementptr inbounds i32, i32* %a, i64 %i - store volatile i32 1, i32* %b - %c = load volatile i32, i32* 
%a - ret i32 %c -} - -attributes #0 = {"probe-stack"="inline-asm"} - diff --git a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll new file mode 100644 index 000000000000..eafa86f1eba9 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; | case1 | alloca + align < probe_size +define i32 @foo1(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo1: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $832, %rsp # imm = 0x340 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + %a = alloca i32, i32 200, align 64 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +; | case2 | alloca > probe_size, align > probe_size +define i32 @foo2(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-2048, %rsp # imm = 0xF800 +; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + %a 
= alloca i32, i32 2000, align 2048 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +; | case3 | alloca < probe_size, align < probe_size, alloca + align > probe_size +define i32 @foo3(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo3: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $1024, %rsp # imm = 0x400 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + %a = alloca i32, i32 1000, align 1024 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +; | case4 | alloca + probe_size < probe_size, followed by dynamic alloca +define i32 @foo4(i64 %i) local_unnamed_addr #0 { +; CHECK-LABEL: foo4: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $896, %rsp # imm = 0x380 +; CHECK-NEXT: movq %rsp, %rbx +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: movl $1, (%rbx,%rdi,4) +; CHECK-NEXT: movl (%rbx), %ecx +; CHECK-NEXT: movq %rsp, %rax +; CHECK-NEXT: leaq 15(,%rcx,4), %rcx +; CHECK-NEXT: andq $-16, %rcx +; CHECK-NEXT: subq %rcx, %rax +; CHECK-NEXT: cmpq %rsp, %rax +; CHECK-NEXT: jle .LBB3_3 +; CHECK-NEXT:.LBB3_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %rax +; 
CHECK-NEXT: jg .LBB3_2 +; CHECK-NEXT:.LBB3_3: +; CHECK-NEXT: andq $-64, %rax +; CHECK-NEXT: movq %rax, %rsp +; CHECK-NEXT: movl (%rax), %eax +; CHECK-NEXT: leaq -8(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + %a = alloca i32, i32 200, align 64 + %b = getelementptr inbounds i32, i32* %a, i64 %i + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + %d = alloca i32, i32 %c, align 64 + %e = load volatile i32, i32* %d + ret i32 %e +} + +attributes #0 = {"probe-stack"="inline-asm"} + diff --git a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll new file mode 100644 index 000000000000..e608bab90415 --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s | FileCheck %s + + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @foo_noprotect() local_unnamed_addr { +; CHECK-LABEL: foo_noprotect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-65536, %rsp +; CHECK-NEXT: subq $65536, %rsp +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + + %a = alloca i32, i64 100, align 65536 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +define i32 @foo_protect() local_unnamed_addr #0 { +; CHECK-LABEL: foo_protect: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movq %rsp, %r11 +; 
CHECK-NEXT: andq $-65536, %r11 # imm = 0xFFFF0000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT:# %bb.1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_3 +; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_2 +; CHECK-NEXT:.LBB1_3: +; CHECK-NEXT: movq %r11, %rsp +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT:.LBB1_4: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000 +; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT:# %bb.6: +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + + + + + %a = alloca i32, i64 100, align 65536 + %b = getelementptr inbounds i32, i32* %a, i64 98 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} + +attributes #0 = {"probe-stack"="inline-asm"} From bbe6cbbed8c7460a7e8477373b9250543362e771 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 27 Oct 2020 10:59:42 +0100 Subject: [PATCH 302/363] [stack-clash] Fix probing of dynamic alloca - Perform the probing in the correct direction. Related to https://github.com/rust-lang/rust/pull/77885#issuecomment-711062924 - The first touch on a dynamic alloca cannot use a mov because it clobbers existing space. 
Use a xor 0 instead Differential Revision: https://reviews.llvm.org/D90216 (cherry picked from commit 0f60bcc36c34522618bd1425a45f8c6006568fb6) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll | 12 ++++++------ .../X86/stack-clash-small-alloc-medium-align.ll | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f68ae4461fe3..afe470cc6e0b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31876,7 +31876,7 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, BuildMI(testMBB, DL, TII->get(X86::JCC_1)) .addMBB(tailMBB) - .addImm(X86::COND_LE); + .addImm(X86::COND_GE); testMBB->addSuccessor(blockMBB); testMBB->addSuccessor(tailMBB); @@ -31892,9 +31892,9 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, // // The property we want to enforce is to never have more than [page alloc] between two probes. - const unsigned MovMIOpc = - TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi; - addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0) + const unsigned XORMIOpc = + TFI.Uses64BitFramePtr ? 
X86::XOR64mi8 : X86::XOR32mi8; + addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0) .addImm(0); BuildMI(blockMBB, DL, diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll index 82fd67842c8a..6dd8b6ab5897 100644 --- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll @@ -24,12 +24,12 @@ attributes #0 = {"probe-stack"="inline-asm"} ; CHECK-X86-64-NEXT: andq $-16, %rcx ; CHECK-X86-64-NEXT: subq %rcx, %rax ; CHECK-X86-64-NEXT: cmpq %rsp, %rax -; CHECK-X86-64-NEXT: jle .LBB0_3 +; CHECK-X86-64-NEXT: jge .LBB0_3 ; CHECK-X86-64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-64-NEXT: movq $0, (%rsp) +; CHECK-X86-64-NEXT: xorq $0, (%rsp) ; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 ; CHECK-X86-64-NEXT: cmpq %rsp, %rax -; CHECK-X86-64-NEXT: jg .LBB0_2 +; CHECK-X86-64-NEXT: jl .LBB0_2 ; CHECK-X86-64-NEXT: .LBB0_3: ; CHECK-X86-64-NEXT: movq %rax, %rsp ; CHECK-X86-64-NEXT: movl $1, 4792(%rax) @@ -54,12 +54,12 @@ attributes #0 = {"probe-stack"="inline-asm"} ; CHECK-X86-32-NEXT: andl $-16, %ecx ; CHECK-X86-32-NEXT: subl %ecx, %eax ; CHECK-X86-32-NEXT: cmpl %esp, %eax -; CHECK-X86-32-NEXT: jle .LBB0_3 +; CHECK-X86-32-NEXT: jge .LBB0_3 ; CHECK-X86-32-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-32-NEXT: movl $0, (%esp) +; CHECK-X86-32-NEXT: xorl $0, (%esp) ; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 ; CHECK-X86-32-NEXT: cmpl %esp, %eax -; CHECK-X86-32-NEXT: jg .LBB0_2 +; CHECK-X86-32-NEXT: jl .LBB0_2 ; CHECK-X86-32-NEXT: .LBB0_3: ; CHECK-X86-32-NEXT: movl %eax, %esp ; CHECK-X86-32-NEXT: movl $1, 4792(%eax) diff --git a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll index eafa86f1eba9..39b6c3640a60 100644 --- a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll +++ 
b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll @@ -106,12 +106,12 @@ define i32 @foo4(i64 %i) local_unnamed_addr #0 { ; CHECK-NEXT: andq $-16, %rcx ; CHECK-NEXT: subq %rcx, %rax ; CHECK-NEXT: cmpq %rsp, %rax -; CHECK-NEXT: jle .LBB3_3 +; CHECK-NEXT: jge .LBB3_3 ; CHECK-NEXT:.LBB3_2: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: xorq $0, (%rsp) ; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 ; CHECK-NEXT: cmpq %rsp, %rax -; CHECK-NEXT: jg .LBB3_2 +; CHECK-NEXT: jl .LBB3_2 ; CHECK-NEXT:.LBB3_3: ; CHECK-NEXT: andq $-64, %rax ; CHECK-NEXT: movq %rax, %rsp From 8ac709578067f77a7036fe50610277516fa36d50 Mon Sep 17 00:00:00 2001 From: Bruno Cardoso Lopes Date: Mon, 12 Oct 2020 15:58:52 -0700 Subject: [PATCH 303/363] [SemaTemplate] Stop passing insertion position around during VarTemplate instantiation They can get stale at use time because of updates from other recursive specializations. Instead, rely on the existence of previous declarations to add the specialization. 
Differential Revision: https://reviews.llvm.org/D87853 (cherry picked from commit cffb0dd54d41d8e249d2009467c4beb5b681ba26) --- clang/include/clang/Sema/Sema.h | 2 +- clang/include/clang/Sema/Template.h | 2 +- clang/lib/Sema/SemaTemplate.cpp | 2 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 23 ++++++++----------- .../SemaTemplate/instantiate-var-template.cpp | 7 ++++++ 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 6f7ad8076718..7a7722559397 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -9123,7 +9123,7 @@ class Sema final { const TemplateArgumentList &TemplateArgList, const TemplateArgumentListInfo &TemplateArgsInfo, SmallVectorImpl &Converted, - SourceLocation PointOfInstantiation, void *InsertPos, + SourceLocation PointOfInstantiation, LateInstantiatedAttrVec *LateAttrs = nullptr, LocalInstantiationScope *StartingScope = nullptr); VarTemplateSpecializationDecl *CompleteVarTemplateSpecializationDecl( diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 91d175fdd050..0dcaf565591b 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -600,7 +600,7 @@ enum class TemplateSubstitutionKind : char { TagDecl *NewDecl); Decl *VisitVarTemplateSpecializationDecl( - VarTemplateDecl *VarTemplate, VarDecl *FromVar, void *InsertPos, + VarTemplateDecl *VarTemplate, VarDecl *FromVar, const TemplateArgumentListInfo &TemplateArgsInfo, ArrayRef Converted, VarTemplateSpecializationDecl *PrevDecl = nullptr); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index c05ed0b14e3e..ddae944a48f3 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4471,7 +4471,7 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, // FIXME: LateAttrs et al.? 
VarTemplateSpecializationDecl *Decl = BuildVarTemplateInstantiation( Template, InstantiationPattern, *InstantiationArgs, TemplateArgs, - Converted, TemplateNameLoc, InsertPos /*, LateAttrs, StartingScope*/); + Converted, TemplateNameLoc /*, LateAttrs, StartingScope*/); if (!Decl) return true; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index baec13ba627c..903785a2be42 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -3602,11 +3602,11 @@ Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( return nullptr; return VisitVarTemplateSpecializationDecl( - InstVarTemplate, D, InsertPos, VarTemplateArgsInfo, Converted, PrevDecl); + InstVarTemplate, D, VarTemplateArgsInfo, Converted, PrevDecl); } Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( - VarTemplateDecl *VarTemplate, VarDecl *D, void *InsertPos, + VarTemplateDecl *VarTemplate, VarDecl *D, const TemplateArgumentListInfo &TemplateArgsInfo, ArrayRef Converted, VarTemplateSpecializationDecl *PrevDecl) { @@ -3629,8 +3629,11 @@ Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(), VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted); Var->setTemplateArgsInfo(TemplateArgsInfo); - if (InsertPos) + if (!PrevDecl) { + void *InsertPos = nullptr; + VarTemplate->findSpecialization(Converted, InsertPos); VarTemplate->AddSpecialization(Var, InsertPos); + } if (SemaRef.getLangOpts().OpenCL) SemaRef.deduceOpenCLAddressSpace(Var); @@ -4839,7 +4842,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation( const TemplateArgumentList &TemplateArgList, const TemplateArgumentListInfo &TemplateArgsInfo, SmallVectorImpl &Converted, - SourceLocation PointOfInstantiation, void *InsertPos, + SourceLocation PointOfInstantiation, LateInstantiatedAttrVec *LateAttrs, 
LocalInstantiationScope *StartingScope) { if (FromVar->isInvalidDecl()) @@ -4878,7 +4881,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation( return cast_or_null( Instantiator.VisitVarTemplateSpecializationDecl( - VarTemplate, FromVar, InsertPos, TemplateArgsInfo, Converted)); + VarTemplate, FromVar, TemplateArgsInfo, Converted)); } /// Instantiates a variable template specialization by completing it @@ -5310,8 +5313,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, TemplateDeclInstantiator Instantiator(*this, Var->getDeclContext(), TemplateArgs); Var = cast_or_null(Instantiator.VisitVarTemplateSpecializationDecl( - VarSpec->getSpecializedTemplate(), Def, nullptr, - VarSpec->getTemplateArgsInfo(), VarSpec->getTemplateArgs().asArray())); + VarSpec->getSpecializedTemplate(), Def, VarSpec->getTemplateArgsInfo(), + VarSpec->getTemplateArgs().asArray(), VarSpec)); if (Var) { llvm::PointerUnion PatternPtr = @@ -5321,12 +5324,6 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, cast(Var)->setInstantiationOf( Partial, &VarSpec->getTemplateInstantiationArgs()); - // Merge the definition with the declaration. - LookupResult R(*this, Var->getDeclName(), Var->getLocation(), - LookupOrdinaryName, forRedeclarationInCurContext()); - R.addDecl(OldVar); - MergeVarDecl(Var, R); - // Attach the initializer. 
InstantiateVariableInitializer(Var, Def, TemplateArgs); } diff --git a/clang/test/SemaTemplate/instantiate-var-template.cpp b/clang/test/SemaTemplate/instantiate-var-template.cpp index b7b83e4afdd5..a24b205da596 100644 --- a/clang/test/SemaTemplate/instantiate-var-template.cpp +++ b/clang/test/SemaTemplate/instantiate-var-template.cpp @@ -40,3 +40,10 @@ namespace PR24483 { template A models; template<> struct B models<>; // expected-error {{incomplete type 'struct B'}} expected-note {{forward declaration}} } + +namespace InvalidInsertPos { + template T v; + template decltype(v) v; + template<> int v; + int k = v; +} From 03565ffd5da8370f5b89b69cd9868f32e2d75403 Mon Sep 17 00:00:00 2001 From: Raul Tambre Date: Thu, 20 Aug 2020 20:17:47 +0300 Subject: [PATCH 304/363] [CMake][compiler-rt][libunwind] Compile assembly files as ASM not C, unify workarounds It isn't very wise to pass an assembly file to the compiler and tell it to compile as a C file and hope that the compiler recognizes it as assembly instead. Simply don't mark the file as C and CMake will recognize the rest. This was attempted earlier in https://reviews.llvm.org/D85706, but reverted due to architecture issues on Apple. Subsequent digging revealed a similar change was done earlier for libunwind in https://reviews.llvm.org/rGb780df052dd2b246a760d00e00f7de9ebdab9d09. Afterwards workarounds were added for MinGW and Apple: * https://reviews.llvm.org/rGb780df052dd2b246a760d00e00f7de9ebdab9d09 * https://reviews.llvm.org/rGd4ded05ba851304b26a437896bc3962ef56f62cb The workarounds in libunwind and compiler-rt are unified and comments added pointing to each other. The workaround is updated to only be used for MinGW for CMake versions before 3.17, which fixed the issue (https://gitlab.kitware.com/cmake/cmake/-/merge_requests/4287). Additionally fixed Clang not being passed as the assembly compiler for compiler-rt runtime build. 
Example error: [525/634] Building C object lib/tsan/CMakeFiles/clang_rt.tsan-aarch64.dir/rtl/tsan_rtl_aarch64.S.o FAILED: lib/tsan/CMakeFiles/clang_rt.tsan-aarch64.dir/rtl/tsan_rtl_aarch64.S.o /opt/tooling/drive/host/bin/clang --target=aarch64-linux-gnu -I/opt/tooling/drive/llvm/compiler-rt/lib/tsan/.. -isystem /opt/tooling/drive/toolchain/opt/drive/toolchain/include -x c -Wall -Wno-unused-parameter -fno-lto -fPIC -fno-builtin -fno-exceptions -fomit-frame-pointer -funwind-tables -fno-stack-protector -fno-sanitize=safe-stack -fvisibility=hidden -fno-lto -O3 -gline-tables-only -Wno-gnu -Wno-variadic-macros -Wno-c99-extensions -Wno-non-virtual-dtor -fPIE -fno-rtti -Wframe-larger-than=530 -Wglobal-constructors --sysroot=. -MD -MT lib/tsan/CMakeFiles/clang_rt.tsan-aarch64.dir/rtl/tsan_rtl_aarch64.S.o -MF lib/tsan/CMakeFiles/clang_rt.tsan-aarch64.dir/rtl/tsan_rtl_aarch64.S.o.d -o lib/tsan/CMakeFiles/clang_rt.tsan-aarch64.dir/rtl/tsan_rtl_aarch64.S.o -c /opt/tooling/drive/llvm/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S /opt/tooling/drive/llvm/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S:29:1: error: expected identifier or '(' .section .text ^ 1 error generated. 
Differential Revision: https://reviews.llvm.org/D86308 (cherry picked from commit 45344cf7ac5b848f77825ffa37b0cb3b69b9b07b) --- clang/runtime/CMakeLists.txt | 1 + compiler-rt/cmake/Modules/AddCompilerRT.cmake | 10 +++++----- libunwind/src/CMakeLists.txt | 12 ++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/clang/runtime/CMakeLists.txt b/clang/runtime/CMakeLists.txt index e20cc26f60af..61bbbf8faedd 100644 --- a/clang/runtime/CMakeLists.txt +++ b/clang/runtime/CMakeLists.txt @@ -75,6 +75,7 @@ if(LLVM_BUILD_EXTERNAL_COMPILER_RT AND EXISTS ${COMPILER_RT_SRC_ROOT}/) CMAKE_ARGS ${CLANG_COMPILER_RT_CMAKE_ARGS} -DCMAKE_C_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang -DCMAKE_CXX_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++ + -DCMAKE_ASM_COMPILER=${LLVM_RUNTIME_OUTPUT_INTDIR}/clang -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DLLVM_CONFIG_PATH=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-config diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index dab55707338a..7c127a93dfa7 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -109,11 +109,11 @@ endfunction() function(add_asm_sources output) set(${output} ${ARGN} PARENT_SCOPE) - # Xcode will try to compile asm files as C ('clang -x c'), and that will fail. - if (${CMAKE_GENERATOR} STREQUAL "Xcode") - enable_language(ASM) - else() - # Pass ASM file directly to the C++ compiler. + # CMake doesn't pass the correct architecture for Apple prior to CMake 3.19. https://gitlab.kitware.com/cmake/cmake/-/issues/20771 + # MinGW didn't work correctly with assembly prior to CMake 3.17. https://gitlab.kitware.com/cmake/cmake/-/merge_requests/4287 and https://reviews.llvm.org/rGb780df052dd2b246a760d00e00f7de9ebdab9d09 + # Workaround these two issues by compiling as C. + # Same workaround used in libunwind. Also update there if changed here. 
+ if((APPLE AND CMAKE_VERSION VERSION_LESS 3.19) OR (MINGW AND CMAKE_VERSION VERSION_LESS 3.17)) set_source_files_properties(${ARGN} PROPERTIES LANGUAGE C) endif() endfunction() diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 8f79b1cf8740..309a85b22f59 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -24,14 +24,10 @@ set(LIBUNWIND_ASM_SOURCES UnwindRegistersRestore.S UnwindRegistersSave.S ) -if (MINGW OR APPLE) - # CMake doesn't build assembly sources for windows/gnu targets properly - # (up to current CMake, 3.16), so treat them as C files. - # Additionally, CMake ignores OSX_ARCHITECTURE for ASM files when targeting - # Apple platforms. - set_source_files_properties(${LIBUNWIND_ASM_SOURCES} - PROPERTIES - LANGUAGE C) + +# See add_asm_sources() in compiler-rt for explanation of this workaround. +if((APPLE AND CMAKE_VERSION VERSION_LESS 3.19) OR (MINGW AND CMAKE_VERSION VERSION_LESS 3.17)) + set_source_files_properties(${LIBUNWIND_ASM_SOURCES} PROPERTIES LANGUAGE C) endif() set(LIBUNWIND_HEADERS From e4500bab8b29bdea2f4c51e8a143b8122491e6f5 Mon Sep 17 00:00:00 2001 From: Raul Tambre Date: Wed, 19 Aug 2020 12:50:35 -0700 Subject: [PATCH 305/363] [AArch64][GlobalISel] Handle rtcGPR64RegClassID in AArch64RegisterBankInfo::getRegBankFromRegClass() TargetRegisterInfo::getMinimalPhysRegClass() returns rtcGPR64RegClassID for X16 and X17, as it's the last matching class. This in turn gets passed to AArch64RegisterBankInfo::getRegBankFromRegClass(), which hits an unreachable. It seems sensible to handle this case, so copies from X16 and X17 work. Copying from X17 is used in inline assembly in libunwind for pointer authentication. 
Differential Revision: https://reviews.llvm.org/D85720 (cherry picked from commit e887d0e89b837be37b4279735a9c1ac57e90c995) --- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 1 + .../GlobalISel/regbankselect-default.mir | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 7e3ff1948dad..93213f5977e5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -261,6 +261,7 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, case AArch64::GPR64common_and_GPR64noipRegClassID: case AArch64::GPR64noip_and_tcGPR64RegClassID: case AArch64::tcGPR64RegClassID: + case AArch64::rtcGPR64RegClassID: case AArch64::WSeqPairsClassRegClassID: case AArch64::XSeqPairsClassRegClassID: return getRegBank(AArch64::GPRRegBankID); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir index e226c0fbae7d..09884c75409e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir @@ -75,6 +75,8 @@ define void @test_gphi_ptr() { ret void } + define void @test_restricted_tail_call() { ret void } + ... --- @@ -888,3 +890,20 @@ body: | RET_ReallyLR implicit $x0 ... + +--- +name: test_restricted_tail_call +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x16, $x17 + ; CHECK-LABEL: name: test_restricted_tail_call + ; CHECK: liveins: $x16, $x17 + ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x16 + ; CHECK: [[COPY1:%[0-9]+]]:gpr(s64) = COPY $x17 + ; CHECK: RET_ReallyLR + %0:_(s64) = COPY $x16 + %1:_(s64) = COPY $x17 + RET_ReallyLR +... 
From d8e8ae195a2581bea454317c4c4eabf0943d1e6d Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 25 Nov 2020 16:38:08 -0800 Subject: [PATCH 306/363] Revert "[SemaTemplate] Stop passing insertion position around during VarTemplate instantiation" This reverts commit 8ac709578067f77a7036fe50610277516fa36d50. This commit changes the public API of clang, so it needs to be reworked. --- clang/include/clang/Sema/Sema.h | 2 +- clang/include/clang/Sema/Template.h | 2 +- clang/lib/Sema/SemaTemplate.cpp | 2 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 23 +++++++++++-------- .../SemaTemplate/instantiate-var-template.cpp | 7 ------ 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 7a7722559397..6f7ad8076718 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -9123,7 +9123,7 @@ class Sema final { const TemplateArgumentList &TemplateArgList, const TemplateArgumentListInfo &TemplateArgsInfo, SmallVectorImpl &Converted, - SourceLocation PointOfInstantiation, + SourceLocation PointOfInstantiation, void *InsertPos, LateInstantiatedAttrVec *LateAttrs = nullptr, LocalInstantiationScope *StartingScope = nullptr); VarTemplateSpecializationDecl *CompleteVarTemplateSpecializationDecl( diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 0dcaf565591b..91d175fdd050 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -600,7 +600,7 @@ enum class TemplateSubstitutionKind : char { TagDecl *NewDecl); Decl *VisitVarTemplateSpecializationDecl( - VarTemplateDecl *VarTemplate, VarDecl *FromVar, + VarTemplateDecl *VarTemplate, VarDecl *FromVar, void *InsertPos, const TemplateArgumentListInfo &TemplateArgsInfo, ArrayRef Converted, VarTemplateSpecializationDecl *PrevDecl = nullptr); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index ddae944a48f3..c05ed0b14e3e 
100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4471,7 +4471,7 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, // FIXME: LateAttrs et al.? VarTemplateSpecializationDecl *Decl = BuildVarTemplateInstantiation( Template, InstantiationPattern, *InstantiationArgs, TemplateArgs, - Converted, TemplateNameLoc /*, LateAttrs, StartingScope*/); + Converted, TemplateNameLoc, InsertPos /*, LateAttrs, StartingScope*/); if (!Decl) return true; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 903785a2be42..baec13ba627c 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -3602,11 +3602,11 @@ Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( return nullptr; return VisitVarTemplateSpecializationDecl( - InstVarTemplate, D, VarTemplateArgsInfo, Converted, PrevDecl); + InstVarTemplate, D, InsertPos, VarTemplateArgsInfo, Converted, PrevDecl); } Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( - VarTemplateDecl *VarTemplate, VarDecl *D, + VarTemplateDecl *VarTemplate, VarDecl *D, void *InsertPos, const TemplateArgumentListInfo &TemplateArgsInfo, ArrayRef Converted, VarTemplateSpecializationDecl *PrevDecl) { @@ -3629,11 +3629,8 @@ Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(), VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted); Var->setTemplateArgsInfo(TemplateArgsInfo); - if (!PrevDecl) { - void *InsertPos = nullptr; - VarTemplate->findSpecialization(Converted, InsertPos); + if (InsertPos) VarTemplate->AddSpecialization(Var, InsertPos); - } if (SemaRef.getLangOpts().OpenCL) SemaRef.deduceOpenCLAddressSpace(Var); @@ -4842,7 +4839,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation( const TemplateArgumentList &TemplateArgList, const 
TemplateArgumentListInfo &TemplateArgsInfo, SmallVectorImpl &Converted, - SourceLocation PointOfInstantiation, + SourceLocation PointOfInstantiation, void *InsertPos, LateInstantiatedAttrVec *LateAttrs, LocalInstantiationScope *StartingScope) { if (FromVar->isInvalidDecl()) @@ -4881,7 +4878,7 @@ VarTemplateSpecializationDecl *Sema::BuildVarTemplateInstantiation( return cast_or_null( Instantiator.VisitVarTemplateSpecializationDecl( - VarTemplate, FromVar, TemplateArgsInfo, Converted)); + VarTemplate, FromVar, InsertPos, TemplateArgsInfo, Converted)); } /// Instantiates a variable template specialization by completing it @@ -5313,8 +5310,8 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, TemplateDeclInstantiator Instantiator(*this, Var->getDeclContext(), TemplateArgs); Var = cast_or_null(Instantiator.VisitVarTemplateSpecializationDecl( - VarSpec->getSpecializedTemplate(), Def, VarSpec->getTemplateArgsInfo(), - VarSpec->getTemplateArgs().asArray(), VarSpec)); + VarSpec->getSpecializedTemplate(), Def, nullptr, + VarSpec->getTemplateArgsInfo(), VarSpec->getTemplateArgs().asArray())); if (Var) { llvm::PointerUnion PatternPtr = @@ -5324,6 +5321,12 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, cast(Var)->setInstantiationOf( Partial, &VarSpec->getTemplateInstantiationArgs()); + // Merge the definition with the declaration. + LookupResult R(*this, Var->getDeclName(), Var->getLocation(), + LookupOrdinaryName, forRedeclarationInCurContext()); + R.addDecl(OldVar); + MergeVarDecl(Var, R); + // Attach the initializer. 
InstantiateVariableInitializer(Var, Def, TemplateArgs); } diff --git a/clang/test/SemaTemplate/instantiate-var-template.cpp b/clang/test/SemaTemplate/instantiate-var-template.cpp index a24b205da596..b7b83e4afdd5 100644 --- a/clang/test/SemaTemplate/instantiate-var-template.cpp +++ b/clang/test/SemaTemplate/instantiate-var-template.cpp @@ -40,10 +40,3 @@ namespace PR24483 { template A models; template<> struct B models<>; // expected-error {{incomplete type 'struct B'}} expected-note {{forward declaration}} } - -namespace InvalidInsertPos { - template T v; - template decltype(v) v; - template<> int v; - int k = v; -} From 075cca3448389e97d60a9bb61f313642c31653dc Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 6 Nov 2020 14:08:30 -0500 Subject: [PATCH 307/363] [RTDYLD] support absolute relocations where needed These appear in some sections, such as DWARF tables, since RuntimeDyldELF explicitly maps to this as a sentinel value: https://github.com/llvm/llvm-project/blob/29d1fba7b5335d969e3e5daa84b7a25cd1fa75ef/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp#L1199 That could then be a source of problems if it tried to examine these sections (for example, with either setProcessAllSections(true) or ORCv2 on i686). 
Replaces https://reviews.llvm.org/D89241 Reviewed By: lhames, vchuravy Differential Revision: https://reviews.llvm.org/D90722 (cherry picked from commit 85f4be09124cc53e1a18a884ac7caf19988ab2a5) --- .../RuntimeDyld/RuntimeDyld.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 7e9b0690ccea..04f541b59557 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -308,7 +308,9 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { << " SID: " << SectionID << " Offset: " << format("%p", (uintptr_t)Addr) << " flags: " << *FlagsOrErr << "\n"); - GlobalSymbolTable[Name] = SymbolTableEntry(SectionID, Addr, *JITSymFlags); + if (!Name.empty()) // Skip absolute symbol relocations. + GlobalSymbolTable[Name] = + SymbolTableEntry(SectionID, Addr, *JITSymFlags); } else if (SymType == object::SymbolRef::ST_Function || SymType == object::SymbolRef::ST_Data || SymType == object::SymbolRef::ST_Unknown || @@ -340,8 +342,9 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) { << " SID: " << SectionID << " Offset: " << format("%p", (uintptr_t)SectOffset) << " flags: " << *FlagsOrErr << "\n"); - GlobalSymbolTable[Name] = - SymbolTableEntry(SectionID, SectOffset, *JITSymFlags); + if (!Name.empty()) // Skip absolute symbol relocations + GlobalSymbolTable[Name] = + SymbolTableEntry(SectionID, SectOffset, *JITSymFlags); } } @@ -769,8 +772,9 @@ Error RuntimeDyldImpl::emitCommonSymbols(const ObjectFile &Obj, LLVM_DEBUG(dbgs() << "Allocating common symbol " << Name << " address " << format("%p", Addr) << "\n"); - GlobalSymbolTable[Name] = - SymbolTableEntry(SectionID, Offset, std::move(*JITSymFlags)); + if (!Name.empty()) // Skip absolute symbol relocations. 
+ GlobalSymbolTable[Name] = + SymbolTableEntry(SectionID, Offset, std::move(*JITSymFlags)); Offset += Size; Addr += Size; } @@ -930,6 +934,8 @@ void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE, if (Loc == GlobalSymbolTable.end()) { ExternalSymbolRelocations[SymbolName].push_back(RE); } else { + assert(!SymbolName.empty() && + "Empty symbol should not be in GlobalSymbolTable"); // Copy the RE since we want to modify its addend. RelocationEntry RECopy = RE; const auto &SymInfo = Loc->second; @@ -1234,7 +1240,8 @@ void RuntimeDyldImpl::finalizeAsync( for (auto &RelocKV : SharedThis->ExternalSymbolRelocations) { StringRef Name = RelocKV.first(); - assert(!Name.empty() && "Symbol has no name?"); + if (Name.empty()) // Skip absolute symbol relocations. + continue; assert(!SharedThis->GlobalSymbolTable.count(Name) && "Name already processed. RuntimeDyld instances can not be re-used " "when finalizing with finalizeAsync."); From 890fdb8031e3ea2b2ede1e8b90c34a8977e38548 Mon Sep 17 00:00:00 2001 From: Joachim Meyer Date: Thu, 29 Oct 2020 19:45:49 +0100 Subject: [PATCH 308/363] [OpenMP] Use __OPENMP_NVPTX__ instead of _OPENMP in complex wrapper headers. This is very similar to 7f1e6fcff942, just fixing a left-over. With this, it should be possible to use both, -x cuda and -fopenmp in the same invocation, enabling to use both OpenMP, targeting CPU, and CUDA, targeting the GPU. 
Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D90415 (cherry picked from commit eaee608448c832e8f806faae30ae4100620c4688) --- clang/lib/Headers/__clang_cuda_complex_builtins.h | 6 +++--- clang/lib/Headers/openmp_wrappers/complex | 2 ++ clang/lib/Headers/openmp_wrappers/complex.h | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_complex_builtins.h b/clang/lib/Headers/__clang_cuda_complex_builtins.h index 8c10ff6b461f..d924487ab285 100644 --- a/clang/lib/Headers/__clang_cuda_complex_builtins.h +++ b/clang/lib/Headers/__clang_cuda_complex_builtins.h @@ -16,7 +16,7 @@ // to work with CUDA and OpenMP target offloading [in C and C++ mode].) #pragma push_macro("__DEVICE__") -#ifdef _OPENMP +#ifdef __OPENMP_NVPTX__ #pragma omp declare target #define __DEVICE__ __attribute__((noinline, nothrow, cold, weak)) #else @@ -26,7 +26,7 @@ // To make the algorithms available for C and C++ in CUDA and OpenMP we select // different but equivalent function versions. TODO: For OpenMP we currently // select the native builtins as the overload support for templates is lacking. -#if !defined(_OPENMP) +#if !defined(__OPENMP_NVPTX__) #define _ISNANd std::isnan #define _ISNANf std::isnan #define _ISINFd std::isinf @@ -250,7 +250,7 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) { #undef _LOGBd #undef _LOGBf -#ifdef _OPENMP +#ifdef __OPENMP_NVPTX__ #pragma omp end declare target #endif diff --git a/clang/lib/Headers/openmp_wrappers/complex b/clang/lib/Headers/openmp_wrappers/complex index 1ed0b14879ef..d8dcd41670ee 100644 --- a/clang/lib/Headers/openmp_wrappers/complex +++ b/clang/lib/Headers/openmp_wrappers/complex @@ -18,7 +18,9 @@ #include #define __CUDA__ +#define __OPENMP_NVPTX__ #include <__clang_cuda_complex_builtins.h> +#undef __OPENMP_NVPTX__ #endif // Grab the host header too. 
diff --git a/clang/lib/Headers/openmp_wrappers/complex.h b/clang/lib/Headers/openmp_wrappers/complex.h index 829c7a785725..00d278548f82 100644 --- a/clang/lib/Headers/openmp_wrappers/complex.h +++ b/clang/lib/Headers/openmp_wrappers/complex.h @@ -18,7 +18,9 @@ #include #define __CUDA__ +#define __OPENMP_NVPTX__ #include <__clang_cuda_complex_builtins.h> +#undef __OPENMP_NVPTX__ #endif // Grab the host header too. From f590845f5017bf0f131bb05b5ee51cfb7756aec2 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 12 Oct 2020 18:47:30 +0200 Subject: [PATCH 309/363] Revert "[clang-format] Fix AlignConsecutive on PP blocks" This reverts commit b2eb439317576ce718193763c12bff9fccdfc166. Caused the regression: https://bugs.llvm.org/show_bug.cgi?id=47589 Reviewed By: MyDeveloperDay Differential Revision: https://reviews.llvm.org/D89464 (cherry picked from commit b9e789447f14c0330edd22c82746af29e7c3b259) --- clang/lib/Format/FormatToken.h | 6 ----- clang/lib/Format/UnwrappedLineParser.cpp | 2 -- clang/lib/Format/WhitespaceManager.cpp | 10 +++----- clang/unittests/Format/FormatTest.cpp | 23 ------------------- clang/unittests/Format/FormatTestComments.cpp | 21 ----------------- 5 files changed, 3 insertions(+), 59 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index d4287f53fde3..b132a3e84da5 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -183,12 +183,6 @@ struct FormatToken { /// before the token. bool MustBreakBefore = false; - /// Whether to not align across this token - /// - /// This happens for example when a preprocessor directive ended directly - /// before the token, but very rarely otherwise. - bool MustBreakAlignBefore = false; - /// The raw text of the token. 
/// /// Contains the raw token text without leading whitespace and without leading diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index a37386425aae..ea8a41cfba82 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -3037,7 +3037,6 @@ void UnwrappedLineParser::readToken(int LevelDifference) { } FormatTok = Tokens->getNextToken(); FormatTok->MustBreakBefore = true; - FormatTok->MustBreakAlignBefore = true; } if (!PPStack.empty() && (PPStack.back().Kind == PP_Unreachable) && @@ -3062,7 +3061,6 @@ void UnwrappedLineParser::pushToken(FormatToken *Tok) { Line->Tokens.push_back(UnwrappedLineNode(Tok)); if (MustBreakBeforeNextToken) { Line->Tokens.back().Tok->MustBreakBefore = true; - Line->Tokens.back().Tok->MustBreakAlignBefore = true; MustBreakBeforeNextToken = false; } } diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 32e0b685ea0f..3a265bd09168 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -411,11 +411,9 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, if (Changes[i].NewlinesBefore != 0) { CommasBeforeMatch = 0; EndOfSequence = i; - // If there is a blank line, there is a forced-align-break (eg, - // preprocessor), or if the last line didn't contain any matching token, - // the sequence ends here. - if (Changes[i].NewlinesBefore > 1 || - Changes[i].Tok->MustBreakAlignBefore || !FoundMatchOnLine) + // If there is a blank line, or if the last line didn't contain any + // matching token, the sequence ends here. 
+ if (Changes[i].NewlinesBefore > 1 || !FoundMatchOnLine) AlignCurrentSequence(); FoundMatchOnLine = false; @@ -726,8 +724,6 @@ void WhitespaceManager::alignTrailingComments() { if (Changes[i].StartOfBlockComment) continue; Newlines += Changes[i].NewlinesBefore; - if (Changes[i].Tok->MustBreakAlignBefore) - BreakBeforeNext = true; if (!Changes[i].IsTrailingComment) continue; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 17d302f0b659..7e4d1fca5096 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -11968,29 +11968,6 @@ TEST_F(FormatTest, AlignConsecutiveAssignments) { verifyFormat("int oneTwoThree = 123; // comment\n" "int oneTwo = 12; // comment", Alignment); - - // Bug 25167 - verifyFormat("#if A\n" - "#else\n" - "int aaaaaaaa = 12;\n" - "#endif\n" - "#if B\n" - "#else\n" - "int a = 12;\n" - "#endif\n", - Alignment); - verifyFormat("enum foo {\n" - "#if A\n" - "#else\n" - " aaaaaaaa = 12;\n" - "#endif\n" - "#if B\n" - "#else\n" - " a = 12;\n" - "#endif\n" - "};\n", - Alignment); - EXPECT_EQ("int a = 5;\n" "\n" "int oneTwoThree = 123;", diff --git a/clang/unittests/Format/FormatTestComments.cpp b/clang/unittests/Format/FormatTestComments.cpp index 47509f29744c..d5b9f8e0885a 100644 --- a/clang/unittests/Format/FormatTestComments.cpp +++ b/clang/unittests/Format/FormatTestComments.cpp @@ -2780,27 +2780,6 @@ TEST_F(FormatTestComments, AlignTrailingComments) { " // line 2 about b\n" " long b;", getLLVMStyleWithColumns(80))); - - // Checks an edge case in preprocessor handling. 
- // These comments should *not* be aligned - EXPECT_EQ( - "#if FOO\n" - "#else\n" - "long a; // Line about a\n" - "#endif\n" - "#if BAR\n" - "#else\n" - "long b_long_name; // Line about b\n" - "#endif\n", - format("#if FOO\n" - "#else\n" - "long a; // Line about a\n" // Previous (bad) behavior - "#endif\n" - "#if BAR\n" - "#else\n" - "long b_long_name; // Line about b\n" - "#endif\n", - getLLVMStyleWithColumns(80))); } TEST_F(FormatTestComments, AlignsBlockCommentDecorations) { From fd623ba8061dda6f489390ada6b11dff58a4d470 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Thu, 19 Nov 2020 10:59:08 -0800 Subject: [PATCH 310/363] Fix crash after looking up dwo_id=0 in CU index. In the current state, if getFromHash(0) is called and there's no CU with dwo_id=0, the lookup will stop at an empty slot, then the check `Rows[H].getSignature() != S` won't cause the lookup to fail and return a nullptr (as it should), because the empty slot has a 0 in the signature field, and a pointer to the empty slot will be incorrectly returned. This patch fixes this by using the index field in the hash entry to check for empty slots: signature = 0 can match a valid hash but according to the spec the index for an occupied slot will always be non-zero. 
Differential Revision: https://reviews.llvm.org/D91670 (cherry picked from commit 314a0d73a8444e3b0c1a0d0de7d615d9448af1c9) --- llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 8 +- .../split-dwarf-zero-signature-not-found.s | 118 ++++++++++++++++++ 2 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-symbolizer/split-dwarf-zero-signature-not-found.s diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp index 3d4cecce27db..d27fd08db14e 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp @@ -286,10 +286,14 @@ const DWARFUnitIndex::Entry *DWARFUnitIndex::getFromHash(uint64_t S) const { auto H = S & Mask; auto HP = ((S >> 32) & Mask) | 1; - while (Rows[H].getSignature() != S && Rows[H].getSignature() != 0) + // The spec says "while 0 is a valid hash value, the row index in a used slot + // will always be non-zero". Loop until we find a match or an empty slot. + while (Rows[H].getSignature() != S && Rows[H].Index != nullptr) H = (H + HP) & Mask; - if (Rows[H].getSignature() != S) + // If the slot is empty, we don't care whether the signature matches (it could + // be zero and still match the zeros in the empty slot). + if (Rows[H].Index == nullptr) return nullptr; return &Rows[H]; diff --git a/llvm/test/tools/llvm-symbolizer/split-dwarf-zero-signature-not-found.s b/llvm/test/tools/llvm-symbolizer/split-dwarf-zero-signature-not-found.s new file mode 100644 index 000000000000..de3cc738e1ff --- /dev/null +++ b/llvm/test/tools/llvm-symbolizer/split-dwarf-zero-signature-not-found.s @@ -0,0 +1,118 @@ +## This test checks that looking up a zero hash in the .debug_cu_index hash +## table works correctly when there's no CU with signature = 0. +## +## LLVM used to check just the signature bits to decide if the hash lookup ended +## at a match or at an empty slot. 
This is wrong when signature = 0 because +## empty slots have all zeros in the signature field too, and LLVM would return +## the empty slot as a valid result. + +# REQUIRES: x86-registered-target + +# RUN: llvm-mc --filetype=obj --triple x86_64 %s -o %t --defsym MAIN=0 +# RUN: llvm-mc --filetype=obj --triple x86_64 %s -o %t.dwp +# RUN: llvm-symbolizer --obj=%t --dwp=%t.dwp 0x0 | FileCheck %s + +## This expected output is very uninteresting, but it's better than a crash. +# CHECK: ??:0:0 + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 8 # DW_FORM_string + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 85 # DW_AT_ranges + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + +## Create two CUs, with dwo_ids 0 and 1 respectively. +.ifdef MAIN +.irpc I,01 + .data +A\I: + .long \I + + .text +F\I: + nop + + .section .debug_info,"",@progbits +.Lcu_begin\I: + .long .Ldebug_info_end\I-.Ldebug_info_start\I # Length of Unit +.Ldebug_info_start\I: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .asciz "A.dwo" # DW_AT_GNU_dwo_name + .quad \I # DW_AT_GNU_dwo_id + .long .debug_addr # DW_AT_GNU_addr_base + .long .Lranges\I # DW_AT_ranges +.Ldebug_info_end\I: + + .section .debug_addr,"",@progbits + .quad A\I + .quad F\I + + .section .debug_ranges,"",@progbits +.Lranges\I: + .quad F\I + .quad F\I+1 + .quad 0 + .quad 0 +.endr +.else +## Deliberately omit compile unit 0 in the DWP. We want to check the case where +## a signature = 0 matches an empty hash slot in .debug_cu_index and the index +## in the parallel table has to be checked. 
+ .section .debug_abbrev.dwo,"e",@progbits +.Labbrev1: + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) +.Labbrev_end1: + + .section .debug_info.dwo,"e",@progbits +.Lcu_begin1: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .byte '1', '.', 'c', 0 # DW_AT_name +.Ldebug_info_end1: + + .section .debug_cu_index,"",@progbits + .long 2 # DWARF version number + .long 2 # Section count + .long 1 # Unit count + .long 8 # Slot count + + .quad 1, 0, 0, 0, 0, 0, 0, 0 # Hash table + .long 1, 0, 0, 0, 0, 0, 0, 0 # Index table + + .long 1 # DW_SECT_INFO + .long 3 # DW_SECT_ABBREV + + .long .Lcu_begin1-.debug_info.dwo + .long .Labbrev1-.debug_abbrev.dwo + + .long .Ldebug_info_end1-.Lcu_begin1 + .long .Labbrev_end1-.Labbrev1 + +.endif From 12c4c6cd770a1651676560924ff77c9e4b383848 Mon Sep 17 00:00:00 2001 From: Aaron Puchert Date: Sun, 22 Nov 2020 20:51:00 +0100 Subject: [PATCH 311/363] Build reproducible tarballs for releases Currently the tarballs contain superfluous metadata, like the user name of the packager and via Pax headers even the PID of the tar process that packaged the files. We build the monorepo projects directly from the git repo using "git archive" and for the test-suite we add some flags as recommended by https://reproducible-builds.org/docs/archives/. We don't use numeric owners though to be compatible with "git archive". The advantage of "git archive" is that the releaser doesn't have to download the tar ball and extract it, rather the archive is built directly from the repository. 
This is probably what GitHub uses internally to produce the tarballs, so I wouldn't expect a difference. Reviewed By: tstellar Differential Revision: https://reviews.llvm.org/D91494 (cherry picked from commit 1a009296a4e9a50e85908f9141c3c1ea860d73e4) --- llvm/utils/release/export.sh | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh index 3ffd7e78dd63..0c76ed047081 100755 --- a/llvm/utils/release/export.sh +++ b/llvm/utils/release/export.sh @@ -13,7 +13,7 @@ set -e -projects="llvm clang test-suite compiler-rt libcxx libcxxabi libclc clang-tools-extra polly lldb lld openmp libunwind flang" +projects="llvm clang compiler-rt libcxx libcxxabi libclc clang-tools-extra polly lldb lld openmp libunwind flang" release="" rc="" @@ -37,26 +37,34 @@ export_sources() { tag="$tag-$rc" fi - llvm_src_dir=llvm-project-$release$rc - mkdir -p $llvm_src_dir + llvm_src_dir=$(readlink -f $(dirname "$(readlink -f "$0")")/../../..) + [ -d $llvm_src_dir/.git ] || ( echo "No git repository at $llvm_src_dir" ; exit 1 ) echo $tag - echo "Fetching LLVM project source ..." - curl -L https://github.com/llvm/llvm-project/archive/$tag.tar.gz | \ - tar -C $llvm_src_dir --strip-components=1 -xzf - + target_dir=$(pwd) echo "Creating tarball for llvm-project ..." - tar -cJf llvm-project-$release$rc.tar.xz $llvm_src_dir + pushd $llvm_src_dir/ + git archive --prefix=llvm-project-$release$rc.src/ $tag . | xz >$target_dir/llvm-project-$release$rc.src.tar.xz + popd - echo "Fetching LLVM test-suite source ..." - mkdir -p $llvm_src_dir/test-suite - curl -L https://github.com/llvm/test-suite/archive/$tag.tar.gz | \ - tar -C $llvm_src_dir/test-suite --strip-components=1 -xzf - + if [ ! -d test-suite-$release$rc.src ] + then + echo "Fetching LLVM test-suite source ..." 
+ mkdir -p test-suite-$release$rc.src + curl -L https://github.com/llvm/test-suite/archive/$tag.tar.gz | \ + tar -C test-suite-$release$rc.src --strip-components=1 -xzf - + fi + echo "Creating tarball for test-suite ..." + tar --sort=name --owner=0 --group=0 \ + --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \ + -cJf test-suite-$release$rc.src.tar.xz test-suite-$release$rc.src for proj in $projects; do echo "Creating tarball for $proj ..." - mv $llvm_src_dir/$proj $llvm_src_dir/$proj-$release$rc.src - tar -C $llvm_src_dir -cJf $proj-$release$rc.src.tar.xz $proj-$release$rc.src + pushd $llvm_src_dir/$proj + git archive --prefix=$proj-$release$rc.src/ $tag . | xz >$target_dir/$proj-$release$rc.src.tar.xz + popd done } From a082c730b89fe5e544136ebe6370f452fd2772ee Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Sat, 8 Aug 2020 15:23:11 -0700 Subject: [PATCH 312/363] [WebAssembly] Fix FastISel address calculation bug Fixes PR47040, in which an assertion was improperly triggered during FastISel's address computation. The issue was that an `Address` set to be relative to the FrameIndex with offset zero was incorrectly considered to have an unset base. When the left hand side of an add set the Address to be 0 off the FrameIndex, the right side would not detect that the Address base had already been set and could try to set the Address to be relative to a register instead, triggering an assertion. This patch fixes the issue by explicitly tracking whether an `Address` has been set rather than interpreting an offset of zero to mean the `Address` has not been set. 
Differential Revision: https://reviews.llvm.org/D85581 (cherry picked from commit cc612c29084e907900ce63ad9031ab573a64e942) --- .../WebAssembly/WebAssemblyFastISel.cpp | 17 +++++++------- .../CodeGen/WebAssembly/fast-isel-pr47040.ll | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 8a0092a3f298..c2a0d3e01740 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -58,6 +58,9 @@ class WebAssemblyFastISel final : public FastISel { int FI; } Base; + // Whether the base has been determined yet + bool IsBaseSet = false; + int64_t Offset = 0; const GlobalValue *GV = nullptr; @@ -74,8 +77,9 @@ class WebAssemblyFastISel final : public FastISel { bool isFIBase() const { return Kind == FrameIndexBase; } void setReg(unsigned Reg) { assert(isRegBase() && "Invalid base register access!"); - assert(Base.Reg == 0 && "Overwriting non-zero register"); + assert(!IsBaseSet && "Base cannot be reset"); Base.Reg = Reg; + IsBaseSet = true; } unsigned getReg() const { assert(isRegBase() && "Invalid base register access!"); @@ -83,8 +87,9 @@ class WebAssemblyFastISel final : public FastISel { } void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); - assert(Base.FI == 0 && "Overwriting non-zero frame index"); + assert(!IsBaseSet && "Base cannot be reset"); Base.FI = FI; + IsBaseSet = true; } unsigned getFI() const { assert(isFIBase() && "Invalid base frame index access!"); @@ -98,13 +103,7 @@ class WebAssemblyFastISel final : public FastISel { int64_t getOffset() const { return Offset; } void setGlobalValue(const GlobalValue *G) { GV = G; } const GlobalValue *getGlobalValue() const { return GV; } - bool isSet() const { - if (isRegBase()) { - return Base.Reg != 0; - } else { - 
return Base.FI != 0; - } - } + bool isSet() const { return IsBaseSet; } }; /// Keep a pointer to the WebAssemblySubtarget around so that we can make the diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll new file mode 100644 index 000000000000..616ce295d7f2 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fast-isel-pr47040.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs + +; Regression test for PR47040, in which an assertion was improperly +; triggered during FastISel's address computation. The issue was that +; an `Address` set to be relative to FrameIndex zero was incorrectly +; considered to have an unset base. When the left hand side of an add +; set the Address to have a FrameIndex base of 0, the right side would +; not detect that the Address base had already been set and could try +; to set the Address to be relative to a register instead, triggering +; an assertion. + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +define i32 @foo() { + %stack_addr = alloca i32 + %stack_i = ptrtoint i32* %stack_addr to i32 + %added = add i32 %stack_i, undef + %added_addr = inttoptr i32 %added to i32* + %ret = load i32, i32* %added_addr + ret i32 %ret +} From 8b89bc0de0e190be04991a9622c5866a2e93ef6d Mon Sep 17 00:00:00 2001 From: Julien Jorge Date: Sun, 1 Nov 2020 17:29:26 -0800 Subject: [PATCH 313/363] [WebAssembly] Don't fold frame offset for global addresses When machine instructions are in the form of ``` %0 = CONST_I32 @str %1 = ADD_I32 %stack.0, %0 %2 = LOAD 0, 0, %1 ``` In the `ADD_I32` instruction, it is possible to fold it if `%0` is a `CONST_I32` from an immediate number. But in this case it is a global address, so we shouldn't do that. But we haven't checked if the operand of `ADD` is an immediate so far. This fixes the problem. (The case applies the same for `ADD_I64` and `CONST_I64` instructions.) 
Fixes https://bugs.llvm.org/show_bug.cgi?id=47944. Patch by Julien Jorge (jjorge@quarkslab.com) Reviewed By: dschuff Differential Revision: https://reviews.llvm.org/D90577 (cherry picked from commit 0fca6517118d435f9c2d7afe6135fd5f357509b5) --- .../Target/WebAssembly/WebAssemblyInstrInfo.td | 4 +++- .../WebAssembly/WebAssemblyRegisterInfo.cpp | 10 ++++++---- llvm/test/CodeGen/WebAssembly/userstack.ll | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 5ff0d73534a6..085910f01ee6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -328,7 +328,9 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), - (CONST_I32 tglobaladdr:$addr)>, Requires<[IsNotPIC]>; + (CONST_I32 tglobaladdr:$addr)>, Requires<[IsNotPIC, HasAddr32]>; +def : Pat<(i64 (WebAssemblywrapper tglobaladdr:$addr)), + (CONST_I64 tglobaladdr:$addr)>, Requires<[IsNotPIC, HasAddr64]>; def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), (GLOBAL_GET_I32 tglobaladdr:$addr)>, Requires<[IsPIC]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 130589c9df8c..6b6394a58339 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -101,10 +101,12 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( WebAssemblyFrameLowering::getOpcConst(MF) && MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) { MachineOperand &ImmMO = Def->getOperand(1); - ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset)); - MI.getOperand(FIOperandNum) - .ChangeToRegister(FrameRegister, /*isDef=*/false); - return; + if (ImmMO.isImm()) { + 
ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset)); + MI.getOperand(FIOperandNum) + .ChangeToRegister(FrameRegister, /*isDef=*/false); + return; + } } } } diff --git a/llvm/test/CodeGen/WebAssembly/userstack.ll b/llvm/test/CodeGen/WebAssembly/userstack.ll index dec202ea6af9..3d0e0d8c86a0 100644 --- a/llvm/test/CodeGen/WebAssembly/userstack.ll +++ b/llvm/test/CodeGen/WebAssembly/userstack.ll @@ -328,6 +328,22 @@ define void @inline_asm() { ret void } +; We optimize the format of "frame offset + operand" by folding it, but this is +; only possible when that operand is an immediate. In this example it is a +; global address, so we should not fold it. +; CHECK-LABEL: frame_offset_with_global_address +; CHECK: i[[PTR]].const ${{.*}}=, str +@str = local_unnamed_addr global [3 x i8] c"abc", align 16 +define i8 @frame_offset_with_global_address() { + %1 = alloca i8, align 4 + %2 = ptrtoint i8* %1 to i32 + ;; Here @str is a global address and not an immediate, so cannot be folded + %3 = getelementptr [3 x i8], [3 x i8]* @str, i32 0, i32 %2 + %4 = load i8, i8* %3, align 8 + %5 = and i8 %4, 67 + ret i8 %5 +} + ; CHECK: .globaltype __stack_pointer, i[[PTR]]{{$}} ; TODO: test over-aligned alloca From 19a8a7445dd6f55e6bacc5284fbd3acedfc737e1 Mon Sep 17 00:00:00 2001 From: Aaron En Ye Shi Date: Mon, 30 Nov 2020 20:53:17 +0000 Subject: [PATCH 314/363] [HIP] Fix HIP test on windows due to lld suffix On Windows, lld is instead named lld.exe, therefore a few HIP tests are failing. Instead the wildcard should be modified to .*lld.* to handle .exe. This fixes the bug: https://bugs.llvm.org/show_bug.cgi?id=48289. 
Differential Revision: https://reviews.llvm.org/D92342 (cherry picked from commit f89e9c8201ea5a5b63af854c92ed26bc7ab4b8db) --- clang/test/Driver/hip-toolchain-rdc-static-lib.hip | 4 ++-- clang/test/Driver/hip-toolchain-rdc.hip | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip index dc29b0f87e36..533d3457d5b4 100644 --- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip +++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip @@ -47,7 +47,7 @@ // CHECK-NOT: "*.llvm-link" // CHECK-NOT: ".*opt" // CHECK-NOT: ".*llc" -// CHECK: [[LLD: ".*lld"]] {{.*}} "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] +// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] // generate image for device side path on gfx900 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -78,6 +78,6 @@ // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]" -// CHECK: [[MC:".*llvm-mc"]] "-o" [[OBJBUNDLE:".*o"]] "{{.*}}.mcin" "--filetype=obj" +// CHECK: [[MC:".*llvm-mc.*"]] "-o" [[OBJBUNDLE:".*o"]] "{{.*}}.mcin" "--filetype=obj" // CHECK: [[AR:".*llvm-ar.*"]] "rcsD" "{{.*}}.out" [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]] diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 97d5e59c0c4b..d6297b6bf607 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -90,7 +90,7 @@ // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]" -// CHECK: [[MC:".*llvm-mc"]] "-o" [[OBJBUNDLE:".*o"]] "{{.*}}.mcin" "--filetype=obj" +// CHECK: [[MC:".*llvm-mc.*"]] "-o" [[OBJBUNDLE:".*o"]] "{{.*}}.mcin" "--filetype=obj" // output the 
executable // CHECK: [[LD:".*ld.*"]] {{.*}}"-o" "a.out" {{.*}} [[A_OBJ_HOST]] [[B_OBJ_HOST]] [[OBJBUNDLE]] From 2cf5c80ab6d4635932e4e7ef84951819540bf1a8 Mon Sep 17 00:00:00 2001 From: Aaron En Ye Shi Date: Tue, 1 Dec 2020 15:46:19 +0000 Subject: [PATCH 315/363] [HIP] Fix static-lib test CHECK bug Fix hip test failures that were introduced by previous changes to hip-toolchain-rdc-static-lib.hip test. The .*lld.* is matching a longer string than expected. Differential Revision: https://reviews.llvm.org/D92342 (cherry picked from commit cd5897d55908827faf3e16c505bd79732a8f6eb6) --- clang/test/Driver/hip-toolchain-rdc-static-lib.hip | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip index 533d3457d5b4..b698ec763249 100644 --- a/clang/test/Driver/hip-toolchain-rdc-static-lib.hip +++ b/clang/test/Driver/hip-toolchain-rdc-static-lib.hip @@ -47,7 +47,9 @@ // CHECK-NOT: "*.llvm-link" // CHECK-NOT: ".*opt" // CHECK-NOT: ".*llc" -// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] +// CHECK: [[LLD: ".*lld.*"]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// CHECK-SAME: "-plugin-opt=mcpu=gfx803" +// CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[A_BC1]] [[B_BC1]] // generate image for device side path on gfx900 // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -71,7 +73,9 @@ // CHECK-NOT: "*.llvm-link" // CHECK-NOT: ".*opt" // CHECK-NOT: ".*llc" -// CHECK: [[LLD]] {{.*}} "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]] +// CHECK: [[LLD]] {{.*}} "-plugin-opt=-amdgpu-internalize-symbols" +// CHECK-SAME: "-plugin-opt=mcpu=gfx900" +// CHECK-SAME: "-o" "[[IMG_DEV2:.*out]]" [[A_BC2]] [[B_BC2]] // combine images generated into hip fat binary object // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" From bb852a09ae36eec895445aee102d1751af9633d7 Mon Sep 17 00:00:00 2001 From: David Chisnall Date: Tue, 1 Dec 2020 
09:48:25 +0000 Subject: [PATCH 316/363] [GNU ObjC] Fix a regression listing methods twice. Methods synthesized from declared properties were being added to the method lists twice. This came from the change to list them in the class's method list, which missed removing the place in CGObjCGNU that added them again. Reviewed By: lanza Differential Revision: https://reviews.llvm.org/D91874 (cherry picked from commit d1ed67037de6f3f44dc446784f74f0e02adec9b5) --- clang/lib/CodeGen/CGObjCGNU.cpp | 13 ----------- clang/test/CodeGenObjC/gnu-method-only-once.m | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) create mode 100644 clang/test/CodeGenObjC/gnu-method-only-once.m diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index bb9c494ae68e..c64faf4c0af7 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -3511,19 +3511,6 @@ void CGObjCGNU::GenerateClass(const ObjCImplementationDecl *OID) { ClassMethods.insert(ClassMethods.begin(), OID->classmeth_begin(), OID->classmeth_end()); - // Collect the same information about synthesized properties, which don't - // show up in the instance method lists. 
- for (auto *propertyImpl : OID->property_impls()) - if (propertyImpl->getPropertyImplementation() == - ObjCPropertyImplDecl::Synthesize) { - auto addPropertyMethod = [&](const ObjCMethodDecl *accessor) { - if (accessor) - InstanceMethods.push_back(accessor); - }; - addPropertyMethod(propertyImpl->getGetterMethodDecl()); - addPropertyMethod(propertyImpl->getSetterMethodDecl()); - } - llvm::Constant *Properties = GeneratePropertyList(OID, ClassDecl); // Collect the names of referenced protocols diff --git a/clang/test/CodeGenObjC/gnu-method-only-once.m b/clang/test/CodeGenObjC/gnu-method-only-once.m new file mode 100644 index 000000000000..67d873ccc0aa --- /dev/null +++ b/clang/test/CodeGenObjC/gnu-method-only-once.m @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.0 -o - %s | FileCheck %s -check-prefix=CHECK-NEW +// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-1.8 -o - %s | FileCheck %s -check-prefix=CHECK-OLD + +// Clang 9 or 10 changed the handling of method lists so that methods provided +// from synthesised properties showed up in the method list, where previously +// CGObjCGNU had to collect them and merge them. One of the places where this +// merging happened was missed in the move and so we ended up emitting two +// copies of method metadata for declared properties. + +// This class has only instance properties and only one pair of synthesized +// methods from the property and so we should synthesize only one method list, +// with precisely two methods on it. +@interface X +@property (retain) id iProp; +@end + +@implementation X +@synthesize iProp; +@end + +// Check that the method list has precisely 2 methods. 
+// CHECK-NEW: @.objc_method_list = internal global { i8*, i32, i64, [2 x +// CHECK-OLD: @.objc_method_list = internal global { i8*, i32, [2 x From 79cac55bf044bd8502f54076816a942388e3eae0 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 6 Nov 2020 14:26:04 -0500 Subject: [PATCH 317/363] Fix unwind info relocation with large code model on AArch64 Makes sure that the unwind info uses 64bits pcrel relocation if a large code model is specified and handle the corresponding relocation in the ExecutionEngine. This can happen with certain kernel configuration (the same as the one in https://reviews.llvm.org/D27609, found at least on the ArchLinux stock kernel and the one used on https://www.packet.net/) using the builtin JIT memory manager. Co-authored-by: Yichao Yu Differential Revision: https://reviews.llvm.org/D27629 (cherry picked from commit 18805ea951be02fcab6e7b11c3c7d929bcf1441a) --- llvm/lib/MC/MCObjectFileInfo.cpp | 2 ++ .../MC/AArch64/ELF_ARM64_large-relocations.s | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 llvm/test/MC/AArch64/ELF_ARM64_large-relocations.s diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index b77a9635f64c..b9b4416fde21 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -317,6 +317,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { break; case Triple::ppc64: case Triple::ppc64le: + case Triple::aarch64: + case Triple::aarch64_be: case Triple::x86_64: FDECFIEncoding = dwarf::DW_EH_PE_pcrel | (Large ? 
dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4); diff --git a/llvm/test/MC/AArch64/ELF_ARM64_large-relocations.s b/llvm/test/MC/AArch64/ELF_ARM64_large-relocations.s new file mode 100644 index 000000000000..66f28dabd79f --- /dev/null +++ b/llvm/test/MC/AArch64/ELF_ARM64_large-relocations.s @@ -0,0 +1,20 @@ +# RUN: llvm-mc -triple=arm64-none-linux-gnu -large-code-model -filetype=obj -o %T/large-reloc.o %s +# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify -map-section large-reloc.o,.eh_frame=0x10000 -map-section large-reloc.o,.text=0xffff000000000000 -check=%s %T/large-reloc.o +# RUN-BE: llvm-mc -triple=aarch64_be-none-linux-gnu -large-code-model -filetype=obj -o %T/be-large-reloc.o %s +# RUN-BE: llvm-rtdyld -triple=aarch64_be-none-linux-gnu -verify -map-section be-large-reloc.o,.eh_frame=0x10000 -map-section be-large-reloc.o,.text=0xffff000000000000 -check=%s %T/be-large-reloc.o + + .text + .globl g + .p2align 2 + .type g,@function +g: + .cfi_startproc + mov x0, xzr + ret + .Lfunc_end0: + .size g, .Lfunc_end0-g + .cfi_endproc + +# Skip the CIE and load the 8 bytes PC begin pointer. +# Assuming the CIE and the FDE length are both 4 bytes. +# rtdyld-check: *{8}(section_addr(large-reloc.o, .eh_frame) + (*{4}(section_addr(large-reloc.o, .eh_frame))) + 0xc) = g - (section_addr(large-reloc.o, .eh_frame) + (*{4}(section_addr(large-reloc.o, .eh_frame))) + 0xc) From a60b9f1bf32026995080d28f7be33f2658da191e Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 13 Oct 2020 15:57:08 +0100 Subject: [PATCH 318/363] Add fatal error when running out of registers for SVE tuple call arguments When passing SVE types as arguments to function calls we can run out of hardware SVE registers. This is normally fine, since we switch to an indirect mode where we pass a pointer to a SVE stack object in a GPR. However, if we switch over part-way through processing a SVE tuple then part of it will be in registers and the other part will be on the stack. 
This is wrong and we'd like to avoid any silent ABI compatibility issues in future. For now, I've added a fatal error when this happens until we can get a proper fix. NOTE: Cherry-pick contains changes to remove redundant operand from min/max tests in 'llvm-ir-to-intrinsic.ll', which weren't originally part of this patch since they were removed in D85142 before this landed, but they fail otherwise. Differential Revision: https://reviews.llvm.org/D89326 (cherry picked from commit af57a0838eba528c2e5bd805d92c611435fca0d8) --- .../AArch64/AArch64CallingConvention.cpp | 9 +++ .../Target/AArch64/AArch64ISelLowering.cpp | 9 ++- .../CodeGen/AArch64/llvm-ir-to-intrinsic.ll | 58 +++++++++---------- .../sve-calling-convention-tuples-broken.ll | 23 ++++++++ 4 files changed, 69 insertions(+), 30 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index 84ec5afcc9c1..9ae2b465e247 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -35,6 +35,9 @@ static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; +static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2, + AArch64::Z3, AArch64::Z4, AArch64::Z5, + AArch64::Z6, AArch64::Z7}; static bool finishStackBlock(SmallVectorImpl &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, @@ -97,6 +100,8 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, RegList = DRegList; else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector()) RegList = QRegList; + else if (LocVT.isScalableVector()) + RegList = ZRegList; else { // Not an array we want to split up after all. 
return false; @@ -141,6 +146,10 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } + if (LocVT.isScalableVector()) + report_fatal_error( + "Passing consecutive scalable vector registers unsupported"); + // Mark all regs in the class as unavailable for (auto Reg : RegList) State.AllocateReg(Reg); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 45bfa85bdc07..74a6c28fd32a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14702,7 +14702,14 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { - return Ty->isArrayTy(); + if (Ty->isArrayTy()) + return true; + + const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); + if (TySize.isScalable() && TySize.getKnownMinSize() > 128) + return true; + + return false; } bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index 816465f9eaa1..2bf95d1e61fb 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -182,7 +182,7 @@ define @urem_i64( %a, %b ; SMIN ; -define @smin_i8( %a, %b, %c) { +define @smin_i8( %a, %b) { ; CHECK-LABEL: smin_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -193,7 +193,7 @@ define @smin_i8( %a, %b, ret %min } -define @smin_i16( %a, %b, %c) { +define @smin_i16( %a, %b) { ; CHECK-LABEL: smin_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -204,7 +204,7 @@ define @smin_i16( %a, %b ret %min } -define @smin_i32( %a, %b, %c) { +define @smin_i32( %a, %b) { ; CHECK-LABEL: smin_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -215,7 +215,7 @@ define @smin_i32( %a, %b ret %min 
} -define @smin_i64( %a, %b, %c) { +define @smin_i64( %a, %b) { ; CHECK-LABEL: smin_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -226,7 +226,7 @@ define @smin_i64( %a, %b ret %min } -define @smin_split_i8( %a, %b, %c) { +define @smin_split_i8( %a, %b) { ; CHECK-LABEL: smin_split_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -238,7 +238,7 @@ define @smin_split_i8( %a, %min } -define @smin_split_i16( %a, %b, %c) { +define @smin_split_i16( %a, %b) { ; CHECK-LABEL: smin_split_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -252,7 +252,7 @@ define @smin_split_i16( %a, %min } -define @smin_split_i32( %a, %b, %c) { +define @smin_split_i32( %a, %b) { ; CHECK-LABEL: smin_split_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -264,7 +264,7 @@ define @smin_split_i32( %a, %min } -define @smin_split_i64( %a, %b, %c) { +define @smin_split_i64( %a, %b) { ; CHECK-LABEL: smin_split_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -276,7 +276,7 @@ define @smin_split_i64( %a, %min } -define @smin_promote_i8( %a, %b, %c) { +define @smin_promote_i8( %a, %b) { ; CHECK-LABEL: smin_promote_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -289,7 +289,7 @@ define @smin_promote_i8( %a, %min } -define @smin_promote_i16( %a, %b, %c) { +define @smin_promote_i16( %a, %b) { ; CHECK-LABEL: smin_promote_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -302,7 +302,7 @@ define @smin_promote_i16( %a, %min } -define @smin_promote_i32( %a, %b, %c) { +define @smin_promote_i32( %a, %b) { ; CHECK-LABEL: smin_promote_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -319,7 +319,7 @@ define @smin_promote_i32( %a, @umin_i8( %a, %b, %c) { +define @umin_i8( %a, %b) { ; CHECK-LABEL: umin_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -330,7 +330,7 @@ define @umin_i8( %a, %b, ret %min } -define @umin_i16( %a, %b, %c) { +define @umin_i16( %a, %b) { ; CHECK-LABEL: umin_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -341,7 +341,7 @@ define @umin_i16( %a, %b ret %min } -define 
@umin_i32( %a, %b, %c) { +define @umin_i32( %a, %b) { ; CHECK-LABEL: umin_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -352,7 +352,7 @@ define @umin_i32( %a, %b ret %min } -define @umin_i64( %a, %b, %c) { +define @umin_i64( %a, %b) { ; CHECK-LABEL: umin_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -363,7 +363,7 @@ define @umin_i64( %a, %b ret %min } -define @umin_split_i64( %a, %b, %c) { +define @umin_split_i64( %a, %b) { ; CHECK-LABEL: umin_split_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -375,7 +375,7 @@ define @umin_split_i64( %a, %min } -define @umin_promote_i8( %a, %b, %c) { +define @umin_promote_i8( %a, %b) { ; CHECK-LABEL: umin_promote_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -392,7 +392,7 @@ define @umin_promote_i8( %a, @smax_i8( %a, %b, %c) { +define @smax_i8( %a, %b) { ; CHECK-LABEL: smax_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -403,7 +403,7 @@ define @smax_i8( %a, %b, ret %max } -define @smax_i16( %a, %b, %c) { +define @smax_i16( %a, %b) { ; CHECK-LABEL: smax_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -414,7 +414,7 @@ define @smax_i16( %a, %b ret %max } -define @smax_i32( %a, %b, %c) { +define @smax_i32( %a, %b) { ; CHECK-LABEL: smax_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -425,7 +425,7 @@ define @smax_i32( %a, %b ret %max } -define @smax_i64( %a, %b, %c) { +define @smax_i64( %a, %b) { ; CHECK-LABEL: smax_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -436,7 +436,7 @@ define @smax_i64( %a, %b ret %max } -define @smax_split_i32( %a, %b, %c) { +define @smax_split_i32( %a, %b) { ; CHECK-LABEL: smax_split_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -448,7 +448,7 @@ define @smax_split_i32( %a, %max } -define @smax_promote_i16( %a, %b, %c) { +define @smax_promote_i16( %a, %b) { ; CHECK-LABEL: smax_promote_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -465,7 +465,7 @@ define @smax_promote_i16( %a, @umax_i8( %a, %b, %c) { +define @umax_i8( %a, %b) { ; CHECK-LABEL: umax_i8: ; 
CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b @@ -476,7 +476,7 @@ define @umax_i8( %a, %b, ret %max } -define @umax_i16( %a, %b, %c) { +define @umax_i16( %a, %b) { ; CHECK-LABEL: umax_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -487,7 +487,7 @@ define @umax_i16( %a, %b ret %max } -define @umax_i32( %a, %b, %c) { +define @umax_i32( %a, %b) { ; CHECK-LABEL: umax_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s @@ -498,7 +498,7 @@ define @umax_i32( %a, %b ret %max } -define @umax_i64( %a, %b, %c) { +define @umax_i64( %a, %b) { ; CHECK-LABEL: umax_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d @@ -509,7 +509,7 @@ define @umax_i64( %a, %b ret %max } -define @umax_split_i16( %a, %b, %c) { +define @umax_split_i16( %a, %b) { ; CHECK-LABEL: umax_split_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h @@ -521,7 +521,7 @@ define @umax_split_i16( %a, %max } -define @umax_promote_i32( %a, %b, %c) { +define @umax_promote_i32( %a, %b) { ; CHECK-LABEL: umax_promote_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll new file mode 100644 index 000000000000..ee88f0b460ed --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll @@ -0,0 +1,23 @@ +; RUN: not --crash llc < %s -mtriple aarch64-linux-gnu -mattr=+sve >/dev/null 2>%t +; RUN: FileCheck %s < %t + +; CHECK: Passing consecutive scalable vector registers unsupported + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define float @foo(double* %x0, double* %x1) { +entry: + %0 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %1 = call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( %0) + %2 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x0) + %3 = call @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1( %1, double* %x1) + %call = call float @callee(float 1.000000e+00, %2, %3) + ret float 
%call +} + +declare float @callee(float, , ) + +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg) +declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() +declare @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(, double*) From 724f62a50241d782f9c46d98e4fb796d60953df4 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 27 Oct 2020 13:59:29 +0000 Subject: [PATCH 319/363] Add fatal error for unnamed SVE variadic arguments We don't currently support passing unnamed variadic SVE arguments so I've added a fatal error if we hit such cases to prevent any silent ABI issues in future. Differential Revision: https://reviews.llvm.org/D90230 (cherry picked from commit cea69fa4dcc4fcf3be62dba49ad012879d89377d) --- clang/lib/CodeGen/TargetInfo.cpp | 5 ++++ clang/test/CodeGen/aarch64-varargs-sve.c | 21 +++++++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 8 ++++++ .../AArch64/sve-varargs-callee-broken.ll | 22 ++++++++++++++++ .../AArch64/sve-varargs-caller-broken.ll | 12 +++++++++ llvm/test/CodeGen/AArch64/sve-varargs.ll | 26 +++++++++++++++++++ 6 files changed, 94 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-varargs-sve.c create mode 100644 llvm/test/CodeGen/AArch64/sve-varargs-callee-broken.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-varargs-caller-broken.ll create mode 100644 llvm/test/CodeGen/AArch64/sve-varargs.ll diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 9cd63ebe29ee..f10f8e58b78a 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -5471,6 +5471,11 @@ class AArch64ABIInfo : public SwiftABIInfo { Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty) const override { + llvm::Type *BaseTy = CGF.ConvertType(Ty); + if (isa(BaseTy)) + llvm::report_fatal_error("Passing SVE types to variadic functions is " + "currently not supported"); + return Kind == Win64 ? EmitMSVAArg(CGF, VAListAddr, Ty) : isDarwinPCS() ? 
EmitDarwinVAArg(VAListAddr, Ty, CGF) : EmitAAPCSVAArg(VAListAddr, Ty, CGF); diff --git a/clang/test/CodeGen/aarch64-varargs-sve.c b/clang/test/CodeGen/aarch64-varargs-sve.c new file mode 100644 index 000000000000..bf57c6e1770a --- /dev/null +++ b/clang/test/CodeGen/aarch64-varargs-sve.c @@ -0,0 +1,21 @@ +// REQUIRES: aarch64-registered-target +// RUN: not %clang_cc1 -triple aarch64-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -emit-llvm -o - %s 2>&1 | FileCheck %s +// RUN: not %clang_cc1 -triple arm64-apple-ios7 -target-abi darwinpcs -target-feature +sve -fallow-half-arguments-and-returns -emit-llvm -o - %s 2>&1 | FileCheck %s + +// CHECK: Passing SVE types to variadic functions is currently not supported + +#include +#include + +double foo(char *str, ...) { + va_list ap; + svfloat64_t v; + double x; + + va_start(ap, str); + v = va_arg(ap, svfloat64_t); + x = va_arg(ap, double); + va_end(ap); + + return x + svaddv(svptrue_b8(), v); +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 74a6c28fd32a..48ca9039b1bd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4366,6 +4366,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; + if (!Outs[i].IsFixed && ArgVT.isScalableVector()) + report_fatal_error("Passing SVE types to variadic functions is " + "currently not supported"); + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/ !Outs[i].IsFixed); @@ -6168,6 +6172,10 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { Chain = VAList.getValue(1); VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); + if (VT.isScalableVector()) + report_fatal_error("Passing SVE types to variadic functions is " + "currently not supported"); + if (Align && *Align > MinSlotSize) { 
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align->value() - 1, DL, PtrVT)); diff --git a/llvm/test/CodeGen/AArch64/sve-varargs-callee-broken.ll b/llvm/test/CodeGen/AArch64/sve-varargs-callee-broken.ll new file mode 100644 index 000000000000..cd097d5cbb1d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-varargs-callee-broken.ll @@ -0,0 +1,22 @@ +; RUN: not --crash llc -mtriple arm64-apple-ios7 -mattr=+sve < %s 2>&1 | FileCheck %s + +; CHECK: Passing SVE types to variadic functions is currently not supported + +@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1 +define void @foo(i8* %fmt, ...) nounwind { +entry: + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vv = alloca , align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %1 = va_arg i8** %args, + store %1, * %vv, align 16 + ret void +} + +declare void @llvm.va_start(i8*) nounwind diff --git a/llvm/test/CodeGen/AArch64/sve-varargs-caller-broken.ll b/llvm/test/CodeGen/AArch64/sve-varargs-caller-broken.ll new file mode 100644 index 000000000000..0f26728b26cb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-varargs-caller-broken.ll @@ -0,0 +1,12 @@ +; RUN: not --crash llc -mtriple aarch64-linux-gnu -mattr=+sve <%s 2>&1 | FileCheck %s + +declare i32 @sve_printf(i8*, , ...) + +@.str_1 = internal constant [6 x i8] c"boo!\0A\00" + +; CHECK: Passing SVE types to variadic functions is currently not supported +define void @foo( %x) { + %f = getelementptr [6 x i8], [6 x i8]* @.str_1, i64 0, i64 0 + call i32 (i8*, , ...) 
@sve_printf(i8* %f, %x, %x) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-varargs.ll b/llvm/test/CodeGen/AArch64/sve-varargs.ll new file mode 100644 index 000000000000..0d3c8b6388a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-varargs.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +declare i32 @sve_printf(i8*, , ...) + +@.str_1 = internal constant [6 x i8] c"boo!\0A\00" + +define void @foo( %x) { +; CHECK-LABEL: foo: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: adrp x0, .str_1 +; CHECK-NEXT: add x0, x0, :lo12:.str_1 +; CHECK-NEXT: bl sve_printf +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %f = getelementptr [6 x i8], [6 x i8]* @.str_1, i64 0, i64 0 + call i32 (i8*, , ...) @sve_printf(i8* %f, %x) + ret void +} From 542174d77deb7f2a59bcd0b8147144d4e123cf7b Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Thu, 8 Oct 2020 14:51:10 +0000 Subject: [PATCH 320/363] Implement .variant_pcs directive A dynamic linker with lazy binding support may need to handle variant PCS function symbols specially, so an ELF symbol table marking STO_AARCH64_VARIANT_PCS [1] was added to address this. Function symbols that follow the vector PCS are marked via the .variant_pcs assembler directive, which takes a single parameter specifying the symbol name and sets the STO_AARCH64_VARIANT_PCS st_other flag in the object file. 
[1] https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst#st-other-values Reviewed By: sdesmalen Differential Revision: https://reviews.llvm.org/D89138 (cherry picked from commit c87bd2d8eb378d152f2b6bde4cb088ad390a676c) --- llvm/include/llvm/BinaryFormat/ELF.h | 6 +++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 15 ++++++ .../AArch64/AsmParser/AArch64AsmParser.cpp | 30 +++++++++++ .../MCTargetDesc/AArch64ELFStreamer.cpp | 8 +++ .../MCTargetDesc/AArch64TargetStreamer.h | 4 ++ llvm/test/CodeGen/AArch64/variant-pcs.ll | 51 +++++++++++++++++++ .../MC/AArch64/directive-variant_pcs-err.s | 17 +++++++ llvm/test/MC/AArch64/directive-variant_pcs.s | 11 ++++ 8 files changed, 142 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/variant-pcs.ll create mode 100644 llvm/test/MC/AArch64/directive-variant_pcs-err.s create mode 100644 llvm/test/MC/AArch64/directive-variant_pcs.s diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index bdcf10fd1640..21a5c26883cd 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -405,6 +405,12 @@ enum { #include "ELFRelocs/AArch64.def" }; +// Special values for the st_other field in the symbol table entry for AArch64. +enum { + // Symbol may follow different calling convention than base PCS. 
+ STO_AARCH64_VARIANT_PCS = 0x80 +}; + // ARM Specific e_flags enum : unsigned { EF_ARM_SOFT_FLOAT = 0x00000200U, // Legacy pre EABI_VER5 diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 3a94820dac8d..7ec7ffe309f7 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -89,6 +89,8 @@ class AArch64AsmPrinter : public AsmPrinter { void emitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned JTI); + void emitFunctionEntryLabel() override; + void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI); void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, @@ -822,6 +824,19 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI, OutStreamer->emitValue(Value, Size); } +void AArch64AsmPrinter::emitFunctionEntryLabel() { + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall || + MF->getFunction().getCallingConv() == + CallingConv::AArch64_SVE_VectorCall || + STI->getRegisterInfo()->hasSVEArgsOrReturn(MF)) { + auto *TS = + static_cast(OutStreamer->getTargetStreamer()); + TS->emitDirectiveVariantPCS(CurrentFnSym); + } + + return AsmPrinter::emitFunctionEntryLabel(); +} + /// Small jump tables contain an unsigned byte or half, representing the offset /// from the lowest-addressed possible destination to the desired basic /// block. 
Since all instructions are 4-byte aligned, this is further compressed diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 0ac09c4f96f0..e72ae0e62cb7 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -179,6 +179,8 @@ class AArch64AsmParser : public MCTargetAsmParser { bool parseDirectiveCFINegateRAState(); bool parseDirectiveCFIBKeyFrame(); + bool parseDirectiveVariantPCS(SMLoc L); + bool validateInstruction(MCInst &Inst, SMLoc &IDLoc, SmallVectorImpl &Loc); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -5077,6 +5079,8 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { parseDirectiveCFIBKeyFrame(); else if (IDVal == ".arch_extension") parseDirectiveArchExtension(Loc); + else if (IDVal == ".variant_pcs") + parseDirectiveVariantPCS(Loc); else if (IsMachO) { if (IDVal == MCLOHDirectiveName()) parseDirectiveLOH(IDVal, Loc); @@ -5507,6 +5511,32 @@ bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() { return false; } +/// parseDirectiveVariantPCS +/// ::= .variant_pcs symbolname +bool AArch64AsmParser::parseDirectiveVariantPCS(SMLoc L) { + MCAsmParser &Parser = getParser(); + + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef SymbolName = Tok.getIdentifier(); + + MCSymbol *Sym = getContext().lookupSymbol(SymbolName); + if (!Sym) + return TokError("unknown symbol in '.variant_pcs' directive"); + + Parser.Lex(); // Eat the symbol + + // Shouldn't be any more tokens + if (parseToken(AsmToken::EndOfStatement)) + return addErrorSuffix(" in '.variant_pcs' directive"); + + getTargetStreamer().emitDirectiveVariantPCS(Sym); + + return false; +} + bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, AArch64MCExpr::VariantKind &ELFRefKind, diff --git 
a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index fe4c34be1519..6dfda8217628 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -47,6 +47,10 @@ class AArch64TargetAsmStreamer : public AArch64TargetStreamer { void emitInst(uint32_t Inst) override; + void emitDirectiveVariantPCS(MCSymbol *Symbol) override { + OS << "\t.variant_pcs " << Symbol->getName() << "\n"; + } + public: AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); }; @@ -194,6 +198,10 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) { getStreamer().emitInst(Inst); } +void AArch64TargetELFStreamer::emitDirectiveVariantPCS(MCSymbol *Symbol) { + cast(Symbol)->setOther(ELF::STO_AARCH64_VARIANT_PCS); +} + MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index 3a0c5d8318dd..1af978a806d1 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -36,6 +36,9 @@ class AArch64TargetStreamer : public MCTargetStreamer { /// Callback used to implement the .inst directive. virtual void emitInst(uint32_t Inst); + /// Callback used to implement the .variant_pcs directive. 
+ virtual void emitDirectiveVariantPCS(MCSymbol *Symbol) {}; + virtual void EmitARM64WinCFIAllocStack(unsigned Size) {} virtual void EmitARM64WinCFISaveFPLR(int Offset) {} virtual void EmitARM64WinCFISaveFPLRX(int Offset) {} @@ -63,6 +66,7 @@ class AArch64TargetELFStreamer : public AArch64TargetStreamer { AArch64ELFStreamer &getStreamer(); void emitInst(uint32_t Inst) override; + void emitDirectiveVariantPCS(MCSymbol *Symbol) override; public: AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {} diff --git a/llvm/test/CodeGen/AArch64/variant-pcs.ll b/llvm/test/CodeGen/AArch64/variant-pcs.ll new file mode 100644 index 000000000000..f6e5fd13f1ed --- /dev/null +++ b/llvm/test/CodeGen/AArch64/variant-pcs.ll @@ -0,0 +1,51 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -o - %s | FileCheck %s --check-prefix=CHECK-ASM +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -filetype=obj -o - %s \ +; RUN: | llvm-readobj --symbols - | FileCheck %s --check-prefix=CHECK-OBJ + +define i32 @base_pcs() { +; CHECK-ASM-LABEL: base_pcs: +; CHECK-ASM-NOT: .variant_pcs +; CHECK-OBJ-LABEL: Name: base_pcs +; CHECK-OBJ: Other: 0 + ret i32 42 +} + +define aarch64_vector_pcs <4 x i32> @neon_vector_pcs_1(<4 x i32> %arg) { +; CHECK-ASM: .variant_pcs neon_vector_pcs_1 +; CHECK-ASM-NEXT: neon_vector_pcs_1: +; CHECK-OBJ-LABEL: Name: neon_vector_pcs_1 +; CHECK-OBJ: Other [ (0x80) + ret <4 x i32> %arg +} + +define @sve_vector_pcs_1() { +; CHECK-ASM: .variant_pcs sve_vector_pcs_1 +; CHECK-ASM-NEXT: sve_vector_pcs_1: +; CHECK-OBJ-LABEL: Name: sve_vector_pcs_1 +; CHECK-OBJ: Other [ (0x80) + ret undef +} + +define @sve_vector_pcs_2() { +; CHECK-ASM: .variant_pcs sve_vector_pcs_2 +; CHECK-ASM-NEXT: sve_vector_pcs_2: +; CHECK-OBJ-LABEL: Name: sve_vector_pcs_2 +; CHECK-OBJ: Other [ (0x80) + ret undef +} + +define void @sve_vector_pcs_3( %arg) { +; CHECK-ASM: .variant_pcs sve_vector_pcs_3 +; CHECK-ASM-NEXT: sve_vector_pcs_3: +; CHECK-OBJ-LABEL: Name: sve_vector_pcs_3 +; CHECK-OBJ: 
Other [ (0x80) + ret void +} + +define void @sve_vector_pcs_4( %arg) { +; CHECK-ASM: .variant_pcs sve_vector_pcs_4 +; CHECK-ASM-NEXT: sve_vector_pcs_4: +; CHECK-OBJ-LABEL: Name: sve_vector_pcs_4 +; CHECK-OBJ: Other [ (0x80) + ret void +} diff --git a/llvm/test/MC/AArch64/directive-variant_pcs-err.s b/llvm/test/MC/AArch64/directive-variant_pcs-err.s new file mode 100644 index 000000000000..98cf703b564e --- /dev/null +++ b/llvm/test/MC/AArch64/directive-variant_pcs-err.s @@ -0,0 +1,17 @@ +// RUN: not llvm-mc -triple aarch64-unknown-none-eabi -filetype asm -o - %s 2>&1 | FileCheck %s + +.variant_pcs +// CHECK: error: expected symbol name +// CHECK-NEXT: .variant_pcs +// CHECK-NEXT: ^ + +.variant_pcs foo +// CHECK: error: unknown symbol in '.variant_pcs' directive +// CHECK-NEXT: .variant_pcs foo +// CHECK-NEXT: ^ + +.global foo +.variant_pcs foo bar +// CHECK: error: unexpected token in '.variant_pcs' directive +// CHECK-NEXT: .variant_pcs foo bar +// CHECK-NEXT: ^ diff --git a/llvm/test/MC/AArch64/directive-variant_pcs.s b/llvm/test/MC/AArch64/directive-variant_pcs.s new file mode 100644 index 000000000000..f6f9c9c272f7 --- /dev/null +++ b/llvm/test/MC/AArch64/directive-variant_pcs.s @@ -0,0 +1,11 @@ +// RUN: llvm-mc -triple aarch64-elf -filetype asm -o - %s | FileCheck %s +// RUN: llvm-mc -triple aarch64-elf -filetype obj -o - %s \ +// RUN: | llvm-readobj --symbols - | FileCheck %s --check-prefix=CHECK-ST_OTHER + +.text +.global foo +.variant_pcs foo +// CHECK: .variant_pcs foo + +// CHECK-ST_OTHER: Name: foo +// CHECK-ST_OTHER: Other [ (0x80) From edc57e7e7ca2769d4d63bc939396f0d60666d262 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 27 Oct 2020 18:20:56 -0700 Subject: [PATCH 321/363] Guard `find_library(tensorflow_c_api ...)` by checking for TENSORFLOW_C_LIB_PATH to be set by the user Also have CMake fails if the user provides a TENSORFLOW_C_LIB_PATH but we can't find TensorFlow at this path. 
At the moment the CMake script tries to figure if TensorFlow is available on the system and enables support for it. This is in general not desirable to customize build features this way and instead it is preferable to let the user opt-in explicitly into the features they want to enable. This is in line with other optional external dependencies like Z3. There are a few reasons to this but amongst others: - reproducibility: making features "magically" enabled based on whether we find a package on the system or not makes it harder to handle bug reports from users. - user control: they can't have TensorFlow on the system and build LLVM without TensorFlow right now. They also would suddenly distribute LLVM with a different set of features unknowingly just because their build machine environment would change subtly. Right now this is motivated by a user reporting build failures on their system: .../mesa-git/llvm-git/src/llvm-project/llvm/lib/Analysis/TFUtils.cpp:23:10: fatal error: tensorflow/c/c_api.h: No such file or directory 23 | #include "tensorflow/c/c_api.h" | ^~~~~~ It looks like we detected TensorFlow at configure time but couldn't set all the paths correctly. Differential Revision: https://reviews.llvm.org/D88371 (cherry picked from commit e72d792c147ee506e337401e20c0f23042cc43fe) --- llvm/CMakeLists.txt | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 915400af7a83..b8dabbbca05a 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -832,6 +832,11 @@ configure_file( ${LLVM_INCLUDE_DIR}/llvm/Config/Targets.def ) +# They are not referenced. See set_output_directory(). 
+set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin ) +set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) +set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) + # For up-to-date instructions for installing the Tensorflow dependency, refer to # the bot setup script: https://github.com/google/ml-compiler-opt/blob/master/buildbot/buildbot_init.sh # In this case, the latest C API library is available for download from @@ -840,9 +845,9 @@ configure_file( # LLVM_HAVE_TF_API, through llvm-config.h, so that a user of the LLVM library may # also leverage the dependency. set(TENSORFLOW_C_LIB_PATH "" CACHE PATH "Path to TensorFlow C library install") -find_library(tensorflow_c_api tensorflow PATHS ${TENSORFLOW_C_LIB_PATH}/lib) -if (tensorflow_c_api) +if (TENSORFLOW_C_LIB_PATH) + find_library(tensorflow_c_api tensorflow PATHS ${TENSORFLOW_C_LIB_PATH}/lib NO_DEFAULT_PATH REQUIRED) set(LLVM_HAVE_TF_API "ON" CACHE BOOL "Full Tensorflow API available") include_directories(${TENSORFLOW_C_LIB_PATH}/include) endif() @@ -877,12 +882,6 @@ add_custom_target(srpm COMMAND rpmbuild -bs --define '_topdir ${LLVM_SRPM_DIR}' ${LLVM_SRPM_BINARY_SPECFILE}) set_target_properties(srpm PROPERTIES FOLDER "Misc") - -# They are not referenced. See set_output_directory(). -set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin ) -set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) -set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} ) - if(APPLE AND DARWIN_LTO_LIBRARY) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-lto_library -Wl,${DARWIN_LTO_LIBRARY}") From ba223fa19d35d41ad9eeade8978ab1a17d6aafe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Marques?= Date: Mon, 7 Dec 2020 23:50:35 +0000 Subject: [PATCH 322/363] [Clang][CodeGen][RISCV] Add hard float ABI tests with empty struct This patch adds tests that showcase a behavior that is currently buggy. 
Fix in a follow-up patch. Differential Revision: https://reviews.llvm.org/D91269 (cherry picked from commit ca93f9abdc0abc96ca8fb7999549a50aadd95caf) --- clang/test/CodeGen/riscv32-ilp32d-abi.cpp | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 clang/test/CodeGen/riscv32-ilp32d-abi.cpp diff --git a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp b/clang/test/CodeGen/riscv32-ilp32d-abi.cpp new file mode 100644 index 000000000000..ffebb057e230 --- /dev/null +++ b/clang/test/CodeGen/riscv32-ilp32d-abi.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -triple riscv32 -target-feature +d -target-abi ilp32d \ +// RUN: -Wno-missing-declarations -emit-llvm %s -o - | FileCheck %s + +struct empty_float2 { struct {}; float f; float g; }; + +// CHECK: define float @_Z14f_empty_float212empty_float2(float %0, float %1) +// FIXME: Extraneous padding before the second float +// CHECK: { [4 x i8], float, [4 x i8], float } +float f_empty_float2(empty_float2 a) { + return a.g; +} + +struct empty_double2 { struct {}; double f; double g; }; + +// CHECK: define double @_Z15f_empty_double213empty_double2(double %0, double %1) +// FIXME: Extraneous padding before the second double +// CHECK: { [8 x i8], double, [8 x i8], double } +double f_empty_double2(empty_double2 a) { + return a.g; +} + +struct empty_float_double { struct {}; float f; double g; }; + +// CHECK: define double @_Z20f_empty_float_double18empty_float_double(float %0, double %1) +// CHECK: { [4 x i8], float, double } +double f_empty_float_double(empty_float_double a) { + return a.g; +} + +struct empty_double_float { struct {}; double f; float g; }; + +// CHECK: define double @_Z20f_empty_double_float18empty_double_float(double %0, float %1) +// FIXME: Extraneous padding before the float +// CHECK: { [8 x i8], double, [8 x i8], float } +double f_empty_double_float(empty_double_float a) { + return a.g; +} From a4eaecf122e1abbc5bc0f2478e80c6bb7da67cb0 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Lu=C3=ADs=20Marques?= Date: Mon, 7 Dec 2020 23:50:42 +0000 Subject: [PATCH 323/363] [Clang][CodeGen][RISCV] Fix hard float ABI test cases with empty struct The code seemed not to account for the field 1 offset. Differential Revision: https://reviews.llvm.org/D91270 (cherry picked from commit fa8f5bfa4e8cff042c9730320c74e97fab152ae1) --- clang/lib/CodeGen/TargetInfo.cpp | 10 +++++----- clang/test/CodeGen/riscv32-ilp32d-abi.cpp | 9 +++------ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index f10f8e58b78a..c5db0985c1bf 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10490,7 +10490,7 @@ bool RISCVABIInfo::detectFPCCEligibleStruct(QualType Ty, llvm::Type *&Field1Ty, NeededArgFPRs++; else if (Field2Ty) NeededArgGPRs++; - return IsCandidate; + return true; } // Call getCoerceAndExpand for the two-element flattened struct described by @@ -10516,15 +10516,15 @@ ABIArgInfo RISCVABIInfo::coerceAndExpandFPCCEligibleStruct( CharUnits Field2Align = CharUnits::fromQuantity(getDataLayout().getABITypeAlignment(Field2Ty)); - CharUnits Field1Size = + CharUnits Field1End = Field1Off + CharUnits::fromQuantity(getDataLayout().getTypeStoreSize(Field1Ty)); - CharUnits Field2OffNoPadNoPack = Field1Size.alignTo(Field2Align); + CharUnits Field2OffNoPadNoPack = Field1End.alignTo(Field2Align); CharUnits Padding = CharUnits::Zero(); if (Field2Off > Field2OffNoPadNoPack) Padding = Field2Off - Field2OffNoPadNoPack; - else if (Field2Off != Field2Align && Field2Off > Field1Size) - Padding = Field2Off - Field1Size; + else if (Field2Off != Field2Align && Field2Off > Field1End) + Padding = Field2Off - Field1End; bool IsPacked = !Field2Off.isMultipleOf(Field2Align); diff --git a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp b/clang/test/CodeGen/riscv32-ilp32d-abi.cpp index ffebb057e230..1018c78e168b 100644 --- a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp +++ 
b/clang/test/CodeGen/riscv32-ilp32d-abi.cpp @@ -4,8 +4,7 @@ struct empty_float2 { struct {}; float f; float g; }; // CHECK: define float @_Z14f_empty_float212empty_float2(float %0, float %1) -// FIXME: Extraneous padding before the second float -// CHECK: { [4 x i8], float, [4 x i8], float } +// CHECK: { [4 x i8], float, float } float f_empty_float2(empty_float2 a) { return a.g; } @@ -13,8 +12,7 @@ float f_empty_float2(empty_float2 a) { struct empty_double2 { struct {}; double f; double g; }; // CHECK: define double @_Z15f_empty_double213empty_double2(double %0, double %1) -// FIXME: Extraneous padding before the second double -// CHECK: { [8 x i8], double, [8 x i8], double } +// CHECK: { [8 x i8], double, double } double f_empty_double2(empty_double2 a) { return a.g; } @@ -30,8 +28,7 @@ double f_empty_float_double(empty_float_double a) { struct empty_double_float { struct {}; double f; float g; }; // CHECK: define double @_Z20f_empty_double_float18empty_double_float(double %0, float %1) -// FIXME: Extraneous padding before the float -// CHECK: { [8 x i8], double, [8 x i8], float } +// CHECK: { [8 x i8], double, float } double f_empty_double_float(empty_double_float a) { return a.g; } From b430f94d005276c8588b86dde7759be37a7a3420 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Marques?= Date: Mon, 7 Dec 2020 23:50:43 +0000 Subject: [PATCH 324/363] [Clang][CodeGen][RISCV] Fix hard float ABI for struct with empty struct and complex Fixes bug 44904. 
Differential Revision: https://reviews.llvm.org/D91278 (cherry picked from commit 3af354e863f553ef727967dfc091a64a11500aa5) --- clang/lib/CodeGen/TargetInfo.cpp | 1 - clang/test/CodeGen/riscv32-ilp32d-abi.cpp | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index c5db0985c1bf..a061651d8b21 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10395,7 +10395,6 @@ bool RISCVABIInfo::detectFPCCEligibleStructHelper(QualType Ty, CharUnits CurOff, return false; Field1Ty = CGT.ConvertType(EltTy); Field1Off = CurOff; - assert(CurOff.isZero() && "Unexpected offset for first field"); Field2Ty = Field1Ty; Field2Off = Field1Off + getContext().getTypeSizeInChars(EltTy); return true; diff --git a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp b/clang/test/CodeGen/riscv32-ilp32d-abi.cpp index 1018c78e168b..26d968be97df 100644 --- a/clang/test/CodeGen/riscv32-ilp32d-abi.cpp +++ b/clang/test/CodeGen/riscv32-ilp32d-abi.cpp @@ -32,3 +32,19 @@ struct empty_double_float { struct {}; double f; float g; }; double f_empty_double_float(empty_double_float a) { return a.g; } + +struct empty_complex_f { struct {}; float _Complex fc; }; + +// CHECK: define float @_Z17f_empty_complex_f15empty_complex_f(float %0, float %1) +// CHECK: { [4 x i8], float, float } +float f_empty_complex_f(empty_complex_f a) { + return __imag__ a.fc; +} + +struct empty_complex_d { struct {}; double _Complex fc; }; + +// CHECK: define double @_Z17f_empty_complex_d15empty_complex_d(double %0, double %1) +// CHECK: { [8 x i8], double, double } +double f_empty_complex_d(empty_complex_d a) { + return __imag__ a.fc; +} From 0feb4bc5295b373876823972d4b33e62b305cd0a Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Thu, 13 Aug 2020 20:56:32 +0900 Subject: [PATCH 325/363] Fix missed SI_RETURN_TO_EPILOG in pre-emit peephole SIPreEmitPeephole does not process all terminators, which means it can 
fail to handle SI_RETURN_TO_EPILOG if immediately preceeded by a branch to the early exit block. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D85872 (cherry picked from commit d538c5837a2cfedbf274133e29612da76003beed) --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 24 ++++--- .../transform-block-with-return-to-epilog.ll | 69 +++++++++++++++++++ 2 files changed, 85 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index f31c722db1b2..442be886a8ac 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -254,16 +254,24 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator(); - if (MBBE != MBB.end()) { - MachineInstr &MI = *MBBE; + MachineBasicBlock::iterator TermI = MBBE; + // Check first terminator for VCC branches to optimize + if (TermI != MBB.end()) { + MachineInstr &MI = *TermI; switch (MI.getOpcode()) { case AMDGPU::S_CBRANCH_VCCZ: case AMDGPU::S_CBRANCH_VCCNZ: Changed |= optimizeVccBranch(MI); continue; - case AMDGPU::SI_RETURN_TO_EPILOG: - // FIXME: This is not an optimization and should be - // moved somewhere else. + default: + break; + } + } + // Check all terminators for SI_RETURN_TO_EPILOG + // FIXME: This is not an optimization and should be moved somewhere else. 
+ while (TermI != MBB.end()) { + MachineInstr &MI = *TermI; + if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { assert(!MF.getInfo()->returnsVoid()); // Graphics shaders returning non-void shouldn't contain S_ENDPGM, @@ -281,11 +289,11 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { .addMBB(EmptyMBBAtEnd); MI.eraseFromParent(); MBBE = MBB.getFirstTerminator(); + TermI = MBBE; + continue; } - break; - default: - break; } + TermI++; } if (!ST.hasVGPRIndexMode()) diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index c34a62bfc31d..416a72d51f99 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -78,4 +78,73 @@ else: ; preds = %else.if.cond unreachable } +define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 { + ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill + ; GCN: bb.0.entry: + ; GCN: successors: %bb.1(0x40000000), %bb.4(0x40000000) + ; GCN: liveins: $vgpr0 + ; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN: $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: renamable $sgpr0_sgpr1 = S_XOR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc + ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN: bb.1.flow.preheader: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: liveins: $vgpr0, $sgpr0_sgpr1 + ; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN: renamable $sgpr2_sgpr3 = S_MOV_B64 0 + ; GCN: bb.2.flow: + ; GCN: successors: %bb.3(0x04000000), %bb.2(0x7c000000) + ; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3 + ; 
GCN: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc + ; GCN: renamable $sgpr2_sgpr3 = S_OR_B64 killed renamable $sgpr4_sgpr5, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GCN: bb.3.Flow: + ; GCN: successors: %bb.4(0x80000000) + ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN: bb.4.Flow1: + ; GCN: successors: %bb.5(0x40000000), %bb.6(0x40000000) + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: renamable $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc + ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec + ; GCN: bb.5.kill0: + ; GCN: successors: %bb.6(0x80000000) + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: $exec = S_MOV_B64 0 + ; GCN: bb.6.end: + ; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; GCN: liveins: $sgpr0_sgpr1 + ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc + ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec + ; GCN: S_BRANCH %bb.8 + ; GCN: bb.7: + ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; GCN: S_ENDPGM 0 + ; GCN: bb.8: +entry: + %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val + %cmp0 = fcmp olt float %.i0, 0.000000e+00 + br i1 %cmp0, label %kill0, label %flow + +kill0: ; preds = %entry + call void @llvm.amdgcn.kill(i1 false) + br label %end + +flow: ; preds = %entry + %cmp1 = fcmp olt float %val, 0.000000e+00 + br i1 %cmp1, label %flow, label %end + +kill1: ; preds = %flow + call void @llvm.amdgcn.kill(i1 false) + br label %end + +end: ; preds = %kill0, %kill1, %flow + ret { <4 x float> } undef +} + +declare void @llvm.amdgcn.kill(i1) #0 + attributes #0 = { nounwind 
} From a21e609d6a255f893fa7cbd863a3bc5c017c478e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 18 Nov 2020 14:06:19 +0000 Subject: [PATCH 326/363] [X86] Add broadcast merge test case for PR48215 (cherry picked from commit 8270f8c252d7013761c54e5bf528ac3e4e3b517c) Signed-off-by: Warren Ristow --- llvm/test/CodeGen/X86/pr48215.ll | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr48215.ll diff --git a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll new file mode 100644 index 000000000000..c825955a2970 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr48215.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 + +; FIXME: Ensure when we merge broadcasts to different widths that they come from the same SDValue. 
+define i32 @PR48215(i32 %a0, i32 %a1) { +; AVX1-LABEL: PR48215: +; AVX1: ## %bb.0: +; AVX1-NEXT: movl %edi, %eax +; AVX1-NEXT: cltd +; AVX1-NEXT: idivl %esi +; AVX1-NEXT: vmovd %edx, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3] +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vmovmskps %ymm2, %ecx +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovmskps %xmm0, %eax +; AVX1-NEXT: addl %ecx, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR48215: +; AVX2: ## %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: cltd +; AVX2-NEXT: idivl %esi +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7] +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovmskps %ymm1, %ecx +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR48215: +; AVX512: ## %bb.0: +; AVX512-NEXT: movl %edi, %eax +; AVX512-NEXT: cltd +; AVX512-NEXT: idivl %esi +; AVX512-NEXT: vpbroadcastd %eax, %ymm0 +; AVX512-NEXT: vpcmpltd {{.*}}(%rip), %ymm0, %k0 +; AVX512-NEXT: vpcmpltd {{.*}}(%rip), %xmm0, %k1 +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: movzbl %al, %ecx +; AVX512-NEXT: kmovw %k1, %eax +; AVX512-NEXT: andl $15, %eax +; AVX512-NEXT: addl %ecx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %d = sdiv i32 %a0, %a1 + %r = srem i32 %a0, %a1 + %dv0 = insertelement <8 x i32> undef, i32 %d, i32 0 + %rv0 = insertelement <4 x i32> undef, i32 %r, i32 0 + %dv1 = shufflevector <8 x i32> %dv0, <8 x i32> undef, <8 x i32> zeroinitializer + 
%rv1 = shufflevector <4 x i32> %rv0, <4 x i32> undef, <4 x i32> zeroinitializer + %dc0 = icmp slt <8 x i32> %dv1, + %rc0 = icmp slt <4 x i32> %rv1, + %db0 = bitcast <8 x i1> %dc0 to i8 + %rb0 = bitcast <4 x i1> %rc0 to i4 + %db1 = zext i8 %db0 to i32 + %rb1 = zext i4 %rb0 to i32 + %res = add i32 %db1, %rb1 + ret i32 %res +} From 14d60e9a80d40f9efc4b76524a07320d38994d2b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 19 Nov 2020 12:12:16 +0000 Subject: [PATCH 327/363] [X86][AVX] Only share broadcasts of different widths from the same SDValue of the same SDNode (PR48215) D57663 allowed us to reuse broadcasts of the same scalar value by extracting low subvectors from the widest type. Unfortunately we weren't ensuring the broadcasts were from the same SDValue, just the same SDNode - which failed on multiple-value nodes like ISD::SDIVREM FYI: I intend to request this be merged into the 11.x release branch. Differential Revision: https://reviews.llvm.org/D91709 (cherry picked from commit 14ae02fb3397961bb5f99a0df60622375fc1976d) Signed-off-by: Warren Ristow --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ llvm/test/CodeGen/X86/pr48215.ll | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index afe470cc6e0b..f5b704ebbe9d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36018,8 +36018,10 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); // Share broadcast with the longest vector and extract low subvector (free). + // Ensure the same SDValue from the SDNode use is being used. 
for (SDNode *User : Src->uses()) if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && + Src == User->getOperand(0) && User->getValueSizeInBits(0) > VT.getSizeInBits()) { return extractSubVector(SDValue(User, 0), 0, DAG, DL, VT.getSizeInBits()); diff --git a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll index c825955a2970..125bde728c3f 100644 --- a/llvm/test/CodeGen/X86/pr48215.ll +++ b/llvm/test/CodeGen/X86/pr48215.ll @@ -33,12 +33,14 @@ define i32 @PR48215(i32 %a0, i32 %a1) { ; AVX2-NEXT: idivl %esi ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7] +; AVX2-NEXT: vmovd %edx, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7] -; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vmovmskps %ymm1, %ecx -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovmskps %xmm0, %eax +; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vmovmskps %ymm0, %ecx +; AVX2-NEXT: vmovmskps %xmm1, %eax ; AVX2-NEXT: addl %ecx, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -49,8 +51,9 @@ define i32 @PR48215(i32 %a0, i32 %a1) { ; AVX512-NEXT: cltd ; AVX512-NEXT: idivl %esi ; AVX512-NEXT: vpbroadcastd %eax, %ymm0 +; AVX512-NEXT: vpbroadcastd %edx, %xmm1 ; AVX512-NEXT: vpcmpltd {{.*}}(%rip), %ymm0, %k0 -; AVX512-NEXT: vpcmpltd {{.*}}(%rip), %xmm0, %k1 +; AVX512-NEXT: vpcmpltd {{.*}}(%rip), %xmm1, %k1 ; AVX512-NEXT: kmovw %k0, %eax ; AVX512-NEXT: movzbl %al, %ecx ; AVX512-NEXT: kmovw %k1, %eax From aa29049404efdc0134066839bc14d135d69ec225 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 19 Nov 2020 10:06:57 -0800 Subject: [PATCH 328/363] [CUDA] Unbreak CUDA compilation with -std=c++20 Standard libc++ headers in stdc++ mode include which picks up cuda_wrappers/new before any of the CUDA macros have been defined. 
We can not include CUDA headers that early, so the work-around is to define __device__ in the wrapper header itself. Differential Revision: https://reviews.llvm.org/D91807 (cherry picked from commit 9a465057a64dba8a8614424d26136f5c0452bcc3) --- clang/lib/Headers/cuda_wrappers/new | 38 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/clang/lib/Headers/cuda_wrappers/new b/clang/lib/Headers/cuda_wrappers/new index f49811c5a57c..47690f1152fe 100644 --- a/clang/lib/Headers/cuda_wrappers/new +++ b/clang/lib/Headers/cuda_wrappers/new @@ -33,66 +33,76 @@ #define CUDA_NOEXCEPT #endif +#pragma push_macro("__DEVICE__") +#if defined __device__ +#define __DEVICE__ __device__ +#else +// has been included too early from the standard libc++ headers and the +// standard CUDA macros are not available yet. We have to define our own. +#define __DEVICE__ __attribute__((device)) +#endif + // Device overrides for non-placement new and delete. -__device__ inline void *operator new(__SIZE_TYPE__ size) { +__DEVICE__ inline void *operator new(__SIZE_TYPE__ size) { if (size == 0) { size = 1; } return ::malloc(size); } -__device__ inline void *operator new(__SIZE_TYPE__ size, +__DEVICE__ inline void *operator new(__SIZE_TYPE__ size, const std::nothrow_t &) CUDA_NOEXCEPT { return ::operator new(size); } -__device__ inline void *operator new[](__SIZE_TYPE__ size) { +__DEVICE__ inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); } -__device__ inline void *operator new[](__SIZE_TYPE__ size, +__DEVICE__ inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) { return ::operator new(size); } -__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { +__DEVICE__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { if (ptr) { ::free(ptr); } } -__device__ inline void operator delete(void *ptr, +__DEVICE__ inline void operator delete(void *ptr, const std::nothrow_t &) CUDA_NOEXCEPT { ::operator delete(ptr); 
} -__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { +__DEVICE__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { ::operator delete(ptr); } -__device__ inline void operator delete[](void *ptr, +__DEVICE__ inline void operator delete[](void *ptr, const std::nothrow_t &) CUDA_NOEXCEPT { ::operator delete(ptr); } // Sized delete, C++14 only. #if __cplusplus >= 201402L -__device__ inline void operator delete(void *ptr, +__DEVICE__ inline void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } -__device__ inline void operator delete[](void *ptr, +__DEVICE__ inline void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } #endif // Device overrides for placement new and delete. -__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { +__DEVICE__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } -__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { +__DEVICE__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } -__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} -__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} +__DEVICE__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} +__DEVICE__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} +#pragma pop_macro("__DEVICE__") #pragma pop_macro("CUDA_NOEXCEPT") #endif // include guard From 59012b685fd69d7350eb55166a8817688e413db8 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Fri, 4 Dec 2020 11:27:39 -0800 Subject: [PATCH 329/363] [CUDA] Another attempt to fix early inclusion of from libstdc++ Previous patch (9a465057a64dba) did not fix the problem. https://bugs.llvm.org/show_bug.cgi?id=48228 If the is included too early, before CUDA-specific defines are available, just include-next the standard and undo the include guard. 
CUDA-specific variants of operator new/delete will be declared if/when is used from the CUDA source itself, when all CUDA-related macros are available. Differential Revision: https://reviews.llvm.org/D91807 (cherry picked from commit 43267929423bf768bbbcc65e47a07e37af7f4e22) --- clang/lib/Headers/cuda_wrappers/new | 46 ++++++++++++++--------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/clang/lib/Headers/cuda_wrappers/new b/clang/lib/Headers/cuda_wrappers/new index 47690f1152fe..7f255314056a 100644 --- a/clang/lib/Headers/cuda_wrappers/new +++ b/clang/lib/Headers/cuda_wrappers/new @@ -26,6 +26,13 @@ #include_next +#if !defined(__device__) +// The header has been included too early from the standard C++ library +// and CUDA-specific macros are not available yet. +// Undo the include guard and try again later. +#undef __CLANG_CUDA_WRAPPERS_NEW +#else + #pragma push_macro("CUDA_NOEXCEPT") #if __cplusplus >= 201103L #define CUDA_NOEXCEPT noexcept @@ -33,76 +40,67 @@ #define CUDA_NOEXCEPT #endif -#pragma push_macro("__DEVICE__") -#if defined __device__ -#define __DEVICE__ __device__ -#else -// has been included too early from the standard libc++ headers and the -// standard CUDA macros are not available yet. We have to define our own. -#define __DEVICE__ __attribute__((device)) -#endif - // Device overrides for non-placement new and delete. 
-__DEVICE__ inline void *operator new(__SIZE_TYPE__ size) { +__device__ inline void *operator new(__SIZE_TYPE__ size) { if (size == 0) { size = 1; } return ::malloc(size); } -__DEVICE__ inline void *operator new(__SIZE_TYPE__ size, +__device__ inline void *operator new(__SIZE_TYPE__ size, const std::nothrow_t &) CUDA_NOEXCEPT { return ::operator new(size); } -__DEVICE__ inline void *operator new[](__SIZE_TYPE__ size) { +__device__ inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); } -__DEVICE__ inline void *operator new[](__SIZE_TYPE__ size, +__device__ inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) { return ::operator new(size); } -__DEVICE__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { +__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { if (ptr) { ::free(ptr); } } -__DEVICE__ inline void operator delete(void *ptr, +__device__ inline void operator delete(void *ptr, const std::nothrow_t &) CUDA_NOEXCEPT { ::operator delete(ptr); } -__DEVICE__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { +__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { ::operator delete(ptr); } -__DEVICE__ inline void operator delete[](void *ptr, +__device__ inline void operator delete[](void *ptr, const std::nothrow_t &) CUDA_NOEXCEPT { ::operator delete(ptr); } // Sized delete, C++14 only. #if __cplusplus >= 201402L -__DEVICE__ inline void operator delete(void *ptr, +__device__ inline void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } -__DEVICE__ inline void operator delete[](void *ptr, +__device__ inline void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } #endif // Device overrides for placement new and delete. 
-__DEVICE__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { +__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } -__DEVICE__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { +__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } -__DEVICE__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} -__DEVICE__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} +__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} +__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} -#pragma pop_macro("__DEVICE__") #pragma pop_macro("CUDA_NOEXCEPT") +#endif // __device__ #endif // include guard From b091768e60e6807ae3806acaba1cbc9b1c96b388 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 4 Dec 2020 03:13:00 +0000 Subject: [PATCH 330/363] [LLD][COFF] Fix crash with /summary and PCH input files Before this patch /summary was crashing with some .PCH.OBJ files, because tpiMap[srcIdx++] was reading at the wrong location. When the TpiSource depends on a .PCH.OBJ file, the types should be offset by the previously merged PCH.OBJ set of indices. 
Differential Revision: https://reviews.llvm.org/D88678 (cherry picked from commit 4140f0744fb2deccb74e77282e23ff731f67821b) --- lld/COFF/DebugTypes.cpp | 5 +- lld/test/COFF/Inputs/precomp2-a.yaml | 84 +++++++++++++++++++++++++ lld/test/COFF/Inputs/precomp2.yaml | 82 ++++++++++++++++++++++++ lld/test/COFF/precomp-summary-fail.test | 21 +++++++ 4 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 lld/test/COFF/Inputs/precomp2-a.yaml create mode 100644 lld/test/COFF/Inputs/precomp2.yaml create mode 100644 lld/test/COFF/precomp-summary-fail.test diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp index 4790b0166799..abe3bb9eef5b 100644 --- a/lld/COFF/DebugTypes.cpp +++ b/lld/COFF/DebugTypes.cpp @@ -202,6 +202,9 @@ Expected TpiSource::mergeDebugT(TypeMerger *m, BinaryStreamReader reader(file->debugTypes, support::little); cantFail(reader.readArray(types, reader.getLength())); + // When dealing with PCH.OBJ, some indices were already merged. + unsigned nbHeadIndices = indexMap->tpiMap.size(); + if (config->debugGHashes) { ArrayRef hashes; std::vector ownedHashes; @@ -232,7 +235,7 @@ Expected TpiSource::mergeDebugT(TypeMerger *m, // collecting statistics. 
m->tpiCounts.resize(m->getTypeTable().size()); m->ipiCounts.resize(m->getIDTable().size()); - uint32_t srcIdx = 0; + uint32_t srcIdx = nbHeadIndices; for (CVType &ty : types) { TypeIndex dstIdx = indexMap->tpiMap[srcIdx++]; // Type merging may fail, so a complex source type may become the simple diff --git a/lld/test/COFF/Inputs/precomp2-a.yaml b/lld/test/COFF/Inputs/precomp2-a.yaml new file mode 100644 index 000000000000..a9d497ba10a3 --- /dev/null +++ b/lld/test/COFF/Inputs/precomp2-a.yaml @@ -0,0 +1,84 @@ +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: '.debug$S' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + Alignment: 1 + Subsections: + - !Symbols + Records: + - Kind: S_OBJNAME + ObjNameSym: + Signature: 545589255 + ObjectName: 'D:\llvm-project\lld\test\COFF\Inputs\precomp2-a.obj' + - Kind: S_COMPILE3 + Compile3Sym: + Flags: [ SecurityChecks, HotPatch ] + Machine: X64 + FrontendMajor: 19 + FrontendMinor: 13 + FrontendBuild: 26131 + FrontendQFE: 1 + BackendMajor: 19 + BackendMinor: 13 + BackendBuild: 26131 + BackendQFE: 1 + Version: 'Microsoft (R) Optimizing Compiler' + - !StringTable + Strings: + - 'D:\llvm-project\lld\test\COFF\precomp\precomp.pch' + - 'D:\llvm-project\lld\test\COFF\precomp\precomp.h' + - 'D:\llvm-project\lld\test\COFF\precomp\a.cpp' + - Name: '.debug$T' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + Alignment: 1 + Types: + - Kind: LF_PRECOMP + Precomp: + StartTypeIndex: 4096 + TypesCount: 3 + Signature: 545589255 + PrecompFilePath: 'D:\llvm-project\lld\test\COFF\Inputs\precomp2.obj' + - Kind: LF_STRING_ID + StringId: + Id: 0 + String: 'test test test test test' + - Kind: LF_STRING_ID + StringId: + Id: 0 + String: 'test test test test test' + - Kind: LF_STRING_ID + StringId: + Id: 0 + String: 'test test test test test' + - Kind: LF_BUILDINFO + BuildInfo: + ArgIndices: [ 4101, 4101, 
4101, 4101, 4101 ] +symbols: + - Name: '.debug$S' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 + - Name: '.debug$T' + Value: 0 + SectionNumber: 2 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 +... diff --git a/lld/test/COFF/Inputs/precomp2.yaml b/lld/test/COFF/Inputs/precomp2.yaml new file mode 100644 index 000000000000..7a4ec2f25af4 --- /dev/null +++ b/lld/test/COFF/Inputs/precomp2.yaml @@ -0,0 +1,82 @@ +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: '.debug$S' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + Alignment: 1 + Subsections: + - !Symbols + Records: + - Kind: S_OBJNAME + ObjNameSym: + Signature: 545589255 + ObjectName: 'D:\llvm-project\lld\test\COFF\Inputs\precomp2.obj' + - Kind: S_COMPILE3 + Compile3Sym: + Flags: [ SecurityChecks, HotPatch ] + Machine: X64 + FrontendMajor: 19 + FrontendMinor: 13 + FrontendBuild: 26131 + FrontendQFE: 1 + BackendMajor: 19 + BackendMinor: 13 + BackendBuild: 26131 + BackendQFE: 1 + Version: 'Microsoft (R) Optimizing Compiler' + - !StringTable + Strings: + - 'D:\llvm-project\lld\test\COFF\precomp\precomp.pch' + - Name: '.debug$P' + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + Alignment: 1 + PrecompTypes: + - Kind: LF_STRUCTURE + Class: + MemberCount: 0 + Options: [ None, ForwardReference, HasUniqueName ] + FieldList: 0 + Name: _s__CatchableType + UniqueName: '.?AU_s__CatchableType@@' + DerivationList: 0 + VTableShape: 0 + Size: 0 + - Kind: LF_MODIFIER + Modifier: + ModifiedType: 4096 + Modifiers: 
[ None, Const ] + - Kind: LF_POINTER + Pointer: + ReferentType: 4096 + Attrs: 65548 + - Kind: LF_ENDPRECOMP + EndPrecomp: + Signature: 545589255 +symbols: + - Name: '.debug$S' + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 + - Name: '.debug$P' + Value: 0 + SectionNumber: 2 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 +... diff --git a/lld/test/COFF/precomp-summary-fail.test b/lld/test/COFF/precomp-summary-fail.test new file mode 100644 index 000000000000..6c48ba5f0c45 --- /dev/null +++ b/lld/test/COFF/precomp-summary-fail.test @@ -0,0 +1,21 @@ + +The input files were tailored so that we end up with a resulting IPI stream +smaller than the TPI stream, which would previously trigger a crash with +/summary. 
+ +RUN: rm -rf %t && mkdir %t +RUN: yaml2obj < %S/Inputs/precomp2.yaml -o %t\precomp2.obj +RUN: yaml2obj < %S/Inputs/precomp2-a.yaml -o %t\precomp2-a.obj +RUN: lld-link %t\precomp2-a.obj %t\precomp2.obj /nodefaultlib /noentry \ +RUN: /dll /out:%t.dll /debug /summary | FileCheck %s -check-prefix SUMMARY + +SUMMARY: Summary +SUMMARY-NEXT: -------------------------------------------------------------------------------- +SUMMARY-NEXT: 2 Input OBJ files (expanded from all cmd-line inputs) +SUMMARY-NEXT: 0 PDB type server dependencies +SUMMARY-NEXT: 1 Precomp OBJ dependencies +SUMMARY-NEXT: 5 Merged TPI records +SUMMARY-NEXT: 1 Output PDB strings +SUMMARY-NEXT: 0 Global symbol records +SUMMARY-NEXT: 4 Module symbol records +SUMMARY-NEXT: 0 Public symbol records From 852f4d8eb6d317be0947055c0bb6b4fd6c9aa930 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 2 Oct 2020 20:05:09 -0400 Subject: [PATCH 331/363] [Sparc] Remove cast that truncates immediate operands to 32 bits. Patch by: Mark Kettenis Test provided by Jessica Clarke. 
Differential Revision: https://reviews.llvm.org/D87210 (cherry picked from commit 9ae95a0f8f1bc9bd9e8eb30a5a9444fbdca5cc29) --- llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 2 +- llvm/test/CodeGen/SPARC/inlineasm-v9.ll | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index 069e43c6f544..7845a18b14c1 100644 --- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -351,7 +351,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, break; case MachineOperand::MO_Immediate: - O << (int)MO.getImm(); + O << MO.getImm(); break; case MachineOperand::MO_MachineBasicBlock: MO.getMBB()->getSymbol()->print(O, MAI); diff --git a/llvm/test/CodeGen/SPARC/inlineasm-v9.ll b/llvm/test/CodeGen/SPARC/inlineasm-v9.ll index 53ab114dd8d5..1388c8655ace 100644 --- a/llvm/test/CodeGen/SPARC/inlineasm-v9.ll +++ b/llvm/test/CodeGen/SPARC/inlineasm-v9.ll @@ -39,3 +39,12 @@ entry: tail call void asm sideeffect "faddq $0,$1,$2", "{f40},{f40},{f40}"(fp128 0xL0, fp128 0xL0, fp128 0xL0) ret void } + +;; Ensure that 64-bit immediates aren't truncated +; CHECK-LABEL: test_large_immediate +; CHECK: or %o0, %lo(4294967296), %o0 +define i64 @test_large_immediate(i64) { +entry: + %1 = tail call i64 asm "or $0, %lo($1), $0", "=r,i,r"(i64 4294967296, i64 %0) + ret i64 %1 +} From 561e1ce1a82e98df60074cef6b63f640f4ef712c Mon Sep 17 00:00:00 2001 From: Joseph Tremoulet Date: Wed, 23 Sep 2020 06:00:50 -0700 Subject: [PATCH 332/363] [lldb] Fix GetRemoteSharedModule fallback logic When the various methods of locating the module in GetRemoteSharedModule fail, make sure we pass the original module spec to the bail-out call to the provided resolver function. Also make sure we consistently use the resolved module spec from the various success paths. 
Thanks to what appears to have been an accidentally inverted condition (commit 85967fa applied the new condition to a path where GetModuleSpec returns false, but should have applied it when GetModuleSpec returns true), without this fix we only pass the original module spec in the fallback if the original spec has no uuid (or has a uuid that somehow matches the resolved module's uuid despite the call to GetModuleSpec failing). This manifested as a bug when processing a minidump file with a user-provided sysroot, since in that case the resolver call was being applied to resolved_module_spec (despite resolution failing), which did not have the path of its file_spec set. Reviewed By: JDevlieghere Differential Revision: https://reviews.llvm.org/D88099 (cherry picked from commit 20f84257ac4ac54ceb5f581a6081fac6eff2a5a1) --- lldb/source/Target/Platform.cpp | 16 +++++++++--- .../minidump-new/TestMiniDumpNew.py | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index 16787141bee0..34ed7872c720 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -1564,21 +1564,29 @@ Status Platform::GetRemoteSharedModule(const ModuleSpec &module_spec, if (error.Success() && module_sp) break; } - if (module_sp) + if (module_sp) { + resolved_module_spec = arch_module_spec; got_module_spec = true; + } } if (!got_module_spec) { // Get module information from a target. - if (!GetModuleSpec(module_spec.GetFileSpec(), module_spec.GetArchitecture(), - resolved_module_spec)) { + if (GetModuleSpec(module_spec.GetFileSpec(), module_spec.GetArchitecture(), + resolved_module_spec)) { if (!module_spec.GetUUID().IsValid() || module_spec.GetUUID() == resolved_module_spec.GetUUID()) { - return module_resolver(module_spec); + got_module_spec = true; } } } + if (!got_module_spec) { + // Fall back to the given module resolver, which may have its own + // search logic. 
+ return module_resolver(module_spec); + } + // If we are looking for a specific UUID, make sure resolved_module_spec has // the same one before we search. if (module_spec.GetUUID().IsValid()) { diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py index 012f9b67d9e3..9d2daec67698 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py @@ -455,3 +455,29 @@ def check_region(index, start, end, read, write, execute, mapped, name): check_region(17, 0x40169000, 0x4016b000, True, True, False, True, d) check_region(18, 0x4016b000, 0x40176000, True, True, False, True, n) check_region(-1, 0x40176000, max_int, False, False, False, False, n) + + @skipIfLLVMTargetMissing("X86") + def test_minidump_sysroot(self): + """Test that lldb can find a module referenced in an i386 linux minidump using the sysroot.""" + + # Copy linux-x86_64 executable to tmp_sysroot/temp/test/ (since it was compiled as + # /tmp/test/linux-x86_64) + tmp_sysroot = os.path.join( + self.getBuildDir(), "lldb_i386_mock_sysroot") + executable = os.path.join( + tmp_sysroot, "tmp", "test", "linux-x86_64") + exe_dir = os.path.dirname(executable) + lldbutil.mkdir_p(exe_dir) + shutil.copyfile("linux-x86_64", executable) + + # Set sysroot and load core + self.runCmd("platform select remote-linux --sysroot '%s'" % + tmp_sysroot) + self.process_from_yaml("linux-x86_64.yaml") + self.check_state() + + # Check that we loaded the module from the sysroot + self.assertEqual(self.target.GetNumModules(), 1) + module = self.target.GetModuleAtIndex(0) + spec = module.GetFileSpec() + self.assertEqual(spec.GetDirectory(), exe_dir) From 98fa273339a474ae9dcc3295ef10e75e38589dda Mon Sep 17 00:00:00 2001 From: Joseph Tremoulet Date: Wed, 23 Sep 2020 09:20:10 -0700 Subject: [PATCH 333/363] [lldb] Normalize paths in new 
test The minidump-sysroot test I added in commit 20f84257 compares two paths using a string comparison. This causes the Windows buildbot to fail because of mismatched forward slashes and backslashes. Use os.path.normcase to normalize before comparing. (cherry picked from commit 4a55c98fa7bee1e5ab1504db20ca4d7c8a083111) --- .../postmortem/minidump-new/TestMiniDumpNew.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py index 9d2daec67698..103e86efc54d 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py @@ -479,5 +479,6 @@ def test_minidump_sysroot(self): # Check that we loaded the module from the sysroot self.assertEqual(self.target.GetNumModules(), 1) module = self.target.GetModuleAtIndex(0) - spec = module.GetFileSpec() - self.assertEqual(spec.GetDirectory(), exe_dir) + spec_dir_norm = os.path.normcase(module.GetFileSpec().GetDirectory()) + exe_dir_norm = os.path.normcase(exe_dir) + self.assertEqual(spec_dir_norm, exe_dir_norm) From 393eac16e497d2e7cf67d881ba33644060f35c79 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Mon, 10 Aug 2020 15:07:47 -0700 Subject: [PATCH 334/363] Add hashing of the .text section to ProcessMinidump. Breakpad will always have a UUID for binaries when it creates minidump files. If an ELF files has a GNU build ID, it will use that. If it doesn't, it will create one by hashing up to the first 4096 bytes of the .text section. LLDB was not able to load these binaries even when we had the right binary because the UUID didn't match. LLDB will use the GNU build ID first as the main UUID for a binary and fallback onto a 8 byte CRC if a binary doesn't have one. 
With this fix, we will check for the Breakpad hash or the Facebook hash (a modified version of the breakpad hash that collides a bit less) and accept binaries when these hashes match. Differential Revision: https://reviews.llvm.org/D86261 (cherry picked from commit 0e6c9a6e7940a2f8ee624358d828acffdb9ccca5) --- .../Process/minidump/ProcessMinidump.cpp | 89 +++++++++++++++++++ .../minidump-new/TestMiniDumpUUID.py | 63 +++++++++++++ .../minidump-new/libbreakpad-overflow.yaml | 21 +++++ .../postmortem/minidump-new/libbreakpad.yaml | 15 ++++ .../linux-arm-breakpad-uuid-match.yaml | 15 ++++ .../linux-arm-facebook-uuid-match.yaml | 15 ++++ 6 files changed, 218 insertions(+) create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index fc8ee346f449..af378ea7741f 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -121,6 +121,72 @@ class PlaceholderObjectFile : public ObjectFile { lldb::addr_t m_base; lldb::addr_t m_size; }; + +/// Duplicate the HashElfTextSection() from the breakpad sources. +/// +/// Breakpad, a Google crash log reporting tool suite, creates minidump files +/// for many different architectures. When using Breakpad to create ELF +/// minidumps, it will check for a GNU build ID when creating a minidump file +/// and if one doesn't exist in the file, it will say the UUID of the file is a +/// checksum of up to the first 4096 bytes of the .text section. 
Facebook also +/// uses breakpad and modified this hash to avoid collisions so we can +/// calculate and check for this as well. +/// +/// The breakpad code might end up hashing up to 15 bytes that immediately +/// follow the .text section in the file, so this code must do exactly what it +/// does so we can get an exact match for the UUID. +/// +/// \param[in] module_sp The module to grab the .text section from. +/// +/// \param[in/out] breakpad_uuid A vector that will receive the calculated +/// breakpad .text hash. +/// +/// \param[in/out] facebook_uuid A vector that will receive the calculated +/// facebook .text hash. +/// +void HashElfTextSection(ModuleSP module_sp, std::vector &breakpad_uuid, + std::vector &facebook_uuid) { + SectionList *sect_list = module_sp->GetSectionList(); + if (sect_list == nullptr) + return; + SectionSP sect_sp = sect_list->FindSectionByName(ConstString(".text")); + if (!sect_sp) + return; + constexpr size_t kMDGUIDSize = 16; + constexpr size_t kBreakpadPageSize = 4096; + // The breakpad code has a bug where it might access beyond the end of a + // .text section by up to 15 bytes, so we must ensure we round up to the + // next kMDGUIDSize byte boundary. + DataExtractor data; + const size_t text_size = sect_sp->GetFileSize(); + const size_t read_size = std::min( + llvm::alignTo(text_size, kMDGUIDSize), kBreakpadPageSize); + sect_sp->GetObjectFile()->GetData(sect_sp->GetFileOffset(), read_size, data); + + breakpad_uuid.assign(kMDGUIDSize, 0); + facebook_uuid.assign(kMDGUIDSize, 0); + + // The only difference between the breakpad hash and the facebook hash is the + // hashing of the text section size into the hash prior to hashing the .text + // contents. 
+ for (size_t i = 0; i < kMDGUIDSize; i++) + facebook_uuid[i] ^= text_size % 255; + + // This code carefully duplicates how the hash was created in Breakpad + // sources, including the error where it might has an extra 15 bytes past the + // end of the .text section if the .text section is less than a page size in + // length. + const uint8_t *ptr = data.GetDataStart(); + const uint8_t *ptr_end = data.GetDataEnd(); + while (ptr < ptr_end) { + for (unsigned i = 0; i < kMDGUIDSize; i++) { + breakpad_uuid[i] ^= ptr[i]; + facebook_uuid[i] ^= ptr[i]; + } + ptr += kMDGUIDSize; + } +} + } // namespace ConstString ProcessMinidump::GetPluginNameStatic() { @@ -494,10 +560,33 @@ void ProcessMinidump::ReadModuleList() { const bool match = dmp_bytes.empty() || mod_bytes.empty() || mod_bytes.take_front(dmp_bytes.size()) == dmp_bytes; if (!match) { + // Breakpad generates minindump files, and if there is no GNU build + // ID in the binary, it will calculate a UUID by hashing first 4096 + // bytes of the .text section and using that as the UUID for a module + // in the minidump. Facebook uses a modified breakpad client that + // uses a slightly modified this hash to avoid collisions. Check for + // UUIDs from the minindump that match these cases and accept the + // module we find if they do match. + std::vector breakpad_uuid; + std::vector facebook_uuid; + HashElfTextSection(module_sp, breakpad_uuid, facebook_uuid); + if (dmp_bytes == llvm::ArrayRef(breakpad_uuid)) { + LLDB_LOG(log, "Breakpad .text hash match for {0}.", name); + } else if (dmp_bytes == llvm::ArrayRef(facebook_uuid)) { + LLDB_LOG(log, "Facebook .text hash match for {0}.", name); + } else { + // The UUID wasn't a partial match and didn't match the .text hash + // so remove the module from the target, we will need to create a + // placeholder object file. 
GetTarget().GetImages().Remove(module_sp); module_sp.reset(); + } + } else { + LLDB_LOG(log, "Partial uuid match for {0}.", name); } } + } else { + LLDB_LOG(log, "Full uuid match for {0}.", name); } if (module_sp) { // Watch out for place holder modules that have different paths, but the diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py index cc6d6fb37cae..c4dcddba631b 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py @@ -179,6 +179,69 @@ def test_partial_uuid_mismatch(self): "/invalid/path/on/current/system/libuuidmismatch.so", "7295E17C-6668-9E05-CBB5-DEE5003865D5") + def test_breakpad_hash_match(self): + """ + Breakpad creates minidump files using CvRecord in each module whose + signature is set to PDB70 where the UUID is a hash generated by + breakpad of the .text section. This is only done when the + executable has no ELF build ID. + + This test verifies that if we have a minidump with a 16 byte UUID, + that we are able to associate a symbol file with no ELF build ID + and match it up by hashing the .text section. + """ + so_path = self.getBuildArtifact("libbreakpad.so") + self.yaml2obj("libbreakpad.yaml", so_path) + cmd = 'settings set target.exec-search-paths "%s"' % (os.path.dirname(so_path)) + self.dbg.HandleCommand(cmd) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up it own UUID as well when there is no build ID so we + # will check that this matches. 
+ self.verify_module(modules[0], so_path, "D9C480E8") + + def test_breakpad_overflow_hash_match(self): + """ + This is a similar to test_breakpad_hash_match, but it verifies that + if the .text section does not end on a 16 byte boundary, then it + will overflow into the next section's data by up to 15 bytes. This + verifies that we are able to match what breakpad does as it will do + this. + """ + so_path = self.getBuildArtifact("libbreakpad.so") + self.yaml2obj("libbreakpad-overflow.yaml", so_path) + cmd = 'settings set target.exec-search-paths "%s"' % (os.path.dirname(so_path)) + self.dbg.HandleCommand(cmd) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up it own UUID as well when there is no build ID so we + # will check that this matches. + self.verify_module(modules[0], so_path, "48EB9FD7") + + + def test_facebook_hash_match(self): + """ + Breakpad creates minidump files using CvRecord in each module whose + signature is set to PDB70 where the UUID is a hash generated by + breakpad of the .text section and Facebook modified this hash to + avoid collisions. This is only done when the executable has no ELF + build ID. + + This test verifies that if we have a minidump with a 16 byte UUID, + that we are able to associate a symbol file with no ELF build ID + and match it up by hashing the .text section like Facebook does. + """ + so_path = self.getBuildArtifact("libbreakpad.so") + self.yaml2obj("libbreakpad.yaml", so_path) + cmd = 'settings set target.exec-search-paths "%s"' % (os.path.dirname(so_path)) + self.dbg.HandleCommand(cmd) + modules = self.get_minidump_modules("linux-arm-facebook-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up it own UUID as well when there is no build ID so we + # will check that this matches. 
+ self.verify_module(modules[0], so_path, "D9C480E8") + + def test_relative_module_name(self): old_cwd = os.getcwd() self.addTearDownHook(lambda: os.chdir(old_cwd)) diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml new file mode 100644 index 000000000000..807a468f3d4b --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-overflow.yaml @@ -0,0 +1,21 @@ +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_ARM + Flags: [ EF_ARM_SOFT_FLOAT, EF_ARM_EABI_VER5 ] +Sections: +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000010000 + AddressAlign: 0x0000000000000001 + Content: 04 + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_WRITE ] + Address: 0x0000000000010001 + AddressAlign: 0x0000000000000001 + Content: 0000001400000003000000474E5500 diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml new file mode 100644 index 000000000000..53e96f601aa8 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad.yaml @@ -0,0 +1,15 @@ +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_ARM + Flags: [ EF_ARM_SOFT_FLOAT, EF_ARM_EABI_VER5 ] +Sections: +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000010000 + AddressAlign: 0x0000000000000004 + Content: 040000001400000003000000474E5500 diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml new file mode 100644 index 000000000000..37848982c586 --- /dev/null +++ 
b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-breakpad-uuid-match.yaml @@ -0,0 +1,15 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: ARM + Platform ID: Linux + CSD Version: '15E216' + CPU: + CPUID: 0x00000000 + - Type: ModuleList + Modules: + - Base of Image: 0x0000000000001000 + Size of Image: 0x00001000 + Module Name: '/invalid/path/on/current/system/libbreakpad.so' + CodeView Record: 52534453040000001400000003000000474e55000000000000 +... diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml new file mode 100644 index 000000000000..203fc669a0b8 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/linux-arm-facebook-uuid-match.yaml @@ -0,0 +1,15 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: ARM + Platform ID: Linux + CSD Version: '15E216' + CPU: + CPUID: 0x00000000 + - Type: ModuleList + Modules: + - Base of Image: 0x0000000000001000 + Size of Image: 0x00001000 + Module Name: '/invalid/path/on/current/system/libbreakpad.so' + CodeView Record: 52534453141010100410101013101010575e45100000000000 +... From 93fffe98d5c2f6471928433a41b8cb546ef2abda Mon Sep 17 00:00:00 2001 From: Joseph Tremoulet Date: Fri, 16 Oct 2020 09:32:08 -0400 Subject: [PATCH 335/363] [lldb] Minidump: check for .text hash match with directory When opening a minidump, we might discover that it reports a UUID for a module that doesn't match the build ID, but rather a hash of the .text section (according to either of two different hash functions, used by breakpad and Facebook respectively). The current logic searches for a module by filename only to check the hash; this change updates it to first search by directory+filename. 
This is important when the directory specified in the minidump must be interpreted relative to a user-provided sysroot, as the leaf directory won't be in the search path in that case. Also add a regression test; without this change, module validation fails because we have just the placeholder module which reports as its path the platform path in the minidump. Reviewed By: clayborg Differential Revision: https://reviews.llvm.org/D89155 (cherry picked from commit d30797b4041ffe215b92d376af60c4f26a0555ae) --- .../Process/minidump/ProcessMinidump.cpp | 109 ++++++++++-------- .../Process/minidump/ProcessMinidump.h | 4 + .../minidump-new/TestMiniDumpUUID.py | 51 +++++++- .../minidump-new/libbreakpad-decoy.yaml | 18 +++ 4 files changed, 133 insertions(+), 49 deletions(-) create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-decoy.yaml diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index af378ea7741f..1041f63aa2e2 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -510,6 +510,53 @@ bool ProcessMinidump::UpdateThreadList(ThreadList &old_thread_list, return new_thread_list.GetSize(false) > 0; } +ModuleSP ProcessMinidump::GetOrCreateModule(UUID minidump_uuid, + llvm::StringRef name, + ModuleSpec module_spec) { + Log *log(GetLogIfAllCategoriesSet(LIBLLDB_LOG_DYNAMIC_LOADER)); + Status error; + + ModuleSP module_sp = + GetTarget().GetOrCreateModule(module_spec, true /* notify */, &error); + if (!module_sp) + return module_sp; + // We consider the module to be a match if the minidump UUID is a + // prefix of the actual UUID, or if either of the UUIDs are empty.
+ const auto dmp_bytes = minidump_uuid.GetBytes(); + const auto mod_bytes = module_sp->GetUUID().GetBytes(); + const bool match = dmp_bytes.empty() || mod_bytes.empty() || + mod_bytes.take_front(dmp_bytes.size()) == dmp_bytes; + if (match) { + LLDB_LOG(log, "Partial uuid match for {0}.", name); + return module_sp; + } + + // Breakpad generates minindump files, and if there is no GNU build + // ID in the binary, it will calculate a UUID by hashing first 4096 + // bytes of the .text section and using that as the UUID for a module + // in the minidump. Facebook uses a modified breakpad client that + // uses a slightly modified this hash to avoid collisions. Check for + // UUIDs from the minindump that match these cases and accept the + // module we find if they do match. + std::vector breakpad_uuid; + std::vector facebook_uuid; + HashElfTextSection(module_sp, breakpad_uuid, facebook_uuid); + if (dmp_bytes == llvm::ArrayRef(breakpad_uuid)) { + LLDB_LOG(log, "Breakpad .text hash match for {0}.", name); + return module_sp; + } + if (dmp_bytes == llvm::ArrayRef(facebook_uuid)) { + LLDB_LOG(log, "Facebook .text hash match for {0}.", name); + return module_sp; + } + // The UUID wasn't a partial match and didn't match the .text hash + // so remove the module from the target, we will need to create a + // placeholder object file. + GetTarget().GetImages().Remove(module_sp); + module_sp.reset(); + return module_sp; +} + void ProcessMinidump::ReadModuleList() { std::vector filtered_modules = m_minidump_parser->GetFilteredModuleList(); @@ -539,54 +586,22 @@ void ProcessMinidump::ReadModuleList() { // add the module to the target if it finds one. lldb::ModuleSP module_sp = GetTarget().GetOrCreateModule(module_spec, true /* notify */, &error); - if (!module_sp) { - // Try and find a module without specifying the UUID and only looking for - // the file given a basename. We then will look for a partial UUID match - // if we find any matches. 
This function will add the module to the - // target if it finds one, so we need to remove the module from the target - // if the UUID doesn't match during our manual UUID verification. This - // allows the "target.exec-search-paths" setting to specify one or more - // directories that contain executables that can be searched for matches. - ModuleSpec basename_module_spec(module_spec); - basename_module_spec.GetUUID().Clear(); - basename_module_spec.GetFileSpec().GetDirectory().Clear(); - module_sp = GetTarget().GetOrCreateModule(basename_module_spec, - true /* notify */, &error); - if (module_sp) { - // We consider the module to be a match if the minidump UUID is a - // prefix of the actual UUID, or if either of the UUIDs are empty. - const auto dmp_bytes = uuid.GetBytes(); - const auto mod_bytes = module_sp->GetUUID().GetBytes(); - const bool match = dmp_bytes.empty() || mod_bytes.empty() || - mod_bytes.take_front(dmp_bytes.size()) == dmp_bytes; - if (!match) { - // Breakpad generates minindump files, and if there is no GNU build - // ID in the binary, it will calculate a UUID by hashing first 4096 - // bytes of the .text section and using that as the UUID for a module - // in the minidump. Facebook uses a modified breakpad client that - // uses a slightly modified this hash to avoid collisions. Check for - // UUIDs from the minindump that match these cases and accept the - // module we find if they do match. - std::vector breakpad_uuid; - std::vector facebook_uuid; - HashElfTextSection(module_sp, breakpad_uuid, facebook_uuid); - if (dmp_bytes == llvm::ArrayRef(breakpad_uuid)) { - LLDB_LOG(log, "Breakpad .text hash match for {0}.", name); - } else if (dmp_bytes == llvm::ArrayRef(facebook_uuid)) { - LLDB_LOG(log, "Facebook .text hash match for {0}.", name); - } else { - // The UUID wasn't a partial match and didn't match the .text hash - // so remove the module from the target, we will need to create a - // placeholder object file. 
- GetTarget().GetImages().Remove(module_sp); - module_sp.reset(); - } - } else { - LLDB_LOG(log, "Partial uuid match for {0}.", name); - } - } - } else { + if (module_sp) { LLDB_LOG(log, "Full uuid match for {0}.", name); + } else { + // We couldn't find a module with an exactly-matching UUID. Sometimes + // a minidump UUID is only a partial match or is a hash. So try again + // without specifying the UUID, then again without specifying the + // directory if that fails. This will allow us to find modules with + // partial matches or hash UUIDs in user-provided sysroots or search + // directories (target.exec-search-paths). + ModuleSpec partial_module_spec = module_spec; + partial_module_spec.GetUUID().Clear(); + module_sp = GetOrCreateModule(uuid, name, partial_module_spec); + if (!module_sp) { + partial_module_spec.GetFileSpec().GetDirectory().Clear(); + module_sp = GetOrCreateModule(uuid, name, partial_module_spec); + } } if (module_sp) { // Watch out for place holder modules that have different paths, but the diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h index 839b0e7563f7..bfdace7ea33e 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h @@ -102,6 +102,10 @@ class ProcessMinidump : public Process { void ReadModuleList(); + lldb::ModuleSP GetOrCreateModule(lldb_private::UUID minidump_uuid, + llvm::StringRef name, + lldb_private::ModuleSpec module_spec); + JITLoaderList &GetJITLoaders() override; private: diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py index c4dcddba631b..619f94a2cbb0 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py @@ -21,11 +21,14 @@ class 
MiniDumpUUIDTestCase(TestBase): def verify_module(self, module, verify_path, verify_uuid): # Compare the filename and the directory separately. We are avoiding # SBFileSpec.fullpath because it causes a slash/backslash confusion - # on Windows. + # on Windows. Similarly, we compare the directories using normcase + # because they may contain a Linux-style relative path from the + # minidump appended to a Windows-style root path from the host. self.assertEqual( os.path.basename(verify_path), module.GetFileSpec().basename) self.assertEqual( - os.path.dirname(verify_path), module.GetFileSpec().dirname or "") + os.path.normcase(os.path.dirname(verify_path)), + os.path.normcase(module.GetFileSpec().dirname or "")) self.assertEqual(verify_uuid, module.GetUUIDString()) def get_minidump_modules(self, yaml_file): @@ -200,6 +203,50 @@ def test_breakpad_hash_match(self): # will check that this matches. self.verify_module(modules[0], so_path, "D9C480E8") + def test_breakpad_hash_match_sysroot(self): + """ + Check that we can match the breakpad .text section hash when the + module is located under a user-provided sysroot. + """ + sysroot_path = os.path.join(self.getBuildDir(), "mock_sysroot") + # Create the directory under the sysroot where the minidump reports + # the module. + so_dir = os.path.join(sysroot_path, "invalid", "path", "on", "current", "system") + so_path = os.path.join(so_dir, "libbreakpad.so") + lldbutil.mkdir_p(so_dir) + self.yaml2obj("libbreakpad.yaml", so_path) + self.runCmd("platform select remote-linux --sysroot '%s'" % sysroot_path) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up its own UUID as well when there is no build ID so we + # will check that this matches. 
+ self.verify_module(modules[0], so_path, "D9C480E8") + + def test_breakpad_hash_match_sysroot_decoy(self): + """ + Check that we can match the breakpad .text section hash when there is + a module with the right name but wrong contents under a user-provided + sysroot, and the right module is at the given search path.. + """ + sysroot_path = os.path.join(self.getBuildDir(), "mock_sysroot") + # Create the directory under the sysroot where the minidump reports + # the module. + decoy_dir = os.path.join(sysroot_path, "invalid", "path", "on", "current", "system") + decoy_path = os.path.join(decoy_dir, "libbreakpad.so") + lldbutil.mkdir_p(decoy_dir) + self.yaml2obj("libbreakpad-decoy.yaml", decoy_path) + self.runCmd("platform select remote-linux --sysroot '%s'" % sysroot_path) + so_dir = os.path.join(self.getBuildDir(), "searchpath_dir") + so_path = os.path.join(so_dir, "libbreakpad.so") + lldbutil.mkdir_p(so_dir) + self.yaml2obj("libbreakpad.yaml", so_path) + self.runCmd('settings set target.exec-search-paths "%s"' % so_dir) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml") + self.assertEqual(1, len(modules)) + # LLDB makes up its own UUID as well when there is no build ID so we + # will check that this matches. + self.verify_module(modules[0], so_path, "D9C480E8") + def test_breakpad_overflow_hash_match(self): """ This is a similar to test_breakpad_hash_match, but it verifies that diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-decoy.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-decoy.yaml new file mode 100644 index 000000000000..028a12f54a09 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/libbreakpad-decoy.yaml @@ -0,0 +1,18 @@ +# This has different .text contents than libbreakpad-yaml, +# to simulate having different versions of the module (to +# test that we pick the one matching the minidump UUID). 
+--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_ARM + Flags: [ EF_ARM_SOFT_FLOAT, EF_ARM_EABI_VER5 ] +Sections: +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000010000 + AddressAlign: 0x0000000000000004 + Content: 040000001400000003000000474E5500CC From b618cf7a378d85f5630e24d3cf74ad3202732d72 Mon Sep 17 00:00:00 2001 From: Joseph Tremoulet Date: Fri, 30 Oct 2020 15:12:10 -0400 Subject: [PATCH 336/363] [lldb] GetSharedModule: Collect old modules in SmallVector The various GetSharedModule methods have an optional out parameter for the old module when a file has changed or been replaced, which the Target uses to keep its module list current/correct. We've been using a single ModuleSP to track "the" old module, and this change switches to using a SmallVector of ModuleSP, which has a couple benefits: - There are multiple codepaths which may discover an old module, and this centralizes the code for how to handle multiples in one place, in the Target code. With the single ModuleSP, each place that may discover an old module is responsible for how it handles multiples, and the current code is inconsistent (some code paths drop the first old module, others drop the second). - The API will be more natural for identifying old modules in routines that work on sets, like ModuleList::ReplaceEquivalent (which I plan on updating to report old module(s) in a subsequent change to fix a bug). I'm not convinced we can ever actually run into the case that multiple old modules are found in the same GetOrCreateModule call, but I think this change makes sense regardless, in light of the above. When an old module is reported, Target::GetOrCreateModule calls m_images.ReplaceModule, which doesn't allow multiple "old" modules; the new code calls ReplaceModule for the first "old" module, and for any subsequent old modules it logs the event and calls m_images.Remove. 
Reviewed By: jingham Differential Revision: https://reviews.llvm.org/D89156 (cherry picked from commit 61bfc703c3d36fbefc476cd3829065d983c1c792) --- lldb/include/lldb/Core/ModuleList.h | 11 +-- lldb/include/lldb/Target/Platform.h | 9 +- lldb/source/Core/ModuleList.cpp | 20 ++-- .../Platform/MacOSX/PlatformDarwin.cpp | 32 +++--- .../Plugins/Platform/MacOSX/PlatformDarwin.h | 6 +- .../Platform/MacOSX/PlatformDarwinKernel.cpp | 6 +- .../Platform/MacOSX/PlatformDarwinKernel.h | 2 +- .../Platform/MacOSX/PlatformMacOSX.cpp | 21 ++-- .../Plugins/Platform/MacOSX/PlatformMacOSX.h | 2 +- .../MacOSX/PlatformRemoteDarwinDevice.cpp | 19 ++-- .../MacOSX/PlatformRemoteDarwinDevice.h | 2 +- lldb/source/Target/Platform.cpp | 27 +++-- lldb/source/Target/Target.cpp | 98 ++++++++++++++----- 13 files changed, 154 insertions(+), 101 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index ae1f6fdb20a2..c62021b4bf6b 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -443,12 +443,11 @@ class ModuleList { static bool ModuleIsInCache(const Module *module_ptr); - static Status GetSharedModule(const ModuleSpec &module_spec, - lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, - bool *did_create_ptr, - bool always_create = false); + static Status + GetSharedModule(const ModuleSpec &module_spec, lldb::ModuleSP &module_sp, + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, + bool *did_create_ptr, bool always_create = false); static bool RemoveSharedModule(lldb::ModuleSP &module_sp); diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h index 6234b8244b3f..277fcf68cb0c 100644 --- a/lldb/include/lldb/Target/Platform.h +++ b/lldb/include/lldb/Target/Platform.h @@ -301,11 +301,10 @@ class Platform : public PluginInterface { LocateExecutableScriptingResources(Target *target, Module 
&module, Stream *feedback_stream); - virtual Status GetSharedModule(const ModuleSpec &module_spec, - Process *process, lldb::ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, - bool *did_create_ptr); + virtual Status GetSharedModule( + const ModuleSpec &module_spec, Process *process, + lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr); virtual bool GetModuleSpec(const FileSpec &module_file_spec, const ArchSpec &arch, ModuleSpec &module_spec); diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index 0345678ddaff..76a861e33d0d 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -731,11 +731,11 @@ size_t ModuleList::RemoveOrphanSharedModules(bool mandatory) { return GetSharedModuleList().RemoveOrphans(mandatory); } -Status ModuleList::GetSharedModule(const ModuleSpec &module_spec, - ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, - ModuleSP *old_module_sp_ptr, - bool *did_create_ptr, bool always_create) { +Status +ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, + bool *did_create_ptr, bool always_create) { ModuleList &shared_module_list = GetSharedModuleList(); std::lock_guard guard( shared_module_list.m_modules_mutex); @@ -747,8 +747,6 @@ Status ModuleList::GetSharedModule(const ModuleSpec &module_spec, if (did_create_ptr) *did_create_ptr = false; - if (old_module_sp_ptr) - old_module_sp_ptr->reset(); const UUID *uuid_ptr = module_spec.GetUUIDPtr(); const FileSpec &module_file_spec = module_spec.GetFileSpec(); @@ -769,8 +767,8 @@ Status ModuleList::GetSharedModule(const ModuleSpec &module_spec, // Make sure the file for the module hasn't been modified if (module_sp->FileHasChanged()) { - if (old_module_sp_ptr && !*old_module_sp_ptr) - 
*old_module_sp_ptr = module_sp; + if (old_modules) + old_modules->push_back(module_sp); Log *log(lldb_private::GetLogIfAnyCategoriesSet(LIBLLDB_LOG_MODULES)); if (log != nullptr) @@ -924,8 +922,8 @@ Status ModuleList::GetSharedModule(const ModuleSpec &module_spec, located_binary_modulespec.GetFileSpec()); if (file_spec_mod_time != llvm::sys::TimePoint<>()) { if (file_spec_mod_time != module_sp->GetModificationTime()) { - if (old_module_sp_ptr) - *old_module_sp_ptr = module_sp; + if (old_modules) + old_modules->push_back(module_sp); shared_module_list.Remove(module_sp); module_sp.reset(); } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index f5ec08a1a199..133eda93219c 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -221,7 +221,7 @@ BringInRemoteFile(Platform *platform, lldb_private::Status PlatformDarwin::GetSharedModuleWithLocalCache( const lldb_private::ModuleSpec &module_spec, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, bool *did_create_ptr) { + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PLATFORM)); LLDB_LOGF(log, @@ -238,7 +238,7 @@ lldb_private::Status PlatformDarwin::GetSharedModuleWithLocalCache( Status err; err = ModuleList::GetSharedModule(module_spec, module_sp, - module_search_paths_ptr, old_module_sp_ptr, + module_search_paths_ptr, old_modules, did_create_ptr); if (module_sp) return err; @@ -341,8 +341,8 @@ lldb_private::Status PlatformDarwin::GetSharedModuleWithLocalCache( Status PlatformDarwin::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, ModuleSP *old_module_sp_ptr, - bool *did_create_ptr) { + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl 
*old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); @@ -351,16 +351,16 @@ Status PlatformDarwin::GetSharedModule( // module first. if (m_remote_platform_sp) { error = m_remote_platform_sp->GetSharedModule( - module_spec, process, module_sp, module_search_paths_ptr, - old_module_sp_ptr, did_create_ptr); + module_spec, process, module_sp, module_search_paths_ptr, old_modules, + did_create_ptr); } } if (!module_sp) { // Fall back to the local platform and find the file locally error = Platform::GetSharedModule(module_spec, process, module_sp, - module_search_paths_ptr, - old_module_sp_ptr, did_create_ptr); + module_search_paths_ptr, old_modules, + did_create_ptr); const FileSpec &platform_file = module_spec.GetFileSpec(); if (!module_sp && module_search_paths_ptr && platform_file) { @@ -373,7 +373,7 @@ Status PlatformDarwin::GetSharedModule( new_module_spec.GetFileSpec() = bundle_directory; if (Host::ResolveExecutableInBundle(new_module_spec.GetFileSpec())) { Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, old_module_sp_ptr, + new_module_spec, process, module_sp, nullptr, old_modules, did_create_ptr)); if (module_sp) @@ -400,8 +400,8 @@ Status PlatformDarwin::GetSharedModule( ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = new_file_spec; Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, - old_module_sp_ptr, did_create_ptr)); + new_module_spec, process, module_sp, nullptr, old_modules, + did_create_ptr)); if (module_sp) { module_sp->SetPlatformFileSpec(new_file_spec); @@ -1639,8 +1639,8 @@ PlatformDarwin::LaunchProcess(lldb_private::ProcessLaunchInfo &launch_info) { lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, ModuleSP *old_module_sp_ptr, - bool *did_create_ptr) { + const FileSpecList 
*module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { const FileSpec &platform_file = module_spec.GetFileSpec(); // See if the file is present in any of the module_search_paths_ptr // directories. @@ -1697,9 +1697,9 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( if (FileSystem::Instance().Exists(path_to_try)) { ModuleSpec new_module_spec(module_spec); new_module_spec.GetFileSpec() = path_to_try; - Status new_error(Platform::GetSharedModule( - new_module_spec, process, module_sp, nullptr, old_module_sp_ptr, - did_create_ptr)); + Status new_error( + Platform::GetSharedModule(new_module_spec, process, module_sp, + nullptr, old_modules, did_create_ptr)); if (module_sp) { module_sp->SetPlatformFileSpec(path_to_try); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h index 8e28a7000310..4e9a9495893b 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h @@ -46,7 +46,7 @@ class PlatformDarwin : public PlatformPOSIX { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; size_t GetSoftwareBreakpointTrapOpcode( @@ -132,7 +132,7 @@ class PlatformDarwin : public PlatformPOSIX { virtual lldb_private::Status GetSharedModuleWithLocalCache( const lldb_private::ModuleSpec &module_spec, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, bool *did_create_ptr); + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr); struct SDKEnumeratorInfo { lldb_private::FileSpec found_path; @@ -158,7 +158,7 @@ class PlatformDarwin : public PlatformPOSIX { const lldb_private::ModuleSpec 
&module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, bool *did_create_ptr); + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr); static std::string FindComponentInPath(llvm::StringRef path, llvm::StringRef component); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp index f6c0f262a379..79cbc940feb5 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp @@ -644,8 +644,8 @@ bool PlatformDarwinKernel::KernelHasdSYMSibling(const FileSpec &kernel_binary) { Status PlatformDarwinKernel::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, ModuleSP *old_module_sp_ptr, - bool *did_create_ptr) { + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { Status error; module_sp.reset(); const FileSpec &platform_file = module_spec.GetFileSpec(); @@ -676,7 +676,7 @@ Status PlatformDarwinKernel::GetSharedModule( // framework on macOS systems, a chance. 
error = PlatformDarwin::GetSharedModule(module_spec, process, module_sp, module_search_paths_ptr, - old_module_sp_ptr, did_create_ptr); + old_modules, did_create_ptr); if (error.Success() && module_sp.get()) { return error; } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h index 9cf9e41208eb..203bfb5e6066 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.h @@ -57,7 +57,7 @@ class PlatformDarwinKernel : public PlatformDarwin { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; bool GetSupportedArchitectureAtIndex(uint32_t idx, diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index 0b7f898ee0d3..cbdd2cde662b 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -292,10 +292,10 @@ lldb_private::Status PlatformMacOSX::GetSharedModule( const lldb_private::ModuleSpec &module_spec, Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, bool *did_create_ptr) { - Status error = GetSharedModuleWithLocalCache( - module_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr); + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { + Status error = GetSharedModuleWithLocalCache(module_spec, module_sp, + module_search_paths_ptr, + old_modules, did_create_ptr); if (module_sp) { if (module_spec.GetArchitecture().GetCore() == @@ -306,15 +306,16 @@ lldb_private::Status PlatformMacOSX::GetSharedModule( ModuleSpec 
module_spec_x86_64(module_spec); module_spec_x86_64.GetArchitecture() = ArchSpec("x86_64-apple-macosx"); lldb::ModuleSP x86_64_module_sp; - lldb::ModuleSP old_x86_64_module_sp; + llvm::SmallVector old_x86_64_modules; bool did_create = false; Status x86_64_error = GetSharedModuleWithLocalCache( module_spec_x86_64, x86_64_module_sp, module_search_paths_ptr, - &old_x86_64_module_sp, &did_create); + &old_x86_64_modules, &did_create); if (x86_64_module_sp && x86_64_module_sp->GetObjectFile()) { module_sp = x86_64_module_sp; - if (old_module_sp_ptr) - *old_module_sp_ptr = old_x86_64_module_sp; + if (old_modules) + old_modules->append(old_x86_64_modules.begin(), + old_x86_64_modules.end()); if (did_create_ptr) *did_create_ptr = did_create; return x86_64_error; @@ -324,7 +325,9 @@ lldb_private::Status PlatformMacOSX::GetSharedModule( } if (!module_sp) { - error = FindBundleBinaryInExecSearchPaths (module_spec, process, module_sp, module_search_paths_ptr, old_module_sp_ptr, did_create_ptr); + error = FindBundleBinaryInExecSearchPaths(module_spec, process, module_sp, + module_search_paths_ptr, + old_modules, did_create_ptr); } return error; } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h index 30b11eb37684..deca3f06ab73 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.h @@ -40,7 +40,7 @@ class PlatformMacOSX : public PlatformDarwin { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; const char *GetDescription() override { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp index 
e4ede0dc638b..065eefa48fea 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp @@ -504,8 +504,8 @@ Status PlatformRemoteDarwinDevice::GetSymbolFile(const FileSpec &platform_file, Status PlatformRemoteDarwinDevice::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, ModuleSP *old_module_sp_ptr, - bool *did_create_ptr) { + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { // For iOS, the SDK files are all cached locally on the host system. So first // we ask for the file in the cached SDK, then we attempt to get a shared // module for the right architecture with the right UUID. @@ -608,24 +608,25 @@ Status PlatformRemoteDarwinDevice::GetSharedModule( // This may not be an SDK-related module. Try whether we can bring in the // thing to our local cache. error = GetSharedModuleWithLocalCache(module_spec, module_sp, - module_search_paths_ptr, - old_module_sp_ptr, did_create_ptr); + module_search_paths_ptr, old_modules, + did_create_ptr); if (error.Success()) return error; // See if the file is present in any of the module_search_paths_ptr // directories. 
if (!module_sp) - error = PlatformDarwin::FindBundleBinaryInExecSearchPaths (module_spec, process, module_sp, - module_search_paths_ptr, old_module_sp_ptr, did_create_ptr); + error = PlatformDarwin::FindBundleBinaryInExecSearchPaths( + module_spec, process, module_sp, module_search_paths_ptr, old_modules, + did_create_ptr); if (error.Success()) return error; const bool always_create = false; - error = ModuleList::GetSharedModule( - module_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, always_create); + error = ModuleList::GetSharedModule(module_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, always_create); if (module_sp) module_sp->SetPlatformFileSpec(platform_file); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h index b1b760f16d45..cc5f286f3b25 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.h @@ -38,7 +38,7 @@ class PlatformRemoteDarwinDevice : public PlatformDarwin { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; void diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index 34ed7872c720..e5afb4c7b8d7 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -218,15 +218,14 @@ Platform::LocateExecutableScriptingResources(Target *target, Module &module, // return PlatformSP(); //} -Status Platform::GetSharedModule(const ModuleSpec &module_spec, - Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, - ModuleSP *old_module_sp_ptr, - bool *did_create_ptr) { +Status Platform::GetSharedModule( + 
const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { if (IsHost()) - return ModuleList::GetSharedModule( - module_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, false); + return ModuleList::GetSharedModule(module_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, false); // Module resolver lambda. auto resolver = [&](const ModuleSpec &spec) { @@ -239,17 +238,17 @@ Status Platform::GetSharedModule(const ModuleSpec &module_spec, resolved_spec.GetFileSpec().PrependPathComponent( m_sdk_sysroot.GetStringRef()); // Try to get shared module with resolved spec. - error = ModuleList::GetSharedModule( - resolved_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, false); + error = ModuleList::GetSharedModule(resolved_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, false); } // If we don't have sysroot or it didn't work then // try original module spec. 
if (!error.Success()) { resolved_spec = spec; - error = ModuleList::GetSharedModule( - resolved_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, false); + error = ModuleList::GetSharedModule(resolved_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, false); } if (error.Success() && module_sp) module_sp->SetPlatformFileSpec(resolved_spec.GetFileSpec()); diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 707344f99fcb..19d0c3d477eb 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1965,8 +1965,9 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify, module_sp = m_images.FindFirstModule(module_spec); if (!module_sp) { - ModuleSP old_module_sp; // This will get filled in if we have a new version - // of the library + llvm::SmallVector + old_modules; // This will get filled in if we have a new version + // of the library bool did_create_module = false; FileSpecList search_paths = GetExecutableSearchPaths(); // If there are image search path entries, try to use them first to acquire @@ -1979,7 +1980,7 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify, transformed_spec.GetFileSpec().GetFilename() = module_spec.GetFileSpec().GetFilename(); error = ModuleList::GetSharedModule(transformed_spec, module_sp, - &search_paths, &old_module_sp, + &search_paths, &old_modules, &did_create_module); } } @@ -1997,7 +1998,7 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify, // We have a UUID, it is OK to check the global module list... 
error = ModuleList::GetSharedModule(module_spec, module_sp, &search_paths, - &old_module_sp, &did_create_module); + &old_modules, &did_create_module); } if (!module_sp) { @@ -2006,7 +2007,7 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify, if (m_platform_sp) { error = m_platform_sp->GetSharedModule( module_spec, m_process_sp.get(), module_sp, &search_paths, - &old_module_sp, &did_create_module); + &old_modules, &did_create_module); } else { error.SetErrorString("no platform is currently set"); } @@ -2057,18 +2058,18 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify, // this target. So let's remove the UUID from the module list, and look // in the target's module list. Only do this if there is SOMETHING else // in the module spec... - if (!old_module_sp) { - if (module_spec.GetUUID().IsValid() && - !module_spec.GetFileSpec().GetFilename().IsEmpty() && - !module_spec.GetFileSpec().GetDirectory().IsEmpty()) { - ModuleSpec module_spec_copy(module_spec.GetFileSpec()); - module_spec_copy.GetUUID().Clear(); - - ModuleList found_modules; - m_images.FindModules(module_spec_copy, found_modules); - if (found_modules.GetSize() == 1) - old_module_sp = found_modules.GetModuleAtIndex(0); - } + if (module_spec.GetUUID().IsValid() && + !module_spec.GetFileSpec().GetFilename().IsEmpty() && + !module_spec.GetFileSpec().GetDirectory().IsEmpty()) { + ModuleSpec module_spec_copy(module_spec.GetFileSpec()); + module_spec_copy.GetUUID().Clear(); + + ModuleList found_modules; + m_images.FindModules(module_spec_copy, found_modules); + found_modules.ForEach([&](const ModuleSP &found_module) -> bool { + old_modules.push_back(found_module); + return true; + }); } // Preload symbols outside of any lock, so hopefully we can do this for @@ -2076,14 +2077,67 @@ ModuleSP Target::GetOrCreateModule(const ModuleSpec &module_spec, bool notify, if (GetPreloadSymbols()) module_sp->PreloadSymbols(); - if (old_module_sp && 
m_images.GetIndexForModule(old_module_sp.get()) != - LLDB_INVALID_INDEX32) { - m_images.ReplaceModule(old_module_sp, module_sp); + llvm::SmallVector replaced_modules; + for (ModuleSP &old_module_sp : old_modules) { + if (m_images.GetIndexForModule(old_module_sp.get()) != + LLDB_INVALID_INDEX32) { + if (replaced_modules.empty()) + m_images.ReplaceModule(old_module_sp, module_sp); + else + m_images.Remove(old_module_sp); + + replaced_modules.push_back(std::move(old_module_sp)); + } + } + + if (replaced_modules.size() > 1) { + // The same new module replaced multiple old modules + // simultaneously. It's not clear this should ever + // happen (if we always replace old modules as we add + // new ones, presumably we should never have more than + // one old one). If there are legitimate cases where + // this happens, then the ModuleList::Notifier interface + // may need to be adjusted to allow reporting this. + // In the meantime, just log that this has happened; just + // above we called ReplaceModule on the first one, and Remove + // on the rest. 
+ if (Log *log = GetLogIfAnyCategoriesSet(LIBLLDB_LOG_TARGET | + LIBLLDB_LOG_MODULES)) { + StreamString message; + auto dump = [&message](Module &dump_module) -> void { + UUID dump_uuid = dump_module.GetUUID(); + + message << '['; + dump_module.GetDescription(message.AsRawOstream()); + message << " (uuid "; + + if (dump_uuid.IsValid()) + dump_uuid.Dump(&message); + else + message << "not specified"; + + message << ")]"; + }; + + message << "New module "; + dump(*module_sp); + message.AsRawOstream() + << llvm::formatv(" simultaneously replaced {0} old modules: ", + replaced_modules.size()); + for (ModuleSP &replaced_module_sp : replaced_modules) + dump(*replaced_module_sp); + + log->PutString(message.GetString()); + } + } + + if (replaced_modules.empty()) + m_images.Append(module_sp, notify); + + for (ModuleSP &old_module_sp : replaced_modules) { Module *old_module_ptr = old_module_sp.get(); old_module_sp.reset(); ModuleList::RemoveSharedModuleIfOrphaned(old_module_ptr); - } else { - m_images.Append(module_sp, notify); } } else module_sp.reset(); From abeec5d081f0dfbee305b70d7641c8e5af9a9335 Mon Sep 17 00:00:00 2001 From: Joseph Tremoulet Date: Fri, 30 Oct 2020 15:13:26 -0400 Subject: [PATCH 337/363] [lldb] Report old modules from ModuleList::ReplaceEquivalent This allows the Target to update its module list when loading a shared module replaces an equivalent one. A testcase is added which hits this codepath -- without the fix, the target reports libbreakpad.so twice in its module list. 
Reviewed By: jingham Differential Revision: https://reviews.llvm.org/D89157 (cherry picked from commit d20aa7ca422145fb4d07e16c1d0aa7de9e3554ea) --- lldb/include/lldb/Core/ModuleList.h | 8 ++++++- lldb/source/Core/ModuleList.cpp | 19 ++++++++++------ .../minidump-new/TestMiniDumpUUID.py | 22 +++++++++++++++++-- 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index c62021b4bf6b..d90b27e474ac 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -139,7 +139,13 @@ class ModuleList { /// /// \param[in] module_sp /// A shared pointer to a module to replace in this collection. - void ReplaceEquivalent(const lldb::ModuleSP &module_sp); + /// + /// \param[in] old_modules + /// Optional pointer to a vector which, if provided, will have shared + /// pointers to the replaced module(s) appended to it. + void ReplaceEquivalent( + const lldb::ModuleSP &module_sp, + llvm::SmallVectorImpl *old_modules = nullptr); /// Append a module to the module list, if it is not already there. 
/// diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp index 76a861e33d0d..1701cb56338e 100644 --- a/lldb/source/Core/ModuleList.cpp +++ b/lldb/source/Core/ModuleList.cpp @@ -171,7 +171,9 @@ void ModuleList::Append(const ModuleSP &module_sp, bool notify) { AppendImpl(module_sp, notify); } -void ModuleList::ReplaceEquivalent(const ModuleSP &module_sp) { +void ModuleList::ReplaceEquivalent( + const ModuleSP &module_sp, + llvm::SmallVectorImpl *old_modules) { if (module_sp) { std::lock_guard guard(m_modules_mutex); @@ -184,11 +186,14 @@ void ModuleList::ReplaceEquivalent(const ModuleSP &module_sp) { size_t idx = 0; while (idx < m_modules.size()) { - ModuleSP module_sp(m_modules[idx]); - if (module_sp->MatchesModuleSpec(equivalent_module_spec)) + ModuleSP test_module_sp(m_modules[idx]); + if (test_module_sp->MatchesModuleSpec(equivalent_module_spec)) { + if (old_modules) + old_modules->push_back(test_module_sp); RemoveImpl(m_modules.begin() + idx); - else + } else { ++idx; + } } // Now add the new module to the list Append(module_sp); @@ -810,7 +815,7 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, *did_create_ptr = true; } - shared_module_list.ReplaceEquivalent(module_sp); + shared_module_list.ReplaceEquivalent(module_sp, old_modules); return error; } } @@ -847,7 +852,7 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, if (did_create_ptr) *did_create_ptr = true; - shared_module_list.ReplaceEquivalent(module_sp); + shared_module_list.ReplaceEquivalent(module_sp, old_modules); return Status(); } } @@ -945,7 +950,7 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp, if (did_create_ptr) *did_create_ptr = true; - shared_module_list.ReplaceEquivalent(module_sp); + shared_module_list.ReplaceEquivalent(module_sp, old_modules); } } else { located_binary_modulespec.GetFileSpec().GetPath(path, sizeof(path)); diff --git 
a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py index 619f94a2cbb0..60ec48b459a9 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpUUID.py @@ -31,10 +31,10 @@ def verify_module(self, module, verify_path, verify_uuid): os.path.normcase(module.GetFileSpec().dirname or "")) self.assertEqual(verify_uuid, module.GetUUIDString()) - def get_minidump_modules(self, yaml_file): + def get_minidump_modules(self, yaml_file, exe = None): minidump_path = self.getBuildArtifact(os.path.basename(yaml_file) + ".dmp") self.yaml2obj(yaml_file, minidump_path) - self.target = self.dbg.CreateTarget(None) + self.target = self.dbg.CreateTarget(exe) self.process = self.target.LoadCore(minidump_path) return self.target.modules @@ -265,6 +265,24 @@ def test_breakpad_overflow_hash_match(self): # will check that this matches. self.verify_module(modules[0], so_path, "48EB9FD7") + def test_breakpad_hash_match_exe_outside_sysroot(self): + """ + Check that we can match the breakpad .text section hash when the + module is specified as the exe during launch, and a syroot is + provided, which does not contain the exe. + """ + sysroot_path = os.path.join(self.getBuildDir(), "mock_sysroot") + lldbutil.mkdir_p(sysroot_path) + so_dir = os.path.join(self.getBuildDir(), "binary") + so_path = os.path.join(so_dir, "libbreakpad.so") + lldbutil.mkdir_p(so_dir) + self.yaml2obj("libbreakpad.yaml", so_path) + self.runCmd("platform select remote-linux --sysroot '%s'" % sysroot_path) + modules = self.get_minidump_modules("linux-arm-breakpad-uuid-match.yaml", so_path) + self.assertEqual(1, len(modules)) + # LLDB makes up its own UUID as well when there is no build ID so we + # will check that this matches. 
+ self.verify_module(modules[0], so_path, "D9C480E8") def test_facebook_hash_match(self): """ From ca8de9ad8895ab1368135f6fc63f29fe92b75c76 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 15 Nov 2020 18:16:34 -0800 Subject: [PATCH 338/363] [X86] Fix crash with i64 bitreverse on 32-bit targets with XOP. We unconditionally marked i64 as Custom, but did not install a handler in ReplaceNodeResults when i64 isn't legal type. This leads to ReplaceNodeResults asserting. We have two options to fix this. Only mark i64 as Custom on 64-bit targets and let it expand to two i32 bitreverses which each need a VPPERM. Or the other option is to add the Custom handling to ReplaceNodeResults. This is what I went with. (cherry picked from commit 57c0c4a27575840ae0a48eb9f8455a5ed087c857) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 + llvm/test/CodeGen/X86/bitreverse.ll | 198 ++++++++++++++++++++++++ 2 files changed, 205 insertions(+) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f5b704ebbe9d..56690c3c555b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30285,6 +30285,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V); return; } + case ISD::BITREVERSE: + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + assert(Subtarget.hasXOP() && "Expected XOP"); + // We can use VPPERM by copying to a vector register and back. We'll need + // to move the scalar in two i32 pieces. 
+ Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG)); + return; } } diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 343d9fb2da2d..8e2f6f9b463b 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+xop | FileCheck %s --check-prefixes=X86XOP ; These tests just check that the plumbing is in place for @llvm.bitreverse. The ; actual output is massive at the moment as llvm.bitreverse is not yet legal. @@ -75,6 +76,11 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind { ; X64-NEXT: psrlw $1, %xmm0 ; X64-NEXT: por %xmm1, %xmm0 ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) ret <2 x i16> %b } @@ -145,6 +151,14 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind { ; X64-NEXT: shrq %rdx ; X64-NEXT: leaq (%rdx,%rcx,2), %rax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i64: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: vpextrd $1, %xmm0, %edx +; X86XOP-NEXT: retl %b = call i64 @llvm.bitreverse.i64(i64 %a) ret i64 %b } @@ -195,6 +209,13 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind { ; X64-NEXT: shrl %eax ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i32: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; 
X86XOP-NEXT: retl %b = call i32 @llvm.bitreverse.i32(i32 %a) ret i32 %b } @@ -247,6 +268,14 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind { ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: shrl $8, %eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i24: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: shrl $8, %eax +; X86XOP-NEXT: retl %b = call i24 @llvm.bitreverse.i24(i24 %a) ret i24 %b } @@ -299,6 +328,14 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind { ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: # kill: def $ax killed $ax killed $eax +; X86XOP-NEXT: retl %b = call i16 @llvm.bitreverse.i16(i16 %a) ret i16 %b } @@ -342,6 +379,14 @@ define i8 @test_bitreverse_i8(i8 %a) { ; X64-NEXT: addl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: # kill: def $al killed $al killed $eax +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %b } @@ -387,6 +432,15 @@ define i4 @test_bitreverse_i4(i4 %a) { ; X64-NEXT: shrb $4, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: test_bitreverse_i4: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %eax +; X86XOP-NEXT: shrb $4, %al +; X86XOP-NEXT: # kill: def $al killed $al 
killed $eax +; X86XOP-NEXT: retl %b = call i4 @llvm.bitreverse.i4(i4 %a) ret i4 %b } @@ -404,6 +458,11 @@ define <2 x i16> @fold_v2i16() { ; X64: # %bb.0: ; X64-NEXT: movaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> ) ret <2 x i16> %b } @@ -418,6 +477,11 @@ define i24 @fold_i24() { ; X64: # %bb.0: ; X64-NEXT: movl $2048, %eax # imm = 0x800 ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_i24: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movl $2048, %eax # imm = 0x800 +; X86XOP-NEXT: retl %b = call i24 @llvm.bitreverse.i24(i24 4096) ret i24 %b } @@ -432,6 +496,11 @@ define i8 @fold_i8() { ; X64: # %bb.0: ; X64-NEXT: movb $-16, %al ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movb $-16, %al +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 15) ret i8 %b } @@ -446,6 +515,11 @@ define i4 @fold_i4() { ; X64: # %bb.0: ; X64-NEXT: movb $1, %al ; X64-NEXT: retq +; +; X86XOP-LABEL: fold_i4: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movb $1, %al +; X86XOP-NEXT: retl %b = call i4 @llvm.bitreverse.i4(i4 8) ret i4 %b } @@ -463,6 +537,11 @@ define i8 @identity_i8(i8 %a) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq +; +; X86XOP-LABEL: identity_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: movb {{[0-9]+}}(%esp), %al +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 %a) %c = call i8 @llvm.bitreverse.i8(i8 %b) ret i8 %c @@ -478,6 +557,10 @@ define <2 x i16> @identity_v2i16(<2 x i16> %a) { ; X64-LABEL: identity_v2i16: ; X64: # %bb.0: ; X64-NEXT: retq +; +; X86XOP-LABEL: identity_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a) %c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b) ret <2 x i16> %c @@ -493,6 +576,10 @@ define i8 @undef_i8() { ; 
X64-LABEL: undef_i8: ; X64: # %bb.0: ; X64-NEXT: retq +; +; X86XOP-LABEL: undef_i8: +; X86XOP: # %bb.0: +; X86XOP-NEXT: retl %b = call i8 @llvm.bitreverse.i8(i8 undef) ret i8 %b } @@ -505,6 +592,10 @@ define <2 x i16> @undef_v2i16() { ; X64-LABEL: undef_v2i16: ; X64: # %bb.0: ; X64-NEXT: retq +; +; X86XOP-LABEL: undef_v2i16: +; X86XOP: # %bb.0: +; X86XOP-NEXT: retl %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef) ret <2 x i16> %b } @@ -1122,6 +1213,113 @@ define i528 @large_promotion(i528 %A) nounwind { ; X64-NEXT: popq %r15 ; X64-NEXT: popq %rbp ; X64-NEXT: retq +; +; X86XOP-LABEL: large_promotion: +; X86XOP: # %bb.0: +; X86XOP-NEXT: pushl %ebp +; X86XOP-NEXT: pushl %ebx +; X86XOP-NEXT: pushl %edi +; X86XOP-NEXT: pushl %esi +; X86XOP-NEXT: subl $44, %esp +; X86XOP-NEXT: vmovdqa {{.*#+}} xmm0 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] +; X86XOP-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %eax +; X86XOP-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %eax +; X86XOP-NEXT: shrdl $16, %eax, %ecx +; X86XOP-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %ebp +; X86XOP-NEXT: shrdl $16, %ebp, %eax +; X86XOP-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86XOP-NEXT: vpextrd $1, %xmm1, %ebx +; X86XOP-NEXT: shrdl $16, %ebx, %ebp +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm1 +; X86XOP-NEXT: vmovd %xmm1, %esi +; X86XOP-NEXT: shrdl $16, %esi, %ebx +; X86XOP-NEXT: vpextrd $1, %xmm1, %edx +; X86XOP-NEXT: shrdl $16, %edx, %esi +; X86XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd %xmm0, %ecx +; X86XOP-NEXT: shrdl $16, %ecx, %edx +; X86XOP-NEXT: vpextrd $1, %xmm0, %edi +; X86XOP-NEXT: shrdl $16, %edi, %ecx +; X86XOP-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86XOP-NEXT: movl %ecx, 60(%eax) +; X86XOP-NEXT: movl %edx, 56(%eax) +; X86XOP-NEXT: movl %esi, 52(%eax) +; X86XOP-NEXT: movl %ebx, 48(%eax) +; X86XOP-NEXT: movl %ebp, 44(%eax) +; X86XOP-NEXT: movl 
(%esp), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 40(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 36(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 32(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 28(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 24(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 20(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 16(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 12(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 8(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, 4(%eax) +; X86XOP-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86XOP-NEXT: movl %ecx, (%eax) +; X86XOP-NEXT: shrl $16, %edi +; X86XOP-NEXT: movw %di, 64(%eax) +; X86XOP-NEXT: addl $44, %esp +; X86XOP-NEXT: popl %esi +; X86XOP-NEXT: popl %edi +; X86XOP-NEXT: popl %ebx +; X86XOP-NEXT: popl %ebp +; X86XOP-NEXT: retl $4 %Z = call i528 @llvm.bitreverse.i528(i528 %A) ret i528 %Z } From fc23bc9b30bff900cb279318e8e60050688606f9 Mon Sep 17 00:00:00 2001 From: Bruno Cardoso Lopes Date: Mon, 12 Oct 2020 15:58:52 -0700 Subject: [PATCH 339/363] [SemaTemplate] Stop passing insertion position around during VarTemplate instantiation They can get stale at use time because of updates from other recursive specializations. Instead, rely on the existence of previous declarations to add the specialization. 
Differential Revision: https://reviews.llvm.org/D87853 (cherry picked from commit cffb0dd54d41d8e249d2009467c4beb5b681ba26) This is a re-commit of 8ac709578067f77a7036fe50610277516fa36d50 with some modifications to avoid changing the clang API. --- clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 13 +++++-------- .../test/SemaTemplate/instantiate-var-template.cpp | 7 +++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index baec13ba627c..7e6efe6105bf 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -3629,8 +3629,11 @@ Decl *TemplateDeclInstantiator::VisitVarTemplateSpecializationDecl( SemaRef.Context, Owner, D->getInnerLocStart(), D->getLocation(), VarTemplate, DI->getType(), DI, D->getStorageClass(), Converted); Var->setTemplateArgsInfo(TemplateArgsInfo); - if (InsertPos) + if (!PrevDecl) { + void *InsertPos = nullptr; + VarTemplate->findSpecialization(Converted, InsertPos); VarTemplate->AddSpecialization(Var, InsertPos); + } if (SemaRef.getLangOpts().OpenCL) SemaRef.deduceOpenCLAddressSpace(Var); @@ -5311,7 +5314,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, TemplateArgs); Var = cast_or_null(Instantiator.VisitVarTemplateSpecializationDecl( VarSpec->getSpecializedTemplate(), Def, nullptr, - VarSpec->getTemplateArgsInfo(), VarSpec->getTemplateArgs().asArray())); + VarSpec->getTemplateArgsInfo(), VarSpec->getTemplateArgs().asArray(), VarSpec)); if (Var) { llvm::PointerUnion PatternPtr = @@ -5321,12 +5324,6 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, cast(Var)->setInstantiationOf( Partial, &VarSpec->getTemplateInstantiationArgs()); - // Merge the definition with the declaration. 
- LookupResult R(*this, Var->getDeclName(), Var->getLocation(), - LookupOrdinaryName, forRedeclarationInCurContext()); - R.addDecl(OldVar); - MergeVarDecl(Var, R); - // Attach the initializer. InstantiateVariableInitializer(Var, Def, TemplateArgs); } diff --git a/clang/test/SemaTemplate/instantiate-var-template.cpp b/clang/test/SemaTemplate/instantiate-var-template.cpp index b7b83e4afdd5..a24b205da596 100644 --- a/clang/test/SemaTemplate/instantiate-var-template.cpp +++ b/clang/test/SemaTemplate/instantiate-var-template.cpp @@ -40,3 +40,10 @@ namespace PR24483 { template A models; template<> struct B models<>; // expected-error {{incomplete type 'struct B'}} expected-note {{forward declaration}} } + +namespace InvalidInsertPos { + template T v; + template decltype(v) v; + template<> int v; + int k = v; +} From 934376da585156885eca2f38f021367a4981d981 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 9 Dec 2020 17:51:32 +0300 Subject: [PATCH 340/363] [NFC][InstCombine] Add test coverage for @llvm.uadd.sat canonicalization The non-strict variants are already handled because they are canonicalized to strict variants by swapping hands in both the select and icmp, and the fold simply considers that strictness is irrelevant here. But that isn't actually true for the last pattern, as PR48390 reports. 
(cherry picked from commit f16320b90b8381f2e3aac1ec17f39eff06f09ea0) --- .../InstCombine/saturating-add-sub.ll | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index 7306bd42020e..3edafa17880b 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -1211,6 +1211,17 @@ define i32 @uadd_sat(i32 %x, i32 %y) { %r = select i1 %c, i32 -1, i32 %a ret i32 %r } +define i32 @uadd_sat_nonstrict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_nonstrict( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %notx = xor i32 %x, -1 + %a = add i32 %y, %x + %c = icmp ule i32 %notx, %y + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} define i32 @uadd_sat_commute_add(i32 %xp, i32 %y) { ; CHECK-LABEL: @uadd_sat_commute_add( @@ -1239,6 +1250,19 @@ define i32 @uadd_sat_ugt(i32 %x, i32 %yp) { %r = select i1 %c, i32 -1, i32 %a ret i32 %r } +define i32 @uadd_sat_uge(i32 %x, i32 %yp) { +; CHECK-LABEL: @uadd_sat_uge( +; CHECK-NEXT: [[Y:%.*]] = sdiv i32 [[YP:%.*]], 2442 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[X:%.*]], i32 [[Y]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %y = sdiv i32 %yp, 2442 ; thwart complexity-based-canonicalization + %notx = xor i32 %x, -1 + %a = add i32 %y, %x + %c = icmp uge i32 %y, %notx + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} define <2 x i32> @uadd_sat_ugt_commute_add(<2 x i32> %xp, <2 x i32> %yp) { ; CHECK-LABEL: @uadd_sat_ugt_commute_add( @@ -1270,6 +1294,20 @@ define i32 @uadd_sat_commute_select(i32 %x, i32 %yp) { ret i32 %r } +define i32 @uadd_sat_commute_select_nonstrict(i32 %x, i32 %yp) { +; CHECK-LABEL: @uadd_sat_commute_select_nonstrict( +; CHECK-NEXT: [[Y:%.*]] = sdiv i32 [[YP:%.*]], 2442 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 
[[X:%.*]], i32 [[Y]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %y = sdiv i32 %yp, 2442 ; thwart complexity-based-canonicalization + %notx = xor i32 %x, -1 + %a = add i32 %y, %x + %c = icmp ule i32 %y, %notx + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} + define i32 @uadd_sat_commute_select_commute_add(i32 %xp, i32 %yp) { ; CHECK-LABEL: @uadd_sat_commute_select_commute_add( ; CHECK-NEXT: [[X:%.*]] = urem i32 42, [[XP:%.*]] @@ -1357,6 +1395,19 @@ define i32 @uadd_sat_not(i32 %x, i32 %y) { ret i32 %r } +define i32 @uadd_sat_not_nonstrict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_not_nonstrict( +; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %notx = xor i32 %x, -1 + %a = add i32 %notx, %y + %c = icmp ule i32 %x, %y + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} + define i32 @uadd_sat_not_commute_add(i32 %xp, i32 %yp) { ; CHECK-LABEL: @uadd_sat_not_commute_add( ; CHECK-NEXT: [[X:%.*]] = srem i32 42, [[XP:%.*]] @@ -1387,6 +1438,19 @@ define i32 @uadd_sat_not_ugt(i32 %x, i32 %y) { ret i32 %r } +define i32 @uadd_sat_not_uge(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_not_uge( +; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %notx = xor i32 %x, -1 + %a = add i32 %notx, %y + %c = icmp uge i32 %y, %x + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} + define <2 x i32> @uadd_sat_not_ugt_commute_add(<2 x i32> %x, <2 x i32> %yp) { ; CHECK-LABEL: @uadd_sat_not_ugt_commute_add( ; CHECK-NEXT: [[Y:%.*]] = sdiv <2 x i32> [[YP:%.*]], @@ -1415,6 +1479,19 @@ define i32 @uadd_sat_not_commute_select(i32 %x, i32 %y) { ret i32 %r } +define i32 @uadd_sat_not_commute_select_nonstrict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_not_commute_select_nonstrict( +; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: 
[[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %notx = xor i32 %x, -1 + %a = add i32 %notx, %y + %c = icmp ule i32 %y, %x + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} + define i32 @uadd_sat_not_commute_select_commute_add(i32 %x, i32 %yp) { ; CHECK-LABEL: @uadd_sat_not_commute_select_commute_add( ; CHECK-NEXT: [[Y:%.*]] = sdiv i32 42, [[YP:%.*]] @@ -1460,6 +1537,19 @@ define i32 @uadd_sat_not_commute_select_ugt_commute_add(i32 %x, i32 %y) { ret i32 %r } +define i32 @uadd_sat_not_commute_select_uge_commute_add(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_not_commute_select_uge_commute_add( +; CHECK-NEXT: [[NOTX:%.*]] = xor i32 [[X:%.*]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[NOTX]], i32 [[Y:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %notx = xor i32 %x, -1 + %a = add i32 %notx, %y + %c = icmp uge i32 %x, %y + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} + define i32 @uadd_sat_constant(i32 %x) { ; CHECK-LABEL: @uadd_sat_constant( ; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], 42 @@ -1700,3 +1790,91 @@ define i32 @unsigned_sat_constant_using_min_wrong_constant(i32 %x) { %r = add i32 %s, -42 ret i32 %r } + +define i32 @uadd_sat_via_add(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp ult i32 %a, %y + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} + +define i32 @uadd_sat_via_add_nonstrict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_nonstrict( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp ule i32 %a, %y + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} + +define i32 @uadd_sat_via_add_swapped_select(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_swapped_select( +; CHECK-NEXT: [[TMP1:%.*]] = 
call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp uge i32 %a, %y + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} + +define i32 @uadd_sat_via_add_swapped_select_strict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_swapped_select_strict( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp ugt i32 %a, %y + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} + +define i32 @uadd_sat_via_add_swapped_cmp(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp ugt i32 %y, %a + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} + +define i32 @uadd_sat_via_add_swapped_cmp_nonstrict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_nonstrict( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp uge i32 %y, %a + %r = select i1 %c, i32 -1, i32 %a + ret i32 %r +} + +define i32 @uadd_sat_via_add_swapped_cmp_nonstric(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_nonstric( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp ule i32 %y, %a + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} + +define i32 @uadd_sat_via_add_swapped_cmp_select_nonstrict(i32 %x, i32 %y) { +; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_select_nonstrict( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) +; CHECK-NEXT: ret i32 [[TMP1]] +; + %a = add i32 %x, %y + %c = icmp ult i32 %y, %a + %r = select i1 %c, i32 %a, i32 -1 + ret i32 %r +} From 8511a8df838f8b2766e56c22af4992d9862835fc 
Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 9 Dec 2020 18:04:08 +0300 Subject: [PATCH 341/363] [InstCombine] canonicalizeSaturatedAdd(): last fold is only valid for strict comparison (PR48390) We could create uadd.sat under incorrect circumstances if a select with -1 as the false value was canonicalized by swapping the T/F values. Unlike the other transforms in the same function, it is not invariant to equality. Some alive proofs: https://alive2.llvm.org/ce/z/emmKKL Based on original patch by David Green! Fixes https://bugs.llvm.org/show_bug.cgi?id=48390 Differential Revision: https://reviews.llvm.org/D92717 (cherry picked from commit e6f2a79d7aa01f8dd7f0194f97a50b480e8ede71) --- .../InstCombine/InstCombineSelect.cpp | 20 +++++++++------- .../InstCombine/saturating-add-sub.ll | 24 ++++++++++++------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index fa695c39cd1e..1e43014e7d32 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -782,25 +782,24 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, // Match unsigned saturated add of 2 variables with an unnecessary 'not'. // There are 8 commuted variants. - // Canonicalize -1 (saturated result) to true value of the select. Just - // swapping the compare operands is legal, because the selected value is the - // same in case of equality, so we can interchange u< and u<=. + // Canonicalize -1 (saturated result) to true value of the select. if (match(FVal, m_AllOnes())) { std::swap(TVal, FVal); - std::swap(Cmp0, Cmp1); + Pred = CmpInst::getInversePredicate(Pred); } if (!match(TVal, m_AllOnes())) return nullptr; - // Canonicalize predicate to 'ULT'. - if (Pred == ICmpInst::ICMP_UGT) { - Pred = ICmpInst::ICMP_ULT; + // Canonicalize predicate to less-than or less-or-equal-than. 
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) { std::swap(Cmp0, Cmp1); + Pred = CmpInst::getSwappedPredicate(Pred); } - if (Pred != ICmpInst::ICMP_ULT) + if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_ULE) return nullptr; // Match unsigned saturated add of 2 variables with an unnecessary 'not'. + // Strictness of the comparison is irrelevant. Value *Y; if (match(Cmp0, m_Not(m_Value(X))) && match(FVal, m_c_Add(m_Specific(X), m_Value(Y))) && Y == Cmp1) { @@ -809,6 +808,7 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, X, Y); } // The 'not' op may be included in the sum but not the compare. + // Strictness of the comparison is irrelevant. X = Cmp0; Y = Cmp1; if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) { @@ -819,7 +819,9 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1)); } // The overflow may be detected via the add wrapping round. - if (match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) && + // This is only valid for strict comparison! + if (Pred == ICmpInst::ICMP_ULT && + match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) && match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) { // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y) // ((X + Y) u< Y) ? 
-1 : (X + Y) --> uadd.sat(X, Y) diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index 3edafa17880b..ca45eb5bf429 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -1804,8 +1804,10 @@ define i32 @uadd_sat_via_add(i32 %x, i32 %y) { define i32 @uadd_sat_via_add_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_via_add_nonstrict( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[C_NOT:%.*]] = icmp ugt i32 [[A]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[C_NOT]], i32 [[A]], i32 -1 +; CHECK-NEXT: ret i32 [[R]] ; %a = add i32 %x, %y %c = icmp ule i32 %a, %y @@ -1826,8 +1828,10 @@ define i32 @uadd_sat_via_add_swapped_select(i32 %x, i32 %y) { define i32 @uadd_sat_via_add_swapped_select_strict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_via_add_swapped_select_strict( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[A]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 -1 +; CHECK-NEXT: ret i32 [[R]] ; %a = add i32 %x, %y %c = icmp ugt i32 %a, %y @@ -1848,8 +1852,10 @@ define i32 @uadd_sat_via_add_swapped_cmp(i32 %x, i32 %y) { define i32 @uadd_sat_via_add_swapped_cmp_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_nonstrict( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[C_NOT:%.*]] = icmp ugt i32 [[A]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[C_NOT]], i32 [[A]], i32 -1 +; CHECK-NEXT: ret i32 [[R]] ; %a = add i32 %x, %y 
%c = icmp uge i32 %y, %a @@ -1870,8 +1876,10 @@ define i32 @uadd_sat_via_add_swapped_cmp_nonstric(i32 %x, i32 %y) { define i32 @uadd_sat_via_add_swapped_cmp_select_nonstrict(i32 %x, i32 %y) { ; CHECK-LABEL: @uadd_sat_via_add_swapped_cmp_select_nonstrict( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[Y:%.*]], i32 [[X:%.*]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp ugt i32 [[A]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[C]], i32 [[A]], i32 -1 +; CHECK-NEXT: ret i32 [[R]] ; %a = add i32 %x, %y %c = icmp ult i32 %y, %a From 861b2a24bc49f8ad801df5035347d7073eaafee5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 11 Dec 2020 12:20:40 +0100 Subject: [PATCH 342/363] [KernelAddressSanitizer] Fix globals exclusion for indirect aliases GlobalAlias::getAliasee() may not always point directly to a GlobalVariable. In such cases, try to find the canonical GlobalVariable that the alias refers to. Link: https://github.com/ClangBuiltLinux/linux/issues/1208 Reviewed By: dvyukov, nickdesaulniers Differential Revision: https://reviews.llvm.org/D92846 (cherry picked from commit c28b18af19621e6b5cca257ef7139ba93833df0c) --- clang/test/CodeGen/asan-globals-alias.cpp | 33 ++++++++++++++++--- .../Instrumentation/AddressSanitizer.cpp | 20 +++++------ 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/clang/test/CodeGen/asan-globals-alias.cpp b/clang/test/CodeGen/asan-globals-alias.cpp index faf160ac79c9..c859d2f2b44a 100644 --- a/clang/test/CodeGen/asan-globals-alias.cpp +++ b/clang/test/CodeGen/asan-globals-alias.cpp @@ -1,17 +1,42 @@ // RUN: %clang_cc1 -triple x86_64-linux -fsanitize=address -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,ASAN +// RUN: %clang_cc1 -triple x86_64-linux -O2 -fsanitize=address -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,ASAN // RUN: %clang_cc1 -triple x86_64-linux -fsanitize=kernel-address -emit-llvm -o - %s | FileCheck 
%s --check-prefixes=CHECK,KASAN +// RUN: %clang_cc1 -triple x86_64-linux -O2 -fsanitize=kernel-address -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,KASAN // // Not all platforms support aliases - test for Linux only. -int global; // to generate ctor for at least 1 global -int aliased_global; // KASAN - ignore globals prefixed by aliases with __-prefix (below) -extern int __attribute__((alias("aliased_global"))) __global_alias; // KASAN - aliased_global ignored +int global; // generate ctor for at least 1 global +int aliased_global; // KASAN ignored +extern int __attribute__((alias("aliased_global"))) __global_alias; + +// Recursive alias: +int aliased_global_2; // KASAN ignored +extern int __attribute__((alias("aliased_global_2"))) global_alias_2; +extern int __attribute__((alias("global_alias_2"))) __global_alias_2_alias; + +// Potential indirect alias: +struct input_device_id { + unsigned long keybit[24]; + unsigned long driver_info; +}; +struct input_device_id joydev_ids[] = { { {1}, 1234 } }; // KASAN ignored +extern struct input_device_id __attribute__((alias("joydev_ids"))) __mod_joydev_ids_device_table; // ASAN: @aliased_global{{.*}} global { i32, [60 x i8] }{{.*}}, align 32 +// ASAN: @aliased_global_2{{.*}} global { i32, [60 x i8] }{{.*}}, align 32 +// ASAN: @joydev_ids{{.*}} global { {{.*}}[56 x i8] zeroinitializer }, align 32 // KASAN: @aliased_global{{.*}} global i32 +// KASAN: @aliased_global_2{{.*}} global i32 +// KASAN: @joydev_ids{{.*}} global [1 x {{.*}}i64 1234 }], align 16 + +// Check the aliases exist: +// CHECK: @__global_alias = alias +// CHECK: @global_alias_2 = alias +// CHECK: @__global_alias_2_alias = alias +// CHECK: @__mod_joydev_ids_device_table = alias // CHECK-LABEL: define internal void @asan.module_ctor -// ASAN: call void @__asan_register_globals({{.*}}, i{{32|64}} 2) +// ASAN: call void @__asan_register_globals({{.*}}, i{{32|64}} 4) // KASAN: call void @__asan_register_globals({{.*}}, i{{32|64}} 1) // CHECK-NEXT: ret 
void diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index ee09a4d9db7e..1557fad4d372 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -792,7 +792,7 @@ class ModuleAddressSanitizer { StringRef InternalSuffix); Instruction *CreateAsanModuleDtor(Module &M); - bool canInstrumentAliasedGlobal(const GlobalAlias &GA) const; + const GlobalVariable *getExcludedAliasedGlobal(const GlobalAlias &GA) const; bool shouldInstrumentGlobal(GlobalVariable *G) const; bool ShouldUseMachOGlobalsSection() const; StringRef getGlobalMetadataSection() const; @@ -1784,20 +1784,22 @@ void ModuleAddressSanitizer::createInitializerPoisonCalls( } } -bool ModuleAddressSanitizer::canInstrumentAliasedGlobal( - const GlobalAlias &GA) const { +const GlobalVariable * +ModuleAddressSanitizer::getExcludedAliasedGlobal(const GlobalAlias &GA) const { // In case this function should be expanded to include rules that do not just // apply when CompileKernel is true, either guard all existing rules with an // 'if (CompileKernel) { ... }' or be absolutely sure that all these rules // should also apply to user space. assert(CompileKernel && "Only expecting to be called when compiling kernel"); + const Constant *C = GA.getAliasee(); + // When compiling the kernel, globals that are aliased by symbols prefixed // by "__" are special and cannot be padded with a redzone. if (GA.getName().startswith("__")) - return false; + return dyn_cast(C->stripPointerCastsAndAliases()); - return true; + return nullptr; } bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const { @@ -2256,14 +2258,12 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, *CtorComdat = false; // Build set of globals that are aliased by some GA, where - // canInstrumentAliasedGlobal(GA) returns false. 
+ // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable. SmallPtrSet AliasedGlobalExclusions; if (CompileKernel) { for (auto &GA : M.aliases()) { - if (const auto *GV = dyn_cast(GA.getAliasee())) { - if (!canInstrumentAliasedGlobal(GA)) - AliasedGlobalExclusions.insert(GV); - } + if (const GlobalVariable *GV = getExcludedAliasedGlobal(GA)) + AliasedGlobalExclusions.insert(GV); } } From 700baa009dc685a0adc5f94d258be4ae24742471 Mon Sep 17 00:00:00 2001 From: Tobias Burnus Date: Fri, 11 Dec 2020 16:43:06 +0000 Subject: [PATCH 343/363] [MC][ELF] Accept abbreviated form with sh_flags and sh_entsize D73999 / commit 75af9da755721123e62b45cd0bc0c5e688a9722a added for LLVM 11 a check that sh_flags and sh_entsize (and sh_type) changes are an error, in line with GNU assembler. However, GNU assembler accepts and GCC generates an abbreviated form: while the first .section contains the flags and entsize, subsequent sections simply contain the name without repeating entsize or flags. Do likewise for better compatibility. 
See https://bugs.llvm.org/show_bug.cgi?id=48201 Reviewed By: jhenderson, MaskRay Differential Revision: https://reviews.llvm.org/D92052 (cherry picked from commit 1deff4009e0ae661b03682901bf6932297ce7ea1) --- llvm/lib/MC/MCParser/ELFAsmParser.cpp | 7 +++++-- llvm/test/MC/ELF/section-flags-changed.s | 3 +++ llvm/test/MC/ELF/section-omitted-attributes.s | 11 +++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 llvm/test/MC/ELF/section-omitted-attributes.s diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index e5ab13bc719d..fb8215ef2281 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -644,10 +644,13 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) { !(SectionName == ".eh_frame" && Type == ELF::SHT_PROGBITS)) Error(loc, "changed section type for " + SectionName + ", expected: 0x" + utohexstr(Section->getType())); - if (Section->getFlags() != Flags) + // Check that flags are used consistently. However, the GNU assembler permits + // to leave out in subsequent uses of the same sections; for compatibility, + // do likewise. 
+ if ((Flags || Size || !TypeName.empty()) && Section->getFlags() != Flags) Error(loc, "changed section flags for " + SectionName + ", expected: 0x" + utohexstr(Section->getFlags())); - if (Section->getEntrySize() != Size) + if ((Flags || Size || !TypeName.empty()) && Section->getEntrySize() != Size) Error(loc, "changed section entsize for " + SectionName + ", expected: " + Twine(Section->getEntrySize())); diff --git a/llvm/test/MC/ELF/section-flags-changed.s b/llvm/test/MC/ELF/section-flags-changed.s index 65f52cc29a6d..d2964ef046d1 100644 --- a/llvm/test/MC/ELF/section-flags-changed.s +++ b/llvm/test/MC/ELF/section-flags-changed.s @@ -9,4 +9,7 @@ foo: # CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section flags for .foo, expected: 0x6 .pushsection .foo,"a",@progbits +# CHECK: {{.*}}.s:[[# @LINE+1]]:1: error: changed section flags for .foo, expected: 0x6 +.section .foo,"",@progbits + .pushsection .foo,"ax",@progbits diff --git a/llvm/test/MC/ELF/section-omitted-attributes.s b/llvm/test/MC/ELF/section-omitted-attributes.s new file mode 100644 index 000000000000..72b7c9121387 --- /dev/null +++ b/llvm/test/MC/ELF/section-omitted-attributes.s @@ -0,0 +1,11 @@ +# RUN: llvm-mc -triple=x86_64 %s -o - | FileCheck %s + +# If section flags and other attributes are omitted, don't error. + +# CHECK: .section .foo,"aM",@progbits,1 + +.section .foo,"aM",@progbits,1 + +.section .foo + +.pushsection .foo From 200eb1abe2a1088a155f47b95dd1d35b3f37afa6 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Mon, 7 Dec 2020 11:27:01 -0800 Subject: [PATCH 344/363] Stop ExtractTypeForDeductionGuide from recursing on TypeSourceInfo As reported in PR48177, the type-deduction extraction ends up going into an infinite loop when the type referred to has a recursive definition. This stops recursing and just substitutes the type-source-info the TypeLocBuilder identified when transforming the base. 
(cherry picked from commit 1c98f984105e552daa83ed8e92c61fba0e401410) --- clang/lib/Sema/SemaTemplate.cpp | 3 +-- clang/test/AST/deduction-guides.cpp | 40 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index c05ed0b14e3e..f788cf103503 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1963,8 +1963,7 @@ class ExtractTypeForDeductionGuide TypeLocBuilder InnerTLB; QualType Transformed = TransformType(InnerTLB, OrigDecl->getTypeSourceInfo()->getTypeLoc()); - TypeSourceInfo *TSI = - TransformType(InnerTLB.getTypeSourceInfo(Context, Transformed)); + TypeSourceInfo *TSI = InnerTLB.getTypeSourceInfo(Context, Transformed); TypedefNameDecl *Decl = nullptr; diff --git a/clang/test/AST/deduction-guides.cpp b/clang/test/AST/deduction-guides.cpp index 0f5293bc063d..3a7f0bf4699e 100644 --- a/clang/test/AST/deduction-guides.cpp +++ b/clang/test/AST/deduction-guides.cpp @@ -37,3 +37,43 @@ HasDeductionGuideTypeAlias()->HasDeductionGuideTypeAlias; // CHECK: CXXDeductionGuideDecl {{.*}} implicit 'auto (HasDeductionGuideTypeAlias) -> HasDeductionGuideTypeAlias' // CHECK: CXXDeductionGuideDecl {{.*}} 'auto () -> HasDeductionGuideTypeAlias' } // namespace PR46111 + + +namespace PR48177 { + template struct Base { + using type_alias = A; + }; + template + struct Derived : Base { + using type_alias = typename Derived::type_alias; + Derived(Derived &&, typename Derived::type_alias const&); + Derived(T); + }; + + template + Derived(T, A) -> Derived; + + void init() { + Derived d {1,2}; + } +} // namespace PR48177 + +// CHECK: CXXRecordDecl {{.*}} struct Derived +// CHECK: TypeAliasDecl {{.*}} type_alias 'typename Derived::type_alias' +// CHECK-NEXT: DependentNameType {{.*}} 'typename Derived::type_alias' dependent + +// CHECK: CXXRecordDecl {{.*}} struct Derived +// CHECK: TypeAliasDecl {{.*}} type_alias 'typename Derived::type_alias':'int' +// 
CHECK-NEXT: ElaboratedType {{.*}} 'typename Derived::type_alias' sugar +// CHECK-NEXT: TypedefType {{.*}} 'PR48177::Base::type_alias' sugar +// CHECK-NEXT: TypeAlias {{.*}} 'type_alias' +// CHECK-NEXT: SubstTemplateTypeParmType {{.*}} 'int' sugar +// CHECK-NEXT: TemplateTypeParmType {{.*}} 'A' +// CHECK-NEXT: TemplateTypeParm {{.*}} 'A' +// CHECK-NEXT: BuiltinType {{.*}} 'int' + +// CHECK: CXXDeductionGuideDecl {{.*}} implicit 'auto (Derived &&, const typename Derived::type_alias &) -> Derived' +// CHECK: CXXDeductionGuideDecl {{.*}} implicit 'auto (T) -> Derived' +// CHECK: CXXDeductionGuideDecl {{.*}} implicit 'auto (Derived) -> Derived' +// CHECK: CXXDeductionGuideDecl {{.*}} 'auto (T, A) -> Derived' +// CHECK: CXXDeductionGuideDecl {{.*}} 'auto (int, int) -> Derived' From adf845300c9cc023d386d97b6ebeb1e82bd97763 Mon Sep 17 00:00:00 2001 From: Raul Tambre Date: Fri, 4 Sep 2020 19:10:09 +0300 Subject: [PATCH 345/363] [Sema] Introduce BuiltinAttr, per-declaration builtin-ness Instead of relying on whether a certain identifier is a builtin, introduce BuiltinAttr to specify a declaration as having builtin semantics. This fixes incompatible redeclarations of builtins, as reverting the identifier as being builtin due to one incompatible redeclaration would have broken rest of the builtin calls. Mostly-compatible redeclarations of builtins also no longer have builtin semantics. They don't call the builtin nor inherit their attributes. A long-standing FIXME regarding builtins inside a namespace enclosed in extern "C" not being recognized is also addressed. Due to the more correct handling attributes for builtin functions are added in more places, resulting in more useful warnings. Tests are updated to reflect that. Intrinsics without an inline definition in intrin.h had `inline` and `static` removed as they had no effect and caused them to no longer be recognized as builtins otherwise. 
A pthread_create() related test is XFAIL-ed, as it relied on it being recognized as a builtin based on its name. The builtin declaration syntax is too restrictive and doesn't allow custom structs, function pointers, etc. It seems to be the only case and fixing this would require reworking the current builtin syntax, so this seems acceptable. Fixes PR45410. Reviewed By: rsmith, yutsumi Differential Revision: https://reviews.llvm.org/D77491 --- clang/include/clang/Basic/Attr.td | 8 + clang/include/clang/Basic/Builtins.def | 1 + clang/include/clang/Basic/IdentifierTable.h | 12 -- clang/include/clang/Sema/Sema.h | 2 + clang/lib/AST/Decl.cpp | 28 +--- clang/lib/Headers/intrin.h | 145 +++++++----------- clang/lib/Sema/SemaDecl.cpp | 119 ++++++++------ clang/lib/Sema/SemaExpr.cpp | 1 + clang/lib/Sema/SemaLookup.cpp | 7 +- clang/lib/Serialization/ASTReader.cpp | 12 +- clang/lib/Serialization/ASTWriter.cpp | 6 +- clang/test/AST/ast-dump-attr.cpp | 1 + clang/test/CodeGen/builtin-redeclaration.c | 16 ++ clang/test/CodeGen/callback_pthread_create.c | 7 +- clang/test/CodeGenCXX/builtins.cpp | 14 ++ clang/test/Sema/implicit-builtin-decl.c | 5 - clang/test/Sema/warn-fortify-source.c | 19 +-- clang/test/SemaCXX/cxx11-compat.cpp | 2 +- .../SemaCXX/warn-unused-local-typedef.cpp | 4 +- 19 files changed, 193 insertions(+), 216 deletions(-) create mode 100644 clang/test/CodeGen/builtin-redeclaration.c diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 19eccf7ceadf..60eaee7839e2 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3444,3 +3444,11 @@ def ReleaseHandle : InheritableParamAttr { let Subjects = SubjectList<[ParmVar]>; let Documentation = [ReleaseHandleDocs]; } + +def Builtin : InheritableAttr { + let Spellings = []; + let Args = [UnsignedArgument<"ID">]; + let Subjects = SubjectList<[Function]>; + let SemaHandler = 0; + let Documentation = [Undocumented]; +} diff --git 
a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 1416a64543a4..01c28ebab763 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -1017,6 +1017,7 @@ LIBBUILTIN(strncasecmp, "icC*cC*z", "f", "strings.h", ALL_GNU_LANGUAGES) LIBBUILTIN(_exit, "vi", "fr", "unistd.h", ALL_GNU_LANGUAGES) LIBBUILTIN(vfork, "p", "fj", "unistd.h", ALL_LANGUAGES) // POSIX pthread.h +// FIXME: Should specify argument types. LIBBUILTIN(pthread_create, "", "fC<2,3>", "pthread.h", ALL_GNU_LANGUAGES) // POSIX setjmp.h diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index fc554a35e721..204a0f0cc0a5 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -225,18 +225,6 @@ class alignas(IdentifierInfoAlignment) IdentifierInfo { } void setObjCKeywordID(tok::ObjCKeywordKind ID) { ObjCOrBuiltinID = ID; } - /// True if setNotBuiltin() was called. - bool hasRevertedBuiltin() const { - return ObjCOrBuiltinID == tok::NUM_OBJC_KEYWORDS; - } - - /// Revert the identifier to a non-builtin identifier. We do this if - /// the name of a known builtin library function is used to declare that - /// function, but an unexpected type is specified. - void revertBuiltin() { - setBuiltinID(0); - } - /// Return a value indicating whether this is a builtin function. /// /// 0 is not-built-in. 1+ are specific builtin functions. 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 6f7ad8076718..16a7084f6b08 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -3957,6 +3957,8 @@ class Sema final { ObjCInterfaceDecl *getObjCInterfaceDecl(IdentifierInfo *&Id, SourceLocation IdLoc, bool TypoCorrection = false); + FunctionDecl *CreateBuiltin(IdentifierInfo *II, QualType Type, unsigned ID, + SourceLocation Loc); NamedDecl *LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID, Scope *S, bool ForRedeclaration, SourceLocation Loc); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 5c0a98815dd7..67490da5c584 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3162,44 +3162,24 @@ FunctionDecl *FunctionDecl::getCanonicalDecl() { return getFirstDecl(); } /// functions as their wrapped builtins. This shouldn't be done in general, but /// it's useful in Sema to diagnose calls to wrappers based on their semantics. unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const { - unsigned BuiltinID; + unsigned BuiltinID = 0; if (const auto *ABAA = getAttr()) { BuiltinID = ABAA->getBuiltinName()->getBuiltinID(); - } else { - if (!getIdentifier()) - return 0; - - BuiltinID = getIdentifier()->getBuiltinID(); + } else if (const auto *A = getAttr()) { + BuiltinID = A->getID(); } if (!BuiltinID) return 0; - ASTContext &Context = getASTContext(); - if (Context.getLangOpts().CPlusPlus) { - const auto *LinkageDecl = - dyn_cast(getFirstDecl()->getDeclContext()); - // In C++, the first declaration of a builtin is always inside an implicit - // extern "C". - // FIXME: A recognised library function may not be directly in an extern "C" - // declaration, for instance "extern "C" { namespace std { decl } }". 
- if (!LinkageDecl) { - if (BuiltinID == Builtin::BI__GetExceptionInfo && - Context.getTargetInfo().getCXXABI().isMicrosoft()) - return Builtin::BI__GetExceptionInfo; - return 0; - } - if (LinkageDecl->getLanguage() != LinkageSpecDecl::lang_c) - return 0; - } - // If the function is marked "overloadable", it has a different mangled name // and is not the C library function. if (!ConsiderWrapperFunctions && hasAttr() && !hasAttr()) return 0; + ASTContext &Context = getASTContext(); if (!Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID)) return BuiltinID; diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h index 871b47ca8267..e7b76a3bb2ed 100644 --- a/clang/lib/Headers/intrin.h +++ b/clang/lib/Headers/intrin.h @@ -57,16 +57,11 @@ void __addfsbyte(unsigned long, unsigned char); void __addfsdword(unsigned long, unsigned long); void __addfsword(unsigned long, unsigned short); void __code_seg(const char *); -static __inline__ void __cpuid(int[4], int); -static __inline__ void __cpuidex(int[4], int, int); -static __inline__ __int64 __emul(int, int); -static __inline__ unsigned __int64 __emulu(unsigned int, unsigned int); unsigned int __getcallerseflags(void); -static __inline__ void __halt(void); unsigned char __inbyte(unsigned short); void __inbytestring(unsigned short, unsigned char *, unsigned long); @@ -82,13 +77,9 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long); void __lidt(void *); unsigned __int64 __ll_lshift(unsigned __int64, int); __int64 __ll_rshift(__int64, int); -static __inline__ void __movsb(unsigned char *, unsigned char const *, size_t); -static __inline__ void __movsd(unsigned long *, unsigned long const *, size_t); -static __inline__ void __movsw(unsigned short *, unsigned short const *, size_t); -static __inline__ void __nop(void); void __nvreg_restore_fence(void); void __nvreg_save_fence(void); @@ -105,23 +96,16 @@ unsigned long __readcr4(void); unsigned long __readcr8(void); unsigned int __readdr(unsigned 
int); #ifdef __i386__ -static __inline__ unsigned char __readfsbyte(unsigned long); -static __inline__ unsigned __int64 __readfsqword(unsigned long); -static __inline__ unsigned short __readfsword(unsigned long); #endif -static __inline__ unsigned __int64 __readmsr(unsigned long); unsigned __int64 __readpmc(unsigned long); unsigned long __segmentlimit(unsigned long); void __sidt(void *); -static __inline__ void __stosb(unsigned char *, unsigned char, size_t); -static __inline__ void __stosd(unsigned long *, unsigned long, size_t); -static __inline__ void __stosw(unsigned short *, unsigned short, size_t); void __svm_clgi(void); void __svm_invlpga(void *, int); @@ -136,7 +120,6 @@ void __vmx_off(void); void __vmx_vmptrst(unsigned __int64 *); void __wbinvd(void); void __writecr0(unsigned int); -static __inline__ void __writecr3(unsigned __INTPTR_TYPE__); void __writecr4(unsigned int); void __writecr8(unsigned int); @@ -146,11 +129,8 @@ void __writefsdword(unsigned long, unsigned long); void __writefsqword(unsigned long, unsigned __int64); void __writefsword(unsigned long, unsigned short); void __writemsr(unsigned long, unsigned __int64); -static __inline__ void *_AddressOfReturnAddress(void); -static __inline__ unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); -static __inline__ unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); unsigned char _bittest(long const *, long); unsigned char _bittestandcomplement(long *, long); @@ -169,12 +149,10 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long); long _InterlockedExchangeAdd_HLERelease(long volatile *, long); __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64); __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_ReadBarrier(void); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or 
C++11 atomics instead"))) -_ReadWriteBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void); unsigned int _rorx_u32(unsigned int, const unsigned int); int _sarx_i32(int, unsigned int); #if __STDC_HOSTED__ @@ -185,9 +163,8 @@ unsigned int _shrx_u32(unsigned int, unsigned int); void _Store_HLERelease(long volatile *, long); void _Store64_HLERelease(__int64 volatile *, __int64); void _StorePointer_HLERelease(void *volatile *, void *); -static __inline__ void -__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead"))) -_WriteBarrier(void); +void __attribute__((__deprecated__( + "use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void); unsigned __int32 xbegin(void); void _xend(void); @@ -197,19 +174,14 @@ void __addgsbyte(unsigned long, unsigned char); void __addgsdword(unsigned long, unsigned long); void __addgsqword(unsigned long, unsigned __int64); void __addgsword(unsigned long, unsigned short); -static __inline__ void __faststorefence(void); void __incgsbyte(unsigned long); void __incgsdword(unsigned long); void __incgsqword(unsigned long); void __incgsword(unsigned long); -static __inline__ void __movsq(unsigned long long *, unsigned long long const *, size_t); -static __inline__ unsigned char __readgsbyte(unsigned long); -static __inline__ unsigned long __readgsdword(unsigned long); -static __inline__ unsigned __int64 __readgsqword(unsigned long); unsigned short __readgsword(unsigned long); unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, @@ -218,7 +190,6 @@ unsigned __int64 __shiftleft128(unsigned __int64 _LowPart, unsigned __int64 __shiftright128(unsigned __int64 _LowPart, unsigned __int64 _HighPart, unsigned char _Shift); -static __inline__ void __stosq(unsigned __int64 *, unsigned __int64, size_t); unsigned char __vmx_on(unsigned 
__int64 *); unsigned char __vmx_vmclear(unsigned __int64 *); @@ -269,13 +240,9 @@ unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int); __int64 _sarx_i64(__int64, unsigned int); unsigned __int64 _shlx_u64(unsigned __int64, unsigned int); unsigned __int64 _shrx_u64(unsigned __int64, unsigned int); -static __inline__ __int64 __mulh(__int64, __int64); -static __inline__ unsigned __int64 __umulh(unsigned __int64, unsigned __int64); -static __inline__ __int64 _mul128(__int64, __int64, __int64*); -static __inline__ unsigned __int64 _umul128(unsigned __int64, unsigned __int64, unsigned __int64*); @@ -284,29 +251,19 @@ unsigned __int64 _umul128(unsigned __int64, #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) -static __inline__ unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); -static __inline__ unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); #endif #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) -static __inline__ __int64 _InterlockedDecrement64(__int64 volatile *_Addend); -static __inline__ __int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value); -static __inline__ __int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value); -static __inline__ __int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value); -static __inline__ __int64 _InterlockedIncrement64(__int64 volatile *_Addend); -static __inline__ __int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask); -static __inline__ __int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask); -static __inline__ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask); #endif @@ -475,40 +432,56 @@ __int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, |* movs, stos \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) -static 
__inline__ void __DEFAULT_FN_ATTRS -__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) { +static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst, + unsigned char const *__src, + size_t __n) { __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n) : : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) { - __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); +static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst, + unsigned long const *__src, + size_t __n) { + __asm__ __volatile__("rep movsl" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) { - __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); +static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst, + unsigned short const *__src, + size_t __n) { + __asm__ __volatile__("rep movsw" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosd(unsigned long *__dst, unsigned long __x, size_t __n) { - __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x) +static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst, + unsigned long __x, + size_t __n) { + __asm__ __volatile__("rep stosl" + : "+D"(__dst), "+c"(__n) + : "a"(__x) : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosw(unsigned short *__dst, unsigned short __x, size_t __n) { - __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x) +static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst, + unsigned short __x, + size_t __n) { + __asm__ __volatile__("rep stosw" + : "+D"(__dst), "+c"(__n) + : "a"(__x) : "memory"); } #endif #ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS 
-__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) { - __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n) - : : "memory"); +static __inline__ void __DEFAULT_FN_ATTRS __movsq( + unsigned long long *__dst, unsigned long long const *__src, size_t __n) { + __asm__ __volatile__("rep movsq" + : "+D"(__dst), "+S"(__src), "+c"(__n) + : + : "memory"); } -static __inline__ void __DEFAULT_FN_ATTRS -__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { +static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst, + unsigned __int64 __x, + size_t __n) { __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x) : "memory"); } @@ -518,26 +491,25 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { |* Misc \*----------------------------------------------------------------------------*/ #if defined(__i386__) || defined(__x86_64__) -static __inline__ void __DEFAULT_FN_ATTRS -__cpuid(int __info[4], int __level) { - __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level), "c"(0)); +static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) { + __asm__("cpuid" + : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3]) + : "a"(__level), "c"(0)); } -static __inline__ void __DEFAULT_FN_ATTRS -__cpuidex(int __info[4], int __level, int __ecx) { - __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3]) - : "a"(__level), "c"(__ecx)); +static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level, + int __ecx) { + __asm__("cpuid" + : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3]) + : "a"(__level), "c"(__ecx)); } -static __inline__ void __DEFAULT_FN_ATTRS -__halt(void) { - __asm__ volatile ("hlt"); +static __inline__ void __DEFAULT_FN_ATTRS __halt(void) { + __asm__ volatile("hlt"); } #endif #if defined(__i386__) || defined(__x86_64__) || 
defined(__aarch64__) -static __inline__ void __DEFAULT_FN_ATTRS -__nop(void) { - __asm__ volatile ("nop"); +static __inline__ void __DEFAULT_FN_ATTRS __nop(void) { + __asm__ volatile("nop"); } #endif @@ -574,8 +546,7 @@ __readmsr(unsigned long __register) { } #endif -static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS -__readcr3(void) { +static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) { unsigned __LPTRINT_TYPE__ __cr3_val; __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory"); return __cr3_val; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 3e2b61ae8cdf..2703d9876f85 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2054,6 +2054,42 @@ static StringRef getHeaderName(Builtin::Context &BuiltinInfo, unsigned ID, llvm_unreachable("unhandled error kind"); } +FunctionDecl *Sema::CreateBuiltin(IdentifierInfo *II, QualType Type, + unsigned ID, SourceLocation Loc) { + DeclContext *Parent = Context.getTranslationUnitDecl(); + + if (getLangOpts().CPlusPlus) { + LinkageSpecDecl *CLinkageDecl = LinkageSpecDecl::Create( + Context, Parent, Loc, Loc, LinkageSpecDecl::lang_c, false); + CLinkageDecl->setImplicit(); + Parent->addDecl(CLinkageDecl); + Parent = CLinkageDecl; + } + + FunctionDecl *New = FunctionDecl::Create(Context, Parent, Loc, Loc, II, Type, + /*TInfo=*/nullptr, SC_Extern, false, + Type->isFunctionProtoType()); + New->setImplicit(); + New->addAttr(BuiltinAttr::CreateImplicit(Context, ID)); + + // Create Decl objects for each parameter, adding them to the + // FunctionDecl. 
+ if (const FunctionProtoType *FT = dyn_cast(Type)) { + SmallVector Params; + for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) { + ParmVarDecl *parm = ParmVarDecl::Create( + Context, New, SourceLocation(), SourceLocation(), nullptr, + FT->getParamType(i), /*TInfo=*/nullptr, SC_None, nullptr); + parm->setScopeInfo(0, i); + Params.push_back(parm); + } + New->setParams(Params); + } + + AddKnownFunctionAttributes(New); + return New; +} + /// LazilyCreateBuiltin - The specified Builtin-ID was first used at /// file scope. lazily create a decl for it. ForRedeclaration is true /// if we're creating this built-in in anticipation of redeclaring the @@ -2105,40 +2141,7 @@ NamedDecl *Sema::LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID, if (R.isNull()) return nullptr; - DeclContext *Parent = Context.getTranslationUnitDecl(); - if (getLangOpts().CPlusPlus) { - LinkageSpecDecl *CLinkageDecl = - LinkageSpecDecl::Create(Context, Parent, Loc, Loc, - LinkageSpecDecl::lang_c, false); - CLinkageDecl->setImplicit(); - Parent->addDecl(CLinkageDecl); - Parent = CLinkageDecl; - } - - FunctionDecl *New = FunctionDecl::Create(Context, - Parent, - Loc, Loc, II, R, /*TInfo=*/nullptr, - SC_Extern, - false, - R->isFunctionProtoType()); - New->setImplicit(); - - // Create Decl objects for each parameter, adding them to the - // FunctionDecl. - if (const FunctionProtoType *FT = dyn_cast(R)) { - SmallVector Params; - for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) { - ParmVarDecl *parm = - ParmVarDecl::Create(Context, New, SourceLocation(), SourceLocation(), - nullptr, FT->getParamType(i), /*TInfo=*/nullptr, - SC_None, nullptr); - parm->setScopeInfo(0, i); - Params.push_back(parm); - } - New->setParams(Params); - } - - AddKnownFunctionAttributes(New); + FunctionDecl *New = CreateBuiltin(II, R, ID, Loc); RegisterLocallyScopedExternCDecl(New, S); // TUScope is the translation-unit scope to insert this function into. 
@@ -2146,7 +2149,7 @@ NamedDecl *Sema::LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID, // relate Scopes to DeclContexts, and probably eliminate CurContext // entirely, but we're not there yet. DeclContext *SavedContext = CurContext; - CurContext = Parent; + CurContext = New->getDeclContext(); PushOnScopeChains(New, TUScope); CurContext = SavedContext; return New; @@ -3348,7 +3351,10 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, // there but not here. NewTypeInfo = NewTypeInfo.withCallingConv(OldTypeInfo.getCC()); RequiresAdjustment = true; - } else if (New->getBuiltinID()) { + } else if (Old->getBuiltinID()) { + // Builtin attribute isn't propagated to the new one yet at this point, + // so we check if the old one is a builtin. + // Calling Conventions on a Builtin aren't really useful and setting a // default calling convention and cdecl'ing some builtin redeclarations is // common, so warn and ignore the calling convention on the redeclaration. @@ -3781,18 +3787,6 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Diag(New->getLocation(), diag::warn_redecl_library_builtin) << New; Diag(OldLocation, diag::note_previous_builtin_declaration) << Old << Old->getType(); - - // If this is a global redeclaration, just forget hereafter - // about the "builtin-ness" of the function. - // - // Doing this for local extern declarations is problematic. If - // the builtin declaration remains visible, a second invalid - // local declaration will produce a hard error; if it doesn't - // remain visible, a single bogus local redeclaration (which is - // actually only a warning) could break all the downstream code. - if (!New->getLexicalDeclContext()->isFunctionOrMethod()) - New->getIdentifier()->revertBuiltin(); - return false; } @@ -9636,6 +9630,35 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, } } + // In C builtins get merged with implicitly lazily created declarations. 
+ // In C++ we need to check if it's a builtin and add the BuiltinAttr here. + if (getLangOpts().CPlusPlus) { + if (IdentifierInfo *II = Previous.getLookupName().getAsIdentifierInfo()) { + if (unsigned BuiltinID = II->getBuiltinID()) { + if (NewFD->getLanguageLinkage() == CLanguageLinkage) { + // Declarations for builtins with custom typechecking by definition + // don't make sense. Don't attempt typechecking and simply add the + // attribute. + if (Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) { + NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID)); + } else { + ASTContext::GetBuiltinTypeError Error; + QualType BuiltinType = Context.GetBuiltinType(BuiltinID, Error); + + if (!Error && !BuiltinType.isNull() && + Context.hasSameFunctionTypeIgnoringExceptionSpec( + NewFD->getType(), BuiltinType)) + NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID)); + } + } else if (BuiltinID == Builtin::BI__GetExceptionInfo && + Context.getTargetInfo().getCXXABI().isMicrosoft()) { + // FIXME: We should consider this a builtin only in the std namespace. 
+ NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID)); + } + } + } + } + ProcessPragmaWeak(S, NewFD); checkAttributesAfterMerging(*this, *NewFD); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 0b80ee613077..d301e6c732ab 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6158,6 +6158,7 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context, Params.push_back(Parm); } OverloadDecl->setParams(Params); + Sema->mergeDeclAttributes(OverloadDecl, FDecl); return OverloadDecl; } diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 5757eaf3fac0..bcbecd545398 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -894,10 +894,9 @@ bool Sema::LookupBuiltin(LookupResult &R) { Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID)) return false; - if (NamedDecl *D = LazilyCreateBuiltin((IdentifierInfo *)II, - BuiltinID, TUScope, - R.isForRedeclaration(), - R.getNameLoc())) { + if (NamedDecl *D = + LazilyCreateBuiltin(II, BuiltinID, TUScope, + R.isForRedeclaration(), R.getNameLoc())) { R.addDecl(D); return true; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 4a1a995204e5..4d439691ef83 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -908,9 +908,8 @@ ASTIdentifierLookupTraitBase::ReadKey(const unsigned char* d, unsigned n) { /// Whether the given identifier is "interesting". static bool isInterestingIdentifier(ASTReader &Reader, IdentifierInfo &II, bool IsModule) { - return II.hadMacroDefinition() || - II.isPoisoned() || - (IsModule ? 
II.hasRevertedBuiltin() : II.getObjCOrBuiltinID()) || + return II.hadMacroDefinition() || II.isPoisoned() || + (!IsModule && II.getObjCOrBuiltinID()) || II.hasRevertedTokenIDToIdentifier() || (!(IsModule && Reader.getPreprocessor().getLangOpts().CPlusPlus) && II.getFETokenInfo()); @@ -970,7 +969,6 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, unsigned Bits = endian::readNext(d); bool CPlusPlusOperatorKeyword = readBit(Bits); bool HasRevertedTokenIDToIdentifier = readBit(Bits); - bool HasRevertedBuiltin = readBit(Bits); bool Poisoned = readBit(Bits); bool ExtensionToken = readBit(Bits); bool HadMacroDefinition = readBit(Bits); @@ -984,12 +982,6 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, II->revertTokenIDToIdentifier(); if (!F.isModule()) II->setObjCOrBuiltinID(ObjCOrBuiltinID); - else if (HasRevertedBuiltin && II->getBuiltinID()) { - II->revertBuiltin(); - assert((II->hasRevertedBuiltin() || - II->getObjCOrBuiltinID() == ObjCOrBuiltinID) && - "Incorrect ObjC keyword or builtin ID"); - } assert(II->isExtensionToken() == ExtensionToken && "Incorrect extension token flag"); (void)ExtensionToken; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 2345a12caeb2..16e363fdd42c 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -3276,9 +3276,8 @@ class ASTIdentifierTableTrait { /// doesn't check whether the name has macros defined; use PublicMacroIterator /// to check that. bool isInterestingIdentifier(const IdentifierInfo *II, uint64_t MacroOffset) { - if (MacroOffset || - II->isPoisoned() || - (IsModule ? 
II->hasRevertedBuiltin() : II->getObjCOrBuiltinID()) || + if (MacroOffset || II->isPoisoned() || + (!IsModule && II->getObjCOrBuiltinID()) || II->hasRevertedTokenIDToIdentifier() || (NeedDecls && II->getFETokenInfo())) return true; @@ -3385,7 +3384,6 @@ class ASTIdentifierTableTrait { Bits = (Bits << 1) | unsigned(HadMacroDefinition); Bits = (Bits << 1) | unsigned(II->isExtensionToken()); Bits = (Bits << 1) | unsigned(II->isPoisoned()); - Bits = (Bits << 1) | unsigned(II->hasRevertedBuiltin()); Bits = (Bits << 1) | unsigned(II->hasRevertedTokenIDToIdentifier()); Bits = (Bits << 1) | unsigned(II->isCPlusPlusOperatorKeyword()); LE.write(Bits); diff --git a/clang/test/AST/ast-dump-attr.cpp b/clang/test/AST/ast-dump-attr.cpp index 95491a02f8b2..c2bd768dc2ad 100644 --- a/clang/test/AST/ast-dump-attr.cpp +++ b/clang/test/AST/ast-dump-attr.cpp @@ -119,6 +119,7 @@ namespace Test { extern "C" int printf(const char *format, ...); // CHECK: FunctionDecl{{.*}}printf // CHECK-NEXT: ParmVarDecl{{.*}}format{{.*}}'const char *' +// CHECK-NEXT: BuiltinAttr{{.*}}Implicit // CHECK-NEXT: FormatAttr{{.*}}Implicit printf 1 2 alignas(8) extern int x; diff --git a/clang/test/CodeGen/builtin-redeclaration.c b/clang/test/CodeGen/builtin-redeclaration.c new file mode 100644 index 000000000000..582907184ea5 --- /dev/null +++ b/clang/test/CodeGen/builtin-redeclaration.c @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -emit-llvm-only %s + +// PR45410 +// Ensure we mark local extern redeclarations with a different type as non-builtin. +void non_builtin() { + extern float exp(); + exp(); // Will crash due to wrong number of arguments if this calls the builtin. +} + +// PR45410 +// We mark exp() builtin as const with -fno-math-errno (default). +// We mustn't do that for extern redeclarations of builtins where the type differs. 
+float attribute() { + extern float exp(); + return exp(1); +} diff --git a/clang/test/CodeGen/callback_pthread_create.c b/clang/test/CodeGen/callback_pthread_create.c index 785440030b32..b9eb6e18bbd7 100644 --- a/clang/test/CodeGen/callback_pthread_create.c +++ b/clang/test/CodeGen/callback_pthread_create.c @@ -1,5 +1,8 @@ -// RUN: %clang_cc1 -O1 %s -S -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -O1 %s -S -emit-llvm -o - | opt -ipconstprop -S | FileCheck --check-prefix=IPCP %s +// FIXME: pthread_create() definition in Builtins.def doesn't match the real one, so it doesn't get recognized as a builtin and attributes aren't added. +// RUN: false +// XFAIL: * + +// RUN: %clang_cc1 %s -S -emit-llvm -o - -disable-llvm-optzns | FileCheck %s // CHECK: declare !callback ![[cid:[0-9]+]] {{.*}}i32 @pthread_create // CHECK: ![[cid]] = !{![[cidb:[0-9]+]]} diff --git a/clang/test/CodeGenCXX/builtins.cpp b/clang/test/CodeGenCXX/builtins.cpp index 242cba7bc14a..b0378322f97e 100644 --- a/clang/test/CodeGenCXX/builtins.cpp +++ b/clang/test/CodeGenCXX/builtins.cpp @@ -1,5 +1,19 @@ // RUN: %clang_cc1 -triple=x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s +// Builtins inside a namespace inside an extern "C" must be considered builtins. 
+extern "C" { +namespace X { +double __builtin_fabs(double); +float __builtin_fabsf(float) noexcept; +} // namespace X +} + +int o = X::__builtin_fabs(-2.0); +// CHECK: @o = global i32 2, align 4 + +long p = X::__builtin_fabsf(-3.0f); +// CHECK: @p = global i64 3, align 8 + // PR8839 extern "C" char memmove(); diff --git a/clang/test/Sema/implicit-builtin-decl.c b/clang/test/Sema/implicit-builtin-decl.c index 3a3dfa935ac1..b25e86bc03a3 100644 --- a/clang/test/Sema/implicit-builtin-decl.c +++ b/clang/test/Sema/implicit-builtin-decl.c @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: not %clang_cc1 -fsyntax-only -ast-dump %s | FileCheck %s void f() { int *ptr = malloc(sizeof(int) * 10); // expected-warning{{implicitly declaring library function 'malloc' with type}} \ @@ -63,9 +62,5 @@ extern float fmaxf(float, float); struct __jmp_buf_tag {}; void sigsetjmp(struct __jmp_buf_tag[1], int); // expected-warning{{declaration of built-in function 'sigsetjmp' requires the declaration of the 'jmp_buf' type, commonly provided in the header .}} -// CHECK: FunctionDecl {{.*}} col:6 sigsetjmp ' -// CHECK-NOT: FunctionDecl -// CHECK: ReturnsTwiceAttr {{.*}} <{{.*}}> Implicit - // PR40692 void pthread_create(); // no warning expected diff --git a/clang/test/Sema/warn-fortify-source.c b/clang/test/Sema/warn-fortify-source.c index 0f93a687f007..5ad2979bc29c 100644 --- a/clang/test/Sema/warn-fortify-source.c +++ b/clang/test/Sema/warn-fortify-source.c @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -// RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_PASS_OBJECT_SIZE // RUN: %clang_cc1 -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS // RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -// RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_PASS_OBJECT_SIZE // RUN: %clang_cc1 -xc++ -triple x86_64-apple-macosx10.14.0 %s -verify -DUSE_BUILTINS typedef unsigned long 
size_t; @@ -13,13 +11,7 @@ extern "C" { extern int sprintf(char *str, const char *format, ...); -#if defined(USE_PASS_OBJECT_SIZE) -void *memcpy(void *dst, const void *src, size_t c); -static void *memcpy(void *dst __attribute__((pass_object_size(1))), const void *src, size_t c) __attribute__((overloadable)) __asm__("merp"); -static void *memcpy(void *const dst __attribute__((pass_object_size(1))), const void *src, size_t c) __attribute__((overloadable)) { - return 0; -} -#elif defined(USE_BUILTINS) +#if defined(USE_BUILTINS) #define memcpy(x,y,z) __builtin_memcpy(x,y,z) #else void *memcpy(void *dst, const void *src, size_t c); @@ -45,14 +37,7 @@ void call_memcpy_type() { }; struct pair p; char buf[20]; - memcpy(&p.first, buf, 20); -#ifdef USE_PASS_OBJECT_SIZE - // Use the more strict checking mode on the pass_object_size attribute: - // expected-warning@-3 {{memcpy' will always overflow; destination buffer has size 4, but size argument is 20}} -#else - // Or just fallback to type 0: - // expected-warning@-6 {{memcpy' will always overflow; destination buffer has size 8, but size argument is 20}} -#endif + memcpy(&p.first, buf, 20); // expected-warning {{memcpy' will always overflow; destination buffer has size 8, but size argument is 20}} } void call_strncat() { diff --git a/clang/test/SemaCXX/cxx11-compat.cpp b/clang/test/SemaCXX/cxx11-compat.cpp index 07cd6b1fcf93..f17c900201f7 100644 --- a/clang/test/SemaCXX/cxx11-compat.cpp +++ b/clang/test/SemaCXX/cxx11-compat.cpp @@ -31,7 +31,7 @@ struct S { s = { n }, // expected-warning {{non-constant-expression cannot be narrowed from type 'int' to 'char' in initializer list in C++11}} expected-note {{explicit cast}} t = { 1234 }; // expected-warning {{constant expression evaluates to 1234 which cannot be narrowed to type 'char' in C++11}} expected-warning {{changes value}} expected-note {{explicit cast}} -#define PRIuS "uS" +#define PRIuS "zu" int printf(const char *, ...); typedef __typeof(sizeof(int)) size_t; void 
h(size_t foo, size_t bar) { diff --git a/clang/test/SemaCXX/warn-unused-local-typedef.cpp b/clang/test/SemaCXX/warn-unused-local-typedef.cpp index 7e893ba506a5..554ea37eeb28 100644 --- a/clang/test/SemaCXX/warn-unused-local-typedef.cpp +++ b/clang/test/SemaCXX/warn-unused-local-typedef.cpp @@ -67,10 +67,10 @@ int printf(char const *, ...); void test() { typedef signed long int superint; // no diag - printf("%f", (superint) 42); + printf("%ld", (superint)42); typedef signed long int superint2; // no diag - printf("%f", static_cast(42)); + printf("%ld", static_cast(42)); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-local-typedef" From fdab756331f322a9818c1bdf14d23d9cd6036c81 Mon Sep 17 00:00:00 2001 From: Raul Tambre Date: Fri, 18 Sep 2020 20:07:05 +0300 Subject: [PATCH 346/363] [Sema] Handle objc_super special lookup when checking builtin compatibility objc_super is special and needs LookupPredefedObjCSuperType() called before performing builtin type comparisons. This fixes an error when compiling macOS headers. A test is added. 
Differential Revision: https://reviews.llvm.org/D87917 --- clang/lib/Sema/SemaDecl.cpp | 1 + clang/test/SemaObjCXX/builtin-objcsuper.mm | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 clang/test/SemaObjCXX/builtin-objcsuper.mm diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 2703d9876f85..82c78e3003f0 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9643,6 +9643,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID)); } else { ASTContext::GetBuiltinTypeError Error; + LookupPredefedObjCSuperType(*this, S, NewFD->getIdentifier()); QualType BuiltinType = Context.GetBuiltinType(BuiltinID, Error); if (!Error && !BuiltinType.isNull() && diff --git a/clang/test/SemaObjCXX/builtin-objcsuper.mm b/clang/test/SemaObjCXX/builtin-objcsuper.mm new file mode 100644 index 000000000000..a6baf3c5165a --- /dev/null +++ b/clang/test/SemaObjCXX/builtin-objcsuper.mm @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -verify %s +// expected-no-diagnostics + +// objc_super has special lookup rules for compatibility with macOS headers, so +// the following should compile. +struct objc_super {}; +extern "C" id objc_msgSendSuper(struct objc_super *super, SEL op, ...); +extern "C" void objc_msgSendSuper_stret(struct objc_super *super, SEL op, ...); From 0312bec0d7573b0549a222fb5d0a695fcf819dc3 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Tue, 29 Sep 2020 15:20:11 -0700 Subject: [PATCH 347/363] Recognize setjmp and friends as builtins even if jmp_buf is not declared yet. This happens in glibc's headers. It's important that we recognize these functions so that we can mark them as returns_twice. 
Differential Revision: https://reviews.llvm.org/D88518 --- clang/include/clang/Basic/Builtins.def | 25 +++++++------- clang/include/clang/Basic/Builtins.h | 7 ++++ clang/lib/Sema/SemaDecl.cpp | 23 +++++++------ clang/test/CodeGen/setjmp.c | 44 +++++++++++++++++++++++++ clang/test/Sema/builtin-setjmp.c | 44 +++++++++++++++++++++---- clang/test/Sema/implicit-builtin-decl.c | 5 ++- 6 files changed, 116 insertions(+), 32 deletions(-) create mode 100644 clang/test/CodeGen/setjmp.c diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 01c28ebab763..dbb5dde20a2b 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -75,6 +75,9 @@ // U -> pure // c -> const // t -> signature is meaningless, use custom typechecking +// T -> type is not important to semantic analysis and codegen; recognize as +// builtin even if type doesn't match signature, and don't warn if we +// can't be sure the type is right // F -> this is a libc/libm function with a '__builtin_' prefix added. // f -> this is a libc/libm function without the '__builtin_' prefix. It can // be followed by ':headername:' to state which header this function @@ -893,7 +896,7 @@ LANGBUILTIN(__va_start, "vc**.", "nt", ALL_MS_LANGUAGES) LANGBUILTIN(__fastfail, "vUi", "nr", ALL_MS_LANGUAGES) // Microsoft library builtins. -LIBBUILTIN(_setjmpex, "iJ", "fj", "setjmpex.h", ALL_MS_LANGUAGES) +LIBBUILTIN(_setjmpex, "iJ", "fjT", "setjmpex.h", ALL_MS_LANGUAGES) // C99 library functions // C99 stdarg.h @@ -987,8 +990,8 @@ LIBBUILTIN(wmemmove,"w*w*wC*z", "f", "wchar.h", ALL_LANGUAGES) // In some systems setjmp is a macro that expands to _setjmp. We undefine // it here to avoid having two identical LIBBUILTIN entries. 
#undef setjmp -LIBBUILTIN(setjmp, "iJ", "fj", "setjmp.h", ALL_LANGUAGES) -LIBBUILTIN(longjmp, "vJi", "fr", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(setjmp, "iJ", "fjT", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(longjmp, "vJi", "frT", "setjmp.h", ALL_LANGUAGES) // Non-C library functions, active in GNU mode only. // Functions with (returns_twice) attribute (marked as "j") are still active in @@ -1015,21 +1018,21 @@ LIBBUILTIN(strcasecmp, "icC*cC*", "f", "strings.h", ALL_GNU_LANGUAGES) LIBBUILTIN(strncasecmp, "icC*cC*z", "f", "strings.h", ALL_GNU_LANGUAGES) // POSIX unistd.h LIBBUILTIN(_exit, "vi", "fr", "unistd.h", ALL_GNU_LANGUAGES) -LIBBUILTIN(vfork, "p", "fj", "unistd.h", ALL_LANGUAGES) +LIBBUILTIN(vfork, "p", "fjT", "unistd.h", ALL_LANGUAGES) // POSIX pthread.h // FIXME: Should specify argument types. LIBBUILTIN(pthread_create, "", "fC<2,3>", "pthread.h", ALL_GNU_LANGUAGES) // POSIX setjmp.h -LIBBUILTIN(_setjmp, "iJ", "fj", "setjmp.h", ALL_LANGUAGES) -LIBBUILTIN(__sigsetjmp, "iSJi", "fj", "setjmp.h", ALL_LANGUAGES) -LIBBUILTIN(sigsetjmp, "iSJi", "fj", "setjmp.h", ALL_LANGUAGES) -LIBBUILTIN(savectx, "iJ", "fj", "setjmp.h", ALL_LANGUAGES) -LIBBUILTIN(getcontext, "iK*", "fj", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(_setjmp, "iJ", "fjT", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(__sigsetjmp, "iSJi", "fjT", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(sigsetjmp, "iSJi", "fjT", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(savectx, "iJ", "fjT", "setjmp.h", ALL_LANGUAGES) +LIBBUILTIN(getcontext, "iK*", "fjT", "setjmp.h", ALL_LANGUAGES) -LIBBUILTIN(_longjmp, "vJi", "fr", "setjmp.h", ALL_GNU_LANGUAGES) -LIBBUILTIN(siglongjmp, "vSJi", "fr", "setjmp.h", ALL_GNU_LANGUAGES) +LIBBUILTIN(_longjmp, "vJi", "frT", "setjmp.h", ALL_GNU_LANGUAGES) +LIBBUILTIN(siglongjmp, "vSJi", "frT", "setjmp.h", ALL_GNU_LANGUAGES) // non-standard but very common LIBBUILTIN(strlcpy, "zc*cC*z", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(strlcat, "zc*cC*z", "f", "string.h", ALL_GNU_LANGUAGES) diff --git 
a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h index e4ed482d9068..15bfcf797917 100644 --- a/clang/include/clang/Basic/Builtins.h +++ b/clang/include/clang/Basic/Builtins.h @@ -158,6 +158,13 @@ class Context { return strchr(getRecord(ID).Attributes, 't') != nullptr; } + /// Determines whether a declaration of this builtin should be recognized + /// even if the type doesn't match the specified signature. + bool allowTypeMismatch(unsigned ID) const { + return strchr(getRecord(ID).Attributes, 'T') != nullptr || + hasCustomTypechecking(ID); + } + /// Determines whether this builtin has a result or any arguments which /// are pointer types. bool hasPtrArgsOrResult(unsigned ID) const { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 82c78e3003f0..5b0417fa8859 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2107,7 +2107,8 @@ NamedDecl *Sema::LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID, // If we have a builtin without an associated type we should not emit a // warning when we were not able to find a type for it. 
- if (Error == ASTContext::GE_Missing_type) + if (Error == ASTContext::GE_Missing_type || + Context.BuiltinInfo.allowTypeMismatch(ID)) return nullptr; // If we could not find a type for setjmp it is because the jmp_buf type was @@ -2131,11 +2132,9 @@ NamedDecl *Sema::LazilyCreateBuiltin(IdentifierInfo *II, unsigned ID, Context.BuiltinInfo.isHeaderDependentFunction(ID))) { Diag(Loc, diag::ext_implicit_lib_function_decl) << Context.BuiltinInfo.getName(ID) << R; - if (Context.BuiltinInfo.getHeaderName(ID) && - !Diags.isIgnored(diag::ext_implicit_lib_function_decl, Loc)) + if (const char *Header = Context.BuiltinInfo.getHeaderName(ID)) Diag(Loc, diag::note_include_header_or_declare) - << Context.BuiltinInfo.getHeaderName(ID) - << Context.BuiltinInfo.getName(ID); + << Header << Context.BuiltinInfo.getName(ID); } if (R.isNull()) @@ -9630,16 +9629,16 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, } } - // In C builtins get merged with implicitly lazily created declarations. - // In C++ we need to check if it's a builtin and add the BuiltinAttr here. - if (getLangOpts().CPlusPlus) { + // If this is the first declaration of a library builtin function, add + // attributes as appropriate. + if (!D.isRedeclaration() && + NewFD->getDeclContext()->getRedeclContext()->isFileContext()) { if (IdentifierInfo *II = Previous.getLookupName().getAsIdentifierInfo()) { if (unsigned BuiltinID = II->getBuiltinID()) { if (NewFD->getLanguageLinkage() == CLanguageLinkage) { - // Declarations for builtins with custom typechecking by definition - // don't make sense. Don't attempt typechecking and simply add the - // attribute. - if (Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) { + // Validate the type matches unless this builtin is specified as + // matching regardless of its declared type. 
+ if (Context.BuiltinInfo.allowTypeMismatch(BuiltinID)) { NewFD->addAttr(BuiltinAttr::CreateImplicit(Context, BuiltinID)); } else { ASTContext::GetBuiltinTypeError Error; diff --git a/clang/test/CodeGen/setjmp.c b/clang/test/CodeGen/setjmp.c new file mode 100644 index 000000000000..4ca360d8584c --- /dev/null +++ b/clang/test/CodeGen/setjmp.c @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -x c %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s + +#ifdef __cplusplus +extern "C" { +#endif + +struct __jmp_buf_tag { int n; }; +int setjmp(struct __jmp_buf_tag*); +int sigsetjmp(struct __jmp_buf_tag*, int); +int _setjmp(struct __jmp_buf_tag*); +int __sigsetjmp(struct __jmp_buf_tag*, int); + +typedef struct __jmp_buf_tag jmp_buf[1]; +typedef struct __jmp_buf_tag sigjmp_buf[1]; + +#ifdef __cplusplus +} +#endif + +void f() { + jmp_buf jb; + // CHECK: call {{.*}}@setjmp( + setjmp(jb); + // CHECK: call {{.*}}@sigsetjmp( + sigsetjmp(jb, 0); + // CHECK: call {{.*}}@_setjmp( + _setjmp(jb); + // CHECK: call {{.*}}@__sigsetjmp( + __sigsetjmp(jb, 0); +} + +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @setjmp( + +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @sigsetjmp( + +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @_setjmp( + +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @__sigsetjmp( + diff --git a/clang/test/Sema/builtin-setjmp.c b/clang/test/Sema/builtin-setjmp.c index f8770d88e731..6a114fad05d9 100644 --- a/clang/test/Sema/builtin-setjmp.c +++ b/clang/test/Sema/builtin-setjmp.c @@ -1,10 +1,42 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DNO_JMP_BUF %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DNO_JMP_BUF %s -ast-dump | FileCheck %s +// RUN: 
%clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DONLY_JMP_BUF %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DNO_SETJMP %s -ast-dump 2>&1 | FileCheck %s #ifdef NO_JMP_BUF -extern long setjmp(long *); // expected-warning {{declaration of built-in function 'setjmp' requires the declaration of the 'jmp_buf' type, commonly provided in the header .}} -#else +// This happens in some versions of glibc: the declaration of __sigsetjmp +// precedes the declaration of sigjmp_buf. +extern long setjmp(long *); // Can't check, so we trust that this is the right type +// FIXME: We could still diagnose the missing `jmp_buf` at the point of the call. +// expected-no-diagnostics +#elif WRONG_JMP_BUF typedef long jmp_buf; -extern int setjmp(char); // expected-warning@8 {{incompatible redeclaration of library function 'setjmp'}} - // expected-note@8 {{'setjmp' is a builtin with type 'int (jmp_buf)' (aka 'int (long)')}} +extern int setjmp(char); // expected-warning {{incompatible redeclaration of library function 'setjmp'}} + // expected-note@-1 {{'setjmp' is a builtin with type 'int (jmp_buf)' (aka 'int (long)')}} +#elif RIGHT_JMP_BUF +typedef long jmp_buf; +extern int setjmp(long); // OK, right type. 
+// expected-no-diagnostics +#elif ONLY_JMP_BUF +typedef int *jmp_buf; #endif + +void use() { + setjmp(0); + #ifdef NO_SETJMP + // expected-warning@-2 {{implicit declaration of function 'setjmp' is invalid in C99}} + #elif ONLY_JMP_BUF + // expected-warning@-4 {{implicitly declaring library function 'setjmp' with type 'int (jmp_buf)' (aka 'int (int *)')}} + // expected-note@-5 {{include the header or explicitly provide a declaration for 'setjmp'}} + #endif + + #ifdef NO_SETJMP + // In this case, the regular AST dump doesn't dump the implicit declaration of 'setjmp'. + #pragma clang __debug dump setjmp + #endif +} + +// CHECK: FunctionDecl {{.*}} used setjmp +// CHECK: BuiltinAttr {{.*}} Implicit +// CHECK: ReturnsTwiceAttr {{.*}} Implicit diff --git a/clang/test/Sema/implicit-builtin-decl.c b/clang/test/Sema/implicit-builtin-decl.c index b25e86bc03a3..9434b507a3af 100644 --- a/clang/test/Sema/implicit-builtin-decl.c +++ b/clang/test/Sema/implicit-builtin-decl.c @@ -54,13 +54,12 @@ main(int argc, char *argv[]) void snprintf() { } -// PR8316 & PR40692 -void longjmp(); // expected-warning{{declaration of built-in function 'longjmp' requires the declaration of the 'jmp_buf' type, commonly provided in the header .}} +void longjmp(); extern float fmaxf(float, float); struct __jmp_buf_tag {}; -void sigsetjmp(struct __jmp_buf_tag[1], int); // expected-warning{{declaration of built-in function 'sigsetjmp' requires the declaration of the 'jmp_buf' type, commonly provided in the header .}} +void sigsetjmp(struct __jmp_buf_tag[1], int); // PR40692 void pthread_create(); // no warning expected From 98f575ff55bff1e4128ffaeef4d05c356d996ab9 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Fri, 2 Oct 2020 13:34:46 -0700 Subject: [PATCH 348/363] Don't reject calls to MinGW's unusual _setjmp declaration. We now recognize this function as a builtin despite it having an unexpected number of parameters; make sure we don't enforce that it has only 1 argument for its 2 parameters. 
--- clang/include/clang/Basic/Builtins.def | 1 + clang/lib/CodeGen/CGBuiltin.cpp | 6 ++- clang/lib/Sema/SemaChecking.cpp | 5 -- clang/test/Sema/builtin-setjmp.c | 63 +++++++++++++++++++------- 4 files changed, 52 insertions(+), 23 deletions(-) diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index dbb5dde20a2b..5463b7dfc18c 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -1025,6 +1025,7 @@ LIBBUILTIN(pthread_create, "", "fC<2,3>", "pthread.h", ALL_GNU_LANGUAGES) // POSIX setjmp.h +// FIXME: MinGW _setjmp has an additional void* parameter. LIBBUILTIN(_setjmp, "iJ", "fjT", "setjmp.h", ALL_LANGUAGES) LIBBUILTIN(__sigsetjmp, "iSJi", "fjT", "setjmp.h", ALL_LANGUAGES) LIBBUILTIN(sigsetjmp, "iSJi", "fjT", "setjmp.h", ALL_LANGUAGES) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 8994b939093e..6b93f1b60af5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3754,11 +3754,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI_abnormal_termination: return RValue::get(EmitSEHAbnormalTermination()); case Builtin::BI_setjmpex: - if (getTarget().getTriple().isOSMSVCRT()) + if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 && + E->getArg(0)->getType()->isPointerType()) return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E); break; case Builtin::BI_setjmp: - if (getTarget().getTriple().isOSMSVCRT()) { + if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 && + E->getArg(0)->getType()->isPointerType()) { if (getTarget().getTriple().getArch() == llvm::Triple::x86) return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E); else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index b00d2ff5f1d5..1bf04d9cb4f2 100644 --- 
a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1573,11 +1573,6 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, if (SemaBuiltinSetjmp(TheCall)) return ExprError(); break; - case Builtin::BI_setjmp: - case Builtin::BI_setjmpex: - if (checkArgCount(*this, TheCall, 1)) - return true; - break; case Builtin::BI__builtin_classify_type: if (checkArgCount(*this, TheCall, 1)) return true; TheCall->setType(Context.IntTy); diff --git a/clang/test/Sema/builtin-setjmp.c b/clang/test/Sema/builtin-setjmp.c index 6a114fad05d9..604d534eb504 100644 --- a/clang/test/Sema/builtin-setjmp.c +++ b/clang/test/Sema/builtin-setjmp.c @@ -1,34 +1,47 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DNO_JMP_BUF %s -ast-dump | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DONLY_JMP_BUF %s -ast-dump | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify -DNO_SETJMP %s -ast-dump 2>&1 | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DNO_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DONLY_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=c,expected -DNO_SETJMP 
%s -ast-dump 2>&1 | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DNO_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DWRONG_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DRIGHT_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK1,CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DONLY_JMP_BUF %s -ast-dump | FileCheck %s --check-prefixes=CHECK2 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fsyntax-only -verify=cxx,expected -x c++ -DNO_SETJMP %s -ast-dump | FileCheck %s --check-prefixes=CHECK2 + +#ifdef __cplusplus +extern "C" { +#endif #ifdef NO_JMP_BUF // This happens in some versions of glibc: the declaration of __sigsetjmp // precedes the declaration of sigjmp_buf. extern long setjmp(long *); // Can't check, so we trust that this is the right type // FIXME: We could still diagnose the missing `jmp_buf` at the point of the call. -// expected-no-diagnostics +// c-no-diagnostics #elif WRONG_JMP_BUF typedef long jmp_buf; -extern int setjmp(char); // expected-warning {{incompatible redeclaration of library function 'setjmp'}} - // expected-note@-1 {{'setjmp' is a builtin with type 'int (jmp_buf)' (aka 'int (long)')}} +// FIXME: Consider producing a similar warning in C++. +extern int setjmp(char); // c-warning {{incompatible redeclaration of library function 'setjmp'}} + // c-note@-1 {{'setjmp' is a builtin with type 'int (jmp_buf)' (aka 'int (long)')}} #elif RIGHT_JMP_BUF typedef long jmp_buf; extern int setjmp(long); // OK, right type. 
-// expected-no-diagnostics #elif ONLY_JMP_BUF typedef int *jmp_buf; #endif void use() { setjmp(0); - #ifdef NO_SETJMP - // expected-warning@-2 {{implicit declaration of function 'setjmp' is invalid in C99}} + #if NO_SETJMP + // cxx-error@-2 {{undeclared identifier 'setjmp'}} + // c-warning@-3 {{implicit declaration of function 'setjmp' is invalid in C99}} #elif ONLY_JMP_BUF - // expected-warning@-4 {{implicitly declaring library function 'setjmp' with type 'int (jmp_buf)' (aka 'int (int *)')}} - // expected-note@-5 {{include the header or explicitly provide a declaration for 'setjmp'}} + // cxx-error@-5 {{undeclared identifier 'setjmp'}} + // c-warning@-6 {{implicitly declaring library function 'setjmp' with type 'int (jmp_buf)' (aka 'int (int *)')}} + // c-note@-7 {{include the header or explicitly provide a declaration for 'setjmp'}} + #else + // cxx-no-diagnostics #endif #ifdef NO_SETJMP @@ -37,6 +50,24 @@ void use() { #endif } -// CHECK: FunctionDecl {{.*}} used setjmp -// CHECK: BuiltinAttr {{.*}} Implicit -// CHECK: ReturnsTwiceAttr {{.*}} Implicit +// CHECK1: FunctionDecl {{.*}} used setjmp +// CHECK1: BuiltinAttr {{.*}} Implicit +// CHECK1: ReturnsTwiceAttr {{.*}} Implicit + +// mingw declares _setjmp with an unusual signature. 
+int _setjmp(void *, void *); +#if !defined(NO_JMP_BUF) && !defined(NO_SETJMP) +// c-warning@-2 {{incompatible redeclaration of library function '_setjmp'}} +// c-note@-3 {{'_setjmp' is a builtin with type 'int (jmp_buf)'}} +#endif +void use_mingw() { + _setjmp(0, 0); +} + +// CHECK2: FunctionDecl {{.*}} used _setjmp +// CHECK2: BuiltinAttr {{.*}} Implicit +// CHECK2: ReturnsTwiceAttr {{.*}} Implicit + +#ifdef __cplusplus +} +#endif From f684355e0292b8e24a0870ecfda126fb15c9eb93 Mon Sep 17 00:00:00 2001 From: Aleksandr Platonov Date: Wed, 26 Aug 2020 22:10:35 +0300 Subject: [PATCH 349/363] [Support][Windows] Fix incorrect GetFinalPathNameByHandleW() return value check in realPathFromHandle() `GetFinalPathNameByHandleW(,,N,)` returns: - `< N` on success (this value does not include the size of the terminating null character) - `>= N` if buffer is too small (this value includes the size of the terminating null character) So, when `N == Buffer.capacity() - 1`, we need to resize buffer if return value is > `Buffer.capacity() - 2`. Also, we can set `N` to `Buffer.capacity()`. Thus, without this patch `realPathFromHandle()` returns unfilled buffer when length of the final path of the file is equal to `Buffer.capacity()` or `Buffer.capacity() - 1`. 
Reviewed By: andrewng, amccarth Differential Revision: https://reviews.llvm.org/D86564 (cherry picked from commit ceffd6993c350b57f43cec3b6371b159fc4a3149) --- llvm/lib/Support/Windows/Path.inc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index e352beb77616..365ab01c0a16 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -19,7 +19,6 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/WindowsError.h" #include -#include #include #include @@ -352,13 +351,13 @@ std::error_code is_local(const Twine &path, bool &result) { static std::error_code realPathFromHandle(HANDLE H, SmallVectorImpl &Buffer) { DWORD CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.begin(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED); - if (CountChars > Buffer.capacity()) { + H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); + if (CountChars && CountChars >= Buffer.capacity()) { // The buffer wasn't big enough, try again. In this case the return value // *does* indicate the size of the null terminator. Buffer.reserve(CountChars); CountChars = ::GetFinalPathNameByHandleW( - H, Buffer.data(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED); + H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED); } if (CountChars == 0) return mapWindowsError(GetLastError()); From 6ec777c2f6496b4fe1d78cc6d6871a3dc931a185 Mon Sep 17 00:00:00 2001 From: Ronald Wampler Date: Mon, 28 Sep 2020 17:07:44 -0400 Subject: [PATCH 350/363] [Support] PR42623: Avoid setting the delete-on-close bit if a TempFile doesn't reside on a local drive On Windows, after commit 881ba104656c40098d4bc90c52613c08136f0fe1, tools using TempFile would error with "bad file descriptor" when writing the file on a network drive. 
It appears that setting the delete-on-close bit via SetFileInformationByHandle/FileDispositionInfo prevented it from accessing the file on network drives, and although using FILE_DISPOSITION_INFO seems to work, it causes other troubles. Differential Revision: https://reviews.llvm.org/D81803 (cherry picked from commit 79657e2339b58bc01fe1b85a448bb073d57d90bb) --- llvm/lib/Support/Windows/Path.inc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 365ab01c0a16..a4ffc0ec4313 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -402,6 +402,20 @@ std::error_code is_local(int FD, bool &Result) { } static std::error_code setDeleteDisposition(HANDLE Handle, bool Delete) { + // First, check if the file is on a network (non-local) drive. If so, don't + // set DeleteFile to true, since it prevents opening the file for writes. + SmallVector FinalPath; + if (std::error_code EC = realPathFromHandle(Handle, FinalPath)) + return EC; + + bool IsLocal; + if (std::error_code EC = is_local_internal(FinalPath, IsLocal)) + return EC; + + if (!IsLocal) + return std::error_code(); + + // The file is on a local drive, set the DeleteFile to true. FILE_DISPOSITION_INFO Disposition; Disposition.DeleteFile = Delete; if (!SetFileInformationByHandle(Handle, FileDispositionInfo, &Disposition, From 9e16c5bfae6e8d3cbec74376c2e734e3ff4ba11b Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Fri, 13 Nov 2020 23:08:47 -0800 Subject: [PATCH 351/363] [AArch64][GlobalISel] Look through a G_ZEXT when trying to match shift-extended register offsets. The G_ZEXT in these cases seems to actually come from a combine that we do but SelectionDAG doesn't. Looking through it allows us to match "uxtw #2" addressing modes. 
Differential Revision: https://reviews.llvm.org/D91475 (cherry picked from commit 0b6090699ab542cde24be1565b4d97dbad153cba) --- .../GISel/AArch64InstructionSelector.cpp | 37 +++++++++++++------ .../GlobalISel/load-wro-addressing-modes.mir | 36 ++++++++++++++++++ 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 408f0cb77e73..90bab9603245 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -4904,9 +4904,19 @@ AArch64InstructionSelector::selectExtendedSHL( return None; unsigned OffsetOpc = OffsetInst->getOpcode(); - if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) - return None; + bool LookedThroughZExt = false; + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) { + // Try to look through a ZEXT. + if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt) + return None; + + OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg()); + OffsetOpc = OffsetInst->getOpcode(); + LookedThroughZExt = true; + if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) + return None; + } // Make sure that the memory op is a valid size. int64_t LegalShiftVal = Log2_32(SizeInBytes); if (LegalShiftVal == 0) @@ -4957,20 +4967,23 @@ AArch64InstructionSelector::selectExtendedSHL( unsigned SignExtend = 0; if (WantsExt) { - // Check if the offset is defined by an extend. - MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); - auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); - if (Ext == AArch64_AM::InvalidShiftExtend) - return None; + // Check if the offset is defined by an extend, unless we looked through a + // G_ZEXT earlier. 
+ if (!LookedThroughZExt) { + MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); + auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; - SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; - // We only support SXTW for signed extension here. - if (SignExtend && Ext != AArch64_AM::SXTW) - return None; + SignExtend = isSignExtendShiftType(Ext) ? 1 : 0; + // We only support SXTW for signed extension here. + if (SignExtend && Ext != AArch64_AM::SXTW) + return None; + OffsetReg = ExtInst->getOperand(1).getReg(); + } // Need a 32-bit wide register here. MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); - OffsetReg = ExtInst->getOperand(1).getReg(); OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-wro-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-wro-addressing-modes.mir index 6b4b51d37ca8..8efd7648eed9 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/load-wro-addressing-modes.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-wro-addressing-modes.mir @@ -428,3 +428,39 @@ body: | $x1 = COPY %load(s64) RET_ReallyLR implicit $x1 ... +--- +name: zext_shl_LDRWroW +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$w0' } + - { reg: '$x1' } +body: | + bb.1: + liveins: $w0, $x1 + + ; We try to look through the G_ZEXT of the SHL here. 
+ + ; CHECK-LABEL: name: zext_shl_LDRWroW + ; CHECK: liveins: $w0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY $x1 + ; CHECK: [[ANDWri:%[0-9]+]]:gpr32common = ANDWri [[COPY]], 7 + ; CHECK: [[LDRWroW:%[0-9]+]]:gpr32 = LDRWroW [[COPY1]], [[ANDWri]], 0, 1 :: (load 4) + ; CHECK: $w0 = COPY [[LDRWroW]] + ; CHECK: RET_ReallyLR implicit $w0 + %0:gpr(s32) = COPY $w0 + %1:gpr(p0) = COPY $x1 + %2:gpr(s32) = G_CONSTANT i32 255 + %3:gpr(s32) = G_AND %0, %2 + %13:gpr(s64) = G_CONSTANT i64 2 + %12:gpr(s32) = G_SHL %3, %13(s64) + %6:gpr(s64) = G_ZEXT %12(s32) + %7:gpr(p0) = G_PTR_ADD %1, %6(s64) + %9:gpr(s32) = G_LOAD %7(p0) :: (load 4) + $w0 = COPY %9(s32) + RET_ReallyLR implicit $w0 + +... From 280e47ea0e837b809be03f2048ac8abc14dbc387 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Fri, 4 Dec 2020 15:51:44 -0800 Subject: [PATCH 352/363] [AArch64][GlobalISel] Narrow 128-bit regs to 64-bit regs in emitTestBit When we have a 128-bit register, emitTestBit would incorrectly narrow to 32 bits always. If the bit number was > 32, then we would need a TB(N)ZX. This would cause a crash, as we'd have the wrong register class. (PR48379) This generalizes `narrowExtReg` into `moveScalarRegClass`. This also allows us to remove `widenGPRBankRegIfNeeded` entirely, since `selectCopy` correctly handles SUBREG_TO_REG etc. This does create some codegen changes (since `selectCopy` uses the `all` regclass variants). However, I think that these will likely be optimized away, and we can always improve the `selectCopy` code. It looks like we should revisit `selectCopy` at this point, and possibly refactor it into at least one `emit` function. 
Differential Revision: https://reviews.llvm.org/D92707 (cherry picked from commit 195a7af0abb26915f962462f69c0f17e3835f78b) --- .../GISel/AArch64InstructionSelector.cpp | 97 +++++-------------- .../GlobalISel/opt-fold-ext-tbz-tbnz.mir | 5 +- .../AArch64/GlobalISel/subreg-copy.mir | 32 ++++++ .../GlobalISel/widen-narrow-tbz-tbnz.mir | 10 +- 4 files changed, 67 insertions(+), 77 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 90bab9603245..7733fe7f7b24 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -289,14 +289,15 @@ class AArch64InstructionSelector : public InstructionSelector { getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore = false) const; - /// Instructions that accept extend modifiers like UXTW expect the register - /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a - /// subregister copy if necessary. Return either ExtReg, or the result of the - /// new copy. - Register narrowExtendRegIfNeeded(Register ExtReg, - MachineIRBuilder &MIB) const; - Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size, - MachineIRBuilder &MIB) const; + /// Move \p Reg to \p RC if \p Reg is not already on \p RC. + /// + /// \returns Either \p Reg if no change was necessary, or the new register + /// created by moving \p Reg. + /// + /// Note: This uses emitCopy right now. + Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC, + MachineIRBuilder &MIB) const; + ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, @@ -1195,10 +1196,10 @@ MachineInstr *AArch64InstructionSelector::emitTestBit( // TBNZW work. bool UseWReg = Bit < 32; unsigned NecessarySize = UseWReg ? 
32 : 64; - if (Size < NecessarySize) - TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB); - else if (Size > NecessarySize) - TestReg = narrowExtendRegIfNeeded(TestReg, MIB); + if (Size != NecessarySize) + TestReg = moveScalarRegClass( + TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass, + MIB); static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX}, {AArch64::TBZW, AArch64::TBNZW}}; @@ -4984,7 +4985,7 @@ AArch64InstructionSelector::selectExtendedSHL( // Need a 32-bit wide register here. MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); - OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB); + OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB); } // We can use the LHS of the GEP as the base, and the LHS of the shift as an @@ -5156,8 +5157,8 @@ AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, // Need a 32-bit wide register. MachineIRBuilder MIB(*PtrAdd); - Register ExtReg = - narrowExtendRegIfNeeded(OffsetInst->getOperand(1).getReg(), MIB); + Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(), + AArch64::GPR32RegClass, MIB); unsigned SignExtend = Ext == AArch64_AM::SXTW; // Base is LHS, offset is ExtReg. @@ -5431,67 +5432,21 @@ AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( } } -Register AArch64InstructionSelector::narrowExtendRegIfNeeded( - Register ExtReg, MachineIRBuilder &MIB) const { +Register AArch64InstructionSelector::moveScalarRegClass( + Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const { MachineRegisterInfo &MRI = *MIB.getMRI(); - if (MRI.getType(ExtReg).getSizeInBits() == 32) - return ExtReg; - - // Insert a copy to move ExtReg to GPR32. 
- Register NarrowReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - auto Copy = MIB.buildCopy({NarrowReg}, {ExtReg}); + auto Ty = MRI.getType(Reg); + assert(!Ty.isVector() && "Expected scalars only!"); + if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC)) + return Reg; - // Select the copy into a subregister copy. + // Create a copy and immediately select it. + // FIXME: We should have an emitCopy function? + auto Copy = MIB.buildCopy({&RC}, {Reg}); selectCopy(*Copy, TII, MRI, TRI, RBI); return Copy.getReg(0); } -Register AArch64InstructionSelector::widenGPRBankRegIfNeeded( - Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const { - assert(WideSize >= 8 && "WideSize is smaller than all possible registers?"); - MachineRegisterInfo &MRI = *MIB.getMRI(); - unsigned NarrowSize = MRI.getType(Reg).getSizeInBits(); - assert(WideSize >= NarrowSize && - "WideSize cannot be smaller than NarrowSize!"); - - // If the sizes match, just return the register. - // - // If NarrowSize is an s1, then we can select it to any size, so we'll treat - // it as a don't care. - if (NarrowSize == WideSize || NarrowSize == 1) - return Reg; - - // Now check the register classes. - const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); - const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize); - const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize); - assert(OrigRC && "Could not determine narrow RC?"); - assert(WideRC && "Could not determine wide RC?"); - - // If the sizes differ, but the register classes are the same, there is no - // need to insert a SUBREG_TO_REG. - // - // For example, an s8 that's supposed to be a GPR will be selected to either - // a GPR32 or a GPR64 register. Note that this assumes that the s8 will - // always end up on a GPR32. - if (OrigRC == WideRC) - return Reg; - - // We have two different register classes. Insert a SUBREG_TO_REG. 
- unsigned SubReg = 0; - getSubRegForClass(OrigRC, TRI, SubReg); - assert(SubReg && "Couldn't determine subregister?"); - - // Build the SUBREG_TO_REG and return the new, widened register. - auto SubRegToReg = - MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {}) - .addImm(0) - .addUse(Reg) - .addImm(SubReg); - constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI); - return SubRegToReg.getReg(0); -} - /// Select an "extended register" operand. This operand folds in an extend /// followed by an optional left shift. InstructionSelector::ComplexRendererFns @@ -5552,7 +5507,7 @@ AArch64InstructionSelector::selectArithExtendedRegister( // We require a GPR32 here. Narrow the ExtReg if needed using a subregister // copy. MachineIRBuilder MIB(*RootDef); - ExtReg = narrowExtendRegIfNeeded(ExtReg, MIB); + ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB); return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, [=](MachineInstrBuilder &MIB) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir index 977bb5a64cf5..9962bd87c175 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir @@ -78,8 +78,9 @@ body: | ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) ; CHECK: liveins: $h0 ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, $h0, %subreg.hsub - ; CHECK: %copy:gpr32 = COPY [[SUBREG_TO_REG]] - ; CHECK: TBNZW %copy, 3, %bb.1 + ; CHECK: %copy:gpr32all = COPY [[SUBREG_TO_REG]] + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %copy + ; CHECK: TBNZW [[COPY]], 3, %bb.1 ; CHECK: B %bb.0 ; CHECK: bb.1: ; CHECK: RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/subreg-copy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/subreg-copy.mir index efb999909ccc..d5902d70842b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/subreg-copy.mir +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/subreg-copy.mir @@ -34,3 +34,35 @@ body: | bb.1: RET_ReallyLR ... +--- +name: no_trunc +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: no_trunc + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY $x0 + ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 :: (load 16) + ; CHECK: [[COPY1:%[0-9]+]]:gpr64all = COPY [[LDRQui]].dsub + ; CHECK: [[COPY2:%[0-9]+]]:gpr64 = COPY [[COPY1]] + ; CHECK: TBNZX [[COPY2]], 33, %bb.1 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + liveins: $x0 + %1:gpr(p0) = COPY $x0 + %3:gpr(s64) = G_CONSTANT i64 8589934592 + %5:gpr(s64) = G_CONSTANT i64 0 + %0:fpr(s128) = G_LOAD %1:gpr(p0) :: (load 16) + %2:fpr(s64) = G_TRUNC %0:fpr(s128) + %8:gpr(s64) = COPY %2:fpr(s64) + %4:gpr(s64) = G_AND %8:gpr, %3:gpr + %7:gpr(s32) = G_ICMP intpred(ne), %4:gpr(s64), %5:gpr + %6:gpr(s1) = G_TRUNC %7:gpr(s32) + G_BRCOND %6:gpr(s1), %bb.1 + bb.1: + RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir index 22963c50a2eb..7db671b08224 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir @@ -106,8 +106,9 @@ body: | ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) ; CHECK: liveins: $w0 ; CHECK: %reg:gpr32all = COPY $w0 - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32 - ; CHECK: TBZX [[SUBREG_TO_REG]], 33, %bb.1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, %reg, %subreg.sub_32 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY [[SUBREG_TO_REG]] + ; CHECK: TBZX [[COPY]], 33, %bb.1 ; CHECK: B %bb.0 ; CHECK: bb.1: ; CHECK: RET_ReallyLR @@ -140,8 +141,9 @@ body: | ; CHECK: bb.0: ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) ; CHECK: 
%reg:gpr32 = IMPLICIT_DEF - ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32 - ; CHECK: TBZX [[SUBREG_TO_REG]], 33, %bb.1 + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64all = SUBREG_TO_REG 0, %reg, %subreg.sub_32 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY [[SUBREG_TO_REG]] + ; CHECK: TBZX [[COPY]], 33, %bb.1 ; CHECK: B %bb.0 ; CHECK: bb.1: ; CHECK: RET_ReallyLR From f5f8d86dc4c91ef492b919edf98335d4d09188a8 Mon Sep 17 00:00:00 2001 From: James Henderson Date: Mon, 10 Aug 2020 13:36:44 +0100 Subject: [PATCH 353/363] Don't error for zero-length arange entries Although the DWARF specification states that .debug_aranges entries can't have length zero, these can occur in the wild. There's no particular reason to enforce this part of the spec, since functionally they have no impact. The patch removes the error and introduces a new warning for premature terminator entries which does not stop parsing. This is a relanding of cb3a598c87db, adding the missing obj2yaml part that was needed. Fixes https://bugs.llvm.org/show_bug.cgi?id=46805. See also https://reviews.llvm.org/D71932 which originally introduced the error. 
Reviewed by: ikudrin, dblaikie, Higuoxing Differential Revision: https://reviews.llvm.org/D85313 --- .../DebugInfo/DWARF/DWARFDebugArangeSet.cpp | 13 ++-- .../DWARF/DWARFDebugArangeSetTest.cpp | 65 +++++++++++++++++-- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp index 608fc0388af0..c3b039b05f30 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp @@ -132,19 +132,20 @@ Error DWARFDebugArangeSet::extract(DWARFDataExtractor data, uint64_t end_offset = Offset + full_length; while (*offset_ptr < end_offset) { + uint64_t EntryOffset = *offset_ptr; arangeDescriptor.Address = data.getUnsigned(offset_ptr, HeaderData.AddrSize); arangeDescriptor.Length = data.getUnsigned(offset_ptr, HeaderData.AddrSize); - if (arangeDescriptor.Length == 0) { - // Each set of tuples is terminated by a 0 for the address and 0 - // for the length. - if (arangeDescriptor.Address == 0 && *offset_ptr == end_offset) + // Each set of tuples is terminated by a 0 for the address and 0 + // for the length. 
+ if (arangeDescriptor.Length == 0 && arangeDescriptor.Address == 0) { + if (*offset_ptr == end_offset) return ErrorSuccess(); return createStringError( errc::invalid_argument, "address range table at offset 0x%" PRIx64 - " has an invalid tuple (length = 0) at offset 0x%" PRIx64, - Offset, *offset_ptr - tuple_size); + " has a premature terminator entry at offset 0x%" PRIx64, + Offset, EntryOffset); } ArangeDescriptors.push_back(arangeDescriptor); diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugArangeSetTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugArangeSetTest.cpp index 4ec9c5d1c0be..7f16aa9ce4b7 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugArangeSetTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugArangeSetTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" using namespace llvm; @@ -166,9 +167,9 @@ TEST(DWARFDebugArangeSet, UnevenLength) { "of the tuple size"); } -TEST(DWARFDebugArangeSet, ZeroLengthEntry) { +TEST(DWARFDebugArangeSet, ZeroAddressEntry) { static const char DebugArangesSecRaw[] = - "\x24\x00\x00\x00" // Length + "\x1c\x00\x00\x00" // Length "\x02\x00" // Version "\x00\x00\x00\x00" // Debug Info Offset "\x04" // Address Size @@ -176,14 +177,68 @@ TEST(DWARFDebugArangeSet, ZeroLengthEntry) { "\x00\x00\x00\x00" // Padding "\x00\x00\x00\x00" // Entry1: Address "\x01\x00\x00\x00" // Length + "\x00\x00\x00\x00" // Termination tuple + "\x00\x00\x00\x00"; + DWARFDataExtractor Extractor( + StringRef(DebugArangesSecRaw, sizeof(DebugArangesSecRaw) - 1), + /*IsLittleEndian=*/true, + /*AddressSize=*/4); + DWARFDebugArangeSet Set; + uint64_t Offset = 0; + ASSERT_THAT_ERROR(Set.extract(Extractor, &Offset), + Succeeded()); + auto Range = Set.descriptors(); + auto Iter = Range.begin(); + ASSERT_EQ(std::distance(Iter, Range.end()), 1u); + EXPECT_EQ(Iter->Address, 0u); 
+ EXPECT_EQ(Iter->Length, 1u); +} + +TEST(DWARFDebugArangeSet, ZeroLengthEntry) { + static const char DebugArangesSecRaw[] = + "\x1c\x00\x00\x00" // Length + "\x02\x00" // Version + "\x00\x00\x00\x00" // Debug Info Offset + "\x04" // Address Size + "\x00" // Segment Selector Size + "\x00\x00\x00\x00" // Padding + "\x01\x00\x00\x00" // Entry1: Address + "\x00\x00\x00\x00" // Length + "\x00\x00\x00\x00" // Termination tuple + "\x00\x00\x00\x00"; + DWARFDataExtractor Extractor( + StringRef(DebugArangesSecRaw, sizeof(DebugArangesSecRaw) - 1), + /*IsLittleEndian=*/true, + /*AddressSize=*/4); + DWARFDebugArangeSet Set; + uint64_t Offset = 0; + ASSERT_THAT_ERROR(Set.extract(Extractor, &Offset), + Succeeded()); + auto Range = Set.descriptors(); + auto Iter = Range.begin(); + ASSERT_EQ(std::distance(Iter, Range.end()), 1u); + EXPECT_EQ(Iter->Address, 1u); + EXPECT_EQ(Iter->Length, 0u); +} + +TEST(DWARFDebugArangesSet, PrematureTerminator) { + static const char DebugArangesSecRaw[] = + "\x24\x00\x00\x00" // Length + "\x02\x00" // Version + "\x00\x00\x00\x00" // Debug Info Offset + "\x04" // Address Size + "\x00" // Segment Selector Size + "\x00\x00\x00\x00" // Padding + "\x00\x00\x00\x00" // Entry1: Premature + "\x00\x00\x00\x00" // terminator "\x01\x00\x00\x00" // Entry2: Address - "\x00\x00\x00\x00" // Length (invalid) + "\x01\x00\x00\x00" // Length "\x00\x00\x00\x00" // Termination tuple "\x00\x00\x00\x00"; ExpectExtractError( DebugArangesSecRaw, - "address range table at offset 0x0 has an invalid tuple (length = 0) " - "at offset 0x18"); + "address range table at offset 0x0 has a premature " + "terminator entry at offset 0x10"); } } // end anonymous namespace From d104e582838fd73d6ef565788f11617eccab87e2 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Sat, 5 Dec 2020 12:52:38 +0100 Subject: [PATCH 354/363] [CMake] Avoid __FakeVCSRevision.h with no git repository Set the return variable to "" in find_first_existing_vc_file to say that there is a repository, but no 
file to depend on. This works transparently for all other callers that handle undefinedness and equality to an empty string the same way. Use the knowledge to avoid depending on __FakeVCSRevision.h if there is no git repository at all (for example when building a release) as there is no point in regenerating an empty VCSRevision.h. Differential Revision: https://reviews.llvm.org/D92718 (cherry picked from commit 6e890ec7beb0874464a0af9f84e41a987f968b23) --- llvm/cmake/modules/AddLLVM.cmake | 8 ++++++++ llvm/include/llvm/Support/CMakeLists.txt | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 333167bfb6b0..b74adc11ade9 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -2102,6 +2102,13 @@ function(setup_dependency_debugging name) set_target_properties(${name} PROPERTIES RULE_LAUNCH_COMPILE ${sandbox_command}) endfunction() +# If the sources at the given `path` are under version control, set `out_var` +# to the path of a file which will be modified when the VCS revision +# changes, attempting to create that file if it does not exist; if no such +# file exists and one cannot be created, instead set `out_var` to the +# empty string. +# +# If the sources are not under version control, do not define `out_var`. function(find_first_existing_vc_file path out_var) if(NOT EXISTS "${path}") return() @@ -2123,6 +2130,7 @@ function(find_first_existing_vc_file path out_var) RESULT_VARIABLE touch_head_result ERROR_QUIET) if (NOT touch_head_result EQUAL 0) + set(${out_var} "" PARENT_SCOPE) return() endif() endif() diff --git a/llvm/include/llvm/Support/CMakeLists.txt b/llvm/include/llvm/Support/CMakeLists.txt index da8a4da443ed..69f6a1582ce9 100644 --- a/llvm/include/llvm/Support/CMakeLists.txt +++ b/llvm/include/llvm/Support/CMakeLists.txt @@ -11,7 +11,7 @@ if(LLVM_APPEND_VC_REV) # A fake version file and is not expected to exist. 
It is being used to # force regeneration of VCSRevision.h for source directory with no write # permission available. - if (NOT llvm_vc) + if (llvm_vc STREQUAL "") set(fake_version_inc "${CMAKE_CURRENT_BINARY_DIR}/__FakeVCSRevision.h") endif() endif() From 9caca7241d447266a23a99ea0536f30faaf19694 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Sun, 27 Sep 2020 01:22:55 -0700 Subject: [PATCH 355/363] [AArch64][GlobalISel] Use the look-through constant helper for the shift s32->s64 custom legalization. Almost NFC, except it catches more cases and gives a 0.1% CTMark -O0 size win. --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 10 ++++------ .../AArch64/GlobalISel/legalize-unmerge-values.mir | 5 +++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 2eaec0b970fa..3dcc244a08fa 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -710,16 +710,14 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr( // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the // imported patterns can select it later. Either way, it will be legal. Register AmtReg = MI.getOperand(2).getReg(); - auto *CstMI = MRI.getVRegDef(AmtReg); - assert(CstMI && "expected to find a vreg def"); - if (CstMI->getOpcode() != TargetOpcode::G_CONSTANT) + auto VRegAndVal = getConstantVRegValWithLookThrough(AmtReg, MRI); + if (!VRegAndVal) return true; // Check the shift amount is in range for an immediate form. - unsigned Amount = CstMI->getOperand(1).getCImm()->getZExtValue(); + int64_t Amount = VRegAndVal->Value; if (Amount > 31) return true; // This will have to remain a register variant. 
- assert(MRI.getType(AmtReg).getSizeInBits() == 32); - auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg); + auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount); MI.getOperand(2).setReg(ExtCst.getReg(0)); return true; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir index 56c5b8a8f1e2..9c1f6fc6f41b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir @@ -24,9 +24,10 @@ body: | ; CHECK-LABEL: name: test_unmerge_s4 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[COPY]](s32) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 4 ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UV]](s8) - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ZEXT]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ZEXT]], [[C1]](s64) ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV]](s8) ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) ; CHECK: $x0 = COPY [[ANYEXT]](s64) From 43ff75f2c3feef64f9d73328230d34dac8832a91 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Sun, 27 Sep 2020 01:45:09 -0700 Subject: [PATCH 356/363] [AArch64][GlobalISel] Promote scalar G_SHL constant shift amounts to s64. This was supposed to be done in the first place as is currently the case for G_ASHR and G_LSHR but was forgotten when the original shift legalization overhaul was done last year. This was exposed because we started falling back on s32 = s32, s64 SHLs due to a recent combiner change. Gives a very minor (0.1%) code size -O0 improvement on consumer-typeset. 
--- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 28 +++++++++++++------ .../GlobalISel/legalize-merge-values.mir | 5 ++-- .../legalize-non-pow2-load-store.mir | 7 ++--- .../AArch64/GlobalISel/legalize-shift.mir | 4 +-- llvm/test/CodeGen/AArch64/arm64-clrsb.ll | 4 +-- 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 3dcc244a08fa..4ffde2a7e3c4 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -97,15 +97,25 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .moreElementsToNextPow2(0); getActionDefinitionsBuilder(G_SHL) - .legalFor({{s32, s32}, {s64, s64}, - {v2s32, v2s32}, {v4s32, v4s32}, {v2s64, v2s64}}) - .clampScalar(1, s32, s64) - .clampScalar(0, s32, s64) - .widenScalarToNextPow2(0) - .clampNumElements(0, v2s32, v4s32) - .clampNumElements(0, v2s64, v2s64) - .moreElementsToNextPow2(0) - .minScalarSameAs(1, 0); + .customIf([=](const LegalityQuery &Query) { + const auto &SrcTy = Query.Types[0]; + const auto &AmtTy = Query.Types[1]; + return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 && + AmtTy.getSizeInBits() == 32; + }) + .legalFor({{s32, s32}, + {s64, s64}, + {s32, s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v2s64, v2s64}}) + .clampScalar(1, s32, s64) + .clampScalar(0, s32, s64) + .widenScalarToNextPow2(0) + .clampNumElements(0, v2s32, v4s32) + .clampNumElements(0, v2s64, v2s64) + .moreElementsToNextPow2(0) + .minScalarSameAs(1, 0); getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, s64}, {v2p0, v2s64}}) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-merge-values.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-merge-values.mir index 09ae228b4f1d..a802baca4c8d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-merge-values.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-merge-values.mir @@ 
-6,11 +6,12 @@ name: test_merge_s4 body: | bb.0: ; CHECK-LABEL: name: test_merge_s4 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 4 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[C2]], [[C1]] - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C3]](s64) ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[C2]](s32) ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]] ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir index 7d7b77aa7535..6dc28e738dbc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-non-pow2-load-store.mir @@ -28,12 +28,11 @@ body: | ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load 1 from %ir.ptr + 2, align 4) - ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C2]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C2]](s64) ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[OR]](s32) - ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C3]](s64) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s64) ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s64) ; CHECK: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store 2 into %ir.ptr2, align 4) ; CHECK: G_STORE [[LSHR]](s32), 
[[PTR_ADD1]](p0) :: (store 1 into %ir.ptr2 + 2, align 4) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir index 944ac8110ce0..3c3f34ea6dd1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir @@ -235,8 +235,8 @@ body: | ; CHECK-LABEL: name: shl_cimm_32 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s64) ; CHECK: $w0 = COPY [[SHL]](s32) ; CHECK: RET_ReallyLR implicit $w0 %0:_(s32) = COPY $w0 diff --git a/llvm/test/CodeGen/AArch64/arm64-clrsb.ll b/llvm/test/CodeGen/AArch64/arm64-clrsb.ll index 64673f2e096b..149a466a1147 100644 --- a/llvm/test/CodeGen/AArch64/arm64-clrsb.ll +++ b/llvm/test/CodeGen/AArch64/arm64-clrsb.ll @@ -21,10 +21,8 @@ entry: ; CHECK-LABEL: clrsb32 ; CHECK: cls [[TEMP:w[0-9]+]], [[TEMP]] -; FIXME: We should produce the same result here to save some code size. After -; that, we can remove the GISEL special casing. 
; GISEL-LABEL: clrsb32 -; GISEL: clz +; GISEL: cls [[TEMP:w[0-9]+]], [[TEMP]] } ; Function Attrs: nounwind ssp From b6164d967e010ed54ed8ce0d4a3ea19f54e90108 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 12 Jan 2021 10:34:02 -0800 Subject: [PATCH 357/363] Bump version to 11.1.0 --- libcxx/CMakeLists.txt | 2 +- libcxxabi/CMakeLists.txt | 2 +- libunwind/CMakeLists.txt | 2 +- llvm/CMakeLists.txt | 4 ++-- llvm/utils/gn/secondary/llvm/version.gni | 4 ++-- llvm/utils/release/build_llvm_package.bat | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index f145831c75d8..910d04b54b6d 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -32,7 +32,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXX_STANDALONE_BUIL project(libcxx CXX C) set(PACKAGE_NAME libcxx) - set(PACKAGE_VERSION 11.0.1) + set(PACKAGE_VERSION 11.1.0) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index deff3d5e4ad1..36c6b2249e2b 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -25,7 +25,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBCXXABI_STANDALONE_B project(libcxxabi CXX C) set(PACKAGE_NAME libcxxabi) - set(PACKAGE_VERSION 11.0.1) + set(PACKAGE_VERSION 11.1.0) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index cdac67e93df1..e44a103648f9 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -83,7 +83,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR LIBUNWIND_STANDALONE_B endif() set(PACKAGE_NAME libunwind) - set(PACKAGE_VERSION 11.0.1) + set(PACKAGE_VERSION 11.1.0) set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}") set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org") diff --git 
a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index b8dabbbca05a..247ad36d3845 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -27,10 +27,10 @@ if(NOT DEFINED LLVM_VERSION_MAJOR) set(LLVM_VERSION_MAJOR 11) endif() if(NOT DEFINED LLVM_VERSION_MINOR) - set(LLVM_VERSION_MINOR 0) + set(LLVM_VERSION_MINOR 1) endif() if(NOT DEFINED LLVM_VERSION_PATCH) - set(LLVM_VERSION_PATCH 1) + set(LLVM_VERSION_PATCH 0) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) set(LLVM_VERSION_SUFFIX "") diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index e2b6390b66cc..ebc66a5138e7 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ llvm_version_major = 11 -llvm_version_minor = 0 -llvm_version_patch = 1 +llvm_version_minor = 1 +llvm_version_patch = 0 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/release/build_llvm_package.bat b/llvm/utils/release/build_llvm_package.bat index 31e237c63565..73362920773b 100755 --- a/llvm/utils/release/build_llvm_package.bat +++ b/llvm/utils/release/build_llvm_package.bat @@ -27,8 +27,8 @@ set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python36 for /f "usebackq" %%i in (`PowerShell ^(Get-Date^).ToString^('yyyyMMdd'^)`) do set datestamp=%%i set revision=%1 -set package_version=11.0.1-%revision:~0,8% -set clang_format_vs_version=11.0.1.%datestamp% +set package_version=11.1.0-%revision:~0,8% +set clang_format_vs_version=11.1.0.%datestamp% set build_dir=llvm_package_%revision:~0,8% echo Revision: %revision% From 9bbcb554cdbf1a7b85e9a72169e4037cf4736a10 Mon Sep 17 00:00:00 2001 From: Marco Antognini Date: Thu, 29 Oct 2020 10:30:11 +0000 Subject: [PATCH 358/363] Address ABI issues introduced with CXCursor_CXXAddrspaceCastExpr Revert values in CXCursorKind as they were before CXCursor_CXXAddrspaceCastExpr was introduced in a6a237f2046a ([OpenCL] Added addrspace_cast operator 
in C++ mode., 2020-05-18). Insert CXCursor_CXXAddrspaceCastExpr after the last expression in CXCursorKind using the next available value. Reviewed By: akyrtzi, svenvh Differential Revision: https://reviews.llvm.org/D90385 (cherry picked from commit bbdbd020d2c2f315ed1545b23c23ec6ff1abc022) --- clang/include/clang-c/Index.h | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 5fa728d6d66c..9f5a727c84bb 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -33,7 +33,7 @@ * compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable. */ #define CINDEX_VERSION_MAJOR 0 -#define CINDEX_VERSION_MINOR 60 +#define CINDEX_VERSION_MINOR 61 #define CINDEX_VERSION_ENCODE(major, minor) (((major)*10000) + ((minor)*1)) @@ -2052,62 +2052,58 @@ enum CXCursorKind { */ CXCursor_CXXFunctionalCastExpr = 128, - /** OpenCL's addrspace_cast<> expression. - */ - CXCursor_CXXAddrspaceCastExpr = 129, - /** A C++ typeid expression (C++ [expr.typeid]). */ - CXCursor_CXXTypeidExpr = 130, + CXCursor_CXXTypeidExpr = 129, /** [C++ 2.13.5] C++ Boolean Literal. */ - CXCursor_CXXBoolLiteralExpr = 131, + CXCursor_CXXBoolLiteralExpr = 130, /** [C++0x 2.14.7] C++ Pointer Literal. */ - CXCursor_CXXNullPtrLiteralExpr = 132, + CXCursor_CXXNullPtrLiteralExpr = 131, /** Represents the "this" expression in C++ */ - CXCursor_CXXThisExpr = 133, + CXCursor_CXXThisExpr = 132, /** [C++ 15] C++ Throw Expression. * * This handles 'throw' and 'throw' assignment-expression. When * assignment-expression isn't present, Op will be null. */ - CXCursor_CXXThrowExpr = 134, + CXCursor_CXXThrowExpr = 133, /** A new expression for memory allocation and constructor calls, e.g: * "new CXXNewExpr(foo)". */ - CXCursor_CXXNewExpr = 135, + CXCursor_CXXNewExpr = 134, /** A delete expression for memory deallocation and destructor calls, * e.g. "delete[] pArray". 
*/ - CXCursor_CXXDeleteExpr = 136, + CXCursor_CXXDeleteExpr = 135, /** A unary expression. (noexcept, sizeof, or other traits) */ - CXCursor_UnaryExpr = 137, + CXCursor_UnaryExpr = 136, /** An Objective-C string literal i.e. @"foo". */ - CXCursor_ObjCStringLiteral = 138, + CXCursor_ObjCStringLiteral = 137, /** An Objective-C \@encode expression. */ - CXCursor_ObjCEncodeExpr = 139, + CXCursor_ObjCEncodeExpr = 138, /** An Objective-C \@selector expression. */ - CXCursor_ObjCSelectorExpr = 140, + CXCursor_ObjCSelectorExpr = 139, /** An Objective-C \@protocol expression. */ - CXCursor_ObjCProtocolExpr = 141, + CXCursor_ObjCProtocolExpr = 140, /** An Objective-C "bridged" cast expression, which casts between * Objective-C pointers and C pointers, transferring ownership in the process. @@ -2116,7 +2112,7 @@ enum CXCursorKind { * NSString *str = (__bridge_transfer NSString *)CFCreateString(); * \endcode */ - CXCursor_ObjCBridgedCastExpr = 142, + CXCursor_ObjCBridgedCastExpr = 141, /** Represents a C++0x pack expansion that produces a sequence of * expressions. @@ -2131,7 +2127,7 @@ enum CXCursorKind { * } * \endcode */ - CXCursor_PackExpansionExpr = 143, + CXCursor_PackExpansionExpr = 142, /** Represents an expression that computes the length of a parameter * pack. @@ -2143,7 +2139,7 @@ enum CXCursorKind { * }; * \endcode */ - CXCursor_SizeOfPackExpr = 144, + CXCursor_SizeOfPackExpr = 143, /* Represents a C++ lambda expression that produces a local function * object. @@ -2157,39 +2153,43 @@ enum CXCursorKind { * } * \endcode */ - CXCursor_LambdaExpr = 145, + CXCursor_LambdaExpr = 144, /** Objective-c Boolean Literal. */ - CXCursor_ObjCBoolLiteralExpr = 146, + CXCursor_ObjCBoolLiteralExpr = 145, /** Represents the "self" expression in an Objective-C method. */ - CXCursor_ObjCSelfExpr = 147, + CXCursor_ObjCSelfExpr = 146, /** OpenMP 5.0 [2.1.5, Array Section]. */ - CXCursor_OMPArraySectionExpr = 148, + CXCursor_OMPArraySectionExpr = 147, /** Represents an @available(...) 
check. */ - CXCursor_ObjCAvailabilityCheckExpr = 149, + CXCursor_ObjCAvailabilityCheckExpr = 148, /** * Fixed point literal */ - CXCursor_FixedPointLiteral = 150, + CXCursor_FixedPointLiteral = 149, /** OpenMP 5.0 [2.1.4, Array Shaping]. */ - CXCursor_OMPArrayShapingExpr = 151, + CXCursor_OMPArrayShapingExpr = 150, /** * OpenMP 5.0 [2.1.6 Iterators] */ - CXCursor_OMPIteratorExpr = 152, + CXCursor_OMPIteratorExpr = 151, + + /** OpenCL's addrspace_cast<> expression. + */ + CXCursor_CXXAddrspaceCastExpr = 152, - CXCursor_LastExpr = CXCursor_OMPIteratorExpr, + CXCursor_LastExpr = CXCursor_CXXAddrspaceCastExpr, /* Statements */ CXCursor_FirstStmt = 200, From 3a8282376b6c2bb65a3bb580c10d4da1296d8df1 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 21 Jan 2021 11:35:48 -0800 Subject: [PATCH 359/363] Add minor version to libclang.so and libclang-cpp.so SONAME This patch is for the release/11.x branch. We need to bump the SONAME, because the ABI of the shared library is changing Reviewed By: sylvestre.ledru, cuviper Differential Revision: https://reviews.llvm.org/D94941 --- clang/CMakeLists.txt | 2 +- clang/tools/clang-shlib/CMakeLists.txt | 5 +++++ clang/tools/libclang/CMakeLists.txt | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 2e06c5fd9028..bb4b801f01c8 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -509,7 +509,7 @@ set(CLANG_EXECUTABLE_VERSION "${CLANG_VERSION_MAJOR}" CACHE STRING "Major version number that will be appended to the clang executable name") set(LIBCLANG_LIBRARY_VERSION - "${CLANG_VERSION_MAJOR}" CACHE STRING + "${CLANG_VERSION_MAJOR}.${CLANG_VERSION_MINOR}" CACHE STRING "Major version number that will be appended to the libclang library") mark_as_advanced(CLANG_EXECUTABLE_VERSION LIBCLANG_LIBRARY_VERSION) diff --git a/clang/tools/clang-shlib/CMakeLists.txt b/clang/tools/clang-shlib/CMakeLists.txt index 5949223fc8e3..47ff80418bb0 100644 --- 
a/clang/tools/clang-shlib/CMakeLists.txt +++ b/clang/tools/clang-shlib/CMakeLists.txt @@ -48,3 +48,8 @@ add_clang_library(clang-cpp ${_OBJECTS} LINK_LIBS ${_DEPS}) + + set_target_properties(clang-cpp + PROPERTIES + VERSION ${LIBCLANG_LIBRARY_VERSION} + SOVERSION ${LIBCLANG_LIBRARY_VERSION}) diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt index a4077140acee..5cd9ac5cddc1 100644 --- a/clang/tools/libclang/CMakeLists.txt +++ b/clang/tools/libclang/CMakeLists.txt @@ -150,6 +150,7 @@ if(ENABLE_SHARED) else() set_target_properties(libclang PROPERTIES + SOVERSION ${LIBCLANG_LIBRARY_VERSION} VERSION ${LIBCLANG_LIBRARY_VERSION} DEFINE_SYMBOL _CINDEX_LIB_) # FIXME: _CINDEX_LIB_ affects dllexport/dllimport on Win32. From e18e509bdb45cd63c949c415df94358acf118e0f Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 1 Feb 2021 13:05:19 +0000 Subject: [PATCH 360/363] workflows: Add job to check for ABI changes in libclang.so and libclang-cpp.so --- .github/workflows/libclang-abi-tests.yml | 132 +++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 .github/workflows/libclang-abi-tests.yml diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml new file mode 100644 index 000000000000..5681c7c8166e --- /dev/null +++ b/.github/workflows/libclang-abi-tests.yml @@ -0,0 +1,132 @@ +name: libclang ABI Tests + +on: + push: + branches: + - 'release/**' + paths: + - 'clang/**' + - '.github/workflows/libclang-abi-tests.yml' + pull_request: + paths: + - 'clang/**' + - '.github/workflows/libclang-abi-tests.yml' + +jobs: + abi-dump-setup: + runs-on: ubuntu-latest + outputs: + BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }} + ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }} + ABI_LIBS: ${{ steps.vars.outputs.ABI_LIBS }} + BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }} + LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} + LLVM_VERSION_MINOR: 
${{ steps.version.outputs.LLVM_VERSION_MINOR }} + LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }} + steps: + - name: Checkout source + uses: actions/checkout@v1 + with: + fetch-depth: 1 + + - name: Get LLVM version + id: version + uses: llvm/actions/get-llvm-version@main + + - name: Setup Variables + id: vars + run: | + if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 -o ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then + echo ::set-output name=BASELINE_VERSION_MAJOR::$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1)) + echo ::set-output name=ABI_HEADERS::clang-c + echo ::set-output name=ABI_LIBS::libclang.so + else + echo ::set-output name=BASELINE_VERSION_MAJOR::${{ steps.version.outputs.LLVM_VERSION_MAJOR }} + echo ::set-output name=ABI_HEADERS::. + echo ::set-output name=ABI_LIBS::libclang.so libclang-cpp.so + fi + + abi-dump: + needs: abi-dump-setup + runs-on: ubuntu-latest + strategy: + matrix: + name: + - build-baseline + - build-latest + include: + - name: build-baseline + llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }} + ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.0.0 + repo: llvm/llvm-project + - name: build-latest + llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }} + ref: ${{ github.sha }} + repo: ${{ github.repository }} + steps: + - name: Install Ninja + uses: llvm/actions/install-ninja@main + - name: Install abi-compliance-checker + run: | + sudo apt-get install abi-dumper autoconf pkg-config + - name: Install universal-ctags + run: | + git clone https://github.com/universal-ctags/ctags.git + cd ctags + ./autogen.sh + ./configure + sudo make install + - name: Download source code + uses: llvm/actions/get-llvm-project-src@main + with: + ref: ${{ matrix.ref }} + repo: ${{ matrix.repo }} + - name: Configure + run: | + mkdir install + cmake -B build -S llvm -G Ninja -DLLVM_ENABLE_PROJECTS=clang -DCMAKE_BUILD_TYPE=Debug 
-DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g -Og" -DCMAKE_INSTALL_PREFIX=`pwd`/install llvm + - name: Build + run: ninja -C build/ ${{ needs.abi-dump-setup.outputs.ABI_LIBS }} install-clang-headers + - name: Dump ABI + run: | + parallel abi-dumper -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o {}-${{ matrix.ref }}.abi ./build/lib/{} ::: ${{ needs.abi-dump-setup.outputs.ABI_LIBS }} + for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do + # Remove symbol versioning from dumps, so we can compare across major versions. + sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi + tar -czf $lib-${{ matrix.ref }}.abi.tar.gz $lib-${{ matrix.ref }}.abi + done + - name: Upload ABI file + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.name }} + path: "*${{ matrix.ref }}.abi.tar.gz" + + abi-compare: + runs-on: ubuntu-latest + needs: + - abi-dump-setup + - abi-dump + steps: + - name: Download baseline + uses: actions/download-artifact@v1 + with: + name: build-baseline + - name: Download latest + uses: actions/download-artifact@v1 + with: + name: build-latest + + - name: Install abi-compliance-checker + run: sudo apt-get install abi-compliance-checker + - name: Compare ABI + run: | + for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do + abi-compliance-checker -lib $lib -old build-baseline/$lib*.abi.tar.gz -new build-latest/$lib*.abi.tar.gz + done + - name: Upload ABI Comparison + if: always() + uses: actions/upload-artifact@v2 + with: + name: compat-report-${{ github.sha }} + path: compat_reports/ + From 17e842f3785fe5f585f9adc0f6cacc431d327eee Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 3 Feb 2021 09:49:21 -0800 Subject: [PATCH 361/363] workflows: Update action branch names --- .github/workflows/clang-tests.yml | 6 +++--- 
.github/workflows/libclc-tests.yml | 6 +++--- .github/workflows/lld-tests.yml | 6 +++--- .github/workflows/lldb-tests.yml | 6 +++--- .github/workflows/llvm-tests.yml | 10 +++++----- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/clang-tests.yml b/.github/workflows/clang-tests.yml index f8ca65e10726..af0b5eabeeda 100644 --- a/.github/workflows/clang-tests.yml +++ b/.github/workflows/clang-tests.yml @@ -28,16 +28,16 @@ jobs: steps: - name: Setup Windows if: startsWith(matrix.os, 'windows') - uses: llvm/actions/setup-windows@master + uses: llvm/actions/setup-windows@main with: arch: amd64 - name: Install Ninja - uses: llvm/actions/install-ninja@master + uses: llvm/actions/install-ninja@main - uses: actions/checkout@v1 with: fetch-depth: 1 - name: Test clang - uses: llvm/actions/build-test-llvm-project@master + uses: llvm/actions/build-test-llvm-project@main with: cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release build_target: check-clang diff --git a/.github/workflows/libclc-tests.yml b/.github/workflows/libclc-tests.yml index 4e8639b1c89a..2f1eb2939ea2 100644 --- a/.github/workflows/libclc-tests.yml +++ b/.github/workflows/libclc-tests.yml @@ -31,16 +31,16 @@ jobs: steps: - name: Setup Windows if: startsWith(matrix.os, 'windows') - uses: llvm/actions/setup-windows@master + uses: llvm/actions/setup-windows@main with: arch: amd64 - name: Install Ninja - uses: llvm/actions/install-ninja@master + uses: llvm/actions/install-ninja@main - uses: actions/checkout@v1 with: fetch-depth: 1 - name: Build clang - uses: llvm/actions/build-test-llvm-project@master + uses: llvm/actions/build-test-llvm-project@main with: cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="clang" -DCMAKE_BUILD_TYPE=Release build_target: "" diff --git a/.github/workflows/lld-tests.yml b/.github/workflows/lld-tests.yml index 9b4cbe95f231..bdf0c2fcd886 100644 --- a/.github/workflows/lld-tests.yml +++ b/.github/workflows/lld-tests.yml @@ -28,16 
+28,16 @@ jobs: steps: - name: Setup Windows if: startsWith(matrix.os, 'windows') - uses: llvm/actions/setup-windows@master + uses: llvm/actions/setup-windows@main with: arch: amd64 - name: Install Ninja - uses: llvm/actions/install-ninja@master + uses: llvm/actions/install-ninja@main - uses: actions/checkout@v1 with: fetch-depth: 1 - name: Test lld - uses: llvm/actions/build-test-llvm-project@master + uses: llvm/actions/build-test-llvm-project@main with: cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="lld" -DCMAKE_BUILD_TYPE=Release build_target: check-lld diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml index 229e6deece6e..93fddc2de8c6 100644 --- a/.github/workflows/lldb-tests.yml +++ b/.github/workflows/lldb-tests.yml @@ -31,16 +31,16 @@ jobs: steps: - name: Setup Windows if: startsWith(matrix.os, 'windows') - uses: llvm/actions/setup-windows@master + uses: llvm/actions/setup-windows@main with: arch: amd64 - name: Install Ninja - uses: llvm/actions/install-ninja@master + uses: llvm/actions/install-ninja@main - uses: actions/checkout@v1 with: fetch-depth: 1 - name: Build lldb - uses: llvm/actions/build-test-llvm-project@master + uses: llvm/actions/build-test-llvm-project@main with: # Mac OS requries that libcxx is enabled for lldb tests, so we need to disable them. 
cmake_args: -G Ninja -DLLVM_ENABLE_PROJECTS="clang;lldb" -DCMAKE_BUILD_TYPE=Release -DLLDB_INCLUDE_TESTS=OFF diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml index baefbc08c102..44865e760125 100644 --- a/.github/workflows/llvm-tests.yml +++ b/.github/workflows/llvm-tests.yml @@ -29,16 +29,16 @@ jobs: steps: - name: Setup Windows if: startsWith(matrix.os, 'windows') - uses: llvm/actions/setup-windows@master + uses: llvm/actions/setup-windows@main with: arch: amd64 - name: Install Ninja - uses: llvm/actions/install-ninja@master + uses: llvm/actions/install-ninja@main - uses: actions/checkout@v1 with: fetch-depth: 1 - name: Test llvm - uses: llvm/actions/build-test-llvm-project@master + uses: llvm/actions/build-test-llvm-project@main with: cmake_args: -G Ninja -DCMAKE_BUILD_TYPE=Release @@ -60,7 +60,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@master + uses: llvm/actions/install-ninja@main - name: Install abi-compliance-checker run: | sudo apt-get install abi-dumper autoconf pkg-config @@ -72,7 +72,7 @@ jobs: ./configure sudo make install - name: Download source code - uses: llvm/actions/get-llvm-project-src@master + uses: llvm/actions/get-llvm-project-src@main with: ref: ${{ matrix.ref }} repo: ${{ matrix.repo }} From 85fcd465594700a72b9259bb5805386970fd8a38 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 2 Feb 2021 18:41:49 -0800 Subject: [PATCH 362/363] workflows: Re-enable lldb test on Mac OS X --- .github/workflows/lldb-tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lldb-tests.yml b/.github/workflows/lldb-tests.yml index 93fddc2de8c6..68aec6036995 100644 --- a/.github/workflows/lldb-tests.yml +++ b/.github/workflows/lldb-tests.yml @@ -20,14 +20,16 @@ jobs: build_lldb: name: lldb build runs-on: ${{ matrix.os }} + # Workaround for build faliure on Mac OS X: llvm.org/PR46190, 
https://github.com/actions/virtual-environments/issues/2274 + env: + CPLUS_INCLUDE_PATH: /usr/local/opt/llvm/include/c++/v1:/Library/Developer/CommandLineTools/SDKs/MacOSX10.15.sdk/usr/include strategy: fail-fast: false matrix: os: - ubuntu-latest - windows-latest - # macOS build disabled due to: llvm.org/PR46190 - #- macOS-latest + - macOS-latest steps: - name: Setup Windows if: startsWith(matrix.os, 'windows') From 1fdec59bffc11ae37eb51a1b9869f0696bfd5312 Mon Sep 17 00:00:00 2001 From: Andi-Bogdan Postelnicu Date: Wed, 3 Feb 2021 17:38:49 +0000 Subject: [PATCH 363/363] [lldb] Fix fallout caused by D89156 on 11.0.1 for MacOS Fix fallout caused by D89156 on 11.0.1 for MacOS Differential Revision: https://reviews.llvm.org/D95683 --- .../Platform/MacOSX/PlatformAppleTVSimulator.cpp | 8 ++++---- .../Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h | 2 +- .../Platform/MacOSX/PlatformAppleWatchSimulator.cpp | 8 ++++---- .../Platform/MacOSX/PlatformAppleWatchSimulator.h | 2 +- .../Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp | 2 +- .../Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp | 10 +++++----- .../Plugins/Platform/MacOSX/PlatformiOSSimulator.h | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp index 461624a2adaa..cecffacf69fd 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp @@ -282,7 +282,7 @@ Status PlatformAppleTVSimulator::GetSymbolFile(const FileSpec &platform_file, Status PlatformAppleTVSimulator::GetSharedModule( const ModuleSpec &module_spec, lldb_private::Process *process, ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, - ModuleSP *old_module_sp_ptr, bool *did_create_ptr) { + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { // For AppleTV, the SDK files are all cached locally 
on the host system. So // first we ask for the file in the cached SDK, then we attempt to get a // shared module for the right architecture with the right UUID. @@ -296,9 +296,9 @@ Status PlatformAppleTVSimulator::GetSharedModule( module_search_paths_ptr); } else { const bool always_create = false; - error = ModuleList::GetSharedModule( - module_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, always_create); + error = ModuleList::GetSharedModule(module_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, always_create); } if (module_sp) module_sp->SetPlatformFileSpec(platform_file); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h index 5a7b0ee0d7dc..247cac06a320 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.h @@ -55,7 +55,7 @@ class PlatformAppleTVSimulator : public PlatformAppleSimulator { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; uint32_t diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp index 03a8fcd31360..372dd9de9757 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp @@ -283,7 +283,7 @@ Status PlatformAppleWatchSimulator::GetSymbolFile(const FileSpec &platform_file, Status PlatformAppleWatchSimulator::GetSharedModule( const ModuleSpec &module_spec, lldb_private::Process *process, ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr, - ModuleSP *old_module_sp_ptr, bool 
*did_create_ptr) { + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { // For AppleWatch, the SDK files are all cached locally on the host system. // So first we ask for the file in the cached SDK, then we attempt to get a // shared module for the right architecture with the right UUID. @@ -297,9 +297,9 @@ Status PlatformAppleWatchSimulator::GetSharedModule( module_search_paths_ptr); } else { const bool always_create = false; - error = ModuleList::GetSharedModule( - module_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, always_create); + error = ModuleList::GetSharedModule(module_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, always_create); } if (module_sp) module_sp->SetPlatformFileSpec(platform_file); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h index 96dcd16ffa99..5becb8c0bf20 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.h @@ -55,7 +55,7 @@ class PlatformAppleWatchSimulator : public PlatformAppleSimulator { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; uint32_t diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp index 79cbc940feb5..6d1cf804a0ae 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp @@ -730,7 +730,7 @@ Status PlatformDarwinKernel::GetSharedModule( // framework on macOS systems, a chance. 
error = PlatformDarwin::GetSharedModule(module_spec, process, module_sp, module_search_paths_ptr, - old_module_sp_ptr, did_create_ptr); + old_modules, did_create_ptr); if (error.Success() && module_sp.get()) { return error; } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp index a890d0afdf1e..e293bd5b644c 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp @@ -286,8 +286,8 @@ Status PlatformiOSSimulator::GetSymbolFile(const FileSpec &platform_file, Status PlatformiOSSimulator::GetSharedModule( const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp, - const FileSpecList *module_search_paths_ptr, ModuleSP *old_module_sp_ptr, - bool *did_create_ptr) { + const FileSpecList *module_search_paths_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) { // For iOS, the SDK files are all cached locally on the host system. So first // we ask for the file in the cached SDK, then we attempt to get a shared // module for the right architecture with the right UUID. 
@@ -301,9 +301,9 @@ Status PlatformiOSSimulator::GetSharedModule( module_search_paths_ptr); } else { const bool always_create = false; - error = ModuleList::GetSharedModule( - module_spec, module_sp, module_search_paths_ptr, old_module_sp_ptr, - did_create_ptr, always_create); + error = ModuleList::GetSharedModule(module_spec, module_sp, + module_search_paths_ptr, old_modules, + did_create_ptr, always_create); } if (module_sp) module_sp->SetPlatformFileSpec(platform_file); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h index 4d416d759bd2..cc8e45a2be29 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.h @@ -57,7 +57,7 @@ class PlatformiOSSimulator : public PlatformAppleSimulator { GetSharedModule(const lldb_private::ModuleSpec &module_spec, lldb_private::Process *process, lldb::ModuleSP &module_sp, const lldb_private::FileSpecList *module_search_paths_ptr, - lldb::ModuleSP *old_module_sp_ptr, + llvm::SmallVectorImpl *old_modules, bool *did_create_ptr) override; uint32_t