From f54279668f5687cfc03b0a61226055d9826e3093 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 5 Jul 2024 13:48:08 +0100 Subject: [PATCH 01/67] [lld][ELF] Annotate Bitfields with Preferred Types (#97737) --- lld/ELF/InputSection.h | 4 ++++ lld/ELF/Symbols.h | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 58e5306fd6dcdf..ec12235f842a93 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -52,14 +52,17 @@ class SectionBase { Kind kind() const { return (Kind)sectionKind; } + LLVM_PREFERRED_TYPE(Kind) uint8_t sectionKind : 3; // The next two bit fields are only used by InputSectionBase, but we // put them here so the struct packs better. + LLVM_PREFERRED_TYPE(bool) uint8_t bss : 1; // Set for sections that should not be folded by ICF. + LLVM_PREFERRED_TYPE(bool) uint8_t keepUnique : 1; uint8_t partition = 1; @@ -282,6 +285,7 @@ struct SectionPiece { : inputOff(off), live(live), hash(hash >> 1) {} uint32_t inputOff; + LLVM_PREFERRED_TYPE(bool) uint32_t live : 1; uint32_t hash : 31; uint64_t outputOff = 0; diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index c65c5d6cd0dca8..e764fe8d736330 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -115,18 +115,21 @@ class Symbol { uint8_t partition; // True if this symbol is preemptible at load time. + LLVM_PREFERRED_TYPE(bool) uint8_t isPreemptible : 1; // True if the symbol was used for linking and thus need to be added to the // output file's symbol table. This is true for all symbols except for // unreferenced DSO symbols, lazy (archive) symbols, and bitcode symbols that // are unreferenced except by other bitcode objects. + LLVM_PREFERRED_TYPE(bool) uint8_t isUsedInRegularObj : 1; // True if an undefined or shared symbol is used from a live section. // // NOTE: In Writer.cpp the field is used to mark local defined symbols // which are referenced by relocations when -r or --emit-relocs is given. 
+ LLVM_PREFERRED_TYPE(bool) uint8_t used : 1; // Used by a Defined symbol with protected or default visibility, to record @@ -138,27 +141,33 @@ class Symbol { // - If -shared or --export-dynamic is specified, any symbol in an object // file/bitcode sets this property, unless suppressed by LTO // canBeOmittedFromSymbolTable(). + LLVM_PREFERRED_TYPE(bool) uint8_t exportDynamic : 1; // True if the symbol is in the --dynamic-list file. A Defined symbol with // protected or default visibility with this property is required to be // exported into .dynsym. + LLVM_PREFERRED_TYPE(bool) uint8_t inDynamicList : 1; // Used to track if there has been at least one undefined reference to the // symbol. For Undefined and SharedSymbol, the binding may change to STB_WEAK // if the first undefined reference from a non-shared object is weak. + LLVM_PREFERRED_TYPE(bool) uint8_t referenced : 1; // Used to track if this symbol will be referenced after wrapping is performed // (i.e. this will be true for foo if __real_foo is referenced, and will be // true for __wrap_foo if foo is referenced). + LLVM_PREFERRED_TYPE(bool) uint8_t referencedAfterWrap : 1; // True if this symbol is specified by --trace-symbol option. + LLVM_PREFERRED_TYPE(bool) uint8_t traced : 1; // True if the name contains '@'. + LLVM_PREFERRED_TYPE(bool) uint8_t hasVersionSuffix : 1; // Symbol visibility. This is the computed minimum visibility of all @@ -270,13 +279,16 @@ class Symbol { public: // True if this symbol is in the Iplt sub-section of the Plt and the Igot // sub-section of the .got.plt or .got. + LLVM_PREFERRED_TYPE(bool) uint8_t isInIplt : 1; // True if this symbol needs a GOT entry and its GOT entry is actually in // Igot. This will be true only for certain non-preemptible ifuncs. + LLVM_PREFERRED_TYPE(bool) uint8_t gotInIgot : 1; // True if defined relative to a section discarded by ICF. + LLVM_PREFERRED_TYPE(bool) uint8_t folded : 1; // Allow reuse of a bit between architecture-exclusive symbol flags. 
@@ -284,6 +296,7 @@ class Symbol { // followed by a restore of the toc pointer. // - isTagged(): On AArch64, true if the symbol needs special relocation and // metadata semantics because it's tagged, under the AArch64 MemtagABI. + LLVM_PREFERRED_TYPE(bool) uint8_t archSpecificBit : 1; bool needsTocRestore() const { return archSpecificBit; } bool isTagged() const { return archSpecificBit; } @@ -296,13 +309,16 @@ class Symbol { // // LTO shouldn't inline the symbol because it doesn't know the final content // of the symbol. + LLVM_PREFERRED_TYPE(bool) uint8_t scriptDefined : 1; // True if defined in a DSO. There may also be a definition in a relocatable // object file. + LLVM_PREFERRED_TYPE(bool) uint8_t dsoDefined : 1; // True if defined in a DSO as protected visibility. + LLVM_PREFERRED_TYPE(bool) uint8_t dsoProtected : 1; // Temporary flags used to communicate which symbol entries need PLT and GOT @@ -319,9 +335,11 @@ class Symbol { // to a Verneed index in the output. Otherwise, this represents the Verdef // index (VER_NDX_LOCAL, VER_NDX_GLOBAL, or a named version). uint16_t versionId; + LLVM_PREFERRED_TYPE(bool) uint8_t versionScriptAssigned : 1; // True if targeted by a range extension thunk. + LLVM_PREFERRED_TYPE(bool) uint8_t thunkAccessed : 1; void setFlags(uint16_t bits) { From 7102eae4c0640492e1fcea3da7f22f4e75a4f062 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 5 Jul 2024 13:48:29 +0100 Subject: [PATCH 02/67] [lld][ELF] Annotate Bitfields with Preferred Types (#97737) From d4216b5d0b111879f153c53caecf8ea011296cec Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Fri, 5 Jul 2024 14:08:07 +0100 Subject: [PATCH 03/67] [clang][CodeGen][AMDGPU] Enable AMDGPU `printf` for `spirv64-amd-amdhsa` (#97132) This enables the AMDGPU specific implementation of `printf` when compiling for AMDGCN flavoured SPIR-V, the consequence being that the expansion into ROCDL calls & friends gets expanded before "lowering" to SPIR-V and gets carried through. 
The only relatively "novel" aspect is that the `callAppendStringN` is simplified to take the type of the passed in arguments, as opposed to querying them from the module. This is a neutral change since the arguments were passed directly to the call, without any attempt to cast them, hence the assumption that the actual types match the formal ones was already baked in. --- clang/lib/CodeGen/CGBuiltin.cpp | 8 +- clang/lib/CodeGen/CGGPUBuiltin.cpp | 4 +- clang/test/CodeGenHIP/printf-builtin.hip | 8 + clang/test/CodeGenHIP/printf.cpp | 229 +++++++++++++++--- .../lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 7 +- 5 files changed, 222 insertions(+), 34 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5b92f1837980c5..268137b319f76f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5888,12 +5888,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_printf: case Builtin::BIprintf: if (getTarget().getTriple().isNVPTX() || - getTarget().getTriple().isAMDGCN()) { + getTarget().getTriple().isAMDGCN() || + (getTarget().getTriple().isSPIRV() && + getTarget().getTriple().getVendor() == Triple::VendorType::AMD)) { if (getLangOpts().OpenMPIsTargetDevice) return EmitOpenMPDevicePrintfCallExpr(E); if (getTarget().getTriple().isNVPTX()) return EmitNVPTXDevicePrintfCallExpr(E); - if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP) + if ((getTarget().getTriple().isAMDGCN() || + getTarget().getTriple().isSPIRV()) && + getLangOpts().HIP) return EmitAMDGPUDevicePrintfCallExpr(E); } diff --git a/clang/lib/CodeGen/CGGPUBuiltin.cpp b/clang/lib/CodeGen/CGGPUBuiltin.cpp index bd95541647bcff..b2340732afeb5a 100644 --- a/clang/lib/CodeGen/CGGPUBuiltin.cpp +++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp @@ -179,7 +179,9 @@ RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) { } RValue 
CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) { - assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn); + assert(getTarget().getTriple().isAMDGCN() || + (getTarget().getTriple().isSPIRV() && + getTarget().getTriple().getVendor() == llvm::Triple::AMD)); assert(E->getBuiltinCallee() == Builtin::BIprintf || E->getBuiltinCallee() == Builtin::BI__builtin_printf); assert(E->getNumArgs() >= 1); // printf always has at least one arg. diff --git a/clang/test/CodeGenHIP/printf-builtin.hip b/clang/test/CodeGenHIP/printf-builtin.hip index df1fbbb6d637a1..506bb6a1758723 100644 --- a/clang/test/CodeGenHIP/printf-builtin.hip +++ b/clang/test/CodeGenHIP/printf-builtin.hip @@ -1,8 +1,12 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -disable-llvm-optzns -mprintf-kind=hostcall -fno-builtin-printf -fcuda-is-device \ // RUN: -o - %s | FileCheck --check-prefixes=CHECK,HOSTCALL %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -disable-llvm-optzns -mprintf-kind=hostcall -fno-builtin-printf -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefixes=CHECK-AMDGCNSPIRV,HOSTCALL-AMDGCNSPIRV %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -disable-llvm-optzns -mprintf-kind=buffered -fno-builtin-printf -fcuda-is-device \ // RUN: -o - %s | FileCheck --check-prefixes=CHECK,BUFFERED %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -disable-llvm-optzns -mprintf-kind=buffered -fno-builtin-printf -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefixes=CHECK-AMDGCNSPIRV,BUFFERED-AMDGCNSPIRV %s #define __device__ __attribute__((device)) @@ -11,13 +15,17 @@ extern "C" __device__ int printf(const char *format, ...); // CHECK-LABEL: @_Z4foo1v() __device__ int foo1() { // HOSTCALL: call i64 @__ockl_printf_begin + // HOSTCALL-AMDGCNSPIRV: call addrspace(4) i64 @__ockl_printf_begin // BUFFERED: call ptr addrspace(1) @__printf_alloc + // BUFFERED-AMDGCNSPIRV: call addrspace(4) 
ptr addrspace(1) @__printf_alloc // CHECK-NOT: call i32 (ptr, ...) @printf + // CHECK-AMDGCNSPIRV-NOT: call i32 (ptr, ...) @printf return __builtin_printf("Hello World\n"); } // CHECK-LABEL: @_Z4foo2v() __device__ int foo2() { // CHECK: call i32 (ptr, ...) @printf + // CHECK-AMDGCNSPIRV: call spir_func addrspace(4) i32 (ptr addrspace(4), ...) @printf return printf("Hello World\n"); } diff --git a/clang/test/CodeGenHIP/printf.cpp b/clang/test/CodeGenHIP/printf.cpp index 8c8b801dbbf560..2d814d6fdc7042 100644 --- a/clang/test/CodeGenHIP/printf.cpp +++ b/clang/test/CodeGenHIP/printf.cpp @@ -1,44 +1,219 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \ -// RUN: -o - %s | FileCheck --enable-var-scope %s +// RUN: -o - %s | FileCheck --check-prefix=AMDGCN --enable-var-scope %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefix=AMDGCNSPIRV --enable-var-scope %s #define __device__ __attribute__((device)) extern "C" __device__ int printf(const char *format, ...); +// AMDGCN-LABEL: define dso_local noundef i32 @_Z4foo1v( +// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*]]: +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[S:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[S]] to ptr +// AMDGCN-NEXT: store ptr addrspacecast (ptr addrspace(4) @.str to ptr), ptr [[S_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// AMDGCN-NEXT: [[TMP3:%.*]] = icmp eq ptr 
addrspacecast (ptr addrspace(4) @.str.1 to ptr), null +// AMDGCN-NEXT: br i1 [[TMP3]], label %[[STRLEN_JOIN:.*]], label %[[STRLEN_WHILE:.*]] +// AMDGCN: [[STRLEN_WHILE]]: +// AMDGCN-NEXT: [[TMP4:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.1 to ptr), %[[ENTRY]] ], [ [[TMP5:%.*]], %[[STRLEN_WHILE]] ] +// AMDGCN-NEXT: [[TMP5]] = getelementptr i8, ptr [[TMP4]], i64 1 +// AMDGCN-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCN-NEXT: br i1 [[TMP7]], label %[[STRLEN_WHILE_DONE:.*]], label %[[STRLEN_WHILE]] +// AMDGCN: [[STRLEN_WHILE_DONE]]: +// AMDGCN-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// AMDGCN-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr) to i64) +// AMDGCN-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// AMDGCN-NEXT: br label %[[STRLEN_JOIN]] +// AMDGCN: [[STRLEN_JOIN]]: +// AMDGCN-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], %[[STRLEN_WHILE_DONE]] ], [ 0, %[[ENTRY]] ] +// AMDGCN-NEXT: [[TMP12:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspacecast (ptr addrspace(4) @.str.1 to ptr), i64 [[TMP11]], i32 0) +// AMDGCN-NEXT: [[TMP13:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP12]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCN-NEXT: [[TMP14:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP13]], i32 1, i64 4614256650576692846, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCN-NEXT: [[TMP15:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP14]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCN-NEXT: [[TMP16:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP15]], i32 1, i64 4, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCN-NEXT: [[TMP17:%.*]] = icmp eq ptr [[TMP0]], null +// AMDGCN-NEXT: br i1 [[TMP17]], label %[[STRLEN_JOIN1:.*]], label %[[STRLEN_WHILE2:.*]] +// AMDGCN: [[STRLEN_WHILE2]]: +// AMDGCN-NEXT: 
[[TMP18:%.*]] = phi ptr [ [[TMP0]], %[[STRLEN_JOIN]] ], [ [[TMP19:%.*]], %[[STRLEN_WHILE2]] ] +// AMDGCN-NEXT: [[TMP19]] = getelementptr i8, ptr [[TMP18]], i64 1 +// AMDGCN-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP18]], align 1 +// AMDGCN-NEXT: [[TMP21:%.*]] = icmp eq i8 [[TMP20]], 0 +// AMDGCN-NEXT: br i1 [[TMP21]], label %[[STRLEN_WHILE_DONE3:.*]], label %[[STRLEN_WHILE2]] +// AMDGCN: [[STRLEN_WHILE_DONE3]]: +// AMDGCN-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// AMDGCN-NEXT: [[TMP23:%.*]] = ptrtoint ptr [[TMP18]] to i64 +// AMDGCN-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], [[TMP22]] +// AMDGCN-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 1 +// AMDGCN-NEXT: br label %[[STRLEN_JOIN1]] +// AMDGCN: [[STRLEN_JOIN1]]: +// AMDGCN-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP25]], %[[STRLEN_WHILE_DONE3]] ], [ 0, %[[STRLEN_JOIN]] ] +// AMDGCN-NEXT: [[TMP27:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP16]], ptr [[TMP0]], i64 [[TMP26]], i32 0) +// AMDGCN-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// AMDGCN-NEXT: [[TMP29:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP27]], i32 1, i64 [[TMP28]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// AMDGCN-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32 +// AMDGCN-NEXT: ret i32 [[TMP30]] +// +// AMDGCNSPIRV-LABEL: define spir_func noundef i32 @_Z4foo1v( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[S_ASCAST:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: store ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str to ptr addrspace(4)), ptr addrspace(4) [[S_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: 
[[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[S_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = call addrspace(4) i64 @__ockl_printf_begin(i64 0) +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = icmp eq ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), null +// AMDGCNSPIRV-NEXT: br i1 [[TMP3]], label %[[STRLEN_JOIN:.*]], label %[[STRLEN_WHILE:.*]] +// AMDGCNSPIRV: [[STRLEN_WHILE]]: +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = phi ptr addrspace(4) [ addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), %[[ENTRY]] ], [ [[TMP5:%.*]], %[[STRLEN_WHILE]] ] +// AMDGCNSPIRV-NEXT: [[TMP5]] = getelementptr i8, ptr addrspace(4) [[TMP4]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(4) [[TMP4]], align 1 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TMP7]], label %[[STRLEN_WHILE_DONE:.*]], label %[[STRLEN_WHILE]] +// AMDGCNSPIRV: [[STRLEN_WHILE_DONE]]: +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = ptrtoint ptr addrspace(4) [[TMP4]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], ptrtoint (ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)) to i64) +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// AMDGCNSPIRV-NEXT: br label %[[STRLEN_JOIN]] +// AMDGCNSPIRV: [[STRLEN_JOIN]]: +// AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], %[[STRLEN_WHILE_DONE]] ], [ 0, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP12:%.*]] = call addrspace(4) i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.1 to ptr addrspace(4)), i64 [[TMP11]], i32 0) +// AMDGCNSPIRV-NEXT: [[TMP13:%.*]] = call addrspace(4) i64 @__ockl_printf_append_args(i64 [[TMP12]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCNSPIRV-NEXT: [[TMP14:%.*]] = call addrspace(4) i64 @__ockl_printf_append_args(i64 [[TMP13]], i32 1, i64 4614256650576692846, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// 
AMDGCNSPIRV-NEXT: [[TMP15:%.*]] = call addrspace(4) i64 @__ockl_printf_append_args(i64 [[TMP14]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCNSPIRV-NEXT: [[TMP16:%.*]] = call addrspace(4) i64 @__ockl_printf_append_args(i64 [[TMP15]], i32 1, i64 4, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) +// AMDGCNSPIRV-NEXT: [[TMP17:%.*]] = icmp eq ptr addrspace(4) [[TMP0]], null +// AMDGCNSPIRV-NEXT: br i1 [[TMP17]], label %[[STRLEN_JOIN1:.*]], label %[[STRLEN_WHILE2:.*]] +// AMDGCNSPIRV: [[STRLEN_WHILE2]]: +// AMDGCNSPIRV-NEXT: [[TMP18:%.*]] = phi ptr addrspace(4) [ [[TMP0]], %[[STRLEN_JOIN]] ], [ [[TMP19:%.*]], %[[STRLEN_WHILE2]] ] +// AMDGCNSPIRV-NEXT: [[TMP19]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP20:%.*]] = load i8, ptr addrspace(4) [[TMP18]], align 1 +// AMDGCNSPIRV-NEXT: [[TMP21:%.*]] = icmp eq i8 [[TMP20]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TMP21]], label %[[STRLEN_WHILE_DONE3:.*]], label %[[STRLEN_WHILE2]] +// AMDGCNSPIRV: [[STRLEN_WHILE_DONE3]]: +// AMDGCNSPIRV-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(4) [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(4) [[TMP18]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], [[TMP22]] +// AMDGCNSPIRV-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 1 +// AMDGCNSPIRV-NEXT: br label %[[STRLEN_JOIN1]] +// AMDGCNSPIRV: [[STRLEN_JOIN1]]: +// AMDGCNSPIRV-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP25]], %[[STRLEN_WHILE_DONE3]] ], [ 0, %[[STRLEN_JOIN]] ] +// AMDGCNSPIRV-NEXT: [[TMP27:%.*]] = call addrspace(4) i64 @__ockl_printf_append_string_n(i64 [[TMP16]], ptr addrspace(4) [[TMP0]], i64 [[TMP26]], i32 0) +// AMDGCNSPIRV-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(4) [[TMP1]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP29:%.*]] = call addrspace(4) i64 @__ockl_printf_append_args(i64 [[TMP27]], i32 1, i64 [[TMP28]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// AMDGCNSPIRV-NEXT: [[TMP30:%.*]] = trunc i64 [[TMP29]] to i32 +// 
AMDGCNSPIRV-NEXT: ret i32 [[TMP30]] +// __device__ int foo1() { const char *s = "hello world"; return printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 4, s, s); } -// CHECK-LABEL: @_Z4foo1v() -// CHECK: [[BEGIN:%.*]] = call i64 @__ockl_printf_begin(i64 0) -// CHECK: [[STRLEN1:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] -// CHECK: [[APPEND1:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[BEGIN]], {{.*}}, i64 [[STRLEN1]], i32 0) -// CHECK: [[APPEND2:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND1]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) -// CHECK: [[APPEND3:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND2]], i32 1, i64 4614256650576692846, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) -// CHECK: [[APPEND4:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND3]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) -// CHECK: [[APPEND5:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND4]], i32 1, i64 4, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0) -// CHECK: [[STRLEN2:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] -// CHECK: [[APPEND6:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[APPEND5]], {{.*}}, i64 [[STRLEN2]], i32 0) -// CHECK: [[PTR2INT:%.*]] = ptrtoint ptr %{{.*}} to i64 -// CHECK: [[APPEND7:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND6]], i32 1, i64 [[PTR2INT]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) -// CHECK: [[RETURN:%.*]] = trunc i64 [[APPEND7]] to i32 -// CHECK: ret i32 [[RETURN]] - __device__ char *dstr; +// AMDGCN-LABEL: define dso_local noundef i32 @_Z4foo2v( +// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*]]: +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @dstr to ptr), align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr, 
ptr addrspacecast (ptr addrspace(1) @dstr to ptr), align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = call i64 @__ockl_printf_begin(i64 0) +// AMDGCN-NEXT: [[TMP3:%.*]] = icmp eq ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), null +// AMDGCN-NEXT: br i1 [[TMP3]], label %[[STRLEN_JOIN:.*]], label %[[STRLEN_WHILE:.*]] +// AMDGCN: [[STRLEN_WHILE]]: +// AMDGCN-NEXT: [[TMP4:%.*]] = phi ptr [ addrspacecast (ptr addrspace(4) @.str.2 to ptr), %[[ENTRY]] ], [ [[TMP5:%.*]], %[[STRLEN_WHILE]] ] +// AMDGCN-NEXT: [[TMP5]] = getelementptr i8, ptr [[TMP4]], i64 1 +// AMDGCN-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCN-NEXT: br i1 [[TMP7]], label %[[STRLEN_WHILE_DONE:.*]], label %[[STRLEN_WHILE]] +// AMDGCN: [[STRLEN_WHILE_DONE]]: +// AMDGCN-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// AMDGCN-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], ptrtoint (ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr) to i64) +// AMDGCN-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// AMDGCN-NEXT: br label %[[STRLEN_JOIN]] +// AMDGCN: [[STRLEN_JOIN]]: +// AMDGCN-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], %[[STRLEN_WHILE_DONE]] ], [ 0, %[[ENTRY]] ] +// AMDGCN-NEXT: [[TMP12:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspacecast (ptr addrspace(4) @.str.2 to ptr), i64 [[TMP11]], i32 0) +// AMDGCN-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP0]], null +// AMDGCN-NEXT: br i1 [[TMP13]], label %[[STRLEN_JOIN1:.*]], label %[[STRLEN_WHILE2:.*]] +// AMDGCN: [[STRLEN_WHILE2]]: +// AMDGCN-NEXT: [[TMP14:%.*]] = phi ptr [ [[TMP0]], %[[STRLEN_JOIN]] ], [ [[TMP15:%.*]], %[[STRLEN_WHILE2]] ] +// AMDGCN-NEXT: [[TMP15]] = getelementptr i8, ptr [[TMP14]], i64 1 +// AMDGCN-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1 +// AMDGCN-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP16]], 0 +// AMDGCN-NEXT: br i1 [[TMP17]], label %[[STRLEN_WHILE_DONE3:.*]], label %[[STRLEN_WHILE2]] +// AMDGCN: [[STRLEN_WHILE_DONE3]]: +// AMDGCN-NEXT: 
[[TMP18:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// AMDGCN-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP14]] to i64 +// AMDGCN-NEXT: [[TMP20:%.*]] = sub i64 [[TMP19]], [[TMP18]] +// AMDGCN-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 1 +// AMDGCN-NEXT: br label %[[STRLEN_JOIN1]] +// AMDGCN: [[STRLEN_JOIN1]]: +// AMDGCN-NEXT: [[TMP22:%.*]] = phi i64 [ [[TMP21]], %[[STRLEN_WHILE_DONE3]] ], [ 0, %[[STRLEN_JOIN]] ] +// AMDGCN-NEXT: [[TMP23:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[TMP12]], ptr [[TMP0]], i64 [[TMP22]], i32 0) +// AMDGCN-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// AMDGCN-NEXT: [[TMP25:%.*]] = call i64 @__ockl_printf_append_args(i64 [[TMP23]], i32 1, i64 [[TMP24]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// AMDGCN-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 +// AMDGCN-NEXT: ret i32 [[TMP26]] +// +// AMDGCNSPIRV-LABEL: define spir_func noundef i32 @_Z4foo2v( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: +// AMDGCNSPIRV-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) addrspacecast (ptr addrspace(1) @dstr to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) addrspacecast (ptr addrspace(1) @dstr to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = call addrspace(4) i64 @__ockl_printf_begin(i64 0) +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = icmp eq ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)), null +// AMDGCNSPIRV-NEXT: br i1 [[TMP3]], label %[[STRLEN_JOIN:.*]], label %[[STRLEN_WHILE:.*]] +// AMDGCNSPIRV: [[STRLEN_WHILE]]: +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = phi ptr addrspace(4) [ addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)), %[[ENTRY]] ], [ [[TMP5:%.*]], %[[STRLEN_WHILE]] ] +// AMDGCNSPIRV-NEXT: [[TMP5]] = 
getelementptr i8, ptr addrspace(4) [[TMP4]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr addrspace(4) [[TMP4]], align 1 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TMP7]], label %[[STRLEN_WHILE_DONE:.*]], label %[[STRLEN_WHILE]] +// AMDGCNSPIRV: [[STRLEN_WHILE_DONE]]: +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = ptrtoint ptr addrspace(4) [[TMP4]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], ptrtoint (ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)) to i64) +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 1 +// AMDGCNSPIRV-NEXT: br label %[[STRLEN_JOIN]] +// AMDGCNSPIRV: [[STRLEN_JOIN]]: +// AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = phi i64 [ [[TMP10]], %[[STRLEN_WHILE_DONE]] ], [ 0, %[[ENTRY]] ] +// AMDGCNSPIRV-NEXT: [[TMP12:%.*]] = call addrspace(4) i64 @__ockl_printf_append_string_n(i64 [[TMP2]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @.str.2 to ptr addrspace(4)), i64 [[TMP11]], i32 0) +// AMDGCNSPIRV-NEXT: [[TMP13:%.*]] = icmp eq ptr addrspace(4) [[TMP0]], null +// AMDGCNSPIRV-NEXT: br i1 [[TMP13]], label %[[STRLEN_JOIN1:.*]], label %[[STRLEN_WHILE2:.*]] +// AMDGCNSPIRV: [[STRLEN_WHILE2]]: +// AMDGCNSPIRV-NEXT: [[TMP14:%.*]] = phi ptr addrspace(4) [ [[TMP0]], %[[STRLEN_JOIN]] ], [ [[TMP15:%.*]], %[[STRLEN_WHILE2]] ] +// AMDGCNSPIRV-NEXT: [[TMP15]] = getelementptr i8, ptr addrspace(4) [[TMP14]], i64 1 +// AMDGCNSPIRV-NEXT: [[TMP16:%.*]] = load i8, ptr addrspace(4) [[TMP14]], align 1 +// AMDGCNSPIRV-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP16]], 0 +// AMDGCNSPIRV-NEXT: br i1 [[TMP17]], label %[[STRLEN_WHILE_DONE3:.*]], label %[[STRLEN_WHILE2]] +// AMDGCNSPIRV: [[STRLEN_WHILE_DONE3]]: +// AMDGCNSPIRV-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(4) [[TMP0]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(4) [[TMP14]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP20:%.*]] = sub i64 [[TMP19]], [[TMP18]] +// AMDGCNSPIRV-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 
1 +// AMDGCNSPIRV-NEXT: br label %[[STRLEN_JOIN1]] +// AMDGCNSPIRV: [[STRLEN_JOIN1]]: +// AMDGCNSPIRV-NEXT: [[TMP22:%.*]] = phi i64 [ [[TMP21]], %[[STRLEN_WHILE_DONE3]] ], [ 0, %[[STRLEN_JOIN]] ] +// AMDGCNSPIRV-NEXT: [[TMP23:%.*]] = call addrspace(4) i64 @__ockl_printf_append_string_n(i64 [[TMP12]], ptr addrspace(4) [[TMP0]], i64 [[TMP22]], i32 0) +// AMDGCNSPIRV-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(4) [[TMP1]] to i64 +// AMDGCNSPIRV-NEXT: [[TMP25:%.*]] = call addrspace(4) i64 @__ockl_printf_append_args(i64 [[TMP23]], i32 1, i64 [[TMP24]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) +// AMDGCNSPIRV-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 +// AMDGCNSPIRV-NEXT: ret i32 [[TMP26]] +// __device__ int foo2() { return printf("%s %p\n", dstr, dstr); } - -// CHECK-LABEL: @_Z4foo2v() -// CHECK: [[BEGIN:%.*]] = call i64 @__ockl_printf_begin(i64 0) -// CHECK: [[STRLEN1:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] -// CHECK: [[APPEND1:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[BEGIN]], {{.*}}, i64 [[STRLEN1]], i32 0) -// CHECK: [[STRLEN2:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ] -// CHECK: [[APPEND2:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[APPEND1]], {{.*}}, i64 [[STRLEN2]], i32 0) -// CHECK: [[PTR2INT:%.*]] = ptrtoint ptr %{{.*}} to i64 -// CHECK: [[APPEND3:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND2]], i32 1, i64 [[PTR2INT]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1) -// CHECK: [[RETURN:%.*]] = trunc i64 [[APPEND3]] to i32 -// CHECK: ret i32 [[RETURN]] diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp index be8264f1f42e58..a25632acbfcc3a 100644 --- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp +++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp @@ -154,12 +154,11 @@ static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) { static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str, 
Value *Length, bool isLast) { auto Int64Ty = Builder.getInt64Ty(); - auto PtrTy = Builder.getPtrTy(); - auto Int32Ty = Builder.getInt32Ty(); + auto IsLastInt32 = Builder.getInt32(isLast); auto M = Builder.GetInsertBlock()->getModule(); auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty, - Int64Ty, PtrTy, Int64Ty, Int32Ty); - auto IsLastInt32 = Builder.getInt32(isLast); + Desc->getType(), Str->getType(), + Length->getType(), IsLastInt32->getType()); return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32}); } From 9f2215ae5517a3d6e1d59e597bff83b821b96c7e Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Fri, 5 Jul 2024 09:54:37 -0400 Subject: [PATCH 04/67] [libc][math] Fix signed zeros for erff. (#97742) The inexact exception flag was raised for the exact cases of signed zeros. This was reported by Paul Zimmermann using the CORE-MATH test suites. --- libc/src/math/generic/erff.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/libc/src/math/generic/erff.cpp b/libc/src/math/generic/erff.cpp index f120d5646e0439..aa7baffc7815e9 100644 --- a/libc/src/math/generic/erff.cpp +++ b/libc/src/math/generic/erff.cpp @@ -127,15 +127,6 @@ LLVM_LIBC_FUNCTION(float, erff, (float x)) { uint32_t x_u = xbits.uintval(); uint32_t x_abs = x_u & 0x7fff'ffffU; - // Exceptional values - if (LIBC_UNLIKELY(x_abs == 0x3f65'9229U)) // |x| = 0x1.cb2452p-1f - return x < 0.0f ? fputil::round_result_slightly_down(-0x1.972ea8p-1f) - : fputil::round_result_slightly_up(0x1.972ea8p-1f); - if (LIBC_UNLIKELY(x_abs == 0x4004'1e6aU)) // |x| = 0x1.083cd4p+1f - return x < 0.0f ? 
fputil::round_result_slightly_down(-0x1.fe3462p-1f) - : fputil::round_result_slightly_up(0x1.fe3462p-1f); - - // if (LIBC_UNLIKELY(x_abs > 0x407a'd444U)) { if (LIBC_UNLIKELY(x_abs >= 0x4080'0000U)) { const float ONE[2] = {1.0f, -1.0f}; const float SMALL[2] = {-0x1.0p-25f, 0x1.0p-25f}; @@ -149,6 +140,21 @@ LLVM_LIBC_FUNCTION(float, erff, (float x)) { return ONE[sign] + SMALL[sign]; } + // Exceptional mask = common 0 bits of 2 exceptional values. + constexpr uint32_t EXCEPT_MASK = 0x809a'6184U; + + if (LIBC_UNLIKELY((x_abs & EXCEPT_MASK) == 0)) { + // Exceptional values + if (LIBC_UNLIKELY(x_abs == 0x3f65'9229U)) // |x| = 0x1.cb2452p-1f + return x < 0.0f ? fputil::round_result_slightly_down(-0x1.972ea8p-1f) + : fputil::round_result_slightly_up(0x1.972ea8p-1f); + if (LIBC_UNLIKELY(x_abs == 0x4004'1e6aU)) // |x| = 0x1.083cd4p+1f + return x < 0.0f ? fputil::round_result_slightly_down(-0x1.fe3462p-1f) + : fputil::round_result_slightly_up(0x1.fe3462p-1f); + if (x_abs == 0U) + return x; + } + // Polynomial approximation: // erf(x) ~ x * (c0 + c1 * x^2 + c2 * x^4 + ... 
+ c7 * x^14) double xd = static_cast(x); From ac76ce2693b25bc3c643360acc18ebc01d22c072 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Fri, 5 Jul 2024 15:58:01 +0200 Subject: [PATCH 05/67] [libc][math][c23] Classify f16fma{,f,l} as LLVM libc extensions (#97728) --- libc/docs/math/index.rst | 2 +- libc/spec/llvm_libc_ext.td | 4 ++++ libc/spec/stdc.td | 3 --- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index b70f29a986e141..5c4464b552cc66 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -128,7 +128,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | f16div | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.4 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| f16fma | |check| | |check| | |check| | N/A | |check| | 7.12.14.5 | F.10.11 | +| f16fma | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.5 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | f16sub | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.2 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td index b994e7ca56a933..c0374cb6311918 100644 --- a/libc/spec/llvm_libc_ext.td +++ b/libc/spec/llvm_libc_ext.td @@ -69,6 +69,10 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> { GuardedFunctionSpec<"f16divf", RetValSpec, [ArgSpec, 
ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16divl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16fmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16sqrt", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16sqrtf", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16sqrtl", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 9ff40bf76700c8..feaa3fbfa66aa5 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -477,9 +477,6 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, - GuardedFunctionSpec<"f16fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, - GuardedFunctionSpec<"f16fmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16fmaf128", RetValSpec, [ArgSpec, ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, FunctionSpec<"fmod", RetValSpec, [ArgSpec, ArgSpec]>, From 18b575d4aac603b6acd3fa0d639fbc79cd4f0ac3 Mon Sep 17 00:00:00 2001 From: Zibi Sarbinowski Date: Fri, 5 Jul 2024 10:13:02 -0400 Subject: [PATCH 06/67] [libc++abi] Fixing up LIBCXXABI_ADDITIONAL_COMPILE_FLAGS (#97608) This is the continuation of #96112 which implements proposal from Louis. Using PRIVATE option on target_compile_options() fixes the issue of propagating the option into lib++. 
--- libcxxabi/src/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index d6fcd72dcb1bd8..c1a7bcb14eb199 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -182,7 +182,7 @@ set_target_properties(cxxabi_shared_objects if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) set_target_properties(cxxabi_shared_objects PROPERTIES POSITION_INDEPENDENT_CODE ON) # must set manually because it's an object library endif() -target_compile_options(cxxabi_shared_objects PUBLIC "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") +target_compile_options(cxxabi_shared_objects PRIVATE "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") if (LIBCXXABI_ENABLE_SHARED) add_library(cxxabi_shared SHARED) @@ -262,7 +262,7 @@ set_target_properties(cxxabi_static_objects CXX_STANDARD_REQUIRED OFF # TODO: Make this REQUIRED once we don't need to accommodate the LLVM documentation builders using an ancient CMake COMPILE_FLAGS "${LIBCXXABI_COMPILE_FLAGS}" ) -target_compile_options(cxxabi_static_objects PUBLIC "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") +target_compile_options(cxxabi_static_objects PRIVATE "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) target_add_compile_flags_if_supported(cxxabi_static_objects PRIVATE -fvisibility=hidden) From 2bc474b7e6d064d48610594fa5663582126900c4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 5 Jul 2024 15:26:49 +0100 Subject: [PATCH 07/67] [InstCombine][X86] Pull out repeated uses of PatternMatch namespace. NFC. 
Followup requested on #96882 --- .../Target/X86/X86InstCombineIntrinsic.cpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 163584b3750d38..8eea368b5f86f9 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -21,6 +21,7 @@ #include using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "x86tti" @@ -44,8 +45,7 @@ static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) { // Mask was extended from a boolean vector. Value *ExtMask; - if (PatternMatch::match( - Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && + if (match(Mask, m_SExt(m_Value(ExtMask))) && ExtMask->getType()->isIntOrIntVectorTy(1)) return ExtMask; @@ -523,10 +523,10 @@ static Value *simplifyX86pmulh(IntrinsicInst &II, // Multiply by one. if (!IsRounding) { - if (match(Arg0, PatternMatch::m_One())) + if (match(Arg0, m_One())) return IsSigned ? Builder.CreateAShr(Arg1, 15) : ConstantAggregateZero::get(ResTy); - if (match(Arg1, PatternMatch::m_One())) + if (match(Arg1, m_One())) return IsSigned ? Builder.CreateAShr(Arg0, 15) : ConstantAggregateZero::get(ResTy); } @@ -655,7 +655,7 @@ static Value *simplifyX86addcarry(const IntrinsicInst &II, "Unexpected types for x86 addcarry"); // If carry-in is zero, this is just an unsigned add with overflow. - if (match(CarryIn, PatternMatch::m_ZeroInt())) { + if (match(CarryIn, m_ZeroInt())) { Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, {Op1, Op2}); // The types have to be adjusted to match the x86 call types. 
@@ -699,9 +699,9 @@ static Value *simplifyTernarylogic(const IntrinsicInst &II, auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); }; auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); }; - bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant()); - bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant()); - bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant()); + bool AIsConst = match(ArgA, m_ImmConstant()); + bool BIsConst = match(ArgB, m_ImmConstant()); + bool CIsConst = match(ArgC, m_ImmConstant()); bool ABIsConst = AIsConst && BIsConst; bool ACIsConst = AIsConst && CIsConst; @@ -2887,9 +2887,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // intrinsics. Value *MaskSrc = nullptr; ArrayRef ShuffleMask; - if (match(Mask, PatternMatch::m_OneUse(PatternMatch::m_Shuffle( - PatternMatch::m_Value(MaskSrc), PatternMatch::m_Undef(), - PatternMatch::m_Mask(ShuffleMask))))) { + if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(), + m_Mask(ShuffleMask))))) { // Bail if the shuffle was irregular or contains undefs. int NumElts = cast(MaskSrc->getType())->getNumElements(); if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) || @@ -2903,7 +2902,7 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // vector condition value. Value *BoolVec; Mask = InstCombiner::peekThroughBitcast(Mask); - if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && + if (match(Mask, m_SExt(m_Value(BoolVec))) && BoolVec->getType()->isVectorTy() && BoolVec->getType()->getScalarSizeInBits() == 1) { auto *MaskTy = cast(Mask->getType()); From ae0d2244a23567c8d9863e63b338d34c31416a7b Mon Sep 17 00:00:00 2001 From: Nick Zavaritsky Date: Fri, 5 Jul 2024 17:32:09 +0300 Subject: [PATCH 08/67] [BPF] Fix linking issues in static map initializers (#91310) When BPF object files are linked with bpftool, every symbol must be accompanied by BTF info. 
Ensure that extern functions referenced by global variable initializers are included in BTF. The primary motivation is "static" initialization of PROG maps: ```c extern int elsewhere(struct xdp_md *); struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); __uint(max_entries, 1); __type(key, int); __type(value, int); __array(values, int (struct xdp_md *)); } prog_map SEC(".maps") = { .values = { elsewhere } }; ``` BPF backend needs debug info to produce BTF. Debug info is not normally generated for external variables and functions. Previously, it was solved differently for variables (collecting variable declarations in ExternalDeclarations vector) and functions (logic invoked during codegen in CGExpr.cpp). This patch generalises ExternalDeclarations to include both function and variable declarations. This change ensures that function references are not missed no matter the context. Previously external functions referenced in constant expressions lacked debug info. --- clang/include/clang/AST/ASTConsumer.h | 3 +- .../clang/Frontend/MultiplexConsumer.h | 2 +- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/CodeGen/BackendConsumer.h | 2 +- clang/lib/CodeGen/CGExpr.cpp | 17 +----- clang/lib/CodeGen/CodeGenAction.cpp | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 19 ++++++- clang/lib/CodeGen/CodeGenModule.h | 3 +- clang/lib/CodeGen/ModuleBuilder.cpp | 2 +- clang/lib/Frontend/MultiplexConsumer.cpp | 2 +- clang/lib/Interpreter/IncrementalParser.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 8 +++ .../test/CodeGen/bpf-debug-info-extern-func.c | 9 ++++ clang/test/CodeGen/bpf-debug-info-unref.c | 11 ++++ llvm/lib/Target/BPF/BTFDebug.cpp | 23 ++++++++ llvm/lib/Target/BPF/BTFDebug.h | 4 ++ llvm/test/CodeGen/BPF/BTF/extern-var-func2.ll | 54 +++++++++++++++++++ 17 files changed, 139 insertions(+), 26 deletions(-) create mode 100644 clang/test/CodeGen/bpf-debug-info-extern-func.c create mode 100644 clang/test/CodeGen/bpf-debug-info-unref.c create mode 100644 
llvm/test/CodeGen/BPF/BTF/extern-var-func2.ll diff --git a/clang/include/clang/AST/ASTConsumer.h b/clang/include/clang/AST/ASTConsumer.h index ebcd8059284d8d..447f2592d23595 100644 --- a/clang/include/clang/AST/ASTConsumer.h +++ b/clang/include/clang/AST/ASTConsumer.h @@ -23,6 +23,7 @@ namespace clang { class ASTDeserializationListener; // layering violation because void* is ugly class SemaConsumer; // layering violation required for safe SemaConsumer class TagDecl; + class DeclaratorDecl; class VarDecl; class FunctionDecl; class ImportDecl; @@ -105,7 +106,7 @@ class ASTConsumer { /// CompleteExternalDeclaration - Callback invoked at the end of a translation /// unit to notify the consumer that the given external declaration should be /// completed. - virtual void CompleteExternalDeclaration(VarDecl *D) {} + virtual void CompleteExternalDeclaration(DeclaratorDecl *D) {} /// Callback invoked when an MSInheritanceAttr has been attached to a /// CXXRecordDecl. diff --git a/clang/include/clang/Frontend/MultiplexConsumer.h b/clang/include/clang/Frontend/MultiplexConsumer.h index 4ed0d86d3cdfbc..e49e3392d1f317 100644 --- a/clang/include/clang/Frontend/MultiplexConsumer.h +++ b/clang/include/clang/Frontend/MultiplexConsumer.h @@ -67,7 +67,7 @@ class MultiplexConsumer : public SemaConsumer { void HandleTopLevelDeclInObjCContainer(DeclGroupRef D) override; void HandleImplicitImportDecl(ImportDecl *D) override; void CompleteTentativeDefinition(VarDecl *D) override; - void CompleteExternalDeclaration(VarDecl *D) override; + void CompleteExternalDeclaration(DeclaratorDecl *D) override; void AssignInheritanceModel(CXXRecordDecl *RD) override; void HandleVTable(CXXRecordDecl *RD) override; ASTMutationListener *GetASTMutationListener() override; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index fb3a5d25c635cd..75a80540dbcbfd 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -3098,7 +3098,7 @@ class Sema final 
: public SemaBase { TentativeDefinitionsType TentativeDefinitions; /// All the external declarations encoutered and used in the TU. - SmallVector ExternalDeclarations; + SmallVector ExternalDeclarations; /// Generally null except when we temporarily switch decl contexts, /// like in \see SemaObjC::ActOnObjCTemporaryExitContainerContext. diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h index a648bd314e5012..a023d29cbd1d73 100644 --- a/clang/lib/CodeGen/BackendConsumer.h +++ b/clang/lib/CodeGen/BackendConsumer.h @@ -107,7 +107,7 @@ class BackendConsumer : public ASTConsumer { void HandleTagDeclDefinition(TagDecl *D) override; void HandleTagDeclRequiredDefinition(const TagDecl *D) override; void CompleteTentativeDefinition(VarDecl *D) override; - void CompleteExternalDeclaration(VarDecl *D) override; + void CompleteExternalDeclaration(DeclaratorDecl *D) override; void AssignInheritanceModel(CXXRecordDecl *RD) override; void HandleVTable(CXXRecordDecl *RD) override; diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 23e5deee325813..039f60c7745918 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -3141,21 +3141,8 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) { return LV; } - if (const auto *FD = dyn_cast(ND)) { - LValue LV = EmitFunctionDeclLValue(*this, E, FD); - - // Emit debuginfo for the function declaration if the target wants to. 
- if (getContext().getTargetInfo().allowDebugInfoForExternalRef()) { - if (CGDebugInfo *DI = CGM.getModuleDebugInfo()) { - auto *Fn = - cast(LV.getPointer(*this)->stripPointerCasts()); - if (!Fn->getSubprogram()) - DI->EmitFunctionDecl(FD, FD->getLocation(), T, Fn); - } - } - - return LV; - } + if (const auto *FD = dyn_cast(ND)) + return EmitFunctionDeclLValue(*this, E, FD); // FIXME: While we're emitting a binding from an enclosing scope, all other // DeclRefExprs we see should be implicitly treated as if they also refer to diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 0b92c5318a5c23..e87226e60297c0 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -376,7 +376,7 @@ void BackendConsumer::CompleteTentativeDefinition(VarDecl *D) { Gen->CompleteTentativeDefinition(D); } -void BackendConsumer::CompleteExternalDeclaration(VarDecl *D) { +void BackendConsumer::CompleteExternalDeclaration(DeclaratorDecl *D) { Gen->CompleteExternalDeclaration(D); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 99e986d371cac5..dc9dd034dee7ba 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -5185,8 +5185,11 @@ void CodeGenModule::EmitTentativeDefinition(const VarDecl *D) { EmitGlobalVarDefinition(D); } -void CodeGenModule::EmitExternalDeclaration(const VarDecl *D) { - EmitExternalVarDeclaration(D); +void CodeGenModule::EmitExternalDeclaration(const DeclaratorDecl *D) { + if (auto const *V = dyn_cast(D)) + EmitExternalVarDeclaration(V); + if (auto const *FD = dyn_cast(D)) + EmitExternalFunctionDeclaration(FD); } CharUnits CodeGenModule::GetTargetTypeStoreSize(llvm::Type *Ty) const { @@ -5622,6 +5625,18 @@ void CodeGenModule::EmitExternalVarDeclaration(const VarDecl *D) { } } +void CodeGenModule::EmitExternalFunctionDeclaration(const FunctionDecl *FD) { + if (CGDebugInfo *DI = getModuleDebugInfo()) + if 
(getCodeGenOpts().hasReducedDebugInfo()) { + auto *Ty = getTypes().ConvertType(FD->getType()); + StringRef MangledName = getMangledName(FD); + auto *Fn = dyn_cast( + GetOrCreateLLVMFunction(MangledName, Ty, FD, /* ForVTable */ false)); + if (!Fn->getSubprogram()) + DI->EmitFunctionDecl(FD, FD->getLocation(), FD->getType(), Fn); + } +} + static bool isVarDeclStrongDefinition(const ASTContext &Context, CodeGenModule &CGM, const VarDecl *D, bool NoCommon) { diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 4796d421aaa699..0444f9f8449f86 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1338,7 +1338,7 @@ class CodeGenModule : public CodeGenTypeCache { void EmitTentativeDefinition(const VarDecl *D); - void EmitExternalDeclaration(const VarDecl *D); + void EmitExternalDeclaration(const DeclaratorDecl *D); void EmitVTable(CXXRecordDecl *Class); @@ -1690,6 +1690,7 @@ class CodeGenModule : public CodeGenTypeCache { void EmitGlobalVarDefinition(const VarDecl *D, bool IsTentative = false); void EmitExternalVarDeclaration(const VarDecl *D); + void EmitExternalFunctionDeclaration(const FunctionDecl *D); void EmitAliasDefinition(GlobalDecl GD); void emitIFuncDefinition(GlobalDecl GD); void emitCPUDispatchDefinition(GlobalDecl GD); diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp index df85295cfb2e29..d4e0ab0339a8b0 100644 --- a/clang/lib/CodeGen/ModuleBuilder.cpp +++ b/clang/lib/CodeGen/ModuleBuilder.cpp @@ -310,7 +310,7 @@ namespace { Builder->EmitTentativeDefinition(D); } - void CompleteExternalDeclaration(VarDecl *D) override { + void CompleteExternalDeclaration(DeclaratorDecl *D) override { Builder->EmitExternalDeclaration(D); } diff --git a/clang/lib/Frontend/MultiplexConsumer.cpp b/clang/lib/Frontend/MultiplexConsumer.cpp index 8fdc7f55a5003f..651c55aeed5408 100644 --- a/clang/lib/Frontend/MultiplexConsumer.cpp +++ b/clang/lib/Frontend/MultiplexConsumer.cpp 
@@ -357,7 +357,7 @@ void MultiplexConsumer::CompleteTentativeDefinition(VarDecl *D) { Consumer->CompleteTentativeDefinition(D); } -void MultiplexConsumer::CompleteExternalDeclaration(VarDecl *D) { +void MultiplexConsumer::CompleteExternalDeclaration(DeclaratorDecl *D) { for (auto &Consumer : Consumers) Consumer->CompleteExternalDeclaration(D); } diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp index a8d0294fb6151b..b7c809c45098ca 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -79,7 +79,7 @@ class IncrementalASTConsumer final : public ASTConsumer { void CompleteTentativeDefinition(VarDecl *D) override final { Consumer->CompleteTentativeDefinition(D); } - void CompleteExternalDeclaration(VarDecl *D) override final { + void CompleteExternalDeclaration(DeclaratorDecl *D) override final { Consumer->CompleteExternalDeclaration(D); } void AssignInheritanceModel(CXXRecordDecl *RD) override final { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index b3bfdacb017900..aa44608035538f 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -10818,6 +10818,14 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, break; } + // Similar to no_builtin logic above, at this point of the code + // FunctionDecl::isThisDeclarationADefinition() always returns `false` + // because Sema::ActOnStartOfFunctionDef has not been called yet. 
+ if (Context.getTargetInfo().allowDebugInfoForExternalRef() && + !NewFD->isInvalidDecl() && + D.getFunctionDefinitionKind() == FunctionDefinitionKind::Declaration) + ExternalDeclarations.push_back(NewFD); + return NewFD; } diff --git a/clang/test/CodeGen/bpf-debug-info-extern-func.c b/clang/test/CodeGen/bpf-debug-info-extern-func.c new file mode 100644 index 00000000000000..e87c8bea3a878c --- /dev/null +++ b/clang/test/CodeGen/bpf-debug-info-extern-func.c @@ -0,0 +1,9 @@ +// RUN: %clang -g -target bpf -S -emit-llvm %s -o - | FileCheck %s +// +// When linking BPF object files via bpftool, BTF info is required for +// every symbol. BTF is generated from debug info. Ensure that debug info +// is emitted for extern functions referenced via variable initializers. +// +// CHECK: !DISubprogram(name: "fn" +extern void fn(void); +void (*pfn) (void) = &fn; diff --git a/clang/test/CodeGen/bpf-debug-info-unref.c b/clang/test/CodeGen/bpf-debug-info-unref.c new file mode 100644 index 00000000000000..91f761ec3b77fb --- /dev/null +++ b/clang/test/CodeGen/bpf-debug-info-unref.c @@ -0,0 +1,11 @@ +// RUN: %clang -g -target bpf -S -emit-llvm %s -o - | FileCheck %s +// +// No debug info is produced for unreferenced functions. +// CHECK-NOT: !DISubprogram +void unref(void); +void unref2(typeof(unref)); + +// No debug info for unused extern variables as well. +// CHECK-NOT: !DiGlobalVariable +extern int unused; +extern int unused2[sizeof(unused)]; diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 34581e3b6286a7..4d847abea731dc 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -1495,6 +1495,29 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) { DataSecEntries[std::string(SecName)]->addDataSecEntry(VarId, Asm->getSymbol(&Global), Size); + + if (Global.hasInitializer()) + processGlobalInitializer(Global.getInitializer()); + } +} + +/// Process global variable initializer in pursuit for function +/// pointers. 
Add discovered (extern) functions to BTF. Some (extern) +/// functions might have been missed otherwise. Every symbol needs BTF +/// info when linking with bpftool. Primary use case: "static" +/// initialization of BPF maps. +/// +/// struct { +/// __uint(type, BPF_MAP_TYPE_PROG_ARRAY); +/// ... +/// } prog_map SEC(".maps") = { .values = { extern_func } }; +/// +void BTFDebug::processGlobalInitializer(const Constant *C) { + if (auto *Fn = dyn_cast(C)) + processFuncPrototypes(Fn); + if (auto *CA = dyn_cast(C)) { + for (unsigned I = 0, N = CA->getNumOperands(); I < N; ++I) + processGlobalInitializer(CA->getOperand(I)); } } diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h index 3ef4a85299b653..b24a79de74efa3 100644 --- a/llvm/lib/Target/BPF/BTFDebug.h +++ b/llvm/lib/Target/BPF/BTFDebug.h @@ -352,6 +352,10 @@ class BTFDebug : public DebugHandlerBase { /// Generate types and variables for globals. void processGlobals(bool ProcessingMapDef); + /// Process global variable initializer in pursuit for function + /// pointers. + void processGlobalInitializer(const Constant *C); + /// Generate types for function prototypes. 
void processFuncPrototypes(const Function *); diff --git a/llvm/test/CodeGen/BPF/BTF/extern-var-func2.ll b/llvm/test/CodeGen/BPF/BTF/extern-var-func2.ll new file mode 100644 index 00000000000000..700486d9f3515c --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/extern-var-func2.ll @@ -0,0 +1,54 @@ +; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; +; Source code: +; extern int elsewhere(void); +; struct { +; void *values[]; +; } prog_map = { .values = { elsewhere } }; +; Compilation flag: +; clang -target bpf -O2 -g -S -emit-llvm test.c +; ModuleID = 'b.c' + +@prog_map = dso_local local_unnamed_addr global { [1 x ptr] } { [1 x ptr] [ptr @elsewhere] }, align 8, !dbg !0 + +declare !dbg !17 dso_local i32 @elsewhere() #0 + +; CHECK: .long 0 # BTF_KIND_FUNC_PROTO(id = 6) +; CHECK-NEXT: .long 218103808 # 0xd000000 +; CHECK-NEXT: .long 7 +; CHECK-NEXT: .long 37 # BTF_KIND_INT(id = 7) +; CHECK-NEXT: .long 16777216 # 0x1000000 +; CHECK-NEXT: .long 4 +; CHECK-NEXT: .long 16777248 # 0x1000020 +; CHECK-NEXT: .long 41 # BTF_KIND_FUNC(id = 8) +; CHECK-NEXT: .long 201326594 # 0xc000002 +; CHECK-NEXT: .long 6 + +attributes #0 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!12, !13, !14, !15} +!llvm.ident = !{!16} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "prog_map", scope: !2, file: !3, line: 4, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 19.0.0git (git@github.com:llvm/llvm-project.git 0390a6803608e3a5314315b73740c2d3f5a5723f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "b.c", directory: "/home/nickz/llvm-project.git", 
checksumkind: CSK_MD5, checksum: "41cc17375f1261a0e072590833492553") +!4 = !{!0} +!5 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !3, line: 2, elements: !6) +!6 = !{!7} +!7 = !DIDerivedType(tag: DW_TAG_member, name: "values", scope: !5, file: !3, line: 3, baseType: !8) +!8 = !DICompositeType(tag: DW_TAG_array_type, baseType: !9, elements: !10) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!10 = !{!11} +!11 = !DISubrange(count: -1) +!12 = !{i32 7, !"Dwarf Version", i32 5} +!13 = !{i32 2, !"Debug Info Version", i32 3} +!14 = !{i32 1, !"wchar_size", i32 4} +!15 = !{i32 7, !"frame-pointer", i32 2} +!16 = !{!"clang version 19.0.0git (git@github.com:llvm/llvm-project.git 0390a6803608e3a5314315b73740c2d3f5a5723f)"} +!17 = !DISubprogram(name: "elsewhere", scope: !3, file: !3, line: 1, type: !18, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!18 = !DISubroutineType(types: !19) +!19 = !{!20} +!20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) From b0b3c1accd9b75fca149432128b1651261509b64 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 5 Jul 2024 16:36:48 +0200 Subject: [PATCH 09/67] [gn] port 5aacf93a8968 --- llvm/utils/gn/secondary/libcxx/src/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn index 765e37721370cd..689bc2e137d244 100644 --- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn @@ -192,6 +192,7 @@ cxx_sources = [ ] if (target_os == "win") { cxx_sources += [ + "support/win32/compiler_rt_shims.cpp", "support/win32/locale_win32.cpp", "support/win32/support.cpp", "support/win32/thread_win32.cpp", From d177a94fbdc12c82e06dc1c2dc7500c3ce399291 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 5 Jul 2024 16:47:15 +0200 Subject: [PATCH 10/67] [IR] Add Constant::toConstantRange() (NFC) The logic in llvm::getVectorConstantRange() can be a bit 
inconvenient to use in some cases because of the need to handle the scalar case separately. Generalize it to handle all constants, and move it to live directly on Constant. --- llvm/include/llvm/Analysis/ValueTracking.h | 3 -- llvm/include/llvm/IR/Constant.h | 5 +++ llvm/lib/Analysis/LazyValueInfo.cpp | 4 +-- llvm/lib/Analysis/ValueTracking.cpp | 42 ++-------------------- llvm/lib/IR/Constants.cpp | 38 ++++++++++++++++++++ 5 files changed, 47 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index a67ad501982d22..b7b78cb9edab32 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -904,9 +904,6 @@ bool isOverflowIntrinsicNoWrap(const WithOverflowInst *WO, /// based on the vscale_range function attribute. ConstantRange getVScaleRange(const Function *F, unsigned BitWidth); -/// Determine the possible constant range of a vector constant. -ConstantRange getVectorConstantRange(const Constant *C); - /// Determine the possible constant range of an integer or vector of integer /// value. This is intended as a cheap, non-recursive check. ConstantRange computeConstantRange(const Value *V, bool ForSigned, diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h index d3171acf7b9ac2..a82e37b7e2df23 100644 --- a/llvm/include/llvm/IR/Constant.h +++ b/llvm/include/llvm/IR/Constant.h @@ -19,6 +19,7 @@ namespace llvm { +class ConstantRange; class APInt; /// This is an important base class in LLVM. It provides the common facilities @@ -154,6 +155,10 @@ class Constant : public User { /// vector of constant integers, all equal, and the common value is returned. const APInt &getUniqueInteger() const; + /// Convert constant to an approximate constant range. For vectors, the + /// range is the union over the element ranges. Poison elements are ignored. 
+ ConstantRange toConstantRange() const; + /// Called if some element of this constant is no longer valid. /// At this point only other constants may be on the use_list for this /// constant. Any constants on our Use list must also be destroy'd. The diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 674c47ebe786ad..27a25377aa86bb 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -844,8 +844,8 @@ static ConstantRange toConstantRange(const ValueLatticeElement &Val, unsigned BW = Ty->getScalarSizeInBits(); if (Val.isUnknown()) return ConstantRange::getEmpty(BW); - if (Val.isConstant() && Ty->isVectorTy()) - return getVectorConstantRange(Val.getConstant()); + if (Val.isConstant()) + return Val.getConstant()->toConstantRange(); return ConstantRange::getFull(BW); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 85abf00774a026..7be8a18dd72712 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9513,39 +9513,6 @@ static void setLimitForFPToI(const Instruction *I, APInt &Lower, APInt &Upper) { } } -ConstantRange llvm::getVectorConstantRange(const Constant *C) { - assert(C->getType()->isVectorTy() && "Expected vector constant"); - if (auto *CI = dyn_cast_or_null( - C->getSplatValue(/*AllowPoison=*/true))) - return ConstantRange(CI->getValue()); - - unsigned BitWidth = C->getType()->getScalarSizeInBits(); - if (auto *CDV = dyn_cast(C)) { - ConstantRange CR = ConstantRange::getEmpty(BitWidth); - for (unsigned I = 0, E = CDV->getNumElements(); I < E; ++I) - CR = CR.unionWith(CDV->getElementAsAPInt(I)); - return CR; - } - - if (auto *CV = dyn_cast(C)) { - ConstantRange CR = ConstantRange::getEmpty(BitWidth); - for (unsigned I = 0, E = CV->getNumOperands(); I < E; ++I) { - Constant *Elem = C->getAggregateElement(I); - if (!Elem) - return ConstantRange::getFull(BitWidth); - if (isa(Elem)) - continue; - auto 
*CI = dyn_cast(Elem); - if (!CI) - return ConstantRange::getFull(BitWidth); - CR = CR.unionWith(CI->getValue()); - } - return CR; - } - - return ConstantRange::getFull(BitWidth); -} - ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo, AssumptionCache *AC, const Instruction *CtxI, @@ -9556,13 +9523,8 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned, if (Depth == MaxAnalysisRecursionDepth) return ConstantRange::getFull(V->getType()->getScalarSizeInBits()); - if (auto *C = dyn_cast(V)) { - if (auto *CI = dyn_cast(C)) - return ConstantRange(CI->getValue()); - if (C->getType()->isVectorTy()) - return getVectorConstantRange(C); - return ConstantRange::getFull(C->getType()->getScalarSizeInBits()); - } + if (auto *C = dyn_cast(V)) + return C->toConstantRange(); unsigned BitWidth = V->getType()->getScalarSizeInBits(); InstrInfoQuery IIQ(UseInstrInfo); diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index bc91f904d7e87a..70803c153d8cbd 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1756,6 +1756,44 @@ const APInt &Constant::getUniqueInteger() const { return cast(C)->getValue(); } +ConstantRange Constant::toConstantRange() const { + if (auto *CI = dyn_cast(this)) + return ConstantRange(CI->getValue()); + + unsigned BitWidth = getType()->getScalarSizeInBits(); + if (!getType()->isVectorTy()) + return ConstantRange::getFull(BitWidth); + + if (auto *CI = dyn_cast_or_null( + getSplatValue(/*AllowPoison=*/true))) + return ConstantRange(CI->getValue()); + + if (auto *CDV = dyn_cast(this)) { + ConstantRange CR = ConstantRange::getEmpty(BitWidth); + for (unsigned I = 0, E = CDV->getNumElements(); I < E; ++I) + CR = CR.unionWith(CDV->getElementAsAPInt(I)); + return CR; + } + + if (auto *CV = dyn_cast(this)) { + ConstantRange CR = ConstantRange::getEmpty(BitWidth); + for (unsigned I = 0, E = CV->getNumOperands(); I < E; ++I) { + Constant *Elem = CV->getOperand(I); + if 
(!Elem) + return ConstantRange::getFull(BitWidth); + if (isa(Elem)) + continue; + auto *CI = dyn_cast(Elem); + if (!CI) + return ConstantRange::getFull(BitWidth); + CR = CR.unionWith(CI->getValue()); + } + return CR; + } + + return ConstantRange::getFull(BitWidth); +} + //---- ConstantPointerNull::get() implementation. // From 4339d2edf6cbf4f7ddee3100980495ebcb8f64a5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 5 Jul 2024 16:57:08 +0200 Subject: [PATCH 11/67] [CVP] Add missing CHECK lines in test (NFC) --- .../CorrelatedValuePropagation/vectors.ll | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll index caaed628ed43ec..88e995ac747ae0 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll @@ -224,6 +224,17 @@ define <2 x i16> @and_with_poison(<2 x i8> %a) { define <4 x i64> @issue_97674_getConstantOnEdge(i1 %cond) { +; CHECK-LABEL: define <4 x i64> @issue_97674_getConstantOnEdge( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[FOLDS:%.*]] = add nuw nsw <4 x i64> zeroinitializer, +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[R:%.*]] = phi <4 x i64> [ , %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] +; CHECK-NEXT: ret <4 x i64> [[R]] +; entry: br i1 %cond, label %if.then, label %if.end @@ -235,8 +246,13 @@ if.end: %r = phi <4 x i64> [ %folds, %if.then ], [ zeroinitializer, %entry ] ret <4 x i64> %r } - + define <4 x i64> @issue_97674_getConstant() { +; CHECK-LABEL: define <4 x i64> @issue_97674_getConstant() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FOLDS:%.*]] = add nuw nsw <4 x i64> zeroinitializer, zeroinitializer +; CHECK-NEXT: ret <4 x i64> zeroinitializer +; 
entry: %folds = add <4 x i64> zeroinitializer, zeroinitializer ret <4 x i64> %folds From f92bfca9fc217cad9026598ef6755e711c0be070 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Fri, 5 Jul 2024 16:01:00 +0100 Subject: [PATCH 12/67] [AArch64] All bits of an exact right shift are demanded (#97448) When building a vector which contains zero elements, the AArch64 ISel replaces those elements with `undef`, if they are right shifted out. However, these elements need to stay zero if the right shift is exact, or otherwise we will be introducing undefined behavior. Should allow https://github.com/llvm/llvm-project/pull/92528 to be recommitted. --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +++ .../AArch64/shr-exact-demanded-bits.ll | 35 +++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e0c3cc5eddb827..341cf51173ccc2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22142,6 +22142,10 @@ static SDValue performVectorShiftCombine(SDNode *N, if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm) return Op.getOperand(0); + // If the shift is exact, the shifted out bits matter. 
+ if (N->getFlags().hasExact()) + return SDValue(); + APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm); APInt DemandedMask = ~ShiftedOutBits; diff --git a/llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll b/llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll new file mode 100644 index 00000000000000..9698626aea655d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shr-exact-demanded-bits.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s +target triple = "aarch64-linux" + +define <2 x i32> @f(i8 %0, i8 %1) { +; CHECK-LABEL: f: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.b[3], w0 +; CHECK-NEXT: mov v0.b[7], w1 +; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: ret + %3 = insertelement <2 x i8> poison, i8 %0, i64 0 + %4 = insertelement <2 x i8> %3, i8 %1, i64 1 + %5 = shufflevector <2 x i8> %4, <2 x i8> , <8 x i32> + %6 = bitcast <8 x i8> %5 to <2 x i32> + %7 = ashr exact <2 x i32> %6, + ret <2 x i32> %7 +} + +define <2 x i32> @g(i8 %0, i8 %1) { +; CHECK-LABEL: g: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.b[3], w0 +; CHECK-NEXT: mov v0.b[7], w1 +; CHECK-NEXT: ushr v0.2s, v0.2s, #24 +; CHECK-NEXT: ret + %3 = insertelement <2 x i8> poison, i8 %0, i64 0 + %4 = insertelement <2 x i8> %3, i8 %1, i64 1 + %5 = shufflevector <2 x i8> %4, <2 x i8> , <8 x i32> + %6 = bitcast <8 x i8> %5 to <2 x i32> + %7 = lshr exact <2 x i32> %6, + ret <2 x i32> %7 +} From 130f0f526dc28ebbe23e5956857e85f7c9b754f5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 5 Jul 2024 16:56:46 +0200 Subject: [PATCH 13/67] [LVI][CVP] Add support for vector comparisons --- llvm/lib/Analysis/LazyValueInfo.cpp | 6 +--- .../CorrelatedValuePropagation/vectors.ll | 32 +++++++++++++++---- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp 
b/llvm/lib/Analysis/LazyValueInfo.cpp index 27a25377aa86bb..877898f6daeef6 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1784,12 +1784,8 @@ static Constant *getPredicateResult(CmpInst::Predicate Pred, Constant *C, Type *ResTy = CmpInst::makeCmpResultType(C->getType()); if (Val.isConstantRange()) { - ConstantInt *CI = dyn_cast(C); - if (!CI) - return nullptr; - const ConstantRange &CR = Val.getConstantRange(); - ConstantRange RHS(CI->getValue()); + ConstantRange RHS = C->toConstantRange(); if (CR.icmp(Pred, RHS)) return ConstantInt::getTrue(ResTy); if (CR.icmp(CmpInst::getInversePredicate(Pred), RHS)) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll index 88e995ac747ae0..6f13263fe92be8 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll @@ -1,32 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=correlated-propagation < %s | FileCheck %s -; TODO: Add support for this. define <2 x i1> @cmp1(<2 x i8> %a) { ; CHECK-LABEL: define <2 x i1> @cmp1( ; CHECK-SAME: <2 x i8> [[A:%.*]]) { ; CHECK-NEXT: [[ADD:%.*]] = add nuw <2 x i8> [[A]], -; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i8> [[ADD]], zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[CMP]] +; CHECK-NEXT: ret <2 x i1> ; %add = add nuw <2 x i8> %a, splat (i8 1) %cmp = icmp ne <2 x i8> %add, zeroinitializer ret <2 x i1> %cmp } -; TODO: Add support for this. 
define <2 x i1> @cmp2(<2 x i8> %a) { ; CHECK-LABEL: define <2 x i1> @cmp2( ; CHECK-SAME: <2 x i8> [[A:%.*]]) { ; CHECK-NEXT: [[ADD:%.*]] = add nuw <2 x i8> [[A]], -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i8> [[ADD]], -; CHECK-NEXT: ret <2 x i1> [[CMP]] +; CHECK-NEXT: ret <2 x i1> ; %add = add nuw <2 x i8> %a, splat (i8 5) %cmp = icmp ugt <2 x i8> %add, splat (i8 2) ret <2 x i1> %cmp } +define <2 x i1> @cmp_nonsplat(<2 x i8> %a) { +; CHECK-LABEL: define <2 x i1> @cmp_nonsplat( +; CHECK-SAME: <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nuw <2 x i8> [[A]], +; CHECK-NEXT: ret <2 x i1> +; + %add = add nuw <2 x i8> %a, + %cmp = icmp ugt <2 x i8> %add, + ret <2 x i1> %cmp +} + +; Handling this would require keeping track of ranges on a per-element basis. +define <2 x i1> @cmp_nonsplat_fail(<2 x i8> %a) { +; CHECK-LABEL: define <2 x i1> @cmp_nonsplat_fail( +; CHECK-SAME: <2 x i8> [[A:%.*]]) { +; CHECK-NEXT: [[ADD:%.*]] = add nuw <2 x i8> [[A]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i8> [[ADD]], +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %add = add nuw <2 x i8> %a, + %cmp = icmp ugt <2 x i8> %add, + ret <2 x i1> %cmp +} + define <2 x i1> @cmp_signedness(<2 x i8> %a) { ; CHECK-LABEL: define <2 x i1> @cmp_signedness( ; CHECK-SAME: <2 x i8> [[A:%.*]]) { From 3bb25636414ee5b5eaf99c0bdcc191052c9d7ffb Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 5 Jul 2024 17:32:51 +0200 Subject: [PATCH 14/67] [mlir][vector] Fix crash in `vector.insert` canonicalization (#97801) The `InsertOpConstantFolder` assumed that whenever the destination can be folded to a constant attribute, that attribute must be a `DenseElementsAttr`. That is is not necessarily the case. 
--- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 5 +++-- mlir/test/Dialect/Vector/canonicalize.mlir | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 53a6648de014c0..55bace2e35f444 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2851,6 +2851,9 @@ class InsertOpConstantFolder final : public OpRewritePattern { Attribute vectorDestCst; if (!matchPattern(destVector, m_Constant(&vectorDestCst))) return failure(); + auto denseDest = llvm::dyn_cast(vectorDestCst); + if (!denseDest) + return failure(); VectorType destTy = destVector.getType(); if (destTy.isScalable()) @@ -2861,8 +2864,6 @@ class InsertOpConstantFolder final : public OpRewritePattern { !destVector.hasOneUse()) return failure(); - auto denseDest = llvm::cast(vectorDestCst); - Value sourceValue = op.getSource(); Attribute sourceCst; if (!matchPattern(sourceValue, m_Constant(&sourceCst))) diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 1a674d715ca61b..e71a6eb02ea46c 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2729,3 +2729,14 @@ func.func @fold_vector_step_to_constant() -> vector<4xindex> { %0 = vector.step : vector<4xindex> return %0 : vector<4xindex> } + +// ----- + +// CHECK-LABEL: func @vector_insert_const_regression( +// CHECK: llvm.mlir.undef +// CHECK: vector.insert +func.func @vector_insert_const_regression(%arg0: i8) -> vector<4xi8> { + %0 = llvm.mlir.undef : vector<4xi8> + %1 = vector.insert %arg0, %0 [0] : i8 into vector<4xi8> + return %1 : vector<4xi8> +} From 9315645834ea81cf9550364a4950f289e9706a26 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 5 Jul 2024 10:43:51 -0500 Subject: [PATCH 15/67] [mlir][python] auto attribute casting (#97786) --- .../mlir/Bindings/Python/PybindAdaptors.h | 24 
++++++++++++++++--- mlir/test/python/dialects/python_test.py | 14 ++++++++++- mlir/test/python/lib/PythonTestCAPI.cpp | 4 ++++ mlir/test/python/lib/PythonTestCAPI.h | 2 ++ mlir/test/python/lib/PythonTestDialect.h | 6 ++--- mlir/test/python/lib/PythonTestModule.cpp | 7 +++--- mlir/test/python/python_test_ops.td | 4 ++++ 7 files changed, 51 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h index ebf50109f72f23..df4b9bf713592d 100644 --- a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h @@ -406,21 +406,25 @@ class pure_subclass { class mlir_attribute_subclass : public pure_subclass { public: using IsAFunctionTy = bool (*)(MlirAttribute); + using GetTypeIDFunctionTy = MlirTypeID (*)(); /// Subclasses by looking up the super-class dynamically. mlir_attribute_subclass(py::handle scope, const char *attrClassName, - IsAFunctionTy isaFunction) + IsAFunctionTy isaFunction, + GetTypeIDFunctionTy getTypeIDFunction = nullptr) : mlir_attribute_subclass( scope, attrClassName, isaFunction, py::module::import(MAKE_MLIR_PYTHON_QUALNAME("ir")) - .attr("Attribute")) {} + .attr("Attribute"), + getTypeIDFunction) {} /// Subclasses with a provided mlir.ir.Attribute super-class. This must /// be used if the subclass is being defined in the same extension module /// as the mlir.ir class (otherwise, it will trigger a recursive /// initialization). mlir_attribute_subclass(py::handle scope, const char *typeClassName, - IsAFunctionTy isaFunction, const py::object &superCls) + IsAFunctionTy isaFunction, const py::object &superCls, + GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { // Casting constructor. 
Note that it hard, if not impossible, to properly // call chain to parent `__init__` in pybind11 due to its special handling @@ -454,6 +458,20 @@ class mlir_attribute_subclass : public pure_subclass { "isinstance", [isaFunction](MlirAttribute other) { return isaFunction(other); }, py::arg("other_attribute")); + def("__repr__", [superCls, captureTypeName](py::object self) { + return py::repr(superCls(self)) + .attr("replace")(superCls.attr("__name__"), captureTypeName); + }); + if (getTypeIDFunction) { + def_staticmethod("get_static_typeid", + [getTypeIDFunction]() { return getTypeIDFunction(); }); + py::module::import(MAKE_MLIR_PYTHON_QUALNAME("ir")) + .attr(MLIR_PYTHON_CAPI_TYPE_CASTER_REGISTER_ATTR)( + getTypeIDFunction())(pybind11::cpp_function( + [thisClass = thisClass](const py::object &mlirAttribute) { + return thisClass(mlirAttribute); + })); + } } }; diff --git a/mlir/test/python/dialects/python_test.py b/mlir/test/python/dialects/python_test.py index 70927b22d4749c..a76f3f2b5e4583 100644 --- a/mlir/test/python/dialects/python_test.py +++ b/mlir/test/python/dialects/python_test.py @@ -307,11 +307,23 @@ def testOptionalOperandOp(): # CHECK-LABEL: TEST: testCustomAttribute @run def testCustomAttribute(): - with Context() as ctx: + with Context() as ctx, Location.unknown(): a = test.TestAttr.get() # CHECK: #python_test.test_attr print(a) + # CHECK: python_test.custom_attributed_op { + # CHECK: #python_test.test_attr + # CHECK: } + op2 = test.CustomAttributedOp(a) + print(f"{op2}") + + # CHECK: #python_test.test_attr + print(f"{op2.test_attr}") + + # CHECK: TestAttr(#python_test.test_attr) + print(repr(op2.test_attr)) + # The following cast must not assert. 
b = test.TestAttr(a) diff --git a/mlir/test/python/lib/PythonTestCAPI.cpp b/mlir/test/python/lib/PythonTestCAPI.cpp index 71778a97d83a41..cb7d7677714fe6 100644 --- a/mlir/test/python/lib/PythonTestCAPI.cpp +++ b/mlir/test/python/lib/PythonTestCAPI.cpp @@ -23,6 +23,10 @@ MlirAttribute mlirPythonTestTestAttributeGet(MlirContext context) { return wrap(python_test::TestAttrAttr::get(unwrap(context))); } +MlirTypeID mlirPythonTestTestAttributeGetTypeID(void) { + return wrap(python_test::TestAttrAttr::getTypeID()); +} + bool mlirTypeIsAPythonTestTestType(MlirType type) { return llvm::isa(unwrap(type)); } diff --git a/mlir/test/python/lib/PythonTestCAPI.h b/mlir/test/python/lib/PythonTestCAPI.h index 5f1ed3a5b2ad66..43f8fdcbfae125 100644 --- a/mlir/test/python/lib/PythonTestCAPI.h +++ b/mlir/test/python/lib/PythonTestCAPI.h @@ -23,6 +23,8 @@ mlirAttributeIsAPythonTestTestAttribute(MlirAttribute attr); MLIR_CAPI_EXPORTED MlirAttribute mlirPythonTestTestAttributeGet(MlirContext context); +MLIR_CAPI_EXPORTED MlirTypeID mlirPythonTestTestAttributeGetTypeID(void); + MLIR_CAPI_EXPORTED bool mlirTypeIsAPythonTestTestType(MlirType type); MLIR_CAPI_EXPORTED MlirType mlirPythonTestTestTypeGet(MlirContext context); diff --git a/mlir/test/python/lib/PythonTestDialect.h b/mlir/test/python/lib/PythonTestDialect.h index 044381fcd4728d..889365e1136b4e 100644 --- a/mlir/test/python/lib/PythonTestDialect.h +++ b/mlir/test/python/lib/PythonTestDialect.h @@ -16,13 +16,13 @@ #include "PythonTestDialect.h.inc" -#define GET_OP_CLASSES -#include "PythonTestOps.h.inc" - #define GET_ATTRDEF_CLASSES #include "PythonTestAttributes.h.inc" #define GET_TYPEDEF_CLASSES #include "PythonTestTypes.h.inc" +#define GET_OP_CLASSES +#include "PythonTestOps.h.inc" + #endif // MLIR_TEST_PYTHON_LIB_PYTHONTESTDIALECT_H diff --git a/mlir/test/python/lib/PythonTestModule.cpp b/mlir/test/python/lib/PythonTestModule.cpp index f81b851f8759bf..a4f538dcb55944 100644 --- a/mlir/test/python/lib/PythonTestModule.cpp +++ 
b/mlir/test/python/lib/PythonTestModule.cpp @@ -44,10 +44,11 @@ PYBIND11_MODULE(_mlirPythonTest, m) { py::arg("registry")); mlir_attribute_subclass(m, "TestAttr", - mlirAttributeIsAPythonTestTestAttribute) + mlirAttributeIsAPythonTestTestAttribute, + mlirPythonTestTestAttributeGetTypeID) .def_classmethod( "get", - [](py::object cls, MlirContext ctx) { + [](const py::object &cls, MlirContext ctx) { return cls(mlirPythonTestTestAttributeGet(ctx)); }, py::arg("cls"), py::arg("context") = py::none()); @@ -56,7 +57,7 @@ PYBIND11_MODULE(_mlirPythonTest, m) { mlirPythonTestTestTypeGetTypeID) .def_classmethod( "get", - [](py::object cls, MlirContext ctx) { + [](const py::object &cls, MlirContext ctx) { return cls(mlirPythonTestTestTypeGet(ctx)); }, py::arg("cls"), py::arg("context") = py::none()); diff --git a/mlir/test/python/python_test_ops.td b/mlir/test/python/python_test_ops.td index c0bc18448610a0..6211fb9987c76a 100644 --- a/mlir/test/python/python_test_ops.td +++ b/mlir/test/python/python_test_ops.td @@ -58,6 +58,10 @@ def AttributedOp : TestOp<"attributed_op"> { UnitAttr:$unit); } +def CustomAttributedOp : TestOp<"custom_attributed_op"> { + let arguments = (ins TestAttr:$test_attr); +} + def AttributesOp : TestOp<"attributes_op"> { let arguments = (ins AffineMapArrayAttr:$x_affinemaparr, From 344930316f4c901673461dcf44ad57ae6ade1015 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 5 Jul 2024 16:44:31 +0100 Subject: [PATCH 16/67] LDist/test: clean up and modernize (NFC) (#97822) Clean up unused triple/datalayout lines, strengthen RUN lines to include -verify-loop-info/-verify-dom-info, and regenerate tests with UpdateTestChecks where appropriate. 
--- llvm/test/Transforms/LoopDistribute/basic.ll | 15 +-- .../LoopDistribute/bounds-expansion-bug.ll | 8 +- .../crash-in-memcheck-generation.ll | 15 +-- .../diagnostics-with-hotness.ll | 3 - .../Transforms/LoopDistribute/diagnostics.ll | 3 - .../LoopDistribute/disable_nonforced.ll | 9 +- .../disable_nonforced_enable.ll | 9 +- .../Transforms/LoopDistribute/early-exit.ll | 4 - .../Transforms/LoopDistribute/followup.ll | 127 +++++++++++++++--- .../Transforms/LoopDistribute/metadata.ll | 19 +-- .../LoopDistribute/no-if-convert.ll | 81 +++++++---- .../Transforms/LoopDistribute/outside-use.ll | 94 ++++++++++--- .../LoopDistribute/pointer-phi-in-loop.ll | 66 ++++++++- .../test/Transforms/LoopDistribute/pr28443.ll | 33 +++-- .../LoopDistribute/program-order.ll | 43 +++--- .../scev-inserted-runtime-check.ll | 4 +- .../LoopDistribute/symbolic-stride.ll | 11 +- .../uncomputable-backedge-taken-count.ll | 11 +- .../unknown-bounds-for-memchecks.ll | 33 ++++- 19 files changed, 395 insertions(+), 193 deletions(-) diff --git a/llvm/test/Transforms/LoopDistribute/basic.ll b/llvm/test/Transforms/LoopDistribute/basic.ll index 1e4778dfa094cc..04e452d0bb32b2 100644 --- a/llvm/test/Transforms/LoopDistribute/basic.ll +++ b/llvm/test/Transforms/LoopDistribute/basic.ll @@ -15,15 +15,8 @@ ; C[i] = D[i] * E[i]; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - ; CHECK-LABEL: @f( -define void @f(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { +define void @f(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { entry: br label %for.body @@ -88,11 +81,7 @@ declare i32 @llvm.convergent(i32) #0 ; It is OK to distribute with a convergent operation, since in each ; new loop the convergent operation has the ssame control dependency. 
; CHECK-LABEL: @f_with_convergent( -define void @f_with_convergent(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { +define void @f_with_convergent(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopDistribute/bounds-expansion-bug.ll b/llvm/test/Transforms/LoopDistribute/bounds-expansion-bug.ll index be3b48dcfacf52..b00c1bf98ab122 100644 --- a/llvm/test/Transforms/LoopDistribute/bounds-expansion-bug.ll +++ b/llvm/test/Transforms/LoopDistribute/bounds-expansion-bug.ll @@ -14,13 +14,7 @@ ; can get earlier expanded values invalidated when casts are used. This test ; ensure that we are not using the invalidated values. -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - -define void @f(ptr %a1, ptr %a2, - ptr %b, - ptr %c1, ptr %c2, - ptr %d, - ptr %e) { +define void @f(ptr %a1, ptr %a2, ptr %b, ptr %c1, ptr %c2, ptr %d, ptr %e) { entry: %cond = icmp eq ptr %e, null diff --git a/llvm/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll b/llvm/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll index 40193f0cb6bdf7..f4c9fd3514363c 100644 --- a/llvm/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll +++ b/llvm/test/Transforms/LoopDistribute/crash-in-memcheck-generation.ll @@ -11,19 +11,12 @@ ; C[i] = D[i] * E[i]; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -define void @f(ptr %a, - ptr %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { -entry: - br label %for.body - +define void @f(ptr %a, ptr %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { +; CHECK-LABEL: @f( ; CHECK-NOT: memcheck: ; CHECK: mul <4 x i32> +entry: + br label %for.body for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll 
b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll index 6f36f4d263f430..b5ef3578eadd90 100644 --- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll +++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll @@ -21,9 +21,6 @@ ; 5 } ; 6 } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - ; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info (hotness: 300) ; HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization (hotness: 300) ; NO_HOTNESS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info{{$}} diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics.ll b/llvm/test/Transforms/LoopDistribute/diagnostics.ll index e824eb42b9e38b..e6a0d83bd63d23 100644 --- a/llvm/test/Transforms/LoopDistribute/diagnostics.ll +++ b/llvm/test/Transforms/LoopDistribute/diagnostics.ll @@ -32,9 +32,6 @@ ; 18 } ; 19 } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.11.0" - ; MISSED_REMARKS: remark: /tmp/t.c:3:3: loop not distributed: use -Rpass-analysis=loop-distribute for more info ; ALWAYS: remark: /tmp/t.c:3:3: loop not distributed: memory operations are safe for vectorization ; ALWAYS: warning: /tmp/t.c:3:3: loop not distributed: failed explicitly specified loop distribution diff --git a/llvm/test/Transforms/LoopDistribute/disable_nonforced.ll b/llvm/test/Transforms/LoopDistribute/disable_nonforced.ll index f667b86a8a246e..1b3985a43c5be0 100644 --- a/llvm/test/Transforms/LoopDistribute/disable_nonforced.ll +++ b/llvm/test/Transforms/LoopDistribute/disable_nonforced.ll @@ -1,16 +1,11 @@ -; RUN: opt -passes=loop-distribute -enable-loop-distribute=1 -S < %s | FileCheck %s +; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S < %s | FileCheck %s ; ; Check that 
the disable_nonforced is honored by loop distribution. ; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +define void @disable_nonforced(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { ; CHECK-LABEL: @disable_nonforced( ; CHECK-NOT: for.body.ldist1: -define void @disable_nonforced(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopDistribute/disable_nonforced_enable.ll b/llvm/test/Transforms/LoopDistribute/disable_nonforced_enable.ll index 794d7b1ee2a3a6..45a2d31256a276 100644 --- a/llvm/test/Transforms/LoopDistribute/disable_nonforced_enable.ll +++ b/llvm/test/Transforms/LoopDistribute/disable_nonforced_enable.ll @@ -1,17 +1,12 @@ -; RUN: opt -passes=loop-distribute -S < %s | FileCheck %s +; RUN: opt -passes=loop-distribute -verify-loop-info -verify-dom-info -S < %s | FileCheck %s ; ; Check that llvm.loop.distribute.enable overrides ; llvm.loop.disable_nonforced. 
; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +define void @disable_nonforced(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { ; CHECK-LABEL: @disable_nonforced( ; CHECK: for.body.ldist1: -define void @disable_nonforced(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopDistribute/early-exit.ll b/llvm/test/Transforms/LoopDistribute/early-exit.ll index e04811335e1bd7..9353d842523f44 100644 --- a/llvm/test/Transforms/LoopDistribute/early-exit.ll +++ b/llvm/test/Transforms/LoopDistribute/early-exit.ll @@ -1,10 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; REQUIRES: x86-registered-target ; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S %s | FileCheck %s -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - @B = common global ptr null, align 8 @A = common global ptr null, align 8 @C = common global ptr null, align 8 diff --git a/llvm/test/Transforms/LoopDistribute/followup.ll b/llvm/test/Transforms/LoopDistribute/followup.ll index 86cfb1855ae346..55307bdf249914 100644 --- a/llvm/test/Transforms/LoopDistribute/followup.ll +++ b/llvm/test/Transforms/LoopDistribute/followup.ll @@ -1,11 +1,93 @@ -; RUN: opt -passes=loop-distribute -S < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-distribute -verify-loop-info -verify-dom-info -S < %s | FileCheck %s ; ; Check that followup loop-attributes are applied to the loops after ; loop distribution. 
; -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define void @f(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) { +; CHECK-LABEL: define void @f( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_BODY_LVER_CHECK:.*]] +; CHECK: [[FOR_BODY_LVER_CHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 84 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C]], i64 80 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[D]], i64 80 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[E]], i64 80 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[B]], i64 80 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]] +; CHECK-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[A]], [[SCEVGEP3]] +; CHECK-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[E]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]] +; CHECK-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[C]], [[SCEVGEP4]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[FOR_BODY_PH_LVER_ORIG:.*]], label %[[FOR_BODY_PH_LDIST1:.*]] +; CHECK: [[FOR_BODY_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[FOR_BODY_LVER_ORIG:.*]] +; CHECK: 
[[FOR_BODY_LVER_ORIG]]: +; CHECK-NEXT: [[IND_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LVER_ORIG]] ], [ [[ADD_LVER_ORIG:%.*]], %[[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[ARRAYIDXA_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADA_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXA_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ARRAYIDXB_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADB_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXB_LVER_ORIG]], align 4 +; CHECK-NEXT: [[MULA_LVER_ORIG:%.*]] = mul i32 [[LOADB_LVER_ORIG]], [[LOADA_LVER_ORIG]] +; CHECK-NEXT: [[ADD_LVER_ORIG]] = add nuw nsw i64 [[IND_LVER_ORIG]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD_LVER_ORIG]] +; CHECK-NEXT: store i32 [[MULA_LVER_ORIG]], ptr [[ARRAYIDXA_PLUS_4_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ARRAYIDXD_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADD_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXD_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ARRAYIDXE_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADE_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXE_LVER_ORIG]], align 4 +; CHECK-NEXT: [[MULC_LVER_ORIG:%.*]] = mul i32 [[LOADD_LVER_ORIG]], [[LOADE_LVER_ORIG]] +; CHECK-NEXT: [[ARRAYIDXC_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: store i32 [[MULC_LVER_ORIG]], ptr [[ARRAYIDXC_LVER_ORIG]], align 4 +; CHECK-NEXT: [[EXITCOND_LVER_ORIG:%.*]] = icmp eq i64 [[ADD_LVER_ORIG]], 20 +; CHECK-NEXT: br i1 [[EXITCOND_LVER_ORIG]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_LVER_ORIG]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[FOR_BODY_PH_LDIST1]]: +; CHECK-NEXT: br label %[[FOR_BODY_LDIST1:.*]] +; CHECK: [[FOR_BODY_LDIST1]]: +; CHECK-NEXT: [[IND_LDIST1:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LDIST1]] ], [ [[ADD_LDIST1:%.*]], 
%[[FOR_BODY_LDIST1]] ] +; CHECK-NEXT: [[ARRAYIDXA_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND_LDIST1]] +; CHECK-NEXT: [[LOADA_LDIST1:%.*]] = load i32, ptr [[ARRAYIDXA_LDIST1]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] +; CHECK-NEXT: [[ARRAYIDXB_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND_LDIST1]] +; CHECK-NEXT: [[LOADB_LDIST1:%.*]] = load i32, ptr [[ARRAYIDXB_LDIST1]], align 4, !alias.scope [[META10:![0-9]+]] +; CHECK-NEXT: [[MULA_LDIST1:%.*]] = mul i32 [[LOADB_LDIST1]], [[LOADA_LDIST1]] +; CHECK-NEXT: [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD_LDIST1]] +; CHECK-NEXT: store i32 [[MULA_LDIST1]], ptr [[ARRAYIDXA_PLUS_4_LDIST1]], align 4, !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20 +; CHECK-NEXT: br i1 [[EXITCOND_LDIST1]], label %[[FOR_BODY_PH:.*]], label %[[FOR_BODY_LDIST1]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[FOR_BODY_PH]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1 +; CHECK-NEXT: [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IND]] +; CHECK-NEXT: [[LOADD:%.*]] = load i32, ptr [[ARRAYIDXD]], align 4, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[IND]] +; CHECK-NEXT: [[LOADE:%.*]] = load i32, ptr [[ARRAYIDXE]], align 4, !alias.scope [[META15:![0-9]+]] +; CHECK-NEXT: [[MULC:%.*]] = mul i32 [[LOADD]], [[LOADE]] +; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND]] +; CHECK-NEXT: store i32 [[MULC]], ptr [[ARRAYIDXC]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META10]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20 +; 
CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT16:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END_LOOPEXIT16]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -48,23 +130,24 @@ for.end: !3 = !{!"llvm.loop.distribute.followup_coincident", !{!"FollowupCoincident", i1 false}} !4 = !{!"llvm.loop.distribute.followup_sequential", !{!"FollowupSequential", i32 8}} !5 = !{!"llvm.loop.distribute.followup_fallback", !{!"FollowupFallback"}} - - -; CHECK-LABEL: for.body.lver.orig: -; CHECK: br i1 %exitcond.lver.orig, label %for.end.loopexit, label %for.body.lver.orig, !llvm.loop ![[LOOP_ORIG:[0-9]+]] -; CHECK-LABEL: for.body.ldist1: -; CHECK: br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1, !llvm.loop ![[LOOP_SEQUENTIAL:[0-9]+]] -; CHECK-LABEL: for.body: -; CHECK: br i1 %exitcond, label %for.end.loopexit16, label %for.body, !llvm.loop ![[LOOP_COINCIDENT:[0-9]+]] -; CHECK-LABEL: for.end.loopexit: -; CHECK: br label %for.end -; CHECK-LABEL: for.end.loopexit16: -; CHECK: br label %for.end - -; CHECK: ![[LOOP_ORIG]] = distinct !{![[LOOP_ORIG]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOUP_FALLBACK:[0-9]+]]} -; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"} -; CHECK: ![[FOLLOUP_FALLBACK]] = !{!"FollowupFallback"} -; CHECK: ![[LOOP_SEQUENTIAL]] = distinct !{![[LOOP_SEQUENTIAL]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_SEQUENTIAL:[0-9]+]]} -; CHECK: ![[FOLLOWUP_SEQUENTIAL]] = !{!"FollowupSequential", i32 8} -; CHECK: ![[LOOP_COINCIDENT]] = distinct !{![[LOOP_COINCIDENT]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_COINCIDENT:[0-9]+]]} -; CHECK: ![[FOLLOWUP_COINCIDENT]] = !{!"FollowupCoincident", i1 false} +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"FollowupAll"} +; CHECK: [[META2]] = !{!"FollowupFallback"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = distinct !{[[META5]], !"LVerDomain"} +; CHECK: [[META6]] = !{[[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META5]]} +; CHECK: [[META8]] = distinct !{[[META8]], [[META5]]} +; CHECK: [[META9]] = distinct !{[[META9]], [[META5]]} +; CHECK: [[META10]] = !{[[META11:![0-9]+]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META5]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META13:![0-9]+]]} +; CHECK: [[META13]] = !{!"FollowupSequential", i32 8} +; CHECK: [[META14]] = !{[[META8]]} +; CHECK: [[META15]] = !{[[META9]]} +; CHECK: [[META16]] = !{[[META7]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META18:![0-9]+]]} +; CHECK: [[META18]] = !{!"FollowupCoincident", i1 false} +;. diff --git a/llvm/test/Transforms/LoopDistribute/metadata.ll b/llvm/test/Transforms/LoopDistribute/metadata.ll index 7cd0415ec52c12..b0e461fe0ea756 100644 --- a/llvm/test/Transforms/LoopDistribute/metadata.ll +++ b/llvm/test/Transforms/LoopDistribute/metadata.ll @@ -5,20 +5,12 @@ ; properly according to -enable-loop-distribute=0/1 and the ; llvm.loop.distribute.enable metadata. 
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - +define void @explicit_on(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { ; CHECK-LABEL: @explicit_on( -define void @explicit_on(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { entry: br label %for.body ; EXPLICIT: for.body.ldist1: - for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] @@ -62,7 +54,6 @@ entry: br label %for.body ; EXPLICIT-NOT: for.body.ldist1: - for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] @@ -96,12 +87,9 @@ for.end: ; preds = %for.body ret void } -; CHECK-LABEL: @default_distribute( -define void @default_distribute(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, +define void @default_distribute(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { +; CHECK-LABEL: @default_distribute( entry: br label %for.body @@ -109,7 +97,6 @@ entry: ; DEFAULT_ON: for.body.ldist1: ; DEFAULT_OFF-NOT: for.body.ldist1: - for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] diff --git a/llvm/test/Transforms/LoopDistribute/no-if-convert.ll b/llvm/test/Transforms/LoopDistribute/no-if-convert.ll index 77e120e3462473..35a340e32b1ae5 100644 --- a/llvm/test/Transforms/LoopDistribute/no-if-convert.ll +++ b/llvm/test/Transforms/LoopDistribute/no-if-convert.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S < %s \ ; RUN: | FileCheck %s @@ -15,33 +16,63 @@ ; G[i] = H[i] * J[i]; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -define void @f(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - 
ptr noalias %e, - ptr noalias %g, - ptr noalias %h, - ptr noalias %j, - i64 %x) { -entry: - br label %for.body - ; Ensure that we have only two partitions, the first with one multiplication ; and the second with two. - -; CHECK: for.body.ldist1: -; CHECK: %mulC.ldist1 = mul i32 %loadD.ldist1, %loadE.ldist1 -; CHECK: br i1 %exitcond.ldist1, label %entry.split, label %for.body.ldist1 -; CHECK: entry.split: -; CHECK: br label %for.body -; CHECK: for.body: -; CHECK: %mulA = mul i32 %loadB, %loadA -; CHECK: %mulG = mul i32 %loadH, %loadJ -; CHECK: for.end: +define void @f(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e, ptr noalias %g, ptr noalias %h, ptr noalias %j, i64 %x) { +; CHECK-LABEL: define void @f( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[J:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[ENTRY_SPLIT_LDIST1:.*]] +; CHECK: [[ENTRY_SPLIT_LDIST1]]: +; CHECK-NEXT: br label %[[FOR_BODY_LDIST1:.*]] +; CHECK: [[FOR_BODY_LDIST1]]: +; CHECK-NEXT: [[IND_LDIST1:%.*]] = phi i64 [ 0, %[[ENTRY_SPLIT_LDIST1]] ], [ [[ADD_LDIST1:%.*]], %[[IF_END_LDIST1:.*]] ] +; CHECK-NEXT: [[ARRAYIDXD_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IND_LDIST1]] +; CHECK-NEXT: [[LOADD_LDIST1:%.*]] = load i32, ptr [[ARRAYIDXD_LDIST1]], align 4 +; CHECK-NEXT: [[ARRAYIDXE_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[IND_LDIST1]] +; CHECK-NEXT: [[LOADE_LDIST1:%.*]] = load i32, ptr [[ARRAYIDXE_LDIST1]], align 4 +; CHECK-NEXT: [[MULC_LDIST1:%.*]] = mul i32 [[LOADD_LDIST1]], [[LOADE_LDIST1]] +; CHECK-NEXT: [[ARRAYIDXC_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND_LDIST1]] +; CHECK-NEXT: store i32 [[MULC_LDIST1]], ptr [[ARRAYIDXC_LDIST1]], align 4 +; CHECK-NEXT: [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1 +; CHECK-NEXT: [[IF_COND_LDIST1:%.*]] 
= icmp eq i64 [[IND_LDIST1]], [[X]] +; CHECK-NEXT: br i1 [[IF_COND_LDIST1]], label %[[IF_THEN_LDIST1:.*]], label %[[IF_END_LDIST1]] +; CHECK: [[IF_THEN_LDIST1]]: +; CHECK-NEXT: br label %[[IF_END_LDIST1]] +; CHECK: [[IF_END_LDIST1]]: +; CHECK-NEXT: [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20 +; CHECK-NEXT: br i1 [[EXITCOND_LDIST1]], label %[[ENTRY_SPLIT:.*]], label %[[FOR_BODY_LDIST1]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, %[[ENTRY_SPLIT]] ], [ [[ADD:%.*]], %[[IF_END:.*]] ] +; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND]] +; CHECK-NEXT: [[LOADA:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4 +; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND]] +; CHECK-NEXT: [[LOADB:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4 +; CHECK-NEXT: [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]] +; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD]] +; CHECK-NEXT: store i32 [[MULA]], ptr [[ARRAYIDXA_PLUS_4]], align 4 +; CHECK-NEXT: [[IF_COND:%.*]] = icmp eq i64 [[IND]], [[X]] +; CHECK-NEXT: br i1 [[IF_COND]], label %[[IF_THEN:.*]], label %[[IF_END]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[ARRAYIDXH:%.*]] = getelementptr inbounds i32, ptr [[H]], i64 [[IND]] +; CHECK-NEXT: [[LOADH:%.*]] = load i32, ptr [[ARRAYIDXH]], align 4 +; CHECK-NEXT: [[ARRAYIDXJ:%.*]] = getelementptr inbounds i32, ptr [[J]], i64 [[IND]] +; CHECK-NEXT: [[LOADJ:%.*]] = load i32, ptr [[ARRAYIDXJ]], align 4 +; CHECK-NEXT: [[MULG:%.*]] = mul i32 [[LOADH]], [[LOADJ]] +; CHECK-NEXT: [[ARRAYIDXG:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[IND]] +; CHECK-NEXT: store i32 [[MULG]], ptr [[ARRAYIDXG]], align 4 +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20 +; CHECK-NEXT: br i1 [[EXITCOND]], label 
%[[FOR_END:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %if.end ] diff --git a/llvm/test/Transforms/LoopDistribute/outside-use.ll b/llvm/test/Transforms/LoopDistribute/outside-use.ll index e564c445fc6b93..661f8062ae268e 100644 --- a/llvm/test/Transforms/LoopDistribute/outside-use.ll +++ b/llvm/test/Transforms/LoopDistribute/outside-use.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S < %s \ ; RUN: | FileCheck %s @@ -11,9 +12,6 @@ ; sum += C[i]; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - @B = common global ptr null, align 8 @A = common global ptr null, align 8 @C = common global ptr null, align 8 @@ -22,6 +20,75 @@ target triple = "x86_64-apple-macosx10.10.0" @SUM = common global i32 0, align 8 define void @f() { +; CHECK-LABEL: define void @f() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A:%.*]] = load ptr, ptr @A, align 8 +; CHECK-NEXT: [[B:%.*]] = load ptr, ptr @B, align 8 +; CHECK-NEXT: [[C:%.*]] = load ptr, ptr @C, align 8 +; CHECK-NEXT: [[D:%.*]] = load ptr, ptr @D, align 8 +; CHECK-NEXT: [[E:%.*]] = load ptr, ptr @E, align 8 +; CHECK-NEXT: br label %[[FOR_BODY_LVER_CHECK:.*]] +; CHECK: [[FOR_BODY_LVER_CHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 84 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C]], i64 80 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[FOR_BODY_PH_LVER_ORIG:.*]], label %[[FOR_BODY_PH_LDIST1:.*]] +; CHECK: [[FOR_BODY_PH_LVER_ORIG]]: +; 
CHECK-NEXT: br label %[[FOR_BODY_LVER_ORIG:.*]] +; CHECK: [[FOR_BODY_LVER_ORIG]]: +; CHECK-NEXT: [[IND_LVER_ORIG:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LVER_ORIG]] ], [ [[ADD_LVER_ORIG:%.*]], %[[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[SUM_LVER_ORIG:%.*]] = phi i32 [ 0, %[[FOR_BODY_PH_LVER_ORIG]] ], [ [[SUM_ADD_LVER_ORIG:%.*]], %[[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[ARRAYIDXA_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADA_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXA_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ARRAYIDXB_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADB_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXB_LVER_ORIG]], align 4 +; CHECK-NEXT: [[MULA_LVER_ORIG:%.*]] = mul i32 [[LOADB_LVER_ORIG]], [[LOADA_LVER_ORIG]] +; CHECK-NEXT: [[ADD_LVER_ORIG]] = add nuw nsw i64 [[IND_LVER_ORIG]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD_LVER_ORIG]] +; CHECK-NEXT: store i32 [[MULA_LVER_ORIG]], ptr [[ARRAYIDXA_PLUS_4_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ARRAYIDXC_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND_LVER_ORIG]] +; CHECK-NEXT: [[LOADC_LVER_ORIG:%.*]] = load i32, ptr [[ARRAYIDXC_LVER_ORIG]], align 4 +; CHECK-NEXT: [[SUM_ADD_LVER_ORIG]] = add nuw nsw i32 [[SUM_LVER_ORIG]], [[LOADC_LVER_ORIG]] +; CHECK-NEXT: [[EXITCOND_LVER_ORIG:%.*]] = icmp eq i64 [[ADD_LVER_ORIG]], 20 +; CHECK-NEXT: br i1 [[EXITCOND_LVER_ORIG]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_LVER_ORIG]] +; CHECK: [[FOR_BODY_PH_LDIST1]]: +; CHECK-NEXT: br label %[[FOR_BODY_LDIST1:.*]] +; CHECK: [[FOR_BODY_LDIST1]]: +; CHECK-NEXT: [[IND_LDIST1:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH_LDIST1]] ], [ [[ADD_LDIST1:%.*]], %[[FOR_BODY_LDIST1]] ] +; CHECK-NEXT: [[ARRAYIDXA_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND_LDIST1]] +; CHECK-NEXT: [[LOADA_LDIST1:%.*]] = load i32, ptr [[ARRAYIDXA_LDIST1]], align 4, 
!alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[ARRAYIDXB_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND_LDIST1]] +; CHECK-NEXT: [[LOADB_LDIST1:%.*]] = load i32, ptr [[ARRAYIDXB_LDIST1]], align 4, !alias.scope [[META5:![0-9]+]] +; CHECK-NEXT: [[MULA_LDIST1:%.*]] = mul i32 [[LOADB_LDIST1]], [[LOADA_LDIST1]] +; CHECK-NEXT: [[ADD_LDIST1]] = add nuw nsw i64 [[IND_LDIST1]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4_LDIST1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD_LDIST1]] +; CHECK-NEXT: store i32 [[MULA_LDIST1]], ptr [[ARRAYIDXA_PLUS_4_LDIST1]], align 4, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[EXITCOND_LDIST1:%.*]] = icmp eq i64 [[ADD_LDIST1]], 20 +; CHECK-NEXT: br i1 [[EXITCOND_LDIST1]], label %[[FOR_BODY_PH:.*]], label %[[FOR_BODY_LDIST1]] +; CHECK: [[FOR_BODY_PH]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, %[[FOR_BODY_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[FOR_BODY_PH]] ], [ [[SUM_ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1 +; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND]] +; CHECK-NEXT: [[LOADC:%.*]] = load i32, ptr [[ARRAYIDXC]], align 4, !alias.scope [[META3]] +; CHECK-NEXT: [[SUM_ADD]] = add nuw nsw i32 [[SUM]], [[LOADC]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT2:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: [[SUM_ADD_LVER_PH:%.*]] = phi i32 [ [[SUM_ADD_LVER_ORIG]], %[[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: br label %[[FOR_END:.*]] +; CHECK: [[FOR_END_LOOPEXIT2]]: +; CHECK-NEXT: [[SUM_ADD_LVER_PH3:%.*]] = phi i32 [ [[SUM_ADD]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: [[SUM_ADD_LVER:%.*]] = phi i32 [ [[SUM_ADD_LVER_PH]], %[[FOR_END_LOOPEXIT]] ], [ 
[[SUM_ADD_LVER_PH3]], %[[FOR_END_LOOPEXIT2]] ] +; CHECK-NEXT: store i32 [[SUM_ADD_LVER]], ptr @SUM, align 4 +; CHECK-NEXT: ret void +; entry: %a = load ptr, ptr @A, align 8 %b = load ptr, ptr @B, align 8 @@ -31,18 +98,6 @@ entry: br label %for.body -; CHECK: for.body.ldist1: -; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1 -; CHECK: for.body.ph: -; CHECK: for.body: -; CHECK: %sum_add = add nuw nsw i32 %sum, %loadC -; CHECK: for.end.loopexit: -; CHECK: %sum_add.lver.ph = phi i32 [ %sum_add.lver.orig, %for.body.lver.orig ] -; CHECK: for.end.loopexit2: -; CHECK: %sum_add.lver.ph3 = phi i32 [ %sum_add, %for.body ] -; CHECK: for.end: -; CHECK: %sum_add.lver = phi i32 [ %sum_add.lver.ph, %for.end.loopexit ], [ %sum_add.lver.ph3, %for.end.loopexit2 ] - for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] %sum = phi i32 [ 0, %entry ], [ %sum_add, %for.body ] @@ -71,3 +126,12 @@ for.end: ; preds = %for.body store i32 %sum_add, ptr @SUM, align 4 ret void } +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +; CHECK: [[META5]] = !{[[META6:![0-9]+]]} +; CHECK: [[META6]] = distinct !{[[META6]], [[META2]]} +;. 
diff --git a/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll b/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll index d99c6969ab273e..2ab9140baf866f 100644 --- a/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll +++ b/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -aa-pipeline=basic-aa -passes='loop-distribute' -enable-loop-distribute -S %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S %s | FileCheck %s ; Testcases inspired by PR50296, PR50288. @@ -52,9 +52,69 @@ for.end.loopexit: ; preds = %if.end ret void } -define void @phi_load_distribute(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @phi_load_distribute(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: @phi_load_distribute( ; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]] +; CHECK: for.body.lver.check: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 2 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] +; CHECK: for.body.ph.lver.orig: +; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] +; CHECK: for.body.lver.orig: +; CHECK-NEXT: [[IV_LVER_ORIG:%.*]] = phi i16 
[ 0, [[FOR_BODY_PH_LVER_ORIG]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], [[IF_END_LVER_ORIG:%.*]] ] +; CHECK-NEXT: [[LV_LVER_ORIG:%.*]] = load i16, ptr [[A]], align 1 +; CHECK-NEXT: store i16 [[LV_LVER_ORIG]], ptr [[A]], align 1 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN_LVER_ORIG:%.*]], label [[IF_END_LVER_ORIG]] +; CHECK: if.then.lver.orig: +; CHECK-NEXT: [[LV2_LVER_ORIG:%.*]] = load i16, ptr [[A]], align 1 +; CHECK-NEXT: br label [[IF_END_LVER_ORIG]] +; CHECK: if.end.lver.orig: +; CHECK-NEXT: [[C_SINK_LVER_ORIG:%.*]] = phi ptr [ [[B]], [[IF_THEN_LVER_ORIG]] ], [ [[C]], [[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[LV3_LVER_ORIG:%.*]] = load i16, ptr [[C_SINK_LVER_ORIG]], align 2 +; CHECK-NEXT: [[IV_NEXT_LVER_ORIG]] = add nuw nsw i16 [[IV_LVER_ORIG]], 1 +; CHECK-NEXT: [[TOBOOL_NOT_LVER_ORIG:%.*]] = icmp eq i16 [[IV_NEXT_LVER_ORIG]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] +; CHECK: for.body.ph.ldist1: +; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] +; CHECK: for.body.ldist1: +; CHECK-NEXT: [[IV_LDIST1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LDIST1]] ], [ [[IV_NEXT_LDIST1:%.*]], [[IF_END_LDIST1:%.*]] ] +; CHECK-NEXT: [[LV_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store i16 [[LV_LDIST1]], ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN_LDIST1:%.*]], label [[IF_END_LDIST1]] +; CHECK: if.then.ldist1: +; CHECK-NEXT: [[LV2_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: br label [[IF_END_LDIST1]] +; CHECK: if.end.ldist1: +; CHECK-NEXT: [[IV_NEXT_LDIST1]] = add nuw nsw i16 [[IV_LDIST1]], 1 +; CHECK-NEXT: [[TOBOOL_NOT_LDIST1:%.*]] = icmp eq i16 [[IV_NEXT_LDIST1]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT_LDIST1]], label [[FOR_BODY_PH:%.*]], label [[FOR_BODY_LDIST1]] +; CHECK: for.body.ph: +; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[FOR_BODY_PH]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B]], [[IF_THEN]] ], [ [[C]], [[FOR_BODY]] ] +; CHECK-NEXT: [[LV3:%.*]] = load i16, ptr [[C_SINK]], align 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT_LOOPEXIT6:%.*]], label [[FOR_BODY]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit.loopexit6: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; @@ -65,7 +125,7 @@ for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 diff --git a/llvm/test/Transforms/LoopDistribute/pr28443.ll b/llvm/test/Transforms/LoopDistribute/pr28443.ll index 15908dc1fc6f4a..dee448fe7d6a9f 100644 --- a/llvm/test/Transforms/LoopDistribute/pr28443.ll +++ b/llvm/test/Transforms/LoopDistribute/pr28443.ll @@ -1,10 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \ ; RUN: < %s | FileCheck %s -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - define void @fn1(i64 %a, ptr %b) { +; CHECK-LABEL: define void @fn1( +; CHECK-SAME: i64 [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: 
[[FOR_BODY]]: +; CHECK-NEXT: [[ADD75_EPIL:%.*]] = phi i64 [ [[ADD7_EPIL:%.*]], %[[FOR_BODY]] ], [ [[A]], %[[ENTRY]] ] +; CHECK-NEXT: [[ADD1_EPIL:%.*]] = add nsw i64 [[ADD75_EPIL]], 268435457 +; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[ADD1_EPIL]] +; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[ARRAYIDX_EPIL]], align 8 +; CHECK-NEXT: [[ADD5_EPIL:%.*]] = add nsw i64 [[ADD75_EPIL]], 805306369 +; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[ADD5_EPIL]] +; CHECK-NEXT: store i64 [[LOAD]], ptr [[ARRAYIDX6_EPIL]], align 8 +; CHECK-NEXT: [[ADD7_EPIL]] = add nsw i64 [[ADD75_EPIL]], 2 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp eq i64 [[ADD7_EPIL]], 0 +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -20,17 +36,6 @@ for.body: %epil.iter.cmp = icmp eq i64 %add7.epil, 0 br i1 %epil.iter.cmp, label %for.end, label %for.body - ; CHECK: %[[phi:.*]] = phi i64 - ; CHECK: %[[add1:.*]] = add nsw i64 %[[phi]], 268435457 - ; CHECK: %[[gep1:.*]] = getelementptr inbounds i64, ptr %b, i64 %[[add1]] - ; CHECK: %[[load:.*]] = load i64, ptr %[[gep1]], align 8 - ; CHECK: %[[add2:.*]] = add nsw i64 %[[phi]], 805306369 - ; CHECK: %[[gep2:.*]] = getelementptr inbounds i64, ptr %b, i64 %[[add2]] - ; CHECK: store i64 %[[load]], ptr %[[gep2]], align 8 - ; CHECK: %[[incr:.*]] = add nsw i64 %[[phi]], 2 - ; CHECK: %[[cmp:.*]] = icmp eq i64 %[[incr]], 0 - ; CHECK: br i1 %[[cmp]] - for.end: ret void } diff --git a/llvm/test/Transforms/LoopDistribute/program-order.ll b/llvm/test/Transforms/LoopDistribute/program-order.ll index 42e80f22e69304..c926cc0464a7e8 100644 --- a/llvm/test/Transforms/LoopDistribute/program-order.ll +++ b/llvm/test/Transforms/LoopDistribute/program-order.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt 
-passes=loop-distribute -enable-loop-distribute -S -verify-loop-info -verify-dom-info < %s \ ; RUN: | FileCheck %s @@ -12,24 +13,36 @@ ; S3: C[i] = d * E[i]; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -define void @f(ptr noalias %a, - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - ptr noalias %e) { +define void @f(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr noalias %e) { +; CHECK-LABEL: define void @f( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND]] +; CHECK-NEXT: [[LOADA:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4 +; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND]] +; CHECK-NEXT: [[LOADB:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4 +; CHECK-NEXT: [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]] +; CHECK-NEXT: [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IND]] +; CHECK-NEXT: [[LOADD:%.*]] = load i32, ptr [[ARRAYIDXD]], align 4 +; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD]] +; CHECK-NEXT: store i32 [[MULA]], ptr [[ARRAYIDXA_PLUS_4]], align 4 +; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND]] +; CHECK-NEXT: [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[IND]] +; CHECK-NEXT: [[LOADE:%.*]] = load i32, ptr [[ARRAYIDXE]], align 4 +; CHECK-NEXT: [[MULC:%.*]] = mul i32 [[LOADD]], [[LOADE]] +; CHECK-NEXT: store i32 [[MULC]], ptr [[ARRAYIDXC]], align 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20 +; CHECK-NEXT: br i1 
[[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; entry: br label %for.body -; CHECK: entry: -; CHECK: br label %for.body -; CHECK: for.body: -; CHECK: br i1 %exitcond, label %for.end, label %for.body -; CHECK: for.end: -; CHECK: ret void - for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] diff --git a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll index fa3448e28041ab..4e75699c91773f 100644 --- a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll +++ b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=loop-distribute -enable-loop-distribute -S -enable-mem-access-versioning=0 < %s | FileCheck %s - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S -enable-mem-access-versioning=0 < %s | FileCheck %s ; PredicatedScalarEvolution decides it needs to insert a bounds check ; not based on memory access. 
diff --git a/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll b/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll index 07c50b5b61162b..2edd6e4f85a648 100644 --- a/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll +++ b/llvm/test/Transforms/LoopDistribute/symbolic-stride.ll @@ -14,12 +14,7 @@ ; C[i] = D[i] * A[stride * i]; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - -define void @f(ptr noalias %a, -; -; +define void @f(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, i64 %stride) { ; DEFAULT-LABEL: @f( ; DEFAULT-NEXT: entry: ; DEFAULT-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]] @@ -110,10 +105,6 @@ define void @f(ptr noalias %a, ; NO-VERSION: for.end: ; NO-VERSION-NEXT: ret void ; - ptr noalias %b, - ptr noalias %c, - ptr noalias %d, - i64 %stride) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopDistribute/uncomputable-backedge-taken-count.ll b/llvm/test/Transforms/LoopDistribute/uncomputable-backedge-taken-count.ll index 075f87097a82df..5427c38ec6cfdd 100644 --- a/llvm/test/Transforms/LoopDistribute/uncomputable-backedge-taken-count.ll +++ b/llvm/test/Transforms/LoopDistribute/uncomputable-backedge-taken-count.ll @@ -1,9 +1,6 @@ ; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \ ; RUN: < %s | FileCheck %s -target datalayout = "e-m:o-i32:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-apple-macosx10.10.0" - ; NOTE: The tests below use infinite loops to force unknown backedge-taken counts. ; Making the exit condition depend on a load would break current loop-distribute, ; because it requires all accesses to end up in either of the loops, but not both. @@ -11,9 +8,7 @@ target triple = "x86_64-apple-macosx10.10.0" ; TODO ; Can distribute with unknown backedge-taken count, because no runtime checks are ; required. 
-define void @unknown_btc_distribute_no_checks_needed(ptr noalias %a, - ptr noalias %c, - ptr noalias %d) { +define void @unknown_btc_distribute_no_checks_needed(ptr noalias %a, ptr noalias %c, ptr noalias %d) { ; CHECK-LABEL: @unknown_btc_distribute_no_checks_needed( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -49,9 +44,7 @@ for.end: ; preds = %for.body ; Cannot distribute with unknown backedge-taken count, because runtime checks for ; induction wrapping are required. -define void @unknown_btc_do_not_distribute_wrapping_checks(ptr noalias %a, - ptr noalias %c, - ptr noalias %d) { +define void @unknown_btc_do_not_distribute_wrapping_checks(ptr noalias %a, ptr noalias %c, ptr noalias %d) { ; CHECK-LABEL: @unknown_btc_do_not_distribute_wrapping_checks( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body diff --git a/llvm/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll b/llvm/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll index 2f96f9621b9d91..208648b9ec20da 100644 --- a/llvm/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll +++ b/llvm/test/Transforms/LoopDistribute/unknown-bounds-for-memchecks.ll @@ -1,4 +1,5 @@ -; RUN: opt -passes=loop-distribute -enable-loop-distribute -S < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S < %s | FileCheck %s ; If we can't find the bounds for one of the arrays in order to generate the ; memchecks (e.g., C[i * i] below), loop shold not get distributed. @@ -9,8 +10,6 @@ ; C[i * i] = B[i] * 2; ; } -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" - ; Verify that we didn't distribute by checking that we still have the original ; number of branches. 
@@ -19,12 +18,36 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @C = common global ptr null, align 8 define void @f() { +; CHECK-LABEL: define void @f() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A:%.*]] = load ptr, ptr @A, align 8 +; CHECK-NEXT: [[B:%.*]] = load ptr, ptr @B, align 8 +; CHECK-NEXT: [[C:%.*]] = load ptr, ptr @C, align 8 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IND:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND]] +; CHECK-NEXT: [[LOADA:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4 +; CHECK-NEXT: [[MULA:%.*]] = mul i32 [[LOADA]], 3 +; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[IND]], 1 +; CHECK-NEXT: [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD]] +; CHECK-NEXT: store i32 [[MULA]], ptr [[ARRAYIDXA_PLUS_4]], align 4 +; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND]] +; CHECK-NEXT: [[LOADB:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4 +; CHECK-NEXT: [[MULC:%.*]] = mul i32 [[LOADB]], 2 +; CHECK-NEXT: [[IND_2:%.*]] = mul i64 [[IND]], [[IND]] +; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND_2]] +; CHECK-NEXT: store i32 [[MULC]], ptr [[ARRAYIDXC]], align 4 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; entry: %a = load ptr, ptr @A, align 8 %b = load ptr, ptr @B, align 8 %c = load ptr, ptr @C, align 8 br label %for.body -; CHECK: br for.body: ; preds = %for.body, %entry %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] @@ -49,8 +72,6 @@ for.body: ; preds = %for.body, %entry %exitcond = icmp eq i64 %add, 20 br i1 %exitcond, label %for.end, label %for.body -; CHECK: br -; CHECK-NOT: br for.end: ; preds = %for.body ret void From 0488f210d91f50b2c6318e757c07fb598dd33693 
Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 5 Jul 2024 16:43:54 +0100 Subject: [PATCH 17/67] [InstCombine][X86] Add test showing failure to peek through bitcast+shuffle+bitcast sequence to fold BLENDV to SELECT Mentioned on #96882 --- .../Transforms/InstCombine/X86/blend_x86.ll | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll index f12cc1560c0a43..6ed9acd718ccc0 100644 --- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll +++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll @@ -282,6 +282,37 @@ define <2 x i64> @sel_v16i8_sse_reality(ptr nocapture readonly %x, <2 x i64> %y, ret <2 x i64> %rcast } +define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { +; CHECK-LABEL: @sel_v16i8_bitcast_shuffle_bitcast_cmp( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <8 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32> +; CHECK-NEXT: [[A_BC:%.*]] = bitcast <8 x float> [[A]] to <8 x i32> +; CHECK-NEXT: [[B_BC:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> +; CHECK-NEXT: [[SEXT_LO:%.*]] = shufflevector <8 x i32> [[SEXT]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[A_LO:%.*]] = shufflevector <8 x i32> [[A_BC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[B_LO:%.*]] = shufflevector <8 x i32> [[B_BC]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[A_LO_BC:%.*]] = bitcast <4 x i32> [[A_LO]] to <16 x i8> +; CHECK-NEXT: [[B_LO_BC:%.*]] = bitcast <4 x i32> [[B_LO]] to <16 x i8> +; CHECK-NEXT: [[SEXT_LO_BC:%.*]] = bitcast <4 x i32> [[SEXT_LO]] to <16 x i8> +; CHECK-NEXT: [[BLENDV:%.*]] = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A_LO_BC]], <16 x i8> [[B_LO_BC]], <16 x i8> [[SEXT_LO_BC]]) +; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i8> [[BLENDV]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %cmp = fcmp olt <8 x float> %a, %b 
+ %sext = sext <8 x i1> %cmp to <8 x i32> + %a.bc = bitcast <8 x float> %a to <8 x i32> + %b.bc = bitcast <8 x float> %b to <8 x i32> + %sext.lo = shufflevector <8 x i32> %sext, <8 x i32> poison, <4 x i32> + %a.lo = shufflevector <8 x i32> %a.bc, <8 x i32> poison, <4 x i32> + %b.lo = shufflevector <8 x i32> %b.bc, <8 x i32> poison, <4 x i32> + %a.lo.bc = bitcast <4 x i32> %a.lo to <16 x i8> + %b.lo.bc = bitcast <4 x i32> %b.lo to <16 x i8> + %sext.lo.bc = bitcast <4 x i32> %sext.lo to <16 x i8> + %blendv = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a.lo.bc, <16 x i8> %b.lo.bc, <16 x i8> %sext.lo.bc) + %res = bitcast <16 x i8> %blendv to <4 x float> + ret <4 x float> %res +} + declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) From 6c1c97c5b6744397063d9976bead154be38b8388 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 5 Jul 2024 16:53:06 +0100 Subject: [PATCH 18/67] [InstCombine][X86] Peek through bitcast+shuffle+bitcast sequence when folding BLENDV to SELECT Mentioned on #96882 --- llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp | 5 +++-- llvm/test/Transforms/InstCombine/X86/blend_x86.ll | 10 +++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 8eea368b5f86f9..322cb6f6f5819b 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -2882,6 +2882,8 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); } + Mask = InstCombiner::peekThroughBitcast(Mask); + // Peek through a one-use shuffle - VectorCombine should have simplified // this for cases where we're splitting wider vectors to use 
blendv // intrinsics. @@ -2895,13 +2897,12 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { any_of(ShuffleMask, [NumElts](int M) { return M < 0 || M >= NumElts; })) break; - Mask = MaskSrc; + Mask = InstCombiner::peekThroughBitcast(MaskSrc); } // Convert to a vector select if we can bypass casts and find a boolean // vector condition value. Value *BoolVec; - Mask = InstCombiner::peekThroughBitcast(Mask); if (match(Mask, m_SExt(m_Value(BoolVec))) && BoolVec->getType()->isVectorTy() && BoolVec->getType()->getScalarSizeInBits() == 1) { diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll index 6ed9acd718ccc0..aa49f493c9fa1a 100644 --- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll +++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll @@ -285,17 +285,13 @@ define <2 x i64> @sel_v16i8_sse_reality(ptr nocapture readonly %x, <2 x i64> %y, define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { ; CHECK-LABEL: @sel_v16i8_bitcast_shuffle_bitcast_cmp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <8 x float> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32> ; CHECK-NEXT: [[A_BC:%.*]] = bitcast <8 x float> [[A]] to <8 x i32> ; CHECK-NEXT: [[B_BC:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> -; CHECK-NEXT: [[SEXT_LO:%.*]] = shufflevector <8 x i32> [[SEXT]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[A_LO:%.*]] = shufflevector <8 x i32> [[A_BC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[B_LO:%.*]] = shufflevector <8 x i32> [[B_BC]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[A_LO_BC:%.*]] = bitcast <4 x i32> [[A_LO]] to <16 x i8> -; CHECK-NEXT: [[B_LO_BC:%.*]] = bitcast <4 x i32> [[B_LO]] to <16 x i8> -; CHECK-NEXT: [[SEXT_LO_BC:%.*]] = bitcast <4 x i32> [[SEXT_LO]] to <16 x i8> -; CHECK-NEXT: [[BLENDV:%.*]] = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A_LO_BC]], <16 x 
i8> [[B_LO_BC]], <16 x i8> [[SEXT_LO_BC]]) -; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i8> [[BLENDV]] to <4 x float> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[B_LO]], <4 x i32> [[A_LO]] +; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[RES]] ; %cmp = fcmp olt <8 x float> %a, %b From 3bfc5167d9e49b9a53e364e8d8853fce543cca0f Mon Sep 17 00:00:00 2001 From: walter erquinigo Date: Fri, 5 Jul 2024 11:50:02 -0400 Subject: [PATCH 19/67] [lldb-dap][NFC] Minor rename As a minor follow up for https://github.com/llvm/llvm-project/pull/97675, I'm renaming `SupportsExceptionBreakpoints` to `SupportsExceptionBreakpointsOnThrow` and adding a `SupportsExceptionBreakpointsOnCatch` to have a bit of more granularity. --- lldb/include/lldb/Target/Language.h | 10 +++++++--- lldb/source/Commands/CommandObjectBreakpoint.cpp | 3 ++- lldb/source/Plugins/Language/ObjC/ObjCLanguage.h | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h index 2d6e5a40a0c0e4..83bf7635e369a5 100644 --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -363,9 +363,13 @@ class Language : public PluginInterface { return false; } - /// Returns true if this Language supports exception breakpoints via a - /// corresponding LanguageRuntime plugin. - virtual bool SupportsExceptionBreakpoints() const { return false; } + /// Returns true if this Language supports exception breakpoints on throw via + /// a corresponding LanguageRuntime plugin. + virtual bool SupportsExceptionBreakpointsOnThrow() const { return false; } + + /// Returns true if this Language supports exception breakpoints on catch via + /// a corresponding LanguageRuntime plugin. 
+ virtual bool SupportsExceptionBreakpointsOnCatch() const { return false; } protected: // Classes that inherit from Language can see and modify these diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index a5fe9273fac76d..773f8ed2fa8af8 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -317,7 +317,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { break; default: if (Language *languagePlugin = Language::FindPlugin(language)) { - if (languagePlugin->SupportsExceptionBreakpoints()) { + if (languagePlugin->SupportsExceptionBreakpointsOnThrow() || + languagePlugin->SupportsExceptionBreakpointsOnCatch()) { m_exception_language = language; break; } diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index a61d0f128370d4..d9c0cd3c18cfa1 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -194,7 +194,7 @@ class ObjCLanguage : public Language { llvm::StringRef GetInstanceVariableName() override { return "self"; } - bool SupportsExceptionBreakpoints() const override { return true; } + bool SupportsExceptionBreakpointsOnThrow() const override { return true; } // PluginInterface protocol llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } From 44248d2d248be43c3e55d1ab6808342f63e0c70c Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 5 Jul 2024 10:56:33 -0700 Subject: [PATCH 20/67] [PGO][compiler-rt] Add a test to ensure include files do not diverge (#97775) Memprof has two include files that are duplicated between LLVM and compiler-rt. They need to stay in sync to ensure correct functionality, but the comments can be somewhat easy to miss, which causes fixups like 839ed1ba553346b0c225e9b839cf3cb716dc7412 to be needed. 
This patch adds a test to ensure that the files are the same between LLVM and compiler-rt to catch this ideally before commit, but if not, soon afterwards. There is additionally `InstrProfData.inc` for some PGO variants that is added to the same test here as well. --- compiler-rt/test/profile/check-same-common-code.test | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 compiler-rt/test/profile/check-same-common-code.test diff --git a/compiler-rt/test/profile/check-same-common-code.test b/compiler-rt/test/profile/check-same-common-code.test new file mode 100644 index 00000000000000..81b9836760dd66 --- /dev/null +++ b/compiler-rt/test/profile/check-same-common-code.test @@ -0,0 +1,7 @@ +; +; NOTE: if this test fails, please make sure the files are identical +; copies of each other. +; +; RUN: diff %crt_src/include/profile/MIBEntryDef.inc %llvm_src/include/llvm/ProfileData/MIBEntryDef.inc +; RUN: diff %crt_src/include/profile/MemProfData.inc %llvm_src/include/llvm/ProfileData/MemProfData.inc +; RUN: diff %crt_src/include/profile/InstrProfData.inc %llvm_src/include/llvm/ProfileData/InstrProfData.inc From 23d1d959583c35e6eab7e3e70d4c16449b418563 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 5 Jul 2024 10:59:51 -0700 Subject: [PATCH 21/67] [LLVM][compiler-rt][AArch64] Refactor AArch64 CPU features (#97777) This patch refactors the AArch64 CPUFeatures enum into a separate include file that is identical between LLVM and compiler-rt. This, along with a test in compiler-rt to ensure that the two stay in sync. 
--- .../builtins/cpu_model/AArch64CPUFeatures.inc | 91 +++++++++++++++++++ compiler-rt/lib/builtins/cpu_model/aarch64.h | 69 +------------- .../TestCases/check-same-common-code.test | 5 + compiler-rt/test/builtins/lit.cfg.py | 2 +- .../llvm/TargetParser/AArch64CPUFeatures.inc | 91 +++++++++++++++++++ .../llvm/TargetParser/AArch64TargetParser.h | 69 +------------- 6 files changed, 190 insertions(+), 137 deletions(-) create mode 100644 compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc create mode 100644 compiler-rt/test/builtins/TestCases/check-same-common-code.test create mode 100644 llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc diff --git a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc new file mode 100644 index 00000000000000..e78bb88cfedf21 --- /dev/null +++ b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc @@ -0,0 +1,91 @@ +//===- AArch64CPUFeatures.inc - AArch64 CPU Features enum -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the CPUFeatures enum for AArch64 to facilitate better +// testing of this code between LLVM and compiler-rt, primarily that the files +// are an exact match. +// +// This file has two identical copies. The primary copy lives in LLVM and +// the other one sits in compiler-rt/lib/builtins/cpu_model directory. To make +// changes in this file, first modify the primary copy and copy it over to +// compiler-rt. compiler-rt tests will fail if the two files are not synced up. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64_CPU_FEATURS_INC_H +#define AARCH64_CPU_FEATURS_INC_H + +// Function Multi Versioning CPU features. +enum CPUFeatures { + FEAT_RNG, + FEAT_FLAGM, + FEAT_FLAGM2, + FEAT_FP16FML, + FEAT_DOTPROD, + FEAT_SM4, + FEAT_RDM, + FEAT_LSE, + FEAT_FP, + FEAT_SIMD, + FEAT_CRC, + FEAT_SHA1, + FEAT_SHA2, + FEAT_SHA3, + FEAT_AES, + FEAT_PMULL, + FEAT_FP16, + FEAT_DIT, + FEAT_DPB, + FEAT_DPB2, + FEAT_JSCVT, + FEAT_FCMA, + FEAT_RCPC, + FEAT_RCPC2, + FEAT_FRINTTS, + FEAT_DGH, + FEAT_I8MM, + FEAT_BF16, + FEAT_EBF16, + FEAT_RPRES, + FEAT_SVE, + FEAT_SVE_BF16, + FEAT_SVE_EBF16, + FEAT_SVE_I8MM, + FEAT_SVE_F32MM, + FEAT_SVE_F64MM, + FEAT_SVE2, + FEAT_SVE_AES, + FEAT_SVE_PMULL128, + FEAT_SVE_BITPERM, + FEAT_SVE_SHA3, + FEAT_SVE_SM4, + FEAT_SME, + FEAT_MEMTAG, + FEAT_MEMTAG2, + FEAT_MEMTAG3, + FEAT_SB, + FEAT_PREDRES, + FEAT_SSBS, + FEAT_SSBS2, + FEAT_BTI, + FEAT_LS64, + FEAT_LS64_V, + FEAT_LS64_ACCDATA, + FEAT_WFXT, + FEAT_SME_F64, + FEAT_SME_I64, + FEAT_SME2, + FEAT_RCPC3, + FEAT_MOPS, + FEAT_MAX, + FEAT_EXT = 62, // Reserved to indicate presence of additional features field + // in __aarch64_cpu_features + FEAT_INIT // Used as flag of features initialization completion +}; + +#endif diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.h b/compiler-rt/lib/builtins/cpu_model/aarch64.h index 15d5300da53ba8..f6cbf75d582f3a 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.h +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.h @@ -14,74 +14,7 @@ #if !defined(DISABLE_AARCH64_FMV) -// CPUFeatures must correspond to the same AArch64 features in -// AArch64TargetParser.h -enum CPUFeatures { - FEAT_RNG, - FEAT_FLAGM, - FEAT_FLAGM2, - FEAT_FP16FML, - FEAT_DOTPROD, - FEAT_SM4, - FEAT_RDM, - FEAT_LSE, - FEAT_FP, - FEAT_SIMD, - FEAT_CRC, - FEAT_SHA1, - FEAT_SHA2, - FEAT_SHA3, - FEAT_AES, - FEAT_PMULL, - FEAT_FP16, - FEAT_DIT, - FEAT_DPB, - FEAT_DPB2, - FEAT_JSCVT, - FEAT_FCMA, - FEAT_RCPC, - 
FEAT_RCPC2, - FEAT_FRINTTS, - FEAT_DGH, - FEAT_I8MM, - FEAT_BF16, - FEAT_EBF16, - FEAT_RPRES, - FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, - FEAT_SVE_F32MM, - FEAT_SVE_F64MM, - FEAT_SVE2, - FEAT_SVE_AES, - FEAT_SVE_PMULL128, - FEAT_SVE_BITPERM, - FEAT_SVE_SHA3, - FEAT_SVE_SM4, - FEAT_SME, - FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, - FEAT_SB, - FEAT_PREDRES, - FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, - FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, - FEAT_WFXT, - FEAT_SME_F64, - FEAT_SME_I64, - FEAT_SME2, - FEAT_RCPC3, - FEAT_MOPS, - FEAT_MAX, - FEAT_EXT = 62, // Reserved to indicate presence of additional features field - // in __aarch64_cpu_features - FEAT_INIT // Used as flag of features initialization completion -}; +#include "AArch64CPUFeatures.inc" void __init_cpu_features(void); diff --git a/compiler-rt/test/builtins/TestCases/check-same-common-code.test b/compiler-rt/test/builtins/TestCases/check-same-common-code.test new file mode 100644 index 00000000000000..67fe900901763e --- /dev/null +++ b/compiler-rt/test/builtins/TestCases/check-same-common-code.test @@ -0,0 +1,5 @@ +; +; NOTE: if this test fails, please make sure the relevant copies are identical +; copies of each other. +; +; RUN: diff %crt_src/lib/builtins/cpu_model/AArch64CPUFeatures.inc %llvm_src/include/llvm/TargetParser/AArch64CPUFeatures.inc diff --git a/compiler-rt/test/builtins/lit.cfg.py b/compiler-rt/test/builtins/lit.cfg.py index 9bf7d82830ccdf..9300488c8428d8 100644 --- a/compiler-rt/test/builtins/lit.cfg.py +++ b/compiler-rt/test/builtins/lit.cfg.py @@ -9,7 +9,7 @@ config.test_source_root = os.path.dirname(__file__) # Test suffixes. 
-config.suffixes = [".c", ".cpp", ".m", ".mm"] +config.suffixes = [".c", ".cpp", ".m", ".mm", ".test"] extra_flags = ["-Wall"] if config.compiler_id == "GNU": # detect incorrect declarations of libgcc functions diff --git a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc new file mode 100644 index 00000000000000..e78bb88cfedf21 --- /dev/null +++ b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc @@ -0,0 +1,91 @@ +//===- AArch64CPUFeatures.inc - AArch64 CPU Features enum -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the CPUFeatures enum for AArch64 to facilitate better +// testing of this code between LLVM and compiler-rt, primarily that the files +// are an exact match. +// +// This file has two identical copies. The primary copy lives in LLVM and +// the other one sits in compiler-rt/lib/builtins/cpu_model directory. To make +// changes in this file, first modify the primary copy and copy it over to +// compiler-rt. compiler-rt tests will fail if the two files are not synced up. +// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64_CPU_FEATURS_INC_H +#define AARCH64_CPU_FEATURS_INC_H + +// Function Multi Versioning CPU features. 
+enum CPUFeatures { + FEAT_RNG, + FEAT_FLAGM, + FEAT_FLAGM2, + FEAT_FP16FML, + FEAT_DOTPROD, + FEAT_SM4, + FEAT_RDM, + FEAT_LSE, + FEAT_FP, + FEAT_SIMD, + FEAT_CRC, + FEAT_SHA1, + FEAT_SHA2, + FEAT_SHA3, + FEAT_AES, + FEAT_PMULL, + FEAT_FP16, + FEAT_DIT, + FEAT_DPB, + FEAT_DPB2, + FEAT_JSCVT, + FEAT_FCMA, + FEAT_RCPC, + FEAT_RCPC2, + FEAT_FRINTTS, + FEAT_DGH, + FEAT_I8MM, + FEAT_BF16, + FEAT_EBF16, + FEAT_RPRES, + FEAT_SVE, + FEAT_SVE_BF16, + FEAT_SVE_EBF16, + FEAT_SVE_I8MM, + FEAT_SVE_F32MM, + FEAT_SVE_F64MM, + FEAT_SVE2, + FEAT_SVE_AES, + FEAT_SVE_PMULL128, + FEAT_SVE_BITPERM, + FEAT_SVE_SHA3, + FEAT_SVE_SM4, + FEAT_SME, + FEAT_MEMTAG, + FEAT_MEMTAG2, + FEAT_MEMTAG3, + FEAT_SB, + FEAT_PREDRES, + FEAT_SSBS, + FEAT_SSBS2, + FEAT_BTI, + FEAT_LS64, + FEAT_LS64_V, + FEAT_LS64_ACCDATA, + FEAT_WFXT, + FEAT_SME_F64, + FEAT_SME_I64, + FEAT_SME2, + FEAT_RCPC3, + FEAT_MOPS, + FEAT_MAX, + FEAT_EXT = 62, // Reserved to indicate presence of additional features field + // in __aarch64_cpu_features + FEAT_INIT // Used as flag of features initialization completion +}; + +#endif diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index f47d9f1bde8803..13091748e091c6 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -34,74 +34,7 @@ namespace AArch64 { struct ArchInfo; struct CpuInfo; -// Function Multi Versioning CPU features. They must be kept in sync with -// compiler-rt enum CPUFeatures in lib/builtins/cpu_model/aarch64.c with -// FEAT_MAX as sentinel. 
-enum CPUFeatures { - FEAT_RNG, - FEAT_FLAGM, - FEAT_FLAGM2, - FEAT_FP16FML, - FEAT_DOTPROD, - FEAT_SM4, - FEAT_RDM, - FEAT_LSE, - FEAT_FP, - FEAT_SIMD, - FEAT_CRC, - FEAT_SHA1, - FEAT_SHA2, - FEAT_SHA3, - FEAT_AES, - FEAT_PMULL, - FEAT_FP16, - FEAT_DIT, - FEAT_DPB, - FEAT_DPB2, - FEAT_JSCVT, - FEAT_FCMA, - FEAT_RCPC, - FEAT_RCPC2, - FEAT_FRINTTS, - FEAT_DGH, - FEAT_I8MM, - FEAT_BF16, - FEAT_EBF16, - FEAT_RPRES, - FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, - FEAT_SVE_F32MM, - FEAT_SVE_F64MM, - FEAT_SVE2, - FEAT_SVE_AES, - FEAT_SVE_PMULL128, - FEAT_SVE_BITPERM, - FEAT_SVE_SHA3, - FEAT_SVE_SM4, - FEAT_SME, - FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, - FEAT_SB, - FEAT_PREDRES, - FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, - FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, - FEAT_WFXT, - FEAT_SME_F64, - FEAT_SME_I64, - FEAT_SME2, - FEAT_RCPC3, - FEAT_MOPS, - FEAT_MAX, - FEAT_EXT = 62, - FEAT_INIT -}; +#include "llvm/TargetParser/AArch64CPUFeatures.inc" static_assert(FEAT_MAX < 62, "Number of features in CPUFeatures are limited to 62 entries"); From 0f1da49b4d854ce7c6572000da3fb6cb0a1245d2 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Fri, 5 Jul 2024 20:20:51 +0200 Subject: [PATCH 22/67] [libc] Fix readlink tests on 32-bit systems (#97850) Use sizeof in a string literal instead of a CString so we get the right size when creating the buf array. We also now use strlen(FILENAME) to get the string lenght when calling readlink and readlinkat. 
--- libc/test/src/unistd/CMakeLists.txt | 2 ++ libc/test/src/unistd/readlink_test.cpp | 11 +++++++---- libc/test/src/unistd/readlinkat_test.cpp | 15 +++++++++------ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index de3e8d9ccbb626..f4f78b800987d4 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -262,6 +262,7 @@ add_libc_unittest( libc.include.unistd libc.src.errno.errno libc.src.unistd.readlink + libc.src.string.string_utils libc.src.unistd.symlink libc.src.unistd.unlink libc.src.__support.CPP.string_view @@ -278,6 +279,7 @@ add_libc_unittest( libc.include.fcntl libc.include.unistd libc.src.errno.errno + libc.src.string.string_utils libc.src.unistd.readlinkat libc.src.unistd.symlink libc.src.unistd.unlink diff --git a/libc/test/src/unistd/readlink_test.cpp b/libc/test/src/unistd/readlink_test.cpp index 20f3951349118a..0760850d9bae19 100644 --- a/libc/test/src/unistd/readlink_test.cpp +++ b/libc/test/src/unistd/readlink_test.cpp @@ -9,6 +9,7 @@ #include "src/__support/CPP/string_view.h" #include "src/errno/libc_errno.h" #include "src/unistd/readlink.h" +#include "src/string/string_utils.h" #include "src/unistd/symlink.h" #include "src/unistd/unlink.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -30,8 +31,9 @@ TEST(LlvmLibcReadlinkTest, CreateAndUnlink) { // 3. Cleanup the symlink created in step #1. 
ASSERT_THAT(LIBC_NAMESPACE::symlink(LINK_VAL, LINK), Succeeds(0)); - char buf[sizeof(LINK_VAL)]; - ssize_t len = LIBC_NAMESPACE::readlink(LINK, buf, sizeof(buf)); + char buf[sizeof(FILENAME)]; + ssize_t len = LIBC_NAMESPACE::readlink( + LINK, buf, LIBC_NAMESPACE::internal::string_length(FILENAME)); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(cpp::string_view(buf, len), cpp::string_view(LINK_VAL)); @@ -40,7 +42,8 @@ TEST(LlvmLibcReadlinkTest, CreateAndUnlink) { TEST(LlvmLibcReadlinkTest, ReadlinkInNonExistentPath) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; - char buf[8]; - ASSERT_THAT(LIBC_NAMESPACE::readlink("non-existent-link", buf, sizeof(buf)), + constexpr auto len = 8; + char buf[len]; + ASSERT_THAT(LIBC_NAMESPACE::readlink("non-existent-link", buf, len), Fails(ENOENT)); } diff --git a/libc/test/src/unistd/readlinkat_test.cpp b/libc/test/src/unistd/readlinkat_test.cpp index 39d81d9ba544a6..61e87731c9b9dd 100644 --- a/libc/test/src/unistd/readlinkat_test.cpp +++ b/libc/test/src/unistd/readlinkat_test.cpp @@ -9,6 +9,7 @@ #include "src/__support/CPP/string_view.h" #include "src/errno/libc_errno.h" #include "src/unistd/readlinkat.h" +#include "src/string/string_utils.h" #include "src/unistd/symlink.h" #include "src/unistd/unlink.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -32,8 +33,9 @@ TEST(LlvmLibcReadlinkatTest, CreateAndUnlink) { // 3. Cleanup the symlink created in step #1. 
ASSERT_THAT(LIBC_NAMESPACE::symlink(LINK_VAL, LINK), Succeeds(0)); - char buf[sizeof(LINK_VAL)]; - ssize_t len = LIBC_NAMESPACE::readlinkat(AT_FDCWD, LINK, buf, sizeof(buf)); + char buf[sizeof(FILENAME)]; + ssize_t len = LIBC_NAMESPACE::readlinkat( + AT_FDCWD, LINK, buf, LIBC_NAMESPACE::internal::string_length(FILENAME)); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(cpp::string_view(buf, len), cpp::string_view(LINK_VAL)); @@ -42,8 +44,9 @@ TEST(LlvmLibcReadlinkatTest, CreateAndUnlink) { TEST(LlvmLibcReadlinkatTest, ReadlinkInNonExistentPath) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; - char buf[8]; - ASSERT_THAT(LIBC_NAMESPACE::readlinkat(AT_FDCWD, "non-existent-link", buf, - sizeof(buf)), - Fails(ENOENT)); + constexpr auto len = 8; + char buf[len]; + ASSERT_THAT( + LIBC_NAMESPACE::readlinkat(AT_FDCWD, "non-existent-link", buf, len), + Fails(ENOENT)); } From 788731cdbd732180639988c9589adbe63bb28afa Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 5 Jul 2024 22:27:04 +0400 Subject: [PATCH 23/67] [clang] Implement P3144R2 "Deleting a Pointer to an Incomplete Type..." (#97733) This patch implements (not yet published) [P3144R2](https://wiki.edg.com/pub/Wg21stlouis2024/StrawPolls/p3144r2.pdf) "Deleting a Pointer to an Incomplete Type Should be Ill-formed". Wording changes (not yet merged into the working draft) read: > 7.6.2.9 [expr.delete] Delete > If the object being deleted has incomplete class type at the point of deletion and the complete class has a non-trivial destructor or a deallocation function, the behavior is undefined, the program is ill-formed. We preserve status quo of emitting a warning when deleting a pointer to incomplete type up to, and including, C++23, but make it ill-formed since C++26. Same goes for deleting pointers to `void`, which has been allowed as an extension. 
--- clang/docs/ReleaseNotes.rst | 4 +++ clang/include/clang/Basic/DiagnosticGroups.td | 2 ++ .../clang/Basic/DiagnosticSemaKinds.td | 5 +++- clang/lib/Sema/SemaExprCXX.cpp | 12 ++++++--- clang/test/CXX/drs/cwg5xx.cpp | 22 +++++++++------- clang/test/OpenMP/deferred-diags.cpp | 2 +- clang/test/SemaCXX/new-delete.cpp | 26 ++++++++++++------- clang/www/cxx_status.html | 2 +- 8 files changed, 51 insertions(+), 24 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 36cf615a4287cc..f6431a76b38de5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -272,6 +272,7 @@ C++2c Feature Support - Implemented `P2809R3: Trivial infinite loops are not Undefined Behavior `_. +- Implemented `P3144R2 Deleting a Pointer to an Incomplete Type Should be Ill-formed `_. Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -438,6 +439,9 @@ New Compiler Flags Matches MSVC behaviour by defining ``__STDC__`` to ``1`` when MSVC compatibility mode is used. It has no effect for C++ code. +- ``-Wc++2c-compat`` group was added to help migrating existing codebases + to C++26. 
+ Deprecated Compiler Flags ------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 9431eea1f6be22..1b25cf36dd4f81 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -420,6 +420,8 @@ def CXX20CompatPedantic : DiagGroup<"c++20-compat-pedantic", def : DiagGroup<"c++2a-compat", [CXX20Compat]>; def : DiagGroup<"c++2a-compat-pedantic", [CXX20CompatPedantic]>; +def CXX26Compat : DiagGroup<"c++2c-compat", [DeleteIncomplete]>; + def ExitTimeDestructors : DiagGroup<"exit-time-destructors">; def FlexibleArrayExtensions : DiagGroup<"flexible-array-extensions">; def FourByteMultiChar : DiagGroup<"four-char-constants">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3df64b2ecef1b2..44fd51ec9abc96 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7989,8 +7989,11 @@ def ext_delete_void_ptr_operand : ExtWarn< def err_ambiguous_delete_operand : Error< "ambiguous conversion of delete expression of type %0 to a pointer">; def warn_delete_incomplete : Warning< - "deleting pointer to incomplete type %0 may cause undefined behavior">, + "deleting pointer to incomplete type %0 is incompatible with C++2c" + " and may cause undefined behavior">, InGroup; +def err_delete_incomplete : Error< + "cannot delete pointer to incomplete type %0">; def err_delete_incomplete_class_type : Error< "deleting incomplete class type %0; no conversions to pointer type">; def err_delete_explicit_conversion : Error< diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 69074f92a0286b..fcf2189a308a86 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3719,8 +3719,11 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal, // The C++ standard bans deleting a 
pointer to a non-object type, which // effectively bans deletion of "void*". However, most compilers support // this, so we treat it as a warning unless we're in a SFINAE context. - Diag(StartLoc, diag::ext_delete_void_ptr_operand) - << Type << Ex.get()->getSourceRange(); + // But we still prohibit this since C++26. + Diag(StartLoc, LangOpts.CPlusPlus26 ? diag::err_delete_incomplete + : diag::ext_delete_void_ptr_operand) + << (LangOpts.CPlusPlus26 ? Pointee : Type) + << Ex.get()->getSourceRange(); } else if (Pointee->isFunctionType() || Pointee->isVoidType() || Pointee->isSizelessType()) { return ExprError(Diag(StartLoc, diag::err_delete_operand) @@ -3729,7 +3732,10 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal, // FIXME: This can result in errors if the definition was imported from a // module but is hidden. if (!RequireCompleteType(StartLoc, Pointee, - diag::warn_delete_incomplete, Ex.get())) { + LangOpts.CPlusPlus26 + ? diag::err_delete_incomplete + : diag::warn_delete_incomplete, + Ex.get())) { if (const RecordType *RT = PointeeElem->getAs()) PointeeRD = cast(RT->getDecl()); } diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp index 9d890f981348a7..6a0bb7a1966693 100644 --- a/clang/test/CXX/drs/cwg5xx.cpp +++ b/clang/test/CXX/drs/cwg5xx.cpp @@ -1,9 +1,10 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-11,cxx98-14,cxx98-17,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx98-11,cxx98-14,cxx98-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx98-14,cxx98-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx17,cxx98-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: 
%clang_cc1 -std=c++23 %s -verify=expected,since-cxx23,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-23,cxx98-11,cxx98-14,cxx98-17,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx98-23,cxx98-11,cxx98-14,cxx98-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx98-23,cxx98-14,cxx98-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx98-23,since-cxx17,cxx98-17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 %s -verify=expected,cxx98-23,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 %s -verify=expected,cxx98-23,since-cxx23,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx26,since-cxx23,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors #if __cplusplus == 199711L #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) @@ -901,7 +902,8 @@ namespace cwg573 { // cwg573: no void *d = reinterpret_cast(c); // cxx98-error@-1 {{cast between pointer-to-function and pointer-to-object is an extension}} void f() { delete a; } - // expected-error@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // cxx98-23-error@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // since-cxx26-error@-2 {{cannot delete pointer to incomplete type 'void'}} int n = d - a; // expected-error@-1 {{arithmetic on pointers to void}} // FIXME: This is ill-formed. 
@@ -1238,11 +1240,13 @@ namespace cwg599 { // cwg599: partial struct V { operator int*(); operator Fn*(); }; void f(void *p, void (*q)(), S s, T t, U u, V v) { delete p; - // expected-error@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // cxx98-23-error@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // since-cxx26-error@-2 {{cannot delete pointer to incomplete type 'void'}} delete q; // expected-error@-1 {{cannot delete expression of type 'void (*)()'}} delete s; - // expected-error@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // cxx98-23-error@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // since-cxx26-error@-2 {{cannot delete pointer to incomplete type 'void'}} delete t; // expected-error@-1 {{cannot delete expression of type 'T'}} // FIXME: This is valid, but is rejected due to a non-conforming GNU diff --git a/clang/test/OpenMP/deferred-diags.cpp b/clang/test/OpenMP/deferred-diags.cpp index a12f80309344c6..e31b99b8c88e45 100644 --- a/clang/test/OpenMP/deferred-diags.cpp +++ b/clang/test/OpenMP/deferred-diags.cpp @@ -41,7 +41,7 @@ namespace TestDeleteIncompleteClassDefinition { struct a; struct b { b() { - delete c; // expected-warning {{deleting pointer to incomplete type 'a' may cause undefined behavior}} + delete c; // expected-warning {{deleting pointer to incomplete type 'a' is incompatible with C++2c and may cause undefined behavior}} } a *c; }; diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index 1a99c6aac604f5..ec6ad43476f944 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -1,8 +1,10 @@ -// RUN: %clang_cc1 -fsyntax-only -verify=expected,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++98 -// RUN: %clang_cc1 -fsyntax-only -verify=expected,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++11 -// RUN: %clang_cc1 -fsyntax-only 
-verify=expected,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++14 -// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx17,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++17 -// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++20 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++98 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++11 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++14 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++17 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++20 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98-23,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++23 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,since-cxx26,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++2c // FIXME Location is (frontend) // cxx17-note@*:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} @@ -172,8 +174,12 @@ void bad_deletes() { delete 0; // expected-error {{cannot delete expression of type 'int'}} delete [0] (int*)0; // expected-error {{expected variable name or 'this' in lambda capture list}} - delete (void*)0; // expected-warning {{cannot delete expression with pointer-to-'void' type 'void *'}} - delete (T*)0; // expected-warning {{deleting pointer to incomplete type}} + delete (void*)0; + // cxx98-23-warning@-1 {{cannot delete expression with pointer-to-'void' type 'void *'}} + // since-cxx26-error@-2 {{cannot delete pointer to incomplete type 'void'}} + delete (T*)0; 
+ // cxx98-23-warning@-1 {{deleting pointer to incomplete type}} + // since-cxx26-error@-2 {{cannot delete pointer to incomplete type 'T'}} ::S::delete (int*)0; // expected-error {{expected unqualified-id}} } @@ -513,8 +519,10 @@ namespace DeleteIncompleteClass { namespace DeleteIncompleteClassPointerError { struct A; // expected-note {{forward declaration}} - void f(A *x) { 1+delete x; } // expected-warning {{deleting pointer to incomplete type}} \ - // expected-error {{invalid operands to binary expression}} + void f(A *x) { 1+delete x; } + // expected-error@-1 {{invalid operands to binary expression}} + // cxx98-23-warning@-2 {{deleting pointer to incomplete type}} + // since-cxx26-error@-3 {{cannot delete pointer to incomplete type 'A'}} } namespace PR10504 { diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index f12ce38ba3d79c..0c013e6d7cb58d 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -213,7 +213,7 @@

C++2c implementation status

Deleting a Pointer to an Incomplete Type Should be Ill-formed P3144R2 - No + Clang 19 Ordering of constraints involving fold expressions From fc665436626fa265ebb8d62b9f8a4cfab9b959d0 Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Fri, 5 Jul 2024 20:32:20 +0200 Subject: [PATCH 24/67] Revert "[libc] Fix readlink tests on 32-bit systems" (#97852) Reverts #97850 while I investigate the buildbot issue --- libc/test/src/unistd/CMakeLists.txt | 2 -- libc/test/src/unistd/readlink_test.cpp | 11 ++++------- libc/test/src/unistd/readlinkat_test.cpp | 15 ++++++--------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index f4f78b800987d4..de3e8d9ccbb626 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -262,7 +262,6 @@ add_libc_unittest( libc.include.unistd libc.src.errno.errno libc.src.unistd.readlink - libc.src.string.string_utils libc.src.unistd.symlink libc.src.unistd.unlink libc.src.__support.CPP.string_view @@ -279,7 +278,6 @@ add_libc_unittest( libc.include.fcntl libc.include.unistd libc.src.errno.errno - libc.src.string.string_utils libc.src.unistd.readlinkat libc.src.unistd.symlink libc.src.unistd.unlink diff --git a/libc/test/src/unistd/readlink_test.cpp b/libc/test/src/unistd/readlink_test.cpp index 0760850d9bae19..20f3951349118a 100644 --- a/libc/test/src/unistd/readlink_test.cpp +++ b/libc/test/src/unistd/readlink_test.cpp @@ -9,7 +9,6 @@ #include "src/__support/CPP/string_view.h" #include "src/errno/libc_errno.h" #include "src/unistd/readlink.h" -#include "src/string/string_utils.h" #include "src/unistd/symlink.h" #include "src/unistd/unlink.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -31,9 +30,8 @@ TEST(LlvmLibcReadlinkTest, CreateAndUnlink) { // 3. Cleanup the symlink created in step #1. 
ASSERT_THAT(LIBC_NAMESPACE::symlink(LINK_VAL, LINK), Succeeds(0)); - char buf[sizeof(FILENAME)]; - ssize_t len = LIBC_NAMESPACE::readlink( - LINK, buf, LIBC_NAMESPACE::internal::string_length(FILENAME)); + char buf[sizeof(LINK_VAL)]; + ssize_t len = LIBC_NAMESPACE::readlink(LINK, buf, sizeof(buf)); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(cpp::string_view(buf, len), cpp::string_view(LINK_VAL)); @@ -42,8 +40,7 @@ TEST(LlvmLibcReadlinkTest, CreateAndUnlink) { TEST(LlvmLibcReadlinkTest, ReadlinkInNonExistentPath) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; - constexpr auto len = 8; - char buf[len]; - ASSERT_THAT(LIBC_NAMESPACE::readlink("non-existent-link", buf, len), + char buf[8]; + ASSERT_THAT(LIBC_NAMESPACE::readlink("non-existent-link", buf, sizeof(buf)), Fails(ENOENT)); } diff --git a/libc/test/src/unistd/readlinkat_test.cpp b/libc/test/src/unistd/readlinkat_test.cpp index 61e87731c9b9dd..39d81d9ba544a6 100644 --- a/libc/test/src/unistd/readlinkat_test.cpp +++ b/libc/test/src/unistd/readlinkat_test.cpp @@ -9,7 +9,6 @@ #include "src/__support/CPP/string_view.h" #include "src/errno/libc_errno.h" #include "src/unistd/readlinkat.h" -#include "src/string/string_utils.h" #include "src/unistd/symlink.h" #include "src/unistd/unlink.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -33,9 +32,8 @@ TEST(LlvmLibcReadlinkatTest, CreateAndUnlink) { // 3. Cleanup the symlink created in step #1. 
ASSERT_THAT(LIBC_NAMESPACE::symlink(LINK_VAL, LINK), Succeeds(0)); - char buf[sizeof(FILENAME)]; - ssize_t len = LIBC_NAMESPACE::readlinkat( - AT_FDCWD, LINK, buf, LIBC_NAMESPACE::internal::string_length(FILENAME)); + char buf[sizeof(LINK_VAL)]; + ssize_t len = LIBC_NAMESPACE::readlinkat(AT_FDCWD, LINK, buf, sizeof(buf)); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(cpp::string_view(buf, len), cpp::string_view(LINK_VAL)); @@ -44,9 +42,8 @@ TEST(LlvmLibcReadlinkatTest, CreateAndUnlink) { TEST(LlvmLibcReadlinkatTest, ReadlinkInNonExistentPath) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; - constexpr auto len = 8; - char buf[len]; - ASSERT_THAT( - LIBC_NAMESPACE::readlinkat(AT_FDCWD, "non-existent-link", buf, len), - Fails(ENOENT)); + char buf[8]; + ASSERT_THAT(LIBC_NAMESPACE::readlinkat(AT_FDCWD, "non-existent-link", buf, + sizeof(buf)), + Fails(ENOENT)); } From 683c8e9913cd87e0b2a8e083298cd3ebc67923fe Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 5 Jul 2024 20:45:32 +0200 Subject: [PATCH 25/67] [libc++] Adds a missing include. This should fix the clang modules with the locales disabled build in the CI. 
--- libcxx/include/complex | 1 + 1 file changed, 1 insertion(+) diff --git a/libcxx/include/complex b/libcxx/include/complex index 69a9fcce6f7efa..22271acaf7358d 100644 --- a/libcxx/include/complex +++ b/libcxx/include/complex @@ -261,6 +261,7 @@ template complex tanh (const complex&); #include <__fwd/tuple.h> #include <__tuple/tuple_element.h> #include <__tuple/tuple_size.h> +#include <__type_traits/conditional.h> #include <__utility/move.h> #include #include From 1ed84a862f9ce3c60251968f23a5405f06458975 Mon Sep 17 00:00:00 2001 From: PeterChou1 Date: Fri, 5 Jul 2024 15:27:25 -0400 Subject: [PATCH 26/67] [clang-doc] fix bug introduced by asset test (#97540) --- clang-tools-extra/clang-doc/tool/CMakeLists.txt | 2 +- llvm/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-doc/tool/CMakeLists.txt b/clang-tools-extra/clang-doc/tool/CMakeLists.txt index e93a5728d6b6b0..19c17a8f3a51f7 100644 --- a/clang-tools-extra/clang-doc/tool/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/tool/CMakeLists.txt @@ -25,7 +25,7 @@ set(assets ) set(asset_dir "${CMAKE_CURRENT_SOURCE_DIR}/../assets") -set(resource_dir "${CMAKE_BINARY_DIR}/share/clang-doc") +set(resource_dir "${LLVM_SHARE_OUTPUT_INTDIR}/clang-doc") set(out_files) function(copy_files_to_dst src_dir dst_dir file) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 12618966c4adfd..cbbf84ec286ed2 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -446,6 +446,7 @@ mark_as_advanced(LLVM_EXAMPLES_INSTALL_DIR) # They are used as destination of target generators. set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}) +set(LLVM_SHARE_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/share) if(WIN32 OR CYGWIN) # DLL platform -- put DLLs into bin. 
set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) From e70f376b25ea96f3b0db75ff77ae1a58d53f2119 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 5 Jul 2024 13:19:16 -0700 Subject: [PATCH 27/67] [MCParser] Simplify macro-like body expansion Make it easy to support argument expansion in the altmacro mode. --- llvm/lib/MC/MCParser/AsmParser.cpp | 179 ++++++++++++----------------- 1 file changed, 73 insertions(+), 106 deletions(-) diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 13879220a25e75..4b7c7f07fed990 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -2505,129 +2505,96 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, // A macro without parameters is handled differently on Darwin: // gas accepts no arguments and does no substitutions StringRef Body = Macro.Body; - while (!Body.empty()) { - // Scan for the next substitution. - std::size_t End = Body.size(), Pos = 0; - for (; Pos != End; ++Pos) { - // Check for a substitution or escape. - if (IsDarwin && !NParameters) { - // This macro has no parameters, look for $0, $1, etc. - if (Body[Pos] != '$' || Pos + 1 == End) - continue; + size_t I = 0, End = Body.size(); + while (I != End) { + if (Body[I] == '\\' && I + 1 != End) { + // Check for \@ and \+ pseudo variables. 
+ if (EnableAtPseudoVariable && Body[I + 1] == '@') { + OS << NumOfMacroInstantiations; + I += 2; + continue; + } + if (Body[I + 1] == '+') { + OS << Macro.Count; + I += 2; + continue; + } + if (Body[I + 1] == '(' && Body[I + 2] == ')') { + I += 3; + continue; + } - char Next = Body[Pos + 1]; - if (Next == '$' || Next == 'n' || - isdigit(static_cast(Next))) + size_t Pos = ++I; + while (I != End && isIdentifierChar(Body[I])) + ++I; + StringRef Argument(Body.data() + Pos, I - Pos); + unsigned Index = 0; + for (; Index < NParameters; ++Index) + if (Parameters[Index].Name == Argument) break; + if (Index == NParameters) { + OS << '\\' << Argument; } else { - // This macro has parameters, look for \foo, \bar, etc. - if (Body[Pos] == '\\' && Pos + 1 != End) - break; + bool VarargParameter = HasVararg && Index == (NParameters - 1); + for (const AsmToken &Token : A[Index]) { + // For altmacro mode, you can write '%expr'. + // The prefix '%' evaluates the expression 'expr' + // and uses the result as a string (e.g. replace %(1+2) with the + // string "3"). + // Here, we identify the integer token which is the result of the + // absolute expression evaluation and replace it with its string + // representation. + if (AltMacroMode && Token.getString().front() == '%' && + Token.is(AsmToken::Integer)) + // Emit an integer value to the buffer. + OS << Token.getIntVal(); + // Only Token that was validated as a string and begins with '<' + // is considered altMacroString!!! + else if (AltMacroMode && Token.getString().front() == '<' && + Token.is(AsmToken::String)) { + OS << angleBracketString(Token.getStringContents()); + } + // We expect no quotes around the string's contents when + // parsing for varargs. + else if (Token.isNot(AsmToken::String) || VarargParameter) + OS << Token.getString(); + else + OS << Token.getStringContents(); + } } + continue; } - // Add the prefix. - OS << Body.slice(0, Pos); - - // Check if we reached the end. 
- if (Pos == End) - break; - - if (IsDarwin && !NParameters) { - switch (Body[Pos + 1]) { + if (Body[I] == '$' && I + 1 != End && IsDarwin && !NParameters) { + // This macro has no parameters, look for $0, $1, etc. + switch (Body[I + 1]) { // $$ => $ case '$': OS << '$'; - break; - + I += 2; + continue; // $n => number of arguments case 'n': OS << A.size(); - break; - - // $[0-9] => argument + I += 2; + continue; default: { - // Missing arguments are ignored. - unsigned Index = Body[Pos + 1] - '0'; - if (Index >= A.size()) + if (!isDigit(Body[I + 1])) break; - - // Otherwise substitute with the token values, with spaces eliminated. - for (const AsmToken &Token : A[Index]) - OS << Token.getString(); - break; - } - } - Pos += 2; - } else { - // Check for \@ and \+ pseudo variables. - unsigned I = Pos + 1; - if (I + 1 != End) { - if (EnableAtPseudoVariable && Body[I] == '@') { - ++I; - } else if (Body[I] == '+') { - ++I; - } else { - while (isIdentifierChar(Body[I]) && I + 1 != End) - ++I; - } - } - - const char *Begin = Body.data() + Pos + 1; - StringRef Argument(Begin, I - (Pos + 1)); - unsigned Index = 0; - - if (Argument == "@") { - OS << NumOfMacroInstantiations; - Pos += 2; - } else if (Argument == "+") { - OS << Macro.Count; - Pos += 2; - } else { - for (; Index < NParameters; ++Index) - if (Parameters[Index].Name == Argument) - break; - - if (Index == NParameters) { - if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')') - Pos += 3; - else { - OS << '\\' << Argument; - Pos = I; - } - } else { - bool VarargParameter = HasVararg && Index == (NParameters - 1); + // $[0-9] => argument + // Missing arguments are ignored. + unsigned Index = Body[I + 1] - '0'; + if (Index < A.size()) for (const AsmToken &Token : A[Index]) - // For altmacro mode, you can write '%expr'. - // The prefix '%' evaluates the expression 'expr' - // and uses the result as a string (e.g. replace %(1+2) with the - // string "3"). 
- // Here, we identify the integer token which is the result of the - // absolute expression evaluation and replace it with its string - // representation. - if (AltMacroMode && Token.getString().front() == '%' && - Token.is(AsmToken::Integer)) - // Emit an integer value to the buffer. - OS << Token.getIntVal(); - // Only Token that was validated as a string and begins with '<' - // is considered altMacroString!!! - else if (AltMacroMode && Token.getString().front() == '<' && - Token.is(AsmToken::String)) { - OS << angleBracketString(Token.getStringContents()); - } - // We expect no quotes around the string's contents when - // parsing for varargs. - else if (Token.isNot(AsmToken::String) || VarargParameter) - OS << Token.getString(); - else - OS << Token.getStringContents(); - - Pos += 1 + Argument.size(); - } + OS << Token.getString(); + I += 2; + continue; + } } } - // Update the scan point. - Body = Body.substr(Pos); + + OS << Body[I]; + ++I; } ++Macro.Count; From f6ae0d302f09f62b852a29f6dbfdbb4b9c2affb7 Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Thu, 4 Jul 2024 18:02:16 +0200 Subject: [PATCH 28/67] [CodeGen] Pre-commit test case related to ComputeNumSignBits for SHL (#97695) Adding test cases aiming at showing possibility to look through ZERO_EXTEND/ANY_EXTEND when computing number of sign bits for an SHL node. If all extended bits are shifted out we can analyze the operand that is extended. 
--- .../CodeGen/X86/computenumsignbits-shl.ll | 191 ++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 llvm/test/CodeGen/X86/computenumsignbits-shl.ll diff --git a/llvm/test/CodeGen/X86/computenumsignbits-shl.ll b/llvm/test/CodeGen/X86/computenumsignbits-shl.ll new file mode 100644 index 00000000000000..5799bb653ebc66 --- /dev/null +++ b/llvm/test/CodeGen/X86/computenumsignbits-shl.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64 + +; Verify that we can look through a ZERO_EXTEND/ANY_EXTEND when doing +; ComputeNumSignBits for SHL. +; We use the (sshlsat x, c) -> (shl x, c) fold as verification. +; That fold should happen if c is less than the number of sign bits in x + +define void @computeNumSignBits_shl_zext_1(i8 %x, ptr %p) nounwind { +; X64-LABEL: computeNumSignBits_shl_zext_1: +; X64: # %bb.0: +; X64-NEXT: sarb $5, %dil +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shll $10, %ecx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: testw %cx, %cx +; X64-NEXT: sets %dl +; X64-NEXT: addl $32767, %edx # imm = 0x7FFF +; X64-NEXT: movl %eax, %edi +; X64-NEXT: shll $11, %edi +; X64-NEXT: movswl %di, %r8d +; X64-NEXT: shrl %r8d +; X64-NEXT: cmpw %r8w, %cx +; X64-NEXT: cmovnel %edx, %edi +; X64-NEXT: movw %di, (%rsi) +; X64-NEXT: movl %eax, %edi +; X64-NEXT: shll $12, %edi +; X64-NEXT: movswl %di, %r8d +; X64-NEXT: shrl $2, %r8d +; X64-NEXT: cmpw %r8w, %cx +; X64-NEXT: cmovnel %edx, %edi +; X64-NEXT: movw %di, (%rsi) +; X64-NEXT: shll $13, %eax +; X64-NEXT: movswl %ax, %edi +; X64-NEXT: shrl $3, %edi +; X64-NEXT: cmpw %di, %cx +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: movw %ax, (%rsi) +; X64-NEXT: retq + %ashr = ashr i8 %x, 5 + %zext = zext i8 %ashr to i16 + %nsb4 = shl i16 %zext, 10 + ; Expecting (sshlsat x, c) -> (shl x, c) fold. 
+ %tmp1 = call i16 @llvm.sshl.sat.i16(i16 %nsb4, i16 1) + store volatile i16 %tmp1, ptr %p + ; Expecting (sshlsat x, c) -> (shl x, c) fold. + %tmp2 = call i16 @llvm.sshl.sat.i16(i16 %nsb4, i16 2) + store volatile i16 %tmp2, ptr %p + ; Expecting (sshlsat x, c) -> (shl x, c) fold. + %tmp3 = call i16 @llvm.sshl.sat.i16(i16 %nsb4, i16 3) + store volatile i16 %tmp3, ptr %p + ret void +} + +define void @computeNumSignBits_shl_zext_2(i8 %x, ptr %p) nounwind { +; X64-LABEL: computeNumSignBits_shl_zext_2: +; X64: # %bb.0: +; X64-NEXT: sarb $5, %dil +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shll $10, %ecx +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: testw %cx, %cx +; X64-NEXT: sets %dl +; X64-NEXT: addl $32767, %edx # imm = 0x7FFF +; X64-NEXT: shll $14, %eax +; X64-NEXT: movswl %ax, %edi +; X64-NEXT: shrl $4, %edi +; X64-NEXT: cmpw %di, %cx +; X64-NEXT: cmovnel %edx, %eax +; X64-NEXT: movw %ax, (%rsi) +; X64-NEXT: retq + %ashr = ashr i8 %x, 5 + %zext = zext i8 %ashr to i16 + %nsb4 = shl i16 %zext, 10 + ; 4 sign bits. Not expecting (sshlsat x, c) -> (shl x, c) fold. 
+ %tmp4 = call i16 @llvm.sshl.sat.i16(i16 %nsb4, i16 4) + store volatile i16 %tmp4, ptr %p + ret void +} + +define void @computeNumSignBits_shl_zext_vec_1(<2 x i8> %x, ptr %p) nounwind { +; X64-LABEL: computeNumSignBits_shl_zext_vec_1: +; X64: # %bb.0: +; X64-NEXT: psrlw $5, %xmm0 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: psubb %xmm1, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1024,4096,u,u,u,u,u,u] +; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pcmpgtw %xmm0, %xmm1 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: paddw %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: psraw $1, %xmm3 +; X64-NEXT: pcmpeqw %xmm0, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm0 +; X64-NEXT: pandn %xmm1, %xmm0 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: por %xmm0, %xmm3 +; X64-NEXT: movd %xmm3, (%rdi) +; X64-NEXT: retq + %ashr = ashr <2 x i8> %x, + %zext = zext <2 x i8> %ashr to <2 x i16> + %nsb4_2 = shl <2 x i16> %zext, + ; Expecting (sshlsat x, c) -> (shl x, c) fold. 
+ %tmp1 = call <2 x i16> @llvm.sshl.sat.v2i16(<2 x i16> %nsb4_2, <2 x i16> ) + store volatile <2 x i16> %tmp1, ptr %p + ret void +} + +define void @computeNumSignBits_shl_zext_vec_2(<2 x i8> %x, ptr %p) nounwind { +; X64-LABEL: computeNumSignBits_shl_zext_vec_2: +; X64: # %bb.0: +; X64-NEXT: psrlw $5, %xmm0 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: psubb %xmm1, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1024,4096,u,u,u,u,u,u] +; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pcmpgtw %xmm0, %xmm1 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: psllw $2, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: psraw $2, %xmm3 +; X64-NEXT: pcmpeqw %xmm0, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm0 +; X64-NEXT: pandn %xmm1, %xmm0 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: por %xmm0, %xmm3 +; X64-NEXT: movd %xmm3, (%rdi) +; X64-NEXT: retq + %ashr = ashr <2 x i8> %x, + %zext = zext <2 x i8> %ashr to <2 x i16> + %nsb4_2 = shl <2 x i16> %zext, + ; Not expecting (sshlsat x, c) -> (shl x, c) fold. + ; Because only 2 sign bits in element 1. 
+ %tmp1 = call <2 x i16> @llvm.sshl.sat.v2i16(<2 x i16> %nsb4_2, <2 x i16> ) + store volatile <2 x i16> %tmp1, ptr %p + ret void +} + +define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind { +; X64-LABEL: computeNumSignBits_shl_zext_vec_3: +; X64: # %bb.0: +; X64-NEXT: psrlw $5, %xmm0 +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: psubb %xmm1, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,u,u,u,u,u,u] +; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pcmpgtw %xmm0, %xmm1 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: por %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm0, %xmm2 +; X64-NEXT: paddw %xmm0, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: psraw $1, %xmm3 +; X64-NEXT: pcmpeqw %xmm0, %xmm3 +; X64-NEXT: movdqa %xmm3, %xmm0 +; X64-NEXT: pandn %xmm1, %xmm0 +; X64-NEXT: pand %xmm2, %xmm3 +; X64-NEXT: por %xmm0, %xmm3 +; X64-NEXT: movd %xmm3, (%rdi) +; X64-NEXT: retq + %ashr = ashr <2 x i8> %x, + %zext = zext <2 x i8> %ashr to <2 x i16> + %nsb1_2 = shl <2 x i16> %zext, + ; Not expecting (sshlsat x, c) -> (shl x, c) fold. 
+ ; Because all sign bits shifted out for element 0 + %tmp1 = call <2 x i16> @llvm.sshl.sat.v2i16(<2 x i16> %nsb1_2, <2 x i16> ) + store volatile <2 x i16> %tmp1, ptr %p + ret void +} From c2fbc701aaf826e2015a5dcab36e3ba792e7da7f Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Thu, 4 Jul 2024 10:34:04 +0200 Subject: [PATCH 29/67] [SelectionDAG] Let ComputeKnownSignBits handle (shl (ext X), C) (#97695) Add simple support for looking through ZEXT/ANYEXT/SEXT when doing ComputeKnownSignBits for SHL. This is valid for the case when all extended bits are shifted out, because then the number of sign bits can be found by analysing the EXT operand. A future improvement could be to pass along the "shifted left by" information in the recursive calls to ComputeKnownSignBits. Allowing us to handle this more generically. --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 29 +++++++++-- ...msignbits-shl.ll => known-signbits-shl.ll} | 49 +++---------------- 2 files changed, 33 insertions(+), 45 deletions(-) rename llvm/test/CodeGen/X86/{computenumsignbits-shl.ll => known-signbits-shl.ll} (79%) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 27c297af39e8cb..943d2ddc64246a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4615,12 +4615,33 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, Tmp = std::min(Tmp + *ShAmt, VTBits); return Tmp; case ISD::SHL: - if (std::optional ShAmt = - getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { + if (std::optional ShAmtRange = + getValidShiftAmountRange(Op, DemandedElts, Depth + 1)) { + uint64_t MaxShAmt = ShAmtRange->getUnsignedMax().getZExtValue(); + uint64_t MinShAmt = ShAmtRange->getUnsignedMin().getZExtValue(); + // Try to look through ZERO/SIGN/ANY_EXTEND. 
If all extended bits are + // shifted out, then we can compute the number of sign bits for the + // operand being extended. A future improvement could be to pass along the + // "shifted left by" information in the recursive calls to + // ComputeKnownSignBits. Allowing us to handle this more generically. + if (ISD::isExtOpcode(Op.getOperand(0).getOpcode())) { + SDValue Ext = Op.getOperand(0); + EVT ExtVT = Ext.getValueType(); + SDValue Extendee = Ext.getOperand(0); + EVT ExtendeeVT = Extendee.getValueType(); + uint64_t SizeDifference = + ExtVT.getScalarSizeInBits() - ExtendeeVT.getScalarSizeInBits(); + if (SizeDifference <= MinShAmt) { + Tmp = SizeDifference + + ComputeNumSignBits(Extendee, DemandedElts, Depth + 1); + if (MaxShAmt < Tmp) + return Tmp - MaxShAmt; + } + } // shl destroys sign bits, ensure it doesn't shift out all sign bits. Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); - if (*ShAmt < Tmp) - return Tmp - *ShAmt; + if (MaxShAmt < Tmp) + return Tmp - MaxShAmt; } break; case ISD::AND: diff --git a/llvm/test/CodeGen/X86/computenumsignbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll similarity index 79% rename from llvm/test/CodeGen/X86/computenumsignbits-shl.ll rename to llvm/test/CodeGen/X86/known-signbits-shl.ll index 5799bb653ebc66..473fecc307ed4e 100644 --- a/llvm/test/CodeGen/X86/computenumsignbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -12,30 +12,12 @@ define void @computeNumSignBits_shl_zext_1(i8 %x, ptr %p) nounwind { ; X64-NEXT: sarb $5, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shll $10, %ecx -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: testw %cx, %cx -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %edi -; X64-NEXT: shll $11, %edi -; X64-NEXT: movswl %di, %r8d -; X64-NEXT: shrl %r8d -; X64-NEXT: cmpw %r8w, %cx -; X64-NEXT: cmovnel %edx, %edi -; X64-NEXT: movw %di, (%rsi) -; X64-NEXT: movl %eax, %edi -; X64-NEXT: shll 
$12, %edi -; X64-NEXT: movswl %di, %r8d -; X64-NEXT: shrl $2, %r8d -; X64-NEXT: cmpw %r8w, %cx -; X64-NEXT: cmovnel %edx, %edi -; X64-NEXT: movw %di, (%rsi) +; X64-NEXT: shll $11, %ecx +; X64-NEXT: movw %cx, (%rsi) +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shll $12, %ecx +; X64-NEXT: movw %cx, (%rsi) ; X64-NEXT: shll $13, %eax -; X64-NEXT: movswl %ax, %edi -; X64-NEXT: shrl $3, %edi -; X64-NEXT: cmpw %di, %cx -; X64-NEXT: cmovnel %edx, %eax ; X64-NEXT: movw %ax, (%rsi) ; X64-NEXT: retq %ashr = ashr i8 %x, 5 @@ -88,24 +70,9 @@ define void @computeNumSignBits_shl_zext_vec_1(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: psubb %xmm1, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1024,4096,u,u,u,u,u,u] -; X64-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: pcmpgtw %xmm0, %xmm1 -; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: por %xmm2, %xmm1 -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: paddw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm3 -; X64-NEXT: psraw $1, %xmm3 -; X64-NEXT: pcmpeqw %xmm0, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm0 -; X64-NEXT: pandn %xmm1, %xmm0 -; X64-NEXT: pand %xmm2, %xmm3 -; X64-NEXT: por %xmm0, %xmm3 -; X64-NEXT: movd %xmm3, (%rdi) +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2048,8192,u,u,u,u,u,u] +; X64-NEXT: movd %xmm0, (%rdi) ; X64-NEXT: retq %ashr = ashr <2 x i8> %x, %zext = zext <2 x i8> %ashr to <2 x i16> From ac03ae30cf2b6465ea8f117dfa74ba6f670f6258 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 5 Jul 2024 21:41:30 +0100 Subject: [PATCH 
30/67] [LV] Preserve LAA in LoopVectorize (NFCI). LoopVectorize already always preserves DT, LI and SCEV. If any changes get made to the CFG, cached LAA info for loops are cleared. LoopAccessAnalysis also implements ::invalidate to clear the analysis if SE, DT or LI gets invalidated. Hence it should be safe to preserve LAA and save a small amount of compile-time. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 + .../Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 56fb8a10d7334a..1423deb5a73f9a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -10258,6 +10258,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve(); PA.preserve(); PA.preserve(); + PA.preserve(); if (Result.MadeCFGChange) { // Making CFG changes likely means a loop got vectorized. 
Indicate that diff --git a/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll b/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll index c78e005311ef26..48fa198620d465 100644 --- a/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll @@ -12,7 +12,7 @@ define i32 @novect(ptr %p) { ; CHECK-NOT: Invalidating analysis: BranchProbabilityAnalysis on novect ; CHECK-NOT: Invalidating analysis: BlockFrequencyAnalysis on novect ; CHECK: Invalidating analysis: DemandedBitsAnalysis on novect -; CHECK: Invalidating analysis: LoopAccessAnalysis on novect +; CHECK-NOT: Invalidating analysis: LoopAccessAnalysis on novect ; CHECK: Running pass: JumpThreadingPass on novect ; CHECK: entry: From 812f9e81d2f75c874301a8f5df25d3de7616a9c5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 5 Jul 2024 14:00:26 -0700 Subject: [PATCH 31/67] [MCParser] .altmacro: ignore & after a token --- llvm/lib/MC/MCParser/AsmParser.cpp | 2 ++ llvm/test/MC/AsmParser/altmacro_expression.s | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 4b7c7f07fed990..bf3061ad541e34 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -2528,6 +2528,8 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, while (I != End && isIdentifierChar(Body[I])) ++I; StringRef Argument(Body.data() + Pos, I - Pos); + if (AltMacroMode && I != End && Body[I] == '&') + ++I; unsigned Index = 0; for (; Index < NParameters; ++Index) if (Parameters[Index].Name == Argument) diff --git a/llvm/test/MC/AsmParser/altmacro_expression.s b/llvm/test/MC/AsmParser/altmacro_expression.s index 58d8b486cf85dd..cee0a6872541a9 100644 --- a/llvm/test/MC/AsmParser/altmacro_expression.s +++ b/llvm/test/MC/AsmParser/altmacro_expression.s @@ -3,10 +3,12 @@ # Checking that the '%' 
was evaluated as a string first # In a fail scenario: The asmprint will print: addl $%(1+4), %eax -# CHECK: addl $5, %eax +# CHECK: addl $5, %eax +# CHECK-NEXT: addl $5, %eax .altmacro .macro percent_expr arg addl $\arg, %eax + addl $\arg&, %eax .endm percent_expr %(1+4) From f8b1ca4992a22b4b65282c09dd6f07a1a2839070 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 5 Jul 2024 14:08:13 -0700 Subject: [PATCH 32/67] [MCParser] .altmacro: Support argument expansion not preceded by \ In the .altmacro mode, an argument can be expanded even if not preceded by \ --- llvm/lib/MC/MCParser/AsmParser.cpp | 79 +++++++++++++++++---------- llvm/test/MC/AsmParser/altmacro-arg.s | 22 ++++++++ 2 files changed, 71 insertions(+), 30 deletions(-) create mode 100644 llvm/test/MC/AsmParser/altmacro-arg.s diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index bf3061ad541e34..707edb0481a619 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -2500,7 +2500,34 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, ArrayRef A, bool EnableAtPseudoVariable) { unsigned NParameters = Parameters.size(); - bool HasVararg = NParameters ? Parameters.back().Vararg : false; + auto expandArg = [&](unsigned Index) { + bool HasVararg = NParameters ? Parameters.back().Vararg : false; + bool VarargParameter = HasVararg && Index == (NParameters - 1); + for (const AsmToken &Token : A[Index]) + // For altmacro mode, you can write '%expr'. + // The prefix '%' evaluates the expression 'expr' + // and uses the result as a string (e.g. replace %(1+2) with the + // string "3"). + // Here, we identify the integer token which is the result of the + // absolute expression evaluation and replace it with its string + // representation. + if (AltMacroMode && Token.getString().front() == '%' && + Token.is(AsmToken::Integer)) + // Emit an integer value to the buffer. 
+ OS << Token.getIntVal(); + // Only Token that was validated as a string and begins with '<' + // is considered altMacroString!!! + else if (AltMacroMode && Token.getString().front() == '<' && + Token.is(AsmToken::String)) { + OS << angleBracketString(Token.getStringContents()); + } + // We expect no quotes around the string's contents when + // parsing for varargs. + else if (Token.isNot(AsmToken::String) || VarargParameter) + OS << Token.getString(); + else + OS << Token.getStringContents(); + }; // A macro without parameters is handled differently on Darwin: // gas accepts no arguments and does no substitutions @@ -2534,36 +2561,10 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, for (; Index < NParameters; ++Index) if (Parameters[Index].Name == Argument) break; - if (Index == NParameters) { + if (Index == NParameters) OS << '\\' << Argument; - } else { - bool VarargParameter = HasVararg && Index == (NParameters - 1); - for (const AsmToken &Token : A[Index]) { - // For altmacro mode, you can write '%expr'. - // The prefix '%' evaluates the expression 'expr' - // and uses the result as a string (e.g. replace %(1+2) with the - // string "3"). - // Here, we identify the integer token which is the result of the - // absolute expression evaluation and replace it with its string - // representation. - if (AltMacroMode && Token.getString().front() == '%' && - Token.is(AsmToken::Integer)) - // Emit an integer value to the buffer. - OS << Token.getIntVal(); - // Only Token that was validated as a string and begins with '<' - // is considered altMacroString!!! - else if (AltMacroMode && Token.getString().front() == '<' && - Token.is(AsmToken::String)) { - OS << angleBracketString(Token.getStringContents()); - } - // We expect no quotes around the string's contents when - // parsing for varargs. 
- else if (Token.isNot(AsmToken::String) || VarargParameter) - OS << Token.getString(); - else - OS << Token.getStringContents(); - } - } + else + expandArg(Index); continue; } @@ -2595,6 +2596,24 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, MCAsmMacro &Macro, } } + if (AltMacroMode && isIdentifierChar(Body[I])) { + size_t Len = 1; + while (I + Len != End && isIdentifierChar(Body[I + Len])) + ++Len; + StringRef Argument(Body.data() + I, Len); + unsigned Index = 0; + for (; Index != NParameters; ++Index) + if (Parameters[Index].Name == Argument) + break; + if (Index != NParameters) { + expandArg(Index); + I += Len; + if (I != End && Body[I] == '&') + ++I; + continue; + } + } + OS << Body[I]; ++I; } diff --git a/llvm/test/MC/AsmParser/altmacro-arg.s b/llvm/test/MC/AsmParser/altmacro-arg.s new file mode 100644 index 00000000000000..262c5eac832e0c --- /dev/null +++ b/llvm/test/MC/AsmParser/altmacro-arg.s @@ -0,0 +1,22 @@ +## Arguments can be expanded even if they are not preceded by \ +# RUN: llvm-mc -triple=x86_64 %s | FileCheck %s + +# CHECK: 1 1 1a +# CHECK-NEXT: 1 2 1a 2b +# CHECK-NEXT: \$b \$b +.altmacro +.irp ._a,1 + .print "\._a \._a& ._a&a" + .irp $b,2 + .print "\._a \$b ._a&a $b&b" + .endr + .print "\$b \$b&" +.endr + +# CHECK: 1 1& ._a&a +# CHECK-NEXT: \$b \$b& +.noaltmacro +.irp ._a,1 + .print "\._a \._a& ._a&a" + .print "\$b \$b&" +.endr From 3a4970652902dbdac0cef66738b85e695b67338c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 5 Jul 2024 14:27:51 -0700 Subject: [PATCH 33/67] [AArch64,test] Improve ilp32-diagnostics.s --- llvm/test/MC/AArch64/ilp32-diagnostics.s | 186 ++++++++++------------- 1 file changed, 82 insertions(+), 104 deletions(-) diff --git a/llvm/test/MC/AArch64/ilp32-diagnostics.s b/llvm/test/MC/AArch64/ilp32-diagnostics.s index 4ca15f160418da..8a3bc1398e0429 100644 --- a/llvm/test/MC/AArch64/ilp32-diagnostics.s +++ b/llvm/test/MC/AArch64/ilp32-diagnostics.s @@ -1,105 +1,83 @@ // RUN: not llvm-mc -triple 
aarch64-none-linux-gnu_ilp32 \ -// RUN: < %s 2> %t2 -filetype=obj >/dev/null -// RUN: FileCheck --check-prefix=CHECK-ERROR %s < %t2 - - .xword sym-. -// CHECK-ERROR: error: ILP32 8 byte PC relative data relocation not supported (LP64 eqv: PREL64) -// CHECK-ERROR: ^ - - .xword sym+16 -// CHECK-ERROR: error: ILP32 8 byte absolute data relocation not supported (LP64 eqv: ABS64) -// CHECK-ERROR: ^ - - .xword sym@AUTH(da,42) -// CHECK-ERROR: error: ILP32 8 byte absolute data relocation not supported (LP64 eqv: AUTH_ABS64) -// CHECK-ERROR: ^ - - .xword sym@AUTH(da,42,addr) -// CHECK-ERROR: error: ILP32 8 byte absolute data relocation not supported (LP64 eqv: AUTH_ABS64) -// CHECK-ERROR: ^ - - movz x7, #:abs_g3:some_label -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G3) -// CHECK-ERROR: movz x7, #:abs_g3:some_label -// CHECK-ERROR: ^ - - movz x3, #:abs_g2:some_label -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G2) -// CHECK-ERROR: movz x3, #:abs_g2:some_label -// CHECK-ERROR: ^ - - movz x19, #:abs_g2_s:some_label -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_SABS_G2) -// CHECK-ERROR: movz x19, #:abs_g2_s:some_label -// CHECK-ERROR: ^ - - movk x5, #:abs_g2_nc:some_label -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G2_NC) -// CHECK-ERROR: movk x5, #:abs_g2_nc:some_label -// CHECK-ERROR: ^ - - movz x19, #:abs_g1_s:some_label -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_SABS_G1) -// CHECK-ERROR: movz x19, #:abs_g1_s:some_label -// CHECK-ERROR: ^ - - movk x5, #:abs_g1_nc:some_label -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G1_NC) -// CHECK-ERROR: movk x5, #:abs_g1_nc:some_label -// CHECK-ERROR: ^ - - movz x3, #:dtprel_g2:var -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLD_MOVW_DTPREL_G2) 
-// CHECK-ERROR: movz x3, #:dtprel_g2:var -// CHECK-ERROR: ^ - - movk x9, #:dtprel_g1_nc:var -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLD_MOVW_DTPREL_G1_NC) -// CHECK-ERROR: movk x9, #:dtprel_g1_nc:var -// CHECK-ERROR: ^ - - movz x3, #:tprel_g2:var -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLE_MOVW_TPREL_G2) -// CHECK-ERROR: movz x3, #:tprel_g2:var -// CHECK-ERROR: ^ - - movk x9, #:tprel_g1_nc:var -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLE_MOVW_TPREL_G1_NC) -// CHECK-ERROR: movk x9, #:tprel_g1_nc:var -// CHECK-ERROR: ^ - - movz x15, #:gottprel_g1:var -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSIE_MOVW_GOTTPREL_G1) -// CHECK-ERROR: movz x15, #:gottprel_g1:var -// CHECK-ERROR: ^ - - movk x13, #:gottprel_g0_nc:var -// CHECK-ERROR: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSIE_MOVW_GOTTPREL_G0_NC) -// CHECK-ERROR: movk x13, #:gottprel_g0_nc:var -// CHECK-ERROR: ^ - - ldr x10, [x0, #:gottprel_lo12:var] -// CHECK-ERROR: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) -// CHECK-ERROR: ldr x10, [x0, #:gottprel_lo12:var] -// CHECK-ERROR: ^ - - ldr x24, [x23, #:got_lo12:sym] -// CHECK-ERROR: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: LD64_GOT_LO12_NC) -// CHECK-ERROR: ^ - - ldr x24, [x23, :gottprel_lo12:sym] -// CHECK-ERROR: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) -// CHECK-ERROR: ^ - - ldr x10, [x0, #:gottprel_lo12:var] -// CHECK-ERROR: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) -// CHECK-ERROR: ldr x10, [x0, #:gottprel_lo12:var] -// CHECK-ERROR: ^ - - ldr x24, [x23, #:got_lo12:sym] -// CHECK-ERROR: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: LD64_GOT_LO12_NC) -// CHECK-ERROR: ^ - - ldr x24, 
[x23, :gottprel_lo12:sym] -// CHECK-ERROR: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) -// CHECK-ERROR: ^ +// RUN: < %s 2> %t2 -filetype=obj >/dev/null +// RUN: FileCheck --check-prefix=ERROR %s --implicit-check-not=error: < %t2 + +.xword sym-. +// ERROR: [[#@LINE-1]]:8: error: ILP32 8 byte PC relative data relocation not supported (LP64 eqv: PREL64) + +.xword sym+16 +// ERROR: [[#@LINE-1]]:8: error: ILP32 8 byte absolute data relocation not supported (LP64 eqv: ABS64) + +.xword sym@AUTH(da,42) +// ERROR: [[#@LINE-1]]:8: error: ILP32 8 byte absolute data relocation not supported (LP64 eqv: AUTH_ABS64) + +.xword sym@AUTH(da,42,addr) +// ERROR: [[#@LINE-1]]:8: error: ILP32 8 byte absolute data relocation not supported (LP64 eqv: AUTH_ABS64) + +movz x7, #:abs_g3:some_label +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G3) +// ERROR: movz x7, #:abs_g3:some_label + +movz x3, #:abs_g2:some_label +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G2) +// ERROR: movz x3, #:abs_g2:some_label + +movz x19, #:abs_g2_s:some_label +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_SABS_G2) +// ERROR: movz x19, #:abs_g2_s:some_label + +movk x5, #:abs_g2_nc:some_label +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G2_NC) +// ERROR: movk x5, #:abs_g2_nc:some_label + +movz x19, #:abs_g1_s:some_label +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_SABS_G1) +// ERROR: movz x19, #:abs_g1_s:some_label + +movk x5, #:abs_g1_nc:some_label +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: MOVW_UABS_G1_NC) +// ERROR: movk x5, #:abs_g1_nc:some_label + +movz x3, #:dtprel_g2:var +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: 
TLSLD_MOVW_DTPREL_G2) +// ERROR: movz x3, #:dtprel_g2:var + +movk x9, #:dtprel_g1_nc:var +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLD_MOVW_DTPREL_G1_NC) +// ERROR: movk x9, #:dtprel_g1_nc:var + +movz x3, #:tprel_g2:var +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLE_MOVW_TPREL_G2) +// ERROR: movz x3, #:tprel_g2:var + +movk x9, #:tprel_g1_nc:var +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSLE_MOVW_TPREL_G1_NC) +// ERROR: movk x9, #:tprel_g1_nc:var + +movz x15, #:gottprel_g1:var +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSIE_MOVW_GOTTPREL_G1) +// ERROR: movz x15, #:gottprel_g1:var + +movk x13, #:gottprel_g0_nc:var +// ERROR: [[#@LINE-1]]:1: error: ILP32 absolute MOV relocation not supported (LP64 eqv: TLSIE_MOVW_GOTTPREL_G0_NC) +// ERROR: movk x13, #:gottprel_g0_nc:var + +ldr x10, [x0, #:gottprel_lo12:var] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) +// ERROR: ldr x10, [x0, #:gottprel_lo12:var] + +ldr x24, [x23, #:got_lo12:sym] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: LD64_GOT_LO12_NC) + +ldr x24, [x23, :gottprel_lo12:sym] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) + +ldr x10, [x0, #:gottprel_lo12:var] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) +// ERROR: ldr x10, [x0, #:gottprel_lo12:var] + +ldr x24, [x23, #:got_lo12:sym] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: LD64_GOT_LO12_NC) + +ldr x24, [x23, :gottprel_lo12:sym] +// ERROR: [[#@LINE-1]]:1: error: ILP32 64-bit load/store relocation not supported (LP64 eqv: TLSIE_LD64_GOTTPREL_LO12_NC) 
From 37bee254975baaa07511cc93ddf059722f29e6b0 Mon Sep 17 00:00:00 2001 From: Shaw Young <58664393+shawbyoung@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:44:15 -0700 Subject: [PATCH 34/67] [BOLT][NFC] Refactor function matching (#97502) Moved function matching techniques into separate helper functions for ease of understanding and to make space for additional function matching techniques to be added (e.g. call graph function matching). --- bolt/include/bolt/Profile/YAMLProfileReader.h | 19 +- bolt/lib/Profile/YAMLProfileReader.cpp | 183 ++++++++++-------- 2 files changed, 113 insertions(+), 89 deletions(-) diff --git a/bolt/include/bolt/Profile/YAMLProfileReader.h b/bolt/include/bolt/Profile/YAMLProfileReader.h index 8bcae2b4df7394..582546a7e3b5e8 100644 --- a/bolt/include/bolt/Profile/YAMLProfileReader.h +++ b/bolt/include/bolt/Profile/YAMLProfileReader.h @@ -73,6 +73,10 @@ class YAMLProfileReader : public ProfileReaderBase { bool parseFunctionProfile(BinaryFunction &Function, const yaml::bolt::BinaryFunctionProfile &YamlBF); + /// Checks if a function profile matches a binary function. + bool profileMatches(const yaml::bolt::BinaryFunctionProfile &Profile, + const BinaryFunction &BF); + /// Infer function profile from stale data (collected on older binaries). bool inferStaleProfile(BinaryFunction &Function, const yaml::bolt::BinaryFunctionProfile &YamlBF); @@ -80,6 +84,18 @@ class YAMLProfileReader : public ProfileReaderBase { /// Initialize maps for profile matching. void buildNameMaps(BinaryContext &BC); + /// Matches functions using exact name. + size_t matchWithExactName(); + + /// Matches functions using LTO common name. + size_t matchWithLTOCommonName(); + + /// Matches functions using exact hash. + size_t matchWithHash(BinaryContext &BC); + + /// Matches functions with similarly named profiled functions. + size_t matchWithNameSimilarity(BinaryContext &BC); + /// Update matched YAML -> BinaryFunction pair. 
void matchProfileToFunction(yaml::bolt::BinaryFunctionProfile &YamlBF, BinaryFunction &BF) { @@ -93,9 +109,6 @@ class YAMLProfileReader : public ProfileReaderBase { ProfiledFunctions.emplace(&BF); } - /// Matches functions with similarly named profiled functions. - uint64_t matchWithNameSimilarity(BinaryContext &BC); - /// Check if the profile uses an event with a given \p Name. bool usesEvent(StringRef Name) const; }; diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 63222147bedd6b..3abc0342622242 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -342,6 +342,13 @@ Error YAMLProfileReader::preprocessProfile(BinaryContext &BC) { return Error::success(); } +bool YAMLProfileReader::profileMatches( + const yaml::bolt::BinaryFunctionProfile &Profile, const BinaryFunction &BF) { + if (opts::IgnoreHash) + return Profile.NumBasicBlocks == BF.size(); + return Profile.Hash == static_cast(BF.getHash()); +} + bool YAMLProfileReader::mayHaveProfileData(const BinaryFunction &BF) { if (opts::MatchProfileWithFunctionHash) return true; @@ -358,8 +365,92 @@ bool YAMLProfileReader::mayHaveProfileData(const BinaryFunction &BF) { return false; } -uint64_t YAMLProfileReader::matchWithNameSimilarity(BinaryContext &BC) { - uint64_t MatchedWithNameSimilarity = 0; +size_t YAMLProfileReader::matchWithExactName() { + size_t MatchedWithExactName = 0; + // This first pass assigns profiles that match 100% by name and by hash. + for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs)) { + if (!BF) + continue; + BinaryFunction &Function = *BF; + // Clear function call count that may have been set while pre-processing + // the profile. 
+ Function.setExecutionCount(BinaryFunction::COUNT_NO_PROFILE); + + if (profileMatches(YamlBF, Function)) { + matchProfileToFunction(YamlBF, Function); + ++MatchedWithExactName; + } + } + return MatchedWithExactName; +} + +size_t YAMLProfileReader::matchWithHash(BinaryContext &BC) { + // Iterates through profiled functions to match the first binary function with + // the same exact hash. Serves to match identical, renamed functions. + // Collisions are possible where multiple functions share the same exact hash. + size_t MatchedWithHash = 0; + if (opts::MatchProfileWithFunctionHash) { + DenseMap StrictHashToBF; + StrictHashToBF.reserve(BC.getBinaryFunctions().size()); + + for (auto &[_, BF] : BC.getBinaryFunctions()) + StrictHashToBF[BF.getHash()] = &BF; + + for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) { + if (YamlBF.Used) + continue; + auto It = StrictHashToBF.find(YamlBF.Hash); + if (It != StrictHashToBF.end() && !ProfiledFunctions.count(It->second)) { + BinaryFunction *BF = It->second; + matchProfileToFunction(YamlBF, *BF); + ++MatchedWithHash; + } + } + } + return MatchedWithHash; +} + +size_t YAMLProfileReader::matchWithLTOCommonName() { + // This second pass allows name ambiguity for LTO private functions. + size_t MatchedWithLTOCommonName = 0; + for (const auto &[CommonName, LTOProfiles] : LTOCommonNameMap) { + if (!LTOCommonNameFunctionMap.contains(CommonName)) + continue; + std::unordered_set &Functions = + LTOCommonNameFunctionMap[CommonName]; + // Return true if a given profile is matched to one of BinaryFunctions with + // matching LTO common name. 
+ auto matchProfile = [&](yaml::bolt::BinaryFunctionProfile *YamlBF) { + if (YamlBF->Used) + return false; + for (BinaryFunction *BF : Functions) { + if (!ProfiledFunctions.count(BF) && profileMatches(*YamlBF, *BF)) { + matchProfileToFunction(*YamlBF, *BF); + ++MatchedWithLTOCommonName; + return true; + } + } + return false; + }; + bool ProfileMatched = llvm::any_of(LTOProfiles, matchProfile); + + // If there's only one function with a given name, try to match it + // partially. + if (!ProfileMatched && LTOProfiles.size() == 1 && Functions.size() == 1 && + !LTOProfiles.front()->Used && + !ProfiledFunctions.count(*Functions.begin())) { + matchProfileToFunction(*LTOProfiles.front(), **Functions.begin()); + ++MatchedWithLTOCommonName; + } + } + return MatchedWithLTOCommonName; +} + +size_t YAMLProfileReader::matchWithNameSimilarity(BinaryContext &BC) { + if (opts::NameSimilarityFunctionMatchingThreshold == 0) + return 0; + + size_t MatchedWithNameSimilarity = 0; ItaniumPartialDemangler Demangler; // Demangle and derive namespace from function name. @@ -477,17 +568,6 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { } YamlProfileToFunction.resize(YamlBP.Functions.size() + 1); - auto profileMatches = [](const yaml::bolt::BinaryFunctionProfile &Profile, - BinaryFunction &BF) { - if (opts::IgnoreHash) - return Profile.NumBasicBlocks == BF.size(); - return Profile.Hash == static_cast(BF.getHash()); - }; - - uint64_t MatchedWithExactName = 0; - uint64_t MatchedWithHash = 0; - uint64_t MatchedWithLTOCommonName = 0; - // Computes hash for binary functions. if (opts::MatchProfileWithFunctionHash) { for (auto &[_, BF] : BC.getBinaryFunctions()) { @@ -501,84 +581,15 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { } } - // This first pass assigns profiles that match 100% by name and by hash. 
- for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs)) { - if (!BF) - continue; - BinaryFunction &Function = *BF; - // Clear function call count that may have been set while pre-processing - // the profile. - Function.setExecutionCount(BinaryFunction::COUNT_NO_PROFILE); - - if (profileMatches(YamlBF, Function)) { - matchProfileToFunction(YamlBF, Function); - ++MatchedWithExactName; - } - } - - // Iterates through profiled functions to match the first binary function with - // the same exact hash. Serves to match identical, renamed functions. - // Collisions are possible where multiple functions share the same exact hash. - if (opts::MatchProfileWithFunctionHash) { - DenseMap StrictHashToBF; - StrictHashToBF.reserve(BC.getBinaryFunctions().size()); - - for (auto &[_, BF] : BC.getBinaryFunctions()) - StrictHashToBF[BF.getHash()] = &BF; - - for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) { - if (YamlBF.Used) - continue; - auto It = StrictHashToBF.find(YamlBF.Hash); - if (It != StrictHashToBF.end() && !ProfiledFunctions.count(It->second)) { - BinaryFunction *BF = It->second; - matchProfileToFunction(YamlBF, *BF); - ++MatchedWithHash; - } - } - } - - // This second pass allows name ambiguity for LTO private functions. - for (const auto &[CommonName, LTOProfiles] : LTOCommonNameMap) { - if (!LTOCommonNameFunctionMap.contains(CommonName)) - continue; - std::unordered_set &Functions = - LTOCommonNameFunctionMap[CommonName]; - // Return true if a given profile is matched to one of BinaryFunctions with - // matching LTO common name. 
- auto matchProfile = [&](yaml::bolt::BinaryFunctionProfile *YamlBF) { - if (YamlBF->Used) - return false; - for (BinaryFunction *BF : Functions) { - if (!ProfiledFunctions.count(BF) && profileMatches(*YamlBF, *BF)) { - matchProfileToFunction(*YamlBF, *BF); - ++MatchedWithLTOCommonName; - return true; - } - } - return false; - }; - bool ProfileMatched = llvm::any_of(LTOProfiles, matchProfile); - - // If there's only one function with a given name, try to match it - // partially. - if (!ProfileMatched && LTOProfiles.size() == 1 && Functions.size() == 1 && - !LTOProfiles.front()->Used && - !ProfiledFunctions.count(*Functions.begin())) { - matchProfileToFunction(*LTOProfiles.front(), **Functions.begin()); - ++MatchedWithLTOCommonName; - } - } + const size_t MatchedWithExactName = matchWithExactName(); + const size_t MatchedWithHash = matchWithHash(BC); + const size_t MatchedWithLTOCommonName = matchWithLTOCommonName(); + const size_t MatchedWithNameSimilarity = matchWithNameSimilarity(BC); for (auto [YamlBF, BF] : llvm::zip_equal(YamlBP.Functions, ProfileBFs)) if (!YamlBF.Used && BF && !ProfiledFunctions.count(BF)) matchProfileToFunction(YamlBF, *BF); - // Uses name similarity to match functions that were not matched by name. - uint64_t MatchedWithNameSimilarity = - opts::NameSimilarityFunctionMatchingThreshold > 0 - ? 
matchWithNameSimilarity(BC) - : 0; for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) if (!YamlBF.Used && opts::Verbosity >= 1) From 4a0aff199bda8abf04a59e4c0bdcedaac7d19841 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 5 Jul 2024 15:15:01 -0700 Subject: [PATCH 35/67] MCAssembler: Clean up iterator types for Symbols --- llvm/include/llvm/MC/MCAssembler.h | 29 ++++++----------------------- llvm/lib/MC/MCAssembler.cpp | 12 ++++++++---- 2 files changed, 14 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 3a6d6105f20aac..8a8f0d4c1ea086 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -11,6 +11,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" @@ -56,18 +57,10 @@ class MCValue; class MCAssembler { public: using SectionListType = std::vector; - using SymbolDataListType = std::vector; using const_iterator = pointee_iterator; using iterator = pointee_iterator; - using const_symbol_iterator = - pointee_iterator; - using symbol_iterator = pointee_iterator; - - using symbol_range = iterator_range; - using const_symbol_range = iterator_range; - /// MachO specific deployment target version info. // A Major version of 0 indicates that no version information was supplied // and so the corresponding load command should not be emitted. @@ -98,7 +91,7 @@ class MCAssembler { SectionListType Sections; - SymbolDataListType Symbols; + SmallVector Symbols; /// The list of linker options to propagate into the object file. 
std::vector> LinkerOptions; @@ -344,22 +337,12 @@ class MCAssembler { size_t size() const { return Sections.size(); } - /// @} - /// \name Symbol List Access - /// @{ - symbol_iterator symbol_begin() { return Symbols.begin(); } - const_symbol_iterator symbol_begin() const { return Symbols.begin(); } - - symbol_iterator symbol_end() { return Symbols.end(); } - const_symbol_iterator symbol_end() const { return Symbols.end(); } - - symbol_range symbols() { return make_range(symbol_begin(), symbol_end()); } - const_symbol_range symbols() const { - return make_range(symbol_begin(), symbol_end()); + iterator_range::const_iterator>> + symbols() const { + return make_pointee_range(Symbols); } - size_t symbol_size() const { return Symbols.size(); } - /// @} /// \name Linker Option List Access /// @{ diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index bab941b8e1ea89..3c777791472bf7 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -1346,11 +1346,15 @@ LLVM_DUMP_METHOD void MCAssembler::dump() const{ OS << "],\n"; OS << " Symbols:["; - for (const_symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) { - if (it != symbol_begin()) OS << ",\n "; + bool First = true; + for (const MCSymbol &Sym : symbols()) { + if (First) + First = false; + else + OS << ",\n "; OS << "("; - it->dump(); - OS << ", Index:" << it->getIndex() << ", "; + Sym.dump(); + OS << ", Index:" << Sym.getIndex() << ", "; OS << ")"; } OS << "]>\n"; From 045e68f97b0e3d3df7be1d45e9ea3e105fff7b2a Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Fri, 5 Jul 2024 18:18:10 -0400 Subject: [PATCH 36/67] [libc] Change the test file names used in readlink_test and readlinkat_test. (#97864) Attempting to fix the following errors from the build bots: ``` Failed to match LIBC_NAMESPACE::symlink(LINK_VAL, LINK) against Succeeds(0). Expected return value to be equal to 0 but got -1. 
Expected errno to be equal to "Success" but got "File exists". ``` --- libc/test/src/unistd/readlink_test.cpp | 4 ++-- libc/test/src/unistd/readlinkat_test.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/test/src/unistd/readlink_test.cpp b/libc/test/src/unistd/readlink_test.cpp index 20f3951349118a..49ab9c24f4024e 100644 --- a/libc/test/src/unistd/readlink_test.cpp +++ b/libc/test/src/unistd/readlink_test.cpp @@ -18,9 +18,9 @@ namespace cpp = LIBC_NAMESPACE::cpp; TEST(LlvmLibcReadlinkTest, CreateAndUnlink) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *FILENAME = "readlink_test_value"; + constexpr const char *FILENAME = "readlink_test_file"; auto LINK_VAL = libc_make_test_file_path(FILENAME); - constexpr const char *FILENAME2 = "readlink.test.link"; + constexpr const char *FILENAME2 = "readlink_test_file.link"; auto LINK = libc_make_test_file_path(FILENAME2); LIBC_NAMESPACE::libc_errno = 0; diff --git a/libc/test/src/unistd/readlinkat_test.cpp b/libc/test/src/unistd/readlinkat_test.cpp index 39d81d9ba544a6..7e1ded5f8a4a19 100644 --- a/libc/test/src/unistd/readlinkat_test.cpp +++ b/libc/test/src/unistd/readlinkat_test.cpp @@ -20,9 +20,9 @@ namespace cpp = LIBC_NAMESPACE::cpp; TEST(LlvmLibcReadlinkatTest, CreateAndUnlink) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *FILENAME = "readlinkat_test_value"; + constexpr const char *FILENAME = "readlinkat_test_file"; auto LINK_VAL = libc_make_test_file_path(FILENAME); - constexpr const char *FILENAME2 = "readlinkat.test.link"; + constexpr const char *FILENAME2 = "readlinkat_test_file.link"; auto LINK = libc_make_test_file_path(FILENAME2); LIBC_NAMESPACE::libc_errno = 0; From dc1da93958be0311b79dce39d71bd954c478cf19 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Fri, 5 Jul 2024 15:18:49 -0700 Subject: [PATCH 37/67] [BOLT][BAT] Add support for three-way split functions (#93760) In three-way split functions, if 
only .warm fragment is present, BAT incorrectly overwrites the map for .warm fragment by empty .cold fragment. Test Plan: updated register-fragments-bolt-symbols.s --- bolt/lib/Profile/BoltAddressTranslation.cpp | 3 +++ bolt/test/X86/register-fragments-bolt-symbols.s | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 519f282a2351c2..ec7e303c0f52e8 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -129,6 +129,9 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) { LLVM_DEBUG(dbgs() << " Cold part\n"); for (const FunctionFragment &FF : Function.getLayout().getSplitFragments()) { + // Skip empty fragments to avoid adding zero-address entries to maps. + if (FF.empty()) + continue; ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress()); Map.clear(); for (const BinaryBasicBlock *const BB : FF) diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s index 5c9fb5ed1a757e..c9f1859c4e8a9c 100644 --- a/bolt/test/X86/register-fragments-bolt-symbols.s +++ b/bolt/test/X86/register-fragments-bolt-symbols.s @@ -13,8 +13,12 @@ # PREAGGWARM: B X:0 #chain.warm# 1 0 # RUN: perf2bolt %t.warm.bolt -p %t.preagg.warm --pa -o %t.warm.fdata -w %t.warm.yaml \ # RUN: -v=1 | FileCheck %s --check-prefix=CHECK-BOLT-WARM +# RUN: FileCheck %s --input-file %t.warm.fdata --check-prefix=CHECK-FDATA-WARM +# RUN: FileCheck %s --input-file %t.warm.yaml --check-prefix=CHECK-YAML-WARM # CHECK-BOLT-WARM: marking chain.warm/1(*2) as a fragment of chain +# CHECK-FDATA-WARM: chain +# CHECK-YAML-WARM: chain # RUN: sed -i 's|chain|chain/2|g' %t.fdata # RUN: llvm-objcopy --localize-symbol=chain %t.main.o From 34855405b0a7dd6719fa3278f9b888f7f11bc4d8 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Fri, 5 Jul 2024 16:36:19 -0500 Subject: [PATCH 38/67] [llvm] 
Avoid 'raw_string_ostream::str' (NFC) Since `raw_string_ostream` doesn't own the string buffer, it is desirable (in terms of memory safety) for users to directly reference the string buffer rather than use `raw_string_ostream::str()`. Work towards TODO item to remove `raw_string_ostream::str()`. --- llvm/lib/CodeGen/MIRPrintingPass.cpp | 2 +- llvm/lib/FileCheck/FileCheck.cpp | 2 +- llvm/lib/IR/DiagnosticInfo.cpp | 2 +- llvm/lib/Remarks/Remark.cpp | 2 +- llvm/tools/llvm-opt-report/OptReport.cpp | 8 ++++---- llvm/utils/TableGen/CompressInstEmitter.cpp | 10 +++++----- llvm/utils/TableGen/X86DisassemblerTables.cpp | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/MIRPrintingPass.cpp b/llvm/lib/CodeGen/MIRPrintingPass.cpp index 0aed1297cd3eab..f70c0731ffafad 100644 --- a/llvm/lib/CodeGen/MIRPrintingPass.cpp +++ b/llvm/lib/CodeGen/MIRPrintingPass.cpp @@ -52,7 +52,7 @@ struct MIRPrintingPass : public MachineFunctionPass { std::string Str; raw_string_ostream StrOS(Str); printMIR(StrOS, MF); - MachineFunctions.append(StrOS.str()); + MachineFunctions.append(Str); return false; } diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index 1eb8330232321e..df5f7f4697c5c4 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1488,7 +1488,7 @@ std::string Check::FileCheckType::getModifiersDescription() const { if (isLiteralMatch()) OS << "LITERAL"; OS << '}'; - return OS.str(); + return Ret; } std::string Check::FileCheckType::getDescription(StringRef Prefix) const { diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp index 108bf689005957..623f372bb9e740 100644 --- a/llvm/lib/IR/DiagnosticInfo.cpp +++ b/llvm/lib/IR/DiagnosticInfo.cpp @@ -403,7 +403,7 @@ std::string DiagnosticInfoOptimizationBase::getMsg() const { ? 
Args.end() : Args.begin() + FirstExtraArgIndex)) OS << Arg.Val; - return OS.str(); + return Str; } DiagnosticInfoMisExpect::DiagnosticInfoMisExpect(const Instruction *Inst, diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp index ef42271a3c8da6..0e98cad8e90454 100644 --- a/llvm/lib/Remarks/Remark.cpp +++ b/llvm/lib/Remarks/Remark.cpp @@ -23,7 +23,7 @@ std::string Remark::getArgsAsMsg() const { raw_string_ostream OS(Str); for (const Argument &Arg : Args) OS << Arg.Val; - return OS.str(); + return Str; } /// Returns the value of a specified key parsed from StringRef. diff --git a/llvm/tools/llvm-opt-report/OptReport.cpp b/llvm/tools/llvm-opt-report/OptReport.cpp index 27dcb88134bd8e..cee9abcb494196 100644 --- a/llvm/tools/llvm-opt-report/OptReport.cpp +++ b/llvm/tools/llvm-opt-report/OptReport.cpp @@ -370,10 +370,10 @@ static bool writeReport(LocationInfoTy &LocationInfo) { if (!Succinct) { RS << LLI.UnrollCount; - RS << std::string(UCDigits - RS.str().size(), ' '); + RS << std::string(UCDigits - R.size(), ' '); } - return RS.str(); + return R; }; auto VStr = [VFDigits, @@ -383,10 +383,10 @@ static bool writeReport(LocationInfoTy &LocationInfo) { if (!Succinct) { RS << LLI.VectorizationFactor << "," << LLI.InterleaveCount; - RS << std::string(VFDigits + ICDigits + 1 - RS.str().size(), ' '); + RS << std::string(VFDigits + ICDigits + 1 - R.size(), ' '); } - return RS.str(); + return R; }; OS << llvm::format_decimal(L, LNDigits) << " "; diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index fcf77934faacfd..1e30f052ab1dfd 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -618,7 +618,7 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, } if (CompressPatterns.empty()) { - OS << FuncH.str(); + OS << FH; OS.indent(2) << "return false;\n}\n"; if (EType == EmitterType::Compress) OS << "\n#endif //GEN_COMPRESS_INSTR\n"; @@ 
-835,10 +835,10 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, } if (CompressOrUncompress) CodeStream.indent(6) << "OutInst.setLoc(MI.getLoc());\n"; - mergeCondAndCode(CaseStream, CondStream.str(), CodeStream.str()); + mergeCondAndCode(CaseStream, CondString, CodeString); PrevOp = CurOp; } - Func << CaseStream.str() << "\n"; + Func << CaseString << "\n"; // Close brace for the last case. Func.indent(4) << "} // case " << CurOp << "\n"; Func.indent(2) << "} // switch\n"; @@ -876,8 +876,8 @@ void CompressInstEmitter::emitCompressInstEmitter(raw_ostream &OS, << "}\n\n"; } - OS << FuncH.str(); - OS << Func.str(); + OS << FH; + OS << F; if (EType == EmitterType::Compress) OS << "\n#endif //GEN_COMPRESS_INSTR\n"; diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp index f4d282f54ac055..7d28c48055c341 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -1062,11 +1062,11 @@ void DisassemblerTables::emit(raw_ostream &o) const { i1--; emitContextDecisions(o1, o2, i1, i2, ModRMTableNum); - o << o1.str(); + o << s1; o << " 0x0\n"; o << "};\n"; o << "\n"; - o << o2.str(); + o << s2; o << "\n"; o << "\n"; } From ceade83ad5fc529f2b2beb896eec0dd0b29fdd44 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 5 Jul 2024 15:28:16 -0700 Subject: [PATCH 39/67] [clang-format] Skip block commented out includes when sorting them (#97787) Fixes #97539. 
--- clang/lib/Format/Format.cpp | 15 +++++++++++---- clang/unittests/Format/SortIncludesTest.cpp | 9 +++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 6a8883b77a7305..7fd42e46e0ccb7 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3222,10 +3222,16 @@ tooling::Replacements sortCppIncludes(const FormatStyle &Style, StringRef Code, if (Trimmed.contains(RawStringTermination)) FormattingOff = false; - if (isClangFormatOff(Trimmed)) + bool IsBlockComment = false; + + if (isClangFormatOff(Trimmed)) { FormattingOff = true; - else if (isClangFormatOn(Trimmed)) + } else if (isClangFormatOn(Trimmed)) { FormattingOff = false; + } else if (Trimmed.starts_with("/*")) { + IsBlockComment = true; + Pos = Code.find("*/", SearchFrom + 2); + } const bool EmptyLineSkipped = Trimmed.empty() && @@ -3235,9 +3241,10 @@ tooling::Replacements sortCppIncludes(const FormatStyle &Style, StringRef Code, bool MergeWithNextLine = Trimmed.ends_with("\\"); if (!FormattingOff && !MergeWithNextLine) { - if (tooling::HeaderIncludes::IncludeRegex.match(Line, &Matches)) { + if (!IsBlockComment && + tooling::HeaderIncludes::IncludeRegex.match(Trimmed, &Matches)) { StringRef IncludeName = Matches[2]; - if (Line.contains("/*") && !Line.contains("*/")) { + if (Trimmed.contains("/*") && !Trimmed.contains("*/")) { // #include with a start of a block comment, but without the end. // Need to keep all the lines until the end of the comment together. 
// FIXME: This is somehow simplified check that probably does not work diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp index 2eeb16b4ab9f52..31753825646373 100644 --- a/clang/unittests/Format/SortIncludesTest.cpp +++ b/clang/unittests/Format/SortIncludesTest.cpp @@ -1455,6 +1455,15 @@ TEST_F(SortIncludesTest, DisableRawStringLiteralSorting) { #undef X } +TEST_F(SortIncludesTest, BlockCommentedOutIncludes) { + StringRef Code{"/* #include \"foo.h\"\n" + "#include \"bar.h\" */\n" + "#include "}; + + FmtStyle = getGoogleStyle(FormatStyle::LK_Cpp); + verifyFormat(Code, sort(Code, "input.cpp", 0)); +} + } // end namespace } // end namespace format } // end namespace clang From b75453bc07dabe8e0dc0efb0766a4238e3df6712 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 5 Jul 2024 15:42:38 -0700 Subject: [PATCH 40/67] MCAssembler: Remove unneeded non-const iterators for Sections and misleading size() The pointers cannot be mutated even if the dereferenced MCSection can. --- llvm/include/llvm/MC/MCAssembler.h | 12 +----------- llvm/lib/MC/MCAssembler.cpp | 12 ++++++++---- llvm/lib/MC/MachObjectWriter.cpp | 7 +++---- llvm/lib/MC/WinCOFFObjectWriter.cpp | 4 ++-- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 8a8f0d4c1ea086..4b08d50de9e226 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -56,10 +56,8 @@ class MCValue; class MCAssembler { public: - using SectionListType = std::vector; - + using SectionListType = SmallVector; using const_iterator = pointee_iterator; - using iterator = pointee_iterator; /// MachO specific deployment target version info. 
// A Major version of 0 indicates that no version information was supplied @@ -326,17 +324,9 @@ class MCAssembler { BundleAlignSize = Size; } - /// \name Section List Access - /// @{ - - iterator begin() { return Sections.begin(); } const_iterator begin() const { return Sections.begin(); } - - iterator end() { return Sections.end(); } const_iterator end() const { return Sections.end(); } - size_t size() const { return Sections.size(); } - iterator_range::const_iterator>> symbols() const { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 3c777791472bf7..c8d12eb5dcf641 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -1339,14 +1339,18 @@ LLVM_DUMP_METHOD void MCAssembler::dump() const{ OS << "dump(); + bool First = true; + for (const MCSection &Sec : *this) { + if (First) + First = false; + else + OS << ",\n "; + Sec.dump(); } OS << "],\n"; OS << " Symbols:["; - bool First = true; + First = true; for (const MCSymbol &Sym : symbols()) { if (First) First = false; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 14f7f0d5c1871b..53eed0092a5b4d 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -572,9 +572,8 @@ void MachObjectWriter::computeSymbolTable( // Build section lookup table. DenseMap SectionIndexMap; unsigned Index = 1; - for (MCAssembler::iterator it = Asm.begin(), - ie = Asm.end(); it != ie; ++it, ++Index) - SectionIndexMap[&*it] = Index; + for (MCSection &Sec : Asm) + SectionIndexMap[&Sec] = Index++; assert(Index <= 256 && "Too many sections!"); // Build the string table. 
@@ -798,7 +797,7 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm) { } } - unsigned NumSections = Asm.size(); + unsigned NumSections = Asm.end() - Asm.begin(); const MCAssembler::VersionInfoType &VersionInfo = Asm.getVersionInfo(); // The section data starts after the header, the segment load command (and diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index c0bad192eb9823..7ba38be7edba94 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -1139,8 +1139,8 @@ uint64_t WinCOFFWriter::writeObject(MCAssembler &Asm) { #ifndef NDEBUG sections::iterator I = Sections.begin(); sections::iterator IE = Sections.end(); - MCAssembler::iterator J = Asm.begin(); - MCAssembler::iterator JE = Asm.end(); + auto J = Asm.begin(); + auto JE = Asm.end(); for (; I != IE && J != JE; ++I, ++J) { while (J != JE && ((Mode == NonDwoOnly && isDwoSection(*J)) || (Mode == DwoOnly && !isDwoSection(*J)))) From 8f5b1440dbdda570d66f170f47cb971388126bf3 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Fri, 5 Jul 2024 16:18:43 -0700 Subject: [PATCH 41/67] [bazel] Port #97777 (#97868) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 38970d9929b9c3..4a59c16ba12fcc 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -1297,8 +1297,9 @@ cc_library( copts = llvm_copts, includes = ["include"], textual_hdrs = [ - "include/llvm/TargetParser/ARMTargetParserDef.inc", + "include/llvm/TargetParser/AArch64CPUFeatures.inc", "include/llvm/TargetParser/AArch64TargetParserDef.inc", + "include/llvm/TargetParser/ARMTargetParserDef.inc", "include/llvm/TargetParser/RISCVTargetParserDef.inc", ] + glob([ "include/llvm/TargetParser/*.def", From 88381cecfff48536230e11901f1598332033fd45 Mon 
Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 5 Jul 2024 17:11:20 -0700 Subject: [PATCH 42/67] [RISCV] Hoist some common setOperationActions to a common place. NFC We always want these actions if the type is legal. We don't need to check the subtarget features. So hoist them to a common point. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 50 +++++++++------------ 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 022b8bcedda4d2..e935590bc1b7bd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1306,11 +1306,20 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // expansion to a build_vector of 0s. setOperationAction(ISD::UNDEF, VT, Custom); + setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, + ISD::EXTRACT_SUBVECTOR}, + VT, Custom); + + // FIXME: mload, mstore, mgather, mscatter, vp_load/store, + // vp_stride_load/store, vp_gather/scatter can be hoisted to here. 
+ setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + + setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, + Custom); + if (VT.getVectorElementType() == MVT::f16 && !Subtarget.hasVInstructionsF16()) { - setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); - setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, - Custom); setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, @@ -1318,10 +1327,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); - setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, - ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_SHUFFLE}, - VT, Custom); - setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); // Don't promote f16 vector operations to f32 if f32 vector type is @@ -1335,16 +1342,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (VT.getVectorElementType() == MVT::bf16) { - setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom); - setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT, - Custom); - setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, - ISD::EXTRACT_SUBVECTOR}, - VT, Custom); - setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); - if (Subtarget.hasStdExtZfbfmin()) - setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + // FIXME: We should prefer BUILD_VECTOR over SPLAT_VECTOR. 
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); @@ -1352,18 +1352,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, continue; } - // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed. - setOperationAction({ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, VT, - Custom); - - setOperationAction({ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, - ISD::VECTOR_SHUFFLE, ISD::INSERT_VECTOR_ELT, - ISD::EXTRACT_VECTOR_ELT}, + setOperationAction({ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, + ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); - setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE, - ISD::MGATHER, ISD::MSCATTER}, - VT, Custom); + setOperationAction( + {ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, VT, Custom); setOperationAction({ISD::VP_LOAD, ISD::VP_STORE, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, @@ -1377,8 +1371,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::IS_FPCLASS, ISD::FMAXIMUM, ISD::FMINIMUM}, VT, Custom); - setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); - setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND, ISD::FROUNDEVEN, ISD::FRINT, ISD::FNEARBYINT}, VT, Custom); @@ -1395,8 +1387,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(FloatingPointVPOps, VT, Custom); - setOperationAction({ISD::STRICT_FP_EXTEND, ISD::STRICT_FP_ROUND}, VT, - Custom); setOperationAction( {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FSQRT, ISD::STRICT_FMA, From f118c882fe2f2548b814492aa4e51ef29aa29739 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 5 Jul 2024 17:34:23 -0700 Subject: [PATCH 43/67] [RISCV] Remove unnecessary setOperationAction for ISD::SELECT_CC for fixed vectors. NFC We already looped through all builtin operations and marked them as Expand. 
We don't need to do it to SELECT_CC again. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e935590bc1b7bd..9a4d77d9bfd1a9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1256,7 +1256,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction( {ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND}, VT, Custom); @@ -1379,7 +1378,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SETCC, VT, Custom); setOperationAction({ISD::VSELECT, ISD::SELECT}, VT, Custom); - setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BITCAST, VT, Custom); From 593f708118aef792f434185547f74fedeaf51dd4 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Fri, 5 Jul 2024 20:44:25 -0400 Subject: [PATCH 44/67] [Support] Silence function cast warning when building with Clang ToT targetting Windows --- llvm/lib/Support/Windows/Process.inc | 9 +++++++++ llvm/lib/Support/Windows/Signals.inc | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc index 34d294b232c32b..b9110d4e414fff 100644 --- a/llvm/lib/Support/Windows/Process.inc +++ b/llvm/lib/Support/Windows/Process.inc @@ -482,9 +482,18 @@ static RTL_OSVERSIONINFOEXW GetWindowsVer() { HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll"); assert(hMod); +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-function-type-mismatch" +#endif + auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion"); assert(getVer); +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + RTL_OSVERSIONINFOEXW info{}; 
info.dwOSVersionInfoSize = sizeof(info); NTSTATUS r = getVer((PRTL_OSVERSIONINFOW)&info); diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc index 29ebf7c696e04f..d057981c1c84cf 100644 --- a/llvm/lib/Support/Windows/Signals.inc +++ b/llvm/lib/Support/Windows/Signals.inc @@ -167,6 +167,11 @@ static bool isDebugHelpInitialized() { return fStackWalk64 && fSymInitialize && fSymSetOptions && fMiniDumpWriteDump; } +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-function-type-mismatch" +#endif + static bool load64BitDebugHelp(void) { HMODULE hLib = ::LoadLibraryExA("Dbghelp.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32); @@ -192,6 +197,10 @@ static bool load64BitDebugHelp(void) { return isDebugHelpInitialized(); } +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + using namespace llvm; // Forward declare. From 10e1b935e5d9017067207d62ababa733df088ecd Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Fri, 5 Jul 2024 20:46:55 -0400 Subject: [PATCH 45/67] [compiler-rt] Silence function cast warning when building with Clang ToT targetting Windows --- compiler-rt/lib/fuzzer/FuzzerUtilWindows.cpp | 9 +++++++++ .../lib/sanitizer_common/sanitizer_symbolizer_win.cpp | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/compiler-rt/lib/fuzzer/FuzzerUtilWindows.cpp b/compiler-rt/lib/fuzzer/FuzzerUtilWindows.cpp index db80eb383885e6..24ea82a8c5dfc2 100644 --- a/compiler-rt/lib/fuzzer/FuzzerUtilWindows.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerUtilWindows.cpp @@ -238,6 +238,11 @@ size_t PageSize() { return PageSizeCached; } +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-function-type-mismatch" +#endif + void SetThreadName(std::thread &thread, const std::string &name) { typedef HRESULT(WINAPI * proc)(HANDLE, PCWSTR); HMODULE kbase = GetModuleHandleA("KernelBase.dll"); @@ -255,6 +260,10 @@ void SetThreadName(std::thread &thread, const 
std::string &name) { } } +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + } // namespace fuzzer #endif // LIBFUZZER_WINDOWS diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp index aae3e76ea229ff..a7ffe0e48fabe7 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp @@ -55,6 +55,11 @@ bool TrySymInitialize() { } // namespace +# if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wcast-function-type-mismatch" +# endif + // Initializes DbgHelp library, if it's not yet initialized. Calls to this // function should be synchronized with respect to other calls to DbgHelp API // (e.g. from WinSymbolizerTool). @@ -133,6 +138,10 @@ void InitializeDbgHelpIfNeeded() { } } +# if defined(__clang__) +# pragma clang diagnostic pop +# endif + bool WinSymbolizerTool::SymbolizePC(uptr addr, SymbolizedStack *frame) { InitializeDbgHelpIfNeeded(); From cf1ded3ac248ad4feeed7b4dd20c60b7e3c40339 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Fri, 5 Jul 2024 20:47:23 -0400 Subject: [PATCH 46/67] [lldb] Silence function cast warning when building with Clang ToT targetting Windows --- .../Process/Windows/Common/TargetThreadWindows.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp index a69c10081ff190..dc7697f71d6a6f 100644 --- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp @@ -175,6 +175,11 @@ Status TargetThreadWindows::DoResume() { return Status(); } +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-function-type-mismatch" +#endif + const char 
*TargetThreadWindows::GetName() { Log *log = GetLog(LLDBLog::Thread); static GetThreadDescriptionFunctionPtr GetThreadDescription = []() { @@ -200,3 +205,7 @@ const char *TargetThreadWindows::GetName() { return m_name.c_str(); } + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif From be26e545424a6e006cd67e4433c88c25b23404ae Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Fri, 5 Jul 2024 21:16:04 -0400 Subject: [PATCH 47/67] [openmp] Silence warning when building the x64 Windows LLVM release package This fixes: ``` MASM : warning A4018:invalid command-line option : -U_GLIBCXX_ASSERTIONS ``` --- openmp/runtime/src/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 60641e6f0fe5de..f106694841ce8d 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -161,7 +161,9 @@ endif() # Disable libstdc++ assertions, even in an LLVM_ENABLE_ASSERTIONS build, to # avoid an unwanted dependency on libstdc++.so. -add_definitions(-U_GLIBCXX_ASSERTIONS) +if(NOT WIN32) + add_definitions(-U_GLIBCXX_ASSERTIONS) +endif() # Add the OpenMP library libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS) From 6337fdcc520e8f948bef23b361c75edeb32ed015 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 5 Jul 2024 19:34:19 -0700 Subject: [PATCH 48/67] [RISCV] Use EXTLOAD in lowerVECTOR_SHUFFLE. (#97862) We're creating a load and a splat. The splat doesn't use the extended bits so it doesn't matter what extend we use. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9a4d77d9bfd1a9..1e37f2c3b9c597 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5050,7 +5050,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); else - V = DAG.getExtLoad(ISD::SEXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr, + V = DAG.getExtLoad(ISD::EXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr, Ld->getPointerInfo().getWithOffset(Offset), SVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index 28ce6a12c4c89d..f67282f9e6a322 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -5863,7 +5863,7 @@ define i8 @vreduce_mul_v2i8(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: lb a0, 1(a0) +; CHECK-NEXT: lbu a0, 1(a0) ; CHECK-NEXT: vmul.vx v8, v8, a0 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret From 0b9f2847da79298ed09c29493245113f02b32d9f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 5 Jul 2024 19:49:07 -0700 Subject: [PATCH 49/67] [RISCV] Remove unused check-prefixes. 
NFC --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index def8fb5abf5066..f7477da49a3543 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32VB +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zba,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+rva22u64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RVA22U64 @@ -1502,6 +1502,3 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d %v7 = insertelement <8 x double> %v6, double %e7, i64 7 ret <8 x double> %v7 } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; RV32V: {{.*}} -; RV32VB: {{.*}} From a348824798e03c1ffd10e6a1c5340130b0f48bf9 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 6 Jul 2024 12:43:11 +0800 Subject: [PATCH 50/67] [RISCV] Allow folding vmerge with implicit passthru when true has tied dest (#78565) We currently don't fold a vmerge if it has an implicit-def passthru and its true operand also has a passthru (i.e. tied dest). This restriction was added in https://reviews.llvm.org/D151596, back whenever we had separate TU/TA pseudos. It looks like it was added because the policy might not have been handled correctly. However the policy should be set correctly if we relax this restriction today, since we compute the policy differently now that we have removed the TU/TA distinction in our pseudos. We use a TUMU policy, and relax it to TAMU iff the vmerge's passthru is implicit-def. The reasoning behind this being that the tail elements always come from the vmerge's passthru[^1], so if vmerge's passthru is implicit-def then the tail is also implicit-def. So a tail agnostic policy is OK. [^1]: unless the VL was shrunk, but in this case which case we conservatively use TUMU. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 -- .../RISCV/rvv/rvv-peephole-vmerge-vops.ll | 34 ++++++ llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll | 110 +++++++----------- 3 files changed, 78 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 8c1f8dca4e102b..7bdd4f8f4dbc30 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3749,11 +3749,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { // If True has a merge operand then it needs to be the same as vmerge's False, // since False will be used for the result's merge operand. if (HasTiedDest && !isImplicitDef(True->getOperand(0))) { - // The vmerge instruction must be TU. 
- // FIXME: This could be relaxed, but we need to handle the policy for the - // resulting op correctly. - if (isImplicitDef(Merge)) - return false; SDValue MergeOpTrue = True->getOperand(0); if (False != MergeOpTrue) return false; @@ -3763,9 +3758,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { // going to keep the mask from True. if (IsMasked) { assert(HasTiedDest && "Expected tied dest"); - // The vmerge instruction must be TU. - if (isImplicitDef(Merge)) - return false; // FIXME: Support mask agnostic True instruction which would have an // undef merge operand. if (Mask && !usesAllOnesMask(Mask, Glue)) diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index 183741dd1ac33d..b6921abf8fdf41 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -1144,3 +1144,37 @@ define @vpmerge_vfwsub.w_tied( %passt %b = call @llvm.vp.merge.nxv2f64( %mask, %a, %passthru, i32 %vl) ret %b } + +define @true_tied_dest_vmerge_implicit_passthru( %passthru, %x, %y, %m, i64 %avl) { +; CHECK-LABEL: true_tied_dest_vmerge_implicit_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vmacc.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %a = call @llvm.riscv.vmacc.nxv2i32.nxv2i32( %passthru, %x, %y, i64 %avl, i64 0) + %b = call @llvm.riscv.vmerge.nxv2i32.nxv2i32( + poison, + %passthru, + %a, + %m, + i64 %avl + ) + ret %b +} + +define @true_mask_vmerge_implicit_passthru( %passthru, %x, %y, %m, i64 %avl) { +; CHECK-LABEL: true_mask_vmerge_implicit_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: ret + %a = call @llvm.riscv.vadd.mask.nxv2i32.nxv2i32( %passthru, %x, %y, %m, i64 %avl, i64 0) + %b = call @llvm.riscv.vmerge.nxv2i32.nxv2i32( + poison, + %passthru, + %a, + shufflevector( insertelement( 
poison, i1 true, i32 0), poison, zeroinitializer), + i64 %avl + ) + ret %b +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll index 0322c1ab9f6310..22ed56afbd94e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll @@ -81,9 +81,8 @@ define @vmadd_vv_nxv1i8_ta( %a, @vmadd_vx_nxv1i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv1i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -170,9 +169,8 @@ define @vmadd_vv_nxv2i8_ta( %a, @vmadd_vx_nxv2i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv2i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -259,9 +257,8 @@ define @vmadd_vv_nxv4i8_ta( %a, @vmadd_vx_nxv4i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv4i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -348,9 +345,8 @@ define @vmadd_vv_nxv8i8_ta( %a, @vmadd_vx_nxv8i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv8i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli 
zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -437,9 +433,8 @@ define @vmadd_vv_nxv16i8_ta( %a, @vmadd_vx_nxv16i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv16i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vmacc.vx v10, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -526,9 +521,8 @@ define @vmadd_vv_nxv32i8_ta( %a, @vmadd_vx_nxv32i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv32i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vmacc.vx v12, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -618,9 +612,8 @@ define @vmadd_vv_nxv64i8_ta( %a, @vmadd_vx_nxv64i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv64i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vmacc.vx v16, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i8 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -707,9 +700,8 @@ define @vmadd_vv_nxv1i16_ta( %a, @vmadd_vx_nxv1i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv1i16_ta: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i16 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -796,9 +788,8 @@ define @vmadd_vv_nxv2i16_ta( %a, @vmadd_vx_nxv2i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv2i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i16 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -885,9 +876,8 @@ define @vmadd_vv_nxv4i16_ta( %a, @vmadd_vx_nxv4i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv4i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i16 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -974,9 +964,8 @@ define @vmadd_vv_nxv8i16_ta( %a, @vmadd_vx_nxv8i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv8i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vmacc.vx v10, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i16 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1063,9 +1052,8 @@ define @vmadd_vv_nxv16i16_ta( %a, @vmadd_vx_nxv16i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vmadd_vx_nxv16i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vmacc.vx v12, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i16 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1155,9 +1143,8 @@ define @vmadd_vv_nxv32i16_ta( %a, @vmadd_vx_nxv32i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv32i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vmacc.vx v16, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i16 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1244,9 +1231,8 @@ define @vmadd_vv_nxv1i32_ta( %a, @vmadd_vx_nxv1i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv1i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1333,9 +1319,8 @@ define @vmadd_vv_nxv2i32_ta( %a, @vmadd_vx_nxv2i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv2i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1422,9 +1407,8 @@ define @vmadd_vv_nxv4i32_ta( %a, @vmadd_vx_nxv4i32_ta( %a, i32 %b, 
%c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv4i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vmacc.vx v10, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v10, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1511,9 +1495,8 @@ define @vmadd_vv_nxv8i32_ta( %a, @vmadd_vx_nxv8i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv8i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vmacc.vx v12, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v12, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1603,9 +1586,8 @@ define @vmadd_vv_nxv16i32_ta( %a, @vmadd_vx_nxv16i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmadd_vx_nxv16i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vmacc.vx v16, a0, v8 -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vmadd.vx v8, a0, v16, v0.t ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1739,9 +1721,8 @@ define @vmadd_vx_nxv1i64_ta( %a, i64 %b, poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -1875,9 +1856,8 @@ define @vmadd_vx_nxv2i64_ta( %a, i64 %b, poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2011,9 +1991,8 @@ define @vmadd_vx_nxv4i64_ta( %a, i64 %b, poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, zeroinitializer @@ -2150,9 +2129,8 @@ define @vmadd_vx_nxv8i64_ta( %a, i64 %b, poison, i64 %b, i32 0 %vb = shufflevector %elt.head, poison, 
zeroinitializer From ccf357ff643c6af86bb459eba5a00f40f1dcaf22 Mon Sep 17 00:00:00 2001 From: Izaak Schroeder Date: Fri, 5 Jul 2024 22:02:04 -0700 Subject: [PATCH 51/67] [libc] Add `dlfcn.h` headers (#97772) --- libc/config/linux/aarch64/headers.txt | 1 + libc/config/linux/x86_64/headers.txt | 1 + libc/include/CMakeLists.txt | 9 ++++++++ libc/include/dlfcn.h.def | 17 +++++++++++++++ libc/include/llvm-libc-macros/CMakeLists.txt | 6 +++++ libc/include/llvm-libc-macros/dlfcn-macros.h | 23 ++++++++++++++++++++ 6 files changed, 57 insertions(+) create mode 100644 libc/include/dlfcn.h.def create mode 100644 libc/include/llvm-libc-macros/dlfcn-macros.h diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt index 7d25877cefcc83..8f898f0150905a 100644 --- a/libc/config/linux/aarch64/headers.txt +++ b/libc/config/linux/aarch64/headers.txt @@ -1,6 +1,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.assert libc.include.ctype + libc.include.dlfcn libc.include.errno libc.include.features libc.include.fenv diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 44d640b75e2bf7..df276894246c4c 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.assert libc.include.ctype libc.include.dirent + libc.include.dlfcn libc.include.errno libc.include.fcntl libc.include.features diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 3ab7817d8568b7..f8ef35078a8c46 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -51,6 +51,15 @@ add_gen_header( .llvm_libc_common_h ) +add_gen_header( + dlfcn + DEF_FILE dlfcn.h.def + GEN_HDR dlfcn.h + DEPENDS + .llvm-libc-macros.dlfcn_macros + .llvm_libc_common_h +) + add_gen_header( features DEF_FILE features.h.def diff --git a/libc/include/dlfcn.h.def b/libc/include/dlfcn.h.def new file mode 100644 index 00000000000000..31395871c6b97e --- /dev/null 
+++ b/libc/include/dlfcn.h.def @@ -0,0 +1,17 @@ +//===-- C standard library header dlfcn.h ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_DLFCN_H +#define LLVM_LIBC_DLFCN_H + +#include "__llvm-libc-common.h" +#include "llvm-libc-macros/dlfcn-macros.h" + +%%public_api() + +#endif // LLVM_LIBC_DLFCN_H diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index f6af11abd4dd76..86d6271ff88ac2 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -277,3 +277,9 @@ add_macro_header( HDR stdckdint-macros.h ) + +add_macro_header( + dlfcn_macros + HDR + dlfcn-macros.h +) diff --git a/libc/include/llvm-libc-macros/dlfcn-macros.h b/libc/include/llvm-libc-macros/dlfcn-macros.h new file mode 100644 index 00000000000000..dcd202b9ab435c --- /dev/null +++ b/libc/include/llvm-libc-macros/dlfcn-macros.h @@ -0,0 +1,23 @@ +//===-- Definition of macros from dlfcn.h ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_MACROS_DLFCN_MACROS_H +#define LLVM_LIBC_MACROS_DLFCN_MACROS_H + +#define RTLD_LAZY 0x00001 +#define RTLD_NOW 0x00002 +#define RTLD_GLOBAL 0x00100 +#define RTLD_LOCAL 0 + +// Non-standard stuff here +#define RTLD_BINDING_MASK 0x3 +#define RTLD_NOLOAD 0x00004 +#define RTLD_DEEPBIND 0x00008 +#define RTLD_NODELETE 0x01000 + +#endif // LLVM_LIBC_MACROS_DLFCN_MACROS_H From 9cb9a97e44130e17e96f994c3e594aba69ea1ad5 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 5 Jul 2024 22:56:15 -0700 Subject: [PATCH 52/67] [CMake] Use Clang to infer the target triple (#89425) When using Clang as a compiler, use Clang to normalize the triple that's used to construct path for runtime library build and install paths. This ensures that paths are consistent and avoids the issue where the build uses a different triple spelling. Differential Revision: https://reviews.llvm.org/D140925 --- clang/cmake/caches/Fuchsia-stage2.cmake | 2 +- .../cmake/Modules/CompilerRTUtils.cmake | 17 +++++++++++++---- runtimes/CMakeLists.txt | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 9892b5d58e719b..52687a2cf8ea9c 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -141,7 +141,7 @@ if(WIN32 OR LLVM_WINSYSROOT) set(RUNTIMES_${target}_CMAKE_MODULE_LINKER_FLAGS ${WINDOWS_LINK_FLAGS} CACHE STRING "") endif() -foreach(target aarch64-unknown-linux-gnu;armv7-unknown-linux-gnueabihf;i386-unknown-linux-gnu;riscv64-unknown-linux-gnu;x86_64-unknown-linux-gnu) +foreach(target aarch64-linux-gnu;armv7-linux-gnueabihf;i386-linux-gnu;riscv64-linux-gnu;x86_64-linux-gnu) if(LINUX_${target}_SYSROOT) # Set the per-target builtins options. 
list(APPEND BUILTIN_TARGETS "${target}") diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index 9c7fe64d0bd35d..cec7af929fb2b6 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -368,14 +368,23 @@ macro(construct_compiler_rt_default_triple) "Default triple for which compiler-rt runtimes will be built.") endif() - if ("${CMAKE_C_COMPILER_ID}" MATCHES "Clang") + if(CMAKE_C_COMPILER_ID MATCHES "Clang") set(option_prefix "") if (CMAKE_C_SIMULATE_ID MATCHES "MSVC") set(option_prefix "/clang:") endif() - execute_process(COMMAND ${CMAKE_C_COMPILER} ${option_prefix}--target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} ${option_prefix}-print-target-triple - OUTPUT_VARIABLE COMPILER_RT_DEFAULT_TARGET_TRIPLE - OUTPUT_STRIP_TRAILING_WHITESPACE) + set(print_target_triple ${CMAKE_C_COMPILER} ${option_prefix}--target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} ${option_prefix}-print-target-triple) + execute_process(COMMAND ${print_target_triple} + RESULT_VARIABLE result + OUTPUT_VARIABLE output + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(result EQUAL 0) + set(COMPILER_RT_DEFAULT_TARGET_TRIPLE ${output}) + else() + string(REPLACE ";" " " print_target_triple "${print_target_triple}") + # TODO(#97876): Report an error. 
+ message(WARNING "Failed to execute `${print_target_triple}` to normalize target triple.") + endif() endif() string(REPLACE "-" ";" LLVM_TARGET_TRIPLE_LIST ${COMPILER_RT_DEFAULT_TARGET_TRIPLE}) diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt index 24f48511695915..830165c799c2ab 100644 --- a/runtimes/CMakeLists.txt +++ b/runtimes/CMakeLists.txt @@ -183,6 +183,25 @@ message(STATUS "LLVM default target triple: ${LLVM_DEFAULT_TARGET_TRIPLE}") set(LLVM_TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}") +if(CMAKE_C_COMPILER_ID MATCHES "Clang") + set(option_prefix "") + if (CMAKE_C_SIMULATE_ID MATCHES "MSVC") + set(option_prefix "/clang:") + endif() + set(print_target_triple ${CMAKE_C_COMPILER} ${option_prefix}--target=${LLVM_DEFAULT_TARGET_TRIPLE} ${option_prefix}-print-target-triple) + execute_process(COMMAND ${print_target_triple} + RESULT_VARIABLE result + OUTPUT_VARIABLE output + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(result EQUAL 0) + set(LLVM_DEFAULT_TARGET_TRIPLE ${output}) + else() + string(REPLACE ";" " " print_target_triple "${print_target_triple}") + # TODO(#97876): Report an error. + message(WARNING "Failed to execute `${print_target_triple}` to normalize target triple.") + endif() +endif() + option(LLVM_INCLUDE_TESTS "Generate build targets for the runtimes unit tests." ON) option(LLVM_INCLUDE_DOCS "Generate build targets for the runtimes documentation." ON) option(LLVM_ENABLE_SPHINX "Use Sphinx to generate the runtimes documentation." 
OFF) From acd7a688fcd26ce4d72cecbddeddef788482e17e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 6 Jul 2024 16:48:32 +0900 Subject: [PATCH 53/67] [llvm] Remove redundant calls to std::unique_ptr::get (NFC) (#97778) --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +- .../Parallel/DWARFLinkerCompileUnit.cpp | 2 +- .../DWARFLinker/Parallel/DWARFLinkerImpl.cpp | 28 +++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- llvm/tools/bugpoint/BugDriver.cpp | 2 +- llvm/tools/llvm-as/llvm-as.cpp | 2 +- llvm/tools/llvm-extract/llvm-extract.cpp | 2 +- llvm/tools/llvm-link/llvm-link.cpp | 4 +-- .../verify-uselistorder.cpp | 4 +-- 9 files changed, 24 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 75cb17f357241d..5c7f6ddc948402 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2214,7 +2214,7 @@ class TargetInstrInfo : public MCInstrInfo { /// Return MIR formatter to format/parse MIR operands. Target can override /// this virtual function and return target specific MIR formatter. 
virtual const MIRFormatter *getMIRFormatter() const { - if (!Formatter.get()) + if (!Formatter) Formatter = std::make_unique(); return Formatter.get(); } diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp index 8a7313628b9926..0cb9cd5f9ea31d 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp @@ -1833,7 +1833,7 @@ TypeUnit *CompileUnit::OutputUnitVariantPtr::getAsTypeUnit() { bool CompileUnit::resolveDependenciesAndMarkLiveness( bool InterCUProcessingStarted, std::atomic &HasNewInterconnectedCUs) { - if (!Dependencies.get()) + if (!Dependencies) Dependencies.reset(new DependencyTracker(*this)); return Dependencies->resolveDependenciesAndMarkLiveness( diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp index c060f8f4c1718b..84fd0806f07050 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp @@ -107,7 +107,7 @@ Error DWARFLinkerImpl::link() { std::optional Language; for (std::unique_ptr &Context : ObjectContexts) { - if (Context->InputDWARFFile.Dwarf.get() == nullptr) { + if (Context->InputDWARFFile.Dwarf == nullptr) { Context->setOutputFormat(Context->getFormParams(), GlobalEndianness); continue; } @@ -203,13 +203,13 @@ Error DWARFLinkerImpl::link() { Pool.wait(); } - if (ArtificialTypeUnit.get() != nullptr && !ArtificialTypeUnit->getTypePool() - .getRoot() - ->getValue() - .load() - ->Children.empty()) { + if (ArtificialTypeUnit != nullptr && !ArtificialTypeUnit->getTypePool() + .getRoot() + ->getValue() + .load() + ->Children.empty()) { if (GlobalData.getTargetTriple().has_value()) - if (Error Err = ArtificialTypeUnit.get()->finishCloningAndEmit( + if (Error Err = ArtificialTypeUnit->finishCloningAndEmit( (*GlobalData.getTargetTriple()).get())) return Err; } @@ -732,7 +732,7 @@ 
Error DWARFLinkerImpl::LinkContext::cloneAndEmitDebugFrame() { if (!GlobalData.getTargetTriple().has_value()) return Error::success(); - if (InputDWARFFile.Dwarf.get() == nullptr) + if (InputDWARFFile.Dwarf == nullptr) return Error::success(); const DWARFObject &InputDWARFObj = InputDWARFFile.Dwarf->getDWARFObj(); @@ -865,7 +865,7 @@ void DWARFLinkerImpl::glueCompileUnitsAndWriteToTheOutput() { // units into the resulting file. emitCommonSectionsAndWriteCompileUnitsToTheOutput(); - if (ArtificialTypeUnit.get() != nullptr) + if (ArtificialTypeUnit != nullptr) ArtificialTypeUnit.reset(); // Write common debug sections into the resulting file. @@ -1018,7 +1018,7 @@ void DWARFLinkerImpl::forEachOutputString( }); }); - if (ArtificialTypeUnit.get() != nullptr) { + if (ArtificialTypeUnit != nullptr) { ArtificialTypeUnit->forEach([&](SectionDescriptor &OutSection) { OutSection.ListDebugStrPatch.forEach([&](DebugStrPatch &Patch) { StringHandler(StringDestinationKind::DebugStr, Patch.String); @@ -1049,7 +1049,7 @@ void DWARFLinkerImpl::forEachOutputString( void DWARFLinkerImpl::forEachObjectSectionsSet( function_ref SectionsSetHandler) { // Handle artificial type unit first. - if (ArtificialTypeUnit.get() != nullptr) + if (ArtificialTypeUnit != nullptr) SectionsSetHandler(*ArtificialTypeUnit); // Then all modules(before regular compilation units). @@ -1072,7 +1072,7 @@ void DWARFLinkerImpl::forEachObjectSectionsSet( void DWARFLinkerImpl::forEachCompileAndTypeUnit( function_ref UnitHandler) { - if (ArtificialTypeUnit.get() != nullptr) + if (ArtificialTypeUnit != nullptr) UnitHandler(ArtificialTypeUnit.get()); // Enumerate module units. 
@@ -1348,7 +1348,7 @@ void DWARFLinkerImpl::emitDWARFv5DebugNamesSection(const Triple &TargetTriple) { forEachCompileAndTypeUnit([&](DwarfUnit *CU) { bool HasRecords = false; CU->forEachAcceleratorRecord([&](const DwarfUnit::AccelInfo &Info) { - if (DebugNames.get() == nullptr) + if (DebugNames == nullptr) DebugNames = std::make_unique(); HasRecords = true; @@ -1375,7 +1375,7 @@ void DWARFLinkerImpl::emitDWARFv5DebugNamesSection(const Triple &TargetTriple) { } }); - if (DebugNames.get() != nullptr) { + if (DebugNames != nullptr) { // FIXME: we use AsmPrinter to emit accelerator sections. // It might be beneficial to directly emit accelerator data // to the raw_svector_ostream. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 84bb73cc9a796a..1e2b687854c77a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1384,7 +1384,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { getGenericInstructionUniformity(const MachineInstr &MI) const; const MIRFormatter *getMIRFormatter() const override { - if (!Formatter.get()) + if (!Formatter) Formatter = std::make_unique(); return Formatter.get(); } diff --git a/llvm/tools/bugpoint/BugDriver.cpp b/llvm/tools/bugpoint/BugDriver.cpp index 32c747fdd51665..f9b8d09501672c 100644 --- a/llvm/tools/bugpoint/BugDriver.cpp +++ b/llvm/tools/bugpoint/BugDriver.cpp @@ -142,7 +142,7 @@ bool BugDriver::addSources(const std::vector &Filenames) { for (unsigned i = 1, e = Filenames.size(); i != e; ++i) { std::unique_ptr M = parseInputFile(Filenames[i], Context); - if (!M.get()) + if (!M) return true; outs() << "Linking in input file: '" << Filenames[i] << "'\n"; diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp index 082565a9ef3d98..9b9af762676195 100644 --- a/llvm/tools/llvm-as/llvm-as.cpp +++ b/llvm/tools/llvm-as/llvm-as.cpp @@ -137,7 +137,7 @@ int main(int argc, char **argv) { nullptr, SetDataLayout); } std::unique_ptr M = 
std::move(ModuleAndIndex.Mod); - if (!M.get()) { + if (!M) { Err.print(argv[0], errs()); return 1; } diff --git a/llvm/tools/llvm-extract/llvm-extract.cpp b/llvm/tools/llvm-extract/llvm-extract.cpp index 5915f92ea05c3b..4ee644f1e29060 100644 --- a/llvm/tools/llvm-extract/llvm-extract.cpp +++ b/llvm/tools/llvm-extract/llvm-extract.cpp @@ -147,7 +147,7 @@ int main(int argc, char **argv) { SMDiagnostic Err; std::unique_ptr M = getLazyIRFileModule(InputFilename, Err, Context); - if (!M.get()) { + if (!M) { Err.print(argv[0], errs()); return 1; } diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index 24ee02be7e9ee6..b311820ce58709 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -231,7 +231,7 @@ static std::unique_ptr loadArFile(const char *Argv0, M = getLazyIRModule(MemoryBuffer::getMemBuffer(MemBuf.get(), false), ParseErr, Context); - if (!M.get()) { + if (!M) { errs() << Argv0 << ": "; WithColor::error() << " parsing member '" << ChildName << "' of archive library failed'" << ArchiveName @@ -417,7 +417,7 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L, identify_magic(Buffer->getBuffer()) == file_magic::archive ? 
loadArFile(argv0, std::move(Buffer), Context) : loadFile(argv0, std::move(Buffer), Context); - if (!M.get()) { + if (!M) { errs() << argv0 << ": "; WithColor::error() << " loading file '" << File << "'\n"; return false; diff --git a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp index 84fc777e1fdff1..f316bff1e49a51 100644 --- a/llvm/tools/verify-uselistorder/verify-uselistorder.cpp +++ b/llvm/tools/verify-uselistorder/verify-uselistorder.cpp @@ -176,7 +176,7 @@ std::unique_ptr TempFile::readAssembly(LLVMContext &Context) const { LLVM_DEBUG(dbgs() << " - read assembly\n"); SMDiagnostic Err; std::unique_ptr M = parseAssemblyFile(Filename, Err, Context); - if (!M.get()) + if (!M) Err.print("verify-uselistorder", errs()); return M; } @@ -555,7 +555,7 @@ int main(int argc, char **argv) { // Load the input module... std::unique_ptr M = parseIRFile(InputFilename, Err, Context); - if (!M.get()) { + if (!M) { Err.print(argv[0], errs()); return 1; } From 874ca08645420413e525054a47caf039bebde28b Mon Sep 17 00:00:00 2001 From: Zhikai Zeng Date: Sat, 6 Jul 2024 16:28:23 +0800 Subject: [PATCH 54/67] [Clang][ExprConstant] fix constant expression did not evaluate to integer (#97146) fixes https://github.com/llvm/llvm-project/issues/96670 The cause is that we might return a lvalue here at https://github.com/llvm/llvm-project/blob/3e53c97d33210db68188e731e93ee48dbaeeae32/clang/lib/AST/ExprConstant.cpp#L15861-L15865 This PR will make sure we return a rvalue in `FastEvaluateAsRValue`. 
--- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/AST/ExprConstant.cpp | 9 ++++++--- clang/test/SemaCXX/eval-crashes.cpp | 10 ++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index f6431a76b38de5..d60c6fbf15d56c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -967,6 +967,8 @@ Bug Fixes to C++ Support - Fixed an assertion failure about invalid conversion when calling lambda. (#GH96205). - Fixed a bug where the first operand of binary ``operator&`` would be transformed as if it was the operand of the address of operator. (#GH97483). +- Fixed an assertion failure about a constant expression which is a known integer but is not + evaluated to an integer. (#GH96670). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 374a3acf7aa26f..e0c9ef68cb4480 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -15859,9 +15859,12 @@ static bool FastEvaluateAsRValue(const Expr *Exp, Expr::EvalResult &Result, if (const auto *CE = dyn_cast(Exp)) { if (CE->hasAPValueResult()) { - Result.Val = CE->getAPValueResult(); - IsConst = true; - return true; + APValue APV = CE->getAPValueResult(); + if (!APV.isLValue()) { + Result.Val = std::move(APV); + IsConst = true; + return true; + } } // The SubExpr is usually just an IntegerLiteral. 
diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp index 017df977b26b7b..0865dafe4bf92a 100644 --- a/clang/test/SemaCXX/eval-crashes.cpp +++ b/clang/test/SemaCXX/eval-crashes.cpp @@ -61,3 +61,13 @@ struct array { array() : data(*new int[1][2]) {} }; } + +namespace GH96670 { +inline constexpr long ullNil = -1; + +template +struct Test {}; + +inline constexpr long lNil = -1; +Test c; +} From ac9d34a2eed4c4d58edf25b92e397faa76170d00 Mon Sep 17 00:00:00 2001 From: PeterChou1 Date: Sat, 6 Jul 2024 04:29:49 -0400 Subject: [PATCH 55/67] [clang-doc] revert asset bug fix (#97882) reverts https://github.com/llvm/llvm-project/pull/97540 which broke clangs standalone build --- clang-tools-extra/clang-doc/tool/CMakeLists.txt | 2 +- llvm/CMakeLists.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-doc/tool/CMakeLists.txt b/clang-tools-extra/clang-doc/tool/CMakeLists.txt index 19c17a8f3a51f7..e93a5728d6b6b0 100644 --- a/clang-tools-extra/clang-doc/tool/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/tool/CMakeLists.txt @@ -25,7 +25,7 @@ set(assets ) set(asset_dir "${CMAKE_CURRENT_SOURCE_DIR}/../assets") -set(resource_dir "${LLVM_SHARE_OUTPUT_INTDIR}/clang-doc") +set(resource_dir "${CMAKE_BINARY_DIR}/share/clang-doc") set(out_files) function(copy_files_to_dst src_dir dst_dir file) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index cbbf84ec286ed2..12618966c4adfd 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -446,7 +446,6 @@ mark_as_advanced(LLVM_EXAMPLES_INSTALL_DIR) # They are used as destination of target generators. set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin) set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}) -set(LLVM_SHARE_OUTPUT_INTDIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/share) if(WIN32 OR CYGWIN) # DLL platform -- put DLLs into bin. 
set(LLVM_SHLIB_OUTPUT_INTDIR ${LLVM_RUNTIME_OUTPUT_INTDIR}) From 5aa8ef8e9b05b714550eedbced34f67f225dbe6f Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 6 Jul 2024 11:56:11 +0200 Subject: [PATCH 56/67] [libc++][test] Updates sized deallocation tests. (#97833) In #90373 size deallocation was enabled by default. Some test were disabled to propagate the clang changes to the libc++ CI. These changes have been propagated so the test filter can be updated. --- .../support.dynamic/libcpp_deallocate.sh.cpp | 4 ++-- .../new.delete.array/sized_delete_array14.pass.cpp | 5 +++-- .../new.delete/new.delete.single/sized_delete14.pass.cpp | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp index aa3ce210e3638d..ef04ccddf1835c 100644 --- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp +++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp @@ -21,8 +21,8 @@ // GCC doesn't support the aligned-allocation flags. // XFAIL: gcc -// TODO(mordante) fix this test after updating clang in Docker -// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19 +// These compiler versions do not have proper sized deallocation support. 
+// UNSUPPORTED: clang-17, clang-18 // RUN: %{build} -faligned-allocation -fsized-deallocation // RUN: %{run} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp index 0241e7cefcac3d..dc8254680310cd 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp @@ -8,8 +8,9 @@ // test sized operator delete[] replacement. -// TODO(mordante) fix this test after updating clang in Docker -// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19 +// These compiler versions do not have proper sized deallocation support. +// UNSUPPORTED: clang-17, clang-18 + // UNSUPPORTED: sanitizer-new-delete, c++03, c++11 // XFAIL: apple-clang // XFAIL: using-built-library-before-llvm-11 diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp index 2ab691618ea46d..a03fc9f3e8266e 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp @@ -8,8 +8,9 @@ // test sized operator delete replacement. -// TODO(mordante) fix this test after updating clang in Docker -// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19 +// These compiler versions do not have proper sized deallocation support. 
+// UNSUPPORTED: clang-17, clang-18 + // UNSUPPORTED: sanitizer-new-delete, c++03, c++11 // XFAIL: apple-clang // XFAIL: using-built-library-before-llvm-11 From 126f81dd5a804636e4b66146d2d039099d9d5889 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 6 Jul 2024 12:02:23 +0200 Subject: [PATCH 57/67] [NFC][libc++] removes std:: qualification, Elements in nested namespaces in the std namespace do not use fully qualified names in libc++. This adjusts a few cases found. --- libcxx/src/tzdb.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libcxx/src/tzdb.cpp b/libcxx/src/tzdb.cpp index 2d07796c6951f6..d22de21c998198 100644 --- a/libcxx/src/tzdb.cpp +++ b/libcxx/src/tzdb.cpp @@ -677,9 +677,9 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) { __tzdb.version = chrono::__parse_version(__tzdata); chrono::__parse_tzdata(__tzdb, __rules, __tzdata); - std::ranges::sort(__tzdb.zones); - std::ranges::sort(__tzdb.links); - std::ranges::sort(__rules, {}, [](const auto& p) { return p.first; }); + ranges::sort(__tzdb.zones); + ranges::sort(__tzdb.links); + ranges::sort(__rules, {}, [](const auto& p) { return p.first; }); // There are two files with the leap second information // - leapseconds as specified by zic @@ -724,10 +724,10 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) { return __result; filesystem::path __path = "/etc/localtime"; - if (!std::filesystem::exists(__path)) + if (!filesystem::exists(__path)) std::__throw_runtime_error("tzdb: the symlink '/etc/localtime' does not exist"); - if (!std::filesystem::is_symlink(__path)) + if (!filesystem::is_symlink(__path)) std::__throw_runtime_error("tzdb: the path '/etc/localtime' is not a symlink"); filesystem::path __tz = filesystem::read_symlink(__path); From 55b95a7a75ec4568d6ee7a3199090e830619c68e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 6 Jul 2024 12:05:00 +0200 Subject: [PATCH 58/67] [mlir][Transforms][NFC] Dialect 
conversion: Reland docs improvement (#97886) #96207 was reverted but the improvements to the documentation of the dialect conversion are still useful. --- mlir/docs/DialectConversion.md | 45 ++++++++++++------- .../mlir/Transforms/DialectConversion.h | 18 +++++--- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md index 69781bb868bbf8..db26e6477d5fc7 100644 --- a/mlir/docs/DialectConversion.md +++ b/mlir/docs/DialectConversion.md @@ -246,6 +246,13 @@ depending on the situation. - An argument materialization is used when converting the type of a block argument during a [signature conversion](#region-signature-conversion). + The new block argument types are specified in a `SignatureConversion` + object. An original block argument can be converted into multiple + block arguments, which is not supported everywhere in the dialect + conversion. (E.g., adaptors support only a single replacement value for + each original value.) Therefore, an argument materialization is used to + convert potentially multiple new block arguments back into a single SSA + value. * Source Materialization @@ -259,6 +266,9 @@ depending on the situation. * When a block argument has been converted to a different type, but the original argument still has users that will remain live after the conversion process has finished. + * When a block argument has been dropped, but the argument still has + users that will remain live after the conversion process has + finished. * When the result type of an operation has been converted to a different type, but the original result still has users that will remain live after the conversion process is finished. 
@@ -328,36 +338,41 @@ class TypeConverter { registerConversion(wrapCallback(std::forward(callback))); } - /// Register a materialization function, which must be convertible to the - /// following form: - /// `Optional (OpBuilder &, T, ValueRange, Location)`, - /// where `T` is any subclass of `Type`. - /// This function is responsible for creating an operation, using the - /// OpBuilder and Location provided, that "converts" a range of values into a - /// single value of the given type `T`. It must return a Value of the - /// converted type on success, an `std::nullopt` if it failed but other - /// materialization can be attempted, and `nullptr` on unrecoverable failure. - /// It will only be called for (sub)types of `T`. - /// + /// All of the following materializations require function objects that are + /// convertible to the following form: + /// `std::optional(OpBuilder &, T, ValueRange, Location)`, + /// where `T` is any subclass of `Type`. This function is responsible for + /// creating an operation, using the OpBuilder and Location provided, that + /// "casts" a range of values into a single value of the given type `T`. It + /// must return a Value of the converted type on success, an `std::nullopt` if + /// it failed but other materialization can be attempted, and `nullptr` on + /// unrecoverable failure. It will only be called for (sub)types of `T`. + /// Materialization functions must be provided when a type conversion may + /// persist after the conversion has finished. + /// This method registers a materialization that will be called when - /// converting an illegal block argument type, to a legal type. + /// converting (potentially multiple) block arguments that were the result of + /// a signature conversion of a single block argument, to a single SSA value. 
template ::template arg_t<1>> void addArgumentMaterialization(FnT &&callback) { argumentMaterializations.emplace_back( wrapMaterialization(std::forward(callback))); } + /// This method registers a materialization that will be called when - /// converting a legal type to an illegal source type. This is used when - /// conversions to an illegal type must persist beyond the main conversion. + /// converting a legal replacement value back to an illegal source type. + /// This is used when some uses of the original, illegal value must persist + /// beyond the main conversion. template ::template arg_t<1>> void addSourceMaterialization(FnT &&callback) { sourceMaterializations.emplace_back( wrapMaterialization(std::forward(callback))); } + /// This method registers a materialization that will be called when - /// converting type from an illegal, or source, type to a legal type. + /// converting an illegal (source) value to a legal (target) type. template ::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index f83f3a3fdf9929..a22f198bdf2520 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -168,8 +168,8 @@ class TypeConverter { registerConversion(wrapCallback(std::forward(callback))); } - /// Register a materialization function, which must be convertible to the - /// following form: + /// All of the following materializations require function objects that are + /// convertible to the following form: /// `std::optional(OpBuilder &, T, ValueRange, Location)`, /// where `T` is any subclass of `Type`. This function is responsible for /// creating an operation, using the OpBuilder and Location provided, that @@ -179,26 +179,30 @@ class TypeConverter { /// unrecoverable failure. It will only be called for (sub)types of `T`. 
/// Materialization functions must be provided when a type conversion may /// persist after the conversion has finished. - /// + /// This method registers a materialization that will be called when - /// converting an illegal block argument type, to a legal type. + /// converting (potentially multiple) block arguments that were the result of + /// a signature conversion of a single block argument, to a single SSA value. template >::template arg_t<1>> void addArgumentMaterialization(FnT &&callback) { argumentMaterializations.emplace_back( wrapMaterialization(std::forward(callback))); } + /// This method registers a materialization that will be called when - /// converting a legal type to an illegal source type. This is used when - /// conversions to an illegal type must persist beyond the main conversion. + /// converting a legal replacement value back to an illegal source type. + /// This is used when some uses of the original, illegal value must persist + /// beyond the main conversion. template >::template arg_t<1>> void addSourceMaterialization(FnT &&callback) { sourceMaterializations.emplace_back( wrapMaterialization(std::forward(callback))); } + /// This method registers a materialization that will be called when - /// converting type from an illegal, or source, type to a legal type. + /// converting an illegal (source) value to a legal (target) type. template >::template arg_t<1>> void addTargetMaterialization(FnT &&callback) { From 88b26293a24bdd85fce2b2f7191cc0a5bc0cecfe Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Sat, 6 Jul 2024 13:36:02 +0300 Subject: [PATCH 59/67] [AArch64][PAC] Support BLRA* instructions in SLS Hardening pass (#97605) Make SLS Hardening pass handle BLRA* instructions the same way it handles BLR. 
The thunk names have the form __llvm_slsblr_thunk_xN for BLR thunks __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks Now there are about 1800 possible thunk names, so do not rely on linear thunk function's name lookup and parse the name instead. --- .../Target/AArch64/AArch64SLSHardening.cpp | 377 ++++++++++++------ .../speculation-hardening-sls-blra.mir | 210 ++++++++++ 2 files changed, 469 insertions(+), 118 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 00ba31b3e500dc..5e83015d72f422 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -13,6 +13,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -23,8 +24,11 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Target/TargetMachine.h" #include +#include +#include using namespace llvm; @@ -32,17 +36,107 @@ using namespace llvm; #define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass" -static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; +// Common name prefix of all thunks generated by this pass. +// +// The generic form is +// __llvm_slsblr_thunk_xN for BLR thunks +// __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks +// __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks +static constexpr StringRef CommonNamePrefix = "__llvm_slsblr_thunk_"; namespace { -// Set of inserted thunks: bitmask with bits corresponding to -// indexes in SLSBLRThunks array. 
-typedef uint32_t ThunksSet; +struct ThunkKind { + enum ThunkKindId { + ThunkBR, + ThunkBRAA, + ThunkBRAB, + ThunkBRAAZ, + ThunkBRABZ, + }; + + ThunkKindId Id; + StringRef NameInfix; + bool HasXmOperand; + bool NeedsPAuth; + + // Opcode to perform indirect jump from inside the thunk. + unsigned BROpcode; + + static const ThunkKind BR; + static const ThunkKind BRAA; + static const ThunkKind BRAB; + static const ThunkKind BRAAZ; + static const ThunkKind BRABZ; +}; + +// Set of inserted thunks. +class ThunksSet { +public: + static constexpr unsigned NumXRegisters = 32; + + // Given Xn register, returns n. + static unsigned indexOfXReg(Register Xn); + // Given n, returns Xn register. + static Register xRegByIndex(unsigned N); + + ThunksSet &operator|=(const ThunksSet &Other) { + BLRThunks |= Other.BLRThunks; + BLRAAZThunks |= Other.BLRAAZThunks; + BLRABZThunks |= Other.BLRABZThunks; + for (unsigned I = 0; I < NumXRegisters; ++I) + BLRAAThunks[I] |= Other.BLRAAThunks[I]; + for (unsigned I = 0; I < NumXRegisters; ++I) + BLRABThunks[I] |= Other.BLRABThunks[I]; + + return *this; + } + + bool get(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { + reg_bitmask_t XnBit = reg_bitmask_t(1) << indexOfXReg(Xn); + return getBitmask(Kind, Xm) & XnBit; + } + + void set(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { + reg_bitmask_t XnBit = reg_bitmask_t(1) << indexOfXReg(Xn); + getBitmask(Kind, Xm) |= XnBit; + } + +private: + typedef uint32_t reg_bitmask_t; + static_assert(NumXRegisters <= sizeof(reg_bitmask_t) * CHAR_BIT, + "Bitmask is not wide enough to hold all Xn registers"); + + // Bitmasks representing operands used, with n-th bit corresponding to Xn + // register operand. If the instruction has a second operand (Xm), an array + // of bitmasks is used, indexed by m. + // Indexes corresponding to the forbidden x16, x17 and x30 registers are + // always unset, for simplicity there are no holes. 
+ reg_bitmask_t BLRThunks = 0; + reg_bitmask_t BLRAAZThunks = 0; + reg_bitmask_t BLRABZThunks = 0; + reg_bitmask_t BLRAAThunks[NumXRegisters] = {}; + reg_bitmask_t BLRABThunks[NumXRegisters] = {}; + + reg_bitmask_t &getBitmask(ThunkKind::ThunkKindId Kind, Register Xm) { + switch (Kind) { + case ThunkKind::ThunkBR: + return BLRThunks; + case ThunkKind::ThunkBRAAZ: + return BLRAAZThunks; + case ThunkKind::ThunkBRABZ: + return BLRABZThunks; + case ThunkKind::ThunkBRAA: + return BLRAAThunks[indexOfXReg(Xm)]; + case ThunkKind::ThunkBRAB: + return BLRABThunks[indexOfXReg(Xm)]; + } + } +}; struct SLSHardeningInserter : ThunkInserter { public: - const char *getThunkPrefix() { return SLSBLRNamePrefix; } + const char *getThunkPrefix() { return CommonNamePrefix.data(); } bool mayUseThunk(const MachineFunction &MF) { ComdatThunks &= !MF.getSubtarget().hardenSlsNoComdat(); // We are inserting barriers aside from thunk calls, so @@ -68,6 +162,61 @@ struct SLSHardeningInserter : ThunkInserter { } // end anonymous namespace +const ThunkKind ThunkKind::BR = {ThunkBR, "", /*HasXmOperand=*/false, + /*NeedsPAuth=*/false, AArch64::BR}; +const ThunkKind ThunkKind::BRAA = {ThunkBRAA, "aa_", /*HasXmOperand=*/true, + /*NeedsPAuth=*/true, AArch64::BRAA}; +const ThunkKind ThunkKind::BRAB = {ThunkBRAB, "ab_", /*HasXmOperand=*/true, + /*NeedsPAuth=*/true, AArch64::BRAB}; +const ThunkKind ThunkKind::BRAAZ = {ThunkBRAAZ, "aaz_", /*HasXmOperand=*/false, + /*NeedsPAuth=*/true, AArch64::BRAAZ}; +const ThunkKind ThunkKind::BRABZ = {ThunkBRABZ, "abz_", /*HasXmOperand=*/false, + /*NeedsPAuth=*/true, AArch64::BRABZ}; + +// Returns thunk kind to emit, or nullptr if not a BLR* instruction. 
+static const ThunkKind *getThunkKind(unsigned OriginalOpcode) { + switch (OriginalOpcode) { + case AArch64::BLR: + case AArch64::BLRNoIP: + return &ThunkKind::BR; + case AArch64::BLRAA: + return &ThunkKind::BRAA; + case AArch64::BLRAB: + return &ThunkKind::BRAB; + case AArch64::BLRAAZ: + return &ThunkKind::BRAAZ; + case AArch64::BLRABZ: + return &ThunkKind::BRABZ; + } + return nullptr; +} + +static bool isBLR(const MachineInstr &MI) { + return getThunkKind(MI.getOpcode()) != nullptr; +} + +unsigned ThunksSet::indexOfXReg(Register Reg) { + assert(AArch64::GPR64RegClass.contains(Reg)); + assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR); + + // Most Xn registers have consecutive ids, except for FP and XZR. + unsigned Result = (unsigned)Reg - (unsigned)AArch64::X0; + if (Reg == AArch64::FP) + Result = 29; + else if (Reg == AArch64::XZR) + Result = 31; + + assert(Result < NumXRegisters && "Internal register numbering changed"); + assert(AArch64::GPR64RegClass.getRegister(Result).id() == Reg && + "Internal register numbering changed"); + + return Result; +} + +Register ThunksSet::xRegByIndex(unsigned N) { + return AArch64::GPR64RegClass.getRegister(N); +} + static void insertSpeculationBarrier(const AArch64Subtarget *ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -104,22 +253,6 @@ ThunksSet SLSHardeningInserter::insertThunks(MachineModuleInfo &MMI, return ExistingThunks; } -static bool isBLR(const MachineInstr &MI) { - switch (MI.getOpcode()) { - case AArch64::BLR: - case AArch64::BLRNoIP: - return true; - case AArch64::BLRAA: - case AArch64::BLRAB: - case AArch64::BLRAAZ: - case AArch64::BLRABZ: - llvm_unreachable("Currently, LLVM's code generator does not support " - "producing BLRA* instructions. 
Therefore, there's no " - "support in this pass for those instructions."); - } - return false; -} - bool SLSHardeningInserter::hardenReturnsAndBRs(MachineModuleInfo &MMI, MachineBasicBlock &MBB) { const AArch64Subtarget *ST = @@ -139,64 +272,64 @@ bool SLSHardeningInserter::hardenReturnsAndBRs(MachineModuleInfo &MMI, return Modified; } -static const unsigned NumPermittedRegs = 29; -static const struct ThunkNameAndReg { - const char* Name; - Register Reg; -} SLSBLRThunks[NumPermittedRegs] = { - {"__llvm_slsblr_thunk_x0", AArch64::X0}, - {"__llvm_slsblr_thunk_x1", AArch64::X1}, - {"__llvm_slsblr_thunk_x2", AArch64::X2}, - {"__llvm_slsblr_thunk_x3", AArch64::X3}, - {"__llvm_slsblr_thunk_x4", AArch64::X4}, - {"__llvm_slsblr_thunk_x5", AArch64::X5}, - {"__llvm_slsblr_thunk_x6", AArch64::X6}, - {"__llvm_slsblr_thunk_x7", AArch64::X7}, - {"__llvm_slsblr_thunk_x8", AArch64::X8}, - {"__llvm_slsblr_thunk_x9", AArch64::X9}, - {"__llvm_slsblr_thunk_x10", AArch64::X10}, - {"__llvm_slsblr_thunk_x11", AArch64::X11}, - {"__llvm_slsblr_thunk_x12", AArch64::X12}, - {"__llvm_slsblr_thunk_x13", AArch64::X13}, - {"__llvm_slsblr_thunk_x14", AArch64::X14}, - {"__llvm_slsblr_thunk_x15", AArch64::X15}, - // X16 and X17 are deliberately missing, as the mitigation requires those - // register to not be used in BLR. See comment in ConvertBLRToBL for more - // details. - {"__llvm_slsblr_thunk_x18", AArch64::X18}, - {"__llvm_slsblr_thunk_x19", AArch64::X19}, - {"__llvm_slsblr_thunk_x20", AArch64::X20}, - {"__llvm_slsblr_thunk_x21", AArch64::X21}, - {"__llvm_slsblr_thunk_x22", AArch64::X22}, - {"__llvm_slsblr_thunk_x23", AArch64::X23}, - {"__llvm_slsblr_thunk_x24", AArch64::X24}, - {"__llvm_slsblr_thunk_x25", AArch64::X25}, - {"__llvm_slsblr_thunk_x26", AArch64::X26}, - {"__llvm_slsblr_thunk_x27", AArch64::X27}, - {"__llvm_slsblr_thunk_x28", AArch64::X28}, - {"__llvm_slsblr_thunk_x29", AArch64::FP}, - // X30 is deliberately missing, for similar reasons as X16 and X17 are - // missing. 
- {"__llvm_slsblr_thunk_x31", AArch64::XZR}, -}; +// Currently, the longest possible thunk name is +// __llvm_slsblr_thunk_aa_xNN_xMM +// which is 31 characters (without the '\0' character). +static SmallString<32> createThunkName(const ThunkKind &Kind, Register Xn, + Register Xm) { + unsigned N = ThunksSet::indexOfXReg(Xn); + if (!Kind.HasXmOperand) + return formatv("{0}{1}x{2}", CommonNamePrefix, Kind.NameInfix, N); + + unsigned M = ThunksSet::indexOfXReg(Xm); + return formatv("{0}{1}x{2}_x{3}", CommonNamePrefix, Kind.NameInfix, N, M); +} -unsigned getThunkIndex(Register Reg) { - for (unsigned I = 0; I < NumPermittedRegs; ++I) - if (SLSBLRThunks[I].Reg == Reg) - return I; - llvm_unreachable("Unexpected register"); +static std::tuple +parseThunkName(StringRef ThunkName) { + assert(ThunkName.starts_with(CommonNamePrefix) && + "Should be filtered out by ThunkInserter"); + // Thunk name suffix, such as "x1" or "aa_x2_x3". + StringRef NameSuffix = ThunkName.drop_front(CommonNamePrefix.size()); + + // Parse thunk kind based on thunk name infix. + const ThunkKind &Kind = *StringSwitch(NameSuffix) + .StartsWith("aa_", &ThunkKind::BRAA) + .StartsWith("ab_", &ThunkKind::BRAB) + .StartsWith("aaz_", &ThunkKind::BRAAZ) + .StartsWith("abz_", &ThunkKind::BRABZ) + .Default(&ThunkKind::BR); + + auto ParseRegName = [](StringRef Name) { + unsigned N; + + assert(Name.starts_with("x") && "xN register name expected"); + bool Fail = Name.drop_front(1).getAsInteger(/*Radix=*/10, N); + assert(!Fail && N < ThunksSet::NumXRegisters && "Unexpected register"); + (void)Fail; + + return ThunksSet::xRegByIndex(N); + }; + + // For example, "x1" or "x2_x3". + StringRef RegsStr = NameSuffix.drop_front(Kind.NameInfix.size()); + StringRef XnStr, XmStr; + std::tie(XnStr, XmStr) = RegsStr.split('_'); + + // Parse register operands. + Register Xn = ParseRegName(XnStr); + Register Xm = Kind.HasXmOperand ? 
ParseRegName(XmStr) : AArch64::NoRegister; + + return std::make_tuple(std::ref(Kind), Xn, Xm); } void SLSHardeningInserter::populateThunk(MachineFunction &MF) { assert(MF.getFunction().hasComdat() == ComdatThunks && "ComdatThunks value changed since MF creation"); - // FIXME: How to better communicate Register number, rather than through - // name and lookup table? - assert(MF.getName().starts_with(getThunkPrefix())); - auto ThunkIt = llvm::find_if( - SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); }); - assert(ThunkIt != std::end(SLSBLRThunks)); - Register ThunkReg = ThunkIt->Reg; + Register Xn, Xm; + auto KindAndRegs = parseThunkName(MF.getName()); + const ThunkKind &Kind = std::get<0>(KindAndRegs); + std::tie(std::ignore, Xn, Xm) = KindAndRegs; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); @@ -218,16 +351,26 @@ void SLSHardeningInserter::populateThunk(MachineFunction &MF) { Entry->clear(); // These thunks need to consist of the following instructions: - // __llvm_slsblr_thunk_xN: - // BR xN + // __llvm_slsblr_thunk_...: + // MOV x16, xN ; BR* instructions are not compatible with "BTI c" + // ; branch target unless xN is x16 or x17. + // BR* ... 
; One of: BR x16 + // ; BRA(A|B) x16, xM + // ; BRA(A|B)Z x16 // barrierInsts - Entry->addLiveIn(ThunkReg); - // MOV X16, ThunkReg == ORR X16, XZR, ThunkReg, LSL #0 + Entry->addLiveIn(Xn); + // MOV X16, Reg == ORR X16, XZR, Reg, LSL #0 BuildMI(Entry, DebugLoc(), TII->get(AArch64::ORRXrs), AArch64::X16) .addReg(AArch64::XZR) - .addReg(ThunkReg) + .addReg(Xn) .addImm(0); - BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(AArch64::X16); + auto &Builder = + BuildMI(Entry, DebugLoc(), TII->get(Kind.BROpcode)).addReg(AArch64::X16); + if (Xm != AArch64::NoRegister) { + Entry->addLiveIn(Xm); + Builder.addReg(Xm); + } + // Make sure the thunks do not make use of the SB extension in case there is // a function somewhere that will call to it that for some reason disabled // the SB extension locally on that function, even though it's enabled for @@ -239,12 +382,14 @@ void SLSHardeningInserter::populateThunk(MachineFunction &MF) { void SLSHardeningInserter::convertBLRToBL( MachineModuleInfo &MMI, MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator MBBI, ThunksSet &Thunks) { - // Transform a BLR to a BL as follows: + // Transform a BLR* instruction (one of BLR, BLRAA/BLRAB or BLRAAZ/BLRABZ) to + // a BL to the thunk containing BR, BRAA/BRAB or BRAAZ/BRABZ, respectively. + // // Before: // |-----------------------------| // | ... | // | instI | - // | BLR xN | + // | BLR* xN or BLR* xN, xM | // | instJ | // | ... | // |-----------------------------| @@ -253,61 +398,53 @@ void SLSHardeningInserter::convertBLRToBL( // |-----------------------------| // | ... | // | instI | - // | BL __llvm_slsblr_thunk_xN | + // | BL __llvm_slsblr_thunk_... | // | instJ | // | ... 
| // |-----------------------------| // - // __llvm_slsblr_thunk_xN: + // __llvm_slsblr_thunk_...: // |-----------------------------| - // | BR xN | + // | MOV x16, xN | + // | BR* x16 or BR* x16, xM | // | barrierInsts | // |-----------------------------| // - // This function merely needs to transform BLR xN into BL - // __llvm_slsblr_thunk_xN. + // This function needs to transform BLR* instruction into BL with the correct + // thunk name and lazily create the thunk if it does not exist yet. // // Since linkers are allowed to clobber X16 and X17 on function calls, the - // above mitigation only works if the original BLR instruction was not - // BLR X16 nor BLR X17. Code generation before must make sure that no BLR - // X16|X17 was produced if the mitigation is enabled. + // above mitigation only works if the original BLR* instruction had neither + // X16 nor X17 as one of its operands. Code generation before must make sure + // that no such BLR* instruction was produced if the mitigation is enabled. MachineInstr &BLR = *MBBI; assert(isBLR(BLR)); - unsigned BLOpcode; - Register Reg; - bool RegIsKilled; - switch (BLR.getOpcode()) { - case AArch64::BLR: - case AArch64::BLRNoIP: - BLOpcode = AArch64::BL; - Reg = BLR.getOperand(0).getReg(); - assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR); - RegIsKilled = BLR.getOperand(0).isKill(); - break; - case AArch64::BLRAA: - case AArch64::BLRAB: - case AArch64::BLRAAZ: - case AArch64::BLRABZ: - llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, " - "therefore there is no need to support them for now."); - default: - llvm_unreachable("unhandled BLR"); - } + const ThunkKind &Kind = *getThunkKind(BLR.getOpcode()); + + unsigned NumRegOperands = Kind.HasXmOperand ? 2 : 1; + assert(BLR.getNumExplicitOperands() == NumRegOperands && + "Expected one or two register inputs"); + Register Xn = BLR.getOperand(0).getReg(); + Register Xm = + Kind.HasXmOperand ? 
BLR.getOperand(1).getReg() : AArch64::NoRegister; + DebugLoc DL = BLR.getDebugLoc(); MachineFunction &MF = *MBBI->getMF(); MCContext &Context = MBB.getParent()->getContext(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - unsigned ThunkIndex = getThunkIndex(Reg); - StringRef ThunkName = SLSBLRThunks[ThunkIndex].Name; + + auto ThunkName = createThunkName(Kind, Xn, Xm); MCSymbol *Sym = Context.getOrCreateSymbol(ThunkName); - if (!(Thunks & (1u << ThunkIndex))) { - Thunks |= 1u << ThunkIndex; - createThunkFunction(MMI, ThunkName, ComdatThunks); + + if (!Thunks.get(Kind.Id, Xn, Xm)) { + StringRef TargetAttrs = Kind.NeedsPAuth ? "+pauth" : ""; + Thunks.set(Kind.Id, Xn, Xm); + createThunkFunction(MMI, ThunkName, ComdatThunks, TargetAttrs); } - MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym); + MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)).addSym(Sym); // Now copy the implicit operands from BLR to BL and copy other necessary // info. @@ -338,9 +475,13 @@ void SLSHardeningInserter::convertBLRToBL( // Now copy over the implicit operands from the original BLR BL->copyImplicitOps(MF, BLR); MF.moveCallSiteInfo(&BLR, BL); - // Also add the register called in the BLR as being used in the called thunk. - BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/, - RegIsKilled /*isKill*/)); + // Also add the register operands of the original BLR* instruction + // as being used in the called thunk. 
+ for (unsigned OpIdx = 0; OpIdx < NumRegOperands; ++OpIdx) { + MachineOperand &Op = BLR.getOperand(OpIdx); + BL->addOperand(MachineOperand::CreateReg(Op.getReg(), /*isDef=*/false, + /*isImp=*/true, Op.isKill())); + } // Remove BLR instruction MBB.erase(MBBI); } diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir b/llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir new file mode 100644 index 00000000000000..06669a6d6aae25 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir @@ -0,0 +1,210 @@ +# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \ +# RUN: -start-before aarch64-sls-hardening -o - %s \ +# RUN: -asm-verbose=0 \ +# RUN: | FileCheck %s \ +# RUN: --implicit-check-not=__llvm_slsblr_thunk_aa_x5_x8 \ +# RUN: --implicit-check-not=__llvm_slsblr_thunk_ab_x5_x8 \ +# RUN: --implicit-check-not=__llvm_slsblr_thunk_aaz_x5 \ +# RUN: --implicit-check-not=__llvm_slsblr_thunk_abz_x5 + +# Pointer Authentication extension introduces more branch-with-link-to-register +# instructions for the BLR SLS hardening to handle, namely BLRAA, BLRAB, BLRAAZ +# and BLRABZ. Unlike the non-authenticating BLR instruction, BLRAA and BLRAB +# accept two register operands (almost 900 combinations for each instruction). +# For that reason, it is not practical to create all possible thunks. + +# Check that the BLR SLS hardening transforms BLRA* instructions into +# unconditional BL calls to the correct thunk functions. +# Check that only relevant thunk functions are generated. +--- | + define void @test_instructions() #0 { + entry: + ret void + } + + define void @test_no_redef() #0 { + entry: + ret void + } + + define void @test_regs() #0 { + entry: + ret void + } + + attributes #0 = { "target-features"="+pauth,+harden-sls-blr" } +... + +# Test that all BLRA* instructions are handled. 
+--- +name: test_instructions +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $lr, $x0, $x1, $x2, $x3 + + BLRAA $x0, $x1, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAB $x1, $x2, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAAZ $x2, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRABZ $x3, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + RET undef $lr +... + +# Test that the same thunk function is not created twice. +--- +name: test_no_redef +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $lr, $x0, $x1, $x2, $x3, $x4 + + ; thunk used by @test_instructions + BLRAB $x1, $x2, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + + ; thunk used by this function twice + BLRAB $x3, $x4, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAB $x3, $x4, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + + RET undef $lr +... + +# Test that all xN registers (except x16, x17, x30 and xzr) are handled. 
+--- +name: test_regs +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp + + BLRAA $x0, $x1, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x2, $x3, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x4, $x5, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x6, $x7, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x8, $x9, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x10, $x11, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x12, $x13, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x14, $x15, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + ; skipping x16 and x17 + BLRAA $x18, $x19, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x20, $x21, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x22, $x23, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x24, $x25, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x26, $x27, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + BLRAA $x28, $fp, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 + RET undef $lr +... 
+ +# CHECK-LABEL: test_instructions: +# CHECK-NEXT: .cfi_startproc +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x0_x1 +# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x1_x2 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aaz_x2 +# CHECK-NEXT: bl __llvm_slsblr_thunk_abz_x3 +# CHECK-NEXT: ret + +# CHECK-LABEL: test_no_redef: +# CHECK-NEXT: .cfi_startproc +# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x1_x2 +# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x3_x4 +# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x3_x4 +# CHECK-NEXT: ret + +# CHECK-LABEL: test_regs: +# CHECK-NEXT: .cfi_startproc +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x0_x1 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x2_x3 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x4_x5 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x6_x7 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x8_x9 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x10_x11 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x12_x13 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x14_x15 +# skipping x16 and x17 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x18_x19 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x20_x21 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x22_x23 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x24_x25 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x26_x27 +# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x28_x29 +# CHECK-NEXT: ret + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x0_x1: +# CHECK-NEXT: mov x16, x0 +# CHECK-NEXT: braa x16, x1 +# CHECK-NEXT: dsb sy +# CHECK-NEXT: isb + +# CHECK-LABEL: __llvm_slsblr_thunk_ab_x1_x2: +# CHECK-NEXT: mov x16, x1 +# CHECK-NEXT: brab x16, x2 +# CHECK-NEXT: dsb sy +# CHECK-NEXT: isb + +# CHECK-LABEL: __llvm_slsblr_thunk_aaz_x2: +# CHECK-NEXT: mov x16, x2 +# CHECK-NEXT: braaz x16 +# CHECK-NEXT: dsb sy +# CHECK-NEXT: isb + +# CHECK-LABEL: __llvm_slsblr_thunk_abz_x3: +# CHECK-NEXT: mov x16, x3 +# CHECK-NEXT: brabz x16 +# CHECK-NEXT: dsb sy +# CHECK-NEXT: isb + +# The instruction *operands* should correspond to the thunk function *name* +# (check that the name is parsed correctly when populating the thunk). 
+ +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x2_x3: +# CHECK-NEXT: mov x16, x2 +# CHECK: braa x16, x3 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x4_x5: +# CHECK-NEXT: mov x16, x4 +# CHECK: braa x16, x5 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x6_x7: +# CHECK-NEXT: mov x16, x6 +# CHECK: braa x16, x7 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x8_x9: +# CHECK-NEXT: mov x16, x8 +# CHECK: braa x16, x9 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x10_x11: +# CHECK-NEXT: mov x16, x10 +# CHECK: braa x16, x11 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x12_x13: +# CHECK-NEXT: mov x16, x12 +# CHECK: braa x16, x13 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x14_x15: +# CHECK-NEXT: mov x16, x14 +# CHECK: braa x16, x15 + +# skipping x16 and x17 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x18_x19: +# CHECK-NEXT: mov x16, x18 +# CHECK: braa x16, x19 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x20_x21: +# CHECK-NEXT: mov x16, x20 +# CHECK: braa x16, x21 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x22_x23: +# CHECK-NEXT: mov x16, x22 +# CHECK: braa x16, x23 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x24_x25: +# CHECK-NEXT: mov x16, x24 +# CHECK: braa x16, x25 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x26_x27: +# CHECK-NEXT: mov x16, x26 +# CHECK: braa x16, x27 + +# CHECK-LABEL: __llvm_slsblr_thunk_aa_x28_x29: +# CHECK-NEXT: mov x16, x28 +# CHECK: braa x16, x29 From f90bac99e19d4243ac52cf6e18aa374f9a8754cb Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Sat, 6 Jul 2024 13:55:12 +0300 Subject: [PATCH 60/67] Revert "[AArch64][PAC] Support BLRA* instructions in SLS Hardening pass" (#97887) This reverts commit 88b26293a24bdd85fce2b2f7191cc0a5bc0cecfe due to failures of CodeGen/AArch64/speculation-hardening-sls-blra.mir --- .../Target/AArch64/AArch64SLSHardening.cpp | 377 ++++++------------ .../speculation-hardening-sls-blra.mir | 210 ---------- 2 files changed, 118 insertions(+), 469 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir diff --git 
a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 5e83015d72f422..00ba31b3e500dc 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -13,7 +13,6 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -24,11 +23,8 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Target/TargetMachine.h" #include -#include -#include using namespace llvm; @@ -36,107 +32,17 @@ using namespace llvm; #define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass" -// Common name prefix of all thunks generated by this pass. -// -// The generic form is -// __llvm_slsblr_thunk_xN for BLR thunks -// __llvm_slsblr_thunk_(aaz|abz)_xN for BLRAAZ and BLRABZ thunks -// __llvm_slsblr_thunk_(aa|ab)_xN_xM for BLRAA and BLRAB thunks -static constexpr StringRef CommonNamePrefix = "__llvm_slsblr_thunk_"; +static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_"; namespace { -struct ThunkKind { - enum ThunkKindId { - ThunkBR, - ThunkBRAA, - ThunkBRAB, - ThunkBRAAZ, - ThunkBRABZ, - }; - - ThunkKindId Id; - StringRef NameInfix; - bool HasXmOperand; - bool NeedsPAuth; - - // Opcode to perform indirect jump from inside the thunk. - unsigned BROpcode; - - static const ThunkKind BR; - static const ThunkKind BRAA; - static const ThunkKind BRAB; - static const ThunkKind BRAAZ; - static const ThunkKind BRABZ; -}; - -// Set of inserted thunks. -class ThunksSet { -public: - static constexpr unsigned NumXRegisters = 32; - - // Given Xn register, returns n. - static unsigned indexOfXReg(Register Xn); - // Given n, returns Xn register. 
- static Register xRegByIndex(unsigned N); - - ThunksSet &operator|=(const ThunksSet &Other) { - BLRThunks |= Other.BLRThunks; - BLRAAZThunks |= Other.BLRAAZThunks; - BLRABZThunks |= Other.BLRABZThunks; - for (unsigned I = 0; I < NumXRegisters; ++I) - BLRAAThunks[I] |= Other.BLRAAThunks[I]; - for (unsigned I = 0; I < NumXRegisters; ++I) - BLRABThunks[I] |= Other.BLRABThunks[I]; - - return *this; - } - - bool get(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { - reg_bitmask_t XnBit = reg_bitmask_t(1) << indexOfXReg(Xn); - return getBitmask(Kind, Xm) & XnBit; - } - - void set(ThunkKind::ThunkKindId Kind, Register Xn, Register Xm) { - reg_bitmask_t XnBit = reg_bitmask_t(1) << indexOfXReg(Xn); - getBitmask(Kind, Xm) |= XnBit; - } - -private: - typedef uint32_t reg_bitmask_t; - static_assert(NumXRegisters <= sizeof(reg_bitmask_t) * CHAR_BIT, - "Bitmask is not wide enough to hold all Xn registers"); - - // Bitmasks representing operands used, with n-th bit corresponding to Xn - // register operand. If the instruction has a second operand (Xm), an array - // of bitmasks is used, indexed by m. - // Indexes corresponding to the forbidden x16, x17 and x30 registers are - // always unset, for simplicity there are no holes. - reg_bitmask_t BLRThunks = 0; - reg_bitmask_t BLRAAZThunks = 0; - reg_bitmask_t BLRABZThunks = 0; - reg_bitmask_t BLRAAThunks[NumXRegisters] = {}; - reg_bitmask_t BLRABThunks[NumXRegisters] = {}; - - reg_bitmask_t &getBitmask(ThunkKind::ThunkKindId Kind, Register Xm) { - switch (Kind) { - case ThunkKind::ThunkBR: - return BLRThunks; - case ThunkKind::ThunkBRAAZ: - return BLRAAZThunks; - case ThunkKind::ThunkBRABZ: - return BLRABZThunks; - case ThunkKind::ThunkBRAA: - return BLRAAThunks[indexOfXReg(Xm)]; - case ThunkKind::ThunkBRAB: - return BLRABThunks[indexOfXReg(Xm)]; - } - } -}; +// Set of inserted thunks: bitmask with bits corresponding to +// indexes in SLSBLRThunks array. 
+typedef uint32_t ThunksSet; struct SLSHardeningInserter : ThunkInserter { public: - const char *getThunkPrefix() { return CommonNamePrefix.data(); } + const char *getThunkPrefix() { return SLSBLRNamePrefix; } bool mayUseThunk(const MachineFunction &MF) { ComdatThunks &= !MF.getSubtarget().hardenSlsNoComdat(); // We are inserting barriers aside from thunk calls, so @@ -162,61 +68,6 @@ struct SLSHardeningInserter : ThunkInserter { } // end anonymous namespace -const ThunkKind ThunkKind::BR = {ThunkBR, "", /*HasXmOperand=*/false, - /*NeedsPAuth=*/false, AArch64::BR}; -const ThunkKind ThunkKind::BRAA = {ThunkBRAA, "aa_", /*HasXmOperand=*/true, - /*NeedsPAuth=*/true, AArch64::BRAA}; -const ThunkKind ThunkKind::BRAB = {ThunkBRAB, "ab_", /*HasXmOperand=*/true, - /*NeedsPAuth=*/true, AArch64::BRAB}; -const ThunkKind ThunkKind::BRAAZ = {ThunkBRAAZ, "aaz_", /*HasXmOperand=*/false, - /*NeedsPAuth=*/true, AArch64::BRAAZ}; -const ThunkKind ThunkKind::BRABZ = {ThunkBRABZ, "abz_", /*HasXmOperand=*/false, - /*NeedsPAuth=*/true, AArch64::BRABZ}; - -// Returns thunk kind to emit, or nullptr if not a BLR* instruction. -static const ThunkKind *getThunkKind(unsigned OriginalOpcode) { - switch (OriginalOpcode) { - case AArch64::BLR: - case AArch64::BLRNoIP: - return &ThunkKind::BR; - case AArch64::BLRAA: - return &ThunkKind::BRAA; - case AArch64::BLRAB: - return &ThunkKind::BRAB; - case AArch64::BLRAAZ: - return &ThunkKind::BRAAZ; - case AArch64::BLRABZ: - return &ThunkKind::BRABZ; - } - return nullptr; -} - -static bool isBLR(const MachineInstr &MI) { - return getThunkKind(MI.getOpcode()) != nullptr; -} - -unsigned ThunksSet::indexOfXReg(Register Reg) { - assert(AArch64::GPR64RegClass.contains(Reg)); - assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR); - - // Most Xn registers have consecutive ids, except for FP and XZR. 
- unsigned Result = (unsigned)Reg - (unsigned)AArch64::X0; - if (Reg == AArch64::FP) - Result = 29; - else if (Reg == AArch64::XZR) - Result = 31; - - assert(Result < NumXRegisters && "Internal register numbering changed"); - assert(AArch64::GPR64RegClass.getRegister(Result).id() == Reg && - "Internal register numbering changed"); - - return Result; -} - -Register ThunksSet::xRegByIndex(unsigned N) { - return AArch64::GPR64RegClass.getRegister(N); -} - static void insertSpeculationBarrier(const AArch64Subtarget *ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -253,6 +104,22 @@ ThunksSet SLSHardeningInserter::insertThunks(MachineModuleInfo &MMI, return ExistingThunks; } +static bool isBLR(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::BLR: + case AArch64::BLRNoIP: + return true; + case AArch64::BLRAA: + case AArch64::BLRAB: + case AArch64::BLRAAZ: + case AArch64::BLRABZ: + llvm_unreachable("Currently, LLVM's code generator does not support " + "producing BLRA* instructions. Therefore, there's no " + "support in this pass for those instructions."); + } + return false; +} + bool SLSHardeningInserter::hardenReturnsAndBRs(MachineModuleInfo &MMI, MachineBasicBlock &MBB) { const AArch64Subtarget *ST = @@ -272,64 +139,64 @@ bool SLSHardeningInserter::hardenReturnsAndBRs(MachineModuleInfo &MMI, return Modified; } -// Currently, the longest possible thunk name is -// __llvm_slsblr_thunk_aa_xNN_xMM -// which is 31 characters (without the '\0' character). 
-static SmallString<32> createThunkName(const ThunkKind &Kind, Register Xn, - Register Xm) { - unsigned N = ThunksSet::indexOfXReg(Xn); - if (!Kind.HasXmOperand) - return formatv("{0}{1}x{2}", CommonNamePrefix, Kind.NameInfix, N); - - unsigned M = ThunksSet::indexOfXReg(Xm); - return formatv("{0}{1}x{2}_x{3}", CommonNamePrefix, Kind.NameInfix, N, M); -} +static const unsigned NumPermittedRegs = 29; +static const struct ThunkNameAndReg { + const char* Name; + Register Reg; +} SLSBLRThunks[NumPermittedRegs] = { + {"__llvm_slsblr_thunk_x0", AArch64::X0}, + {"__llvm_slsblr_thunk_x1", AArch64::X1}, + {"__llvm_slsblr_thunk_x2", AArch64::X2}, + {"__llvm_slsblr_thunk_x3", AArch64::X3}, + {"__llvm_slsblr_thunk_x4", AArch64::X4}, + {"__llvm_slsblr_thunk_x5", AArch64::X5}, + {"__llvm_slsblr_thunk_x6", AArch64::X6}, + {"__llvm_slsblr_thunk_x7", AArch64::X7}, + {"__llvm_slsblr_thunk_x8", AArch64::X8}, + {"__llvm_slsblr_thunk_x9", AArch64::X9}, + {"__llvm_slsblr_thunk_x10", AArch64::X10}, + {"__llvm_slsblr_thunk_x11", AArch64::X11}, + {"__llvm_slsblr_thunk_x12", AArch64::X12}, + {"__llvm_slsblr_thunk_x13", AArch64::X13}, + {"__llvm_slsblr_thunk_x14", AArch64::X14}, + {"__llvm_slsblr_thunk_x15", AArch64::X15}, + // X16 and X17 are deliberately missing, as the mitigation requires those + // register to not be used in BLR. See comment in ConvertBLRToBL for more + // details. 
+ {"__llvm_slsblr_thunk_x18", AArch64::X18}, + {"__llvm_slsblr_thunk_x19", AArch64::X19}, + {"__llvm_slsblr_thunk_x20", AArch64::X20}, + {"__llvm_slsblr_thunk_x21", AArch64::X21}, + {"__llvm_slsblr_thunk_x22", AArch64::X22}, + {"__llvm_slsblr_thunk_x23", AArch64::X23}, + {"__llvm_slsblr_thunk_x24", AArch64::X24}, + {"__llvm_slsblr_thunk_x25", AArch64::X25}, + {"__llvm_slsblr_thunk_x26", AArch64::X26}, + {"__llvm_slsblr_thunk_x27", AArch64::X27}, + {"__llvm_slsblr_thunk_x28", AArch64::X28}, + {"__llvm_slsblr_thunk_x29", AArch64::FP}, + // X30 is deliberately missing, for similar reasons as X16 and X17 are + // missing. + {"__llvm_slsblr_thunk_x31", AArch64::XZR}, +}; -static std::tuple -parseThunkName(StringRef ThunkName) { - assert(ThunkName.starts_with(CommonNamePrefix) && - "Should be filtered out by ThunkInserter"); - // Thunk name suffix, such as "x1" or "aa_x2_x3". - StringRef NameSuffix = ThunkName.drop_front(CommonNamePrefix.size()); - - // Parse thunk kind based on thunk name infix. - const ThunkKind &Kind = *StringSwitch(NameSuffix) - .StartsWith("aa_", &ThunkKind::BRAA) - .StartsWith("ab_", &ThunkKind::BRAB) - .StartsWith("aaz_", &ThunkKind::BRAAZ) - .StartsWith("abz_", &ThunkKind::BRABZ) - .Default(&ThunkKind::BR); - - auto ParseRegName = [](StringRef Name) { - unsigned N; - - assert(Name.starts_with("x") && "xN register name expected"); - bool Fail = Name.drop_front(1).getAsInteger(/*Radix=*/10, N); - assert(!Fail && N < ThunksSet::NumXRegisters && "Unexpected register"); - (void)Fail; - - return ThunksSet::xRegByIndex(N); - }; - - // For example, "x1" or "x2_x3". - StringRef RegsStr = NameSuffix.drop_front(Kind.NameInfix.size()); - StringRef XnStr, XmStr; - std::tie(XnStr, XmStr) = RegsStr.split('_'); - - // Parse register operands. - Register Xn = ParseRegName(XnStr); - Register Xm = Kind.HasXmOperand ? 
ParseRegName(XmStr) : AArch64::NoRegister; - - return std::make_tuple(std::ref(Kind), Xn, Xm); +unsigned getThunkIndex(Register Reg) { + for (unsigned I = 0; I < NumPermittedRegs; ++I) + if (SLSBLRThunks[I].Reg == Reg) + return I; + llvm_unreachable("Unexpected register"); } void SLSHardeningInserter::populateThunk(MachineFunction &MF) { assert(MF.getFunction().hasComdat() == ComdatThunks && "ComdatThunks value changed since MF creation"); - Register Xn, Xm; - auto KindAndRegs = parseThunkName(MF.getName()); - const ThunkKind &Kind = std::get<0>(KindAndRegs); - std::tie(std::ignore, Xn, Xm) = KindAndRegs; + // FIXME: How to better communicate Register number, rather than through + // name and lookup table? + assert(MF.getName().starts_with(getThunkPrefix())); + auto ThunkIt = llvm::find_if( + SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); }); + assert(ThunkIt != std::end(SLSBLRThunks)); + Register ThunkReg = ThunkIt->Reg; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); @@ -351,26 +218,16 @@ void SLSHardeningInserter::populateThunk(MachineFunction &MF) { Entry->clear(); // These thunks need to consist of the following instructions: - // __llvm_slsblr_thunk_...: - // MOV x16, xN ; BR* instructions are not compatible with "BTI c" - // ; branch target unless xN is x16 or x17. - // BR* ... 
; One of: BR x16 - // ; BRA(A|B) x16, xM - // ; BRA(A|B)Z x16 + // __llvm_slsblr_thunk_xN: + // BR xN // barrierInsts - Entry->addLiveIn(Xn); - // MOV X16, Reg == ORR X16, XZR, Reg, LSL #0 + Entry->addLiveIn(ThunkReg); + // MOV X16, ThunkReg == ORR X16, XZR, ThunkReg, LSL #0 BuildMI(Entry, DebugLoc(), TII->get(AArch64::ORRXrs), AArch64::X16) .addReg(AArch64::XZR) - .addReg(Xn) + .addReg(ThunkReg) .addImm(0); - auto &Builder = - BuildMI(Entry, DebugLoc(), TII->get(Kind.BROpcode)).addReg(AArch64::X16); - if (Xm != AArch64::NoRegister) { - Entry->addLiveIn(Xm); - Builder.addReg(Xm); - } - + BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(AArch64::X16); // Make sure the thunks do not make use of the SB extension in case there is // a function somewhere that will call to it that for some reason disabled // the SB extension locally on that function, even though it's enabled for @@ -382,14 +239,12 @@ void SLSHardeningInserter::populateThunk(MachineFunction &MF) { void SLSHardeningInserter::convertBLRToBL( MachineModuleInfo &MMI, MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator MBBI, ThunksSet &Thunks) { - // Transform a BLR* instruction (one of BLR, BLRAA/BLRAB or BLRAAZ/BLRABZ) to - // a BL to the thunk containing BR, BRAA/BRAB or BRAAZ/BRABZ, respectively. - // + // Transform a BLR to a BL as follows: // Before: // |-----------------------------| // | ... | // | instI | - // | BLR* xN or BLR* xN, xM | + // | BLR xN | // | instJ | // | ... | // |-----------------------------| @@ -398,53 +253,61 @@ void SLSHardeningInserter::convertBLRToBL( // |-----------------------------| // | ... | // | instI | - // | BL __llvm_slsblr_thunk_... | + // | BL __llvm_slsblr_thunk_xN | // | instJ | // | ... 
| // |-----------------------------| // - // __llvm_slsblr_thunk_...: + // __llvm_slsblr_thunk_xN: // |-----------------------------| - // | MOV x16, xN | - // | BR* x16 or BR* x16, xM | + // | BR xN | // | barrierInsts | // |-----------------------------| // - // This function needs to transform BLR* instruction into BL with the correct - // thunk name and lazily create the thunk if it does not exist yet. + // This function merely needs to transform BLR xN into BL + // __llvm_slsblr_thunk_xN. // // Since linkers are allowed to clobber X16 and X17 on function calls, the - // above mitigation only works if the original BLR* instruction had neither - // X16 nor X17 as one of its operands. Code generation before must make sure - // that no such BLR* instruction was produced if the mitigation is enabled. + // above mitigation only works if the original BLR instruction was not + // BLR X16 nor BLR X17. Code generation before must make sure that no BLR + // X16|X17 was produced if the mitigation is enabled. MachineInstr &BLR = *MBBI; assert(isBLR(BLR)); - const ThunkKind &Kind = *getThunkKind(BLR.getOpcode()); - - unsigned NumRegOperands = Kind.HasXmOperand ? 2 : 1; - assert(BLR.getNumExplicitOperands() == NumRegOperands && - "Expected one or two register inputs"); - Register Xn = BLR.getOperand(0).getReg(); - Register Xm = - Kind.HasXmOperand ? 
BLR.getOperand(1).getReg() : AArch64::NoRegister; - + unsigned BLOpcode; + Register Reg; + bool RegIsKilled; + switch (BLR.getOpcode()) { + case AArch64::BLR: + case AArch64::BLRNoIP: + BLOpcode = AArch64::BL; + Reg = BLR.getOperand(0).getReg(); + assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR); + RegIsKilled = BLR.getOperand(0).isKill(); + break; + case AArch64::BLRAA: + case AArch64::BLRAB: + case AArch64::BLRAAZ: + case AArch64::BLRABZ: + llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, " + "therefore there is no need to support them for now."); + default: + llvm_unreachable("unhandled BLR"); + } DebugLoc DL = BLR.getDebugLoc(); MachineFunction &MF = *MBBI->getMF(); MCContext &Context = MBB.getParent()->getContext(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - - auto ThunkName = createThunkName(Kind, Xn, Xm); + unsigned ThunkIndex = getThunkIndex(Reg); + StringRef ThunkName = SLSBLRThunks[ThunkIndex].Name; MCSymbol *Sym = Context.getOrCreateSymbol(ThunkName); - - if (!Thunks.get(Kind.Id, Xn, Xm)) { - StringRef TargetAttrs = Kind.NeedsPAuth ? "+pauth" : ""; - Thunks.set(Kind.Id, Xn, Xm); - createThunkFunction(MMI, ThunkName, ComdatThunks, TargetAttrs); + if (!(Thunks & (1u << ThunkIndex))) { + Thunks |= 1u << ThunkIndex; + createThunkFunction(MMI, ThunkName, ComdatThunks); } - MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)).addSym(Sym); + MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym); // Now copy the implicit operands from BLR to BL and copy other necessary // info. @@ -475,13 +338,9 @@ void SLSHardeningInserter::convertBLRToBL( // Now copy over the implicit operands from the original BLR BL->copyImplicitOps(MF, BLR); MF.moveCallSiteInfo(&BLR, BL); - // Also add the register operands of the original BLR* instruction - // as being used in the called thunk. 
- for (unsigned OpIdx = 0; OpIdx < NumRegOperands; ++OpIdx) { - MachineOperand &Op = BLR.getOperand(OpIdx); - BL->addOperand(MachineOperand::CreateReg(Op.getReg(), /*isDef=*/false, - /*isImp=*/true, Op.isKill())); - } + // Also add the register called in the BLR as being used in the called thunk. + BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/, + RegIsKilled /*isKill*/)); // Remove BLR instruction MBB.erase(MBBI); } diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir b/llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir deleted file mode 100644 index 06669a6d6aae25..00000000000000 --- a/llvm/test/CodeGen/AArch64/speculation-hardening-sls-blra.mir +++ /dev/null @@ -1,210 +0,0 @@ -# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \ -# RUN: -start-before aarch64-sls-hardening -o - %s \ -# RUN: -asm-verbose=0 \ -# RUN: | FileCheck %s \ -# RUN: --implicit-check-not=__llvm_slsblr_thunk_aa_x5_x8 \ -# RUN: --implicit-check-not=__llvm_slsblr_thunk_ab_x5_x8 \ -# RUN: --implicit-check-not=__llvm_slsblr_thunk_aaz_x5 \ -# RUN: --implicit-check-not=__llvm_slsblr_thunk_abz_x5 - -# Pointer Authentication extension introduces more branch-with-link-to-register -# instructions for the BLR SLS hardening to handle, namely BLRAA, BLRAB, BLRAAZ -# and BLRABZ. Unlike the non-authenticating BLR instruction, BLRAA and BLRAB -# accept two register operands (almost 900 combinations for each instruction). -# For that reason, it is not practical to create all possible thunks. - -# Check that the BLR SLS hardening transforms BLRA* instructions into -# unconditional BL calls to the correct thunk functions. -# Check that only relevant thunk functions are generated. 
---- | - define void @test_instructions() #0 { - entry: - ret void - } - - define void @test_no_redef() #0 { - entry: - ret void - } - - define void @test_regs() #0 { - entry: - ret void - } - - attributes #0 = { "target-features"="+pauth,+harden-sls-blr" } -... - -# Test that all BLRA* instructions are handled. ---- -name: test_instructions -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $lr, $x0, $x1, $x2, $x3 - - BLRAA $x0, $x1, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAB $x1, $x2, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAAZ $x2, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRABZ $x3, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - RET undef $lr -... - -# Test that the same thunk function is not created twice. ---- -name: test_no_redef -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $lr, $x0, $x1, $x2, $x3, $x4 - - ; thunk used by @test_instructions - BLRAB $x1, $x2, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - - ; thunk used by this function twice - BLRAB $x3, $x4, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAB $x3, $x4, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - - RET undef $lr -... - -# Test that all xN registers (except x16, x17, x30 and xzr) are handled. 
---- -name: test_regs -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $fp - - BLRAA $x0, $x1, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x2, $x3, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x4, $x5, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x6, $x7, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x8, $x9, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x10, $x11, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x12, $x13, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x14, $x15, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - ; skipping x16 and x17 - BLRAA $x18, $x19, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x20, $x21, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x22, $x23, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x24, $x25, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x26, $x27, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - BLRAA $x28, $fp, implicit-def $lr, implicit $sp, implicit-def $sp, implicit-def $w0 - RET undef $lr -... 
- -# CHECK-LABEL: test_instructions: -# CHECK-NEXT: .cfi_startproc -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x0_x1 -# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x1_x2 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aaz_x2 -# CHECK-NEXT: bl __llvm_slsblr_thunk_abz_x3 -# CHECK-NEXT: ret - -# CHECK-LABEL: test_no_redef: -# CHECK-NEXT: .cfi_startproc -# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x1_x2 -# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x3_x4 -# CHECK-NEXT: bl __llvm_slsblr_thunk_ab_x3_x4 -# CHECK-NEXT: ret - -# CHECK-LABEL: test_regs: -# CHECK-NEXT: .cfi_startproc -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x0_x1 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x2_x3 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x4_x5 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x6_x7 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x8_x9 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x10_x11 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x12_x13 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x14_x15 -# skipping x16 and x17 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x18_x19 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x20_x21 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x22_x23 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x24_x25 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x26_x27 -# CHECK-NEXT: bl __llvm_slsblr_thunk_aa_x28_x29 -# CHECK-NEXT: ret - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x0_x1: -# CHECK-NEXT: mov x16, x0 -# CHECK-NEXT: braa x16, x1 -# CHECK-NEXT: dsb sy -# CHECK-NEXT: isb - -# CHECK-LABEL: __llvm_slsblr_thunk_ab_x1_x2: -# CHECK-NEXT: mov x16, x1 -# CHECK-NEXT: brab x16, x2 -# CHECK-NEXT: dsb sy -# CHECK-NEXT: isb - -# CHECK-LABEL: __llvm_slsblr_thunk_aaz_x2: -# CHECK-NEXT: mov x16, x2 -# CHECK-NEXT: braaz x16 -# CHECK-NEXT: dsb sy -# CHECK-NEXT: isb - -# CHECK-LABEL: __llvm_slsblr_thunk_abz_x3: -# CHECK-NEXT: mov x16, x3 -# CHECK-NEXT: brabz x16 -# CHECK-NEXT: dsb sy -# CHECK-NEXT: isb - -# The instruction *operands* should correspond to the thunk function *name* -# (check that the name is parsed correctly when populating the thunk). 
- -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x2_x3: -# CHECK-NEXT: mov x16, x2 -# CHECK: braa x16, x3 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x4_x5: -# CHECK-NEXT: mov x16, x4 -# CHECK: braa x16, x5 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x6_x7: -# CHECK-NEXT: mov x16, x6 -# CHECK: braa x16, x7 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x8_x9: -# CHECK-NEXT: mov x16, x8 -# CHECK: braa x16, x9 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x10_x11: -# CHECK-NEXT: mov x16, x10 -# CHECK: braa x16, x11 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x12_x13: -# CHECK-NEXT: mov x16, x12 -# CHECK: braa x16, x13 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x14_x15: -# CHECK-NEXT: mov x16, x14 -# CHECK: braa x16, x15 - -# skipping x16 and x17 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x18_x19: -# CHECK-NEXT: mov x16, x18 -# CHECK: braa x16, x19 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x20_x21: -# CHECK-NEXT: mov x16, x20 -# CHECK: braa x16, x21 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x22_x23: -# CHECK-NEXT: mov x16, x22 -# CHECK: braa x16, x23 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x24_x25: -# CHECK-NEXT: mov x16, x24 -# CHECK: braa x16, x25 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x26_x27: -# CHECK-NEXT: mov x16, x26 -# CHECK: braa x16, x27 - -# CHECK-LABEL: __llvm_slsblr_thunk_aa_x28_x29: -# CHECK-NEXT: mov x16, x28 -# CHECK: braa x16, x29 From be3a8b8d94608746b22cb0cf3fc03af33b7d8648 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sat, 6 Jul 2024 15:52:19 +0400 Subject: [PATCH 61/67] [clang] Add C++26 diagnostics to compatibility diagnostic groups (#97806) This patch adds `CXXPre26Compat` and `CXXPre26CompatPedantic` groups (which are concerned with new features not available in older language modes) to `CXX98Compat`, etc. This way, if user has `-Wc++20-compat` and they use pack indexing, they will be warned. Ideally this should have been done when C++26 groups were created, but we shipped two releases of Clang since then. 
--- clang/docs/ReleaseNotes.rst | 9 +++++- clang/include/clang/Basic/DiagnosticGroups.td | 32 +++++++++++++------ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d60c6fbf15d56c..39187078d5786a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -439,8 +439,11 @@ New Compiler Flags Matches MSVC behaviour by defining ``__STDC__`` to ``1`` when MSVC compatibility mode is used. It has no effect for C++ code. +- ``-Wc++23-compat`` group was added to help migrating existing codebases + to C++23. + - ``-Wc++2c-compat`` group was added to help migrating existing codebases - to C++26. + to upcoming C++26. Deprecated Compiler Flags ------------------------- @@ -480,6 +483,10 @@ Modified Compiler Flags evaluating to ``true`` and an empty body such as ``while(1);``) are considered infinite, even when the ``-ffinite-loop`` flag is set. +- Diagnostics groups about compatibility with a particular C++ Standard version + now include diagnostics about C++26 features that are not present in older + versions. + Removed Compiler Flags ------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 1b25cf36dd4f81..2241f8481484e2 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -344,7 +344,8 @@ def CXX98Compat : DiagGroup<"c++98-compat", CXXPre14Compat, CXXPre17Compat, CXXPre20Compat, - CXXPre23Compat]>; + CXXPre23Compat, + CXXPre26Compat]>; // Warnings for C++11 features which are Extensions in C++98 mode. 
def CXX98CompatPedantic : DiagGroup<"c++98-compat-pedantic", [CXX98Compat, @@ -353,7 +354,8 @@ def CXX98CompatPedantic : DiagGroup<"c++98-compat-pedantic", CXXPre14CompatPedantic, CXXPre17CompatPedantic, CXXPre20CompatPedantic, - CXXPre23CompatPedantic]>; + CXXPre23CompatPedantic, + CXXPre26CompatPedantic]>; def CXX11NarrowingConstReference : DiagGroup<"c++11-narrowing-const-reference">; def CXX11Narrowing : DiagGroup<"c++11-narrowing", [CXX11NarrowingConstReference]>; @@ -384,42 +386,52 @@ def CXX11Compat : DiagGroup<"c++11-compat", CXXPre14Compat, CXXPre17Compat, CXXPre20Compat, - CXXPre23Compat]>; + CXXPre23Compat, + CXXPre26Compat]>; def : DiagGroup<"c++0x-compat", [CXX11Compat]>; def CXX11CompatPedantic : DiagGroup<"c++11-compat-pedantic", [CXX11Compat, CXXPre14CompatPedantic, CXXPre17CompatPedantic, CXXPre20CompatPedantic, - CXXPre23CompatPedantic]>; + CXXPre23CompatPedantic, + CXXPre26CompatPedantic]>; def CXX14Compat : DiagGroup<"c++14-compat", [CXXPre17Compat, CXXPre20Compat, - CXXPre23Compat]>; + CXXPre23Compat, + CXXPre26Compat]>; def CXX14CompatPedantic : DiagGroup<"c++14-compat-pedantic", [CXX14Compat, CXXPre17CompatPedantic, CXXPre20CompatPedantic, - CXXPre23CompatPedantic]>; + CXXPre23CompatPedantic, + CXXPre26CompatPedantic]>; def CXX17Compat : DiagGroup<"c++17-compat", [DeprecatedRegister, DeprecatedIncrementBool, CXX17CompatMangling, CXXPre20Compat, - CXXPre23Compat]>; + CXXPre23Compat, + CXXPre26Compat]>; def CXX17CompatPedantic : DiagGroup<"c++17-compat-pedantic", [CXX17Compat, CXXPre20CompatPedantic, - CXXPre23CompatPedantic]>; + CXXPre23CompatPedantic, + CXXPre26CompatPedantic]>; def : DiagGroup<"c++1z-compat", [CXX17Compat]>; -def CXX20Compat : DiagGroup<"c++20-compat", [CXXPre23Compat]>; +def CXX20Compat : DiagGroup<"c++20-compat", [CXXPre23Compat, + CXXPre26Compat]>; def CXX20CompatPedantic : DiagGroup<"c++20-compat-pedantic", [CXX20Compat, - CXXPre23CompatPedantic]>; + CXXPre23CompatPedantic, + CXXPre26CompatPedantic]>; def : 
DiagGroup<"c++2a-compat", [CXX20Compat]>; def : DiagGroup<"c++2a-compat-pedantic", [CXX20CompatPedantic]>; +def CXX23Compat : DiagGroup<"c++23-compat", [CXXPre26Compat]>; + def CXX26Compat : DiagGroup<"c++2c-compat", [DeleteIncomplete]>; def ExitTimeDestructors : DiagGroup<"exit-time-destructors">; From 4a9aef683df895934c26591404692d41a687b005 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 6 Jul 2024 13:16:07 +0100 Subject: [PATCH 62/67] DynamicAPInt: optimize size of structure (#97831) Reuse the APInt::BitWidth to eliminate DynamicAPInt::HoldsLarge, cutting the size of DynamicAPInt by four bytes. This is implemented by making DynamicAPInt a friend of SlowDynamicAPInt and APInt, so it can directly access SlowDynamicAPInt::Val and APInt::BitWidth. We get a speedup of 4% with this patch. --- llvm/include/llvm/ADT/APInt.h | 4 +++ llvm/include/llvm/ADT/DynamicAPInt.h | 32 ++++++++++++++++-------- llvm/include/llvm/ADT/SlowDynamicAPInt.h | 7 ++++++ llvm/lib/Support/DynamicAPInt.cpp | 8 ++++++ 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 6cfa6ec6650842..108df7e0eaeaa3 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -30,6 +30,7 @@ class StringRef; class hash_code; class raw_ostream; struct Align; +class DynamicAPInt; template class SmallVectorImpl; template class ArrayRef; @@ -1895,6 +1896,9 @@ class [[nodiscard]] APInt { friend struct DenseMapInfo; friend class APSInt; + // Make DynamicAPInt a friend so it can access BitWidth directly. + friend DynamicAPInt; + /// This constructor is used only internally for speed of construction of /// temporaries. It is unsafe since it takes ownership of the pointer, so it /// is not public. 
diff --git a/llvm/include/llvm/ADT/DynamicAPInt.h b/llvm/include/llvm/ADT/DynamicAPInt.h index f312f776df971f..2f11f91f81e3bc 100644 --- a/llvm/include/llvm/ADT/DynamicAPInt.h +++ b/llvm/include/llvm/ADT/DynamicAPInt.h @@ -35,21 +35,23 @@ namespace llvm { /// We always_inline all operations; removing these results in a 1.5x /// performance slowdown. /// -/// When HoldsLarge is true, a SlowMPInt is held in the union. If it is false, -/// the int64_t is held. Using std::variant instead would lead to significantly -/// worse performance. +/// When isLarge returns true, a SlowMPInt is held in the union. If isSmall +/// returns true, the int64_t is held. We don't have a separate field for +/// indicating this, and instead "steal" memory from ValLarge when it is not in +/// use because we know that the memory layout of APInt is such that BitWidth +/// doesn't overlap with ValSmall (see static_assert_layout). Using std::variant +/// instead would lead to significantly worse performance. class DynamicAPInt { union { int64_t ValSmall; detail::SlowDynamicAPInt ValLarge; }; - unsigned HoldsLarge; LLVM_ATTRIBUTE_ALWAYS_INLINE void initSmall(int64_t O) { if (LLVM_UNLIKELY(isLarge())) ValLarge.detail::SlowDynamicAPInt::~SlowDynamicAPInt(); ValSmall = O; - HoldsLarge = false; + ValLarge.Val.BitWidth = 0; } LLVM_ATTRIBUTE_ALWAYS_INLINE void initLarge(const detail::SlowDynamicAPInt &O) { @@ -66,14 +68,17 @@ class DynamicAPInt { // and leak it. 
ValLarge = O; } - HoldsLarge = true; } LLVM_ATTRIBUTE_ALWAYS_INLINE explicit DynamicAPInt( const detail::SlowDynamicAPInt &Val) - : ValLarge(Val), HoldsLarge(true) {} - LLVM_ATTRIBUTE_ALWAYS_INLINE bool isSmall() const { return !HoldsLarge; } - LLVM_ATTRIBUTE_ALWAYS_INLINE bool isLarge() const { return HoldsLarge; } + : ValLarge(Val) {} + LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr bool isSmall() const { + return ValLarge.Val.BitWidth == 0; + } + LLVM_ATTRIBUTE_ALWAYS_INLINE constexpr bool isLarge() const { + return !isSmall(); + } /// Get the stored value. For getSmall/Large, /// the stored value should be small/large. LLVM_ATTRIBUTE_ALWAYS_INLINE int64_t getSmall() const { @@ -105,14 +110,17 @@ class DynamicAPInt { public: LLVM_ATTRIBUTE_ALWAYS_INLINE explicit DynamicAPInt(int64_t Val) - : ValSmall(Val), HoldsLarge(false) {} + : ValSmall(Val) { + ValLarge.Val.BitWidth = 0; + } LLVM_ATTRIBUTE_ALWAYS_INLINE DynamicAPInt() : DynamicAPInt(0) {} LLVM_ATTRIBUTE_ALWAYS_INLINE ~DynamicAPInt() { if (LLVM_UNLIKELY(isLarge())) ValLarge.detail::SlowDynamicAPInt::~SlowDynamicAPInt(); } LLVM_ATTRIBUTE_ALWAYS_INLINE DynamicAPInt(const DynamicAPInt &O) - : ValSmall(O.ValSmall), HoldsLarge(false) { + : ValSmall(O.ValSmall) { + ValLarge.Val.BitWidth = 0; if (LLVM_UNLIKELY(O.isLarge())) initLarge(O.ValLarge); } @@ -203,6 +211,8 @@ class DynamicAPInt { friend hash_code hash_value(const DynamicAPInt &x); // NOLINT + void static_assert_layout(); // NOLINT + raw_ostream &print(raw_ostream &OS) const; LLVM_DUMP_METHOD void dump() const; }; diff --git a/llvm/include/llvm/ADT/SlowDynamicAPInt.h b/llvm/include/llvm/ADT/SlowDynamicAPInt.h index 1678bda046fecc..cda5f39eb98c35 100644 --- a/llvm/include/llvm/ADT/SlowDynamicAPInt.h +++ b/llvm/include/llvm/ADT/SlowDynamicAPInt.h @@ -21,6 +21,10 @@ #include "llvm/ADT/APInt.h" #include "llvm/Support/raw_ostream.h" +namespace llvm { +class DynamicAPInt; +} // namespace llvm + namespace llvm::detail { /// A simple class providing dynamic 
arbitrary-precision arithmetic. Internally, /// it stores an APInt, whose width is doubled whenever an overflow occurs at a @@ -69,6 +73,9 @@ class SlowDynamicAPInt { /// Overload to compute a hash_code for a SlowDynamicAPInt value. friend hash_code hash_value(const SlowDynamicAPInt &X); // NOLINT + // Make DynamicAPInt a friend so it can access Val directly. + friend DynamicAPInt; + unsigned getBitWidth() const { return Val.getBitWidth(); } void print(raw_ostream &OS) const; diff --git a/llvm/lib/Support/DynamicAPInt.cpp b/llvm/lib/Support/DynamicAPInt.cpp index cae034cf6da6f6..bfcb97e0cc96a0 100644 --- a/llvm/lib/Support/DynamicAPInt.cpp +++ b/llvm/lib/Support/DynamicAPInt.cpp @@ -18,6 +18,14 @@ hash_code llvm::hash_value(const DynamicAPInt &X) { return detail::hash_value(X.getLarge()); } +void DynamicAPInt::static_assert_layout() { + constexpr size_t ValLargeOffset = + offsetof(DynamicAPInt, ValLarge.Val.BitWidth); + constexpr size_t ValSmallOffset = offsetof(DynamicAPInt, ValSmall); + constexpr size_t ValSmallSize = sizeof(ValSmall); + static_assert(ValLargeOffset >= ValSmallOffset + ValSmallSize); +} + raw_ostream &DynamicAPInt::print(raw_ostream &OS) const { if (isSmall()) return OS << ValSmall; From de88b2cb16af4bba659d0bb2ddf10bda681ec84d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Spaits?= Date: Sat, 6 Jul 2024 15:22:51 +0200 Subject: [PATCH 63/67] [Clang] Simplify release notes and remove irrelevant comment (#96407) As discussed before with @cor3ntin before (https://github.com/llvm/llvm-project/pull/94752) here is the simplification of the release note written for the previously mentioned PR and the removal of a comment that is no longer useful. (Sorry for creating this PR this late.) 
Co-authored-by: Gabor Spaits --- clang/docs/ReleaseNotes.rst | 3 --- clang/lib/Sema/SemaInit.cpp | 3 --- 2 files changed, 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 39187078d5786a..edc932efd9416b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -236,9 +236,6 @@ C++20 Feature Support ```` from libstdc++ to work correctly with Clang. - User defined constructors are allowed for copy-list-initialization with CTAD. - The example code for deduction guides for std::map in - (`cppreference `_) - will now work. (#GH62925). C++23 Feature Support diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 64e43ded0961ea..41753a1661acee 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9811,9 +9811,6 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( // C++ [over.best.ics]p4: // When [...] the constructor [...] is a candidate by // - [over.match.copy] (in all cases) - // FIXME: The "second phase of [over.match.list] case can also - // theoretically happen here, but it's not clear whether we can - // ever have a parameter of the right type. if (TD) { SmallVector TmpInits; for (Expr *E : Inits) From f8834ed24bf11d19c96c49d42e77d4408af91fd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hendrik=20H=C3=BCbner?= <117831077+HendrikHuebner@users.noreply.github.com> Date: Sat, 6 Jul 2024 15:24:05 +0200 Subject: [PATCH 64/67] [libc][C23][math] Implement cospif function correctly rounded for all rounding modes (#97464) I also fixed a comment in sinpif.cpp in the first commit. Should this be included in this PR? All tests were passed, including the exhaustive test. 
CC: @lntue --- libc/config/darwin/arm/entrypoints.txt | 1 + libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/src/math/CMakeLists.txt | 1 + libc/src/math/cospif.h | 18 +++ libc/src/math/generic/CMakeLists.txt | 17 +++ libc/src/math/generic/cospif.cpp | 96 ++++++++++++++ libc/src/math/generic/sinpif.cpp | 6 +- libc/test/src/math/CMakeLists.txt | 16 +++ libc/test/src/math/cospif_test.cpp | 120 ++++++++++++++++++ libc/test/src/math/exhaustive/CMakeLists.txt | 16 +++ libc/test/src/math/exhaustive/cospif_test.cpp | 33 +++++ libc/test/src/math/smoke/CMakeLists.txt | 11 ++ libc/test/src/math/smoke/cospif_test.cpp | 34 +++++ libc/utils/MPFRWrapper/MPFRUtils.cpp | 35 +++++ libc/utils/MPFRWrapper/MPFRUtils.h | 1 + 18 files changed, 406 insertions(+), 4 deletions(-) create mode 100644 libc/src/math/cospif.h create mode 100644 libc/src/math/generic/cospif.cpp create mode 100644 libc/test/src/math/cospif_test.cpp create mode 100644 libc/test/src/math/exhaustive/cospif_test.cpp create mode 100644 libc/test/src/math/smoke/cospif_test.cpp diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index feb106cc2cb632..9eb7d8960c6e49 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -132,6 +132,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.coshf libc.src.math.cos libc.src.math.cosf + libc.src.math.cospif libc.src.math.erff libc.src.math.exp libc.src.math.expf diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index f7e08ec151d077..a6aeb0685bca48 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -346,6 +346,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cos libc.src.math.cosf libc.src.math.coshf + libc.src.math.cospif libc.src.math.erff libc.src.math.exp 
libc.src.math.exp10 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 5b0d591557944d..2b7e3d0256fc3a 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -354,6 +354,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cos libc.src.math.cosf libc.src.math.coshf + libc.src.math.cospif libc.src.math.erff libc.src.math.exp libc.src.math.exp10 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 09f04fb31dfd82..271763d8fe869a 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -371,6 +371,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cos libc.src.math.cosf libc.src.math.coshf + libc.src.math.cospif libc.src.math.erff libc.src.math.exp libc.src.math.exp10 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 5c4464b552cc66..422acfcdd4cec2 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -274,7 +274,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | cosh | |check| | | | | | 7.12.5.4 | F.10.2.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| cospi | | | | | | 7.12.4.12 | F.10.1.12 | +| cospi | |check| | | | | | 7.12.4.12 | F.10.1.12 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dsqrt | N/A | N/A | | N/A | | 7.12.14.6 | F.10.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git 
a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 0983d268bd4b8c..e21011f37b53c4 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -81,6 +81,7 @@ add_math_entrypoint_object(cos) add_math_entrypoint_object(cosf) add_math_entrypoint_object(cosh) add_math_entrypoint_object(coshf) +add_math_entrypoint_object(cospif) add_math_entrypoint_object(erf) add_math_entrypoint_object(erff) diff --git a/libc/src/math/cospif.h b/libc/src/math/cospif.h new file mode 100644 index 00000000000000..50935bc33e59dd --- /dev/null +++ b/libc/src/math/cospif.h @@ -0,0 +1,18 @@ +//===-- Implementation header for cospif ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_COSPIF_H +#define LLVM_LIBC_SRC_MATH_COSPIF_H + +namespace LIBC_NAMESPACE { + +float cospif(float x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_COSPIF_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index ff84a434cc2029..fc7d6996af1e6c 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -217,6 +217,23 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + cospif + SRCS + cospif.cpp + HDRS + ../cospif.h + DEPENDS + .sincosf_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.fma + libc.src.__support.FPUtil.multiply_add + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( sin SRCS diff --git a/libc/src/math/generic/cospif.cpp b/libc/src/math/generic/cospif.cpp new file mode 100644 index 00000000000000..713619430fe4be --- /dev/null +++ b/libc/src/math/generic/cospif.cpp 
@@ -0,0 +1,96 @@ +//===-- Single-precision cospi function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cospif.h" +#include "sincosf_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/common.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, cospif, (float x)) { + using FPBits = typename fputil::FPBits; + + FPBits xbits(x); + Sign xsign = xbits.sign(); + xbits.set_sign(Sign::POS); + + uint32_t x_abs = xbits.uintval(); + double xd = static_cast(xbits.get_val()); + + // Range reduction: + // For |x| > 1/32, we perform range reduction as follows: + // Find k and y such that: + // x = (k + y) * 1/32 + // k is an integer + // |y| < 0.5 + // + // This is done by performing: + // k = round(x * 32) + // y = x * 32 - k + // + // Once k and y are computed, we then deduce the answer by the cosine of sum + // formula: + // cospi(x) = cos((k + y)*pi/32) + // = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32) + // The values of sin(k*pi/32) and cos(k*pi/32) for k = 0..63 are precomputed + // and stored using a vector of 32 doubles. Sin(y*pi/32) and cos(y*pi/32) are + // computed using degree-7 and degree-6 minimax polynomials generated by + // Sollya respectively. 
+ + // The exhaustive test passes for smaller values + if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) { + +#if defined(LIBC_TARGET_CPU_HAS_FMA) + return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f); +#else + return static_cast(fputil::multiply_add(xd, -0x1.0p-25, 1.0)); +#endif // LIBC_TARGET_CPU_HAS_FMA + } + + // Numbers greater or equal to 2^23 are always integers or NaN + if (LIBC_UNLIKELY(x_abs >= 0x4B00'0000)) { + + if (LIBC_UNLIKELY(x_abs < 0x4B80'0000)) { + return (x_abs & 0x1) ? -1.0f : 1.0f; + } + + // x is inf or nan. + if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (x_abs == 0x7f80'0000U) { + fputil::set_errno_if_required(EDOM); + fputil::raise_except_if_required(FE_INVALID); + } + return x + FPBits::quiet_nan().get_val(); + } + + return 1.0f; + } + + // Combine the results with the sine of sum formula: + // cos(pi * x) = cos((k + y)*pi/32) + // = cos(y*pi/32) * cos(k*pi/32) - sin(y*pi/32) * sin(k*pi/32) + // = (cosm1_y + 1) * cos_k - sin_y * sin_k + // = (cosm1_y * cos_k + cos_k) - sin_y * sin_k + double sin_k, cos_k, sin_y, cosm1_y; + + sincospif_eval(xd, sin_k, cos_k, sin_y, cosm1_y); + + if (LIBC_UNLIKELY(sin_y == 0 && cos_k == 0)) { + return FPBits::zero(xsign).get_val(); + } + + return static_cast(fputil::multiply_add( + sin_y, -sin_k, fputil::multiply_add(cosm1_y, cos_k, cos_k))); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/sinpif.cpp b/libc/src/math/generic/sinpif.cpp index 662263c9fc43ec..05bdad3ab4d0e3 100644 --- a/libc/src/math/generic/sinpif.cpp +++ b/libc/src/math/generic/sinpif.cpp @@ -26,13 +26,13 @@ LLVM_LIBC_FUNCTION(float, sinpif, (float x)) { double xd = static_cast(x); // Range reduction: - // For |x| > pi/32, we perform range reduction as follows: + // For |x| > 1/32, we perform range reduction as follows: // Find k and y such that: // x = (k + y) * 1/32 // k is an integer // |y| < 0.5 - // For small range (|x| < 2^45 when FMA instructions are available, 2^22 - // otherwise), this is done by 
performing: + // + // This is done by performing: // k = round(x * 32) // y = x * 32 - k // diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index f919634ae108c5..35ca97b5de8af0 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -28,6 +28,22 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + cospif_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + cospif_test.cpp + HDRS + sdcomp26094.h + DEPENDS + libc.src.errno.errno + libc.src.math.cospif + libc.src.__support.CPP.array + libc.src.__support.FPUtil.fp_bits +) + add_fp_unittest( sinf_test NEED_MPFR diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp new file mode 100644 index 00000000000000..8a39957d1a274e --- /dev/null +++ b/libc/test/src/math/cospif_test.cpp @@ -0,0 +1,120 @@ +//===-- Unittests for cospif ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/errno/libc_errno.h" +#include "src/math/cospif.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/src/math/sdcomp26094.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest; + +using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +TEST_F(LlvmLibcCospifTest, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::cospif(0.0f)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::cospif(-0.0f)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); +} + +TEST_F(LlvmLibcCospifTest, SpecificBitPatterns) { + constexpr int N = 36; + constexpr uint32_t INPUTS[N] = { + 0x3f06'0a92U, // x = pi/6 + 0x3f3a'dc51U, // x = 0x1.75b8a2p-1f + 0x3f49'0fdbU, // x = pi/4 + 0x3f86'0a92U, // x = pi/3 + 0x3fa7'832aU, // x = 0x1.4f0654p+0f + 0x3fc9'0fdbU, // x = pi/2 + 0x4017'1973U, // x = 0x1.2e32e6p+1f + 0x4049'0fdbU, // x = pi + 0x4096'cbe4U, // x = 0x1.2d97c8p+2f + 0x40c9'0fdbU, // x = 2*pi + 0x433b'7490U, // x = 0x1.76e92p+7f + 0x437c'e5f1U, // x = 0x1.f9cbe2p+7f + 0x4619'9998U, // x = 0x1.33333p+13f + 0x474d'246fU, // x = 0x1.9a48dep+15f + 0x4afd'ece4U, // x = 0x1.fbd9c8p+22f + 0x4c23'32e9U, // x = 0x1.4665d2p+25f + 0x50a3'e87fU, // x = 0x1.47d0fep+34f + 0x5239'47f6U, // x = 0x1.728fecp+37f + 0x53b1'46a6U, // x = 0x1.628d4cp+40f + 0x55ca'fb2aU, // x = 0x1.95f654p+44f + 0x588e'f060U, // x = 0x1.1de0cp+50f + 0x5c07'bcd0U, // x = 0x1.0f79ap+57f + 0x5ebc'fddeU, // x = 0x1.79fbbcp+62f + 0x5fa6'eba7U, // x = 0x1.4dd74ep+64f + 0x61a4'0b40U, // x = 0x1.48168p+68f + 
0x6386'134eU, // x = 0x1.0c269cp+72f + 0x6589'8498U, // x = 0x1.13093p+76f + 0x6600'0001U, // x = 0x1.000002p+77f + 0x664e'46e4U, // x = 0x1.9c8dc8p+77f + 0x66b0'14aaU, // x = 0x1.602954p+78f + 0x67a9'242bU, // x = 0x1.524856p+80f + 0x6a19'76f1U, // x = 0x1.32ede2p+85f + 0x6c55'da58U, // x = 0x1.abb4bp+89f + 0x6f79'be45U, // x = 0x1.f37c8ap+95f + 0x7276'69d4U, // x = 0x1.ecd3a8p+101f + 0x7758'4625U, // x = 0x1.b08c4ap+111f + }; + + for (int i = 0; i < N; ++i) { + float x = FPBits(INPUTS[i]).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cospi, x, + LIBC_NAMESPACE::cospif(x), 0.5); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cospi, -x, + LIBC_NAMESPACE::cospif(-x), 0.5); + } +} + +// For small values, cospi(x) is approximately 1. +TEST_F(LlvmLibcCospifTest, SmallValues) { + float x = FPBits(0x1780'0000U).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cospi, x, + LIBC_NAMESPACE::cospif(x), 0.5); + + x = FPBits(0x0040'0000U).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cospi, x, + LIBC_NAMESPACE::cospif(x), 0.5); +} + +// SDCOMP-26094: check cospif in the cases for which the range reducer +// returns values furthest beyond its nominal upper bound of pi/4. 
+TEST_F(LlvmLibcCospifTest, SDCOMP_26094) { + for (uint32_t v : SDCOMP26094_VALUES) { + float x = FPBits((v)).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cospi, x, + LIBC_NAMESPACE::cospif(x), 0.5); + } +} + +// cospi(-(n + 0.5)) = -0.0 +// cospi(+(n + 0.5)) = +0.0 +TEST_F(LlvmLibcCospifTest, SignedZeros) { + EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::cospif(100.5f)); + EXPECT_FP_EQ(-0.0, LIBC_NAMESPACE::cospif(-100.5f)); + EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::cospif(45678.5f)); + EXPECT_FP_EQ(-0.0, LIBC_NAMESPACE::cospif(-45678.5f)); + EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::cospif(8000000.5f)); + EXPECT_FP_EQ(-0.0, LIBC_NAMESPACE::cospif(-8000000.5f)); +} diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt index 412ca031d0e996..c5f75b51cbd9f6 100644 --- a/libc/test/src/math/exhaustive/CMakeLists.txt +++ b/libc/test/src/math/exhaustive/CMakeLists.txt @@ -74,6 +74,22 @@ add_fp_unittest( -lpthread ) +add_fp_unittest( + cospif_test + NO_RUN_POSTBUILD + NEED_MPFR + SUITE + libc_math_exhaustive_tests + SRCS + cospif_test.cpp + DEPENDS + .exhaustive_test + libc.src.math.cospif + libc.src.__support.FPUtil.fp_bits + LINK_LIBRARIES + -lpthread +) + add_fp_unittest( sincosf_test NO_RUN_POSTBUILD diff --git a/libc/test/src/math/exhaustive/cospif_test.cpp b/libc/test/src/math/exhaustive/cospif_test.cpp new file mode 100644 index 00000000000000..59077d59099377 --- /dev/null +++ b/libc/test/src/math/exhaustive/cospif_test.cpp @@ -0,0 +1,33 @@ +//===-- Exhaustive test for cospif ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "exhaustive_test.h" +#include "src/math/cospif.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +using LlvmLibcCospifExhaustiveTest = + LlvmLibcUnaryOpExhaustiveMathTest; + +static constexpr uint32_t POS_START = 0x0000'0000U; +static constexpr uint32_t POS_STOP = 0x7f80'0000U; + +// Range: [0, Inf] +TEST_F(LlvmLibcCospifExhaustiveTest, PostiveRange) { + test_full_range_all_roundings(POS_START, POS_STOP); +} + +// Range: [-Inf, 0] +static constexpr uint32_t NEG_START = 0xb000'0000U; +static constexpr uint32_t NEG_STOP = 0xff80'0000U; + +TEST_F(LlvmLibcCospifExhaustiveTest, NegativeRange) { + test_full_range_all_roundings(NEG_START, NEG_STOP); +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 123e8ffdb5be89..b72d4b30787a0d 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -10,6 +10,17 @@ add_fp_unittest( DEPENDS libc.src.errno.errno libc.src.math.cosf +) + +add_fp_unittest( + cospif_test + SUITE + libc-math-smoke-tests + SRCS + cospif_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.cospif libc.src.__support.CPP.array libc.src.__support.FPUtil.fp_bits ) diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp new file mode 100644 index 00000000000000..007c4c45e3b157 --- /dev/null +++ b/libc/test/src/math/smoke/cospif_test.cpp @@ -0,0 +1,34 @@ +//===-- Unittests for cospif ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/errno/libc_errno.h" +#include "src/math/cospif.h" +#include "test/UnitTest/FPMatcher.h" + +#include + +using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcCospifTest, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::cospif(0.0f)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::cospif(-0.0f)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); +} diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index f0a653824bea29..6548fc36cb6b4e 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -239,6 +239,39 @@ class MPFRNumber { return result; } + MPFRNumber cospi() const { + MPFRNumber result(*this); + +#if MPFR_VERSION_MAJOR > 4 || \ + (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2) + mpfr_cospi(result.value, value, mpfr_rounding); + return result; +#else + MPFRNumber value_frac(*this); + mpfr_frac(value_frac.value, value, MPFR_RNDN); + + if (mpfr_cmp_si(value_frac.value, 0.0) == 0) { + mpz_t integer_part; + mpz_init(integer_part); + mpfr_get_z(integer_part, value, MPFR_RNDN); + + if (mpz_tstbit(integer_part, 0)) { + mpfr_set_si(result.value, -1.0, MPFR_RNDN); // odd + } else { + mpfr_set_si(result.value, 1.0, MPFR_RNDN); // even + } + return result; + } + + MPFRNumber value_pi(0.0, 1280); + mpfr_const_pi(value_pi.value, MPFR_RNDN); + mpfr_mul(value_pi.value, value_pi.value, value, MPFR_RNDN); + mpfr_cos(result.value, value_pi.value, mpfr_rounding); + + return result; +#endif + } + MPFRNumber erf() const { MPFRNumber result(*this); 
mpfr_erf(result.value, value, mpfr_rounding); @@ -675,6 +708,8 @@ unary_operation(Operation op, InputType input, unsigned int precision, return mpfrInput.cos(); case Operation::Cosh: return mpfrInput.cosh(); + case Operation::Cospi: + return mpfrInput.cospi(); case Operation::Erf: return mpfrInput.erf(); case Operation::Exp: diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 213dc7a65c3bc3..002dc919396e72 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -34,6 +34,7 @@ enum class Operation : int { Ceil, Cos, Cosh, + Cospi, Erf, Exp, Exp2, From 9374f83a73e53633da3258090f5c9a1f0d055526 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Sat, 6 Jul 2024 09:24:36 -0400 Subject: [PATCH 65/67] Outline X86 autoupgrade patterns (#97851) Outlining these patterns has a significant impact on the overall stack frame size of llvm::UpgradeIntrinsicCall. This is helpful for scenarios where compilation threads are stack-constrained. The overall impact is low when using clang as the host compiler, but very pronounced when using MSVC 2022 with release builds. 
Clang: 1,624 -> 824 bytes MSVC: 23,560 -> 6,120 bytes --- llvm/lib/IR/AutoUpgrade.cpp | 3450 +++++++++++++++++------------------ 1 file changed, 1693 insertions(+), 1757 deletions(-) diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 5beefaa1ec7018..53de9eef516b3f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -2245,6 +2245,1696 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } +static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, + IRBuilder<> &Builder) { + LLVMContext &C = F->getContext(); + Value *Rep = nullptr; + + if (Name.starts_with("sse4a.movnt.")) { + SmallVector Elts; + Elts.push_back( + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); + MDNode *Node = MDNode::get(C, Elts); + + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + // Nontemporal (unaligned) store of the 0'th element of the float/double + // vector. + Type *SrcEltTy = cast(Arg1->getType())->getElementType(); + PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy); + Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast"); + Value *Extract = + Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement"); + + StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, Align(1)); + SI->setMetadata(LLVMContext::MD_nontemporal, Node); + } else if (Name.starts_with("avx.movnt.") || + Name.starts_with("avx512.storent.")) { + SmallVector Elts; + Elts.push_back( + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); + MDNode *Node = MDNode::get(C, Elts); + + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + // Convert the type of the pointer to a pointer to the stored type. 
+ Value *BC = Builder.CreateBitCast( + Arg0, PointerType::getUnqual(Arg1->getType()), "cast"); + StoreInst *SI = Builder.CreateAlignedStore( + Arg1, BC, + Align(Arg1->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); + SI->setMetadata(LLVMContext::MD_nontemporal, Node); + } else if (Name == "sse2.storel.dq") { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2); + Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast"); + Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0); + Value *BC = Builder.CreateBitCast( + Arg0, PointerType::getUnqual(Elt->getType()), "cast"); + Builder.CreateAlignedStore(Elt, BC, Align(1)); + } else if (Name.starts_with("sse.storeu.") || + Name.starts_with("sse2.storeu.") || + Name.starts_with("avx.storeu.")) { + Value *Arg0 = CI->getArgOperand(0); + Value *Arg1 = CI->getArgOperand(1); + + Arg0 = Builder.CreateBitCast(Arg0, PointerType::getUnqual(Arg1->getType()), + "cast"); + Builder.CreateAlignedStore(Arg1, Arg0, Align(1)); + } else if (Name == "avx512.mask.store.ss") { + Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1)); + upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + Mask, false); + } else if (Name.starts_with("avx512.mask.store")) { + // "avx512.mask.storeu." or "avx512.mask.store." + bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu". + upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), Aligned); + } else if (Name.starts_with("sse2.pcmp") || Name.starts_with("avx2.pcmp")) { + // Upgrade packed integer vector compare intrinsics to compare instructions. + // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt." + bool CmpEq = Name[9] == 'e'; + Rep = Builder.CreateICmp(CmpEq ? 
ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT, + CI->getArgOperand(0), CI->getArgOperand(1)); + Rep = Builder.CreateSExt(Rep, CI->getType(), ""); + } else if (Name.starts_with("avx512.broadcastm")) { + Type *ExtTy = Type::getInt32Ty(C); + if (CI->getOperand(0)->getType()->isIntegerTy(8)) + ExtTy = Type::getInt64Ty(C); + unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / + ExtTy->getPrimitiveSizeInBits(); + Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy); + Rep = Builder.CreateVectorSplat(NumElts, Rep); + } else if (Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd") { + Value *Vec = CI->getArgOperand(0); + Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); + Function *Intr = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt, + Elt0->getType()); + Elt0 = Builder.CreateCall(Intr, Elt0); + Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); + } else if (Name.starts_with("avx.sqrt.p") || + Name.starts_with("sse2.sqrt.p") || + Name.starts_with("sse.sqrt.p")) { + Rep = + Builder.CreateCall(Intrinsic::getDeclaration( + F->getParent(), Intrinsic::sqrt, CI->getType()), + {CI->getArgOperand(0)}); + } else if (Name.starts_with("avx512.mask.sqrt.p")) { + if (CI->arg_size() == 4 && + (!isa(CI->getArgOperand(3)) || + cast(CI->getArgOperand(3))->getZExtValue() != 4)) { + Intrinsic::ID IID = Name[18] == 's' ? 
Intrinsic::x86_avx512_sqrt_ps_512 + : Intrinsic::x86_avx512_sqrt_pd_512; + + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + } else { + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), + Intrinsic::sqrt, + CI->getType()), + {CI->getArgOperand(0)}); + } + Rep = + emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (Name.starts_with("avx512.ptestm") || + Name.starts_with("avx512.ptestnm")) { + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Rep = Builder.CreateAnd(Op0, Op1); + llvm::Type *Ty = Op0->getType(); + Value *Zero = llvm::Constant::getNullValue(Ty); + ICmpInst::Predicate Pred = Name.starts_with("avx512.ptestm") + ? ICmpInst::ICMP_NE + : ICmpInst::ICMP_EQ; + Rep = Builder.CreateICmp(Pred, Rep, Zero); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, Mask); + } else if (Name.starts_with("avx512.mask.pbroadcast")) { + unsigned NumElts = cast(CI->getArgOperand(1)->getType()) + ->getNumElements(); + Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); + Rep = + emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (Name.starts_with("avx512.kunpck")) { + unsigned NumElts = CI->getType()->getScalarSizeInBits(); + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts); + int Indices[64]; + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = i; + + // First extract half of each vector. This gives better codegen than + // doing it in a single shuffle. + LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2)); + RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2)); + // Concat the vectors. + // NOTE: Operands have to be swapped to match intrinsic definition. 
+ Rep = Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts)); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.kand.w") { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + Rep = Builder.CreateAnd(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.kandn.w") { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + LHS = Builder.CreateNot(LHS); + Rep = Builder.CreateAnd(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.kor.w") { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + Rep = Builder.CreateOr(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.kxor.w") { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + Rep = Builder.CreateXor(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.kxnor.w") { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + LHS = Builder.CreateNot(LHS); + Rep = Builder.CreateXor(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.knot.w") { + Rep = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Rep = Builder.CreateNot(Rep); + Rep = Builder.CreateBitCast(Rep, CI->getType()); + } else if (Name == "avx512.kortestz.w" || Name == "avx512.kortestc.w") { + Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); + Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); + Rep = Builder.CreateOr(LHS, RHS); + Rep = Builder.CreateBitCast(Rep, Builder.getInt16Ty()); + Value *C; + if (Name[14] == 'c') + C 
= ConstantInt::getAllOnesValue(Builder.getInt16Ty()); + else + C = ConstantInt::getNullValue(Builder.getInt16Ty()); + Rep = Builder.CreateICmpEQ(Rep, C); + Rep = Builder.CreateZExt(Rep, Builder.getInt32Ty()); + } else if (Name == "sse.add.ss" || Name == "sse2.add.sd" || + Name == "sse.sub.ss" || Name == "sse2.sub.sd" || + Name == "sse.mul.ss" || Name == "sse2.mul.sd" || + Name == "sse.div.ss" || Name == "sse2.div.sd") { + Type *I32Ty = Type::getInt32Ty(C); + Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), + ConstantInt::get(I32Ty, 0)); + Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1), + ConstantInt::get(I32Ty, 0)); + Value *EltOp; + if (Name.contains(".add.")) + EltOp = Builder.CreateFAdd(Elt0, Elt1); + else if (Name.contains(".sub.")) + EltOp = Builder.CreateFSub(Elt0, Elt1); + else if (Name.contains(".mul.")) + EltOp = Builder.CreateFMul(Elt0, Elt1); + else + EltOp = Builder.CreateFDiv(Elt0, Elt1); + Rep = Builder.CreateInsertElement(CI->getArgOperand(0), EltOp, + ConstantInt::get(I32Ty, 0)); + } else if (Name.starts_with("avx512.mask.pcmp")) { + // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt." + bool CmpEq = Name[16] == 'e'; + Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 
0 : 6, true); + } else if (Name.starts_with("avx512.mask.vpshufbitqmb.")) { + Type *OpTy = CI->getArgOperand(0)->getType(); + unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); + Intrinsic::ID IID; + switch (VecWidth) { + default: + llvm_unreachable("Unexpected intrinsic"); + case 128: + IID = Intrinsic::x86_avx512_vpshufbitqmb_128; + break; + case 256: + IID = Intrinsic::x86_avx512_vpshufbitqmb_256; + break; + case 512: + IID = Intrinsic::x86_avx512_vpshufbitqmb_512; + break; + } + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.fpclass.p")) { + Type *OpTy = CI->getArgOperand(0)->getType(); + unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); + unsigned EltWidth = OpTy->getScalarSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && EltWidth == 32) + IID = Intrinsic::x86_avx512_fpclass_ps_128; + else if (VecWidth == 256 && EltWidth == 32) + IID = Intrinsic::x86_avx512_fpclass_ps_256; + else if (VecWidth == 512 && EltWidth == 32) + IID = Intrinsic::x86_avx512_fpclass_ps_512; + else if (VecWidth == 128 && EltWidth == 64) + IID = Intrinsic::x86_avx512_fpclass_pd_128; + else if (VecWidth == 256 && EltWidth == 64) + IID = Intrinsic::x86_avx512_fpclass_pd_256; + else if (VecWidth == 512 && EltWidth == 64) + IID = Intrinsic::x86_avx512_fpclass_pd_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.cmp.p")) { + SmallVector Args(CI->args()); + Type *OpTy = Args[0]->getType(); + unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); + unsigned EltWidth = OpTy->getScalarSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && EltWidth == 32) + IID = 
Intrinsic::x86_avx512_mask_cmp_ps_128; + else if (VecWidth == 256 && EltWidth == 32) + IID = Intrinsic::x86_avx512_mask_cmp_ps_256; + else if (VecWidth == 512 && EltWidth == 32) + IID = Intrinsic::x86_avx512_mask_cmp_ps_512; + else if (VecWidth == 128 && EltWidth == 64) + IID = Intrinsic::x86_avx512_mask_cmp_pd_128; + else if (VecWidth == 256 && EltWidth == 64) + IID = Intrinsic::x86_avx512_mask_cmp_pd_256; + else if (VecWidth == 512 && EltWidth == 64) + IID = Intrinsic::x86_avx512_mask_cmp_pd_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Mask = Constant::getAllOnesValue(CI->getType()); + if (VecWidth == 512) + std::swap(Mask, Args.back()); + Args.push_back(Mask); + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + Args); + } else if (Name.starts_with("avx512.mask.cmp.")) { + // Integer compare intrinsics. + unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); + Rep = upgradeMaskedCompare(Builder, *CI, Imm, true); + } else if (Name.starts_with("avx512.mask.ucmp.")) { + unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); + Rep = upgradeMaskedCompare(Builder, *CI, Imm, false); + } else if (Name.starts_with("avx512.cvtb2mask.") || + Name.starts_with("avx512.cvtw2mask.") || + Name.starts_with("avx512.cvtd2mask.") || + Name.starts_with("avx512.cvtq2mask.")) { + Value *Op = CI->getArgOperand(0); + Value *Zero = llvm::Constant::getNullValue(Op->getType()); + Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, nullptr); + } else if (Name == "ssse3.pabs.b.128" || Name == "ssse3.pabs.w.128" || + Name == "ssse3.pabs.d.128" || Name.starts_with("avx2.pabs") || + Name.starts_with("avx512.mask.pabs")) { + Rep = upgradeAbs(Builder, *CI); + } else if (Name == "sse41.pmaxsb" || Name == "sse2.pmaxs.w" || + Name == "sse41.pmaxsd" || Name.starts_with("avx2.pmaxs") || + Name.starts_with("avx512.mask.pmaxs")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax); + 
} else if (Name == "sse2.pmaxu.b" || Name == "sse41.pmaxuw" || + Name == "sse41.pmaxud" || Name.starts_with("avx2.pmaxu") || + Name.starts_with("avx512.mask.pmaxu")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax); + } else if (Name == "sse41.pminsb" || Name == "sse2.pmins.w" || + Name == "sse41.pminsd" || Name.starts_with("avx2.pmins") || + Name.starts_with("avx512.mask.pmins")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin); + } else if (Name == "sse2.pminu.b" || Name == "sse41.pminuw" || + Name == "sse41.pminud" || Name.starts_with("avx2.pminu") || + Name.starts_with("avx512.mask.pminu")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin); + } else if (Name == "sse2.pmulu.dq" || Name == "avx2.pmulu.dq" || + Name == "avx512.pmulu.dq.512" || + Name.starts_with("avx512.mask.pmulu.dq.")) { + Rep = upgradePMULDQ(Builder, *CI, /*Signed*/ false); + } else if (Name == "sse41.pmuldq" || Name == "avx2.pmul.dq" || + Name == "avx512.pmul.dq.512" || + Name.starts_with("avx512.mask.pmul.dq.")) { + Rep = upgradePMULDQ(Builder, *CI, /*Signed*/ true); + } else if (Name == "sse.cvtsi2ss" || Name == "sse2.cvtsi2sd" || + Name == "sse.cvtsi642ss" || Name == "sse2.cvtsi642sd") { + Rep = + Builder.CreateSIToFP(CI->getArgOperand(1), + cast(CI->getType())->getElementType()); + Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); + } else if (Name == "avx512.cvtusi2sd") { + Rep = + Builder.CreateUIToFP(CI->getArgOperand(1), + cast(CI->getType())->getElementType()); + Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); + } else if (Name == "sse2.cvtss2sd") { + Rep = Builder.CreateExtractElement(CI->getArgOperand(1), (uint64_t)0); + Rep = Builder.CreateFPExt( + Rep, cast(CI->getType())->getElementType()); + Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); + } else if (Name == "sse2.cvtdq2pd" || Name == "sse2.cvtdq2ps" || + Name == "avx.cvtdq2.pd.256" || Name == 
"avx.cvtdq2.ps.256" || + Name.starts_with("avx512.mask.cvtdq2pd.") || + Name.starts_with("avx512.mask.cvtudq2pd.") || + Name.starts_with("avx512.mask.cvtdq2ps.") || + Name.starts_with("avx512.mask.cvtudq2ps.") || + Name.starts_with("avx512.mask.cvtqq2pd.") || + Name.starts_with("avx512.mask.cvtuqq2pd.") || + Name == "avx512.mask.cvtqq2ps.256" || + Name == "avx512.mask.cvtqq2ps.512" || + Name == "avx512.mask.cvtuqq2ps.256" || + Name == "avx512.mask.cvtuqq2ps.512" || Name == "sse2.cvtps2pd" || + Name == "avx.cvt.ps2.pd.256" || + Name == "avx512.mask.cvtps2pd.128" || + Name == "avx512.mask.cvtps2pd.256") { + auto *DstTy = cast(CI->getType()); + Rep = CI->getArgOperand(0); + auto *SrcTy = cast(Rep->getType()); + + unsigned NumDstElts = DstTy->getNumElements(); + if (NumDstElts < SrcTy->getNumElements()) { + assert(NumDstElts == 2 && "Unexpected vector size"); + Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef{0, 1}); + } + + bool IsPS2PD = SrcTy->getElementType()->isFloatTy(); + bool IsUnsigned = Name.contains("cvtu"); + if (IsPS2PD) + Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); + else if (CI->arg_size() == 4 && + (!isa(CI->getArgOperand(3)) || + cast(CI->getArgOperand(3))->getZExtValue() != 4)) { + Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round + : Intrinsic::x86_avx512_sitofp_round; + Function *F = + Intrinsic::getDeclaration(CI->getModule(), IID, {DstTy, SrcTy}); + Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)}); + } else { + Rep = IsUnsigned ? 
Builder.CreateUIToFP(Rep, DstTy, "cvt") + : Builder.CreateSIToFP(Rep, DstTy, "cvt"); + } + + if (CI->arg_size() >= 3) + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); + } else if (Name.starts_with("avx512.mask.vcvtph2ps.") || + Name.starts_with("vcvtph2ps.")) { + auto *DstTy = cast(CI->getType()); + Rep = CI->getArgOperand(0); + auto *SrcTy = cast(Rep->getType()); + unsigned NumDstElts = DstTy->getNumElements(); + if (NumDstElts != SrcTy->getNumElements()) { + assert(NumDstElts == 4 && "Unexpected vector size"); + Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef{0, 1, 2, 3}); + } + Rep = Builder.CreateBitCast( + Rep, FixedVectorType::get(Type::getHalfTy(C), NumDstElts)); + Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps"); + if (CI->arg_size() >= 3) + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); + } else if (Name.starts_with("avx512.mask.load")) { + // "avx512.mask.loadu." or "avx512.mask.load." + bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu". + Rep = upgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), Aligned); + } else if (Name.starts_with("avx512.mask.expand.load.")) { + auto *ResultTy = cast(CI->getType()); + Type *PtrTy = ResultTy->getElementType(); + + // Cast the pointer to element type. + Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), + llvm::PointerType::getUnqual(PtrTy)); + + Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), + ResultTy->getNumElements()); + + Function *ELd = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::masked_expandload, ResultTy); + Rep = Builder.CreateCall(ELd, {Ptr, MaskVec, CI->getOperand(1)}); + } else if (Name.starts_with("avx512.mask.compress.store.")) { + auto *ResultTy = cast(CI->getArgOperand(1)->getType()); + Type *PtrTy = ResultTy->getElementType(); + + // Cast the pointer to element type. 
+ Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), + llvm::PointerType::getUnqual(PtrTy)); + + Value *MaskVec = + getX86MaskVec(Builder, CI->getArgOperand(2), + cast(ResultTy)->getNumElements()); + + Function *CSt = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::masked_compressstore, ResultTy); + Rep = Builder.CreateCall(CSt, {CI->getArgOperand(1), Ptr, MaskVec}); + } else if (Name.starts_with("avx512.mask.compress.") || + Name.starts_with("avx512.mask.expand.")) { + auto *ResultTy = cast(CI->getType()); + + Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), + ResultTy->getNumElements()); + + bool IsCompress = Name[12] == 'c'; + Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress + : Intrinsic::x86_avx512_mask_expand; + Function *Intr = Intrinsic::getDeclaration(F->getParent(), IID, ResultTy); + Rep = Builder.CreateCall(Intr, + {CI->getOperand(0), CI->getOperand(1), MaskVec}); + } else if (Name.starts_with("xop.vpcom")) { + bool IsSigned; + if (Name.ends_with("ub") || Name.ends_with("uw") || Name.ends_with("ud") || + Name.ends_with("uq")) + IsSigned = false; + else if (Name.ends_with("b") || Name.ends_with("w") || + Name.ends_with("d") || Name.ends_with("q")) + IsSigned = true; + else + llvm_unreachable("Unknown suffix"); + + unsigned Imm; + if (CI->arg_size() == 3) { + Imm = cast(CI->getArgOperand(2))->getZExtValue(); + } else { + Name = Name.substr(9); // strip off "xop.vpcom" + if (Name.starts_with("lt")) + Imm = 0; + else if (Name.starts_with("le")) + Imm = 1; + else if (Name.starts_with("gt")) + Imm = 2; + else if (Name.starts_with("ge")) + Imm = 3; + else if (Name.starts_with("eq")) + Imm = 4; + else if (Name.starts_with("ne")) + Imm = 5; + else if (Name.starts_with("false")) + Imm = 6; + else if (Name.starts_with("true")) + Imm = 7; + else + llvm_unreachable("Unknown condition"); + } + + Rep = upgradeX86vpcom(Builder, *CI, Imm, IsSigned); + } else if (Name.starts_with("xop.vpcmov")) { + Value *Sel = 
CI->getArgOperand(2); + Value *NotSel = Builder.CreateNot(Sel); + Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel); + Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel); + Rep = Builder.CreateOr(Sel0, Sel1); + } else if (Name.starts_with("xop.vprot") || Name.starts_with("avx512.prol") || + Name.starts_with("avx512.mask.prol")) { + Rep = upgradeX86Rotate(Builder, *CI, false); + } else if (Name.starts_with("avx512.pror") || + Name.starts_with("avx512.mask.pror")) { + Rep = upgradeX86Rotate(Builder, *CI, true); + } else if (Name.starts_with("avx512.vpshld.") || + Name.starts_with("avx512.mask.vpshld") || + Name.starts_with("avx512.maskz.vpshld")) { + bool ZeroMask = Name[11] == 'z'; + Rep = upgradeX86ConcatShift(Builder, *CI, false, ZeroMask); + } else if (Name.starts_with("avx512.vpshrd.") || + Name.starts_with("avx512.mask.vpshrd") || + Name.starts_with("avx512.maskz.vpshrd")) { + bool ZeroMask = Name[11] == 'z'; + Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); + } else if (Name == "sse42.crc32.64.8") { + Function *CRC32 = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::x86_sse42_crc32_32_8); + Value *Trunc0 = + Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); + Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)}); + Rep = Builder.CreateZExt(Rep, CI->getType(), ""); + } else if (Name.starts_with("avx.vbroadcast.s") || + Name.starts_with("avx512.vbroadcast.s")) { + // Replace broadcasts with a series of insertelements. 
+ auto *VecTy = cast(CI->getType()); + Type *EltTy = VecTy->getElementType(); + unsigned EltNum = VecTy->getNumElements(); + Value *Load = Builder.CreateLoad(EltTy, CI->getArgOperand(0)); + Type *I32Ty = Type::getInt32Ty(C); + Rep = PoisonValue::get(VecTy); + for (unsigned I = 0; I < EltNum; ++I) + Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); + } else if (Name.starts_with("sse41.pmovsx") || + Name.starts_with("sse41.pmovzx") || + Name.starts_with("avx2.pmovsx") || + Name.starts_with("avx2.pmovzx") || + Name.starts_with("avx512.mask.pmovsx") || + Name.starts_with("avx512.mask.pmovzx")) { + auto *DstTy = cast(CI->getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + SmallVector ShuffleMask(NumDstElts); + for (unsigned i = 0; i != NumDstElts; ++i) + ShuffleMask[i] = i; + + Value *SV = Builder.CreateShuffleVector(CI->getArgOperand(0), ShuffleMask); + + bool DoSext = Name.contains("pmovsx"); + Rep = + DoSext ? Builder.CreateSExt(SV, DstTy) : Builder.CreateZExt(SV, DstTy); + // If there are 3 arguments, it's a masked intrinsic so we need a select. + if (CI->arg_size() == 3) + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); + } else if (Name == "avx512.mask.pmov.qd.256" || + Name == "avx512.mask.pmov.qd.512" || + Name == "avx512.mask.pmov.wb.256" || + Name == "avx512.mask.pmov.wb.512") { + Type *Ty = CI->getArgOperand(1)->getType(); + Rep = Builder.CreateTrunc(CI->getArgOperand(0), Ty); + Rep = + emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (Name.starts_with("avx.vbroadcastf128") || + Name == "avx2.vbroadcasti128") { + // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle. 
+ Type *EltTy = cast(CI->getType())->getElementType(); + unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits(); + auto *VT = FixedVectorType::get(EltTy, NumSrcElts); + Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0), + PointerType::getUnqual(VT)); + Value *Load = Builder.CreateAlignedLoad(VT, Op, Align(1)); + if (NumSrcElts == 2) + Rep = Builder.CreateShuffleVector(Load, ArrayRef{0, 1, 0, 1}); + else + Rep = Builder.CreateShuffleVector(Load, + ArrayRef{0, 1, 2, 3, 0, 1, 2, 3}); + } else if (Name.starts_with("avx512.mask.shuf.i") || + Name.starts_with("avx512.mask.shuf.f")) { + unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); + Type *VT = CI->getType(); + unsigned NumLanes = VT->getPrimitiveSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT->getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + SmallVector ShuffleMask(0); + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. 
+ if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } + Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), + CI->getArgOperand(1), ShuffleMask); + Rep = + emitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); + } else if (Name.starts_with("avx512.mask.broadcastf") || + Name.starts_with("avx512.mask.broadcasti")) { + unsigned NumSrcElts = cast(CI->getArgOperand(0)->getType()) + ->getNumElements(); + unsigned NumDstElts = + cast(CI->getType())->getNumElements(); + + SmallVector ShuffleMask(NumDstElts); + for (unsigned i = 0; i != NumDstElts; ++i) + ShuffleMask[i] = i % NumSrcElts; + + Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), + CI->getArgOperand(0), ShuffleMask); + Rep = + emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (Name.starts_with("avx2.pbroadcast") || + Name.starts_with("avx2.vbroadcast") || + Name.starts_with("avx512.pbroadcast") || + Name.starts_with("avx512.mask.broadcast.s")) { + // Replace vp?broadcasts with a vector shuffle. 
+ Value *Op = CI->getArgOperand(0); + ElementCount EC = cast(CI->getType())->getElementCount(); + Type *MaskTy = VectorType::get(Type::getInt32Ty(C), EC); + SmallVector M; + ShuffleVectorInst::getShuffleMask(Constant::getNullValue(MaskTy), M); + Rep = Builder.CreateShuffleVector(Op, M); + + if (CI->arg_size() == 3) + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); + } else if (Name.starts_with("sse2.padds.") || + Name.starts_with("avx2.padds.") || + Name.starts_with("avx512.padds.") || + Name.starts_with("avx512.mask.padds.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::sadd_sat); + } else if (Name.starts_with("sse2.psubs.") || + Name.starts_with("avx2.psubs.") || + Name.starts_with("avx512.psubs.") || + Name.starts_with("avx512.mask.psubs.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::ssub_sat); + } else if (Name.starts_with("sse2.paddus.") || + Name.starts_with("avx2.paddus.") || + Name.starts_with("avx512.mask.paddus.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::uadd_sat); + } else if (Name.starts_with("sse2.psubus.") || + Name.starts_with("avx2.psubus.") || + Name.starts_with("avx512.mask.psubus.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::usub_sat); + } else if (Name.starts_with("avx512.mask.palignr.")) { + Rep = upgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0), + CI->getArgOperand(1), CI->getArgOperand(2), + CI->getArgOperand(3), CI->getArgOperand(4), + false); + } else if (Name.starts_with("avx512.mask.valign.")) { + Rep = upgradeX86ALIGNIntrinsics( + Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4), true); + } else if (Name == "sse2.psll.dq" || Name == "avx2.psll.dq") { + // 128/256-bit shift left specified in bits. 
+ unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); + Rep = upgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), + Shift / 8); // Shift is in bits. + } else if (Name == "sse2.psrl.dq" || Name == "avx2.psrl.dq") { + // 128/256-bit shift right specified in bits. + unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); + Rep = upgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), + Shift / 8); // Shift is in bits. + } else if (Name == "sse2.psll.dq.bs" || Name == "avx2.psll.dq.bs" || + Name == "avx512.psll.dq.512") { + // 128/256/512-bit shift left specified in bytes. + unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); + Rep = upgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); + } else if (Name == "sse2.psrl.dq.bs" || Name == "avx2.psrl.dq.bs" || + Name == "avx512.psrl.dq.512") { + // 128/256/512-bit shift right specified in bytes. + unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); + Rep = upgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); + } else if (Name == "sse41.pblendw" || Name.starts_with("sse41.blendp") || + Name.starts_with("avx.blend.p") || Name == "avx2.pblendw" || + Name.starts_with("avx2.pblendd.")) { + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); + auto *VecTy = cast(CI->getType()); + unsigned NumElts = VecTy->getNumElements(); + + SmallVector Idxs(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + Idxs[i] = ((Imm >> (i % 8)) & 1) ? 
i + NumElts : i; + + Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); + } else if (Name.starts_with("avx.vinsertf128.") || + Name == "avx2.vinserti128" || + Name.starts_with("avx512.mask.insert")) { + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); + unsigned DstNumElts = + cast(CI->getType())->getNumElements(); + unsigned SrcNumElts = + cast(Op1->getType())->getNumElements(); + unsigned Scale = DstNumElts / SrcNumElts; + + // Mask off the high bits of the immediate value; hardware ignores those. + Imm = Imm % Scale; + + // Extend the second operand into a vector the size of the destination. + SmallVector Idxs(DstNumElts); + for (unsigned i = 0; i != SrcNumElts; ++i) + Idxs[i] = i; + for (unsigned i = SrcNumElts; i != DstNumElts; ++i) + Idxs[i] = SrcNumElts; + Rep = Builder.CreateShuffleVector(Op1, Idxs); + + // Insert the second operand into the first operand. + + // Note that there is no guarantee that instruction lowering will actually + // produce a vinsertf128 instruction for the created shuffles. In + // particular, the 0 immediate case involves no lane changes, so it can + // be handled as a blend. + + // Example of shuffle mask for 32-bit elements: + // Imm = 1 + // Imm = 0 + + // First fill with identify mask. + for (unsigned i = 0; i != DstNumElts; ++i) + Idxs[i] = i; + // Then replace the elements where we need to insert. + for (unsigned i = 0; i != SrcNumElts; ++i) + Idxs[i + Imm * SrcNumElts] = i + DstNumElts; + Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs); + + // If the intrinsic has a mask operand, handle that. 
+ if (CI->arg_size() == 5) + Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, + CI->getArgOperand(3)); + } else if (Name.starts_with("avx.vextractf128.") || + Name == "avx2.vextracti128" || + Name.starts_with("avx512.mask.vextract")) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); + unsigned DstNumElts = + cast(CI->getType())->getNumElements(); + unsigned SrcNumElts = + cast(Op0->getType())->getNumElements(); + unsigned Scale = SrcNumElts / DstNumElts; + + // Mask off the high bits of the immediate value; hardware ignores those. + Imm = Imm % Scale; + + // Get indexes for the subvector of the input vector. + SmallVector Idxs(DstNumElts); + for (unsigned i = 0; i != DstNumElts; ++i) { + Idxs[i] = i + (Imm * DstNumElts); + } + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + // If the intrinsic has a mask operand, handle that. + if (CI->arg_size() == 4) + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.perm.df.") || + Name.starts_with("avx512.mask.perm.di.")) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); + auto *VecTy = cast(CI->getType()); + unsigned NumElts = VecTy->getNumElements(); + + SmallVector Idxs(NumElts); + for (unsigned i = 0; i != NumElts; ++i) + Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3); + + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + if (CI->arg_size() == 4) + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } else if (Name.starts_with("avx.vperm2f128.") || Name == "avx2.vperm2i128") { + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + + 
uint8_t Imm = cast(CI->getArgOperand(2))->getZExtValue(); + + unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned HalfSize = NumElts / 2; + SmallVector ShuffleMask(NumElts); + + // Determine which operand(s) are actually in use for this instruction. + Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0); + Value *V1 = (Imm & 0x20) ? CI->getArgOperand(1) : CI->getArgOperand(0); + + // If needed, replace operands based on zero mask. + V0 = (Imm & 0x08) ? ConstantAggregateZero::get(CI->getType()) : V0; + V1 = (Imm & 0x80) ? ConstantAggregateZero::get(CI->getType()) : V1; + + // Permute low half of result. + unsigned StartIndex = (Imm & 0x01) ? HalfSize : 0; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i] = StartIndex + i; + + // Permute high half of result. + StartIndex = (Imm & 0x10) ? HalfSize : 0; + for (unsigned i = 0; i < HalfSize; ++i) + ShuffleMask[i + HalfSize] = NumElts + StartIndex + i; + + Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + + } else if (Name.starts_with("avx.vpermil.") || Name == "sse2.pshuf.d" || + Name.starts_with("avx512.mask.vpermil.p") || + Name.starts_with("avx512.mask.pshuf.d.")) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); + auto *VecTy = cast(CI->getType()); + unsigned NumElts = VecTy->getNumElements(); + // Calculate the size of each index in the immediate. + unsigned IdxSize = 64 / VecTy->getScalarSizeInBits(); + unsigned IdxMask = ((1 << IdxSize) - 1); + + SmallVector Idxs(NumElts); + // Lookup the bits for this element, wrapping around the immediate every + // 8-bits. Elements are grouped into sets of 2 or 4 elements so we need + // to offset by the first index of each group. 
+ for (unsigned i = 0; i != NumElts; ++i) + Idxs[i] = ((Imm >> ((i * IdxSize) % 8)) & IdxMask) | (i & ~IdxMask); + + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + if (CI->arg_size() == 4) + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } else if (Name == "sse2.pshufl.w" || + Name.starts_with("avx512.mask.pshufl.w.")) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); + unsigned NumElts = cast(CI->getType())->getNumElements(); + + SmallVector Idxs(NumElts); + for (unsigned l = 0; l != NumElts; l += 8) { + for (unsigned i = 0; i != 4; ++i) + Idxs[i + l] = ((Imm >> (2 * i)) & 0x3) + l; + for (unsigned i = 4; i != 8; ++i) + Idxs[i + l] = i + l; + } + + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + if (CI->arg_size() == 4) + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } else if (Name == "sse2.pshufh.w" || + Name.starts_with("avx512.mask.pshufh.w.")) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); + unsigned NumElts = cast(CI->getType())->getNumElements(); + + SmallVector Idxs(NumElts); + for (unsigned l = 0; l != NumElts; l += 8) { + for (unsigned i = 0; i != 4; ++i) + Idxs[i + l] = i + l; + for (unsigned i = 0; i != 4; ++i) + Idxs[i + l + 4] = ((Imm >> (2 * i)) & 0x3) + 4 + l; + } + + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + if (CI->arg_size() == 4) + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.shuf.p")) { + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); + unsigned NumElts = cast(CI->getType())->getNumElements(); + + unsigned NumLaneElts = 128 / CI->getType()->getScalarSizeInBits(); + unsigned HalfLaneElts = NumLaneElts / 2; + + SmallVector Idxs(NumElts); + for (unsigned i = 0; i != NumElts; ++i) { + 
// Base index is the starting element of the lane. + Idxs[i] = i - (i % NumLaneElts); + // If we are half way through the lane switch to the other source. + if ((i % NumLaneElts) >= HalfLaneElts) + Idxs[i] += NumElts; + // Now select the specific element. By adding HalfLaneElts bits from + // the immediate. Wrapping around the immediate every 8-bits. + Idxs[i] += (Imm >> ((i * HalfLaneElts) % 8)) & ((1 << HalfLaneElts) - 1); + } + + Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); + + Rep = + emitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); + } else if (Name.starts_with("avx512.mask.movddup") || + Name.starts_with("avx512.mask.movshdup") || + Name.starts_with("avx512.mask.movsldup")) { + Value *Op0 = CI->getArgOperand(0); + unsigned NumElts = cast(CI->getType())->getNumElements(); + unsigned NumLaneElts = 128 / CI->getType()->getScalarSizeInBits(); + + unsigned Offset = 0; + if (Name.starts_with("avx512.mask.movshdup.")) + Offset = 1; + + SmallVector Idxs(NumElts); + for (unsigned l = 0; l != NumElts; l += NumLaneElts) + for (unsigned i = 0; i != NumLaneElts; i += 2) { + Idxs[i + l + 0] = i + l + Offset; + Idxs[i + l + 1] = i + l + Offset; + } + + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); + + Rep = + emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (Name.starts_with("avx512.mask.punpckl") || + Name.starts_with("avx512.mask.unpckl.")) { + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + int NumElts = cast(CI->getType())->getNumElements(); + int NumLaneElts = 128 / CI->getType()->getScalarSizeInBits(); + + SmallVector Idxs(NumElts); + for (int l = 0; l != NumElts; l += NumLaneElts) + for (int i = 0; i != NumLaneElts; ++i) + Idxs[i + l] = l + (i / 2) + NumElts * (i % 2); + + Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); + + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.punpckh") || + 
Name.starts_with("avx512.mask.unpckh.")) { + Value *Op0 = CI->getArgOperand(0); + Value *Op1 = CI->getArgOperand(1); + int NumElts = cast(CI->getType())->getNumElements(); + int NumLaneElts = 128 / CI->getType()->getScalarSizeInBits(); + + SmallVector Idxs(NumElts); + for (int l = 0; l != NumElts; l += NumLaneElts) + for (int i = 0; i != NumLaneElts; ++i) + Idxs[i + l] = (NumLaneElts / 2) + l + (i / 2) + NumElts * (i % 2); + + Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); + + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.and.") || + Name.starts_with("avx512.mask.pand.")) { + VectorType *FTy = cast(CI->getType()); + VectorType *ITy = VectorType::getInteger(FTy); + Rep = Builder.CreateAnd(Builder.CreateBitCast(CI->getArgOperand(0), ITy), + Builder.CreateBitCast(CI->getArgOperand(1), ITy)); + Rep = Builder.CreateBitCast(Rep, FTy); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.andn.") || + Name.starts_with("avx512.mask.pandn.")) { + VectorType *FTy = cast(CI->getType()); + VectorType *ITy = VectorType::getInteger(FTy); + Rep = Builder.CreateNot(Builder.CreateBitCast(CI->getArgOperand(0), ITy)); + Rep = Builder.CreateAnd(Rep, + Builder.CreateBitCast(CI->getArgOperand(1), ITy)); + Rep = Builder.CreateBitCast(Rep, FTy); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.or.") || + Name.starts_with("avx512.mask.por.")) { + VectorType *FTy = cast(CI->getType()); + VectorType *ITy = VectorType::getInteger(FTy); + Rep = Builder.CreateOr(Builder.CreateBitCast(CI->getArgOperand(0), ITy), + Builder.CreateBitCast(CI->getArgOperand(1), ITy)); + Rep = Builder.CreateBitCast(Rep, FTy); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.xor.") || + 
Name.starts_with("avx512.mask.pxor.")) { + VectorType *FTy = cast(CI->getType()); + VectorType *ITy = VectorType::getInteger(FTy); + Rep = Builder.CreateXor(Builder.CreateBitCast(CI->getArgOperand(0), ITy), + Builder.CreateBitCast(CI->getArgOperand(1), ITy)); + Rep = Builder.CreateBitCast(Rep, FTy); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.padd.")) { + Rep = Builder.CreateAdd(CI->getArgOperand(0), CI->getArgOperand(1)); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.psub.")) { + Rep = Builder.CreateSub(CI->getArgOperand(0), CI->getArgOperand(1)); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.pmull.")) { + Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1)); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.add.p")) { + if (Name.ends_with(".512")) { + Intrinsic::ID IID; + if (Name[17] == 's') + IID = Intrinsic::x86_avx512_add_ps_512; + else + IID = Intrinsic::x86_avx512_add_pd_512; + + Rep = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); + } else { + Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); + } + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.div.p")) { + if (Name.ends_with(".512")) { + Intrinsic::ID IID; + if (Name[17] == 's') + IID = Intrinsic::x86_avx512_div_ps_512; + else + IID = Intrinsic::x86_avx512_div_pd_512; + + Rep = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); + } else { + Rep = Builder.CreateFDiv(CI->getArgOperand(0), 
CI->getArgOperand(1)); + } + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.mul.p")) { + if (Name.ends_with(".512")) { + Intrinsic::ID IID; + if (Name[17] == 's') + IID = Intrinsic::x86_avx512_mul_ps_512; + else + IID = Intrinsic::x86_avx512_mul_pd_512; + + Rep = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); + } else { + Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); + } + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.sub.p")) { + if (Name.ends_with(".512")) { + Intrinsic::ID IID; + if (Name[17] == 's') + IID = Intrinsic::x86_avx512_sub_ps_512; + else + IID = Intrinsic::x86_avx512_sub_pd_512; + + Rep = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); + } else { + Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); + } + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if ((Name.starts_with("avx512.mask.max.p") || + Name.starts_with("avx512.mask.min.p")) && + Name.drop_front(18) == ".512") { + bool IsDouble = Name[17] == 'd'; + bool IsMin = Name[13] == 'i'; + static const Intrinsic::ID MinMaxTbl[2][2] = { + {Intrinsic::x86_avx512_max_ps_512, Intrinsic::x86_avx512_max_pd_512}, + {Intrinsic::x86_avx512_min_ps_512, Intrinsic::x86_avx512_min_pd_512}}; + Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; + + Rep = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), IID), + {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); + Rep = + emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (Name.starts_with("avx512.mask.lzcnt.")) { + Rep = + Builder.CreateCall(Intrinsic::getDeclaration( + 
F->getParent(), Intrinsic::ctlz, CI->getType()), + {CI->getArgOperand(0), Builder.getInt1(false)}); + Rep = + emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); + } else if (Name.starts_with("avx512.mask.psll")) { + bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); + bool IsVariable = Name[16] == 'v'; + char Size = Name[16] == '.' ? Name[17] + : Name[17] == '.' ? Name[18] + : Name[18] == '.' ? Name[19] + : Name[20]; + + Intrinsic::ID IID; + if (IsVariable && Name[17] != '.') { + if (Size == 'd' && Name[17] == '2') // avx512.mask.psllv2.di + IID = Intrinsic::x86_avx2_psllv_q; + else if (Size == 'd' && Name[17] == '4') // avx512.mask.psllv4.di + IID = Intrinsic::x86_avx2_psllv_q_256; + else if (Size == 's' && Name[17] == '4') // avx512.mask.psllv4.si + IID = Intrinsic::x86_avx2_psllv_d; + else if (Size == 's' && Name[17] == '8') // avx512.mask.psllv8.si + IID = Intrinsic::x86_avx2_psllv_d_256; + else if (Size == 'h' && Name[17] == '8') // avx512.mask.psllv8.hi + IID = Intrinsic::x86_avx512_psllv_w_128; + else if (Size == 'h' && Name[17] == '1') // avx512.mask.psllv16.hi + IID = Intrinsic::x86_avx512_psllv_w_256; + else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psllv32hi + IID = Intrinsic::x86_avx512_psllv_w_512; + else + llvm_unreachable("Unexpected size"); + } else if (Name.ends_with(".128")) { + if (Size == 'd') // avx512.mask.psll.d.128, avx512.mask.psll.di.128 + IID = IsImmediate ? Intrinsic::x86_sse2_pslli_d + : Intrinsic::x86_sse2_psll_d; + else if (Size == 'q') // avx512.mask.psll.q.128, avx512.mask.psll.qi.128 + IID = IsImmediate ? Intrinsic::x86_sse2_pslli_q + : Intrinsic::x86_sse2_psll_q; + else if (Size == 'w') // avx512.mask.psll.w.128, avx512.mask.psll.wi.128 + IID = IsImmediate ? 
Intrinsic::x86_sse2_pslli_w + : Intrinsic::x86_sse2_psll_w; + else + llvm_unreachable("Unexpected size"); + } else if (Name.ends_with(".256")) { + if (Size == 'd') // avx512.mask.psll.d.256, avx512.mask.psll.di.256 + IID = IsImmediate ? Intrinsic::x86_avx2_pslli_d + : Intrinsic::x86_avx2_psll_d; + else if (Size == 'q') // avx512.mask.psll.q.256, avx512.mask.psll.qi.256 + IID = IsImmediate ? Intrinsic::x86_avx2_pslli_q + : Intrinsic::x86_avx2_psll_q; + else if (Size == 'w') // avx512.mask.psll.w.256, avx512.mask.psll.wi.256 + IID = IsImmediate ? Intrinsic::x86_avx2_pslli_w + : Intrinsic::x86_avx2_psll_w; + else + llvm_unreachable("Unexpected size"); + } else { + if (Size == 'd') // psll.di.512, pslli.d, psll.d, psllv.d.512 + IID = IsImmediate ? Intrinsic::x86_avx512_pslli_d_512 + : IsVariable ? Intrinsic::x86_avx512_psllv_d_512 + : Intrinsic::x86_avx512_psll_d_512; + else if (Size == 'q') // psll.qi.512, pslli.q, psll.q, psllv.q.512 + IID = IsImmediate ? Intrinsic::x86_avx512_pslli_q_512 + : IsVariable ? Intrinsic::x86_avx512_psllv_q_512 + : Intrinsic::x86_avx512_psll_q_512; + else if (Size == 'w') // psll.wi.512, pslli.w, psll.w + IID = IsImmediate ? Intrinsic::x86_avx512_pslli_w_512 + : Intrinsic::x86_avx512_psll_w_512; + else + llvm_unreachable("Unexpected size"); + } + + Rep = upgradeX86MaskedShift(Builder, *CI, IID); + } else if (Name.starts_with("avx512.mask.psrl")) { + bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); + bool IsVariable = Name[16] == 'v'; + char Size = Name[16] == '.' ? Name[17] + : Name[17] == '.' ? Name[18] + : Name[18] == '.' ? 
Name[19] + : Name[20]; + + Intrinsic::ID IID; + if (IsVariable && Name[17] != '.') { + if (Size == 'd' && Name[17] == '2') // avx512.mask.psrlv2.di + IID = Intrinsic::x86_avx2_psrlv_q; + else if (Size == 'd' && Name[17] == '4') // avx512.mask.psrlv4.di + IID = Intrinsic::x86_avx2_psrlv_q_256; + else if (Size == 's' && Name[17] == '4') // avx512.mask.psrlv4.si + IID = Intrinsic::x86_avx2_psrlv_d; + else if (Size == 's' && Name[17] == '8') // avx512.mask.psrlv8.si + IID = Intrinsic::x86_avx2_psrlv_d_256; + else if (Size == 'h' && Name[17] == '8') // avx512.mask.psrlv8.hi + IID = Intrinsic::x86_avx512_psrlv_w_128; + else if (Size == 'h' && Name[17] == '1') // avx512.mask.psrlv16.hi + IID = Intrinsic::x86_avx512_psrlv_w_256; + else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psrlv32hi + IID = Intrinsic::x86_avx512_psrlv_w_512; + else + llvm_unreachable("Unexpected size"); + } else if (Name.ends_with(".128")) { + if (Size == 'd') // avx512.mask.psrl.d.128, avx512.mask.psrl.di.128 + IID = IsImmediate ? Intrinsic::x86_sse2_psrli_d + : Intrinsic::x86_sse2_psrl_d; + else if (Size == 'q') // avx512.mask.psrl.q.128, avx512.mask.psrl.qi.128 + IID = IsImmediate ? Intrinsic::x86_sse2_psrli_q + : Intrinsic::x86_sse2_psrl_q; + else if (Size == 'w') // avx512.mask.psrl.w.128, avx512.mask.psrl.wi.128 + IID = IsImmediate ? Intrinsic::x86_sse2_psrli_w + : Intrinsic::x86_sse2_psrl_w; + else + llvm_unreachable("Unexpected size"); + } else if (Name.ends_with(".256")) { + if (Size == 'd') // avx512.mask.psrl.d.256, avx512.mask.psrl.di.256 + IID = IsImmediate ? Intrinsic::x86_avx2_psrli_d + : Intrinsic::x86_avx2_psrl_d; + else if (Size == 'q') // avx512.mask.psrl.q.256, avx512.mask.psrl.qi.256 + IID = IsImmediate ? Intrinsic::x86_avx2_psrli_q + : Intrinsic::x86_avx2_psrl_q; + else if (Size == 'w') // avx512.mask.psrl.w.256, avx512.mask.psrl.wi.256 + IID = IsImmediate ? 
Intrinsic::x86_avx2_psrli_w + : Intrinsic::x86_avx2_psrl_w; + else + llvm_unreachable("Unexpected size"); + } else { + if (Size == 'd') // psrl.di.512, psrli.d, psrl.d, psrl.d.512 + IID = IsImmediate ? Intrinsic::x86_avx512_psrli_d_512 + : IsVariable ? Intrinsic::x86_avx512_psrlv_d_512 + : Intrinsic::x86_avx512_psrl_d_512; + else if (Size == 'q') // psrl.qi.512, psrli.q, psrl.q, psrl.q.512 + IID = IsImmediate ? Intrinsic::x86_avx512_psrli_q_512 + : IsVariable ? Intrinsic::x86_avx512_psrlv_q_512 + : Intrinsic::x86_avx512_psrl_q_512; + else if (Size == 'w') // psrl.wi.512, psrli.w, psrl.w) + IID = IsImmediate ? Intrinsic::x86_avx512_psrli_w_512 + : Intrinsic::x86_avx512_psrl_w_512; + else + llvm_unreachable("Unexpected size"); + } + + Rep = upgradeX86MaskedShift(Builder, *CI, IID); + } else if (Name.starts_with("avx512.mask.psra")) { + bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); + bool IsVariable = Name[16] == 'v'; + char Size = Name[16] == '.' ? Name[17] + : Name[17] == '.' ? Name[18] + : Name[18] == '.' ? Name[19] + : Name[20]; + + Intrinsic::ID IID; + if (IsVariable && Name[17] != '.') { + if (Size == 's' && Name[17] == '4') // avx512.mask.psrav4.si + IID = Intrinsic::x86_avx2_psrav_d; + else if (Size == 's' && Name[17] == '8') // avx512.mask.psrav8.si + IID = Intrinsic::x86_avx2_psrav_d_256; + else if (Size == 'h' && Name[17] == '8') // avx512.mask.psrav8.hi + IID = Intrinsic::x86_avx512_psrav_w_128; + else if (Size == 'h' && Name[17] == '1') // avx512.mask.psrav16.hi + IID = Intrinsic::x86_avx512_psrav_w_256; + else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psrav32hi + IID = Intrinsic::x86_avx512_psrav_w_512; + else + llvm_unreachable("Unexpected size"); + } else if (Name.ends_with(".128")) { + if (Size == 'd') // avx512.mask.psra.d.128, avx512.mask.psra.di.128 + IID = IsImmediate ? 
Intrinsic::x86_sse2_psrai_d + : Intrinsic::x86_sse2_psra_d; + else if (Size == 'q') // avx512.mask.psra.q.128, avx512.mask.psra.qi.128 + IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_128 + : IsVariable ? Intrinsic::x86_avx512_psrav_q_128 + : Intrinsic::x86_avx512_psra_q_128; + else if (Size == 'w') // avx512.mask.psra.w.128, avx512.mask.psra.wi.128 + IID = IsImmediate ? Intrinsic::x86_sse2_psrai_w + : Intrinsic::x86_sse2_psra_w; + else + llvm_unreachable("Unexpected size"); + } else if (Name.ends_with(".256")) { + if (Size == 'd') // avx512.mask.psra.d.256, avx512.mask.psra.di.256 + IID = IsImmediate ? Intrinsic::x86_avx2_psrai_d + : Intrinsic::x86_avx2_psra_d; + else if (Size == 'q') // avx512.mask.psra.q.256, avx512.mask.psra.qi.256 + IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_256 + : IsVariable ? Intrinsic::x86_avx512_psrav_q_256 + : Intrinsic::x86_avx512_psra_q_256; + else if (Size == 'w') // avx512.mask.psra.w.256, avx512.mask.psra.wi.256 + IID = IsImmediate ? Intrinsic::x86_avx2_psrai_w + : Intrinsic::x86_avx2_psra_w; + else + llvm_unreachable("Unexpected size"); + } else { + if (Size == 'd') // psra.di.512, psrai.d, psra.d, psrav.d.512 + IID = IsImmediate ? Intrinsic::x86_avx512_psrai_d_512 + : IsVariable ? Intrinsic::x86_avx512_psrav_d_512 + : Intrinsic::x86_avx512_psra_d_512; + else if (Size == 'q') // psra.qi.512, psrai.q, psra.q + IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_512 + : IsVariable ? Intrinsic::x86_avx512_psrav_q_512 + : Intrinsic::x86_avx512_psra_q_512; + else if (Size == 'w') // psra.wi.512, psrai.w, psra.w + IID = IsImmediate ? 
Intrinsic::x86_avx512_psrai_w_512 + : Intrinsic::x86_avx512_psra_w_512; + else + llvm_unreachable("Unexpected size"); + } + + Rep = upgradeX86MaskedShift(Builder, *CI, IID); + } else if (Name.starts_with("avx512.mask.move.s")) { + Rep = upgradeMaskedMove(Builder, *CI); + } else if (Name.starts_with("avx512.cvtmask2")) { + Rep = upgradeMaskToInt(Builder, *CI); + } else if (Name.ends_with(".movntdqa")) { + MDNode *Node = MDNode::get( + C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); + + Value *Ptr = CI->getArgOperand(0); + + // Convert the type of the pointer to a pointer to the stored type. + Value *BC = Builder.CreateBitCast( + Ptr, PointerType::getUnqual(CI->getType()), "cast"); + LoadInst *LI = Builder.CreateAlignedLoad( + CI->getType(), BC, + Align(CI->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); + LI->setMetadata(LLVMContext::MD_nontemporal, Node); + Rep = LI; + } else if (Name.starts_with("fma.vfmadd.") || + Name.starts_with("fma.vfmsub.") || + Name.starts_with("fma.vfnmadd.") || + Name.starts_with("fma.vfnmsub.")) { + bool NegMul = Name[6] == 'n'; + bool NegAcc = NegMul ? Name[8] == 's' : Name[7] == 's'; + bool IsScalar = NegMul ? 
Name[12] == 's' : Name[11] == 's'; + + Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + + if (IsScalar) { + Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0); + Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); + } + + if (NegMul && !IsScalar) + Ops[0] = Builder.CreateFNeg(Ops[0]); + if (NegMul && IsScalar) + Ops[1] = Builder.CreateFNeg(Ops[1]); + if (NegAcc) + Ops[2] = Builder.CreateFNeg(Ops[2]); + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::fma, + Ops[0]->getType()), + Ops); + + if (IsScalar) + Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); + } else if (Name.starts_with("fma4.vfmadd.s")) { + Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + + Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0); + Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); + Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), + Intrinsic::fma, + Ops[0]->getType()), + Ops); + + Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), + Rep, (uint64_t)0); + } else if (Name.starts_with("avx512.mask.vfmadd.s") || + Name.starts_with("avx512.maskz.vfmadd.s") || + Name.starts_with("avx512.mask3.vfmadd.s") || + Name.starts_with("avx512.mask3.vfmsub.s") || + Name.starts_with("avx512.mask3.vfnmsub.s")) { + bool IsMask3 = Name[11] == '3'; + bool IsMaskZ = Name[11] == 'z'; + // Drop the "avx512.mask." to make it easier. + Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12); + bool NegMul = Name[2] == 'n'; + bool NegAcc = NegMul ? 
Name[4] == 's' : Name[3] == 's'; + + Value *A = CI->getArgOperand(0); + Value *B = CI->getArgOperand(1); + Value *C = CI->getArgOperand(2); + + if (NegMul && (IsMask3 || IsMaskZ)) + A = Builder.CreateFNeg(A); + if (NegMul && !(IsMask3 || IsMaskZ)) + B = Builder.CreateFNeg(B); + if (NegAcc) + C = Builder.CreateFNeg(C); + + A = Builder.CreateExtractElement(A, (uint64_t)0); + B = Builder.CreateExtractElement(B, (uint64_t)0); + C = Builder.CreateExtractElement(C, (uint64_t)0); + + if (!isa(CI->getArgOperand(4)) || + cast(CI->getArgOperand(4))->getZExtValue() != 4) { + Value *Ops[] = {A, B, C, CI->getArgOperand(4)}; + + Intrinsic::ID IID; + if (Name.back() == 'd') + IID = Intrinsic::x86_avx512_vfmadd_f64; + else + IID = Intrinsic::x86_avx512_vfmadd_f32; + Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID); + Rep = Builder.CreateCall(FMA, Ops); + } else { + Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, + A->getType()); + Rep = Builder.CreateCall(FMA, {A, B, C}); + } + + Value *PassThru = IsMaskZ ? Constant::getNullValue(Rep->getType()) + : IsMask3 ? C + : A; + + // For Mask3 with NegAcc, we need to create a new extractelement that + // avoids the negation above. + if (NegAcc && IsMask3) + PassThru = + Builder.CreateExtractElement(CI->getArgOperand(2), (uint64_t)0); + + Rep = emitX86ScalarSelect(Builder, CI->getArgOperand(3), Rep, PassThru); + Rep = Builder.CreateInsertElement(CI->getArgOperand(IsMask3 ? 2 : 0), Rep, + (uint64_t)0); + } else if (Name.starts_with("avx512.mask.vfmadd.p") || + Name.starts_with("avx512.mask.vfnmadd.p") || + Name.starts_with("avx512.mask.vfnmsub.p") || + Name.starts_with("avx512.mask3.vfmadd.p") || + Name.starts_with("avx512.mask3.vfmsub.p") || + Name.starts_with("avx512.mask3.vfnmsub.p") || + Name.starts_with("avx512.maskz.vfmadd.p")) { + bool IsMask3 = Name[11] == '3'; + bool IsMaskZ = Name[11] == 'z'; + // Drop the "avx512.mask." to make it easier. + Name = Name.drop_front(IsMask3 || IsMaskZ ? 
13 : 12); + bool NegMul = Name[2] == 'n'; + bool NegAcc = NegMul ? Name[4] == 's' : Name[3] == 's'; + + Value *A = CI->getArgOperand(0); + Value *B = CI->getArgOperand(1); + Value *C = CI->getArgOperand(2); + + if (NegMul && (IsMask3 || IsMaskZ)) + A = Builder.CreateFNeg(A); + if (NegMul && !(IsMask3 || IsMaskZ)) + B = Builder.CreateFNeg(B); + if (NegAcc) + C = Builder.CreateFNeg(C); + + if (CI->arg_size() == 5 && + (!isa(CI->getArgOperand(4)) || + cast(CI->getArgOperand(4))->getZExtValue() != 4)) { + Intrinsic::ID IID; + // Check the character before ".512" in string. + if (Name[Name.size() - 5] == 's') + IID = Intrinsic::x86_avx512_vfmadd_ps_512; + else + IID = Intrinsic::x86_avx512_vfmadd_pd_512; + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + {A, B, C, CI->getArgOperand(4)}); + } else { + Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, + A->getType()); + Rep = Builder.CreateCall(FMA, {A, B, C}); + } + + Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) + : IsMask3 ? 
CI->getArgOperand(2) + : CI->getArgOperand(0); + + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + } else if (Name.starts_with("fma.vfmsubadd.p")) { + unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); + unsigned EltWidth = CI->getType()->getScalarSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && EltWidth == 32) + IID = Intrinsic::x86_fma_vfmaddsub_ps; + else if (VecWidth == 256 && EltWidth == 32) + IID = Intrinsic::x86_fma_vfmaddsub_ps_256; + else if (VecWidth == 128 && EltWidth == 64) + IID = Intrinsic::x86_fma_vfmaddsub_pd; + else if (VecWidth == 256 && EltWidth == 64) + IID = Intrinsic::x86_fma_vfmaddsub_pd_256; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Ops[2] = Builder.CreateFNeg(Ops[2]); + Rep = + Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Ops); + } else if (Name.starts_with("avx512.mask.vfmaddsub.p") || + Name.starts_with("avx512.mask3.vfmaddsub.p") || + Name.starts_with("avx512.maskz.vfmaddsub.p") || + Name.starts_with("avx512.mask3.vfmsubadd.p")) { + bool IsMask3 = Name[11] == '3'; + bool IsMaskZ = Name[11] == 'z'; + // Drop the "avx512.mask." to make it easier. + Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12); + bool IsSubAdd = Name[3] == 's'; + if (CI->arg_size() == 5) { + Intrinsic::ID IID; + // Check the character before ".512" in string. 
+ if (Name[Name.size() - 5] == 's') + IID = Intrinsic::x86_avx512_vfmaddsub_ps_512; + else + IID = Intrinsic::x86_avx512_vfmaddsub_pd_512; + + Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(4)}; + if (IsSubAdd) + Ops[2] = Builder.CreateFNeg(Ops[2]); + + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), + Ops); + } else { + int NumElts = cast(CI->getType())->getNumElements(); + + Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + + Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()); + Value *Odd = Builder.CreateCall(FMA, Ops); + Ops[2] = Builder.CreateFNeg(Ops[2]); + Value *Even = Builder.CreateCall(FMA, Ops); + + if (IsSubAdd) + std::swap(Even, Odd); + + SmallVector Idxs(NumElts); + for (int i = 0; i != NumElts; ++i) + Idxs[i] = i + (i % 2) * NumElts; + + Rep = Builder.CreateShuffleVector(Even, Odd, Idxs); + } + + Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) + : IsMask3 ? 
CI->getArgOperand(2) + : CI->getArgOperand(0); + + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + } else if (Name.starts_with("avx512.mask.pternlog.") || + Name.starts_with("avx512.maskz.pternlog.")) { + bool ZeroMask = Name[11] == 'z'; + unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); + unsigned EltWidth = CI->getType()->getScalarSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && EltWidth == 32) + IID = Intrinsic::x86_avx512_pternlog_d_128; + else if (VecWidth == 256 && EltWidth == 32) + IID = Intrinsic::x86_avx512_pternlog_d_256; + else if (VecWidth == 512 && EltWidth == 32) + IID = Intrinsic::x86_avx512_pternlog_d_512; + else if (VecWidth == 128 && EltWidth == 64) + IID = Intrinsic::x86_avx512_pternlog_q_128; + else if (VecWidth == 256 && EltWidth == 64) + IID = Intrinsic::x86_avx512_pternlog_q_256; + else if (VecWidth == 512 && EltWidth == 64) + IID = Intrinsic::x86_avx512_pternlog_q_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3)}; + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) + : CI->getArgOperand(0); + Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); + } else if (Name.starts_with("avx512.mask.vpmadd52") || + Name.starts_with("avx512.maskz.vpmadd52")) { + bool ZeroMask = Name[11] == 'z'; + bool High = Name[20] == 'h' || Name[21] == 'h'; + unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && !High) + IID = Intrinsic::x86_avx512_vpmadd52l_uq_128; + else if (VecWidth == 256 && !High) + IID = Intrinsic::x86_avx512_vpmadd52l_uq_256; + else if (VecWidth == 512 && !High) + IID = Intrinsic::x86_avx512_vpmadd52l_uq_512; + else if (VecWidth == 128 && High) + IID = Intrinsic::x86_avx512_vpmadd52h_uq_128; + else if (VecWidth == 256 && High) + IID = Intrinsic::x86_avx512_vpmadd52h_uq_256; + else if (VecWidth == 512 && High) + IID = Intrinsic::x86_avx512_vpmadd52h_uq_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) + : CI->getArgOperand(0); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + } else if (Name.starts_with("avx512.mask.vpermi2var.") || + Name.starts_with("avx512.mask.vpermt2var.") || + Name.starts_with("avx512.maskz.vpermt2var.")) { + bool ZeroMask = Name[11] == 'z'; + bool IndexForm = Name[17] == 'i'; + Rep = upgradeX86VPERMT2Intrinsics(Builder, *CI, ZeroMask, IndexForm); + } else if (Name.starts_with("avx512.mask.vpdpbusd.") || + Name.starts_with("avx512.maskz.vpdpbusd.") || + Name.starts_with("avx512.mask.vpdpbusds.") || + Name.starts_with("avx512.maskz.vpdpbusds.")) { + bool ZeroMask = Name[11] == 'z'; + bool IsSaturating = Name[ZeroMask ? 
21 : 20] == 's'; + unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && !IsSaturating) + IID = Intrinsic::x86_avx512_vpdpbusd_128; + else if (VecWidth == 256 && !IsSaturating) + IID = Intrinsic::x86_avx512_vpdpbusd_256; + else if (VecWidth == 512 && !IsSaturating) + IID = Intrinsic::x86_avx512_vpdpbusd_512; + else if (VecWidth == 128 && IsSaturating) + IID = Intrinsic::x86_avx512_vpdpbusds_128; + else if (VecWidth == 256 && IsSaturating) + IID = Intrinsic::x86_avx512_vpdpbusds_256; + else if (VecWidth == 512 && IsSaturating) + IID = Intrinsic::x86_avx512_vpdpbusds_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) + : CI->getArgOperand(0); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + } else if (Name.starts_with("avx512.mask.vpdpwssd.") || + Name.starts_with("avx512.maskz.vpdpwssd.") || + Name.starts_with("avx512.mask.vpdpwssds.") || + Name.starts_with("avx512.maskz.vpdpwssds.")) { + bool ZeroMask = Name[11] == 'z'; + bool IsSaturating = Name[ZeroMask ? 
21 : 20] == 's'; + unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); + Intrinsic::ID IID; + if (VecWidth == 128 && !IsSaturating) + IID = Intrinsic::x86_avx512_vpdpwssd_128; + else if (VecWidth == 256 && !IsSaturating) + IID = Intrinsic::x86_avx512_vpdpwssd_256; + else if (VecWidth == 512 && !IsSaturating) + IID = Intrinsic::x86_avx512_vpdpwssd_512; + else if (VecWidth == 128 && IsSaturating) + IID = Intrinsic::x86_avx512_vpdpwssds_128; + else if (VecWidth == 256 && IsSaturating) + IID = Intrinsic::x86_avx512_vpdpwssds_256; + else if (VecWidth == 512 && IsSaturating) + IID = Intrinsic::x86_avx512_vpdpwssds_512; + else + llvm_unreachable("Unexpected intrinsic"); + + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), + Args); + Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) + : CI->getArgOperand(0); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + } else if (Name == "addcarryx.u32" || Name == "addcarryx.u64" || + Name == "addcarry.u32" || Name == "addcarry.u64" || + Name == "subborrow.u32" || Name == "subborrow.u64") { + Intrinsic::ID IID; + if (Name[0] == 'a' && Name.back() == '2') + IID = Intrinsic::x86_addcarry_32; + else if (Name[0] == 'a' && Name.back() == '4') + IID = Intrinsic::x86_addcarry_64; + else if (Name[0] == 's' && Name.back() == '2') + IID = Intrinsic::x86_subborrow_32; + else if (Name[0] == 's' && Name.back() == '4') + IID = Intrinsic::x86_subborrow_64; + else + llvm_unreachable("Unexpected intrinsic"); + + // Make a call with 3 operands. + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Value *NewCall = Builder.CreateCall( + Intrinsic::getDeclaration(CI->getModule(), IID), Args); + + // Extract the second result and store it. + Value *Data = Builder.CreateExtractValue(NewCall, 1); + // Cast the pointer to the right type. 
+ Value *Ptr = Builder.CreateBitCast( + CI->getArgOperand(3), llvm::PointerType::getUnqual(Data->getType())); + Builder.CreateAlignedStore(Data, Ptr, Align(1)); + // Replace the original call result with the first result of the new call. + Value *CF = Builder.CreateExtractValue(NewCall, 0); + + CI->replaceAllUsesWith(CF); + Rep = nullptr; + } else if (Name.starts_with("avx512.mask.") && + upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { + // Rep will be updated by the call in the condition. + } + + return Rep; +} + static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { if (Name == "mve.vctp64.old") { @@ -2497,1766 +4187,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { bool IsARM = Name.consume_front("arm."); bool IsAMDGCN = Name.consume_front("amdgcn."); bool IsDbg = Name.consume_front("dbg."); - - if (IsX86 && Name.starts_with("sse4a.movnt.")) { - SmallVector Elts; - Elts.push_back( - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); - MDNode *Node = MDNode::get(C, Elts); - - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - - // Nontemporal (unaligned) store of the 0'th element of the float/double - // vector. - Type *SrcEltTy = cast(Arg1->getType())->getElementType(); - PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy); - Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast"); - Value *Extract = - Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement"); - - StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, Align(1)); - SI->setMetadata(LLVMContext::MD_nontemporal, Node); - - // Remove intrinsic. 
- CI->eraseFromParent(); - return; - } - - if (IsX86 && (Name.starts_with("avx.movnt.") || - Name.starts_with("avx512.storent."))) { - SmallVector Elts; - Elts.push_back( - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); - MDNode *Node = MDNode::get(C, Elts); - - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - - // Convert the type of the pointer to a pointer to the stored type. - Value *BC = Builder.CreateBitCast(Arg0, - PointerType::getUnqual(Arg1->getType()), - "cast"); - StoreInst *SI = Builder.CreateAlignedStore( - Arg1, BC, - Align(Arg1->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); - SI->setMetadata(LLVMContext::MD_nontemporal, Node); - - // Remove intrinsic. - CI->eraseFromParent(); - return; - } - - if (IsX86 && Name == "sse2.storel.dq") { - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - - auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2); - Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast"); - Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0); - Value *BC = Builder.CreateBitCast(Arg0, - PointerType::getUnqual(Elt->getType()), - "cast"); - Builder.CreateAlignedStore(Elt, BC, Align(1)); - - // Remove intrinsic. - CI->eraseFromParent(); - return; - } - - if (IsX86 && (Name.starts_with("sse.storeu.") || - Name.starts_with("sse2.storeu.") || - Name.starts_with("avx.storeu."))) { - Value *Arg0 = CI->getArgOperand(0); - Value *Arg1 = CI->getArgOperand(1); - - Arg0 = Builder.CreateBitCast(Arg0, - PointerType::getUnqual(Arg1->getType()), - "cast"); - Builder.CreateAlignedStore(Arg1, Arg0, Align(1)); - - // Remove intrinsic. - CI->eraseFromParent(); - return; - } - - if (IsX86 && Name == "avx512.mask.store.ss") { - Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1)); - upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), - Mask, false); - - // Remove intrinsic. 
- CI->eraseFromParent(); - return; - } - - if (IsX86 && Name.starts_with("avx512.mask.store")) { - // "avx512.mask.storeu." or "avx512.mask.store." - bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu". - upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), Aligned); - - // Remove intrinsic. - CI->eraseFromParent(); - return; - } - Value *Rep = nullptr; - // Upgrade packed integer vector compare intrinsics to compare instructions. - if (IsX86 && (Name.starts_with("sse2.pcmp") || - Name.starts_with("avx2.pcmp"))) { - // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt." - bool CmpEq = Name[9] == 'e'; - Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT, - CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = Builder.CreateSExt(Rep, CI->getType(), ""); - } else if (IsX86 && (Name.starts_with("avx512.broadcastm"))) { - Type *ExtTy = Type::getInt32Ty(C); - if (CI->getOperand(0)->getType()->isIntegerTy(8)) - ExtTy = Type::getInt64Ty(C); - unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / - ExtTy->getPrimitiveSizeInBits(); - Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy); - Rep = Builder.CreateVectorSplat(NumElts, Rep); - } else if (IsX86 && (Name == "sse.sqrt.ss" || - Name == "sse2.sqrt.sd")) { - Value *Vec = CI->getArgOperand(0); - Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); - Function *Intr = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::sqrt, Elt0->getType()); - Elt0 = Builder.CreateCall(Intr, Elt0); - Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); - } else if (IsX86 && (Name.starts_with("avx.sqrt.p") || - Name.starts_with("sse2.sqrt.p") || - Name.starts_with("sse.sqrt.p"))) { - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), - Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); - } else if (IsX86 && (Name.starts_with("avx512.mask.sqrt.p"))) { - if (CI->arg_size() == 4 && - (!isa(CI->getArgOperand(3)) || 
- cast(CI->getArgOperand(3))->getZExtValue() != 4)) { - Intrinsic::ID IID = Name[18] == 's' ? Intrinsic::x86_avx512_sqrt_ps_512 - : Intrinsic::x86_avx512_sqrt_pd_512; - - Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(3) }; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - IID), Args); - } else { - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), - Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); - } - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("avx512.ptestm") || - Name.starts_with("avx512.ptestnm"))) { - Value *Op0 = CI->getArgOperand(0); - Value *Op1 = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Rep = Builder.CreateAnd(Op0, Op1); - llvm::Type *Ty = Op0->getType(); - Value *Zero = llvm::Constant::getNullValue(Ty); - ICmpInst::Predicate Pred = - Name.starts_with("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; - Rep = Builder.CreateICmp(Pred, Rep, Zero); - Rep = applyX86MaskOn1BitsVec(Builder, Rep, Mask); - } else if (IsX86 && (Name.starts_with("avx512.mask.pbroadcast"))){ - unsigned NumElts = cast(CI->getArgOperand(1)->getType()) - ->getNumElements(); - Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("avx512.kunpck"))) { - unsigned NumElts = CI->getType()->getScalarSizeInBits(); - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts); - int Indices[64]; - for (unsigned i = 0; i != NumElts; ++i) - Indices[i] = i; - - // First extract half of each vector. This gives better codegen than - // doing it in a single shuffle. 
- LHS = - Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2)); - RHS = - Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2)); - // Concat the vectors. - // NOTE: Operands have to be swapped to match intrinsic definition. - Rep = Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts)); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && Name == "avx512.kand.w") { - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); - Rep = Builder.CreateAnd(LHS, RHS); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && Name == "avx512.kandn.w") { - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); - LHS = Builder.CreateNot(LHS); - Rep = Builder.CreateAnd(LHS, RHS); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && Name == "avx512.kor.w") { - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); - Rep = Builder.CreateOr(LHS, RHS); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && Name == "avx512.kxor.w") { - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); - Rep = Builder.CreateXor(LHS, RHS); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && Name == "avx512.kxnor.w") { - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); - LHS = Builder.CreateNot(LHS); - Rep = Builder.CreateXor(LHS, RHS); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && Name == "avx512.knot.w") { - Rep = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Rep = Builder.CreateNot(Rep); - Rep = Builder.CreateBitCast(Rep, CI->getType()); - } else if (IsX86 && - 
(Name == "avx512.kortestz.w" || Name == "avx512.kortestc.w")) { - Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), 16); - Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), 16); - Rep = Builder.CreateOr(LHS, RHS); - Rep = Builder.CreateBitCast(Rep, Builder.getInt16Ty()); - Value *C; - if (Name[14] == 'c') - C = ConstantInt::getAllOnesValue(Builder.getInt16Ty()); - else - C = ConstantInt::getNullValue(Builder.getInt16Ty()); - Rep = Builder.CreateICmpEQ(Rep, C); - Rep = Builder.CreateZExt(Rep, Builder.getInt32Ty()); - } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd" || - Name == "sse.sub.ss" || Name == "sse2.sub.sd" || - Name == "sse.mul.ss" || Name == "sse2.mul.sd" || - Name == "sse.div.ss" || Name == "sse2.div.sd")) { - Type *I32Ty = Type::getInt32Ty(C); - Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), - ConstantInt::get(I32Ty, 0)); - Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1), - ConstantInt::get(I32Ty, 0)); - Value *EltOp; - if (Name.contains(".add.")) - EltOp = Builder.CreateFAdd(Elt0, Elt1); - else if (Name.contains(".sub.")) - EltOp = Builder.CreateFSub(Elt0, Elt1); - else if (Name.contains(".mul.")) - EltOp = Builder.CreateFMul(Elt0, Elt1); - else - EltOp = Builder.CreateFDiv(Elt0, Elt1); - Rep = Builder.CreateInsertElement(CI->getArgOperand(0), EltOp, - ConstantInt::get(I32Ty, 0)); - } else if (IsX86 && Name.starts_with("avx512.mask.pcmp")) { - // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt." - bool CmpEq = Name[16] == 'e'; - Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 
0 : 6, true); - } else if (IsX86 && Name.starts_with("avx512.mask.vpshufbitqmb.")) { - Type *OpTy = CI->getArgOperand(0)->getType(); - unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); - Intrinsic::ID IID; - switch (VecWidth) { - default: llvm_unreachable("Unexpected intrinsic"); - case 128: IID = Intrinsic::x86_avx512_vpshufbitqmb_128; break; - case 256: IID = Intrinsic::x86_avx512_vpshufbitqmb_256; break; - case 512: IID = Intrinsic::x86_avx512_vpshufbitqmb_512; break; - } - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getOperand(0), CI->getArgOperand(1) }); - Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.fpclass.p")) { - Type *OpTy = CI->getArgOperand(0)->getType(); - unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); - unsigned EltWidth = OpTy->getScalarSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && EltWidth == 32) - IID = Intrinsic::x86_avx512_fpclass_ps_128; - else if (VecWidth == 256 && EltWidth == 32) - IID = Intrinsic::x86_avx512_fpclass_ps_256; - else if (VecWidth == 512 && EltWidth == 32) - IID = Intrinsic::x86_avx512_fpclass_ps_512; - else if (VecWidth == 128 && EltWidth == 64) - IID = Intrinsic::x86_avx512_fpclass_pd_128; - else if (VecWidth == 256 && EltWidth == 64) - IID = Intrinsic::x86_avx512_fpclass_pd_256; - else if (VecWidth == 512 && EltWidth == 64) - IID = Intrinsic::x86_avx512_fpclass_pd_512; - else - llvm_unreachable("Unexpected intrinsic"); - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getOperand(0), CI->getArgOperand(1) }); - Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.cmp.p")) { - SmallVector Args(CI->args()); - Type *OpTy = Args[0]->getType(); - unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); - unsigned EltWidth = OpTy->getScalarSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && EltWidth == 32) 
- IID = Intrinsic::x86_avx512_mask_cmp_ps_128; - else if (VecWidth == 256 && EltWidth == 32) - IID = Intrinsic::x86_avx512_mask_cmp_ps_256; - else if (VecWidth == 512 && EltWidth == 32) - IID = Intrinsic::x86_avx512_mask_cmp_ps_512; - else if (VecWidth == 128 && EltWidth == 64) - IID = Intrinsic::x86_avx512_mask_cmp_pd_128; - else if (VecWidth == 256 && EltWidth == 64) - IID = Intrinsic::x86_avx512_mask_cmp_pd_256; - else if (VecWidth == 512 && EltWidth == 64) - IID = Intrinsic::x86_avx512_mask_cmp_pd_512; - else - llvm_unreachable("Unexpected intrinsic"); - - Value *Mask = Constant::getAllOnesValue(CI->getType()); - if (VecWidth == 512) - std::swap(Mask, Args.back()); - Args.push_back(Mask); - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Args); - } else if (IsX86 && Name.starts_with("avx512.mask.cmp.")) { - // Integer compare intrinsics. - unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - Rep = upgradeMaskedCompare(Builder, *CI, Imm, true); - } else if (IsX86 && Name.starts_with("avx512.mask.ucmp.")) { - unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - Rep = upgradeMaskedCompare(Builder, *CI, Imm, false); - } else if (IsX86 && (Name.starts_with("avx512.cvtb2mask.") || - Name.starts_with("avx512.cvtw2mask.") || - Name.starts_with("avx512.cvtd2mask.") || - Name.starts_with("avx512.cvtq2mask."))) { - Value *Op = CI->getArgOperand(0); - Value *Zero = llvm::Constant::getNullValue(Op->getType()); - Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero); - Rep = applyX86MaskOn1BitsVec(Builder, Rep, nullptr); - } else if(IsX86 && (Name == "ssse3.pabs.b.128" || - Name == "ssse3.pabs.w.128" || - Name == "ssse3.pabs.d.128" || - Name.starts_with("avx2.pabs") || - Name.starts_with("avx512.mask.pabs"))) { - Rep = upgradeAbs(Builder, *CI); - } else if (IsX86 && (Name == "sse41.pmaxsb" || - Name == "sse2.pmaxs.w" || - Name == "sse41.pmaxsd" || - Name.starts_with("avx2.pmaxs") || - Name.starts_with("avx512.mask.pmaxs"))) { - 
Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax); - } else if (IsX86 && (Name == "sse2.pmaxu.b" || - Name == "sse41.pmaxuw" || - Name == "sse41.pmaxud" || - Name.starts_with("avx2.pmaxu") || - Name.starts_with("avx512.mask.pmaxu"))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax); - } else if (IsX86 && (Name == "sse41.pminsb" || - Name == "sse2.pmins.w" || - Name == "sse41.pminsd" || - Name.starts_with("avx2.pmins") || - Name.starts_with("avx512.mask.pmins"))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin); - } else if (IsX86 && (Name == "sse2.pminu.b" || - Name == "sse41.pminuw" || - Name == "sse41.pminud" || - Name.starts_with("avx2.pminu") || - Name.starts_with("avx512.mask.pminu"))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin); - } else if (IsX86 && (Name == "sse2.pmulu.dq" || - Name == "avx2.pmulu.dq" || - Name == "avx512.pmulu.dq.512" || - Name.starts_with("avx512.mask.pmulu.dq."))) { - Rep = upgradePMULDQ(Builder, *CI, /*Signed*/false); - } else if (IsX86 && (Name == "sse41.pmuldq" || - Name == "avx2.pmul.dq" || - Name == "avx512.pmul.dq.512" || - Name.starts_with("avx512.mask.pmul.dq."))) { - Rep = upgradePMULDQ(Builder, *CI, /*Signed*/true); - } else if (IsX86 && (Name == "sse.cvtsi2ss" || - Name == "sse2.cvtsi2sd" || - Name == "sse.cvtsi642ss" || - Name == "sse2.cvtsi642sd")) { - Rep = Builder.CreateSIToFP( - CI->getArgOperand(1), - cast(CI->getType())->getElementType()); - Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); - } else if (IsX86 && Name == "avx512.cvtusi2sd") { - Rep = Builder.CreateUIToFP( - CI->getArgOperand(1), - cast(CI->getType())->getElementType()); - Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); - } else if (IsX86 && Name == "sse2.cvtss2sd") { - Rep = Builder.CreateExtractElement(CI->getArgOperand(1), (uint64_t)0); - Rep = Builder.CreateFPExt( - Rep, cast(CI->getType())->getElementType()); - Rep = 
Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); - } else if (IsX86 && (Name == "sse2.cvtdq2pd" || - Name == "sse2.cvtdq2ps" || - Name == "avx.cvtdq2.pd.256" || - Name == "avx.cvtdq2.ps.256" || - Name.starts_with("avx512.mask.cvtdq2pd.") || - Name.starts_with("avx512.mask.cvtudq2pd.") || - Name.starts_with("avx512.mask.cvtdq2ps.") || - Name.starts_with("avx512.mask.cvtudq2ps.") || - Name.starts_with("avx512.mask.cvtqq2pd.") || - Name.starts_with("avx512.mask.cvtuqq2pd.") || - Name == "avx512.mask.cvtqq2ps.256" || - Name == "avx512.mask.cvtqq2ps.512" || - Name == "avx512.mask.cvtuqq2ps.256" || - Name == "avx512.mask.cvtuqq2ps.512" || - Name == "sse2.cvtps2pd" || - Name == "avx.cvt.ps2.pd.256" || - Name == "avx512.mask.cvtps2pd.128" || - Name == "avx512.mask.cvtps2pd.256")) { - auto *DstTy = cast(CI->getType()); - Rep = CI->getArgOperand(0); - auto *SrcTy = cast(Rep->getType()); - - unsigned NumDstElts = DstTy->getNumElements(); - if (NumDstElts < SrcTy->getNumElements()) { - assert(NumDstElts == 2 && "Unexpected vector size"); - Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef{0, 1}); - } - - bool IsPS2PD = SrcTy->getElementType()->isFloatTy(); - bool IsUnsigned = Name.contains("cvtu"); - if (IsPS2PD) - Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); - else if (CI->arg_size() == 4 && - (!isa(CI->getArgOperand(3)) || - cast(CI->getArgOperand(3))->getZExtValue() != 4)) { - Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round - : Intrinsic::x86_avx512_sitofp_round; - Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, - { DstTy, SrcTy }); - Rep = Builder.CreateCall(F, { Rep, CI->getArgOperand(3) }); - } else { - Rep = IsUnsigned ? 
Builder.CreateUIToFP(Rep, DstTy, "cvt") - : Builder.CreateSIToFP(Rep, DstTy, "cvt"); - } - - if (CI->arg_size() >= 3) - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("avx512.mask.vcvtph2ps.") || - Name.starts_with("vcvtph2ps."))) { - auto *DstTy = cast(CI->getType()); - Rep = CI->getArgOperand(0); - auto *SrcTy = cast(Rep->getType()); - unsigned NumDstElts = DstTy->getNumElements(); - if (NumDstElts != SrcTy->getNumElements()) { - assert(NumDstElts == 4 && "Unexpected vector size"); - Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef{0, 1, 2, 3}); - } - Rep = Builder.CreateBitCast( - Rep, FixedVectorType::get(Type::getHalfTy(C), NumDstElts)); - Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps"); - if (CI->arg_size() >= 3) - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && Name.starts_with("avx512.mask.load")) { - // "avx512.mask.loadu." or "avx512.mask.load." - bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu". - Rep = - upgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), Aligned); - } else if (IsX86 && Name.starts_with("avx512.mask.expand.load.")) { - auto *ResultTy = cast(CI->getType()); - Type *PtrTy = ResultTy->getElementType(); - - // Cast the pointer to element type. - Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), - llvm::PointerType::getUnqual(PtrTy)); - - Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), - ResultTy->getNumElements()); - - Function *ELd = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::masked_expandload, - ResultTy); - Rep = Builder.CreateCall(ELd, { Ptr, MaskVec, CI->getOperand(1) }); - } else if (IsX86 && Name.starts_with("avx512.mask.compress.store.")) { - auto *ResultTy = cast(CI->getArgOperand(1)->getType()); - Type *PtrTy = ResultTy->getElementType(); - - // Cast the pointer to element type. 
- Value *Ptr = Builder.CreateBitCast(CI->getOperand(0), - llvm::PointerType::getUnqual(PtrTy)); - - Value *MaskVec = - getX86MaskVec(Builder, CI->getArgOperand(2), - cast(ResultTy)->getNumElements()); - - Function *CSt = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::masked_compressstore, - ResultTy); - Rep = Builder.CreateCall(CSt, { CI->getArgOperand(1), Ptr, MaskVec }); - } else if (IsX86 && (Name.starts_with("avx512.mask.compress.") || - Name.starts_with("avx512.mask.expand."))) { - auto *ResultTy = cast(CI->getType()); - - Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), - ResultTy->getNumElements()); - - bool IsCompress = Name[12] == 'c'; - Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress - : Intrinsic::x86_avx512_mask_expand; - Function *Intr = Intrinsic::getDeclaration(F->getParent(), IID, ResultTy); - Rep = Builder.CreateCall(Intr, { CI->getOperand(0), CI->getOperand(1), - MaskVec }); - } else if (IsX86 && Name.starts_with("xop.vpcom")) { - bool IsSigned; - if (Name.ends_with("ub") || Name.ends_with("uw") || Name.ends_with("ud") || - Name.ends_with("uq")) - IsSigned = false; - else if (Name.ends_with("b") || Name.ends_with("w") || Name.ends_with("d") || - Name.ends_with("q")) - IsSigned = true; - else - llvm_unreachable("Unknown suffix"); - - unsigned Imm; - if (CI->arg_size() == 3) { - Imm = cast(CI->getArgOperand(2))->getZExtValue(); - } else { - Name = Name.substr(9); // strip off "xop.vpcom" - if (Name.starts_with("lt")) - Imm = 0; - else if (Name.starts_with("le")) - Imm = 1; - else if (Name.starts_with("gt")) - Imm = 2; - else if (Name.starts_with("ge")) - Imm = 3; - else if (Name.starts_with("eq")) - Imm = 4; - else if (Name.starts_with("ne")) - Imm = 5; - else if (Name.starts_with("false")) - Imm = 6; - else if (Name.starts_with("true")) - Imm = 7; - else - llvm_unreachable("Unknown condition"); - } - - Rep = upgradeX86vpcom(Builder, *CI, Imm, IsSigned); - } else if (IsX86 && Name.starts_with("xop.vpcmov")) 
{ - Value *Sel = CI->getArgOperand(2); - Value *NotSel = Builder.CreateNot(Sel); - Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel); - Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel); - Rep = Builder.CreateOr(Sel0, Sel1); - } else if (IsX86 && (Name.starts_with("xop.vprot") || - Name.starts_with("avx512.prol") || - Name.starts_with("avx512.mask.prol"))) { - Rep = upgradeX86Rotate(Builder, *CI, false); - } else if (IsX86 && (Name.starts_with("avx512.pror") || - Name.starts_with("avx512.mask.pror"))) { - Rep = upgradeX86Rotate(Builder, *CI, true); - } else if (IsX86 && (Name.starts_with("avx512.vpshld.") || - Name.starts_with("avx512.mask.vpshld") || - Name.starts_with("avx512.maskz.vpshld"))) { - bool ZeroMask = Name[11] == 'z'; - Rep = upgradeX86ConcatShift(Builder, *CI, false, ZeroMask); - } else if (IsX86 && (Name.starts_with("avx512.vpshrd.") || - Name.starts_with("avx512.mask.vpshrd") || - Name.starts_with("avx512.maskz.vpshrd"))) { - bool ZeroMask = Name[11] == 'z'; - Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); - } else if (IsX86 && Name == "sse42.crc32.64.8") { - Function *CRC32 = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::x86_sse42_crc32_32_8); - Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); - Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)}); - Rep = Builder.CreateZExt(Rep, CI->getType(), ""); - } else if (IsX86 && (Name.starts_with("avx.vbroadcast.s") || - Name.starts_with("avx512.vbroadcast.s"))) { - // Replace broadcasts with a series of insertelements. 
- auto *VecTy = cast(CI->getType()); - Type *EltTy = VecTy->getElementType(); - unsigned EltNum = VecTy->getNumElements(); - Value *Load = Builder.CreateLoad(EltTy, CI->getArgOperand(0)); - Type *I32Ty = Type::getInt32Ty(C); - Rep = PoisonValue::get(VecTy); - for (unsigned I = 0; I < EltNum; ++I) - Rep = Builder.CreateInsertElement(Rep, Load, - ConstantInt::get(I32Ty, I)); - } else if (IsX86 && (Name.starts_with("sse41.pmovsx") || - Name.starts_with("sse41.pmovzx") || - Name.starts_with("avx2.pmovsx") || - Name.starts_with("avx2.pmovzx") || - Name.starts_with("avx512.mask.pmovsx") || - Name.starts_with("avx512.mask.pmovzx"))) { - auto *DstTy = cast(CI->getType()); - unsigned NumDstElts = DstTy->getNumElements(); - - // Extract a subvector of the first NumDstElts lanes and sign/zero extend. - SmallVector ShuffleMask(NumDstElts); - for (unsigned i = 0; i != NumDstElts; ++i) - ShuffleMask[i] = i; - - Value *SV = - Builder.CreateShuffleVector(CI->getArgOperand(0), ShuffleMask); - - bool DoSext = Name.contains("pmovsx"); - Rep = DoSext ? Builder.CreateSExt(SV, DstTy) - : Builder.CreateZExt(SV, DstTy); - // If there are 3 arguments, it's a masked intrinsic so we need a select. - if (CI->arg_size() == 3) - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (Name == "avx512.mask.pmov.qd.256" || - Name == "avx512.mask.pmov.qd.512" || - Name == "avx512.mask.pmov.wb.256" || - Name == "avx512.mask.pmov.wb.512") { - Type *Ty = CI->getArgOperand(1)->getType(); - Rep = Builder.CreateTrunc(CI->getArgOperand(0), Ty); - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("avx.vbroadcastf128") || - Name == "avx2.vbroadcasti128")) { - // Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle. 
- Type *EltTy = cast(CI->getType())->getElementType(); - unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits(); - auto *VT = FixedVectorType::get(EltTy, NumSrcElts); - Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0), - PointerType::getUnqual(VT)); - Value *Load = Builder.CreateAlignedLoad(VT, Op, Align(1)); - if (NumSrcElts == 2) - Rep = Builder.CreateShuffleVector(Load, ArrayRef{0, 1, 0, 1}); - else - Rep = Builder.CreateShuffleVector( - Load, ArrayRef{0, 1, 2, 3, 0, 1, 2, 3}); - } else if (IsX86 && (Name.starts_with("avx512.mask.shuf.i") || - Name.starts_with("avx512.mask.shuf.f"))) { - unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - Type *VT = CI->getType(); - unsigned NumLanes = VT->getPrimitiveSizeInBits() / 128; - unsigned NumElementsInLane = 128 / VT->getScalarSizeInBits(); - unsigned ControlBitsMask = NumLanes - 1; - unsigned NumControlBits = NumLanes / 2; - SmallVector ShuffleMask(0); - - for (unsigned l = 0; l != NumLanes; ++l) { - unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; - // We actually need the other source. 
- if (l >= NumLanes / 2) - LaneMask += NumLanes; - for (unsigned i = 0; i != NumElementsInLane; ++i) - ShuffleMask.push_back(LaneMask * NumElementsInLane + i); - } - Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), - CI->getArgOperand(1), ShuffleMask); - Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, - CI->getArgOperand(3)); - }else if (IsX86 && (Name.starts_with("avx512.mask.broadcastf") || - Name.starts_with("avx512.mask.broadcasti"))) { - unsigned NumSrcElts = - cast(CI->getArgOperand(0)->getType()) - ->getNumElements(); - unsigned NumDstElts = - cast(CI->getType())->getNumElements(); - - SmallVector ShuffleMask(NumDstElts); - for (unsigned i = 0; i != NumDstElts; ++i) - ShuffleMask[i] = i % NumSrcElts; - - Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), - CI->getArgOperand(0), - ShuffleMask); - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("avx2.pbroadcast") || - Name.starts_with("avx2.vbroadcast") || - Name.starts_with("avx512.pbroadcast") || - Name.starts_with("avx512.mask.broadcast.s"))) { - // Replace vp?broadcasts with a vector shuffle. 
- Value *Op = CI->getArgOperand(0); - ElementCount EC = cast(CI->getType())->getElementCount(); - Type *MaskTy = VectorType::get(Type::getInt32Ty(C), EC); - SmallVector M; - ShuffleVectorInst::getShuffleMask(Constant::getNullValue(MaskTy), M); - Rep = Builder.CreateShuffleVector(Op, M); - - if (CI->arg_size() == 3) - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("sse2.padds.") || - Name.starts_with("avx2.padds.") || - Name.starts_with("avx512.padds.") || - Name.starts_with("avx512.mask.padds."))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::sadd_sat); - } else if (IsX86 && (Name.starts_with("sse2.psubs.") || - Name.starts_with("avx2.psubs.") || - Name.starts_with("avx512.psubs.") || - Name.starts_with("avx512.mask.psubs."))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::ssub_sat); - } else if (IsX86 && (Name.starts_with("sse2.paddus.") || - Name.starts_with("avx2.paddus.") || - Name.starts_with("avx512.mask.paddus."))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::uadd_sat); - } else if (IsX86 && (Name.starts_with("sse2.psubus.") || - Name.starts_with("avx2.psubus.") || - Name.starts_with("avx512.mask.psubus."))) { - Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::usub_sat); - } else if (IsX86 && Name.starts_with("avx512.mask.palignr.")) { - Rep = upgradeX86ALIGNIntrinsics( - Builder, CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4), - false); - } else if (IsX86 && Name.starts_with("avx512.mask.valign.")) { - Rep = upgradeX86ALIGNIntrinsics( - Builder, CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4), - true); - } else if (IsX86 && (Name == "sse2.psll.dq" || - Name == "avx2.psll.dq")) { - // 128/256-bit shift left specified in bits. 
- unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = upgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), - Shift / 8); // Shift is in bits. - } else if (IsX86 && (Name == "sse2.psrl.dq" || - Name == "avx2.psrl.dq")) { - // 128/256-bit shift right specified in bits. - unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = upgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), - Shift / 8); // Shift is in bits. - } else if (IsX86 && (Name == "sse2.psll.dq.bs" || - Name == "avx2.psll.dq.bs" || - Name == "avx512.psll.dq.512")) { - // 128/256/512-bit shift left specified in bytes. - unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = upgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); - } else if (IsX86 && (Name == "sse2.psrl.dq.bs" || - Name == "avx2.psrl.dq.bs" || - Name == "avx512.psrl.dq.512")) { - // 128/256/512-bit shift right specified in bytes. - unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = upgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); - } else if (IsX86 && (Name == "sse41.pblendw" || - Name.starts_with("sse41.blendp") || - Name.starts_with("avx.blend.p") || - Name == "avx2.pblendw" || - Name.starts_with("avx2.pblendd."))) { - Value *Op0 = CI->getArgOperand(0); - Value *Op1 = CI->getArgOperand(1); - unsigned Imm = cast (CI->getArgOperand(2))->getZExtValue(); - auto *VecTy = cast(CI->getType()); - unsigned NumElts = VecTy->getNumElements(); - - SmallVector Idxs(NumElts); - for (unsigned i = 0; i != NumElts; ++i) - Idxs[i] = ((Imm >> (i%8)) & 1) ? 
i + NumElts : i; - - Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - } else if (IsX86 && (Name.starts_with("avx.vinsertf128.") || - Name == "avx2.vinserti128" || - Name.starts_with("avx512.mask.insert"))) { - Value *Op0 = CI->getArgOperand(0); - Value *Op1 = CI->getArgOperand(1); - unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - unsigned DstNumElts = - cast(CI->getType())->getNumElements(); - unsigned SrcNumElts = - cast(Op1->getType())->getNumElements(); - unsigned Scale = DstNumElts / SrcNumElts; - - // Mask off the high bits of the immediate value; hardware ignores those. - Imm = Imm % Scale; - - // Extend the second operand into a vector the size of the destination. - SmallVector Idxs(DstNumElts); - for (unsigned i = 0; i != SrcNumElts; ++i) - Idxs[i] = i; - for (unsigned i = SrcNumElts; i != DstNumElts; ++i) - Idxs[i] = SrcNumElts; - Rep = Builder.CreateShuffleVector(Op1, Idxs); - - // Insert the second operand into the first operand. - - // Note that there is no guarantee that instruction lowering will actually - // produce a vinsertf128 instruction for the created shuffles. In - // particular, the 0 immediate case involves no lane changes, so it can - // be handled as a blend. - - // Example of shuffle mask for 32-bit elements: - // Imm = 1 - // Imm = 0 - - // First fill with identify mask. - for (unsigned i = 0; i != DstNumElts; ++i) - Idxs[i] = i; - // Then replace the elements where we need to insert. - for (unsigned i = 0; i != SrcNumElts; ++i) - Idxs[i + Imm * SrcNumElts] = i + DstNumElts; - Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs); - - // If the intrinsic has a mask operand, handle that. 
- if (CI->arg_size() == 5) - Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, - CI->getArgOperand(3)); - } else if (IsX86 && (Name.starts_with("avx.vextractf128.") || - Name == "avx2.vextracti128" || - Name.starts_with("avx512.mask.vextract"))) { - Value *Op0 = CI->getArgOperand(0); - unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - unsigned DstNumElts = - cast(CI->getType())->getNumElements(); - unsigned SrcNumElts = - cast(Op0->getType())->getNumElements(); - unsigned Scale = SrcNumElts / DstNumElts; - - // Mask off the high bits of the immediate value; hardware ignores those. - Imm = Imm % Scale; - - // Get indexes for the subvector of the input vector. - SmallVector Idxs(DstNumElts); - for (unsigned i = 0; i != DstNumElts; ++i) { - Idxs[i] = i + (Imm * DstNumElts); - } - Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - - // If the intrinsic has a mask operand, handle that. - if (CI->arg_size() == 4) - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (!IsX86 && Name == "stackprotectorcheck") { - Rep = nullptr; - } else if (IsX86 && (Name.starts_with("avx512.mask.perm.df.") || - Name.starts_with("avx512.mask.perm.di."))) { - Value *Op0 = CI->getArgOperand(0); - unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - auto *VecTy = cast(CI->getType()); - unsigned NumElts = VecTy->getNumElements(); - - SmallVector Idxs(NumElts); - for (unsigned i = 0; i != NumElts; ++i) - Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3); - - Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - - if (CI->arg_size() == 4) - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name.starts_with("avx.vperm2f128.") || - Name == "avx2.vperm2i128")) { - // The immediate permute control byte looks like this: - // [1:0] - select 128 bits from sources for low half of destination - // [2] - ignore - // [3] - zero low half of destination - // [5:4] - select 128 bits 
from sources for high half of destination - // [6] - ignore - // [7] - zero high half of destination - - uint8_t Imm = cast(CI->getArgOperand(2))->getZExtValue(); - - unsigned NumElts = cast(CI->getType())->getNumElements(); - unsigned HalfSize = NumElts / 2; - SmallVector ShuffleMask(NumElts); - - // Determine which operand(s) are actually in use for this instruction. - Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0); - Value *V1 = (Imm & 0x20) ? CI->getArgOperand(1) : CI->getArgOperand(0); - - // If needed, replace operands based on zero mask. - V0 = (Imm & 0x08) ? ConstantAggregateZero::get(CI->getType()) : V0; - V1 = (Imm & 0x80) ? ConstantAggregateZero::get(CI->getType()) : V1; - - // Permute low half of result. - unsigned StartIndex = (Imm & 0x01) ? HalfSize : 0; - for (unsigned i = 0; i < HalfSize; ++i) - ShuffleMask[i] = StartIndex + i; - - // Permute high half of result. - StartIndex = (Imm & 0x10) ? HalfSize : 0; - for (unsigned i = 0; i < HalfSize; ++i) - ShuffleMask[i + HalfSize] = NumElts + StartIndex + i; - - Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask); - - } else if (IsX86 && (Name.starts_with("avx.vpermil.") || - Name == "sse2.pshuf.d" || - Name.starts_with("avx512.mask.vpermil.p") || - Name.starts_with("avx512.mask.pshuf.d."))) { - Value *Op0 = CI->getArgOperand(0); - unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - auto *VecTy = cast(CI->getType()); - unsigned NumElts = VecTy->getNumElements(); - // Calculate the size of each index in the immediate. - unsigned IdxSize = 64 / VecTy->getScalarSizeInBits(); - unsigned IdxMask = ((1 << IdxSize) - 1); - - SmallVector Idxs(NumElts); - // Lookup the bits for this element, wrapping around the immediate every - // 8-bits. Elements are grouped into sets of 2 or 4 elements so we need - // to offset by the first index of each group. 
- for (unsigned i = 0; i != NumElts; ++i) - Idxs[i] = ((Imm >> ((i * IdxSize) % 8)) & IdxMask) | (i & ~IdxMask); - - Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - - if (CI->arg_size() == 4) - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name == "sse2.pshufl.w" || - Name.starts_with("avx512.mask.pshufl.w."))) { - Value *Op0 = CI->getArgOperand(0); - unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); - - SmallVector Idxs(NumElts); - for (unsigned l = 0; l != NumElts; l += 8) { - for (unsigned i = 0; i != 4; ++i) - Idxs[i + l] = ((Imm >> (2 * i)) & 0x3) + l; - for (unsigned i = 4; i != 8; ++i) - Idxs[i + l] = i + l; - } - - Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - - if (CI->arg_size() == 4) - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name == "sse2.pshufh.w" || - Name.starts_with("avx512.mask.pshufh.w."))) { - Value *Op0 = CI->getArgOperand(0); - unsigned Imm = cast(CI->getArgOperand(1))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); - - SmallVector Idxs(NumElts); - for (unsigned l = 0; l != NumElts; l += 8) { - for (unsigned i = 0; i != 4; ++i) - Idxs[i + l] = i + l; - for (unsigned i = 0; i != 4; ++i) - Idxs[i + l + 4] = ((Imm >> (2 * i)) & 0x3) + 4 + l; - } - - Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - - if (CI->arg_size() == 4) - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.shuf.p")) { - Value *Op0 = CI->getArgOperand(0); - Value *Op1 = CI->getArgOperand(1); - unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); - unsigned NumElts = cast(CI->getType())->getNumElements(); - - unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); - unsigned HalfLaneElts = NumLaneElts / 2; - - SmallVector Idxs(NumElts); - for (unsigned i 
= 0; i != NumElts; ++i) { - // Base index is the starting element of the lane. - Idxs[i] = i - (i % NumLaneElts); - // If we are half way through the lane switch to the other source. - if ((i % NumLaneElts) >= HalfLaneElts) - Idxs[i] += NumElts; - // Now select the specific element. By adding HalfLaneElts bits from - // the immediate. Wrapping around the immediate every 8-bits. - Idxs[i] += (Imm >> ((i * HalfLaneElts) % 8)) & ((1 << HalfLaneElts) - 1); - } - - Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - - Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, - CI->getArgOperand(3)); - } else if (IsX86 && (Name.starts_with("avx512.mask.movddup") || - Name.starts_with("avx512.mask.movshdup") || - Name.starts_with("avx512.mask.movsldup"))) { - Value *Op0 = CI->getArgOperand(0); - unsigned NumElts = cast(CI->getType())->getNumElements(); - unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); - - unsigned Offset = 0; - if (Name.starts_with("avx512.mask.movshdup.")) - Offset = 1; - - SmallVector Idxs(NumElts); - for (unsigned l = 0; l != NumElts; l += NumLaneElts) - for (unsigned i = 0; i != NumLaneElts; i += 2) { - Idxs[i + l + 0] = i + l + Offset; - Idxs[i + l + 1] = i + l + Offset; - } - - Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && (Name.starts_with("avx512.mask.punpckl") || - Name.starts_with("avx512.mask.unpckl."))) { - Value *Op0 = CI->getArgOperand(0); - Value *Op1 = CI->getArgOperand(1); - int NumElts = cast(CI->getType())->getNumElements(); - int NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); - - SmallVector Idxs(NumElts); - for (int l = 0; l != NumElts; l += NumLaneElts) - for (int i = 0; i != NumLaneElts; ++i) - Idxs[i + l] = l + (i / 2) + NumElts * (i % 2); - - Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 
&& (Name.starts_with("avx512.mask.punpckh") || - Name.starts_with("avx512.mask.unpckh."))) { - Value *Op0 = CI->getArgOperand(0); - Value *Op1 = CI->getArgOperand(1); - int NumElts = cast(CI->getType())->getNumElements(); - int NumLaneElts = 128/CI->getType()->getScalarSizeInBits(); - - SmallVector Idxs(NumElts); - for (int l = 0; l != NumElts; l += NumLaneElts) - for (int i = 0; i != NumLaneElts; ++i) - Idxs[i + l] = (NumLaneElts / 2) + l + (i / 2) + NumElts * (i % 2); - - Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name.starts_with("avx512.mask.and.") || - Name.starts_with("avx512.mask.pand."))) { - VectorType *FTy = cast(CI->getType()); - VectorType *ITy = VectorType::getInteger(FTy); - Rep = Builder.CreateAnd(Builder.CreateBitCast(CI->getArgOperand(0), ITy), - Builder.CreateBitCast(CI->getArgOperand(1), ITy)); - Rep = Builder.CreateBitCast(Rep, FTy); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name.starts_with("avx512.mask.andn.") || - Name.starts_with("avx512.mask.pandn."))) { - VectorType *FTy = cast(CI->getType()); - VectorType *ITy = VectorType::getInteger(FTy); - Rep = Builder.CreateNot(Builder.CreateBitCast(CI->getArgOperand(0), ITy)); - Rep = Builder.CreateAnd(Rep, - Builder.CreateBitCast(CI->getArgOperand(1), ITy)); - Rep = Builder.CreateBitCast(Rep, FTy); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name.starts_with("avx512.mask.or.") || - Name.starts_with("avx512.mask.por."))) { - VectorType *FTy = cast(CI->getType()); - VectorType *ITy = VectorType::getInteger(FTy); - Rep = Builder.CreateOr(Builder.CreateBitCast(CI->getArgOperand(0), ITy), - Builder.CreateBitCast(CI->getArgOperand(1), ITy)); - Rep = Builder.CreateBitCast(Rep, FTy); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } 
else if (IsX86 && (Name.starts_with("avx512.mask.xor.") || - Name.starts_with("avx512.mask.pxor."))) { - VectorType *FTy = cast(CI->getType()); - VectorType *ITy = VectorType::getInteger(FTy); - Rep = Builder.CreateXor(Builder.CreateBitCast(CI->getArgOperand(0), ITy), - Builder.CreateBitCast(CI->getArgOperand(1), ITy)); - Rep = Builder.CreateBitCast(Rep, FTy); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.padd.")) { - Rep = Builder.CreateAdd(CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.psub.")) { - Rep = Builder.CreateSub(CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.pmull.")) { - Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.add.p")) { - if (Name.ends_with(".512")) { - Intrinsic::ID IID; - if (Name[17] == 's') - IID = Intrinsic::x86_avx512_add_ps_512; - else - IID = Intrinsic::x86_avx512_add_pd_512; - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4) }); - } else { - Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); - } - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.div.p")) { - if (Name.ends_with(".512")) { - Intrinsic::ID IID; - if (Name[17] == 's') - IID = Intrinsic::x86_avx512_div_ps_512; - else - IID = Intrinsic::x86_avx512_div_pd_512; - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getArgOperand(0), 
CI->getArgOperand(1), - CI->getArgOperand(4) }); - } else { - Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); - } - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.mul.p")) { - if (Name.ends_with(".512")) { - Intrinsic::ID IID; - if (Name[17] == 's') - IID = Intrinsic::x86_avx512_mul_ps_512; - else - IID = Intrinsic::x86_avx512_mul_pd_512; - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4) }); - } else { - Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); - } - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.sub.p")) { - if (Name.ends_with(".512")) { - Intrinsic::ID IID; - if (Name[17] == 's') - IID = Intrinsic::x86_avx512_sub_ps_512; - else - IID = Intrinsic::x86_avx512_sub_pd_512; - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4) }); - } else { - Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); - } - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } else if (IsX86 && (Name.starts_with("avx512.mask.max.p") || - Name.starts_with("avx512.mask.min.p")) && - Name.drop_front(18) == ".512") { - bool IsDouble = Name[17] == 'd'; - bool IsMin = Name[13] == 'i'; - static const Intrinsic::ID MinMaxTbl[2][2] = { - { Intrinsic::x86_avx512_max_ps_512, Intrinsic::x86_avx512_max_pd_512 }, - { Intrinsic::x86_avx512_min_ps_512, Intrinsic::x86_avx512_min_pd_512 } - }; - Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4) }); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, - 
CI->getArgOperand(2)); - } else if (IsX86 && Name.starts_with("avx512.mask.lzcnt.")) { - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), - Intrinsic::ctlz, - CI->getType()), - { CI->getArgOperand(0), Builder.getInt1(false) }); - Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, - CI->getArgOperand(1)); - } else if (IsX86 && Name.starts_with("avx512.mask.psll")) { - bool IsImmediate = Name[16] == 'i' || - (Name.size() > 18 && Name[18] == 'i'); - bool IsVariable = Name[16] == 'v'; - char Size = Name[16] == '.' ? Name[17] : - Name[17] == '.' ? Name[18] : - Name[18] == '.' ? Name[19] : - Name[20]; - - Intrinsic::ID IID; - if (IsVariable && Name[17] != '.') { - if (Size == 'd' && Name[17] == '2') // avx512.mask.psllv2.di - IID = Intrinsic::x86_avx2_psllv_q; - else if (Size == 'd' && Name[17] == '4') // avx512.mask.psllv4.di - IID = Intrinsic::x86_avx2_psllv_q_256; - else if (Size == 's' && Name[17] == '4') // avx512.mask.psllv4.si - IID = Intrinsic::x86_avx2_psllv_d; - else if (Size == 's' && Name[17] == '8') // avx512.mask.psllv8.si - IID = Intrinsic::x86_avx2_psllv_d_256; - else if (Size == 'h' && Name[17] == '8') // avx512.mask.psllv8.hi - IID = Intrinsic::x86_avx512_psllv_w_128; - else if (Size == 'h' && Name[17] == '1') // avx512.mask.psllv16.hi - IID = Intrinsic::x86_avx512_psllv_w_256; - else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psllv32hi - IID = Intrinsic::x86_avx512_psllv_w_512; - else - llvm_unreachable("Unexpected size"); - } else if (Name.ends_with(".128")) { - if (Size == 'd') // avx512.mask.psll.d.128, avx512.mask.psll.di.128 - IID = IsImmediate ? Intrinsic::x86_sse2_pslli_d - : Intrinsic::x86_sse2_psll_d; - else if (Size == 'q') // avx512.mask.psll.q.128, avx512.mask.psll.qi.128 - IID = IsImmediate ? Intrinsic::x86_sse2_pslli_q - : Intrinsic::x86_sse2_psll_q; - else if (Size == 'w') // avx512.mask.psll.w.128, avx512.mask.psll.wi.128 - IID = IsImmediate ? 
Intrinsic::x86_sse2_pslli_w - : Intrinsic::x86_sse2_psll_w; - else - llvm_unreachable("Unexpected size"); - } else if (Name.ends_with(".256")) { - if (Size == 'd') // avx512.mask.psll.d.256, avx512.mask.psll.di.256 - IID = IsImmediate ? Intrinsic::x86_avx2_pslli_d - : Intrinsic::x86_avx2_psll_d; - else if (Size == 'q') // avx512.mask.psll.q.256, avx512.mask.psll.qi.256 - IID = IsImmediate ? Intrinsic::x86_avx2_pslli_q - : Intrinsic::x86_avx2_psll_q; - else if (Size == 'w') // avx512.mask.psll.w.256, avx512.mask.psll.wi.256 - IID = IsImmediate ? Intrinsic::x86_avx2_pslli_w - : Intrinsic::x86_avx2_psll_w; - else - llvm_unreachable("Unexpected size"); - } else { - if (Size == 'd') // psll.di.512, pslli.d, psll.d, psllv.d.512 - IID = IsImmediate ? Intrinsic::x86_avx512_pslli_d_512 : - IsVariable ? Intrinsic::x86_avx512_psllv_d_512 : - Intrinsic::x86_avx512_psll_d_512; - else if (Size == 'q') // psll.qi.512, pslli.q, psll.q, psllv.q.512 - IID = IsImmediate ? Intrinsic::x86_avx512_pslli_q_512 : - IsVariable ? Intrinsic::x86_avx512_psllv_q_512 : - Intrinsic::x86_avx512_psll_q_512; - else if (Size == 'w') // psll.wi.512, pslli.w, psll.w - IID = IsImmediate ? Intrinsic::x86_avx512_pslli_w_512 - : Intrinsic::x86_avx512_psll_w_512; - else - llvm_unreachable("Unexpected size"); - } - - Rep = upgradeX86MaskedShift(Builder, *CI, IID); - } else if (IsX86 && Name.starts_with("avx512.mask.psrl")) { - bool IsImmediate = Name[16] == 'i' || - (Name.size() > 18 && Name[18] == 'i'); - bool IsVariable = Name[16] == 'v'; - char Size = Name[16] == '.' ? Name[17] : - Name[17] == '.' ? Name[18] : - Name[18] == '.' ? 
Name[19] : - Name[20]; - - Intrinsic::ID IID; - if (IsVariable && Name[17] != '.') { - if (Size == 'd' && Name[17] == '2') // avx512.mask.psrlv2.di - IID = Intrinsic::x86_avx2_psrlv_q; - else if (Size == 'd' && Name[17] == '4') // avx512.mask.psrlv4.di - IID = Intrinsic::x86_avx2_psrlv_q_256; - else if (Size == 's' && Name[17] == '4') // avx512.mask.psrlv4.si - IID = Intrinsic::x86_avx2_psrlv_d; - else if (Size == 's' && Name[17] == '8') // avx512.mask.psrlv8.si - IID = Intrinsic::x86_avx2_psrlv_d_256; - else if (Size == 'h' && Name[17] == '8') // avx512.mask.psrlv8.hi - IID = Intrinsic::x86_avx512_psrlv_w_128; - else if (Size == 'h' && Name[17] == '1') // avx512.mask.psrlv16.hi - IID = Intrinsic::x86_avx512_psrlv_w_256; - else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psrlv32hi - IID = Intrinsic::x86_avx512_psrlv_w_512; - else - llvm_unreachable("Unexpected size"); - } else if (Name.ends_with(".128")) { - if (Size == 'd') // avx512.mask.psrl.d.128, avx512.mask.psrl.di.128 - IID = IsImmediate ? Intrinsic::x86_sse2_psrli_d - : Intrinsic::x86_sse2_psrl_d; - else if (Size == 'q') // avx512.mask.psrl.q.128, avx512.mask.psrl.qi.128 - IID = IsImmediate ? Intrinsic::x86_sse2_psrli_q - : Intrinsic::x86_sse2_psrl_q; - else if (Size == 'w') // avx512.mask.psrl.w.128, avx512.mask.psrl.wi.128 - IID = IsImmediate ? Intrinsic::x86_sse2_psrli_w - : Intrinsic::x86_sse2_psrl_w; - else - llvm_unreachable("Unexpected size"); - } else if (Name.ends_with(".256")) { - if (Size == 'd') // avx512.mask.psrl.d.256, avx512.mask.psrl.di.256 - IID = IsImmediate ? Intrinsic::x86_avx2_psrli_d - : Intrinsic::x86_avx2_psrl_d; - else if (Size == 'q') // avx512.mask.psrl.q.256, avx512.mask.psrl.qi.256 - IID = IsImmediate ? Intrinsic::x86_avx2_psrli_q - : Intrinsic::x86_avx2_psrl_q; - else if (Size == 'w') // avx512.mask.psrl.w.256, avx512.mask.psrl.wi.256 - IID = IsImmediate ? 
Intrinsic::x86_avx2_psrli_w - : Intrinsic::x86_avx2_psrl_w; - else - llvm_unreachable("Unexpected size"); - } else { - if (Size == 'd') // psrl.di.512, psrli.d, psrl.d, psrl.d.512 - IID = IsImmediate ? Intrinsic::x86_avx512_psrli_d_512 : - IsVariable ? Intrinsic::x86_avx512_psrlv_d_512 : - Intrinsic::x86_avx512_psrl_d_512; - else if (Size == 'q') // psrl.qi.512, psrli.q, psrl.q, psrl.q.512 - IID = IsImmediate ? Intrinsic::x86_avx512_psrli_q_512 : - IsVariable ? Intrinsic::x86_avx512_psrlv_q_512 : - Intrinsic::x86_avx512_psrl_q_512; - else if (Size == 'w') // psrl.wi.512, psrli.w, psrl.w) - IID = IsImmediate ? Intrinsic::x86_avx512_psrli_w_512 - : Intrinsic::x86_avx512_psrl_w_512; - else - llvm_unreachable("Unexpected size"); - } - - Rep = upgradeX86MaskedShift(Builder, *CI, IID); - } else if (IsX86 && Name.starts_with("avx512.mask.psra")) { - bool IsImmediate = Name[16] == 'i' || - (Name.size() > 18 && Name[18] == 'i'); - bool IsVariable = Name[16] == 'v'; - char Size = Name[16] == '.' ? Name[17] : - Name[17] == '.' ? Name[18] : - Name[18] == '.' ? Name[19] : - Name[20]; - - Intrinsic::ID IID; - if (IsVariable && Name[17] != '.') { - if (Size == 's' && Name[17] == '4') // avx512.mask.psrav4.si - IID = Intrinsic::x86_avx2_psrav_d; - else if (Size == 's' && Name[17] == '8') // avx512.mask.psrav8.si - IID = Intrinsic::x86_avx2_psrav_d_256; - else if (Size == 'h' && Name[17] == '8') // avx512.mask.psrav8.hi - IID = Intrinsic::x86_avx512_psrav_w_128; - else if (Size == 'h' && Name[17] == '1') // avx512.mask.psrav16.hi - IID = Intrinsic::x86_avx512_psrav_w_256; - else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psrav32hi - IID = Intrinsic::x86_avx512_psrav_w_512; - else - llvm_unreachable("Unexpected size"); - } else if (Name.ends_with(".128")) { - if (Size == 'd') // avx512.mask.psra.d.128, avx512.mask.psra.di.128 - IID = IsImmediate ? 
Intrinsic::x86_sse2_psrai_d - : Intrinsic::x86_sse2_psra_d; - else if (Size == 'q') // avx512.mask.psra.q.128, avx512.mask.psra.qi.128 - IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_128 : - IsVariable ? Intrinsic::x86_avx512_psrav_q_128 : - Intrinsic::x86_avx512_psra_q_128; - else if (Size == 'w') // avx512.mask.psra.w.128, avx512.mask.psra.wi.128 - IID = IsImmediate ? Intrinsic::x86_sse2_psrai_w - : Intrinsic::x86_sse2_psra_w; - else - llvm_unreachable("Unexpected size"); - } else if (Name.ends_with(".256")) { - if (Size == 'd') // avx512.mask.psra.d.256, avx512.mask.psra.di.256 - IID = IsImmediate ? Intrinsic::x86_avx2_psrai_d - : Intrinsic::x86_avx2_psra_d; - else if (Size == 'q') // avx512.mask.psra.q.256, avx512.mask.psra.qi.256 - IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_256 : - IsVariable ? Intrinsic::x86_avx512_psrav_q_256 : - Intrinsic::x86_avx512_psra_q_256; - else if (Size == 'w') // avx512.mask.psra.w.256, avx512.mask.psra.wi.256 - IID = IsImmediate ? Intrinsic::x86_avx2_psrai_w - : Intrinsic::x86_avx2_psra_w; - else - llvm_unreachable("Unexpected size"); - } else { - if (Size == 'd') // psra.di.512, psrai.d, psra.d, psrav.d.512 - IID = IsImmediate ? Intrinsic::x86_avx512_psrai_d_512 : - IsVariable ? Intrinsic::x86_avx512_psrav_d_512 : - Intrinsic::x86_avx512_psra_d_512; - else if (Size == 'q') // psra.qi.512, psrai.q, psra.q - IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_512 : - IsVariable ? Intrinsic::x86_avx512_psrav_q_512 : - Intrinsic::x86_avx512_psra_q_512; - else if (Size == 'w') // psra.wi.512, psrai.w, psra.w - IID = IsImmediate ? 
Intrinsic::x86_avx512_psrai_w_512 - : Intrinsic::x86_avx512_psra_w_512; - else - llvm_unreachable("Unexpected size"); - } - - Rep = upgradeX86MaskedShift(Builder, *CI, IID); - } else if (IsX86 && Name.starts_with("avx512.mask.move.s")) { - Rep = upgradeMaskedMove(Builder, *CI); - } else if (IsX86 && Name.starts_with("avx512.cvtmask2")) { - Rep = upgradeMaskToInt(Builder, *CI); - } else if (IsX86 && Name.ends_with(".movntdqa")) { - MDNode *Node = MDNode::get( - C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); - - Value *Ptr = CI->getArgOperand(0); - - // Convert the type of the pointer to a pointer to the stored type. - Value *BC = Builder.CreateBitCast( - Ptr, PointerType::getUnqual(CI->getType()), "cast"); - LoadInst *LI = Builder.CreateAlignedLoad( - CI->getType(), BC, - Align(CI->getType()->getPrimitiveSizeInBits().getFixedValue() / 8)); - LI->setMetadata(LLVMContext::MD_nontemporal, Node); - Rep = LI; - } else if (IsX86 && (Name.starts_with("fma.vfmadd.") || - Name.starts_with("fma.vfmsub.") || - Name.starts_with("fma.vfnmadd.") || - Name.starts_with("fma.vfnmsub."))) { - bool NegMul = Name[6] == 'n'; - bool NegAcc = NegMul ? Name[8] == 's' : Name[7] == 's'; - bool IsScalar = NegMul ? 
Name[12] == 's' : Name[11] == 's'; - - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - - if (IsScalar) { - Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0); - Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); - Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - } - - if (NegMul && !IsScalar) - Ops[0] = Builder.CreateFNeg(Ops[0]); - if (NegMul && IsScalar) - Ops[1] = Builder.CreateFNeg(Ops[1]); - if (NegAcc) - Ops[2] = Builder.CreateFNeg(Ops[2]); - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); - - if (IsScalar) - Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, - (uint64_t)0); - } else if (IsX86 && Name.starts_with("fma4.vfmadd.s")) { - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - - Ops[0] = Builder.CreateExtractElement(Ops[0], (uint64_t)0); - Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); - Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); - - Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), - Rep, (uint64_t)0); - } else if (IsX86 && (Name.starts_with("avx512.mask.vfmadd.s") || - Name.starts_with("avx512.maskz.vfmadd.s") || - Name.starts_with("avx512.mask3.vfmadd.s") || - Name.starts_with("avx512.mask3.vfmsub.s") || - Name.starts_with("avx512.mask3.vfnmsub.s"))) { - bool IsMask3 = Name[11] == '3'; - bool IsMaskZ = Name[11] == 'z'; - // Drop the "avx512.mask." to make it easier. - Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12); - bool NegMul = Name[2] == 'n'; - bool NegAcc = NegMul ? 
Name[4] == 's' : Name[3] == 's'; - - Value *A = CI->getArgOperand(0); - Value *B = CI->getArgOperand(1); - Value *C = CI->getArgOperand(2); - - if (NegMul && (IsMask3 || IsMaskZ)) - A = Builder.CreateFNeg(A); - if (NegMul && !(IsMask3 || IsMaskZ)) - B = Builder.CreateFNeg(B); - if (NegAcc) - C = Builder.CreateFNeg(C); - - A = Builder.CreateExtractElement(A, (uint64_t)0); - B = Builder.CreateExtractElement(B, (uint64_t)0); - C = Builder.CreateExtractElement(C, (uint64_t)0); - - if (!isa(CI->getArgOperand(4)) || - cast(CI->getArgOperand(4))->getZExtValue() != 4) { - Value *Ops[] = { A, B, C, CI->getArgOperand(4) }; - - Intrinsic::ID IID; - if (Name.back() == 'd') - IID = Intrinsic::x86_avx512_vfmadd_f64; - else - IID = Intrinsic::x86_avx512_vfmadd_f32; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID); - Rep = Builder.CreateCall(FMA, Ops); - } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - A->getType()); - Rep = Builder.CreateCall(FMA, { A, B, C }); - } - - Value *PassThru = IsMaskZ ? Constant::getNullValue(Rep->getType()) : - IsMask3 ? C : A; - - // For Mask3 with NegAcc, we need to create a new extractelement that - // avoids the negation above. - if (NegAcc && IsMask3) - PassThru = Builder.CreateExtractElement(CI->getArgOperand(2), - (uint64_t)0); - - Rep = emitX86ScalarSelect(Builder, CI->getArgOperand(3), Rep, PassThru); - Rep = Builder.CreateInsertElement(CI->getArgOperand(IsMask3 ? 2 : 0), - Rep, (uint64_t)0); - } else if (IsX86 && (Name.starts_with("avx512.mask.vfmadd.p") || - Name.starts_with("avx512.mask.vfnmadd.p") || - Name.starts_with("avx512.mask.vfnmsub.p") || - Name.starts_with("avx512.mask3.vfmadd.p") || - Name.starts_with("avx512.mask3.vfmsub.p") || - Name.starts_with("avx512.mask3.vfnmsub.p") || - Name.starts_with("avx512.maskz.vfmadd.p"))) { - bool IsMask3 = Name[11] == '3'; - bool IsMaskZ = Name[11] == 'z'; - // Drop the "avx512.mask." to make it easier. 
- Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12); - bool NegMul = Name[2] == 'n'; - bool NegAcc = NegMul ? Name[4] == 's' : Name[3] == 's'; - - Value *A = CI->getArgOperand(0); - Value *B = CI->getArgOperand(1); - Value *C = CI->getArgOperand(2); - - if (NegMul && (IsMask3 || IsMaskZ)) - A = Builder.CreateFNeg(A); - if (NegMul && !(IsMask3 || IsMaskZ)) - B = Builder.CreateFNeg(B); - if (NegAcc) - C = Builder.CreateFNeg(C); - - if (CI->arg_size() == 5 && - (!isa(CI->getArgOperand(4)) || - cast(CI->getArgOperand(4))->getZExtValue() != 4)) { - Intrinsic::ID IID; - // Check the character before ".512" in string. - if (Name[Name.size()-5] == 's') - IID = Intrinsic::x86_avx512_vfmadd_ps_512; - else - IID = Intrinsic::x86_avx512_vfmadd_pd_512; - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - { A, B, C, CI->getArgOperand(4) }); - } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - A->getType()); - Rep = Builder.CreateCall(FMA, { A, B, C }); - } - - Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) : - IsMask3 ? 
CI->getArgOperand(2) : - CI->getArgOperand(0); - - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && Name.starts_with("fma.vfmsubadd.p")) { - unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); - unsigned EltWidth = CI->getType()->getScalarSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && EltWidth == 32) - IID = Intrinsic::x86_fma_vfmaddsub_ps; - else if (VecWidth == 256 && EltWidth == 32) - IID = Intrinsic::x86_fma_vfmaddsub_ps_256; - else if (VecWidth == 128 && EltWidth == 64) - IID = Intrinsic::x86_fma_vfmaddsub_pd; - else if (VecWidth == 256 && EltWidth == 64) - IID = Intrinsic::x86_fma_vfmaddsub_pd_256; - else - llvm_unreachable("Unexpected intrinsic"); - - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Ops); - } else if (IsX86 && (Name.starts_with("avx512.mask.vfmaddsub.p") || - Name.starts_with("avx512.mask3.vfmaddsub.p") || - Name.starts_with("avx512.maskz.vfmaddsub.p") || - Name.starts_with("avx512.mask3.vfmsubadd.p"))) { - bool IsMask3 = Name[11] == '3'; - bool IsMaskZ = Name[11] == 'z'; - // Drop the "avx512.mask." to make it easier. - Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12); - bool IsSubAdd = Name[3] == 's'; - if (CI->arg_size() == 5) { - Intrinsic::ID IID; - // Check the character before ".512" in string. 
- if (Name[Name.size()-5] == 's') - IID = Intrinsic::x86_avx512_vfmaddsub_ps_512; - else - IID = Intrinsic::x86_avx512_vfmaddsub_pd_512; - - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(4) }; - if (IsSubAdd) - Ops[2] = Builder.CreateFNeg(Ops[2]); - - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Ops); - } else { - int NumElts = cast(CI->getType())->getNumElements(); - - Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()); - Value *Odd = Builder.CreateCall(FMA, Ops); - Ops[2] = Builder.CreateFNeg(Ops[2]); - Value *Even = Builder.CreateCall(FMA, Ops); - - if (IsSubAdd) - std::swap(Even, Odd); - - SmallVector Idxs(NumElts); - for (int i = 0; i != NumElts; ++i) - Idxs[i] = i + (i % 2) * NumElts; - - Rep = Builder.CreateShuffleVector(Even, Odd, Idxs); - } - - Value *PassThru = IsMaskZ ? llvm::Constant::getNullValue(CI->getType()) : - IsMask3 ? 
CI->getArgOperand(2) : - CI->getArgOperand(0); - - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name.starts_with("avx512.mask.pternlog.") || - Name.starts_with("avx512.maskz.pternlog."))) { - bool ZeroMask = Name[11] == 'z'; - unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); - unsigned EltWidth = CI->getType()->getScalarSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && EltWidth == 32) - IID = Intrinsic::x86_avx512_pternlog_d_128; - else if (VecWidth == 256 && EltWidth == 32) - IID = Intrinsic::x86_avx512_pternlog_d_256; - else if (VecWidth == 512 && EltWidth == 32) - IID = Intrinsic::x86_avx512_pternlog_d_512; - else if (VecWidth == 128 && EltWidth == 64) - IID = Intrinsic::x86_avx512_pternlog_q_128; - else if (VecWidth == 256 && EltWidth == 64) - IID = Intrinsic::x86_avx512_pternlog_q_256; - else if (VecWidth == 512 && EltWidth == 64) - IID = Intrinsic::x86_avx512_pternlog_q_512; - else - llvm_unreachable("Unexpected intrinsic"); - - Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3) }; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); - Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) - : CI->getArgOperand(0); - Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); - } else if (IsX86 && (Name.starts_with("avx512.mask.vpmadd52") || - Name.starts_with("avx512.maskz.vpmadd52"))) { - bool ZeroMask = Name[11] == 'z'; - bool High = Name[20] == 'h' || Name[21] == 'h'; - unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && !High) - IID = Intrinsic::x86_avx512_vpmadd52l_uq_128; - else if (VecWidth == 256 && !High) - IID = Intrinsic::x86_avx512_vpmadd52l_uq_256; - else if (VecWidth == 512 && !High) - IID = Intrinsic::x86_avx512_vpmadd52l_uq_512; - else if (VecWidth == 128 && High) - IID = Intrinsic::x86_avx512_vpmadd52h_uq_128; - else if (VecWidth == 256 && High) - IID = Intrinsic::x86_avx512_vpmadd52h_uq_256; - else if (VecWidth == 512 && High) - IID = Intrinsic::x86_avx512_vpmadd52h_uq_512; - else - llvm_unreachable("Unexpected intrinsic"); - - Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1), - CI->getArgOperand(2) }; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); - Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) - : CI->getArgOperand(0); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name.starts_with("avx512.mask.vpermi2var.") || - Name.starts_with("avx512.mask.vpermt2var.") || - Name.starts_with("avx512.maskz.vpermt2var."))) { - bool ZeroMask = Name[11] == 'z'; - bool IndexForm = Name[17] == 'i'; - Rep = upgradeX86VPERMT2Intrinsics(Builder, *CI, ZeroMask, IndexForm); - } else if (IsX86 && (Name.starts_with("avx512.mask.vpdpbusd.") || - Name.starts_with("avx512.maskz.vpdpbusd.") || - Name.starts_with("avx512.mask.vpdpbusds.") || - Name.starts_with("avx512.maskz.vpdpbusds."))) { - bool ZeroMask = Name[11] == 'z'; - bool IsSaturating = Name[ZeroMask ? 
21 : 20] == 's'; - unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && !IsSaturating) - IID = Intrinsic::x86_avx512_vpdpbusd_128; - else if (VecWidth == 256 && !IsSaturating) - IID = Intrinsic::x86_avx512_vpdpbusd_256; - else if (VecWidth == 512 && !IsSaturating) - IID = Intrinsic::x86_avx512_vpdpbusd_512; - else if (VecWidth == 128 && IsSaturating) - IID = Intrinsic::x86_avx512_vpdpbusds_128; - else if (VecWidth == 256 && IsSaturating) - IID = Intrinsic::x86_avx512_vpdpbusds_256; - else if (VecWidth == 512 && IsSaturating) - IID = Intrinsic::x86_avx512_vpdpbusds_512; - else - llvm_unreachable("Unexpected intrinsic"); - - Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); - Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) - : CI->getArgOperand(0); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name.starts_with("avx512.mask.vpdpwssd.") || - Name.starts_with("avx512.maskz.vpdpwssd.") || - Name.starts_with("avx512.mask.vpdpwssds.") || - Name.starts_with("avx512.maskz.vpdpwssds."))) { - bool ZeroMask = Name[11] == 'z'; - bool IsSaturating = Name[ZeroMask ? 
21 : 20] == 's'; - unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); - Intrinsic::ID IID; - if (VecWidth == 128 && !IsSaturating) - IID = Intrinsic::x86_avx512_vpdpwssd_128; - else if (VecWidth == 256 && !IsSaturating) - IID = Intrinsic::x86_avx512_vpdpwssd_256; - else if (VecWidth == 512 && !IsSaturating) - IID = Intrinsic::x86_avx512_vpdpwssd_512; - else if (VecWidth == 128 && IsSaturating) - IID = Intrinsic::x86_avx512_vpdpwssds_128; - else if (VecWidth == 256 && IsSaturating) - IID = Intrinsic::x86_avx512_vpdpwssds_256; - else if (VecWidth == 512 && IsSaturating) - IID = Intrinsic::x86_avx512_vpdpwssds_512; - else - llvm_unreachable("Unexpected intrinsic"); - - Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2) }; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); - Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) - : CI->getArgOperand(0); - Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); - } else if (IsX86 && (Name == "addcarryx.u32" || Name == "addcarryx.u64" || - Name == "addcarry.u32" || Name == "addcarry.u64" || - Name == "subborrow.u32" || Name == "subborrow.u64")) { - Intrinsic::ID IID; - if (Name[0] == 'a' && Name.back() == '2') - IID = Intrinsic::x86_addcarry_32; - else if (Name[0] == 'a' && Name.back() == '4') - IID = Intrinsic::x86_addcarry_64; - else if (Name[0] == 's' && Name.back() == '2') - IID = Intrinsic::x86_subborrow_32; - else if (Name[0] == 's' && Name.back() == '4') - IID = Intrinsic::x86_subborrow_64; - else - llvm_unreachable("Unexpected intrinsic"); - - // Make a call with 3 operands. - Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2)}; - Value *NewCall = Builder.CreateCall( - Intrinsic::getDeclaration(CI->getModule(), IID), - Args); - - // Extract the second result and store it. - Value *Data = Builder.CreateExtractValue(NewCall, 1); - // Cast the pointer to the right type. 
- Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(3), - llvm::PointerType::getUnqual(Data->getType())); - Builder.CreateAlignedStore(Data, Ptr, Align(1)); - // Replace the original call result with the first result of the new call. - Value *CF = Builder.CreateExtractValue(NewCall, 0); - CI->replaceAllUsesWith(CF); + if (!IsX86 && Name == "stackprotectorcheck") { Rep = nullptr; - } else if (IsX86 && Name.starts_with("avx512.mask.") && - upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { - // Rep will be updated by the call in the condition. } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) { Value *Arg = CI->getArgOperand(0); Value *Neg = Builder.CreateNeg(Arg, "neg"); @@ -4332,6 +4266,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateBitCast(Rep, F->getReturnType()); } } + } else if (IsX86) { + Rep = upgradeX86IntrinsicCall(Name, CI, F, Builder); } else if (IsARM) { Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder); } else if (IsAMDGCN) { From 2da0055924161d63f4ecb88371ba6ccb4552f13d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 6 Jul 2024 15:35:13 +0200 Subject: [PATCH 66/67] [libc++][NFC] Merge identical char_traits functions into a base class (#97700) --- libcxx/include/__string/char_traits.h | 266 +++++++------------------- 1 file changed, 64 insertions(+), 202 deletions(-) diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h index 40821f86465b89..2660ac2ede2d5c 100644 --- a/libcxx/include/__string/char_traits.h +++ b/libcxx/include/__string/char_traits.h @@ -170,31 +170,72 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return int_type(EOF); } }; -// char_traits - -#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS -template <> -struct _LIBCPP_TEMPLATE_VIS char_traits { - using char_type = wchar_t; - using int_type = wint_t; +template +struct __char_traits_base { + using char_type = 
_CharT; + using int_type = _IntT; using off_type = streamoff; - using pos_type = streampos; using state_type = mbstate_t; -# if _LIBCPP_STD_VER >= 20 +#if _LIBCPP_STD_VER >= 20 using comparison_category = strong_ordering; -# endif +#endif - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void - assign(char_type& __c1, const char_type& __c2) _NOEXCEPT { - __c1 = __c2; + // There are different aliases for the different char types, but they are all aliases to this type + using pos_type = fpos; + + _LIBCPP_HIDE_FROM_ABI static inline _LIBCPP_CONSTEXPR_SINCE_CXX17 void + assign(char_type& __lhs, const char_type& __rhs) _NOEXCEPT { + __lhs = __rhs; } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 == __c2; + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR bool eq(char_type __lhs, char_type __rhs) _NOEXCEPT { + return __lhs == __rhs; } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool lt(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 < __c2; + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR bool lt(char_type __lhs, char_type __rhs) _NOEXCEPT { + return __lhs < __rhs; } + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* + move(char_type* __dest, const char_type* __src, size_t __n) _NOEXCEPT { + return std::__constexpr_memmove(__dest, __src, __element_count(__n)); + } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* + copy(char_type* __dest, const char_type* __src, size_t __n) _NOEXCEPT { + _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__dest, __dest + __n, __src), + "char_traits::copy: source and destination ranges overlap"); + return std::__constexpr_memmove(__dest, __src, __element_count(__n)); + } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* + assign(char_type* __str, size_t __n, char_type __fill_char) _NOEXCEPT { + std::fill_n(__str, __n, __fill_char); + return 
__str; + } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR char_type to_char_type(int_type __c) _NOEXCEPT { + return char_type(__c); + } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR int_type to_int_type(char_type __c) _NOEXCEPT { return int_type(__c); } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR bool eq_int_type(int_type __lhs, int_type __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return _EOFVal; } + + _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR int_type not_eof(int_type __c) _NOEXCEPT { + return eq_int_type(__c, eof()) ? static_cast(~eof()) : __c; + } +}; + +// char_traits + +#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS +template <> +struct _LIBCPP_TEMPLATE_VIS char_traits : __char_traits_base(WEOF)> { static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { if (__n == 0) @@ -212,63 +253,14 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { return nullptr; return std::__constexpr_wmemchr(__s, __a, __n); } - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { - return std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - } - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { - _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2), - "char_traits::copy: source and destination ranges overlap"); - std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - return __s1; - } - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - assign(char_type* __s, size_t __n, char_type __a) _NOEXCEPT { - std::fill_n(__s, __n, __a); - return __s; - } - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type not_eof(int_type __c) _NOEXCEPT 
{ - return eq_int_type(__c, eof()) ? ~eof() : __c; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR char_type to_char_type(int_type __c) _NOEXCEPT { - return char_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type to_int_type(char_type __c) _NOEXCEPT { - return int_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq_int_type(int_type __c1, int_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return int_type(WEOF); } }; #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS #ifndef _LIBCPP_HAS_NO_CHAR8_T template <> -struct _LIBCPP_TEMPLATE_VIS char_traits { - using char_type = char8_t; - using int_type = unsigned int; - using off_type = streamoff; - using pos_type = u8streampos; - using state_type = mbstate_t; -# if _LIBCPP_STD_VER >= 20 - using comparison_category = strong_ordering; -# endif - - static inline _LIBCPP_HIDE_FROM_ABI constexpr void assign(char_type& __c1, const char_type& __c2) noexcept { - __c1 = __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI constexpr bool eq(char_type __c1, char_type __c2) noexcept { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI constexpr bool lt(char_type __c1, char_type __c2) noexcept { return __c1 < __c2; } - +struct _LIBCPP_TEMPLATE_VIS char_traits + : __char_traits_base(EOF)> { static _LIBCPP_HIDE_FROM_ABI constexpr int compare(const char_type* __s1, const char_type* __s2, size_t __n) noexcept { return std::__constexpr_memcmp(__s1, __s2, __element_count(__n)); @@ -282,61 +274,13 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { find(const char_type* __s, size_t __n, const char_type& __a) noexcept { return std::__constexpr_memchr(__s, __a, __n); } - - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - move(char_type* __s1, const char_type* __s2, size_t __n) noexcept { - return std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - } - - static 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - copy(char_type* __s1, const char_type* __s2, size_t __n) noexcept { - _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2), - "char_traits::copy: source and destination ranges overlap"); - std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - return __s1; - } - - static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 char_type* - assign(char_type* __s, size_t __n, char_type __a) noexcept { - std::fill_n(__s, __n, __a); - return __s; - } - - static inline _LIBCPP_HIDE_FROM_ABI constexpr int_type not_eof(int_type __c) noexcept { - return eq_int_type(__c, eof()) ? ~eof() : __c; - } - static inline _LIBCPP_HIDE_FROM_ABI constexpr char_type to_char_type(int_type __c) noexcept { return char_type(__c); } - static inline _LIBCPP_HIDE_FROM_ABI constexpr int_type to_int_type(char_type __c) noexcept { return int_type(__c); } - static inline _LIBCPP_HIDE_FROM_ABI constexpr bool eq_int_type(int_type __c1, int_type __c2) noexcept { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI constexpr int_type eof() noexcept { return int_type(EOF); } }; #endif // _LIBCPP_HAS_NO_CHAR8_T template <> -struct _LIBCPP_TEMPLATE_VIS char_traits { - using char_type = char16_t; - using int_type = uint_least16_t; - using off_type = streamoff; - using pos_type = u16streampos; - using state_type = mbstate_t; -#if _LIBCPP_STD_VER >= 20 - using comparison_category = strong_ordering; -#endif - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void - assign(char_type& __c1, const char_type& __c2) _NOEXCEPT { - __c1 = __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool lt(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 < __c2; - } - +struct _LIBCPP_TEMPLATE_VIS char_traits + : 
__char_traits_base(0xFFFF)> { _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT; @@ -349,38 +293,6 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { return nullptr; return __match; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* - move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { - return std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - } - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* - copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { - _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2), - "char_traits::copy: source and destination ranges overlap"); - std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - return __s1; - } - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* - assign(char_type* __s, size_t __n, char_type __a) _NOEXCEPT { - std::fill_n(__s, __n, __a); - return __s; - } - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type not_eof(int_type __c) _NOEXCEPT { - return eq_int_type(__c, eof()) ? 
~eof() : __c; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR char_type to_char_type(int_type __c) _NOEXCEPT { - return char_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type to_int_type(char_type __c) _NOEXCEPT { - return int_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq_int_type(int_type __c1, int_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return int_type(0xFFFF); } }; inline _LIBCPP_CONSTEXPR_SINCE_CXX17 int @@ -402,27 +314,8 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t char_traits::length(const } template <> -struct _LIBCPP_TEMPLATE_VIS char_traits { - using char_type = char32_t; - using int_type = uint_least32_t; - using off_type = streamoff; - using pos_type = u32streampos; - using state_type = mbstate_t; -#if _LIBCPP_STD_VER >= 20 - using comparison_category = strong_ordering; -#endif - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 void - assign(char_type& __c1, const char_type& __c2) _NOEXCEPT { - __c1 = __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool lt(char_type __c1, char_type __c2) _NOEXCEPT { - return __c1 < __c2; - } - +struct _LIBCPP_TEMPLATE_VIS char_traits + : __char_traits_base(0xFFFFFFFF)> { _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 int compare(const char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX17 size_t length(const char_type* __s) _NOEXCEPT; @@ -435,37 +328,6 @@ struct _LIBCPP_TEMPLATE_VIS char_traits { return nullptr; return __match; } - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* - move(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { - return 
std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - } - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* - copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT { - std::__constexpr_memmove(__s1, __s2, __element_count(__n)); - return __s1; - } - - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type* - assign(char_type* __s, size_t __n, char_type __a) _NOEXCEPT { - std::fill_n(__s, __n, __a); - return __s; - } - - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type not_eof(int_type __c) _NOEXCEPT { - return eq_int_type(__c, eof()) ? ~eof() : __c; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR char_type to_char_type(int_type __c) _NOEXCEPT { - return char_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type to_int_type(char_type __c) _NOEXCEPT { - return int_type(__c); - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool eq_int_type(int_type __c1, int_type __c2) _NOEXCEPT { - return __c1 == __c2; - } - static inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int_type eof() _NOEXCEPT { return int_type(0xFFFFFFFF); } }; inline _LIBCPP_CONSTEXPR_SINCE_CXX17 int From d64efe42eb98af76ba4ba26f48d079713f513af9 Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Sat, 6 Jul 2024 15:41:40 +0200 Subject: [PATCH 67/67] [lld-macho] Remove symbols to `__mod_init_func` with `-init_offsets` (#97156) When `-fixup_chains`/`-init_offsets` is used, a different section, `__init_offsets` is synthesized from `__mod_init_func`. If there are any symbols defined inside `__mod_init_func`, they are added to the symbol table unconditionally while processing the input files. Later, when querying these symbols' addresses (when constructing the symtab or exports trie), we crash with a null deref, as there is no output section assigned to them. 
Just making the symbols point to `__init_offsets` is a bad idea, as the new section stores 32-bit integers instead of 64-bit pointers; accessing the symbols would not do what the programmer intended. We should entirely omit them from the output. This is what ld64 and ld-prime do. This patch uses the same mechanism as dead-stripping to mark these symbols as not needed in the output. There might be nicer fixes than the workaround, this is discussed in #97155. Fixes https://github.com/llvm/llvm-project/pull/79894#issuecomment-1944092892 Fixes #94716 --- lld/MachO/Driver.cpp | 11 +++++++++++ lld/MachO/Writer.cpp | 12 +++++++++++- lld/test/MachO/init-offsets.s | 6 +++++- lld/test/MachO/invalid/init-offsets.s | 16 ++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 lld/test/MachO/invalid/init-offsets.s diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 9dddabcf3680cc..83c92d214de311 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1393,6 +1393,12 @@ static void handleExplicitExports() { } } +static void eraseInitializerSymbols() { + for (ConcatInputSection *isec : in.initOffsets->inputs()) + for (Defined *sym : isec->symbols) + sym->used = false; +} + namespace lld { namespace macho { bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, @@ -1971,6 +1977,11 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, if (config->deadStrip) markLive(); + // Ensure that no symbols point inside __mod_init_func sections if they are + // removed due to -init_offsets. This must run after dead stripping. + if (config->emitInitOffsets) + eraseInitializerSymbols(); + // Categories are not subject to dead-strip. The __objc_catlist section is // marked as NO_DEAD_STRIP and that propagates into all category data. 
if (args.hasArg(OPT_check_category_conflicts)) diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index b9fcb45ef86b27..e6b80c1d42d9ee 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -640,7 +640,17 @@ void Writer::treatSpecialUndefineds() { static void prepareSymbolRelocation(Symbol *sym, const InputSection *isec, const lld::macho::Reloc &r) { - assert(sym->isLive()); + if (!sym->isLive()) { + if (Defined *defined = dyn_cast(sym)) { + if (config->emitInitOffsets && + defined->isec()->getName() == section_names::moduleInitFunc) + fatal(isec->getLocation(r.offset) + ": cannot reference " + + sym->getName() + + " defined in __mod_init_func when -init_offsets is used"); + } + assert(false && "referenced symbol must be live"); + } + const RelocAttrs &relocAttrs = target->getRelocAttrs(r.type); if (relocAttrs.hasAttr(RelocAttrBits::BRANCH)) { diff --git a/lld/test/MachO/init-offsets.s b/lld/test/MachO/init-offsets.s index 844951a1dc3801..cf34a9b46f3083 100644 --- a/lld/test/MachO/init-offsets.s +++ b/lld/test/MachO/init-offsets.s @@ -12,7 +12,7 @@ # RUN: llvm-objcopy --dump-section=__TEXT,__init_offsets=%t/section.bin %t/out # RUN: echo "__TEXT,__init_offsets contents:" >> %t/dump.txt # RUN: od -An -txI %t/section.bin >> %t/dump.txt -# RUN: FileCheck --check-prefix=CONTENT %s < %t/dump.txt +# RUN: FileCheck --check-prefix=CONTENT --implicit-check-not=_init_ptr %s < %t/dump.txt ## This test checks that: ## - __mod_init_func is replaced by __init_offsets. @@ -21,6 +21,7 @@ ## command line, and in the order they show up within __mod_init_func. ## - for undefined and dylib symbols, stubs are created, and the offsets point to those. ## - offsets are relative to __TEXT's address, they aren't an absolute virtual address. +## - symbols defined within __mod_init_func are ignored. 
# FLAGS: sectname __init_offsets # FLAGS-NEXT: segname __TEXT @@ -48,6 +49,7 @@ #--- first.s .globl _first_init, ___isnan, _main +.globl _init_ptr_1 .text _first_init: ret @@ -55,6 +57,7 @@ _main: ret .section __DATA,__mod_init_func,mod_init_funcs +_init_ptr_1: .quad _first_init .quad ___isnan @@ -68,6 +71,7 @@ _second_init: .section __DATA,__mod_init_func,mod_init_funcs .quad _undefined +_init_ptr_2: .quad _second_init .subsections_via_symbols diff --git a/lld/test/MachO/invalid/init-offsets.s b/lld/test/MachO/invalid/init-offsets.s new file mode 100644 index 00000000000000..51a441e0a3e29d --- /dev/null +++ b/lld/test/MachO/invalid/init-offsets.s @@ -0,0 +1,16 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: not %lld -lSystem -init_offsets %t.o -o /dev/null 2>&1 | FileCheck %s + +# CHECK: error: {{.*}}init-offsets.s.tmp.o:(symbol _main+0x3): cannot reference _init_slot defined in __mod_init_func when -init_offsets is used + +.globl _main +.text +_main: + leaq _init_slot(%rip), %rax + +.section __DATA,__mod_init_func,mod_init_funcs +_init_slot: + .quad _main +