From 00179e92c147e16de1f7c653f88c8805aef820c1 Mon Sep 17 00:00:00 2001
From: Keith Smiley <keithbsmiley@gmail.com>
Date: Wed, 15 May 2024 15:23:29 -0700
Subject: [PATCH 01/16] [bazel] Add new dependencies (#92323)

This also fixes building ... on Linux. Seems like target_compatible_with
isn't enough but you also need a manual tag.
---
 utils/bazel/llvm-project-overlay/lldb/BUILD.bazel           | 5 ++++-
 utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index c6fc4e08aa7271..ddcaea5184d434 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -916,7 +916,10 @@ apple_genrule(
     srcs = [":debugserver_unsigned"],
     outs = ["debugserver"],
     cmd = "cp $(SRCS) $(OUTS) && xcrun codesign -f -s - --entitlements $(location tools/debugserver/resources/debugserver-macosx-entitlements.plist) $(OUTS)",
-    tags = ["nobuildkite"],
+    tags = [
+        "manual",
+        "nobuildkite",
+    ],
     target_compatible_with = select({
         "@platforms//os:macos": [],
         "//conditions:default": ["@platforms//:incompatible"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index 21f0c7092f32e9..b44489e213a400 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -617,6 +617,7 @@ cc_test(
         allow_empty = False,
     ),
     deps = [
+        "//llvm:BitstreamReader",
         "//llvm:Core",
         "//llvm:Coverage",
         "//llvm:DebugInfo",

From 050593fc4f9a7f2b9450ee093c4638b8539315b7 Mon Sep 17 00:00:00 2001
From: Andrey Ali Khan Bolshakov <bolsh.andrey@yandex.ru>
Date: Thu, 16 May 2024 01:39:12 +0300
Subject: [PATCH 02/16] [Coverage] Handle `CoroutineSuspendExpr` correctly
 (#88898)

This avoids visiting `co_await` or `co_yield` operand 5 times (it is
repeated under transformed awaiter subexpression, and under
`await_ready`, `await_suspend`, and `await_resume` generated call
subexpressions).
---
 clang/lib/CodeGen/CoverageMappingGen.cpp |  4 ++++
 clang/test/CoverageMapping/coroutine.cpp | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index ce2f39aeb08213..e46560029ab08d 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -1439,6 +1439,10 @@ struct CounterCoverageMappingBuilder
     terminateRegion(S);
   }
 
+  void VisitCoroutineSuspendExpr(const CoroutineSuspendExpr *E) {
+    Visit(E->getOperand());
+  }
+
   void VisitCXXThrowExpr(const CXXThrowExpr *E) {
     extendRegion(E);
     if (E->getSubExpr())
diff --git a/clang/test/CoverageMapping/coroutine.cpp b/clang/test/CoverageMapping/coroutine.cpp
index 0105005d198a1c..d322bc351a7276 100644
--- a/clang/test/CoverageMapping/coroutine.cpp
+++ b/clang/test/CoverageMapping/coroutine.cpp
@@ -32,6 +32,7 @@ struct std::coroutine_traits<int, int> {
     suspend_always final_suspend() noexcept;
     void unhandled_exception() noexcept;
     void return_value(int);
+    suspend_always yield_value(int);
   };
 };
 
@@ -45,3 +46,21 @@ int f1(int x) {       // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+8]]:2 = #0
   }                   // CHECK-NEXT: File 0, [[@LINE-2]]:10 -> [[@LINE]]:4 = (#0 - #1)
   co_return x;        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:4 -> [[@LINE]]:3 = #1
 } // CHECK-NEXT: File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:14 = #1
+
+// CHECK-LABEL: _Z2f2i:
+// CHECK-NEXT: File 0, [[@LINE+1]]:15 -> [[@LINE+15]]:2 = #0
+int f2(int x) {
+// CHECK-NEXT: File 0, [[@LINE+5]]:13 -> [[@LINE+5]]:18 = #0
+// CHECK-NEXT: Branch,File 0, [[@LINE+4]]:13 -> [[@LINE+4]]:18 = #1, (#0 - #1)
+// CHECK-NEXT: Gap,File 0, [[@LINE+3]]:20 -> [[@LINE+3]]:21 = #1
+// CHECK-NEXT: File 0, [[@LINE+2]]:21 -> [[@LINE+2]]:37 = #1
+// CHECK-NEXT: File 0, [[@LINE+1]]:40 -> [[@LINE+1]]:56 = (#0 - #1)
+  co_await (x > 0 ? suspend_always{} : suspend_always{});
+// CHECK-NEXT: File 0, [[@LINE+5]]:12 -> [[@LINE+5]]:17 = #0
+// CHECK-NEXT: Branch,File 0, [[@LINE+4]]:12 -> [[@LINE+4]]:17 = #2, (#0 - #2)
+// CHECK-NEXT: Gap,File 0, [[@LINE+3]]:19 -> [[@LINE+3]]:20 = #2
+// CHECK-NEXT: File 0, [[@LINE+2]]:20 -> [[@LINE+2]]:21 = #2
+// CHECK-NEXT: File 0, [[@LINE+1]]:24 -> [[@LINE+1]]:25 = (#0 - #2)
+  co_yield x > 0 ? 1 : 2;
+  co_return 0;
+}

From 5ff6c6542ac451daaed6c417e481e313165d3454 Mon Sep 17 00:00:00 2001
From: Andrey Ali Khan Bolshakov <bolsh.andrey@yandex.ru>
Date: Thu, 16 May 2024 01:40:03 +0300
Subject: [PATCH 03/16] [Coverage] Handle array decomposition correctly
 (#88881)

`ArrayInitLoopExpr` AST node has two occurences of its as-written
initializing expression in its subexpressions through a non-unique
`OpaqueValueExpr`. It causes double-visiting of the initializing
expression if not handled explicitly, as discussed in #85837.
---
 clang/lib/CodeGen/CoverageMappingGen.cpp     |  4 ++++
 clang/test/CoverageMapping/decomposition.cpp | 15 +++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 clang/test/CoverageMapping/decomposition.cpp

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index e46560029ab08d..cc8ab7a5b4369e 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -2177,6 +2177,10 @@ struct CounterCoverageMappingBuilder
     // propagate counts into them.
   }
 
+  void VisitArrayInitLoopExpr(const ArrayInitLoopExpr *AILE) {
+    Visit(AILE->getCommonExpr()->getSourceExpr());
+  }
+
   void VisitPseudoObjectExpr(const PseudoObjectExpr *POE) {
     // Just visit syntatic expression as this is what users actually write.
     VisitStmt(POE->getSyntacticForm());
diff --git a/clang/test/CoverageMapping/decomposition.cpp b/clang/test/CoverageMapping/decomposition.cpp
new file mode 100644
index 00000000000000..601ea630faeec9
--- /dev/null
+++ b/clang/test/CoverageMapping/decomposition.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+
+// CHECK-LABEL:       _Z19array_decompositioni:
+// CHECK-NEXT:          File 0, [[@LINE+6]]:32 -> {{[0-9]+}}:2 = #0
+// CHECK-NEXT:          File 0, [[@LINE+8]]:20 -> [[@LINE+8]]:25 = #0
+// CHECK-NEXT:          Branch,File 0, [[@LINE+7]]:20 -> [[@LINE+7]]:25 = #1, (#0 - #1)
+// CHECK-NEXT:          Gap,File 0, [[@LINE+6]]:27 -> [[@LINE+6]]:28 = #1
+// CHECK-NEXT:          File 0, [[@LINE+5]]:28 -> [[@LINE+5]]:29 = #1
+// CHECK-NEXT:          File 0, [[@LINE+4]]:32 -> [[@LINE+4]]:33 = (#0 - #1)
+int array_decomposition(int i) {
+  int a[] = {1, 2, 3};
+  int b[] = {4, 5, 6};
+  auto [x, y, z] = i > 0 ? a : b;
+  return x + y + z;
+}

From aa889d7783af050ce5d19af67c7225ee119d625e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 15 May 2024 15:41:20 -0700
Subject: [PATCH 04/16] [ELF,test] Fix FileCheck prefixes

Most violations are stale and should be removed while a few can be
adjusted.

Reported at #92238
---
 lld/test/ELF/arm-exidx-shared.s           |  2 +-
 lld/test/ELF/mips-tls-hilo.s              | 10 ----------
 lld/test/ELF/ppc32-reloc-rel.s            |  3 ++-
 lld/test/ELF/ppc64-pcrel-call-to-extern.s |  5 -----
 lld/test/ELF/ppc64-toc-relax-ifunc.s      | 13 +++++--------
 lld/test/ELF/riscv-gp.s                   |  4 ----
 6 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/lld/test/ELF/arm-exidx-shared.s b/lld/test/ELF/arm-exidx-shared.s
index fce605d6d96aa5..2e484e5c065fee 100644
--- a/lld/test/ELF/arm-exidx-shared.s
+++ b/lld/test/ELF/arm-exidx-shared.s
@@ -2,7 +2,7 @@
 // RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=armv7a-none-linux-gnueabi %s -o %t
 // RUN: ld.lld --hash-style=sysv %t --shared -o %t2
 // RUN: llvm-readobj --relocations %t2 | FileCheck %s
-// RUN: llvm-objdump -s --triple=armv7a-none-linux-gnueabi %t2 | FileCheck --check-prefix=CHECK-EXTAB-NEXT %s
+// RUN: llvm-objdump -s --triple=armv7a-none-linux-gnueabi %t2 | FileCheck --check-prefix=CHECK-EXTAB %s
 
 // Check that the relative R_ARM_PREL31 relocation can access a PLT entry
 // for when the personality routine is referenced from a shared library.
diff --git a/lld/test/ELF/mips-tls-hilo.s b/lld/test/ELF/mips-tls-hilo.s
index 6fd2033aac4142..9c67f9fe14ba6f 100644
--- a/lld/test/ELF/mips-tls-hilo.s
+++ b/lld/test/ELF/mips-tls-hilo.s
@@ -28,16 +28,6 @@
 # CHECK-NEXT: ]
 # CHECK-NOT:  Primary GOT
 
-# SO:      Relocations [
-# SO-NEXT: ]
-# SO:      Primary GOT {
-# SO:        Local entries [
-# SO-NEXT:   ]
-# SO-NEXT:   Global entries [
-# SO-NEXT:   ]
-# SO-NEXT:   Number of TLS and multi-GOT entries: 0
-# SO-NEXT: }
-
   .text
   .globl  __start
   .type __start,@function
diff --git a/lld/test/ELF/ppc32-reloc-rel.s b/lld/test/ELF/ppc32-reloc-rel.s
index b89e0b43cb785c..d13ebdb7997f7c 100644
--- a/lld/test/ELF/ppc32-reloc-rel.s
+++ b/lld/test/ELF/ppc32-reloc-rel.s
@@ -6,6 +6,7 @@
 # RUN: llvm-mc -filetype=obj -triple=powerpcle %s -o %t.le.o
 # RUN: ld.lld %t.le.o -o %t
 # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+# RUN: llvm-objdump -s %t | FileCheck %s --check-prefix=HEX
 
 .section .R_PPC_REL14,"ax",@progbits
   beq 1f
@@ -23,7 +24,7 @@
   .long 1f - .
 1:
 # HEX-LABEL: section .R_PPC_REL32:
-# HEX-NEXT: 10010008 00000004
+# HEX-NEXT:  04000000
 
 .section .R_PPC_PLTREL24,"ax",@progbits
   b 1f@PLT+32768
diff --git a/lld/test/ELF/ppc64-pcrel-call-to-extern.s b/lld/test/ELF/ppc64-pcrel-call-to-extern.s
index e5846e80ce2382..de05b733e175a2 100644
--- a/lld/test/ELF/ppc64-pcrel-call-to-extern.s
+++ b/lld/test/ELF/ppc64-pcrel-call-to-extern.s
@@ -73,9 +73,7 @@
 
 ## DT_PLTGOT points to .plt
 # SEC: .plt              NOBITS          0000000010030168 040168 000028 00  WA  0   0  8
-# SEC-OG: .plt              NOBITS          0000000010030158 040158 000028 00  WA  0   0  8
 # SEC: 0x0000000000000003 (PLTGOT)      0x10030168
-# SEC-OG: 0x0000000000000003 (PLTGOT)      0x10030168
 
 ## DT_PLTGOT points to .plt
 # SEC-NOP10: .plt              NOBITS          0000000010030168 040168 000028 00  WA  0   0  8
@@ -86,11 +84,8 @@
 ## Check that we emit 3 R_PPC64_JMP_SLOT in .rela.plt.
 # REL:      .rela.plt {
 # REL-NEXT:   0x10030178 R_PPC64_JMP_SLOT callee_global_stother0 0x0
-# REL-NEXT-OG:   0x10030168 R_PPC64_JMP_SLOT callee_global_stother0 0x0
 # REL-NEXT:   0x10030180 R_PPC64_JMP_SLOT callee_global_stother1 0x0
-# REL-NEXT-OG:   0x10030170 R_PPC64_JMP_SLOT callee_global_stother1 0x0
 # REL-NEXT:   0x10030188 R_PPC64_JMP_SLOT callee_global_TOC 0x0
-# REL-NEXT-OG:   0x10030178 R_PPC64_JMP_SLOT callee_global_TOC 0x0
 # REL-NEXT: }
 
 # REL-NOP10:      .rela.plt {
diff --git a/lld/test/ELF/ppc64-toc-relax-ifunc.s b/lld/test/ELF/ppc64-toc-relax-ifunc.s
index 9fb1bf0023b6d4..00a63c7e5b6772 100644
--- a/lld/test/ELF/ppc64-toc-relax-ifunc.s
+++ b/lld/test/ELF/ppc64-toc-relax-ifunc.s
@@ -4,7 +4,7 @@
 # RUN: echo '.globl ifunc; .type ifunc, %gnu_indirect_function; ifunc:' | \
 # RUN:   llvm-mc -filetype=obj -triple=powerpc64le - -o %t1.o
 # RUN: ld.lld %t.o %t1.o -o %t
-# RUN: llvm-readelf -S -s %t | FileCheck --check-prefix=SEC %s
+# RUN: llvm-readelf -Ssr %t | FileCheck --check-prefix=SEC %s
 # RUN: llvm-readelf -x .toc %t | FileCheck --check-prefix=HEX %s
 # RUN: llvm-objdump -d %t | FileCheck --check-prefix=DIS %s
 
@@ -13,18 +13,15 @@
 ## still perform toc-indirect to toc-relative relaxation because the distance
 ## to the address of the canonical PLT is fixed.
 
-# SEC: .text PROGBITS 00000000100101e0
-# SEC: .plt  NOBITS   0000000010030200
-# SEC: 00000000100101e8 0 FUNC GLOBAL DEFAULT 3 ifunc
+# SEC: .text PROGBITS [[#%x,TEXT:]]
+# SEC: .plt  NOBITS   [[#%x,PLT:]]
+# SEC: {{0*}}[[#PLT]] {{.+}} R_PPC64_IRELATIVE [[#TEXT+8]]
+# SEC: {{0*}}[[#TEXT+8]] 0 FUNC GLOBAL DEFAULT 3 ifunc
 
 ## .toc[0] stores the address of the canonical PLT.
 # HEX:      section '.toc':
 # HEX-NEXT: 0x100201f8 e8010110 00000000
 
-# REL:      .rela.dyn {
-# REL-NEXT:   0x100301f8 R_PPC64_IRELATIVE - 0x100101e8
-# REL-NEXT: }
-
 # DIS: addi 3, 3,
 
 addis 3, 2, .toc@toc@ha
diff --git a/lld/test/ELF/riscv-gp.s b/lld/test/ELF/riscv-gp.s
index 29411d19b01929..e82e36ee9a7ae5 100644
--- a/lld/test/ELF/riscv-gp.s
+++ b/lld/test/ELF/riscv-gp.s
@@ -16,10 +16,6 @@
 # SEC64: [ [[#SDATA:]]] .sdata PROGBITS {{0*}}000032e0
 # SEC64: {{0*}}00003ae0 0 NOTYPE GLOBAL DEFAULT [[#SDATA]] __global_pointer$
 
-## __global_pointer$ - 0x1000 = 4096*3-2048
-# DIS:      1000: auipc gp, 3
-# DIS-NEXT:       addi gp, gp, -2048
-
 # ERR: error: relocation R_RISCV_PCREL_HI20 cannot be used against symbol '__global_pointer$'; recompile with -fPIC
 
 ## -r mode does not define __global_pointer$.

From 0585eed9409c1362f7deaabc42c1d3c3f55c4b6c Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 15 May 2024 15:44:05 -0700
Subject: [PATCH 05/16] [lldb-dap] Support publishing to the VSCode market
 place (#92320)

Update the publisher and add a publish script that we can use from
Github actions.
---
 lldb/tools/lldb-dap/package.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index 2e8ad074256bf5..aeb24445551c1e 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -2,7 +2,7 @@
   "name": "lldb-dap",
   "displayName": "LLDB DAP",
   "version": "0.2.0",
-  "publisher": "llvm",
+  "publisher": "llvm-vs-code-extensions",
   "homepage": "https://lldb.llvm.org",
   "description": "LLDB debugging from VSCode",
   "license": "Apache 2.0 License with LLVM exceptions",
@@ -42,6 +42,7 @@
     "watch": "tsc -watch -p ./",
     "format": "npx prettier './src-ts/' --write",
     "package": "vsce package --out ./out/lldb-dap.vsix",
+    "publish": "vsce publish",
     "vscode-uninstall": "code --uninstall-extension llvm.lldb-dap",
     "vscode-install": "code --install-extension ./out/lldb-dap.vsix"
   },

From e00a3ccf43563209b71c5b68f56d83f4052dca63 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler@users.noreply.github.com>
Date: Wed, 15 May 2024 15:44:37 -0700
Subject: [PATCH 06/16] [flang] New -fdebug-unparse-with-modules option
 (#91660)

This option is a compilation action that parses a source file and
performs semantic analysis on it, like the existing -fdebug-unparse
option does. Its output, however, is preceded by the effective contents
of all of the non-intrinsic modules on which it depends but does not
define, transitively preceded by the closure of all of those modules'
dependencies.

The output from this option is therefore the analyzed parse tree for a
source file encapsulated with all of its non-intrinsic module
dependencies. This output may be useful for extracting code from large
applications for use as an attachment to a bug report, or as input to a
test case reduction tool for problem isolation.
---
 clang/include/clang/Driver/Options.td         |  4 +-
 .../include/flang/Frontend/FrontendActions.h  |  4 ++
 .../include/flang/Frontend/FrontendOptions.h  |  4 ++
 .../flang/Semantics/unparse-with-symbols.h    |  4 ++
 flang/lib/Frontend/CompilerInvocation.cpp     |  3 ++
 flang/lib/Frontend/FrontendActions.cpp        |  9 +++++
 .../ExecuteCompilerInvocation.cpp             |  2 +
 flang/lib/Semantics/mod-file.cpp              | 20 +++++++++-
 flang/lib/Semantics/mod-file.h                |  3 ++
 flang/lib/Semantics/unparse-with-symbols.cpp  | 38 +++++++++++++++++++
 flang/test/Driver/unparse-with-modules.f90    | 34 +++++++++++++++++
 11 files changed, 122 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Driver/unparse-with-modules.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c54eb543d6580e..e579f1a0a3665c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6647,7 +6647,9 @@ def fdebug_unparse : Flag<["-"], "fdebug-unparse">, Group<Action_Group>,
   DocBrief<[{Run the parser and the semantic checks. Then unparse the
 parse-tree and output the generated Fortran source file.}]>;
 def fdebug_unparse_with_symbols : Flag<["-"], "fdebug-unparse-with-symbols">, Group<Action_Group>,
-  HelpText<"Unparse and stop.">;
+  HelpText<"Unparse with symbols and stop.">;
+def fdebug_unparse_with_modules : Flag<["-"], "fdebug-unparse-with-modules">, Group<Action_Group>,
+  HelpText<"Unparse with dependent modules and stop.">;
 def fdebug_dump_symbols : Flag<["-"], "fdebug-dump-symbols">, Group<Action_Group>,
   HelpText<"Dump symbols after the semantic analysis">;
 def fdebug_dump_parse_tree : Flag<["-"], "fdebug-dump-parse-tree">, Group<Action_Group>,
diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h
index e2e859f3a81bd7..7823565eb815f8 100644
--- a/flang/include/flang/Frontend/FrontendActions.h
+++ b/flang/include/flang/Frontend/FrontendActions.h
@@ -108,6 +108,10 @@ class DebugUnparseWithSymbolsAction : public PrescanAndSemaAction {
   void executeAction() override;
 };
 
+class DebugUnparseWithModulesAction : public PrescanAndSemaAction {
+  void executeAction() override;
+};
+
 class DebugUnparseAction : public PrescanAndSemaAction {
   void executeAction() override;
 };
diff --git a/flang/include/flang/Frontend/FrontendOptions.h b/flang/include/flang/Frontend/FrontendOptions.h
index 06b1318f243b08..82ca99672ec610 100644
--- a/flang/include/flang/Frontend/FrontendOptions.h
+++ b/flang/include/flang/Frontend/FrontendOptions.h
@@ -63,6 +63,10 @@ enum ActionKind {
   /// Fortran source file
   DebugUnparseWithSymbols,
 
+  /// Parse, run semantics, and output a Fortran source file preceded
+  /// by all the necessary modules (transitively)
+  DebugUnparseWithModules,
+
   /// Parse, run semantics and then output symbols from semantics
   DebugDumpSymbols,
 
diff --git a/flang/include/flang/Semantics/unparse-with-symbols.h b/flang/include/flang/Semantics/unparse-with-symbols.h
index d70110245e2b2f..5e18b3fc3063db 100644
--- a/flang/include/flang/Semantics/unparse-with-symbols.h
+++ b/flang/include/flang/Semantics/unparse-with-symbols.h
@@ -21,8 +21,12 @@ struct Program;
 }
 
 namespace Fortran::semantics {
+class SemanticsContext;
 void UnparseWithSymbols(llvm::raw_ostream &, const parser::Program &,
     parser::Encoding encoding = parser::Encoding::UTF_8);
+void UnparseWithModules(llvm::raw_ostream &, SemanticsContext &,
+    const parser::Program &,
+    parser::Encoding encoding = parser::Encoding::UTF_8);
 }
 
 #endif // FORTRAN_SEMANTICS_UNPARSE_WITH_SYMBOLS_H_
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index db7fd3cccc7a28..e8a8c90045d92d 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -488,6 +488,9 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
     case clang::driver::options::OPT_fdebug_unparse_with_symbols:
       opts.programAction = DebugUnparseWithSymbols;
       break;
+    case clang::driver::options::OPT_fdebug_unparse_with_modules:
+      opts.programAction = DebugUnparseWithModules;
+      break;
     case clang::driver::options::OPT_fdebug_dump_symbols:
       opts.programAction = DebugDumpSymbols;
       break;
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 2f65ab6102f4d9..4341c104a69df2 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -477,6 +477,15 @@ void DebugUnparseWithSymbolsAction::executeAction() {
   reportFatalSemanticErrors();
 }
 
+void DebugUnparseWithModulesAction::executeAction() {
+  auto &parseTree{*getInstance().getParsing().parseTree()};
+  CompilerInstance &ci{getInstance()};
+  Fortran::semantics::UnparseWithModules(
+      llvm::outs(), ci.getSemantics().context(), parseTree,
+      /*encoding=*/Fortran::parser::Encoding::UTF_8);
+  reportFatalSemanticErrors();
+}
+
 void DebugDumpSymbolsAction::executeAction() {
   CompilerInstance &ci = this->getInstance();
 
diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index 4cad640562c619..e2cbd5112d6ea5 100644
--- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -59,6 +59,8 @@ createFrontendAction(CompilerInstance &ci) {
     return std::make_unique<DebugUnparseNoSemaAction>();
   case DebugUnparseWithSymbols:
     return std::make_unique<DebugUnparseWithSymbolsAction>();
+  case DebugUnparseWithModules:
+    return std::make_unique<DebugUnparseWithModulesAction>();
   case DebugDumpSymbols:
     return std::make_unique<DebugDumpSymbolsAction>();
   case DebugDumpParseTree:
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index e9aebe5b08f2ba..bb8c6c7567b8d7 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -132,11 +132,11 @@ static std::string ModFileName(const SourceName &name,
 
 // Write the module file for symbol, which must be a module or submodule.
 void ModFileWriter::Write(const Symbol &symbol) {
-  auto &module{symbol.get<ModuleDetails>()};
+  const auto &module{symbol.get<ModuleDetails>()};
   if (module.moduleFileHash()) {
     return; // already written
   }
-  auto *ancestor{module.ancestor()};
+  const auto *ancestor{module.ancestor()};
   isSubmodule_ = ancestor != nullptr;
   auto ancestorName{ancestor ? ancestor->GetName().value().ToString() : ""s};
   auto path{context_.moduleDirectory() + '/' +
@@ -151,6 +151,21 @@ void ModFileWriter::Write(const Symbol &symbol) {
   const_cast<ModuleDetails &>(module).set_moduleFileHash(checkSum);
 }
 
+void ModFileWriter::WriteClosure(llvm::raw_ostream &out, const Symbol &symbol,
+    UnorderedSymbolSet &nonIntrinsicModulesWritten) {
+  if (!symbol.has<ModuleDetails>() || symbol.owner().IsIntrinsicModules() ||
+      !nonIntrinsicModulesWritten.insert(symbol).second) {
+    return;
+  }
+  PutSymbols(DEREF(symbol.scope()));
+  needsBuf_.clear(); // omit module checksums
+  auto str{GetAsString(symbol)};
+  for (auto depRef : std::move(usedNonIntrinsicModules_)) {
+    WriteClosure(out, *depRef, nonIntrinsicModulesWritten);
+  }
+  out << std::move(str);
+}
+
 // Return the entire body of the module file
 // and clear saved uses, decls, and contains.
 std::string ModFileWriter::GetAsString(const Symbol &symbol) {
@@ -710,6 +725,7 @@ void ModFileWriter::PutUse(const Symbol &symbol) {
     uses_ << "use,intrinsic::";
   } else {
     uses_ << "use ";
+    usedNonIntrinsicModules_.insert(module);
   }
   uses_ << module.name() << ",only:";
   PutGenericName(uses_, symbol);
diff --git a/flang/lib/Semantics/mod-file.h b/flang/lib/Semantics/mod-file.h
index b4ece4018c054d..739add32c2e0ee 100644
--- a/flang/lib/Semantics/mod-file.h
+++ b/flang/lib/Semantics/mod-file.h
@@ -35,6 +35,8 @@ class ModFileWriter {
 public:
   explicit ModFileWriter(SemanticsContext &context) : context_{context} {}
   bool WriteAll();
+  void WriteClosure(llvm::raw_ostream &, const Symbol &,
+      UnorderedSymbolSet &nonIntrinsicModulesWritten);
 
 private:
   SemanticsContext &context_;
@@ -46,6 +48,7 @@ class ModFileWriter {
   std::string containsBuf_;
   // Tracks nested DEC structures and fields of that type
   UnorderedSymbolSet emittedDECStructures_, emittedDECFields_;
+  UnorderedSymbolSet usedNonIntrinsicModules_;
 
   llvm::raw_string_ostream needs_{needsBuf_};
   llvm::raw_string_ostream uses_{usesBuf_};
diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp
index 67016e85777c7e..c451f885c06279 100644
--- a/flang/lib/Semantics/unparse-with-symbols.cpp
+++ b/flang/lib/Semantics/unparse-with-symbols.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Semantics/unparse-with-symbols.h"
+#include "mod-file.h"
 #include "flang/Parser/parse-tree-visitor.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/unparse.h"
@@ -98,4 +99,41 @@ void UnparseWithSymbols(llvm::raw_ostream &out, const parser::Program &program,
           int indent) { visitor.PrintSymbols(location, out, indent); }};
   parser::Unparse(out, program, encoding, false, true, &preStatement);
 }
+
+// UnparseWithModules()
+
+class UsedModuleVisitor {
+public:
+  UnorderedSymbolSet &modulesUsed() { return modulesUsed_; }
+  UnorderedSymbolSet &modulesDefined() { return modulesDefined_; }
+  template <typename T> bool Pre(const T &) { return true; }
+  template <typename T> void Post(const T &) {}
+  void Post(const parser::ModuleStmt &module) {
+    if (module.v.symbol) {
+      modulesDefined_.insert(*module.v.symbol);
+    }
+  }
+  void Post(const parser::UseStmt &use) {
+    if (use.moduleName.symbol) {
+      modulesUsed_.insert(*use.moduleName.symbol);
+    }
+  }
+
+private:
+  UnorderedSymbolSet modulesUsed_;
+  UnorderedSymbolSet modulesDefined_;
+};
+
+void UnparseWithModules(llvm::raw_ostream &out, SemanticsContext &context,
+    const parser::Program &program, parser::Encoding encoding) {
+  UsedModuleVisitor visitor;
+  parser::Walk(program, visitor);
+  UnorderedSymbolSet nonIntrinsicModulesWritten{
+      std::move(visitor.modulesDefined())};
+  ModFileWriter writer{context};
+  for (SymbolRef moduleRef : visitor.modulesUsed()) {
+    writer.WriteClosure(out, *moduleRef, nonIntrinsicModulesWritten);
+  }
+  parser::Unparse(out, program, encoding, false, true);
+}
 } // namespace Fortran::semantics
diff --git a/flang/test/Driver/unparse-with-modules.f90 b/flang/test/Driver/unparse-with-modules.f90
new file mode 100644
index 00000000000000..53997f7804efa4
--- /dev/null
+++ b/flang/test/Driver/unparse-with-modules.f90
@@ -0,0 +1,34 @@
+! RUN: %flang_fc1 -I %S/Inputs/module-dir -fdebug-unparse-with-modules %s | FileCheck %s
+module m1
+  use iso_fortran_env
+  use BasicTestModuleTwo
+  implicit none
+  type(t2) y
+  real(real32) x
+end
+
+program test
+  use m1
+  use BasicTestModuleTwo
+  implicit none
+  x = 123.
+  y = t2()
+end
+
+!CHECK-NOT: module iso_fortran_env
+!CHECK: module basictestmoduletwo
+!CHECK: type::t2
+!CHECK: end type
+!CHECK: end
+!CHECK: module m1
+!CHECK:  use :: iso_fortran_env
+!CHECK:  implicit none
+!CHECK:  real(kind=real32) x
+!CHECK: end module
+!CHECK: program test
+!CHECK:  use :: m1
+!CHECK:  use :: basictestmoduletwo
+!CHECK:  implicit none
+!CHECK:  x = 123.
+!CHECK:  y = t2()
+!CHECK: end program

From 667d12f86e626173726e87e101626a9060b8d967 Mon Sep 17 00:00:00 2001
From: Krystian Stasiowski <sdkrystian@gmail.com>
Date: Wed, 15 May 2024 18:55:53 -0400
Subject: [PATCH 07/16] [Clang][Sema] Do not mark template parameters in the
 exception specification as used during partial ordering (#91534)

We do not deduce template arguments from the exception specification
when determining the primary template of a function template
specialization or when taking the address of a function template.
Therefore, this patch changes `isAtLeastAsSpecializedAs` such that we do
not mark template parameters in the exception specification as 'used'
during partial ordering (per [temp.deduct.partial]
p12) to prevent the following from being ambiguous:

```
template<typename T, typename U>
void f(U) noexcept(noexcept(T())); // #1

template<typename T>
void f(T*) noexcept; // #2

template<>
void f<int>(int*) noexcept; // currently ambiguous, selects #2 with this patch applied
```

Although there is no corresponding wording in the standard (see core issue filed here
https://github.com/cplusplus/CWG/issues/537), this seems
to be the intended behavior given the definition of _deduction
substitution loci_ in [temp.deduct.general] p7 (and EDG does the same thing).
---
 clang/docs/ReleaseNotes.rst                   |  3 +
 clang/lib/Sema/SemaTemplateDeduction.cpp      | 36 +++++++---
 .../temp.deduct/temp.deduct.partial/p3.cpp    | 72 +++++++++++++++++++
 3 files changed, 103 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ae699ebfc60383..6f7e54252150c6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -713,6 +713,9 @@ Bug Fixes to C++ Support
 - Correctly treat the compound statement of an ``if consteval`` as an immediate context. Fixes (#GH91509).
 - When partial ordering alias templates against template template parameters,
   allow pack expansions when the alias has a fixed-size parameter list. Fixes (#GH62529).
+- Clang now ignores template parameters only used within the exception specification of candidate function
+  templates during partial ordering when deducing template arguments from a function declaration or when
+  taking the address of a function template.
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 853c0e1b50619e..b5d405111fe4cb 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5453,7 +5453,7 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc,
     //     is used.
     if (DeduceTemplateArgumentsByTypeMatch(
             S, TemplateParams, FD2->getType(), FD1->getType(), Info, Deduced,
-            TDF_None,
+            TDF_AllowCompatibleFunctionType,
             /*PartialOrdering=*/true) != TemplateDeductionResult::Success)
       return false;
     break;
@@ -5485,20 +5485,40 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc,
   switch (TPOC) {
   case TPOC_Call:
     for (unsigned I = 0, N = Args2.size(); I != N; ++I)
-      ::MarkUsedTemplateParameters(S.Context, Args2[I], false,
-                                   TemplateParams->getDepth(),
-                                   UsedParameters);
+      ::MarkUsedTemplateParameters(S.Context, Args2[I], /*OnlyDeduced=*/false,
+                                   TemplateParams->getDepth(), UsedParameters);
     break;
 
   case TPOC_Conversion:
-    ::MarkUsedTemplateParameters(S.Context, Proto2->getReturnType(), false,
+    ::MarkUsedTemplateParameters(S.Context, Proto2->getReturnType(),
+                                 /*OnlyDeduced=*/false,
                                  TemplateParams->getDepth(), UsedParameters);
     break;
 
   case TPOC_Other:
-    ::MarkUsedTemplateParameters(S.Context, FD2->getType(), false,
-                                 TemplateParams->getDepth(),
-                                 UsedParameters);
+    // We do not deduce template arguments from the exception specification
+    // when determining the primary template of a function template
+    // specialization or when taking the address of a function template.
+    // Therefore, we do not mark template parameters in the exception
+    // specification as used during partial ordering to prevent the following
+    // from being ambiguous:
+    //
+    //   template<typename T, typename U>
+    //   void f(U) noexcept(noexcept(T())); // #1
+    //
+    //   template<typename T>
+    //   void f(T*) noexcept; // #2
+    //
+    //   template<>
+    //   void f<int>(int*) noexcept; // explicit specialization of #2
+    //
+    // Although there is no corresponding wording in the standard, this seems
+    // to be the intended behavior given the definition of
+    // 'deduction substitution loci' in [temp.deduct].
+    ::MarkUsedTemplateParameters(
+        S.Context,
+        S.Context.getFunctionTypeWithExceptionSpec(FD2->getType(), EST_None),
+        /*OnlyDeduced=*/false, TemplateParams->getDepth(), UsedParameters);
     break;
   }
 
diff --git a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp
new file mode 100644
index 00000000000000..cc1d4ecda2ecca
--- /dev/null
+++ b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp
@@ -0,0 +1,72 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template<bool B>
+struct A { };
+
+constexpr A<false> a;
+constexpr A<false> b;
+
+constexpr int* x = nullptr;
+constexpr short* y = nullptr;
+
+namespace ExplicitArgs {
+  template<typename T, typename U>
+  constexpr int f(U) noexcept(noexcept(T())) {
+    return 0;
+  }
+
+  template<typename T>
+  constexpr int f(T*) noexcept {
+    return 1;
+  }
+
+  template<>
+  constexpr int f<int>(int*) noexcept {
+    return 2;
+  }
+
+  static_assert(f<int>(1) == 0);
+  static_assert(f<short>(y) == 1);
+  static_assert(f<int>(x) == 2);
+
+  template<typename T, typename U>
+  constexpr int g(U*) noexcept(noexcept(T())) {
+    return 3;
+  }
+
+  template<typename T>
+  constexpr int g(T) noexcept {
+    return 4;
+  }
+
+  template<>
+  constexpr int g<int>(int*) noexcept {
+    return 5;
+  }
+
+  static_assert(g<int>(y) == 3);
+  static_assert(g<short>(1) == 4);
+  static_assert(g<int>(x) == 5);
+} // namespace ExplicitArgs
+
+namespace DeducedArgs {
+  template<typename T, bool B>
+  constexpr int f(T, A<B>) noexcept(B) {
+    return 0;
+  }
+
+  template<typename T, bool B>
+  constexpr int f(T*, A<B>) noexcept(B && B) {
+    return 1;
+  }
+
+  template<>
+  constexpr int f(int*, A<false>) {
+    return 2;
+  }
+
+  static_assert(f<int*>(x, a) == 0);
+  static_assert(f<short>(y, a) == 1);
+  static_assert(f<int>(x, a) == 2);
+} // namespace DeducedArgs

From 325d1d0b73aa6bff0ce4174b45a7601f6b32a793 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler@users.noreply.github.com>
Date: Wed, 15 May 2024 15:58:20 -0700
Subject: [PATCH 08/16] [flang] Fix purity checking for internal subprograms
 (#91759)

ELEMENTAL internal subprograms are pure unless explicitly IMPURE.
---
 flang/lib/Semantics/check-purity.cpp | 10 +++--
 flang/test/Semantics/pure02.f90      | 59 ++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Semantics/pure02.f90

diff --git a/flang/lib/Semantics/check-purity.cpp b/flang/lib/Semantics/check-purity.cpp
index 5176390f366bd5..55a9a2f107388d 100644
--- a/flang/lib/Semantics/check-purity.cpp
+++ b/flang/lib/Semantics/check-purity.cpp
@@ -39,12 +39,16 @@ bool PurityChecker::InPureSubprogram() const {
 
 bool PurityChecker::HasPurePrefix(
     const std::list<parser::PrefixSpec> &prefixes) const {
+  bool result{false};
   for (const parser::PrefixSpec &prefix : prefixes) {
-    if (std::holds_alternative<parser::PrefixSpec::Pure>(prefix.u)) {
-      return true;
+    if (std::holds_alternative<parser::PrefixSpec::Impure>(prefix.u)) {
+      return false;
+    } else if (std::holds_alternative<parser::PrefixSpec::Pure>(prefix.u) ||
+        std::holds_alternative<parser::PrefixSpec::Elemental>(prefix.u)) {
+      result = true;
     }
   }
-  return false;
+  return result;
 }
 
 void PurityChecker::Entered(
diff --git a/flang/test/Semantics/pure02.f90 b/flang/test/Semantics/pure02.f90
new file mode 100644
index 00000000000000..11dc0fd268293b
--- /dev/null
+++ b/flang/test/Semantics/pure02.f90
@@ -0,0 +1,59 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+pure subroutine s1
+ contains
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  subroutine t1
+  end
+  pure subroutine t2 ! ok
+  end
+  elemental subroutine t3(k) ! ok
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  impure elemental subroutine t4(k)
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  elemental impure subroutine t5(k)
+    integer, intent(in) :: k
+  end
+end
+
+elemental subroutine s2(j)
+  integer, intent(in) :: j
+ contains
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  subroutine t1
+  end
+  pure subroutine t2 ! ok
+  end
+  elemental subroutine t3(k) ! ok
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  impure elemental subroutine t4(k)
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  elemental impure subroutine t5(k)
+    integer, intent(in) :: k
+  end
+end
+
+impure elemental subroutine s3(j)
+  integer, intent(in) :: j
+ contains
+  subroutine t1
+  end
+  pure subroutine t2
+  end
+  elemental subroutine t3(k)
+    integer, intent(in) :: k
+  end
+  impure elemental subroutine t4(k)
+    integer, intent(in) :: k
+  end
+  elemental impure subroutine t5(k)
+    integer, intent(in) :: k
+  end
+end

From c227bf1b217598066acd32de8c9a75c2e0928f89 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Wed, 15 May 2024 20:01:17 -0300
Subject: [PATCH 09/16] [clang] Create new warning group for deprecation of
 '-fno-relaxed-template-template-args' (#92324)

This allows the warning to be disabled in isolation, as it helps when
treating them as errors.
---
 clang/docs/ReleaseNotes.rst                           |  3 ++-
 clang/include/clang/Basic/DiagnosticDriverKinds.td    |  3 +++
 clang/include/clang/Basic/DiagnosticGroups.td         |  2 ++
 clang/lib/Driver/ToolChains/Clang.cpp                 | 10 +++++++---
 clang/test/Driver/frelaxed-template-template-args.cpp |  4 +++-
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6f7e54252150c6..089a85c8cb3659 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -51,7 +51,8 @@ C++ Specific Potentially Breaking Changes
 - The behavior controlled by the `-frelaxed-template-template-args` flag is now
   on by default, and the flag is deprecated. Until the flag is finally removed,
   it's negative spelling can be used to obtain compatibility with previous
-  versions of clang.
+  versions of clang. The deprecation warning for the negative spelling can be
+  disabled with `-Wno-deprecated-no-relaxed-template-template-args`.
 
 - Clang now rejects pointer to member from parenthesized expression in unevaluated context such as ``decltype(&(foo::bar))``. (#GH40906).
 
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 9781fcaa4ff5e9..9d97a75f696f66 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -436,6 +436,9 @@ def warn_drv_clang_unsupported : Warning<
   "the clang compiler does not support '%0'">;
 def warn_drv_deprecated_arg : Warning<
   "argument '%0' is deprecated%select{|, use '%2' instead}1">, InGroup<Deprecated>;
+def warn_drv_deprecated_arg_no_relaxed_template_template_args : Warning<
+  "argument '-fno-relaxed-template-template-args' is deprecated">,
+  InGroup<DeprecatedNoRelaxedTemplateTemplateArgs>;
 def warn_drv_deprecated_custom : Warning<
   "argument '%0' is deprecated, %1">, InGroup<Deprecated>;
 def warn_drv_assuming_mfloat_abi_is : Warning<
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 2beb1d45124b49..4cb4f3d999f7ab 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -104,6 +104,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
                                [EnumEnumConversion,
                                 EnumFloatConversion,
                                 EnumCompareConditional]>;
+def DeprecatedNoRelaxedTemplateTemplateArgs : DiagGroup<"deprecated-no-relaxed-template-template-args">;
 def ObjCSignedCharBoolImplicitIntConversion :
   DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
 def Shorten64To32 : DiagGroup<"shorten-64-to-32">;
@@ -228,6 +229,7 @@ def Deprecated : DiagGroup<"deprecated", [DeprecatedAnonEnumEnumConversion,
                                           DeprecatedLiteralOperator,
                                           DeprecatedPragma,
                                           DeprecatedRegister,
+                                          DeprecatedNoRelaxedTemplateTemplateArgs,
                                           DeprecatedThisCapture,
                                           DeprecatedType,
                                           DeprecatedVolatile,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 42feb1650574ed..c3e6d563f3bd21 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7253,10 +7253,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (Arg *A =
           Args.getLastArg(options::OPT_frelaxed_template_template_args,
                           options::OPT_fno_relaxed_template_template_args)) {
-    D.Diag(diag::warn_drv_deprecated_arg)
-        << A->getAsString(Args) << /*hasReplacement=*/false;
-    if (A->getOption().matches(options::OPT_fno_relaxed_template_template_args))
+    if (A->getOption().matches(
+            options::OPT_fno_relaxed_template_template_args)) {
+      D.Diag(diag::warn_drv_deprecated_arg_no_relaxed_template_template_args);
       CmdArgs.push_back("-fno-relaxed-template-template-args");
+    } else {
+      D.Diag(diag::warn_drv_deprecated_arg)
+          << A->getAsString(Args) << /*hasReplacement=*/false;
+    }
   }
 
   // -fsized-deallocation is off by default, as it is an ABI-breaking change for
diff --git a/clang/test/Driver/frelaxed-template-template-args.cpp b/clang/test/Driver/frelaxed-template-template-args.cpp
index 57fc4e3da6e5d0..7a7fd6f0bbc8f9 100644
--- a/clang/test/Driver/frelaxed-template-template-args.cpp
+++ b/clang/test/Driver/frelaxed-template-template-args.cpp
@@ -1,7 +1,9 @@
 // RUN: %clang -fsyntax-only -### %s 2>&1 | FileCheck --check-prefix=CHECK-DEF %s
 // RUN: %clang -fsyntax-only -frelaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-ON %s
 // RUN: %clang -fsyntax-only -fno-relaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-OFF %s
+// RUN: %clang -fsyntax-only -fno-relaxed-template-template-args -Wno-deprecated-no-relaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-DIS --allow-empty %s
 
 // CHECK-DEF-NOT: "-cc1"{{.*}} "-fno-relaxed-template-template-args"
 // CHECK-ON:  warning: argument '-frelaxed-template-template-args' is deprecated [-Wdeprecated]
-// CHECK-OFF: warning: argument '-fno-relaxed-template-template-args' is deprecated [-Wdeprecated]
+// CHECK-OFF: warning: argument '-fno-relaxed-template-template-args' is deprecated [-Wdeprecated-no-relaxed-template-template-args]
+// CHECK-DIS-NOT: warning: argument '-fno-relaxed-template-template-args' is deprecated

From 7605ad8a2f95e3b37de83e7fb3d320efc74e0ccc Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler@users.noreply.github.com>
Date: Wed, 15 May 2024 16:08:06 -0700
Subject: [PATCH 10/16] [flang] Always check procedure characterizability
 (#92008)

When a procedure is defined with a subprogram but never referenced in a
compilation unit, it may not be characterized until lowering, and any
errors in characterization then may crash the compiler. So always ensure
that procedure definitions are characterizable in declaration checking.

Fixes https://github.com/llvm/llvm-project/issues/91845.
---
 flang/lib/Semantics/check-declarations.cpp |  9 +++++++++
 flang/lib/Semantics/resolve-names.cpp      |  3 +--
 flang/test/Semantics/entry01.f90           |  2 ++
 flang/test/Semantics/resolve102.f90        | 23 +++++-----------------
 4 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index ce7870b8d54e4d..8d17989ac2793f 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -1357,6 +1357,15 @@ bool CheckHelper::IsResultOkToDiffer(const FunctionResult &result) {
 
 void CheckHelper::CheckSubprogram(
     const Symbol &symbol, const SubprogramDetails &details) {
+  // Evaluate a procedure definition's characteristics to flush out
+  // any errors that analysis might expose, in case this subprogram hasn't
+  // had any calls in this compilation unit that would have validated them.
+  if (!context_.HasError(symbol) && !details.isDummy() &&
+      !details.isInterface() && !details.stmtFunction()) {
+    if (!Procedure::Characterize(symbol, foldingContext_)) {
+      context_.SetError(symbol);
+    }
+  }
   if (const Symbol *iface{FindSeparateModuleSubprogramInterface(&symbol)}) {
     SubprogramMatchHelper{*this}.Check(symbol, *iface);
   }
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index e2875081b732c7..5626f2a8be97aa 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -5013,8 +5013,7 @@ bool DeclarationVisitor::HasCycle(
     if (procsInCycle.count(*interface) > 0) {
       for (const auto &procInCycle : procsInCycle) {
         Say(procInCycle->name(),
-            "The interface for procedure '%s' is recursively "
-            "defined"_err_en_US,
+            "The interface for procedure '%s' is recursively defined"_err_en_US,
             procInCycle->name());
         context().SetError(*procInCycle);
       }
diff --git a/flang/test/Semantics/entry01.f90 b/flang/test/Semantics/entry01.f90
index 970cd109921a15..765b18c2e81a8a 100644
--- a/flang/test/Semantics/entry01.f90
+++ b/flang/test/Semantics/entry01.f90
@@ -83,6 +83,7 @@ function ifunc()
   !ERROR: 'ibad1' is already declared in this scoping unit
   entry ibad1() result(ibad1res) ! C1570
   !ERROR: 'ibad2' is already declared in this scoping unit
+  !ERROR: Procedure 'ibad2' is referenced before being sufficiently defined in a context where it must be so
   entry ibad2()
   !ERROR: ENTRY in a function may not have an alternate return dummy argument
   entry ibadalt(*) ! C1573
@@ -91,6 +92,7 @@ function ifunc()
   entry iok()
   !ERROR: Explicit RESULT('iok') of function 'isameres2' cannot have the same name as a distinct ENTRY into the same scope
   entry isameres2() result(iok) ! C1574
+  !ERROR: Procedure 'iok2' is referenced before being sufficiently defined in a context where it must be so
   !ERROR: Explicit RESULT('iok2') of function 'isameres3' cannot have the same name as a distinct ENTRY into the same scope
   entry isameres3() result(iok2) ! C1574
   !ERROR: 'iok2' is already declared in this scoping unit
diff --git a/flang/test/Semantics/resolve102.f90 b/flang/test/Semantics/resolve102.f90
index 8f6e2246a57e79..33cf6fa245ea49 100644
--- a/flang/test/Semantics/resolve102.f90
+++ b/flang/test/Semantics/resolve102.f90
@@ -4,17 +4,12 @@
 !ERROR: Procedure 'sub' is recursively defined.  Procedures in the cycle: 'sub', 'p2'
 subroutine sub(p2)
   PROCEDURE(sub) :: p2
-
-  call sub()
 end subroutine
 
 subroutine circular
-  !ERROR: Procedure 'p' is recursively defined.  Procedures in the cycle: 'p', 'sub', 'p2'
   procedure(sub) :: p
-
-  call p(sub)
-
   contains
+    !ERROR: Procedure 'sub' is recursively defined.  Procedures in the cycle: 'p', 'sub', 'p2'
     subroutine sub(p2)
       procedure(p) :: p2
     end subroutine
@@ -41,11 +36,10 @@ subroutine sub(p2)
 
 subroutine mutual
   Procedure(sub1) :: p
-
-  Call p(sub)
-
   contains
     !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'p', 'sub1', 'arg'
+    !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'sub1', 'arg', 'sub', 'p2'
+    !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'sub1', 'arg'
     Subroutine sub1(arg)
       procedure(sub1) :: arg
     End Subroutine
@@ -57,15 +51,14 @@ Subroutine sub(p2)
 
 subroutine mutual1
   Procedure(sub1) :: p
-
-  Call p(sub)
-
   contains
     !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'p', 'sub1', 'arg', 'sub', 'p2'
+    !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'sub1', 'arg', 'sub', 'p2'
     Subroutine sub1(arg)
       procedure(sub) :: arg
     End Subroutine
 
+    !ERROR: Procedure 'sub' is recursively defined.  Procedures in the cycle: 'sub1', 'arg', 'sub', 'p2'
     Subroutine sub(p2)
       Procedure(sub1) :: p2
     End Subroutine
@@ -76,8 +69,6 @@ subroutine twoCycle
   !ERROR: The interface for procedure 'p2' is recursively defined
   procedure(p1) p2
   procedure(p2) p1
-  call p1
-  call p2
 end subroutine
 
 subroutine threeCycle
@@ -87,9 +78,6 @@ subroutine threeCycle
   !ERROR: The interface for procedure 'p3' is recursively defined
   procedure(p2) p3
   procedure(p3) p1
-  call p1
-  call p2
-  call p3
 end subroutine
 
 module mutualSpecExprs
@@ -118,4 +106,3 @@ function ifunc(x)
     ifunc = x
   end
 end
-

From 463f58a564a8d136b3e5d56d23bb86b99ab75245 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler@users.noreply.github.com>
Date: Wed, 15 May 2024 16:18:47 -0700
Subject: [PATCH 11/16] [flang] Further work on relaxing BIND(C) enforcement
 (#92029)

When a BIND(C) interface or subprogram has a dummy argument whose
derived type is not BIND(C) but meets the constraints and requirements
of a BIND(C) type, accept it with a warning.
---
 flang/lib/Semantics/check-declarations.cpp | 16 +++++---
 flang/test/Semantics/bind-c15.f90          | 45 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Semantics/bind-c15.f90

diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 8d17989ac2793f..527a1a9539aa6c 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2891,7 +2891,8 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
       } else {
         bool interoperableParent{true};
         if (parent->symbol()) {
-          auto bad{WhyNotInteroperableDerivedType(*parent->symbol(), false)};
+          auto bad{WhyNotInteroperableDerivedType(
+              *parent->symbol(), /*isError=*/false)};
           if (bad.AnyFatalError()) {
             auto &msg{msgs.Say(symbol.name(),
                 "The parent of an interoperable type is not interoperable"_err_en_US)};
@@ -2981,6 +2982,9 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
       }
     }
   }
+  if (msgs.AnyFatalError()) {
+    examinedByWhyNotInteroperableDerivedType_.erase(symbol);
+  }
   return msgs;
 }
 
@@ -3068,8 +3072,8 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
           }
           context_.SetError(symbol);
         } else if (auto bad{WhyNotInteroperableDerivedType(
-                       derived->typeSymbol(), false)};
-                   !bad.empty()) {
+                       derived->typeSymbol(), /*isError=*/false)};
+                   bad.AnyFatalError()) {
           if (auto *msg{messages_.Say(symbol.name(),
                   "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)}) {
             msg->Attach(
@@ -3077,7 +3081,9 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
             bad.AttachTo(*msg, parser::Severity::None);
           }
           context_.SetError(symbol);
-        } else {
+        } else if (context_.ShouldWarn(
+                       common::LanguageFeature::NonBindCInteroperability) &&
+            !InModuleFile()) {
           if (auto *msg{messages_.Say(symbol.name(),
                   "The derived type of an interoperable object should be BIND(C)"_warn_en_US)}) {
             msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
@@ -3151,7 +3157,7 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
       }
     }
   } else if (symbol.has<DerivedTypeDetails>()) {
-    if (auto msgs{WhyNotInteroperableDerivedType(symbol, false)};
+    if (auto msgs{WhyNotInteroperableDerivedType(symbol, /*isError=*/false)};
         !msgs.empty()) {
       bool anyFatal{msgs.AnyFatalError()};
       if (msgs.AnyFatalError() ||
diff --git a/flang/test/Semantics/bind-c15.f90 b/flang/test/Semantics/bind-c15.f90
new file mode 100644
index 00000000000000..9aaad52cc0e0a4
--- /dev/null
+++ b/flang/test/Semantics/bind-c15.f90
@@ -0,0 +1,45 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
+
+module m
+  type, bind(c) :: explicit_bind_c
+    real a
+  end type
+  type :: interoperable1
+    type(explicit_bind_c) a
+  end type
+  type, extends(interoperable1) :: interoperable2
+    real b
+  end type
+  type :: non_interoperable1
+    real, allocatable :: a
+  end type
+  type :: non_interoperable2
+    type(non_interoperable1) b
+  end type
+  interface
+    subroutine sub_bind_c_1(x_bind_c) bind(c)
+      import explicit_bind_c
+      type(explicit_bind_c), intent(in) :: x_bind_c
+    end
+    subroutine sub_bind_c_2(x_interop1) bind(c)
+      import interoperable1
+      !WARNING: The derived type of an interoperable object should be BIND(C)
+      type(interoperable1), intent(in) :: x_interop1
+    end
+    subroutine sub_bind_c_3(x_interop2) bind(c)
+      import interoperable2
+      !WARNING: The derived type of an interoperable object should be BIND(C)
+      type(interoperable2), intent(in) :: x_interop2
+    end
+    subroutine sub_bind_c_4(x_non_interop1) bind(c)
+      import non_interoperable1
+      !ERROR: The derived type of an interoperable object must be interoperable, but is not
+      type(non_interoperable1), intent(in) :: x_non_interop1
+    end
+    subroutine sub_bind_c_5(x_non_interop2) bind(c)
+      import non_interoperable2
+      !ERROR: The derived type of an interoperable object must be interoperable, but is not
+      type(non_interoperable2), intent(in) :: x_non_interop2
+    end
+  end interface
+end

From 5bbb63bd6d6d3929de643fcd88babbda20c97b69 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler@users.noreply.github.com>
Date: Wed, 15 May 2024 16:28:58 -0700
Subject: [PATCH 12/16] [flang] Parse REDUCE clauses in !$CUF KERNEL DO
 (#92154)

A !$CUF KERNEL DO directive is allowed to have advisory REDUCE clauses
similar to those in OpenACC and DO CONCURRENT. Parse and represent them.
Semantic validation will follow.
---
 flang/include/flang/Parser/dump-parse-tree.h |  1 +
 flang/include/flang/Parser/parse-tree.h      | 18 ++++-
 flang/lib/Parser/executable-parsers.cpp      | 23 +++++--
 flang/lib/Parser/openacc-parsers.cpp         |  6 +-
 flang/lib/Parser/unparse.cpp                 | 36 +++++++++-
 flang/lib/Semantics/check-cuda.cpp           | 44 ++++++++++++
 flang/lib/Semantics/resolve-directives.h     |  2 +-
 flang/lib/Semantics/resolve-names.cpp        |  2 +-
 flang/test/Parser/cuf-sanity-common          |  7 ++
 flang/test/Parser/cuf-sanity-unparse.CUF     |  6 ++
 flang/test/Semantics/reduce.cuf              | 72 ++++++++++++++++++++
 11 files changed, 199 insertions(+), 18 deletions(-)
 create mode 100644 flang/test/Semantics/reduce.cuf

diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 477d391277ee25..68ae50c312cdeb 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -236,6 +236,7 @@ class ParseTreeDumper {
   NODE(parser, CUFKernelDoConstruct)
   NODE(CUFKernelDoConstruct, StarOrExpr)
   NODE(CUFKernelDoConstruct, Directive)
+  NODE(parser, CUFReduction)
   NODE(parser, CycleStmt)
   NODE(parser, DataComponentDefStmt)
   NODE(parser, DataIDoObject)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index c0635445837904..0a40aa8b8f616f 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4303,12 +4303,23 @@ struct OpenACCConstruct {
 };
 
 // CUF-kernel-do-construct ->
-//     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
-//     >>> do-construct
+//   !$CUF KERNEL DO [ (scalar-int-constant-expr) ]
+//      <<< grid, block [, stream] >>>
+//      [ cuf-reduction... ]
+//      do-construct
 // star-or-expr -> * | scalar-int-expr
 // grid -> * | scalar-int-expr | ( star-or-expr-list )
 // block -> * | scalar-int-expr | ( star-or-expr-list )
 // stream -> 0, scalar-int-expr | STREAM = scalar-int-expr
+// cuf-reduction -> [ REDUCE | REDUCTION ] (
+//                  acc-reduction-op : scalar-variable-list )
+
+struct CUFReduction {
+  TUPLE_CLASS_BOILERPLATE(CUFReduction);
+  using Operator = AccReductionOperator;
+  std::tuple<Operator, std::list<Scalar<Variable>>> t;
+};
+
 struct CUFKernelDoConstruct {
   TUPLE_CLASS_BOILERPLATE(CUFKernelDoConstruct);
   WRAPPER_CLASS(StarOrExpr, std::optional<ScalarIntExpr>);
@@ -4316,7 +4327,8 @@ struct CUFKernelDoConstruct {
     TUPLE_CLASS_BOILERPLATE(Directive);
     CharBlock source;
     std::tuple<std::optional<ScalarIntConstantExpr>, std::list<StarOrExpr>,
-        std::list<StarOrExpr>, std::optional<ScalarIntExpr>>
+        std::list<StarOrExpr>, std::optional<ScalarIntExpr>,
+        std::list<CUFReduction>>
         t;
   };
   std::tuple<Directive, std::optional<DoConstruct>> t;
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index 07a570bd61e990..382a593416872a 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -538,25 +538,34 @@ TYPE_CONTEXT_PARSER("UNLOCK statement"_en_US,
     construct<UnlockStmt>("UNLOCK (" >> lockVariable,
         defaulted("," >> nonemptyList(statOrErrmsg)) / ")"))
 
-// CUF-kernel-do-construct -> CUF-kernel-do-directive do-construct
-// CUF-kernel-do-directive ->
-//     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
-//     >>> do-construct
+// CUF-kernel-do-construct ->
+//   !$CUF KERNEL DO [ (scalar-int-constant-expr) ]
+//      <<< grid, block [, stream] >>>
+//      [ cuf-reduction... ]
+//      do-construct
 // star-or-expr -> * | scalar-int-expr
 // grid -> * | scalar-int-expr | ( star-or-expr-list )
 // block -> * | scalar-int-expr | ( star-or-expr-list )
-// stream -> ( 0, | STREAM = ) scalar-int-expr
+// stream -> 0, scalar-int-expr | STREAM = scalar-int-expr
+// cuf-reduction -> [ REDUCTION | REDUCE ] (
+//                  acc-reduction-op : scalar-variable-list )
+
 constexpr auto starOrExpr{construct<CUFKernelDoConstruct::StarOrExpr>(
     "*" >> pure<std::optional<ScalarIntExpr>>() ||
     applyFunction(presentOptional<ScalarIntExpr>, scalarIntExpr))};
 constexpr auto gridOrBlock{parenthesized(nonemptyList(starOrExpr)) ||
     applyFunction(singletonList<CUFKernelDoConstruct::StarOrExpr>, starOrExpr)};
+
+TYPE_PARSER(("REDUCTION"_tok || "REDUCE"_tok) >>
+    parenthesized(construct<CUFReduction>(Parser<CUFReduction::Operator>{},
+        ":" >> nonemptyList(scalar(variable)))))
+
 TYPE_PARSER(sourced(beginDirective >> "$CUF KERNEL DO"_tok >>
     construct<CUFKernelDoConstruct::Directive>(
         maybe(parenthesized(scalarIntConstantExpr)), "<<<" >> gridOrBlock,
         "," >> gridOrBlock,
-        maybe((", 0 ,"_tok || ", STREAM ="_tok) >> scalarIntExpr) / ">>>" /
-            endDirective)))
+        maybe((", 0 ,"_tok || ", STREAM ="_tok) >> scalarIntExpr) / ">>>",
+        many(Parser<CUFReduction>{}) / endDirective)))
 TYPE_CONTEXT_PARSER("!$CUF KERNEL DO construct"_en_US,
     extension<LanguageFeature::CUDA>(construct<CUFKernelDoConstruct>(
         Parser<CUFKernelDoConstruct::Directive>{},
diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp
index 946b33d0084a96..3d919e29a24826 100644
--- a/flang/lib/Parser/openacc-parsers.cpp
+++ b/flang/lib/Parser/openacc-parsers.cpp
@@ -19,9 +19,9 @@
 // OpenACC Directives and Clauses
 namespace Fortran::parser {
 
-constexpr auto startAccLine = skipStuffBeforeStatement >>
-    ("!$ACC "_sptok || "C$ACC "_sptok || "*$ACC "_sptok);
-constexpr auto endAccLine = space >> endOfLine;
+constexpr auto startAccLine{skipStuffBeforeStatement >>
+    ("!$ACC "_sptok || "C$ACC "_sptok || "*$ACC "_sptok)};
+constexpr auto endAccLine{space >> endOfLine};
 
 // Autogenerated clauses parser. Information is taken from ACC.td and the
 // parser is generated by tablegen.
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 3398b395f198f8..1639e900903fe9 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2705,7 +2705,6 @@ class UnparseVisitor {
   void Unparse(const CLASS::ENUM &x) { Word(CLASS::EnumToString(x)); }
   WALK_NESTED_ENUM(AccDataModifier, Modifier)
   WALK_NESTED_ENUM(AccessSpec, Kind) // R807
-  WALK_NESTED_ENUM(AccReductionOperator, Operator)
   WALK_NESTED_ENUM(common, TypeParamAttr) // R734
   WALK_NESTED_ENUM(common, CUDADataAttr) // CUDA
   WALK_NESTED_ENUM(common, CUDASubprogramAttrs) // CUDA
@@ -2736,6 +2735,31 @@ class UnparseVisitor {
   WALK_NESTED_ENUM(OmpOrderClause, Type) // OMP order-type
   WALK_NESTED_ENUM(OmpOrderModifier, Kind) // OMP order-modifier
 #undef WALK_NESTED_ENUM
+  void Unparse(const AccReductionOperator::Operator x) {
+    switch (x) {
+    case AccReductionOperator::Operator::Plus:
+      Word("+");
+      break;
+    case AccReductionOperator::Operator::Multiply:
+      Word("*");
+      break;
+    case AccReductionOperator::Operator::And:
+      Word(".AND.");
+      break;
+    case AccReductionOperator::Operator::Or:
+      Word(".OR.");
+      break;
+    case AccReductionOperator::Operator::Eqv:
+      Word(".EQV.");
+      break;
+    case AccReductionOperator::Operator::Neqv:
+      Word(".NEQV.");
+      break;
+    default:
+      Word(AccReductionOperator::EnumToString(x));
+      break;
+    }
+  }
 
   void Unparse(const CUFKernelDoConstruct::StarOrExpr &x) {
     if (x.v) {
@@ -2768,13 +2792,19 @@ class UnparseVisitor {
     if (const auto &stream{std::get<3>(x.t)}) {
       Word(",STREAM="), Walk(*stream);
     }
-    Word(">>>\n");
+    Word(">>>");
+    Walk(" ", std::get<std::list<CUFReduction>>(x.t), " ");
+    Word("\n");
   }
-
   void Unparse(const CUFKernelDoConstruct &x) {
     Walk(std::get<CUFKernelDoConstruct::Directive>(x.t));
     Walk(std::get<std::optional<DoConstruct>>(x.t));
   }
+  void Unparse(const CUFReduction &x) {
+    Word("REDUCE(");
+    Walk(std::get<CUFReduction::Operator>(x.t));
+    Walk(":", std::get<std::list<Scalar<Variable>>>(x.t), ",", ")");
+  }
 
   void Done() const { CHECK(indent_ == 0); }
 
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 96ab9023926330..45217ed2e3ccd2 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -463,6 +463,46 @@ static int DoConstructTightNesting(
   return 1;
 }
 
+static void CheckReduce(
+    SemanticsContext &context, const parser::CUFReduction &reduce) {
+  auto op{std::get<parser::CUFReduction::Operator>(reduce.t).v};
+  for (const auto &var :
+      std::get<std::list<parser::Scalar<parser::Variable>>>(reduce.t)) {
+    if (const auto &typedExprPtr{var.thing.typedExpr};
+        typedExprPtr && typedExprPtr->v) {
+      const auto &expr{*typedExprPtr->v};
+      if (auto type{expr.GetType()}) {
+        auto cat{type->category()};
+        bool isOk{false};
+        switch (op) {
+        case parser::AccReductionOperator::Operator::Plus:
+        case parser::AccReductionOperator::Operator::Multiply:
+        case parser::AccReductionOperator::Operator::Max:
+        case parser::AccReductionOperator::Operator::Min:
+          isOk = cat == TypeCategory::Integer || cat == TypeCategory::Real;
+          break;
+        case parser::AccReductionOperator::Operator::Iand:
+        case parser::AccReductionOperator::Operator::Ior:
+        case parser::AccReductionOperator::Operator::Ieor:
+          isOk = cat == TypeCategory::Integer;
+          break;
+        case parser::AccReductionOperator::Operator::And:
+        case parser::AccReductionOperator::Operator::Or:
+        case parser::AccReductionOperator::Operator::Eqv:
+        case parser::AccReductionOperator::Operator::Neqv:
+          isOk = cat == TypeCategory::Logical;
+          break;
+        }
+        if (!isOk) {
+          context.Say(var.thing.GetSource(),
+              "!$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type %s"_err_en_US,
+              type->AsFortran());
+        }
+      }
+    }
+  }
+}
+
 void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
   auto source{std::get<parser::CUFKernelDoConstruct::Directive>(x.t).source};
   const auto &directive{std::get<parser::CUFKernelDoConstruct::Directive>(x.t)};
@@ -489,6 +529,10 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
   if (innerBlock) {
     DeviceContextChecker<true>{context_}.Check(*innerBlock);
   }
+  for (const auto &reduce :
+      std::get<std::list<parser::CUFReduction>>(directive.t)) {
+    CheckReduce(context_, reduce);
+  }
 }
 
 void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
diff --git a/flang/lib/Semantics/resolve-directives.h b/flang/lib/Semantics/resolve-directives.h
index 4aef8ad6c40081..5a890c26aa3349 100644
--- a/flang/lib/Semantics/resolve-directives.h
+++ b/flang/lib/Semantics/resolve-directives.h
@@ -21,7 +21,7 @@ class SemanticsContext;
 
 // Name resolution for OpenACC and OpenMP directives
 void ResolveAccParts(
-    SemanticsContext &, const parser::ProgramUnit &, Scope *topScope = {});
+    SemanticsContext &, const parser::ProgramUnit &, Scope *topScope);
 void ResolveOmpParts(SemanticsContext &, const parser::ProgramUnit &);
 void ResolveOmpTopLevelParts(SemanticsContext &, const parser::Program &);
 
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 5626f2a8be97aa..40eee89de131a3 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -8940,7 +8940,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) {
   FinishSpecificationParts(root);
   ResolveExecutionParts(root);
   FinishExecutionParts(root);
-  ResolveAccParts(context(), x);
+  ResolveAccParts(context(), x, /*topScope=*/nullptr);
   ResolveOmpParts(context(), x);
   return false;
 }
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index b097a6aa300458..9d73204e3f5f6c 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -23,12 +23,19 @@ module m
   end subroutine
   subroutine test
     logical isPinned
+    real a(10), x, y, z
     !$cuf kernel do(1) <<<*, *, stream = 1>>>
     do j = 1, 10
     end do
     !$cuf kernel do <<<1, (2, 3), stream = 1>>>
     do j = 1, 10
     end do
+    !$cuf kernel do <<<*, *>>> reduce(+:x,y) reduce(*:z)
+    do j = 1, 10
+      x = x + a(j)
+      y = y + a(j)
+      z = z * a(j)
+    end do
     call globalsub<<<1, 2>>>
     call globalsub<<<1, 2, 3>>>
     call globalsub<<<1, 2, 3, 4>>>
diff --git a/flang/test/Parser/cuf-sanity-unparse.CUF b/flang/test/Parser/cuf-sanity-unparse.CUF
index b6921e74fc05ae..d4be347dd044ea 100644
--- a/flang/test/Parser/cuf-sanity-unparse.CUF
+++ b/flang/test/Parser/cuf-sanity-unparse.CUF
@@ -34,6 +34,12 @@ include "cuf-sanity-common"
 !CHECK:   !$CUF KERNEL DO <<<1_4,(2_4,3_4),STREAM=1_4>>>
 !CHECK:   DO j=1_4,10_4
 !CHECK:   END DO
+!CHECK:   !$CUF KERNEL DO <<<*,*>>> REDUCE(+:x,y) REDUCE(*:z)
+!CHECK:   DO j=1_4,10_4
+!CHECK:    x=x+a(int(j,kind=8))
+!CHECK:    y=y+a(int(j,kind=8))
+!CHECK:    z=z*a(int(j,kind=8))
+!CHECK:   END DO
 !CHECK:    CALL globalsub<<<1_4,2_4>>>()
 !CHECK:    CALL globalsub<<<1_4,2_4,3_4>>>()
 !CHECK:    CALL globalsub<<<1_4,2_4,3_4,4_4>>>()
diff --git a/flang/test/Semantics/reduce.cuf b/flang/test/Semantics/reduce.cuf
new file mode 100644
index 00000000000000..95ff2e87c09b47
--- /dev/null
+++ b/flang/test/Semantics/reduce.cuf
@@ -0,0 +1,72 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+subroutine s(n,m,a,l)
+  integer, intent(in) :: n
+  integer, intent(in) :: m(n)
+  real, intent(in) :: a(n)
+  logical, intent(in) :: l(n)
+  integer j, mr
+  real ar
+  logical lr
+!$cuf kernel do <<<*,*>>> reduce (+:mr,ar)
+  do j=1,n; mr = mr + m(j); ar = ar + a(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (+:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (*:mr,ar)
+  do j=1,n; mr = mr * m(j); ar = ar * a(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (*:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (max:mr,ar)
+  do j=1,n; mr = max(mr,m(j)); ar = max(ar,a(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (max:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (min:mr,ar)
+  do j=1,n; mr = min(mr,m(j)); ar = min(ar,a(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (min:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (iand:mr)
+  do j=1,n; mr = iand(mr,m(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (iand:ar,lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (ieor:mr)
+  do j=1,n; mr = ieor(mr,m(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (ieor:ar,lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (ior:mr)
+  do j=1,n; mr = ior(mr,m(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (ior:ar,lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.and.:lr)
+  do j=1,n; lr = lr .and. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.and.:mr,ar)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.eqv.:lr)
+  do j=1,n; lr = lr .eqv. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.eqv.:mr,ar)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.neqv.:lr)
+  do j=1,n; lr = lr .neqv. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.neqv.:mr,ar)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.or.:lr)
+  do j=1,n; lr = lr .or. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.or.:mr,ar)
+  do j=1,n; end do
+end

From 3ddfb6807e905868a3a9df71fa5ea87309181270 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler@users.noreply.github.com>
Date: Wed, 15 May 2024 16:41:12 -0700
Subject: [PATCH 13/16] [flang] Prevent crash from unfoldable TRANSFER()
 (#92282)

When the MOLD= argument's type is polymorphic, the type of the result
cannot be known at compilation time, so the call cannot be folded even
when the SOURCE= is constant.

Fixes https://github.com/llvm/llvm-project/issues/92264.
---
 flang/lib/Evaluate/fold.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Evaluate/fold.cpp b/flang/lib/Evaluate/fold.cpp
index ed882958199802..cf6262d9a7c65a 100644
--- a/flang/lib/Evaluate/fold.cpp
+++ b/flang/lib/Evaluate/fold.cpp
@@ -272,6 +272,7 @@ std::optional<Expr<SomeType>> FoldTransfer(
     }
   }
   if (sourceBytes && IsActuallyConstant(*source) && moldType && extents &&
+      !moldType->IsPolymorphic() &&
       (moldLength || moldType->category() != TypeCategory::Character)) {
     std::size_t elements{
         extents->empty() ? 1 : static_cast<std::size_t>((*extents)[0])};

From c87b1ca4edefe3c267a20f28eaf79f6b83d36c66 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag@gmail.com>
Date: Wed, 15 May 2024 18:41:25 -0500
Subject: [PATCH 14/16] [InstrProf] Fix bug when clearing traces with samples
 (#92310)

The `--temporal-profile-max-trace-length=0` flag in the `llvm-profdata
merge` command is used to remove traces from a profile. There was a bug
where traces would not be cleared if the profile was already sampled.
This patch fixes that.
---
 llvm/lib/ProfileData/InstrProfWriter.cpp           | 11 ++++++-----
 llvm/test/tools/llvm-profdata/trace-limit.proftext |  6 +++++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index b61c59aacc0f95..c941b9d89df388 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -320,11 +320,8 @@ void InstrProfWriter::addBinaryIds(ArrayRef<llvm::object::BuildID> BIs) {
 }
 
 void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
-  if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
-    Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
-  if (Trace.FunctionNameRefs.empty())
-    return;
-
+  assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength);
+  assert(!Trace.FunctionNameRefs.empty());
   if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) {
     // Simply append the trace if we have not yet hit our reservoir size limit.
     TemporalProfTraces.push_back(std::move(Trace));
@@ -341,6 +338,10 @@ void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
 
 void InstrProfWriter::addTemporalProfileTraces(
     SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) {
+  for (auto &Trace : SrcTraces)
+    if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
+      Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
+  llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); });
   // Assume that the source has the same reservoir size as the destination to
   // avoid needing to record it in the indexed profile format.
   bool IsDestSampled =
diff --git a/llvm/test/tools/llvm-profdata/trace-limit.proftext b/llvm/test/tools/llvm-profdata/trace-limit.proftext
index cf6edd648b23b2..e246ee890ba313 100644
--- a/llvm/test/tools/llvm-profdata/trace-limit.proftext
+++ b/llvm/test/tools/llvm-profdata/trace-limit.proftext
@@ -1,13 +1,17 @@
 # RUN: llvm-profdata merge --temporal-profile-max-trace-length=0 %s -o %t.profdata
 # RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefix=NONE
 
+# RUN: llvm-profdata merge --temporal-profile-trace-reservoir-size=2 %s %s %s %s -o %t.profdata
+# RUN: llvm-profdata merge --temporal-profile-trace-reservoir-size=2 --temporal-profile-max-trace-length=0 %t.profdata -o %t.profdata
+# RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefix=NONE
+
 # RUN: llvm-profdata merge --temporal-profile-max-trace-length=2 %s -o %t.profdata
 # RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefixes=CHECK,SOME
 
 # RUN: llvm-profdata merge --temporal-profile-max-trace-length=1000 %s -o %t.profdata
 # RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefixes=CHECK,ALL
 
-# NONE: Temporal Profile Traces (samples=0 seen=0):
+# NONE: Temporal Profile Traces (samples=0
 # CHECK: Temporal Profile Traces (samples=1 seen=1):
 # SOME:   Trace 0 (weight=1 count=2):
 # ALL:    Trace 0 (weight=1 count=3):

From c00e012bcf5da384a3e7339dc2e046779b339063 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 15 May 2024 17:03:09 -0700
Subject: [PATCH 15/16] [ctx_profile] Follow the pattern elsewhere for choosing
 the block IDs

This was an oversight in #91859. Using the subblock ID mechanism
other places that use the bitstream APIs (e.g. `BitstreamRemarkSerializer`) use.
---
 llvm/include/llvm/ProfileData/PGOCtxProfWriter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index 15578c51a49578..edcf02c0946971 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
 #define LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
 
+#include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/Bitstream/BitstreamWriter.h"
 #include "llvm/ProfileData/CtxInstrContextNode.h"
 
@@ -20,7 +21,7 @@ namespace llvm {
 enum PGOCtxProfileRecords { Invalid = 0, Version, Guid, CalleeIndex, Counters };
 
 enum PGOCtxProfileBlockIDs {
-  ProfileMetadataBlockID = 100,
+  ProfileMetadataBlockID = bitc::FIRST_APPLICATION_BLOCKID,
   ContextNodeBlockID = ProfileMetadataBlockID + 1
 };
 

From 772b1b0cb26c66804d0a7e416dc7a5742b7f8db2 Mon Sep 17 00:00:00 2001
From: ChiaHungDuan <chiahungduan@google.com>
Date: Wed, 15 May 2024 17:13:08 -0700
Subject: [PATCH 16/16] [scudo] Move the chunk update into functions (#83493)

The code paths for mte enabled and disabled were interleaving and which
increases the difficulty of reading each path in both source level and
assembly level. In this change, we move the parts that they have
different logic into functions and minor refactors on the code
structure.
---
 compiler-rt/lib/scudo/standalone/combined.h | 371 ++++++++++++--------
 1 file changed, 221 insertions(+), 150 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 927513dea92dab..15a199ae0349ba 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -410,133 +410,18 @@ class Allocator {
       reportOutOfMemory(NeededSize);
     }
 
-    const uptr BlockUptr = reinterpret_cast<uptr>(Block);
-    const uptr UnalignedUserPtr = BlockUptr + Chunk::getHeaderSize();
-    const uptr UserPtr = roundUp(UnalignedUserPtr, Alignment);
-
-    void *Ptr = reinterpret_cast<void *>(UserPtr);
-    void *TaggedPtr = Ptr;
-    if (LIKELY(ClassId)) {
-      // We only need to zero or tag the contents for Primary backed
-      // allocations. We only set tags for primary allocations in order to avoid
-      // faulting potentially large numbers of pages for large secondary
-      // allocations. We assume that guard pages are enough to protect these
-      // allocations.
-      //
-      // FIXME: When the kernel provides a way to set the background tag of a
-      // mapping, we should be able to tag secondary allocations as well.
-      //
-      // When memory tagging is enabled, zeroing the contents is done as part of
-      // setting the tag.
-      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
-        uptr PrevUserPtr;
-        Chunk::UnpackedHeader Header;
-        const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId);
-        const uptr BlockEnd = BlockUptr + BlockSize;
-        // If possible, try to reuse the UAF tag that was set by deallocate().
-        // For simplicity, only reuse tags if we have the same start address as
-        // the previous allocation. This handles the majority of cases since
-        // most allocations will not be more aligned than the minimum alignment.
-        //
-        // We need to handle situations involving reclaimed chunks, and retag
-        // the reclaimed portions if necessary. In the case where the chunk is
-        // fully reclaimed, the chunk's header will be zero, which will trigger
-        // the code path for new mappings and invalid chunks that prepares the
-        // chunk from scratch. There are three possibilities for partial
-        // reclaiming:
-        //
-        // (1) Header was reclaimed, data was partially reclaimed.
-        // (2) Header was not reclaimed, all data was reclaimed (e.g. because
-        //     data started on a page boundary).
-        // (3) Header was not reclaimed, data was partially reclaimed.
-        //
-        // Case (1) will be handled in the same way as for full reclaiming,
-        // since the header will be zero.
-        //
-        // We can detect case (2) by loading the tag from the start
-        // of the chunk. If it is zero, it means that either all data was
-        // reclaimed (since we never use zero as the chunk tag), or that the
-        // previous allocation was of size zero. Either way, we need to prepare
-        // a new chunk from scratch.
-        //
-        // We can detect case (3) by moving to the next page (if covered by the
-        // chunk) and loading the tag of its first granule. If it is zero, it
-        // means that all following pages may need to be retagged. On the other
-        // hand, if it is nonzero, we can assume that all following pages are
-        // still tagged, according to the logic that if any of the pages
-        // following the next page were reclaimed, the next page would have been
-        // reclaimed as well.
-        uptr TaggedUserPtr;
-        if (getChunkFromBlock(BlockUptr, &PrevUserPtr, &Header) &&
-            PrevUserPtr == UserPtr &&
-            (TaggedUserPtr = loadTag(UserPtr)) != UserPtr) {
-          uptr PrevEnd = TaggedUserPtr + Header.SizeOrUnusedBytes;
-          const uptr NextPage = roundUp(TaggedUserPtr, getPageSizeCached());
-          if (NextPage < PrevEnd && loadTag(NextPage) != NextPage)
-            PrevEnd = NextPage;
-          TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
-          resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, Size, BlockEnd);
-          if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) {
-            // If an allocation needs to be zeroed (i.e. calloc) we can normally
-            // avoid zeroing the memory now since we can rely on memory having
-            // been zeroed on free, as this is normally done while setting the
-            // UAF tag. But if tagging was disabled per-thread when the memory
-            // was freed, it would not have been retagged and thus zeroed, and
-            // therefore it needs to be zeroed now.
-            memset(TaggedPtr, 0,
-                   Min(Size, roundUp(PrevEnd - TaggedUserPtr,
-                                     archMemoryTagGranuleSize())));
-          } else if (Size) {
-            // Clear any stack metadata that may have previously been stored in
-            // the chunk data.
-            memset(TaggedPtr, 0, archMemoryTagGranuleSize());
-          }
-        } else {
-          const uptr OddEvenMask =
-              computeOddEvenMaskForPointerMaybe(Options, BlockUptr, ClassId);
-          TaggedPtr = prepareTaggedChunk(Ptr, Size, OddEvenMask, BlockEnd);
-        }
-        storePrimaryAllocationStackMaybe(Options, Ptr);
-      } else {
-        Block = addHeaderTag(Block);
-        Ptr = addHeaderTag(Ptr);
-        if (UNLIKELY(FillContents != NoFill)) {
-          // This condition is not necessarily unlikely, but since memset is
-          // costly, we might as well mark it as such.
-          memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte,
-                 PrimaryT::getSizeByClassId(ClassId));
-        }
-      }
-    } else {
-      Block = addHeaderTag(Block);
-      Ptr = addHeaderTag(Ptr);
-      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
-        storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr));
-        storeSecondaryAllocationStackMaybe(Options, Ptr, Size);
-      }
+    const uptr UserPtr = roundUp(
+        reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize(), Alignment);
+    const uptr SizeOrUnusedBytes =
+        ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size);
+
+    if (LIKELY(!useMemoryTagging<AllocatorConfig>(Options))) {
+      return initChunk(ClassId, Origin, Block, UserPtr, SizeOrUnusedBytes,
+                       FillContents);
     }
 
-    Chunk::UnpackedHeader Header = {};
-    if (UNLIKELY(UnalignedUserPtr != UserPtr)) {
-      const uptr Offset = UserPtr - UnalignedUserPtr;
-      DCHECK_GE(Offset, 2 * sizeof(u32));
-      // The BlockMarker has no security purpose, but is specifically meant for
-      // the chunk iteration function that can be used in debugging situations.
-      // It is the only situation where we have to locate the start of a chunk
-      // based on its block address.
-      reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
-      reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
-      Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
-    }
-    Header.ClassId = ClassId & Chunk::ClassIdMask;
-    Header.State = Chunk::State::Allocated;
-    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
-    Header.SizeOrUnusedBytes =
-        (ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size)) &
-        Chunk::SizeOrUnusedBytesMask;
-    Chunk::storeHeader(Cookie, Ptr, &Header);
-
-    return TaggedPtr;
+    return initChunkWithMemoryTagging(ClassId, Origin, Block, UserPtr, Size,
+                                      SizeOrUnusedBytes, FillContents);
   }
 
   NOINLINE void deallocate(void *Ptr, Chunk::Origin Origin, uptr DeleteSize = 0,
@@ -1163,6 +1048,175 @@ class Allocator {
            reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes;
   }
 
+  ALWAYS_INLINE void *initChunk(const uptr ClassId, const Chunk::Origin Origin,
+                                void *Block, const uptr UserPtr,
+                                const uptr SizeOrUnusedBytes,
+                                const FillContentsMode FillContents) {
+    Block = addHeaderTag(Block);
+    // Only do content fill when it's from primary allocator because secondary
+    // allocator has filled the content.
+    if (ClassId != 0 && UNLIKELY(FillContents != NoFill)) {
+      // This condition is not necessarily unlikely, but since memset is
+      // costly, we might as well mark it as such.
+      memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte,
+             PrimaryT::getSizeByClassId(ClassId));
+    }
+
+    Chunk::UnpackedHeader Header = {};
+
+    const uptr DefaultAlignedPtr =
+        reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+    if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
+      const uptr Offset = UserPtr - DefaultAlignedPtr;
+      DCHECK_GE(Offset, 2 * sizeof(u32));
+      // The BlockMarker has no security purpose, but is specifically meant for
+      // the chunk iteration function that can be used in debugging situations.
+      // It is the only situation where we have to locate the start of a chunk
+      // based on its block address.
+      reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
+      reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
+      Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
+    }
+
+    Header.ClassId = ClassId & Chunk::ClassIdMask;
+    Header.State = Chunk::State::Allocated;
+    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
+    Header.SizeOrUnusedBytes = SizeOrUnusedBytes & Chunk::SizeOrUnusedBytesMask;
+    Chunk::storeHeader(Cookie, reinterpret_cast<void *>(addHeaderTag(UserPtr)),
+                       &Header);
+
+    return reinterpret_cast<void *>(UserPtr);
+  }
+
+  NOINLINE void *
+  initChunkWithMemoryTagging(const uptr ClassId, const Chunk::Origin Origin,
+                             void *Block, const uptr UserPtr, const uptr Size,
+                             const uptr SizeOrUnusedBytes,
+                             const FillContentsMode FillContents) {
+    const Options Options = Primary.Options.load();
+    DCHECK(useMemoryTagging<AllocatorConfig>(Options));
+
+    void *Ptr = reinterpret_cast<void *>(UserPtr);
+    void *TaggedPtr = Ptr;
+
+    if (LIKELY(ClassId)) {
+      // Init the primary chunk.
+      //
+      // We only need to zero or tag the contents for Primary backed
+      // allocations. We only set tags for primary allocations in order to avoid
+      // faulting potentially large numbers of pages for large secondary
+      // allocations. We assume that guard pages are enough to protect these
+      // allocations.
+      //
+      // FIXME: When the kernel provides a way to set the background tag of a
+      // mapping, we should be able to tag secondary allocations as well.
+      //
+      // When memory tagging is enabled, zeroing the contents is done as part of
+      // setting the tag.
+
+      Chunk::UnpackedHeader Header;
+      const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId);
+      const uptr BlockUptr = reinterpret_cast<uptr>(Block);
+      const uptr BlockEnd = BlockUptr + BlockSize;
+      // If possible, try to reuse the UAF tag that was set by deallocate().
+      // For simplicity, only reuse tags if we have the same start address as
+      // the previous allocation. This handles the majority of cases since
+      // most allocations will not be more aligned than the minimum alignment.
+      //
+      // We need to handle situations involving reclaimed chunks, and retag
+      // the reclaimed portions if necessary. In the case where the chunk is
+      // fully reclaimed, the chunk's header will be zero, which will trigger
+      // the code path for new mappings and invalid chunks that prepares the
+      // chunk from scratch. There are three possibilities for partial
+      // reclaiming:
+      //
+      // (1) Header was reclaimed, data was partially reclaimed.
+      // (2) Header was not reclaimed, all data was reclaimed (e.g. because
+      //     data started on a page boundary).
+      // (3) Header was not reclaimed, data was partially reclaimed.
+      //
+      // Case (1) will be handled in the same way as for full reclaiming,
+      // since the header will be zero.
+      //
+      // We can detect case (2) by loading the tag from the start
+      // of the chunk. If it is zero, it means that either all data was
+      // reclaimed (since we never use zero as the chunk tag), or that the
+      // previous allocation was of size zero. Either way, we need to prepare
+      // a new chunk from scratch.
+      //
+      // We can detect case (3) by moving to the next page (if covered by the
+      // chunk) and loading the tag of its first granule. If it is zero, it
+      // means that all following pages may need to be retagged. On the other
+      // hand, if it is nonzero, we can assume that all following pages are
+      // still tagged, according to the logic that if any of the pages
+      // following the next page were reclaimed, the next page would have been
+      // reclaimed as well.
+      uptr TaggedUserPtr;
+      uptr PrevUserPtr;
+      if (getChunkFromBlock(BlockUptr, &PrevUserPtr, &Header) &&
+          PrevUserPtr == UserPtr &&
+          (TaggedUserPtr = loadTag(UserPtr)) != UserPtr) {
+        uptr PrevEnd = TaggedUserPtr + Header.SizeOrUnusedBytes;
+        const uptr NextPage = roundUp(TaggedUserPtr, getPageSizeCached());
+        if (NextPage < PrevEnd && loadTag(NextPage) != NextPage)
+          PrevEnd = NextPage;
+        TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
+        resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, Size, BlockEnd);
+        if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) {
+          // If an allocation needs to be zeroed (i.e. calloc) we can normally
+          // avoid zeroing the memory now since we can rely on memory having
+          // been zeroed on free, as this is normally done while setting the
+          // UAF tag. But if tagging was disabled per-thread when the memory
+          // was freed, it would not have been retagged and thus zeroed, and
+          // therefore it needs to be zeroed now.
+          memset(TaggedPtr, 0,
+                 Min(Size, roundUp(PrevEnd - TaggedUserPtr,
+                                   archMemoryTagGranuleSize())));
+        } else if (Size) {
+          // Clear any stack metadata that may have previously been stored in
+          // the chunk data.
+          memset(TaggedPtr, 0, archMemoryTagGranuleSize());
+        }
+      } else {
+        const uptr OddEvenMask =
+            computeOddEvenMaskForPointerMaybe(Options, BlockUptr, ClassId);
+        TaggedPtr = prepareTaggedChunk(Ptr, Size, OddEvenMask, BlockEnd);
+      }
+      storePrimaryAllocationStackMaybe(Options, Ptr);
+    } else {
+      // Init the secondary chunk.
+
+      Block = addHeaderTag(Block);
+      Ptr = addHeaderTag(Ptr);
+      storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr));
+      storeSecondaryAllocationStackMaybe(Options, Ptr, Size);
+    }
+
+    Chunk::UnpackedHeader Header = {};
+
+    const uptr DefaultAlignedPtr =
+        reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+    if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
+      const uptr Offset = UserPtr - DefaultAlignedPtr;
+      DCHECK_GE(Offset, 2 * sizeof(u32));
+      // The BlockMarker has no security purpose, but is specifically meant for
+      // the chunk iteration function that can be used in debugging situations.
+      // It is the only situation where we have to locate the start of a chunk
+      // based on its block address.
+      reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
+      reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
+      Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
+    }
+
+    Header.ClassId = ClassId & Chunk::ClassIdMask;
+    Header.State = Chunk::State::Allocated;
+    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
+    Header.SizeOrUnusedBytes = SizeOrUnusedBytes & Chunk::SizeOrUnusedBytesMask;
+    Chunk::storeHeader(Cookie, Ptr, &Header);
+
+    return TaggedPtr;
+  }
+
   void quarantineOrDeallocateChunk(const Options &Options, void *TaggedPtr,
                                    Chunk::UnpackedHeader *Header,
                                    uptr Size) NO_THREAD_SAFETY_ANALYSIS {
@@ -1177,31 +1231,23 @@ class Allocator {
       Header->State = Chunk::State::Available;
     else
       Header->State = Chunk::State::Quarantined;
-    Header->OriginOrWasZeroed = useMemoryTagging<AllocatorConfig>(Options) &&
-                                Header->ClassId &&
-                                !TSDRegistry.getDisableMemInit();
-    Chunk::storeHeader(Cookie, Ptr, Header);
 
-    if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
-      u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
-      storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
-      if (Header->ClassId) {
-        if (!TSDRegistry.getDisableMemInit()) {
-          uptr TaggedBegin, TaggedEnd;
-          const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
-              Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, Header)),
-              Header->ClassId);
-          // Exclude the previous tag so that immediate use after free is
-          // detected 100% of the time.
-          setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
-                       &TaggedEnd);
-        }
-      }
+    void *BlockBegin;
+    if (LIKELY(!useMemoryTagging<AllocatorConfig>(Options))) {
+      Header->OriginOrWasZeroed = 0U;
+      if (BypassQuarantine && allocatorSupportsMemoryTagging<AllocatorConfig>())
+        Ptr = untagPointer(Ptr);
+      BlockBegin = getBlockBegin(Ptr, Header);
+    } else {
+      Header->OriginOrWasZeroed =
+          Header->ClassId && !TSDRegistry.getDisableMemInit();
+      BlockBegin =
+          retagBlock(Options, TaggedPtr, Ptr, Header, Size, BypassQuarantine);
     }
+
+    Chunk::storeHeader(Cookie, Ptr, Header);
+
     if (BypassQuarantine) {
-      if (allocatorSupportsMemoryTagging<AllocatorConfig>())
-        Ptr = untagPointer(Ptr);
-      void *BlockBegin = getBlockBegin(Ptr, Header);
       const uptr ClassId = Header->ClassId;
       if (LIKELY(ClassId)) {
         bool CacheDrained;
@@ -1216,9 +1262,6 @@ class Allocator {
         if (CacheDrained)
           Primary.tryReleaseToOS(ClassId, ReleaseToOS::Normal);
       } else {
-        if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options)))
-          storeTags(reinterpret_cast<uptr>(BlockBegin),
-                    reinterpret_cast<uptr>(Ptr));
         Secondary.deallocate(Options, BlockBegin);
       }
     } else {
@@ -1228,6 +1271,34 @@ class Allocator {
     }
   }
 
+  NOINLINE void *retagBlock(const Options &Options, void *TaggedPtr, void *&Ptr,
+                            Chunk::UnpackedHeader *Header, const uptr Size,
+                            bool BypassQuarantine) {
+    DCHECK(useMemoryTagging<AllocatorConfig>(Options));
+
+    const u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
+    storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
+    if (Header->ClassId && !TSDRegistry.getDisableMemInit()) {
+      uptr TaggedBegin, TaggedEnd;
+      const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
+          Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, Header)),
+          Header->ClassId);
+      // Exclude the previous tag so that immediate use after free is
+      // detected 100% of the time.
+      setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
+                   &TaggedEnd);
+    }
+
+    Ptr = untagPointer(Ptr);
+    void *BlockBegin = getBlockBegin(Ptr, Header);
+    if (BypassQuarantine && !Header->ClassId) {
+      storeTags(reinterpret_cast<uptr>(BlockBegin),
+                reinterpret_cast<uptr>(Ptr));
+    }
+
+    return BlockBegin;
+  }
+
   bool getChunkFromBlock(uptr Block, uptr *Chunk,
                          Chunk::UnpackedHeader *Header) {
     *Chunk =