diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ae699ebfc60383..089a85c8cb3659 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -51,7 +51,8 @@ C++ Specific Potentially Breaking Changes
 - The behavior controlled by the `-frelaxed-template-template-args` flag is now
   on by default, and the flag is deprecated. Until the flag is finally removed,
   it's negative spelling can be used to obtain compatibility with previous
-  versions of clang.
+  versions of clang. The deprecation warning for the negative spelling can be
+  disabled with `-Wno-deprecated-no-relaxed-template-template-args`.
 
 - Clang now rejects pointer to member from parenthesized expression in unevaluated context such as ``decltype(&(foo::bar))``. (#GH40906).
 
@@ -713,6 +714,9 @@ Bug Fixes to C++ Support
 - Correctly treat the compound statement of an ``if consteval`` as an immediate context. Fixes (#GH91509).
 - When partial ordering alias templates against template template parameters,
   allow pack expansions when the alias has a fixed-size parameter list. Fixes (#GH62529).
+- Clang now ignores template parameters only used within the exception specification of candidate function
+  templates during partial ordering when deducing template arguments from a function declaration or when
+  taking the address of a function template.
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 9781fcaa4ff5e9..9d97a75f696f66 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -436,6 +436,9 @@ def warn_drv_clang_unsupported : Warning<
   "the clang compiler does not support '%0'">;
 def warn_drv_deprecated_arg : Warning<
   "argument '%0' is deprecated%select{|, use '%2' instead}1">, InGroup<Deprecated>;
+def warn_drv_deprecated_arg_no_relaxed_template_template_args : Warning<
+  "argument '-fno-relaxed-template-template-args' is deprecated">,
+  InGroup<DeprecatedNoRelaxedTemplateTemplateArgs>;
 def warn_drv_deprecated_custom : Warning<
   "argument '%0' is deprecated, %1">, InGroup<Deprecated>;
 def warn_drv_assuming_mfloat_abi_is : Warning<
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 2beb1d45124b49..4cb4f3d999f7ab 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -104,6 +104,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
                                [EnumEnumConversion,
                                 EnumFloatConversion,
                                 EnumCompareConditional]>;
+def DeprecatedNoRelaxedTemplateTemplateArgs : DiagGroup<"deprecated-no-relaxed-template-template-args">;
 def ObjCSignedCharBoolImplicitIntConversion :
   DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
 def Shorten64To32 : DiagGroup<"shorten-64-to-32">;
@@ -228,6 +229,7 @@ def Deprecated : DiagGroup<"deprecated", [DeprecatedAnonEnumEnumConversion,
                                           DeprecatedLiteralOperator,
                                           DeprecatedPragma,
                                           DeprecatedRegister,
+                                          DeprecatedNoRelaxedTemplateTemplateArgs,
                                           DeprecatedThisCapture,
                                           DeprecatedType,
                                           DeprecatedVolatile,
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c54eb543d6580e..e579f1a0a3665c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6647,7 +6647,9 @@ def fdebug_unparse : Flag<["-"], "fdebug-unparse">, Group<Action_Group>,
   DocBrief<[{Run the parser and the semantic checks. Then unparse the
 parse-tree and output the generated Fortran source file.}]>;
 def fdebug_unparse_with_symbols : Flag<["-"], "fdebug-unparse-with-symbols">, Group<Action_Group>,
-  HelpText<"Unparse and stop.">;
+  HelpText<"Unparse with symbols and stop.">;
+def fdebug_unparse_with_modules : Flag<["-"], "fdebug-unparse-with-modules">, Group<Action_Group>,
+  HelpText<"Unparse with dependent modules and stop.">;
 def fdebug_dump_symbols : Flag<["-"], "fdebug-dump-symbols">, Group<Action_Group>,
   HelpText<"Dump symbols after the semantic analysis">;
 def fdebug_dump_parse_tree : Flag<["-"], "fdebug-dump-parse-tree">, Group<Action_Group>,
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index ce2f39aeb08213..cc8ab7a5b4369e 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -1439,6 +1439,10 @@ struct CounterCoverageMappingBuilder
     terminateRegion(S);
   }
 
+  void VisitCoroutineSuspendExpr(const CoroutineSuspendExpr *E) {
+    Visit(E->getOperand());
+  }
+
   void VisitCXXThrowExpr(const CXXThrowExpr *E) {
     extendRegion(E);
     if (E->getSubExpr())
@@ -2173,6 +2177,10 @@ struct CounterCoverageMappingBuilder
     // propagate counts into them.
   }
 
+  void VisitArrayInitLoopExpr(const ArrayInitLoopExpr *AILE) {
+    Visit(AILE->getCommonExpr()->getSourceExpr());
+  }
+
   void VisitPseudoObjectExpr(const PseudoObjectExpr *POE) {
     // Just visit syntatic expression as this is what users actually write.
     VisitStmt(POE->getSyntacticForm());
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 42feb1650574ed..c3e6d563f3bd21 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7253,10 +7253,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   if (Arg *A =
           Args.getLastArg(options::OPT_frelaxed_template_template_args,
                           options::OPT_fno_relaxed_template_template_args)) {
-    D.Diag(diag::warn_drv_deprecated_arg)
-        << A->getAsString(Args) << /*hasReplacement=*/false;
-    if (A->getOption().matches(options::OPT_fno_relaxed_template_template_args))
+    if (A->getOption().matches(
+            options::OPT_fno_relaxed_template_template_args)) {
+      D.Diag(diag::warn_drv_deprecated_arg_no_relaxed_template_template_args);
       CmdArgs.push_back("-fno-relaxed-template-template-args");
+    } else {
+      D.Diag(diag::warn_drv_deprecated_arg)
+          << A->getAsString(Args) << /*hasReplacement=*/false;
+    }
   }
 
   // -fsized-deallocation is off by default, as it is an ABI-breaking change for
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 853c0e1b50619e..b5d405111fe4cb 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5453,7 +5453,7 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc,
     //     is used.
     if (DeduceTemplateArgumentsByTypeMatch(
             S, TemplateParams, FD2->getType(), FD1->getType(), Info, Deduced,
-            TDF_None,
+            TDF_AllowCompatibleFunctionType,
             /*PartialOrdering=*/true) != TemplateDeductionResult::Success)
       return false;
     break;
@@ -5485,20 +5485,40 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc,
   switch (TPOC) {
   case TPOC_Call:
     for (unsigned I = 0, N = Args2.size(); I != N; ++I)
-      ::MarkUsedTemplateParameters(S.Context, Args2[I], false,
-                                   TemplateParams->getDepth(),
-                                   UsedParameters);
+      ::MarkUsedTemplateParameters(S.Context, Args2[I], /*OnlyDeduced=*/false,
+                                   TemplateParams->getDepth(), UsedParameters);
     break;
 
   case TPOC_Conversion:
-    ::MarkUsedTemplateParameters(S.Context, Proto2->getReturnType(), false,
+    ::MarkUsedTemplateParameters(S.Context, Proto2->getReturnType(),
+                                 /*OnlyDeduced=*/false,
                                  TemplateParams->getDepth(), UsedParameters);
     break;
 
   case TPOC_Other:
-    ::MarkUsedTemplateParameters(S.Context, FD2->getType(), false,
-                                 TemplateParams->getDepth(),
-                                 UsedParameters);
+    // We do not deduce template arguments from the exception specification
+    // when determining the primary template of a function template
+    // specialization or when taking the address of a function template.
+    // Therefore, we do not mark template parameters in the exception
+    // specification as used during partial ordering to prevent the following
+    // from being ambiguous:
+    //
+    //   template<typename T, typename U>
+    //   void f(U) noexcept(noexcept(T())); // #1
+    //
+    //   template<typename T>
+    //   void f(T*) noexcept; // #2
+    //
+    //   template<>
+    //   void f<int>(int*) noexcept; // explicit specialization of #2
+    //
+    // Although there is no corresponding wording in the standard, this seems
+    // to be the intended behavior given the definition of
+    // 'deduction substitution loci' in [temp.deduct].
+    ::MarkUsedTemplateParameters(
+        S.Context,
+        S.Context.getFunctionTypeWithExceptionSpec(FD2->getType(), EST_None),
+        /*OnlyDeduced=*/false, TemplateParams->getDepth(), UsedParameters);
     break;
   }
 
diff --git a/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp
new file mode 100644
index 00000000000000..cc1d4ecda2ecca
--- /dev/null
+++ b/clang/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.partial/p3.cpp
@@ -0,0 +1,72 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template<bool B>
+struct A { };
+
+constexpr A<false> a;
+constexpr A<false> b;
+
+constexpr int* x = nullptr;
+constexpr short* y = nullptr;
+
+namespace ExplicitArgs {
+  template<typename T, typename U>
+  constexpr int f(U) noexcept(noexcept(T())) {
+    return 0;
+  }
+
+  template<typename T>
+  constexpr int f(T*) noexcept {
+    return 1;
+  }
+
+  template<>
+  constexpr int f<int>(int*) noexcept {
+    return 2;
+  }
+
+  static_assert(f<int>(1) == 0);
+  static_assert(f<short>(y) == 1);
+  static_assert(f<int>(x) == 2);
+
+  template<typename T, typename U>
+  constexpr int g(U*) noexcept(noexcept(T())) {
+    return 3;
+  }
+
+  template<typename T>
+  constexpr int g(T) noexcept {
+    return 4;
+  }
+
+  template<>
+  constexpr int g<int>(int*) noexcept {
+    return 5;
+  }
+
+  static_assert(g<int>(y) == 3);
+  static_assert(g<short>(1) == 4);
+  static_assert(g<int>(x) == 5);
+} // namespace ExplicitArgs
+
+namespace DeducedArgs {
+  template<typename T, bool B>
+  constexpr int f(T, A<B>) noexcept(B) {
+    return 0;
+  }
+
+  template<typename T, bool B>
+  constexpr int f(T*, A<B>) noexcept(B && B) {
+    return 1;
+  }
+
+  template<>
+  constexpr int f(int*, A<false>) {
+    return 2;
+  }
+
+  static_assert(f<int*>(x, a) == 0);
+  static_assert(f<short>(y, a) == 1);
+  static_assert(f<int>(x, a) == 2);
+} // namespace DeducedArgs
diff --git a/clang/test/CoverageMapping/coroutine.cpp b/clang/test/CoverageMapping/coroutine.cpp
index 0105005d198a1c..d322bc351a7276 100644
--- a/clang/test/CoverageMapping/coroutine.cpp
+++ b/clang/test/CoverageMapping/coroutine.cpp
@@ -32,6 +32,7 @@ struct std::coroutine_traits<int, int> {
     suspend_always final_suspend() noexcept;
     void unhandled_exception() noexcept;
     void return_value(int);
+    suspend_always yield_value(int);
   };
 };
 
@@ -45,3 +46,21 @@ int f1(int x) {       // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+8]]:2 = #0
   }                   // CHECK-NEXT: File 0, [[@LINE-2]]:10 -> [[@LINE]]:4 = (#0 - #1)
   co_return x;        // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:4 -> [[@LINE]]:3 = #1
 } // CHECK-NEXT: File 0, [[@LINE-1]]:3 -> [[@LINE-1]]:14 = #1
+
+// CHECK-LABEL: _Z2f2i:
+// CHECK-NEXT: File 0, [[@LINE+1]]:15 -> [[@LINE+15]]:2 = #0
+int f2(int x) {
+// CHECK-NEXT: File 0, [[@LINE+5]]:13 -> [[@LINE+5]]:18 = #0
+// CHECK-NEXT: Branch,File 0, [[@LINE+4]]:13 -> [[@LINE+4]]:18 = #1, (#0 - #1)
+// CHECK-NEXT: Gap,File 0, [[@LINE+3]]:20 -> [[@LINE+3]]:21 = #1
+// CHECK-NEXT: File 0, [[@LINE+2]]:21 -> [[@LINE+2]]:37 = #1
+// CHECK-NEXT: File 0, [[@LINE+1]]:40 -> [[@LINE+1]]:56 = (#0 - #1)
+  co_await (x > 0 ? suspend_always{} : suspend_always{});
+// CHECK-NEXT: File 0, [[@LINE+5]]:12 -> [[@LINE+5]]:17 = #0
+// CHECK-NEXT: Branch,File 0, [[@LINE+4]]:12 -> [[@LINE+4]]:17 = #2, (#0 - #2)
+// CHECK-NEXT: Gap,File 0, [[@LINE+3]]:19 -> [[@LINE+3]]:20 = #2
+// CHECK-NEXT: File 0, [[@LINE+2]]:20 -> [[@LINE+2]]:21 = #2
+// CHECK-NEXT: File 0, [[@LINE+1]]:24 -> [[@LINE+1]]:25 = (#0 - #2)
+  co_yield x > 0 ? 1 : 2;
+  co_return 0;
+}
diff --git a/clang/test/CoverageMapping/decomposition.cpp b/clang/test/CoverageMapping/decomposition.cpp
new file mode 100644
index 00000000000000..601ea630faeec9
--- /dev/null
+++ b/clang/test/CoverageMapping/decomposition.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+
+// CHECK-LABEL:       _Z19array_decompositioni:
+// CHECK-NEXT:          File 0, [[@LINE+6]]:32 -> {{[0-9]+}}:2 = #0
+// CHECK-NEXT:          File 0, [[@LINE+8]]:20 -> [[@LINE+8]]:25 = #0
+// CHECK-NEXT:          Branch,File 0, [[@LINE+7]]:20 -> [[@LINE+7]]:25 = #1, (#0 - #1)
+// CHECK-NEXT:          Gap,File 0, [[@LINE+6]]:27 -> [[@LINE+6]]:28 = #1
+// CHECK-NEXT:          File 0, [[@LINE+5]]:28 -> [[@LINE+5]]:29 = #1
+// CHECK-NEXT:          File 0, [[@LINE+4]]:32 -> [[@LINE+4]]:33 = (#0 - #1)
+int array_decomposition(int i) {
+  int a[] = {1, 2, 3};
+  int b[] = {4, 5, 6};
+  auto [x, y, z] = i > 0 ? a : b;
+  return x + y + z;
+}
diff --git a/clang/test/Driver/frelaxed-template-template-args.cpp b/clang/test/Driver/frelaxed-template-template-args.cpp
index 57fc4e3da6e5d0..7a7fd6f0bbc8f9 100644
--- a/clang/test/Driver/frelaxed-template-template-args.cpp
+++ b/clang/test/Driver/frelaxed-template-template-args.cpp
@@ -1,7 +1,9 @@
 // RUN: %clang -fsyntax-only -### %s 2>&1 | FileCheck --check-prefix=CHECK-DEF %s
 // RUN: %clang -fsyntax-only -frelaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-ON %s
 // RUN: %clang -fsyntax-only -fno-relaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-OFF %s
+// RUN: %clang -fsyntax-only -fno-relaxed-template-template-args -Wno-deprecated-no-relaxed-template-template-args %s 2>&1 | FileCheck --check-prefix=CHECK-DIS --allow-empty %s
 
 // CHECK-DEF-NOT: "-cc1"{{.*}} "-fno-relaxed-template-template-args"
 // CHECK-ON:  warning: argument '-frelaxed-template-template-args' is deprecated [-Wdeprecated]
-// CHECK-OFF: warning: argument '-fno-relaxed-template-template-args' is deprecated [-Wdeprecated]
+// CHECK-OFF: warning: argument '-fno-relaxed-template-template-args' is deprecated [-Wdeprecated-no-relaxed-template-template-args]
+// CHECK-DIS-NOT: warning: argument '-fno-relaxed-template-template-args' is deprecated
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 927513dea92dab..15a199ae0349ba 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -410,133 +410,18 @@ class Allocator {
       reportOutOfMemory(NeededSize);
     }
 
-    const uptr BlockUptr = reinterpret_cast<uptr>(Block);
-    const uptr UnalignedUserPtr = BlockUptr + Chunk::getHeaderSize();
-    const uptr UserPtr = roundUp(UnalignedUserPtr, Alignment);
-
-    void *Ptr = reinterpret_cast<void *>(UserPtr);
-    void *TaggedPtr = Ptr;
-    if (LIKELY(ClassId)) {
-      // We only need to zero or tag the contents for Primary backed
-      // allocations. We only set tags for primary allocations in order to avoid
-      // faulting potentially large numbers of pages for large secondary
-      // allocations. We assume that guard pages are enough to protect these
-      // allocations.
-      //
-      // FIXME: When the kernel provides a way to set the background tag of a
-      // mapping, we should be able to tag secondary allocations as well.
-      //
-      // When memory tagging is enabled, zeroing the contents is done as part of
-      // setting the tag.
-      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
-        uptr PrevUserPtr;
-        Chunk::UnpackedHeader Header;
-        const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId);
-        const uptr BlockEnd = BlockUptr + BlockSize;
-        // If possible, try to reuse the UAF tag that was set by deallocate().
-        // For simplicity, only reuse tags if we have the same start address as
-        // the previous allocation. This handles the majority of cases since
-        // most allocations will not be more aligned than the minimum alignment.
-        //
-        // We need to handle situations involving reclaimed chunks, and retag
-        // the reclaimed portions if necessary. In the case where the chunk is
-        // fully reclaimed, the chunk's header will be zero, which will trigger
-        // the code path for new mappings and invalid chunks that prepares the
-        // chunk from scratch. There are three possibilities for partial
-        // reclaiming:
-        //
-        // (1) Header was reclaimed, data was partially reclaimed.
-        // (2) Header was not reclaimed, all data was reclaimed (e.g. because
-        //     data started on a page boundary).
-        // (3) Header was not reclaimed, data was partially reclaimed.
-        //
-        // Case (1) will be handled in the same way as for full reclaiming,
-        // since the header will be zero.
-        //
-        // We can detect case (2) by loading the tag from the start
-        // of the chunk. If it is zero, it means that either all data was
-        // reclaimed (since we never use zero as the chunk tag), or that the
-        // previous allocation was of size zero. Either way, we need to prepare
-        // a new chunk from scratch.
-        //
-        // We can detect case (3) by moving to the next page (if covered by the
-        // chunk) and loading the tag of its first granule. If it is zero, it
-        // means that all following pages may need to be retagged. On the other
-        // hand, if it is nonzero, we can assume that all following pages are
-        // still tagged, according to the logic that if any of the pages
-        // following the next page were reclaimed, the next page would have been
-        // reclaimed as well.
-        uptr TaggedUserPtr;
-        if (getChunkFromBlock(BlockUptr, &PrevUserPtr, &Header) &&
-            PrevUserPtr == UserPtr &&
-            (TaggedUserPtr = loadTag(UserPtr)) != UserPtr) {
-          uptr PrevEnd = TaggedUserPtr + Header.SizeOrUnusedBytes;
-          const uptr NextPage = roundUp(TaggedUserPtr, getPageSizeCached());
-          if (NextPage < PrevEnd && loadTag(NextPage) != NextPage)
-            PrevEnd = NextPage;
-          TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
-          resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, Size, BlockEnd);
-          if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) {
-            // If an allocation needs to be zeroed (i.e. calloc) we can normally
-            // avoid zeroing the memory now since we can rely on memory having
-            // been zeroed on free, as this is normally done while setting the
-            // UAF tag. But if tagging was disabled per-thread when the memory
-            // was freed, it would not have been retagged and thus zeroed, and
-            // therefore it needs to be zeroed now.
-            memset(TaggedPtr, 0,
-                   Min(Size, roundUp(PrevEnd - TaggedUserPtr,
-                                     archMemoryTagGranuleSize())));
-          } else if (Size) {
-            // Clear any stack metadata that may have previously been stored in
-            // the chunk data.
-            memset(TaggedPtr, 0, archMemoryTagGranuleSize());
-          }
-        } else {
-          const uptr OddEvenMask =
-              computeOddEvenMaskForPointerMaybe(Options, BlockUptr, ClassId);
-          TaggedPtr = prepareTaggedChunk(Ptr, Size, OddEvenMask, BlockEnd);
-        }
-        storePrimaryAllocationStackMaybe(Options, Ptr);
-      } else {
-        Block = addHeaderTag(Block);
-        Ptr = addHeaderTag(Ptr);
-        if (UNLIKELY(FillContents != NoFill)) {
-          // This condition is not necessarily unlikely, but since memset is
-          // costly, we might as well mark it as such.
-          memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte,
-                 PrimaryT::getSizeByClassId(ClassId));
-        }
-      }
-    } else {
-      Block = addHeaderTag(Block);
-      Ptr = addHeaderTag(Ptr);
-      if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
-        storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr));
-        storeSecondaryAllocationStackMaybe(Options, Ptr, Size);
-      }
+    const uptr UserPtr = roundUp(
+        reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize(), Alignment);
+    const uptr SizeOrUnusedBytes =
+        ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size);
+
+    if (LIKELY(!useMemoryTagging<AllocatorConfig>(Options))) {
+      return initChunk(ClassId, Origin, Block, UserPtr, SizeOrUnusedBytes,
+                       FillContents);
     }
 
-    Chunk::UnpackedHeader Header = {};
-    if (UNLIKELY(UnalignedUserPtr != UserPtr)) {
-      const uptr Offset = UserPtr - UnalignedUserPtr;
-      DCHECK_GE(Offset, 2 * sizeof(u32));
-      // The BlockMarker has no security purpose, but is specifically meant for
-      // the chunk iteration function that can be used in debugging situations.
-      // It is the only situation where we have to locate the start of a chunk
-      // based on its block address.
-      reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
-      reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
-      Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
-    }
-    Header.ClassId = ClassId & Chunk::ClassIdMask;
-    Header.State = Chunk::State::Allocated;
-    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
-    Header.SizeOrUnusedBytes =
-        (ClassId ? Size : SecondaryBlockEnd - (UserPtr + Size)) &
-        Chunk::SizeOrUnusedBytesMask;
-    Chunk::storeHeader(Cookie, Ptr, &Header);
-
-    return TaggedPtr;
+    return initChunkWithMemoryTagging(ClassId, Origin, Block, UserPtr, Size,
+                                      SizeOrUnusedBytes, FillContents);
   }
 
   NOINLINE void deallocate(void *Ptr, Chunk::Origin Origin, uptr DeleteSize = 0,
@@ -1163,6 +1048,175 @@ class Allocator {
            reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes;
   }
 
+  ALWAYS_INLINE void *initChunk(const uptr ClassId, const Chunk::Origin Origin,
+                                void *Block, const uptr UserPtr,
+                                const uptr SizeOrUnusedBytes,
+                                const FillContentsMode FillContents) {
+    Block = addHeaderTag(Block);
+    // Only do content fill when it's from primary allocator because secondary
+    // allocator has filled the content.
+    if (ClassId != 0 && UNLIKELY(FillContents != NoFill)) {
+      // This condition is not necessarily unlikely, but since memset is
+      // costly, we might as well mark it as such.
+      memset(Block, FillContents == ZeroFill ? 0 : PatternFillByte,
+             PrimaryT::getSizeByClassId(ClassId));
+    }
+
+    Chunk::UnpackedHeader Header = {};
+
+    const uptr DefaultAlignedPtr =
+        reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+    if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
+      const uptr Offset = UserPtr - DefaultAlignedPtr;
+      DCHECK_GE(Offset, 2 * sizeof(u32));
+      // The BlockMarker has no security purpose, but is specifically meant for
+      // the chunk iteration function that can be used in debugging situations.
+      // It is the only situation where we have to locate the start of a chunk
+      // based on its block address.
+      reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
+      reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
+      Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
+    }
+
+    Header.ClassId = ClassId & Chunk::ClassIdMask;
+    Header.State = Chunk::State::Allocated;
+    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
+    Header.SizeOrUnusedBytes = SizeOrUnusedBytes & Chunk::SizeOrUnusedBytesMask;
+    Chunk::storeHeader(Cookie, reinterpret_cast<void *>(addHeaderTag(UserPtr)),
+                       &Header);
+
+    return reinterpret_cast<void *>(UserPtr);
+  }
+
+  NOINLINE void *
+  initChunkWithMemoryTagging(const uptr ClassId, const Chunk::Origin Origin,
+                             void *Block, const uptr UserPtr, const uptr Size,
+                             const uptr SizeOrUnusedBytes,
+                             const FillContentsMode FillContents) {
+    const Options Options = Primary.Options.load();
+    DCHECK(useMemoryTagging<AllocatorConfig>(Options));
+
+    void *Ptr = reinterpret_cast<void *>(UserPtr);
+    void *TaggedPtr = Ptr;
+
+    if (LIKELY(ClassId)) {
+      // Init the primary chunk.
+      //
+      // We only need to zero or tag the contents for Primary backed
+      // allocations. We only set tags for primary allocations in order to avoid
+      // faulting potentially large numbers of pages for large secondary
+      // allocations. We assume that guard pages are enough to protect these
+      // allocations.
+      //
+      // FIXME: When the kernel provides a way to set the background tag of a
+      // mapping, we should be able to tag secondary allocations as well.
+      //
+      // When memory tagging is enabled, zeroing the contents is done as part of
+      // setting the tag.
+
+      Chunk::UnpackedHeader Header;
+      const uptr BlockSize = PrimaryT::getSizeByClassId(ClassId);
+      const uptr BlockUptr = reinterpret_cast<uptr>(Block);
+      const uptr BlockEnd = BlockUptr + BlockSize;
+      // If possible, try to reuse the UAF tag that was set by deallocate().
+      // For simplicity, only reuse tags if we have the same start address as
+      // the previous allocation. This handles the majority of cases since
+      // most allocations will not be more aligned than the minimum alignment.
+      //
+      // We need to handle situations involving reclaimed chunks, and retag
+      // the reclaimed portions if necessary. In the case where the chunk is
+      // fully reclaimed, the chunk's header will be zero, which will trigger
+      // the code path for new mappings and invalid chunks that prepares the
+      // chunk from scratch. There are three possibilities for partial
+      // reclaiming:
+      //
+      // (1) Header was reclaimed, data was partially reclaimed.
+      // (2) Header was not reclaimed, all data was reclaimed (e.g. because
+      //     data started on a page boundary).
+      // (3) Header was not reclaimed, data was partially reclaimed.
+      //
+      // Case (1) will be handled in the same way as for full reclaiming,
+      // since the header will be zero.
+      //
+      // We can detect case (2) by loading the tag from the start
+      // of the chunk. If it is zero, it means that either all data was
+      // reclaimed (since we never use zero as the chunk tag), or that the
+      // previous allocation was of size zero. Either way, we need to prepare
+      // a new chunk from scratch.
+      //
+      // We can detect case (3) by moving to the next page (if covered by the
+      // chunk) and loading the tag of its first granule. If it is zero, it
+      // means that all following pages may need to be retagged. On the other
+      // hand, if it is nonzero, we can assume that all following pages are
+      // still tagged, according to the logic that if any of the pages
+      // following the next page were reclaimed, the next page would have been
+      // reclaimed as well.
+      uptr TaggedUserPtr;
+      uptr PrevUserPtr;
+      if (getChunkFromBlock(BlockUptr, &PrevUserPtr, &Header) &&
+          PrevUserPtr == UserPtr &&
+          (TaggedUserPtr = loadTag(UserPtr)) != UserPtr) {
+        uptr PrevEnd = TaggedUserPtr + Header.SizeOrUnusedBytes;
+        const uptr NextPage = roundUp(TaggedUserPtr, getPageSizeCached());
+        if (NextPage < PrevEnd && loadTag(NextPage) != NextPage)
+          PrevEnd = NextPage;
+        TaggedPtr = reinterpret_cast<void *>(TaggedUserPtr);
+        resizeTaggedChunk(PrevEnd, TaggedUserPtr + Size, Size, BlockEnd);
+        if (UNLIKELY(FillContents != NoFill && !Header.OriginOrWasZeroed)) {
+          // If an allocation needs to be zeroed (i.e. calloc) we can normally
+          // avoid zeroing the memory now since we can rely on memory having
+          // been zeroed on free, as this is normally done while setting the
+          // UAF tag. But if tagging was disabled per-thread when the memory
+          // was freed, it would not have been retagged and thus zeroed, and
+          // therefore it needs to be zeroed now.
+          memset(TaggedPtr, 0,
+                 Min(Size, roundUp(PrevEnd - TaggedUserPtr,
+                                   archMemoryTagGranuleSize())));
+        } else if (Size) {
+          // Clear any stack metadata that may have previously been stored in
+          // the chunk data.
+          memset(TaggedPtr, 0, archMemoryTagGranuleSize());
+        }
+      } else {
+        const uptr OddEvenMask =
+            computeOddEvenMaskForPointerMaybe(Options, BlockUptr, ClassId);
+        TaggedPtr = prepareTaggedChunk(Ptr, Size, OddEvenMask, BlockEnd);
+      }
+      storePrimaryAllocationStackMaybe(Options, Ptr);
+    } else {
+      // Init the secondary chunk.
+
+      Block = addHeaderTag(Block);
+      Ptr = addHeaderTag(Ptr);
+      storeTags(reinterpret_cast<uptr>(Block), reinterpret_cast<uptr>(Ptr));
+      storeSecondaryAllocationStackMaybe(Options, Ptr, Size);
+    }
+
+    Chunk::UnpackedHeader Header = {};
+
+    const uptr DefaultAlignedPtr =
+        reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+    if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
+      const uptr Offset = UserPtr - DefaultAlignedPtr;
+      DCHECK_GE(Offset, 2 * sizeof(u32));
+      // The BlockMarker has no security purpose, but is specifically meant for
+      // the chunk iteration function that can be used in debugging situations.
+      // It is the only situation where we have to locate the start of a chunk
+      // based on its block address.
+      reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
+      reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
+      Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
+    }
+
+    Header.ClassId = ClassId & Chunk::ClassIdMask;
+    Header.State = Chunk::State::Allocated;
+    Header.OriginOrWasZeroed = Origin & Chunk::OriginMask;
+    Header.SizeOrUnusedBytes = SizeOrUnusedBytes & Chunk::SizeOrUnusedBytesMask;
+    Chunk::storeHeader(Cookie, Ptr, &Header);
+
+    return TaggedPtr;
+  }
+
   void quarantineOrDeallocateChunk(const Options &Options, void *TaggedPtr,
                                    Chunk::UnpackedHeader *Header,
                                    uptr Size) NO_THREAD_SAFETY_ANALYSIS {
@@ -1177,31 +1231,23 @@ class Allocator {
       Header->State = Chunk::State::Available;
     else
       Header->State = Chunk::State::Quarantined;
-    Header->OriginOrWasZeroed = useMemoryTagging<AllocatorConfig>(Options) &&
-                                Header->ClassId &&
-                                !TSDRegistry.getDisableMemInit();
-    Chunk::storeHeader(Cookie, Ptr, Header);
 
-    if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options))) {
-      u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
-      storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
-      if (Header->ClassId) {
-        if (!TSDRegistry.getDisableMemInit()) {
-          uptr TaggedBegin, TaggedEnd;
-          const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
-              Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, Header)),
-              Header->ClassId);
-          // Exclude the previous tag so that immediate use after free is
-          // detected 100% of the time.
-          setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
-                       &TaggedEnd);
-        }
-      }
+    void *BlockBegin;
+    if (LIKELY(!useMemoryTagging<AllocatorConfig>(Options))) {
+      Header->OriginOrWasZeroed = 0U;
+      if (BypassQuarantine && allocatorSupportsMemoryTagging<AllocatorConfig>())
+        Ptr = untagPointer(Ptr);
+      BlockBegin = getBlockBegin(Ptr, Header);
+    } else {
+      Header->OriginOrWasZeroed =
+          Header->ClassId && !TSDRegistry.getDisableMemInit();
+      BlockBegin =
+          retagBlock(Options, TaggedPtr, Ptr, Header, Size, BypassQuarantine);
     }
+
+    Chunk::storeHeader(Cookie, Ptr, Header);
+
     if (BypassQuarantine) {
-      if (allocatorSupportsMemoryTagging<AllocatorConfig>())
-        Ptr = untagPointer(Ptr);
-      void *BlockBegin = getBlockBegin(Ptr, Header);
       const uptr ClassId = Header->ClassId;
       if (LIKELY(ClassId)) {
         bool CacheDrained;
@@ -1216,9 +1262,6 @@ class Allocator {
         if (CacheDrained)
           Primary.tryReleaseToOS(ClassId, ReleaseToOS::Normal);
       } else {
-        if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Options)))
-          storeTags(reinterpret_cast<uptr>(BlockBegin),
-                    reinterpret_cast<uptr>(Ptr));
         Secondary.deallocate(Options, BlockBegin);
       }
     } else {
@@ -1228,6 +1271,34 @@ class Allocator {
     }
   }
 
+  NOINLINE void *retagBlock(const Options &Options, void *TaggedPtr, void *&Ptr,
+                            Chunk::UnpackedHeader *Header, const uptr Size,
+                            bool BypassQuarantine) {
+    DCHECK(useMemoryTagging<AllocatorConfig>(Options));
+
+    const u8 PrevTag = extractTag(reinterpret_cast<uptr>(TaggedPtr));
+    storeDeallocationStackMaybe(Options, Ptr, PrevTag, Size);
+    if (Header->ClassId && !TSDRegistry.getDisableMemInit()) {
+      uptr TaggedBegin, TaggedEnd;
+      const uptr OddEvenMask = computeOddEvenMaskForPointerMaybe(
+          Options, reinterpret_cast<uptr>(getBlockBegin(Ptr, Header)),
+          Header->ClassId);
+      // Exclude the previous tag so that immediate use after free is
+      // detected 100% of the time.
+      setRandomTag(Ptr, Size, OddEvenMask | (1UL << PrevTag), &TaggedBegin,
+                   &TaggedEnd);
+    }
+
+    Ptr = untagPointer(Ptr);
+    void *BlockBegin = getBlockBegin(Ptr, Header);
+    if (BypassQuarantine && !Header->ClassId) {
+      storeTags(reinterpret_cast<uptr>(BlockBegin),
+                reinterpret_cast<uptr>(Ptr));
+    }
+
+    return BlockBegin;
+  }
+
   bool getChunkFromBlock(uptr Block, uptr *Chunk,
                          Chunk::UnpackedHeader *Header) {
     *Chunk =
diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h
index e2e859f3a81bd7..7823565eb815f8 100644
--- a/flang/include/flang/Frontend/FrontendActions.h
+++ b/flang/include/flang/Frontend/FrontendActions.h
@@ -108,6 +108,10 @@ class DebugUnparseWithSymbolsAction : public PrescanAndSemaAction {
   void executeAction() override;
 };
 
+class DebugUnparseWithModulesAction : public PrescanAndSemaAction {
+  void executeAction() override;
+};
+
 class DebugUnparseAction : public PrescanAndSemaAction {
   void executeAction() override;
 };
diff --git a/flang/include/flang/Frontend/FrontendOptions.h b/flang/include/flang/Frontend/FrontendOptions.h
index 06b1318f243b08..82ca99672ec610 100644
--- a/flang/include/flang/Frontend/FrontendOptions.h
+++ b/flang/include/flang/Frontend/FrontendOptions.h
@@ -63,6 +63,10 @@ enum ActionKind {
   /// Fortran source file
   DebugUnparseWithSymbols,
 
+  /// Parse, run semantics, and output a Fortran source file preceded
+  /// by all the necessary modules (transitively)
+  DebugUnparseWithModules,
+
   /// Parse, run semantics and then output symbols from semantics
   DebugDumpSymbols,
 
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 477d391277ee25..68ae50c312cdeb 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -236,6 +236,7 @@ class ParseTreeDumper {
   NODE(parser, CUFKernelDoConstruct)
   NODE(CUFKernelDoConstruct, StarOrExpr)
   NODE(CUFKernelDoConstruct, Directive)
+  NODE(parser, CUFReduction)
   NODE(parser, CycleStmt)
   NODE(parser, DataComponentDefStmt)
   NODE(parser, DataIDoObject)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index c0635445837904..0a40aa8b8f616f 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4303,12 +4303,23 @@ struct OpenACCConstruct {
 };
 
 // CUF-kernel-do-construct ->
-//     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
-//     >>> do-construct
+//   !$CUF KERNEL DO [ (scalar-int-constant-expr) ]
+//      <<< grid, block [, stream] >>>
+//      [ cuf-reduction... ]
+//      do-construct
 // star-or-expr -> * | scalar-int-expr
 // grid -> * | scalar-int-expr | ( star-or-expr-list )
 // block -> * | scalar-int-expr | ( star-or-expr-list )
 // stream -> 0, scalar-int-expr | STREAM = scalar-int-expr
+// cuf-reduction -> [ REDUCE | REDUCTION ] (
+//                  acc-reduction-op : scalar-variable-list )
+
+struct CUFReduction {
+  TUPLE_CLASS_BOILERPLATE(CUFReduction);
+  using Operator = AccReductionOperator;
+  std::tuple<Operator, std::list<Scalar<Variable>>> t;
+};
+
 struct CUFKernelDoConstruct {
   TUPLE_CLASS_BOILERPLATE(CUFKernelDoConstruct);
   WRAPPER_CLASS(StarOrExpr, std::optional<ScalarIntExpr>);
@@ -4316,7 +4327,8 @@ struct CUFKernelDoConstruct {
     TUPLE_CLASS_BOILERPLATE(Directive);
     CharBlock source;
     std::tuple<std::optional<ScalarIntConstantExpr>, std::list<StarOrExpr>,
-        std::list<StarOrExpr>, std::optional<ScalarIntExpr>>
+        std::list<StarOrExpr>, std::optional<ScalarIntExpr>,
+        std::list<CUFReduction>>
         t;
   };
   std::tuple<Directive, std::optional<DoConstruct>> t;
diff --git a/flang/include/flang/Semantics/unparse-with-symbols.h b/flang/include/flang/Semantics/unparse-with-symbols.h
index d70110245e2b2f..5e18b3fc3063db 100644
--- a/flang/include/flang/Semantics/unparse-with-symbols.h
+++ b/flang/include/flang/Semantics/unparse-with-symbols.h
@@ -21,8 +21,12 @@ struct Program;
 }
 
 namespace Fortran::semantics {
+class SemanticsContext;
 void UnparseWithSymbols(llvm::raw_ostream &, const parser::Program &,
     parser::Encoding encoding = parser::Encoding::UTF_8);
+void UnparseWithModules(llvm::raw_ostream &, SemanticsContext &,
+    const parser::Program &,
+    parser::Encoding encoding = parser::Encoding::UTF_8);
 }
 
 #endif // FORTRAN_SEMANTICS_UNPARSE_WITH_SYMBOLS_H_
diff --git a/flang/lib/Evaluate/fold.cpp b/flang/lib/Evaluate/fold.cpp
index ed882958199802..cf6262d9a7c65a 100644
--- a/flang/lib/Evaluate/fold.cpp
+++ b/flang/lib/Evaluate/fold.cpp
@@ -272,6 +272,7 @@ std::optional<Expr<SomeType>> FoldTransfer(
     }
   }
   if (sourceBytes && IsActuallyConstant(*source) && moldType && extents &&
+      !moldType->IsPolymorphic() &&
       (moldLength || moldType->category() != TypeCategory::Character)) {
     std::size_t elements{
         extents->empty() ? 1 : static_cast<std::size_t>((*extents)[0])};
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index db7fd3cccc7a28..e8a8c90045d92d 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -488,6 +488,9 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
     case clang::driver::options::OPT_fdebug_unparse_with_symbols:
       opts.programAction = DebugUnparseWithSymbols;
       break;
+    case clang::driver::options::OPT_fdebug_unparse_with_modules:
+      opts.programAction = DebugUnparseWithModules;
+      break;
     case clang::driver::options::OPT_fdebug_dump_symbols:
       opts.programAction = DebugDumpSymbols;
       break;
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 2f65ab6102f4d9..4341c104a69df2 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -477,6 +477,15 @@ void DebugUnparseWithSymbolsAction::executeAction() {
   reportFatalSemanticErrors();
 }
 
+void DebugUnparseWithModulesAction::executeAction() {
+  auto &parseTree{*getInstance().getParsing().parseTree()};
+  CompilerInstance &ci{getInstance()};
+  Fortran::semantics::UnparseWithModules(
+      llvm::outs(), ci.getSemantics().context(), parseTree,
+      /*encoding=*/Fortran::parser::Encoding::UTF_8);
+  reportFatalSemanticErrors();
+}
+
 void DebugDumpSymbolsAction::executeAction() {
   CompilerInstance &ci = this->getInstance();
 
diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index 4cad640562c619..e2cbd5112d6ea5 100644
--- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -59,6 +59,8 @@ createFrontendAction(CompilerInstance &ci) {
     return std::make_unique<DebugUnparseNoSemaAction>();
   case DebugUnparseWithSymbols:
     return std::make_unique<DebugUnparseWithSymbolsAction>();
+  case DebugUnparseWithModules:
+    return std::make_unique<DebugUnparseWithModulesAction>();
   case DebugDumpSymbols:
     return std::make_unique<DebugDumpSymbolsAction>();
   case DebugDumpParseTree:
diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp
index 07a570bd61e990..382a593416872a 100644
--- a/flang/lib/Parser/executable-parsers.cpp
+++ b/flang/lib/Parser/executable-parsers.cpp
@@ -538,25 +538,34 @@ TYPE_CONTEXT_PARSER("UNLOCK statement"_en_US,
     construct<UnlockStmt>("UNLOCK (" >> lockVariable,
         defaulted("," >> nonemptyList(statOrErrmsg)) / ")"))
 
-// CUF-kernel-do-construct -> CUF-kernel-do-directive do-construct
-// CUF-kernel-do-directive ->
-//     !$CUF KERNEL DO [ (scalar-int-constant-expr) ] <<< grid, block [, stream]
-//     >>> do-construct
+// CUF-kernel-do-construct ->
+//   !$CUF KERNEL DO [ (scalar-int-constant-expr) ]
+//      <<< grid, block [, stream] >>>
+//      [ cuf-reduction... ]
+//      do-construct
 // star-or-expr -> * | scalar-int-expr
 // grid -> * | scalar-int-expr | ( star-or-expr-list )
 // block -> * | scalar-int-expr | ( star-or-expr-list )
-// stream -> ( 0, | STREAM = ) scalar-int-expr
+// stream -> 0, scalar-int-expr | STREAM = scalar-int-expr
+// cuf-reduction -> [ REDUCTION | REDUCE ] (
+//                  acc-reduction-op : scalar-variable-list )
+
 constexpr auto starOrExpr{construct<CUFKernelDoConstruct::StarOrExpr>(
     "*" >> pure<std::optional<ScalarIntExpr>>() ||
     applyFunction(presentOptional<ScalarIntExpr>, scalarIntExpr))};
 constexpr auto gridOrBlock{parenthesized(nonemptyList(starOrExpr)) ||
     applyFunction(singletonList<CUFKernelDoConstruct::StarOrExpr>, starOrExpr)};
+
+TYPE_PARSER(("REDUCTION"_tok || "REDUCE"_tok) >>
+    parenthesized(construct<CUFReduction>(Parser<CUFReduction::Operator>{},
+        ":" >> nonemptyList(scalar(variable)))))
+
 TYPE_PARSER(sourced(beginDirective >> "$CUF KERNEL DO"_tok >>
     construct<CUFKernelDoConstruct::Directive>(
         maybe(parenthesized(scalarIntConstantExpr)), "<<<" >> gridOrBlock,
         "," >> gridOrBlock,
-        maybe((", 0 ,"_tok || ", STREAM ="_tok) >> scalarIntExpr) / ">>>" /
-            endDirective)))
+        maybe((", 0 ,"_tok || ", STREAM ="_tok) >> scalarIntExpr) / ">>>",
+        many(Parser<CUFReduction>{}) / endDirective)))
 TYPE_CONTEXT_PARSER("!$CUF KERNEL DO construct"_en_US,
     extension<LanguageFeature::CUDA>(construct<CUFKernelDoConstruct>(
         Parser<CUFKernelDoConstruct::Directive>{},
diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp
index 946b33d0084a96..3d919e29a24826 100644
--- a/flang/lib/Parser/openacc-parsers.cpp
+++ b/flang/lib/Parser/openacc-parsers.cpp
@@ -19,9 +19,9 @@
 // OpenACC Directives and Clauses
 namespace Fortran::parser {
 
-constexpr auto startAccLine = skipStuffBeforeStatement >>
-    ("!$ACC "_sptok || "C$ACC "_sptok || "*$ACC "_sptok);
-constexpr auto endAccLine = space >> endOfLine;
+constexpr auto startAccLine{skipStuffBeforeStatement >>
+    ("!$ACC "_sptok || "C$ACC "_sptok || "*$ACC "_sptok)};
+constexpr auto endAccLine{space >> endOfLine};
 
 // Autogenerated clauses parser. Information is taken from ACC.td and the
 // parser is generated by tablegen.
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 3398b395f198f8..1639e900903fe9 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2705,7 +2705,6 @@ class UnparseVisitor {
   void Unparse(const CLASS::ENUM &x) { Word(CLASS::EnumToString(x)); }
   WALK_NESTED_ENUM(AccDataModifier, Modifier)
   WALK_NESTED_ENUM(AccessSpec, Kind) // R807
-  WALK_NESTED_ENUM(AccReductionOperator, Operator)
   WALK_NESTED_ENUM(common, TypeParamAttr) // R734
   WALK_NESTED_ENUM(common, CUDADataAttr) // CUDA
   WALK_NESTED_ENUM(common, CUDASubprogramAttrs) // CUDA
@@ -2736,6 +2735,31 @@ class UnparseVisitor {
   WALK_NESTED_ENUM(OmpOrderClause, Type) // OMP order-type
   WALK_NESTED_ENUM(OmpOrderModifier, Kind) // OMP order-modifier
 #undef WALK_NESTED_ENUM
+  void Unparse(const AccReductionOperator::Operator x) {
+    switch (x) {
+    case AccReductionOperator::Operator::Plus:
+      Word("+");
+      break;
+    case AccReductionOperator::Operator::Multiply:
+      Word("*");
+      break;
+    case AccReductionOperator::Operator::And:
+      Word(".AND.");
+      break;
+    case AccReductionOperator::Operator::Or:
+      Word(".OR.");
+      break;
+    case AccReductionOperator::Operator::Eqv:
+      Word(".EQV.");
+      break;
+    case AccReductionOperator::Operator::Neqv:
+      Word(".NEQV.");
+      break;
+    default:
+      Word(AccReductionOperator::EnumToString(x));
+      break;
+    }
+  }
 
   void Unparse(const CUFKernelDoConstruct::StarOrExpr &x) {
     if (x.v) {
@@ -2768,13 +2792,19 @@ class UnparseVisitor {
     if (const auto &stream{std::get<3>(x.t)}) {
       Word(",STREAM="), Walk(*stream);
     }
-    Word(">>>\n");
+    Word(">>>");
+    Walk(" ", std::get<std::list<CUFReduction>>(x.t), " ");
+    Word("\n");
   }
-
   void Unparse(const CUFKernelDoConstruct &x) {
     Walk(std::get<CUFKernelDoConstruct::Directive>(x.t));
     Walk(std::get<std::optional<DoConstruct>>(x.t));
   }
+  void Unparse(const CUFReduction &x) {
+    Word("REDUCE(");
+    Walk(std::get<CUFReduction::Operator>(x.t));
+    Walk(":", std::get<std::list<Scalar<Variable>>>(x.t), ",", ")");
+  }
 
   void Done() const { CHECK(indent_ == 0); }
 
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 96ab9023926330..45217ed2e3ccd2 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -463,6 +463,46 @@ static int DoConstructTightNesting(
   return 1;
 }
 
+static void CheckReduce(
+    SemanticsContext &context, const parser::CUFReduction &reduce) {
+  auto op{std::get<parser::CUFReduction::Operator>(reduce.t).v};
+  for (const auto &var :
+      std::get<std::list<parser::Scalar<parser::Variable>>>(reduce.t)) {
+    if (const auto &typedExprPtr{var.thing.typedExpr};
+        typedExprPtr && typedExprPtr->v) {
+      const auto &expr{*typedExprPtr->v};
+      if (auto type{expr.GetType()}) {
+        auto cat{type->category()};
+        bool isOk{false};
+        switch (op) {
+        case parser::AccReductionOperator::Operator::Plus:
+        case parser::AccReductionOperator::Operator::Multiply:
+        case parser::AccReductionOperator::Operator::Max:
+        case parser::AccReductionOperator::Operator::Min:
+          isOk = cat == TypeCategory::Integer || cat == TypeCategory::Real;
+          break;
+        case parser::AccReductionOperator::Operator::Iand:
+        case parser::AccReductionOperator::Operator::Ior:
+        case parser::AccReductionOperator::Operator::Ieor:
+          isOk = cat == TypeCategory::Integer;
+          break;
+        case parser::AccReductionOperator::Operator::And:
+        case parser::AccReductionOperator::Operator::Or:
+        case parser::AccReductionOperator::Operator::Eqv:
+        case parser::AccReductionOperator::Operator::Neqv:
+          isOk = cat == TypeCategory::Logical;
+          break;
+        }
+        if (!isOk) {
+          context.Say(var.thing.GetSource(),
+              "!$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type %s"_err_en_US,
+              type->AsFortran());
+        }
+      }
+    }
+  }
+}
+
 void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
   auto source{std::get<parser::CUFKernelDoConstruct::Directive>(x.t).source};
   const auto &directive{std::get<parser::CUFKernelDoConstruct::Directive>(x.t)};
@@ -489,6 +529,10 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
   if (innerBlock) {
     DeviceContextChecker<true>{context_}.Check(*innerBlock);
   }
+  for (const auto &reduce :
+      std::get<std::list<parser::CUFReduction>>(directive.t)) {
+    CheckReduce(context_, reduce);
+  }
 }
 
 void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index ce7870b8d54e4d..527a1a9539aa6c 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -1357,6 +1357,15 @@ bool CheckHelper::IsResultOkToDiffer(const FunctionResult &result) {
 
 void CheckHelper::CheckSubprogram(
     const Symbol &symbol, const SubprogramDetails &details) {
+  // Evaluate a procedure definition's characteristics to flush out
+  // any errors that analysis might expose, in case this subprogram hasn't
+  // had any calls in this compilation unit that would have validated them.
+  if (!context_.HasError(symbol) && !details.isDummy() &&
+      !details.isInterface() && !details.stmtFunction()) {
+    if (!Procedure::Characterize(symbol, foldingContext_)) {
+      context_.SetError(symbol);
+    }
+  }
   if (const Symbol *iface{FindSeparateModuleSubprogramInterface(&symbol)}) {
     SubprogramMatchHelper{*this}.Check(symbol, *iface);
   }
@@ -2882,7 +2891,8 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
       } else {
         bool interoperableParent{true};
         if (parent->symbol()) {
-          auto bad{WhyNotInteroperableDerivedType(*parent->symbol(), false)};
+          auto bad{WhyNotInteroperableDerivedType(
+              *parent->symbol(), /*isError=*/false)};
           if (bad.AnyFatalError()) {
             auto &msg{msgs.Say(symbol.name(),
                 "The parent of an interoperable type is not interoperable"_err_en_US)};
@@ -2972,6 +2982,9 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
       }
     }
   }
+  if (msgs.AnyFatalError()) {
+    examinedByWhyNotInteroperableDerivedType_.erase(symbol);
+  }
   return msgs;
 }
 
@@ -3059,8 +3072,8 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
           }
           context_.SetError(symbol);
         } else if (auto bad{WhyNotInteroperableDerivedType(
-                       derived->typeSymbol(), false)};
-                   !bad.empty()) {
+                       derived->typeSymbol(), /*isError=*/false)};
+                   bad.AnyFatalError()) {
           if (auto *msg{messages_.Say(symbol.name(),
                   "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)}) {
             msg->Attach(
@@ -3068,7 +3081,9 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
             bad.AttachTo(*msg, parser::Severity::None);
           }
           context_.SetError(symbol);
-        } else {
+        } else if (context_.ShouldWarn(
+                       common::LanguageFeature::NonBindCInteroperability) &&
+            !InModuleFile()) {
           if (auto *msg{messages_.Say(symbol.name(),
                   "The derived type of an interoperable object should be BIND(C)"_warn_en_US)}) {
             msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
@@ -3142,7 +3157,7 @@ void CheckHelper::CheckBindC(const Symbol &symbol) {
       }
     }
   } else if (symbol.has<DerivedTypeDetails>()) {
-    if (auto msgs{WhyNotInteroperableDerivedType(symbol, false)};
+    if (auto msgs{WhyNotInteroperableDerivedType(symbol, /*isError=*/false)};
         !msgs.empty()) {
       bool anyFatal{msgs.AnyFatalError()};
       if (msgs.AnyFatalError() ||
diff --git a/flang/lib/Semantics/check-purity.cpp b/flang/lib/Semantics/check-purity.cpp
index 5176390f366bd5..55a9a2f107388d 100644
--- a/flang/lib/Semantics/check-purity.cpp
+++ b/flang/lib/Semantics/check-purity.cpp
@@ -39,12 +39,16 @@ bool PurityChecker::InPureSubprogram() const {
 
 bool PurityChecker::HasPurePrefix(
     const std::list<parser::PrefixSpec> &prefixes) const {
+  bool result{false};
   for (const parser::PrefixSpec &prefix : prefixes) {
-    if (std::holds_alternative<parser::PrefixSpec::Pure>(prefix.u)) {
-      return true;
+    if (std::holds_alternative<parser::PrefixSpec::Impure>(prefix.u)) {
+      return false;
+    } else if (std::holds_alternative<parser::PrefixSpec::Pure>(prefix.u) ||
+        std::holds_alternative<parser::PrefixSpec::Elemental>(prefix.u)) {
+      result = true;
     }
   }
-  return false;
+  return result;
 }
 
 void PurityChecker::Entered(
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index e9aebe5b08f2ba..bb8c6c7567b8d7 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -132,11 +132,11 @@ static std::string ModFileName(const SourceName &name,
 
 // Write the module file for symbol, which must be a module or submodule.
 void ModFileWriter::Write(const Symbol &symbol) {
-  auto &module{symbol.get<ModuleDetails>()};
+  const auto &module{symbol.get<ModuleDetails>()};
   if (module.moduleFileHash()) {
     return; // already written
   }
-  auto *ancestor{module.ancestor()};
+  const auto *ancestor{module.ancestor()};
   isSubmodule_ = ancestor != nullptr;
   auto ancestorName{ancestor ? ancestor->GetName().value().ToString() : ""s};
   auto path{context_.moduleDirectory() + '/' +
@@ -151,6 +151,21 @@ void ModFileWriter::Write(const Symbol &symbol) {
   const_cast<ModuleDetails &>(module).set_moduleFileHash(checkSum);
 }
 
+void ModFileWriter::WriteClosure(llvm::raw_ostream &out, const Symbol &symbol,
+    UnorderedSymbolSet &nonIntrinsicModulesWritten) {
+  if (!symbol.has<ModuleDetails>() || symbol.owner().IsIntrinsicModules() ||
+      !nonIntrinsicModulesWritten.insert(symbol).second) {
+    return;
+  }
+  PutSymbols(DEREF(symbol.scope()));
+  needsBuf_.clear(); // omit module checksums
+  auto str{GetAsString(symbol)};
+  for (auto depRef : std::move(usedNonIntrinsicModules_)) {
+    WriteClosure(out, *depRef, nonIntrinsicModulesWritten);
+  }
+  out << std::move(str);
+}
+
 // Return the entire body of the module file
 // and clear saved uses, decls, and contains.
 std::string ModFileWriter::GetAsString(const Symbol &symbol) {
@@ -710,6 +725,7 @@ void ModFileWriter::PutUse(const Symbol &symbol) {
     uses_ << "use,intrinsic::";
   } else {
     uses_ << "use ";
+    usedNonIntrinsicModules_.insert(module);
   }
   uses_ << module.name() << ",only:";
   PutGenericName(uses_, symbol);
diff --git a/flang/lib/Semantics/mod-file.h b/flang/lib/Semantics/mod-file.h
index b4ece4018c054d..739add32c2e0ee 100644
--- a/flang/lib/Semantics/mod-file.h
+++ b/flang/lib/Semantics/mod-file.h
@@ -35,6 +35,8 @@ class ModFileWriter {
 public:
   explicit ModFileWriter(SemanticsContext &context) : context_{context} {}
   bool WriteAll();
+  void WriteClosure(llvm::raw_ostream &, const Symbol &,
+      UnorderedSymbolSet &nonIntrinsicModulesWritten);
 
 private:
   SemanticsContext &context_;
@@ -46,6 +48,7 @@ class ModFileWriter {
   std::string containsBuf_;
   // Tracks nested DEC structures and fields of that type
   UnorderedSymbolSet emittedDECStructures_, emittedDECFields_;
+  UnorderedSymbolSet usedNonIntrinsicModules_;
 
   llvm::raw_string_ostream needs_{needsBuf_};
   llvm::raw_string_ostream uses_{usesBuf_};
diff --git a/flang/lib/Semantics/resolve-directives.h b/flang/lib/Semantics/resolve-directives.h
index 4aef8ad6c40081..5a890c26aa3349 100644
--- a/flang/lib/Semantics/resolve-directives.h
+++ b/flang/lib/Semantics/resolve-directives.h
@@ -21,7 +21,7 @@ class SemanticsContext;
 
 // Name resolution for OpenACC and OpenMP directives
 void ResolveAccParts(
-    SemanticsContext &, const parser::ProgramUnit &, Scope *topScope = {});
+    SemanticsContext &, const parser::ProgramUnit &, Scope *topScope);
 void ResolveOmpParts(SemanticsContext &, const parser::ProgramUnit &);
 void ResolveOmpTopLevelParts(SemanticsContext &, const parser::Program &);
 
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index e2875081b732c7..40eee89de131a3 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -5013,8 +5013,7 @@ bool DeclarationVisitor::HasCycle(
     if (procsInCycle.count(*interface) > 0) {
       for (const auto &procInCycle : procsInCycle) {
         Say(procInCycle->name(),
-            "The interface for procedure '%s' is recursively "
-            "defined"_err_en_US,
+            "The interface for procedure '%s' is recursively defined"_err_en_US,
             procInCycle->name());
         context().SetError(*procInCycle);
       }
@@ -8941,7 +8940,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) {
   FinishSpecificationParts(root);
   ResolveExecutionParts(root);
   FinishExecutionParts(root);
-  ResolveAccParts(context(), x);
+  ResolveAccParts(context(), x, /*topScope=*/nullptr);
   ResolveOmpParts(context(), x);
   return false;
 }
diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp
index 67016e85777c7e..c451f885c06279 100644
--- a/flang/lib/Semantics/unparse-with-symbols.cpp
+++ b/flang/lib/Semantics/unparse-with-symbols.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Semantics/unparse-with-symbols.h"
+#include "mod-file.h"
 #include "flang/Parser/parse-tree-visitor.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/unparse.h"
@@ -98,4 +99,41 @@ void UnparseWithSymbols(llvm::raw_ostream &out, const parser::Program &program,
           int indent) { visitor.PrintSymbols(location, out, indent); }};
   parser::Unparse(out, program, encoding, false, true, &preStatement);
 }
+
+// UnparseWithModules()
+
+class UsedModuleVisitor {
+public:
+  UnorderedSymbolSet &modulesUsed() { return modulesUsed_; }
+  UnorderedSymbolSet &modulesDefined() { return modulesDefined_; }
+  template <typename T> bool Pre(const T &) { return true; }
+  template <typename T> void Post(const T &) {}
+  void Post(const parser::ModuleStmt &module) {
+    if (module.v.symbol) {
+      modulesDefined_.insert(*module.v.symbol);
+    }
+  }
+  void Post(const parser::UseStmt &use) {
+    if (use.moduleName.symbol) {
+      modulesUsed_.insert(*use.moduleName.symbol);
+    }
+  }
+
+private:
+  UnorderedSymbolSet modulesUsed_;
+  UnorderedSymbolSet modulesDefined_;
+};
+
+void UnparseWithModules(llvm::raw_ostream &out, SemanticsContext &context,
+    const parser::Program &program, parser::Encoding encoding) {
+  UsedModuleVisitor visitor;
+  parser::Walk(program, visitor);
+  UnorderedSymbolSet nonIntrinsicModulesWritten{
+      std::move(visitor.modulesDefined())};
+  ModFileWriter writer{context};
+  for (SymbolRef moduleRef : visitor.modulesUsed()) {
+    writer.WriteClosure(out, *moduleRef, nonIntrinsicModulesWritten);
+  }
+  parser::Unparse(out, program, encoding, false, true);
+}
 } // namespace Fortran::semantics
diff --git a/flang/test/Driver/unparse-with-modules.f90 b/flang/test/Driver/unparse-with-modules.f90
new file mode 100644
index 00000000000000..53997f7804efa4
--- /dev/null
+++ b/flang/test/Driver/unparse-with-modules.f90
@@ -0,0 +1,34 @@
+! RUN: %flang_fc1 -I %S/Inputs/module-dir -fdebug-unparse-with-modules %s | FileCheck %s
+module m1
+  use iso_fortran_env
+  use BasicTestModuleTwo
+  implicit none
+  type(t2) y
+  real(real32) x
+end
+
+program test
+  use m1
+  use BasicTestModuleTwo
+  implicit none
+  x = 123.
+  y = t2()
+end
+
+!CHECK-NOT: module iso_fortran_env
+!CHECK: module basictestmoduletwo
+!CHECK: type::t2
+!CHECK: end type
+!CHECK: end
+!CHECK: module m1
+!CHECK:  use :: iso_fortran_env
+!CHECK:  implicit none
+!CHECK:  real(kind=real32) x
+!CHECK: end module
+!CHECK: program test
+!CHECK:  use :: m1
+!CHECK:  use :: basictestmoduletwo
+!CHECK:  implicit none
+!CHECK:  x = 123.
+!CHECK:  y = t2()
+!CHECK: end program
diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common
index b097a6aa300458..9d73204e3f5f6c 100644
--- a/flang/test/Parser/cuf-sanity-common
+++ b/flang/test/Parser/cuf-sanity-common
@@ -23,12 +23,19 @@ module m
   end subroutine
   subroutine test
     logical isPinned
+    real a(10), x, y, z
     !$cuf kernel do(1) <<<*, *, stream = 1>>>
     do j = 1, 10
     end do
     !$cuf kernel do <<<1, (2, 3), stream = 1>>>
     do j = 1, 10
     end do
+    !$cuf kernel do <<<*, *>>> reduce(+:x,y) reduce(*:z)
+    do j = 1, 10
+      x = x + a(j)
+      y = y + a(j)
+      z = z * a(j)
+    end do
     call globalsub<<<1, 2>>>
     call globalsub<<<1, 2, 3>>>
     call globalsub<<<1, 2, 3, 4>>>
diff --git a/flang/test/Parser/cuf-sanity-unparse.CUF b/flang/test/Parser/cuf-sanity-unparse.CUF
index b6921e74fc05ae..d4be347dd044ea 100644
--- a/flang/test/Parser/cuf-sanity-unparse.CUF
+++ b/flang/test/Parser/cuf-sanity-unparse.CUF
@@ -34,6 +34,12 @@ include "cuf-sanity-common"
 !CHECK:   !$CUF KERNEL DO <<<1_4,(2_4,3_4),STREAM=1_4>>>
 !CHECK:   DO j=1_4,10_4
 !CHECK:   END DO
+!CHECK:   !$CUF KERNEL DO <<<*,*>>> REDUCE(+:x,y) REDUCE(*:z)
+!CHECK:   DO j=1_4,10_4
+!CHECK:    x=x+a(int(j,kind=8))
+!CHECK:    y=y+a(int(j,kind=8))
+!CHECK:    z=z*a(int(j,kind=8))
+!CHECK:   END DO
 !CHECK:    CALL globalsub<<<1_4,2_4>>>()
 !CHECK:    CALL globalsub<<<1_4,2_4,3_4>>>()
 !CHECK:    CALL globalsub<<<1_4,2_4,3_4,4_4>>>()
diff --git a/flang/test/Semantics/bind-c15.f90 b/flang/test/Semantics/bind-c15.f90
new file mode 100644
index 00000000000000..9aaad52cc0e0a4
--- /dev/null
+++ b/flang/test/Semantics/bind-c15.f90
@@ -0,0 +1,45 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
+
+module m
+  type, bind(c) :: explicit_bind_c
+    real a
+  end type
+  type :: interoperable1
+    type(explicit_bind_c) a
+  end type
+  type, extends(interoperable1) :: interoperable2
+    real b
+  end type
+  type :: non_interoperable1
+    real, allocatable :: a
+  end type
+  type :: non_interoperable2
+    type(non_interoperable1) b
+  end type
+  interface
+    subroutine sub_bind_c_1(x_bind_c) bind(c)
+      import explicit_bind_c
+      type(explicit_bind_c), intent(in) :: x_bind_c
+    end
+    subroutine sub_bind_c_2(x_interop1) bind(c)
+      import interoperable1
+      !WARNING: The derived type of an interoperable object should be BIND(C)
+      type(interoperable1), intent(in) :: x_interop1
+    end
+    subroutine sub_bind_c_3(x_interop2) bind(c)
+      import interoperable2
+      !WARNING: The derived type of an interoperable object should be BIND(C)
+      type(interoperable2), intent(in) :: x_interop2
+    end
+    subroutine sub_bind_c_4(x_non_interop1) bind(c)
+      import non_interoperable1
+      !ERROR: The derived type of an interoperable object must be interoperable, but is not
+      type(non_interoperable1), intent(in) :: x_non_interop1
+    end
+    subroutine sub_bind_c_5(x_non_interop2) bind(c)
+      import non_interoperable2
+      !ERROR: The derived type of an interoperable object must be interoperable, but is not
+      type(non_interoperable2), intent(in) :: x_non_interop2
+    end
+  end interface
+end
diff --git a/flang/test/Semantics/entry01.f90 b/flang/test/Semantics/entry01.f90
index 970cd109921a15..765b18c2e81a8a 100644
--- a/flang/test/Semantics/entry01.f90
+++ b/flang/test/Semantics/entry01.f90
@@ -83,6 +83,7 @@ function ifunc()
   !ERROR: 'ibad1' is already declared in this scoping unit
   entry ibad1() result(ibad1res) ! C1570
   !ERROR: 'ibad2' is already declared in this scoping unit
+  !ERROR: Procedure 'ibad2' is referenced before being sufficiently defined in a context where it must be so
   entry ibad2()
   !ERROR: ENTRY in a function may not have an alternate return dummy argument
   entry ibadalt(*) ! C1573
@@ -91,6 +92,7 @@ function ifunc()
   entry iok()
   !ERROR: Explicit RESULT('iok') of function 'isameres2' cannot have the same name as a distinct ENTRY into the same scope
   entry isameres2() result(iok) ! C1574
+  !ERROR: Procedure 'iok2' is referenced before being sufficiently defined in a context where it must be so
   !ERROR: Explicit RESULT('iok2') of function 'isameres3' cannot have the same name as a distinct ENTRY into the same scope
   entry isameres3() result(iok2) ! C1574
   !ERROR: 'iok2' is already declared in this scoping unit
diff --git a/flang/test/Semantics/pure02.f90 b/flang/test/Semantics/pure02.f90
new file mode 100644
index 00000000000000..11dc0fd268293b
--- /dev/null
+++ b/flang/test/Semantics/pure02.f90
@@ -0,0 +1,59 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+pure subroutine s1
+ contains
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  subroutine t1
+  end
+  pure subroutine t2 ! ok
+  end
+  elemental subroutine t3(k) ! ok
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  impure elemental subroutine t4(k)
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  elemental impure subroutine t5(k)
+    integer, intent(in) :: k
+  end
+end
+
+elemental subroutine s2(j)
+  integer, intent(in) :: j
+ contains
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  subroutine t1
+  end
+  pure subroutine t2 ! ok
+  end
+  elemental subroutine t3(k) ! ok
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  impure elemental subroutine t4(k)
+    integer, intent(in) :: k
+  end
+  !ERROR: An internal subprogram of a pure subprogram must also be pure
+  elemental impure subroutine t5(k)
+    integer, intent(in) :: k
+  end
+end
+
+impure elemental subroutine s3(j)
+  integer, intent(in) :: j
+ contains
+  subroutine t1
+  end
+  pure subroutine t2
+  end
+  elemental subroutine t3(k)
+    integer, intent(in) :: k
+  end
+  impure elemental subroutine t4(k)
+    integer, intent(in) :: k
+  end
+  elemental impure subroutine t5(k)
+    integer, intent(in) :: k
+  end
+end
diff --git a/flang/test/Semantics/reduce.cuf b/flang/test/Semantics/reduce.cuf
new file mode 100644
index 00000000000000..95ff2e87c09b47
--- /dev/null
+++ b/flang/test/Semantics/reduce.cuf
@@ -0,0 +1,72 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+subroutine s(n,m,a,l)
+  integer, intent(in) :: n
+  integer, intent(in) :: m(n)
+  real, intent(in) :: a(n)
+  logical, intent(in) :: l(n)
+  integer j, mr
+  real ar
+  logical lr
+!$cuf kernel do <<<*,*>>> reduce (+:mr,ar)
+  do j=1,n; mr = mr + m(j); ar = ar + a(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (+:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (*:mr,ar)
+  do j=1,n; mr = mr * m(j); ar = ar * a(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (*:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (max:mr,ar)
+  do j=1,n; mr = max(mr,m(j)); ar = max(ar,a(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (max:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (min:mr,ar)
+  do j=1,n; mr = min(mr,m(j)); ar = min(ar,a(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (min:lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (iand:mr)
+  do j=1,n; mr = iand(mr,m(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (iand:ar,lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (ieor:mr)
+  do j=1,n; mr = ieor(mr,m(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (ieor:ar,lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (ior:mr)
+  do j=1,n; mr = ior(mr,m(j)); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type LOGICAL(4)
+!$cuf kernel do <<<*,*>>> reduce (ior:ar,lr)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.and.:lr)
+  do j=1,n; lr = lr .and. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.and.:mr,ar)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.eqv.:lr)
+  do j=1,n; lr = lr .eqv. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.eqv.:mr,ar)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.neqv.:lr)
+  do j=1,n; lr = lr .neqv. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.neqv.:mr,ar)
+  do j=1,n; end do
+!$cuf kernel do <<<*,*>>> reduce (.or.:lr)
+  do j=1,n; lr = lr .or. l(j); end do
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type INTEGER(4)
+!ERROR: !$CUF KERNEL DO REDUCE operation is not acceptable for a variable with type REAL(4)
+!$cuf kernel do <<<*,*>>> reduce (.or.:mr,ar)
+  do j=1,n; end do
+end
diff --git a/flang/test/Semantics/resolve102.f90 b/flang/test/Semantics/resolve102.f90
index 8f6e2246a57e79..33cf6fa245ea49 100644
--- a/flang/test/Semantics/resolve102.f90
+++ b/flang/test/Semantics/resolve102.f90
@@ -4,17 +4,12 @@
 !ERROR: Procedure 'sub' is recursively defined.  Procedures in the cycle: 'sub', 'p2'
 subroutine sub(p2)
   PROCEDURE(sub) :: p2
-
-  call sub()
 end subroutine
 
 subroutine circular
-  !ERROR: Procedure 'p' is recursively defined.  Procedures in the cycle: 'p', 'sub', 'p2'
   procedure(sub) :: p
-
-  call p(sub)
-
   contains
+    !ERROR: Procedure 'sub' is recursively defined.  Procedures in the cycle: 'p', 'sub', 'p2'
     subroutine sub(p2)
       procedure(p) :: p2
     end subroutine
@@ -41,11 +36,10 @@ subroutine sub(p2)
 
 subroutine mutual
   Procedure(sub1) :: p
-
-  Call p(sub)
-
   contains
     !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'p', 'sub1', 'arg'
+    !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'sub1', 'arg', 'sub', 'p2'
+    !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'sub1', 'arg'
     Subroutine sub1(arg)
       procedure(sub1) :: arg
     End Subroutine
@@ -57,15 +51,14 @@ Subroutine sub(p2)
 
 subroutine mutual1
   Procedure(sub1) :: p
-
-  Call p(sub)
-
   contains
     !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'p', 'sub1', 'arg', 'sub', 'p2'
+    !ERROR: Procedure 'sub1' is recursively defined.  Procedures in the cycle: 'sub1', 'arg', 'sub', 'p2'
     Subroutine sub1(arg)
       procedure(sub) :: arg
     End Subroutine
 
+    !ERROR: Procedure 'sub' is recursively defined.  Procedures in the cycle: 'sub1', 'arg', 'sub', 'p2'
     Subroutine sub(p2)
       Procedure(sub1) :: p2
     End Subroutine
@@ -76,8 +69,6 @@ subroutine twoCycle
   !ERROR: The interface for procedure 'p2' is recursively defined
   procedure(p1) p2
   procedure(p2) p1
-  call p1
-  call p2
 end subroutine
 
 subroutine threeCycle
@@ -87,9 +78,6 @@ subroutine threeCycle
   !ERROR: The interface for procedure 'p3' is recursively defined
   procedure(p2) p3
   procedure(p3) p1
-  call p1
-  call p2
-  call p3
 end subroutine
 
 module mutualSpecExprs
@@ -118,4 +106,3 @@ function ifunc(x)
     ifunc = x
   end
 end
-
diff --git a/lld/test/ELF/arm-exidx-shared.s b/lld/test/ELF/arm-exidx-shared.s
index fce605d6d96aa5..2e484e5c065fee 100644
--- a/lld/test/ELF/arm-exidx-shared.s
+++ b/lld/test/ELF/arm-exidx-shared.s
@@ -2,7 +2,7 @@
 // RUN: llvm-mc -filetype=obj -arm-add-build-attributes -triple=armv7a-none-linux-gnueabi %s -o %t
 // RUN: ld.lld --hash-style=sysv %t --shared -o %t2
 // RUN: llvm-readobj --relocations %t2 | FileCheck %s
-// RUN: llvm-objdump -s --triple=armv7a-none-linux-gnueabi %t2 | FileCheck --check-prefix=CHECK-EXTAB-NEXT %s
+// RUN: llvm-objdump -s --triple=armv7a-none-linux-gnueabi %t2 | FileCheck --check-prefix=CHECK-EXTAB %s
 
 // Check that the relative R_ARM_PREL31 relocation can access a PLT entry
 // for when the personality routine is referenced from a shared library.
diff --git a/lld/test/ELF/mips-tls-hilo.s b/lld/test/ELF/mips-tls-hilo.s
index 6fd2033aac4142..9c67f9fe14ba6f 100644
--- a/lld/test/ELF/mips-tls-hilo.s
+++ b/lld/test/ELF/mips-tls-hilo.s
@@ -28,16 +28,6 @@
 # CHECK-NEXT: ]
 # CHECK-NOT:  Primary GOT
 
-# SO:      Relocations [
-# SO-NEXT: ]
-# SO:      Primary GOT {
-# SO:        Local entries [
-# SO-NEXT:   ]
-# SO-NEXT:   Global entries [
-# SO-NEXT:   ]
-# SO-NEXT:   Number of TLS and multi-GOT entries: 0
-# SO-NEXT: }
-
   .text
   .globl  __start
   .type __start,@function
diff --git a/lld/test/ELF/ppc32-reloc-rel.s b/lld/test/ELF/ppc32-reloc-rel.s
index b89e0b43cb785c..d13ebdb7997f7c 100644
--- a/lld/test/ELF/ppc32-reloc-rel.s
+++ b/lld/test/ELF/ppc32-reloc-rel.s
@@ -6,6 +6,7 @@
 # RUN: llvm-mc -filetype=obj -triple=powerpcle %s -o %t.le.o
 # RUN: ld.lld %t.le.o -o %t
 # RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+# RUN: llvm-objdump -s %t | FileCheck %s --check-prefix=HEX
 
 .section .R_PPC_REL14,"ax",@progbits
   beq 1f
@@ -23,7 +24,7 @@
   .long 1f - .
 1:
 # HEX-LABEL: section .R_PPC_REL32:
-# HEX-NEXT: 10010008 00000004
+# HEX-NEXT:  04000000
 
 .section .R_PPC_PLTREL24,"ax",@progbits
   b 1f@PLT+32768
diff --git a/lld/test/ELF/ppc64-pcrel-call-to-extern.s b/lld/test/ELF/ppc64-pcrel-call-to-extern.s
index e5846e80ce2382..de05b733e175a2 100644
--- a/lld/test/ELF/ppc64-pcrel-call-to-extern.s
+++ b/lld/test/ELF/ppc64-pcrel-call-to-extern.s
@@ -73,9 +73,7 @@
 
 ## DT_PLTGOT points to .plt
 # SEC: .plt              NOBITS          0000000010030168 040168 000028 00  WA  0   0  8
-# SEC-OG: .plt              NOBITS          0000000010030158 040158 000028 00  WA  0   0  8
 # SEC: 0x0000000000000003 (PLTGOT)      0x10030168
-# SEC-OG: 0x0000000000000003 (PLTGOT)      0x10030168
 
 ## DT_PLTGOT points to .plt
 # SEC-NOP10: .plt              NOBITS          0000000010030168 040168 000028 00  WA  0   0  8
@@ -86,11 +84,8 @@
 ## Check that we emit 3 R_PPC64_JMP_SLOT in .rela.plt.
 # REL:      .rela.plt {
 # REL-NEXT:   0x10030178 R_PPC64_JMP_SLOT callee_global_stother0 0x0
-# REL-NEXT-OG:   0x10030168 R_PPC64_JMP_SLOT callee_global_stother0 0x0
 # REL-NEXT:   0x10030180 R_PPC64_JMP_SLOT callee_global_stother1 0x0
-# REL-NEXT-OG:   0x10030170 R_PPC64_JMP_SLOT callee_global_stother1 0x0
 # REL-NEXT:   0x10030188 R_PPC64_JMP_SLOT callee_global_TOC 0x0
-# REL-NEXT-OG:   0x10030178 R_PPC64_JMP_SLOT callee_global_TOC 0x0
 # REL-NEXT: }
 
 # REL-NOP10:      .rela.plt {
diff --git a/lld/test/ELF/ppc64-toc-relax-ifunc.s b/lld/test/ELF/ppc64-toc-relax-ifunc.s
index 9fb1bf0023b6d4..00a63c7e5b6772 100644
--- a/lld/test/ELF/ppc64-toc-relax-ifunc.s
+++ b/lld/test/ELF/ppc64-toc-relax-ifunc.s
@@ -4,7 +4,7 @@
 # RUN: echo '.globl ifunc; .type ifunc, %gnu_indirect_function; ifunc:' | \
 # RUN:   llvm-mc -filetype=obj -triple=powerpc64le - -o %t1.o
 # RUN: ld.lld %t.o %t1.o -o %t
-# RUN: llvm-readelf -S -s %t | FileCheck --check-prefix=SEC %s
+# RUN: llvm-readelf -Ssr %t | FileCheck --check-prefix=SEC %s
 # RUN: llvm-readelf -x .toc %t | FileCheck --check-prefix=HEX %s
 # RUN: llvm-objdump -d %t | FileCheck --check-prefix=DIS %s
 
@@ -13,18 +13,15 @@
 ## still perform toc-indirect to toc-relative relaxation because the distance
 ## to the address of the canonical PLT is fixed.
 
-# SEC: .text PROGBITS 00000000100101e0
-# SEC: .plt  NOBITS   0000000010030200
-# SEC: 00000000100101e8 0 FUNC GLOBAL DEFAULT 3 ifunc
+# SEC: .text PROGBITS [[#%x,TEXT:]]
+# SEC: .plt  NOBITS   [[#%x,PLT:]]
+# SEC: {{0*}}[[#PLT]] {{.+}} R_PPC64_IRELATIVE [[#TEXT+8]]
+# SEC: {{0*}}[[#TEXT+8]] 0 FUNC GLOBAL DEFAULT 3 ifunc
 
 ## .toc[0] stores the address of the canonical PLT.
 # HEX:      section '.toc':
 # HEX-NEXT: 0x100201f8 e8010110 00000000
 
-# REL:      .rela.dyn {
-# REL-NEXT:   0x100301f8 R_PPC64_IRELATIVE - 0x100101e8
-# REL-NEXT: }
-
 # DIS: addi 3, 3,
 
 addis 3, 2, .toc@toc@ha
diff --git a/lld/test/ELF/riscv-gp.s b/lld/test/ELF/riscv-gp.s
index 29411d19b01929..e82e36ee9a7ae5 100644
--- a/lld/test/ELF/riscv-gp.s
+++ b/lld/test/ELF/riscv-gp.s
@@ -16,10 +16,6 @@
 # SEC64: [ [[#SDATA:]]] .sdata PROGBITS {{0*}}000032e0
 # SEC64: {{0*}}00003ae0 0 NOTYPE GLOBAL DEFAULT [[#SDATA]] __global_pointer$
 
-## __global_pointer$ - 0x1000 = 4096*3-2048
-# DIS:      1000: auipc gp, 3
-# DIS-NEXT:       addi gp, gp, -2048
-
 # ERR: error: relocation R_RISCV_PCREL_HI20 cannot be used against symbol '__global_pointer$'; recompile with -fPIC
 
 ## -r mode does not define __global_pointer$.
diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index 2e8ad074256bf5..aeb24445551c1e 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -2,7 +2,7 @@
   "name": "lldb-dap",
   "displayName": "LLDB DAP",
   "version": "0.2.0",
-  "publisher": "llvm",
+  "publisher": "llvm-vs-code-extensions",
   "homepage": "https://lldb.llvm.org",
   "description": "LLDB debugging from VSCode",
   "license": "Apache 2.0 License with LLVM exceptions",
@@ -42,6 +42,7 @@
     "watch": "tsc -watch -p ./",
     "format": "npx prettier './src-ts/' --write",
     "package": "vsce package --out ./out/lldb-dap.vsix",
+    "publish": "vsce publish",
     "vscode-uninstall": "code --uninstall-extension llvm.lldb-dap",
     "vscode-install": "code --install-extension ./out/lldb-dap.vsix"
   },
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 4ced970e54d880..70b51876c1dde5 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 498829
+#define LLVM_MAIN_REVISION 498845
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index 15578c51a49578..edcf02c0946971 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
 #define LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
 
+#include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/Bitstream/BitstreamWriter.h"
 #include "llvm/ProfileData/CtxInstrContextNode.h"
 
@@ -20,7 +21,7 @@ namespace llvm {
 enum PGOCtxProfileRecords { Invalid = 0, Version, Guid, CalleeIndex, Counters };
 
 enum PGOCtxProfileBlockIDs {
-  ProfileMetadataBlockID = 100,
+  ProfileMetadataBlockID = bitc::FIRST_APPLICATION_BLOCKID,
   ContextNodeBlockID = ProfileMetadataBlockID + 1
 };
 
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index b61c59aacc0f95..c941b9d89df388 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -320,11 +320,8 @@ void InstrProfWriter::addBinaryIds(ArrayRef<llvm::object::BuildID> BIs) {
 }
 
 void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
-  if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
-    Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
-  if (Trace.FunctionNameRefs.empty())
-    return;
-
+  assert(Trace.FunctionNameRefs.size() <= MaxTemporalProfTraceLength);
+  assert(!Trace.FunctionNameRefs.empty());
   if (TemporalProfTraceStreamSize < TemporalProfTraceReservoirSize) {
     // Simply append the trace if we have not yet hit our reservoir size limit.
     TemporalProfTraces.push_back(std::move(Trace));
@@ -341,6 +338,10 @@ void InstrProfWriter::addTemporalProfileTrace(TemporalProfTraceTy Trace) {
 
 void InstrProfWriter::addTemporalProfileTraces(
     SmallVectorImpl<TemporalProfTraceTy> &SrcTraces, uint64_t SrcStreamSize) {
+  for (auto &Trace : SrcTraces)
+    if (Trace.FunctionNameRefs.size() > MaxTemporalProfTraceLength)
+      Trace.FunctionNameRefs.resize(MaxTemporalProfTraceLength);
+  llvm::erase_if(SrcTraces, [](auto &T) { return T.FunctionNameRefs.empty(); });
   // Assume that the source has the same reservoir size as the destination to
   // avoid needing to record it in the indexed profile format.
   bool IsDestSampled =
diff --git a/llvm/test/tools/llvm-profdata/trace-limit.proftext b/llvm/test/tools/llvm-profdata/trace-limit.proftext
index cf6edd648b23b2..e246ee890ba313 100644
--- a/llvm/test/tools/llvm-profdata/trace-limit.proftext
+++ b/llvm/test/tools/llvm-profdata/trace-limit.proftext
@@ -1,13 +1,17 @@
 # RUN: llvm-profdata merge --temporal-profile-max-trace-length=0 %s -o %t.profdata
 # RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefix=NONE
 
+# RUN: llvm-profdata merge --temporal-profile-trace-reservoir-size=2 %s %s %s %s -o %t.profdata
+# RUN: llvm-profdata merge --temporal-profile-trace-reservoir-size=2 --temporal-profile-max-trace-length=0 %t.profdata -o %t.profdata
+# RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefix=NONE
+
 # RUN: llvm-profdata merge --temporal-profile-max-trace-length=2 %s -o %t.profdata
 # RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefixes=CHECK,SOME
 
 # RUN: llvm-profdata merge --temporal-profile-max-trace-length=1000 %s -o %t.profdata
 # RUN: llvm-profdata show --temporal-profile-traces %t.profdata | FileCheck %s --check-prefixes=CHECK,ALL
 
-# NONE: Temporal Profile Traces (samples=0 seen=0):
+# NONE: Temporal Profile Traces (samples=0
 # CHECK: Temporal Profile Traces (samples=1 seen=1):
 # SOME:   Trace 0 (weight=1 count=2):
 # ALL:    Trace 0 (weight=1 count=3):
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index c6fc4e08aa7271..ddcaea5184d434 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -916,7 +916,10 @@ apple_genrule(
     srcs = [":debugserver_unsigned"],
     outs = ["debugserver"],
     cmd = "cp $(SRCS) $(OUTS) && xcrun codesign -f -s - --entitlements $(location tools/debugserver/resources/debugserver-macosx-entitlements.plist) $(OUTS)",
-    tags = ["nobuildkite"],
+    tags = [
+        "manual",
+        "nobuildkite",
+    ],
     target_compatible_with = select({
         "@platforms//os:macos": [],
         "//conditions:default": ["@platforms//:incompatible"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
index 21f0c7092f32e9..b44489e213a400 100644
--- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel
@@ -617,6 +617,7 @@ cc_test(
         allow_empty = False,
     ),
     deps = [
+        "//llvm:BitstreamReader",
         "//llvm:Core",
         "//llvm:Coverage",
         "//llvm:DebugInfo",