diff --git a/.github/workflows/docs-test.yml b/.github/workflows/docs-test.yml
index be80c197fc..562a305b7a 100644
--- a/.github/workflows/docs-test.yml
+++ b/.github/workflows/docs-test.yml
@@ -9,6 +9,10 @@ on:
     branches:
       - development
 
+env:
+  # enable color output from Sphinx
+  FORCE_COLOR: "1"
+
 jobs:
   docs:
     runs-on: ubuntu-latest
@@ -33,7 +37,10 @@ jobs:
       - name: Build docs
         run: |
           cd Docs/
-          make SPHINXOPTS=-v NO_DOXYGEN=TRUE html
+          # doxygen is skipped here (NO_DOXYGEN=TRUE), so drop the TOC entries for the
+          # pages it would generate; otherwise -W fails on the missing references
+          sed -i -e 's/^   filelist$//; s/^   classlist$//' source/index.rst
+          make SPHINXOPTS='-v -W --keep-going' NO_DOXYGEN=TRUE html
 
       - name: Check links
         run: |
diff --git a/.gitignore b/.gitignore
index 6f29568fcb..648030bee4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,8 @@ Docs/source/namespacelist.rst
 Docs/source/runtime_parameters.rst
 Docs/source/*_files.rst
 Docs/source/preprocessed_files
+Docs/source/yt_example.rst
+Docs/source/yt_example_files/
 
 
 amr_diag.out
diff --git a/Docs/rp.py b/Docs/rp.py
index 343d1f6dc6..5bb7f43301 100755
--- a/Docs/rp.py
+++ b/Docs/rp.py
@@ -7,17 +7,17 @@
 from more_itertools import unique_everseen
 
 MAIN_HEADER = """
-+--------------------------------------------+-------------------------------------------------------------+---------------+
-| parameter                                  | description                                                 | default value |
-+============================================+=============================================================+===============+
++--------------------------------------------+-------------------------------------------------------------+-----------------------------+
+| parameter                                  | description                                                 | default value               |
++============================================+=============================================================+=============================+
 """
 
 SEPARATOR = """
-+--------------------------------------------+-------------------------------------------------------------+---------------+
++--------------------------------------------+-------------------------------------------------------------+-----------------------------+
 """
 
 ENTRY = """
-| {:42} | {:59} | {:13} |
+| {:42} | {:59} | {:27} |
 """
 
 WRAP_LEN = 59
diff --git a/Docs/source/FlowChart.rst b/Docs/source/FlowChart.rst
index f24c4cfa7b..a47b22ad55 100644
--- a/Docs/source/FlowChart.rst
+++ b/Docs/source/FlowChart.rst
@@ -309,7 +309,7 @@ In the code, the objective is to evolve the state from the old time,
    A. Create ``Sborder``, initialized from ``S_old``
 
    B. Call ``clean_state()`` to make sure the thermodynamics are in
-     sync, in particular, compute the temperature.
+      sync, in particular, compute the temperature.
 
    C. [``SHOCK_VAR``] zero out the shock flag.
 
diff --git a/Docs/source/Hydrodynamics.rst b/Docs/source/Hydrodynamics.rst
index 8d76bd5fdf..edaaf8accd 100644
--- a/Docs/source/Hydrodynamics.rst
+++ b/Docs/source/Hydrodynamics.rst
@@ -342,6 +342,14 @@ accounted for in **Steps 1** and **6**. The source terms are:
    S_{{\rm ext},\rho Y_k}
    \end{array}\right)^n.
 
+.. index:: USE_SPECIES_SOURCES
+
+.. note:: To reduce memory usage, we do not include source terms for the
+   advected quantities, species, and auxiliary variables in the conserved
+   state vector by default. If your application needs external source terms for
+   these variables, set ``USE_SPECIES_SOURCES=TRUE`` when compiling so that space
+   will be allocated for them.
+
 Primitive Forms
 ===============
 
@@ -585,9 +593,6 @@ runtime parameters for hydrodynamics:
 
    See :ref:`sponge_section` for more details on the sponge.
 
--  ``castro.normalize_species``: enforce that :math:`\sum_i X_i = 1`
-   (0 or 1; default: 0)
-
 .. index:: castro.small_dens, castro.small_temp, castro.small_pres
 
 Several floors are imposed on the thermodynamic quantities to prevet unphysical
diff --git a/Docs/source/docutils.conf b/Docs/source/docutils.conf
new file mode 100644
index 0000000000..3b4d0981a5
--- /dev/null
+++ b/Docs/source/docutils.conf
@@ -0,0 +1,2 @@
+[parsers]
+line_length_limit = 1000000
diff --git a/Docs/source/index.rst b/Docs/source/index.rst
index 14b646d378..0500e0bd62 100644
--- a/Docs/source/index.rst
+++ b/Docs/source/index.rst
@@ -60,7 +60,7 @@ https://github.com/amrex-astro/Castro
 
    filelist
    classlist
-   .. namespacelist
+.. namespacelist
 
 .. toctree::
    :caption: References
diff --git a/Docs/source/io.rst b/Docs/source/io.rst
index 1a29b56988..2054eb4e1c 100644
--- a/Docs/source/io.rst
+++ b/Docs/source/io.rst
@@ -271,8 +271,6 @@ radiation quantities).
 | (where X is any of the species    | :math:`\omegadot_k = DX_k/Dt`                     |                                      |
 | defined in the network)           |                                                   |                                      |
 +-----------------------------------+---------------------------------------------------+--------------------------------------+
-| ``enuc``                          | Nuclear energy generation rate / gram             | :math:`{\rm erg~g^{-1}~s^{-1}}`      |
-+-----------------------------------+---------------------------------------------------+--------------------------------------+
 | ``rho_enuc``                      | Nuclear energy generation rate density            | :math:`{\rm erg~cm^{-3}~s^{-1}}`     |
 +-----------------------------------+---------------------------------------------------+--------------------------------------+
 | ``phiGrav``                       | Gravitational potential                           | :math:`{\rm erg~g^{-1}}`             |
@@ -393,7 +391,8 @@ Derived variables
 | ``y_velocity``,                   | :math:`\ub = (\rho \ub)/\rho`                     |                             |                                         |
 | ``z_velocity``                    |                                                   |                             |                                         |
 +-----------------------------------+---------------------------------------------------+-----------------------------+-----------------------------------------+
-
+| ``enuc``                          | Nuclear energy generation rate / gram             | ``derenuc``                 | :math:`{\rm erg~g^{-1}~s^{-1}}`         |
++-----------------------------------+---------------------------------------------------+-----------------------------+-----------------------------------------+
 
 problem-specific plotfile variables
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/Source/gravity/Gravity.cpp b/Source/gravity/Gravity.cpp
index 8ef59f0e46..35fab221d3 100644
--- a/Source/gravity/Gravity.cpp
+++ b/Source/gravity/Gravity.cpp
@@ -1844,8 +1844,13 @@ Gravity::fill_multipole_BCs(int crse_level, int fine_level, const Vector<MultiFa
         MultiFab::Copy(source, *Rhs[lev - crse_level], 0, 0, 1, 0);
 
         if (lev < fine_level) {
-            const MultiFab& mask = dynamic_cast<Castro*>(&(parent->getLevel(lev+1)))->build_fine_mask();
-            MultiFab::Multiply(source, mask, 0, 0, 1, 0);
+	    auto *castro_level = dynamic_cast<Castro*>(&(parent->getLevel(lev+1)));
+	    if (castro_level != nullptr) {
+		const MultiFab& mask = castro_level->build_fine_mask();
+		MultiFab::Multiply(source, mask, 0, 0, 1, 0);
+	    } else {
+                amrex::Abort("unable to access mask");
+            }
         }
 
         // Loop through the grids and compute the individual contributions
@@ -2968,7 +2973,11 @@ Gravity::set_mass_offset (Real time, bool multi_level)
         {
             for (int lev = 0; lev <= parent->finestLevel(); lev++) {
                 auto* cs = dynamic_cast<Castro*>(&parent->getLevel(lev));
-                mass_offset += cs->volWgtSum("density", time);
+		if (cs != nullptr) {
+		    mass_offset += cs->volWgtSum("density", time);
+		} else {
+                    amrex::Abort("unable to access volWgtSum");
+                }
             }
         }
         else
@@ -3132,9 +3141,13 @@ Gravity::make_radial_gravity(int level, Real time, RealVector& radial_grav)
         if (lev < level)
         {
             auto* fine_level = dynamic_cast<Castro*>(&(parent->getLevel(lev+1)));
-            const MultiFab& mask = fine_level->build_fine_mask();
-            for (int n = 0; n < NUM_STATE; ++n) {
-                MultiFab::Multiply(S, mask, 0, n, 1, 0);
+	    if (fine_level != nullptr) {
+		const MultiFab& mask = fine_level->build_fine_mask();
+		for (int n = 0; n < NUM_STATE; ++n) {
+		    MultiFab::Multiply(S, mask, 0, n, 1, 0);
+		}
+	    } else {
+                amrex::Abort("unable to create mask");
             }
         }
 
diff --git a/Source/reactions/Castro_react.cpp b/Source/reactions/Castro_react.cpp
index 0455ee0d07..73c5a3c873 100644
--- a/Source/reactions/Castro_react.cpp
+++ b/Source/reactions/Castro_react.cpp
@@ -186,12 +186,14 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
     MultiFab tmp_mask_mf;
     const MultiFab& mask_mf = mask_covered_zones ? getLevel(level+1).build_fine_mask() : tmp_mask_mf;
 
-    ReduceOps<ReduceOpSum> reduce_op;
-    ReduceData<Real> reduce_data(reduce_op);
-    using ReduceTuple = typename decltype(reduce_data)::Type;
+#if defined(AMREX_USE_GPU)
+    Gpu::Buffer<int> d_num_failed({0});
+    auto* p_num_failed = d_num_failed.data();
+#endif
+    int num_failed = 0;
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel reduction(+:num_failed)
 #endif
     for (MFIter mfi(s, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
@@ -208,8 +210,11 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
         const auto problo = geom.ProbLoArray();
 #endif
 
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) -> ReduceTuple
+#if defined(AMREX_USE_GPU)
+        ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
+#else
+        LoopOnCpu(bx, [&] (int i, int j, int k) mutable
+#endif
         {
 
             burn_t burn_state;
@@ -230,7 +235,7 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
 
             bool do_burn = true;
             burn_state.success = true;
-            Real burn_failed = 0.0_rt;
+            int burn_failed = 0;
 
             // Don't burn on zones inside shock regions, if the relevant option is set.
 
@@ -329,7 +334,7 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
                 // If we were unsuccessful, update the failure count.
 
                 if (!burn_state.success) {
-                    burn_failed = 1.0_rt;
+                    burn_failed = 1;
                 }
 
                 // Add burning rates to reactions MultiFab, but be
@@ -399,19 +404,25 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
 
             }
 
-
-            return {burn_failed};
-
+#if defined(AMREX_USE_GPU)
+            if (burn_failed) {
+                Gpu::Atomic::Add(p_num_failed, burn_failed);
+            }
+#else
+            num_failed += burn_failed;
+#endif
         });
 
+#if defined(AMREX_USE_HIP)
+        Gpu::streamSynchronize(); // otherwise HIP may fail to allocate the necessary resources.
+#endif
     }
 
-    ReduceTuple hv = reduce_data.value();
-    Real burn_failed = amrex::get<0>(hv);
+#if defined(AMREX_USE_GPU)
+    num_failed = *(d_num_failed.copyToHost());
+#endif
 
-    if (burn_failed != 0.0) {
-      burn_success = 0;
-    }
+    burn_success = !num_failed;
 
     ParallelDescriptor::ReduceIntMin(burn_success);
 
@@ -516,11 +527,13 @@ Castro::react_state(Real time, Real dt)
 
     int burn_success = 1;
 
-    ReduceOps<ReduceOpSum> reduce_op;
-    ReduceData<Real> reduce_data(reduce_op);
-
-    using ReduceTuple = typename decltype(reduce_data)::Type;
+#if defined(AMREX_USE_GPU)
+    Gpu::Buffer<int> d_num_failed({0});
+    auto* p_num_failed = d_num_failed.data();
+#endif
+    int num_failed = 0;
 
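+    // TODO: unlike the other react_state overload, this MFIter loop has no OpenMP pragma; confirm whether that is intentional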
     for (MFIter mfi(S_new, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
         const Box& bx = mfi.growntilebox(ng);
@@ -542,8 +555,11 @@ Castro::react_state(Real time, Real dt)
         const auto dx = geom.CellSizeArray();
         const auto problo = geom.ProbLoArray();
 
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) -> ReduceTuple
+#if defined(AMREX_USE_GPU)
+        ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
+#else
+        LoopOnCpu(bx, [&] (int i, int j, int k) mutable
+#endif
         {
             burn_t burn_state;
 
@@ -563,7 +579,7 @@ Castro::react_state(Real time, Real dt)
 
             bool do_burn = true;
             burn_state.success = true;
-            Real burn_failed = 0.0_rt;
+            int burn_failed = 0;
 
             // Don't burn on zones inside shock regions, if the
             // relevant option is set.
@@ -687,7 +703,7 @@ Castro::react_state(Real time, Real dt)
                 // If we were unsuccessful, update the failure count.
 
                 if (!burn_state.success) {
-                    burn_failed = 1.0_rt;
+                    burn_failed = 1;
                 }
 
                 // update the state data.
@@ -780,16 +796,25 @@ Castro::react_state(Real time, Real dt)
                 }
             }
 
-            return {burn_failed};
+#if defined(AMREX_USE_GPU)
+            if (burn_failed) {
+                Gpu::Atomic::Add(p_num_failed, burn_failed);
+            }
+#else
+            num_failed += burn_failed;
+#endif
         });
+
+#if defined(AMREX_USE_HIP)
+        Gpu::streamSynchronize(); // otherwise HIP may fail to allocate the necessary resources.
+#endif
     }
 
-    ReduceTuple hv = reduce_data.value();
-    Real burn_failed = amrex::get<0>(hv);
+#if defined(AMREX_USE_GPU)
+    num_failed = *(d_num_failed.copyToHost());
+#endif
 
-    if (burn_failed != 0.0) {
-        burn_success = 0;
-    }
+    burn_success = !num_failed;
 
     ParallelDescriptor::ReduceIntMin(burn_success);