diff --git a/components/eamxx/cime_config/namelist_defaults_eamxx.xml b/components/eamxx/cime_config/namelist_defaults_eamxx.xml
index 951723c57a5d..ee6d26847434 100644
--- a/components/eamxx/cime_config/namelist_defaults_eamxx.xml
+++ b/components/eamxx/cime_config/namelist_defaults_eamxx.xml
@@ -895,8 +895,11 @@ be lost if SCREAM_HACK_XML is not enabled.
     <!-- SL transport settings. SL defaults to on for pg2 configs. -->
     <dt_tracer_factor constraints="ge 1" hgrid=".*pg2">6</dt_tracer_factor>
     <hypervis_subcycle_q hgrid=".*pg2">6</hypervis_subcycle_q>
-    <transport_alg hgrid=".*pg2">12</transport_alg>
-    <semi_lagrange_trajectory_nsubstep>0</semi_lagrange_trajectory_nsubstep>
+    <transport_alg hgrid=".*pg2" doc="0 (Eulerian) or 12 (semi-Lagrangian) tracer transport">12</transport_alg>
+    <semi_lagrange_trajectory_nsubstep doc="Controls trajectory method. 0: original method. 1: New method. 2 or more: New method with this many trajectory substeps per tracer time step.">0</semi_lagrange_trajectory_nsubstep>
+    <semi_lagrange_trajectory_nvelocity doc="Number of velocity slices to use in new method. 2 or less maps to 2">-1</semi_lagrange_trajectory_nvelocity>
+    <semi_lagrange_halo doc="Max number of element halos available in communication. -1 triggers an automatic estimate.">-1</semi_lagrange_halo>
+    <semi_lagrange_diagnostics>0</semi_lagrange_diagnostics>
     <!-- Other settings that we'll trigger based on pg2 for convenience -->
     <se_ftype valid_values="0,2" hgrid=".*pg2">2</se_ftype>
     <mesh_file type="file">none</mesh_file>
diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/sl_nsubstep2/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/sl_nsubstep2/shell_commands
index 71cdd9b691ce..82ec9297caa9 100644
--- a/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/sl_nsubstep2/shell_commands
+++ b/components/eamxx/cime_config/testdefs/testmods_dirs/eamxx/sl_nsubstep2/shell_commands
@@ -1,3 +1,4 @@
 ATMCHANGE=$CIMEROOT/../components/eamxx/scripts/atmchange
 
 $ATMCHANGE semi_lagrange_trajectory_nsubstep=2 -b
+$ATMCHANGE semi_lagrange_trajectory_nvelocity=3 -b
diff --git a/components/homme/cmake/HommeMacros.cmake b/components/homme/cmake/HommeMacros.cmake
index 5ead59d677cf..116444f342f0 100644
--- a/components/homme/cmake/HommeMacros.cmake
+++ b/components/homme/cmake/HommeMacros.cmake
@@ -829,6 +829,24 @@ MACRO(CREATE_CXX_VS_F90_TESTS_WITH_PROFILE TESTS_LIST testProfile)
   ENDFOREACH ()
 ENDMACRO(CREATE_CXX_VS_F90_TESTS_WITH_PROFILE)
 
+function(check_transport_error_norms
+    TEST_NAME TCEN_ERROR_ANCHOR TCEN_FILENAME TCEN_UPPER_BOUNDS)
+  # Check prescribed-wind tracer transport test error-norm output, which is
+  # available with some of these tests. This function encapsulates the setup
+  # steps to add a check after a tracer transport test runs. See
+  # TransportCheckErrorNorms.cmake.in for details.
+  configure_file(
+    ${HOMME_SOURCE_DIR}/cmake/TransportCheckErrorNorms.cmake.in
+    ${HOMME_BINARY_DIR}/tests/${TEST_NAME}/check.cmake
+    @ONLY)
+  add_test(
+    NAME "${TEST_NAME}_l2err"
+    COMMAND ${CMAKE_COMMAND} -P check.cmake
+    WORKING_DIRECTORY ${HOMME_BINARY_DIR}/tests/${TEST_NAME})
+  set_tests_properties(
+    "${TEST_NAME}_l2err" PROPERTIES DEPENDS "${TEST_NAME}")
+endfunction()
+
 macro(testQuadPrec HOMME_QUAD_PREC)
 
   TRY_COMPILE(COMPILE_RESULT_VAR
diff --git a/components/homme/cmake/TransportCheckErrorNorms.cmake.in b/components/homme/cmake/TransportCheckErrorNorms.cmake.in
new file mode 100644
index 000000000000..0b61ee25b8be
--- /dev/null
+++ b/components/homme/cmake/TransportCheckErrorNorms.cmake.in
@@ -0,0 +1,39 @@
+# This utility checks output of the form
+#   planar_conv>                          l2                    linf
+#   planar_conv> Q 1  3.7612654153016604E-03  9.8620728873592828E-03
+#   planar_conv> Q 2  3.9526251584022527E-03  8.2639849437505641E-03
+# against caller-provided upper bounds on the l2 norms. Output of this sort
+# appears on stdout at the end of some tracer transport tests. 'planar_conv' can
+# be something else; it is caller-configurable. Example usage:
+#   set(TCEN_ERROR_ANCHOR "planar_conv")
+#   set(TCEN_FILENAME "out.txt")
+#   set(TCEN_UPPER_BOUNDS "3.9e-3;4.1e-3")
+#   configure_file(TransportCheckErrorNorms.cmake.in check.cmake @ONLY)
+#   add_test(
+#     NAME "${TEST_NAME}_l2err"
+#     COMMAND ${CMAKE_COMMAND} -P check.cmake
+#     WORKING_DIRECTORY ${HOMME_BINARY_DIR}/tests/${TEST_NAME})
+#   set_tests_properties(
+#     "${TEST_NAME}_l2err" PROPERTIES DEPENDS "${TEST_NAME}")
+# Function check_transport_error_norms in HommeMacros.cmake encapsulates the
+# above commands.
+
+function(check_error_norms test_name filename upper_bounds)
+  execute_process(COMMAND grep "${test_name}> Q" ${filename} OUTPUT_VARIABLE grep_stdout)
+  separate_arguments(tokens NATIVE_COMMAND "${grep_stdout}")
+  list(LENGTH upper_bounds nchk)
+  math(EXPR nchk "${nchk} - 1")
+  foreach(idx RANGE 0 ${nchk})
+    math(EXPR tidx "3 + 5 * ${idx}")
+    list(GET tokens ${tidx} e)
+    list(GET upper_bounds ${idx} ub)
+    if(e GREATER ub)
+      message(FATAL_ERROR "${e} should be <= ${ub} but is not")
+    endif()
+  endforeach()
+endfunction()
+
+check_error_norms(
+  "@TCEN_ERROR_ANCHOR@"
+  "@TCEN_FILENAME@"
+  "@TCEN_UPPER_BOUNDS@")
diff --git a/components/homme/cmake/machineFiles/aurora-aot.cmake b/components/homme/cmake/machineFiles/aurora-aot.cmake
index 2a9499ab3edc..28cced899c4d 100644
--- a/components/homme/cmake/machineFiles/aurora-aot.cmake
+++ b/components/homme/cmake/machineFiles/aurora-aot.cmake
@@ -44,7 +44,8 @@ SET(CMAKE_Fortran_COMPILER "mpifort" CACHE STRING "")
 SET(CMAKE_CXX_COMPILER "mpicxx" CACHE STRING "")
 
 #AOT flags
-SET(SYCL_COMPILE_FLAGS "-std=c++17 -fsycl -fsycl-device-code-split=per_kernel -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda -Xclang -fsycl-allow-virtual-functions")
+#SET(SYCL_COMPILE_FLAGS "-std=c++17 -fsycl -fsycl-device-code-split=per_kernel -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda -Xclang -fsycl-allow-virtual-functions")
+SET(SYCL_COMPILE_FLAGS "-std=c++17 -fsycl -fsycl-device-code-split=per_kernel -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda")
 SET(SYCL_LINK_FLAGS "-Wl,--no-relax -flink-huge-device-code -fsycl-max-parallel-link-jobs=32 -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=intel_gpu_pvc")
 
 SET(ADD_Fortran_FLAGS "-fc=ifx -fpscomp logicals -O3 -DNDEBUG -DCPRINTEL -g" CACHE STRING "")
diff --git a/components/homme/cmake/machineFiles/chrysalis.cmake b/components/homme/cmake/machineFiles/chrysalis.cmake
index 68ff76ec8082..13e125e40035 100644
--- a/components/homme/cmake/machineFiles/chrysalis.cmake
+++ b/components/homme/cmake/machineFiles/chrysalis.cmake
@@ -54,6 +54,7 @@ IF (${IFORT_RESULT} EQUAL 0)
   SET (ADD_Fortran_FLAGS "-traceback" CACHE STRING "")
   SET (ADD_C_FLAGS       "-traceback" CACHE STRING "")
   SET (ADD_CXX_FLAGS     "-traceback" CACHE STRING "")
+  SET (BUILD_HOMME_THETA_KOKKOS TRUE CACHE BOOL "")
 ELSE()
   SET (MKLROOT $ENV{MKLROOT} CACHE FILEPATH "")
   SET (HOMME_FIND_BLASLAPACK TRUE CACHE BOOL "")
diff --git a/components/homme/src/preqx_kokkos/CMakeLists.txt b/components/homme/src/preqx_kokkos/CMakeLists.txt
index 94af555ae8f5..ffaf14e38604 100644
--- a/components/homme/src/preqx_kokkos/CMakeLists.txt
+++ b/components/homme/src/preqx_kokkos/CMakeLists.txt
@@ -97,6 +97,7 @@ MACRO(PREQX_KOKKOS_SETUP)
     ${SRC_SHARE_DIR}/cxx/prim_cxx_driver_base.F90
     ${SRC_SHARE_DIR}/cxx/utilities/bfb_mod.F90
     ${SRC_SHARE_DIR}/planar_mod.F90
+    ${TEST_SRC_DIR}/planar_transport.F90
     ${SRC_SHARE_DIR}/geometry_mod.F90
     ${SRC_SHARE_DIR}/planar_mesh_mod.F90
   )
diff --git a/components/homme/src/share/compose/compose_hommexx.cpp b/components/homme/src/share/compose/compose_hommexx.cpp
index 0b88e6024a1d..a0f76c6230e6 100644
--- a/components/homme/src/share/compose/compose_hommexx.cpp
+++ b/components/homme/src/share/compose/compose_hommexx.cpp
@@ -24,8 +24,8 @@ using View = typename TracerArrays<ko::MachineTraits>::View<DataType>;
 void set_views (const SetView<HommexxReal***>& spheremp,
                 const SetView<HommexxReal****>& dp, const SetView5& dp3d,
                 const SetView<HommexxReal******>& qdp, const SetView5& q,
-                const SetView5& dep_points, const SetView5& vnode,
-                const SetView5& vdep, const Int ndim) {
+                const SetView5& dep_points, const SetView5& vnode, const Int ndim,
+                const SetView5& vdep, const Int vdep_ndim) {
   static_assert(std::is_same<Real, HommexxReal>::value,
                 "Hommexx and Compose real types must be the same.");
 #ifdef COMPOSE_PORT
@@ -42,23 +42,22 @@ void set_views (const SetView<HommexxReal***>& spheremp,
   if (vnode.data())
     ta.vnode = View<Real****>(vnode.data(), nel, vnode.extent_int(1), np2, ndim);
   if (vdep.data())
-    ta.vdep  = View<Real****>(vdep.data(),  nel, vdep .extent_int(1), np2, ndim);
+    ta.vdep  = View<Real****>(vdep.data(),  nel, vdep .extent_int(1), np2, vdep_ndim);
 #else
   slmm_throw_if(true, "Running a Hommexx code path with the non-Hommexx build"
                 " is not supported.\n");
 #endif
 }
 
-void set_hvcoord (const HommexxReal etai_beg, const HommexxReal etai_end,
-                  const HommexxReal* etam) {
+void set_hvcoord (const HommexxReal* etai, const HommexxReal* etam) {
   auto& cm = *get_isl_mpi_singleton();
-  islmpi::set_hvcoord(cm, etai_beg, etai_end, etam);
+  islmpi::set_hvcoord(cm, etai, etam);
 }
 
-void calc_v_departure (const int step, const HommexxReal dtsub) {
+void interp_v_update (const int step, const HommexxReal dtsub) {
   auto& cm = *get_isl_mpi_singleton();
-  islmpi::calc_v_departure<>(cm, 0, cm.nelemd - 1, step, dtsub,
-                             nullptr, nullptr, nullptr);
+  islmpi::interp_v_update<>(cm, 0, cm.nelemd - 1, step, dtsub,
+                            nullptr, nullptr, nullptr);
 }
 
 void advect (const int np1, const int n0_qdp, const int np1_qdp) {
diff --git a/components/homme/src/share/compose/compose_hommexx.hpp b/components/homme/src/share/compose/compose_hommexx.hpp
index b2e339055404..ead916049783 100644
--- a/components/homme/src/share/compose/compose_hommexx.hpp
+++ b/components/homme/src/share/compose/compose_hommexx.hpp
@@ -16,12 +16,11 @@ typedef SetView<HommexxReal*****> SetView5;
 void set_views(const SetView<HommexxReal***>& spheremp,
                const SetView<HommexxReal****>& dp, const SetView5& dp3d,
                const SetView<HommexxReal******>& qdp, const SetView5& q,
-               const SetView5& dep_points, const SetView5& vnode,
-               const SetView5& vdep, const int trajectory_ndim);
+               const SetView5& dep_points, const SetView5& vnode, const int ndim,
+               const SetView5& vdep, const int vdep_ndim);
 
-void set_hvcoord(const HommexxReal etai_beg, const HommexxReal etai_end,
-                 const HommexxReal* etam);
-void calc_v_departure(const int step, const HommexxReal dtsub);
+void set_hvcoord(const HommexxReal* etai, const HommexxReal* etam);
+void interp_v_update(const int step, const HommexxReal dtsub);
 
 void advect(const int np1, const int n0_qdp, const int np1_qdp);
 
diff --git a/components/homme/src/share/compose/compose_slmm.cpp b/components/homme/src/share/compose/compose_slmm.cpp
index 4528a447b653..fb8b40aae95d 100644
--- a/components/homme/src/share/compose/compose_slmm.cpp
+++ b/components/homme/src/share/compose/compose_slmm.cpp
@@ -124,8 +124,7 @@ void finalize_init_phase (IslMpi<MT>& cm, typename IslMpi<MT>::Advecter& advecte
 }
 
 template <typename MT>
-void set_hvcoord (IslMpi<MT>& cm, const Real etai_beg, const Real etai_end,
-                  const Real* etam) {
+void set_hvcoord (IslMpi<MT>& cm, const Real* etai, const Real* etam) {
   if (cm.etam.size() > 0) return;
 #if defined COMPOSE_HORIZ_OPENMP
 # pragma omp barrier
@@ -133,16 +132,24 @@ void set_hvcoord (IslMpi<MT>& cm, const Real etai_beg, const Real etai_end,
 #endif
   {
     slmm_assert(cm.nlev > 0);
-    cm.etai_beg = etai_beg;
-    cm.etai_end = etai_end;
+    cm.etai_beg = etai[0];
+    cm.etai_end = etai[cm.nlev];
+    cm.etai = typename IslMpi<MT>::template ArrayD<Real*>("etai", cm.nlev+1);
     cm.etam = typename IslMpi<MT>::template ArrayD<Real*>("etam", cm.nlev);
-    const auto h = ko::create_mirror_view(cm.etam);
+    const auto hi = ko::create_mirror_view(cm.etai);
+    const auto hm = ko::create_mirror_view(cm.etam);
+    for (int k = 0; k <= cm.nlev; ++k) {
+      hi(k) = etai[k];
+      slmm_assert(k == 0 or hi(k) > hi(k-1));
+      slmm_assert(hi(k) >= 0 && hi(k) <= 1);
+    }
     for (int k = 0; k < cm.nlev; ++k) {
-      h(k) = etam[k];
-      slmm_assert(k == 0 || h(k) > h(k-1));
-      slmm_assert(h(k) > 0 && h(k) < 1);
+      hm(k) = etam[k];
+      slmm_assert(k == 0 or hm(k) > hm(k-1));
+      slmm_assert(hm(k) > 0 && hm(k) < 1);
     }
-    ko::deep_copy(cm.etam, h);
+    ko::deep_copy(cm.etai, hi);
+    ko::deep_copy(cm.etam, hm);
   }
 #if defined COMPOSE_HORIZ_OPENMP
 # pragma omp barrier
@@ -150,15 +157,14 @@ void set_hvcoord (IslMpi<MT>& cm, const Real etai_beg, const Real etai_end,
 }
 
 template void set_hvcoord(
-  IslMpi<ko::MachineTraits>& cm, const Real etai_beg, const Real etai_end,
-  const Real* etam);
+  IslMpi<ko::MachineTraits>& cm, const Real* etai, const Real* etam);
 
 // Set pointers to HOMME data arrays.
 template <typename MT>
 void set_elem_data (IslMpi<MT>& cm, const Int ie, Real* qdp, const Int n0_qdp,
                     const Real* dp, Real* q, const Int nelem_in_patch) {
   slmm_assert(ie < cm.ed_h.size());
-  slmm_assert(cm.halo > 1 || cm.ed_h(ie).nbrs.size() == nelem_in_patch);
+  slmm_assert(cm.halo > 1 or cm.ed_h(ie).nbrs.size() == nelem_in_patch);
   auto& e = cm.ed_h(ie);
 #if defined COMPOSE_PORT
   cm.tracer_arrays->pqdp.set_ie_ptr(ie, qdp);
@@ -395,15 +401,14 @@ void slmm_check_ref2sphere (homme::Int ie, homme::Cartesian3D* p) {
   amb::dev_fin_threads();
 }
 
-void slmm_set_hvcoord (const homme::Real etai_beg, const homme::Real etai_end,
-                       const homme::Real* etam) {
+void slmm_set_hvcoord (const homme::Real* etai, const homme::Real* etam) {
   amb::dev_init_threads();
   slmm_assert(homme::g_csl_mpi);
-  homme::islmpi::set_hvcoord(*homme::g_csl_mpi, etai_beg, etai_end, etam);
+  homme::islmpi::set_hvcoord(*homme::g_csl_mpi, etai, etam);
   amb::dev_fin_threads();
 }
 
-void slmm_calc_v_departure (
+void slmm_interp_v_update (
   homme::Int nets, homme::Int nete, homme::Int step, homme::Real dtsub,
   homme::Real* dep_points, homme::Int dep_points_ndim, homme::Real* vnode,
   homme::Real* vdep, homme::Int* info)
@@ -419,8 +424,8 @@ void slmm_calc_v_departure (
     homme::sl_traj_h2d(*cm.tracer_arrays, dep_points, vnode, vdep,
                        cm.dep_points_ndim);
   }
-  homme::islmpi::calc_v_departure(cm, nets - 1, nete - 1, step - 1,
-                                  dtsub, dep_points, vnode, vdep);
+  homme::islmpi::interp_v_update(cm, nets - 1, nete - 1, step - 1,
+                                 dtsub, dep_points, vnode, vdep);
   *info = 0;
   {
     slmm::Timer timer("d2h");
diff --git a/components/homme/src/share/compose/compose_slmm_islmpi.hpp b/components/homme/src/share/compose/compose_slmm_islmpi.hpp
index f8814caf66f8..6d4f2baa007d 100644
--- a/components/homme/src/share/compose/compose_slmm_islmpi.hpp
+++ b/components/homme/src/share/compose/compose_slmm_islmpi.hpp
@@ -579,10 +579,10 @@ struct IslMpi {
   const typename Advecter::ConstPtr advecter;
   const Int np, np2, nlev, qsize, qsized, nelemd, halo;
   const bool traj_3d;
-  const Int traj_nsubstep, dep_points_ndim;
+  const Int traj_nsubstep, dep_points_ndim, traj_msg_sz;
 
   Real etai_beg, etai_end;
-  ArrayD<Real*> etam;
+  ArrayD<Real*> etai, etam;
 
   ElemDataListH ed_h; // this rank's owned cells, indexed by LID
   ElemDataListD ed_d;
@@ -637,8 +637,10 @@ struct IslMpi {
           Int itraj_3d, Int itraj_nsubstep)
     : p(ip), advecter(advecter),
       np(inp), np2(np*np), nlev(inlev), qsize(iqsize), qsized(iqsized), nelemd(inelemd),
-      halo(ihalo), traj_3d(itraj_3d), traj_nsubstep(itraj_nsubstep),
-      dep_points_ndim(traj_3d && traj_nsubstep > 0 ? 4 : 3),
+      halo(ihalo), traj_3d(itraj_3d),
+      traj_nsubstep(itraj_nsubstep),
+      dep_points_ndim(traj_3d and traj_nsubstep > 0 ? 4 : 3),
+      traj_msg_sz(traj_3d ? 5 : dep_points_ndim),
       tracer_arrays(itracer_arrays)
   {}
 
@@ -768,11 +770,10 @@ void step(
   Real* q_min_r, Real* q_max_r);
 
 template <typename MT = ko::MachineTraits>
-void set_hvcoord(IslMpi<MT>& cm, const Real etai_beg, const Real etai_end,
-                 const Real* etam);
+void set_hvcoord(IslMpi<MT>& cm, const Real* etai, const Real* etam);
 
 template <typename MT = ko::MachineTraits>
-void calc_v_departure(
+void interp_v_update(
   IslMpi<MT>& cm, const Int nets, const Int nete, const Int step, const Real dtsub,
   Real* dep_points_r, const Real* vnode, Real* vdep);
 
diff --git a/components/homme/src/share/compose/compose_slmm_islmpi_calc_trajectory.cpp b/components/homme/src/share/compose/compose_slmm_islmpi_calc_trajectory.cpp
index e2bcf311d871..6d5705d8f7b8 100644
--- a/components/homme/src/share/compose/compose_slmm_islmpi_calc_trajectory.cpp
+++ b/components/homme/src/share/compose/compose_slmm_islmpi_calc_trajectory.cpp
@@ -1,3 +1,86 @@
+/* This file contains the driver routine interp_v_update. It interpolates
+   reference-grid velocities to off-grid arrival points. This situation occurs
+   when the enhanced trajectory method is run with nsubstep >= 2. In the first
+   substep, the arrival points are on-grid; thus, interp_v_update is not
+   needed. Subsequent substeps start with off-grid arrival points. These are the
+   departure points from the previous substep.
+     interp_v_update follows the same computational and communication patterns
+   as the main 'step' routine (compose_slmm_islmpi_step.cpp). Both
+   interp_v_update and 'step' are essentially computing interpolants. In the
+   case of 'step', the mixing ratios q are interpolated; in the case of
+   interp_v_update, (xdot, ydot, zdot, etadot) are interpolated, where (xdot,
+   ydot, zdot) is the horizontal velocity on the sphere and etadot is the
+   vertical velocity. In this sense, interp_v_update can be understood as a
+   specialization of 'step' to this case.
+     However, there is one bit of complexity in interp_v_update that requires
+   further explanation.
+     Let the horizontal position (velocity) be abbreviated as h(dot) := (x(dot),
+   y(dot), z(dot)), where h(dot) means the statement applies to both h and
+   hdot. h(dot) lives on vertical midpoints. In contrast, the eta(dot) part of
+   the trajectory lives on vertical interfaces. A previous version of this
+   method interpolated etadot to midpoints, then interpolated the vertical part
+   of departure points from midpoints to interfaces. This caused issues in
+   certain chemistry parameterizations around the tropopause. The current
+   version keeps eta and etadot at interfaces, which adds complexity since
+   (h(dot), eta(dot)) are at a mix of midpoints and interfaces. This change
+   removes the dependence of the reconstruction of floating Lagrangian levels on
+   the vertical grid in some (but, unavoidably, not all) terms of the
+   approximation. Simply stated, staying on interfaces as much as possible,
+   where eta(dot) naturally lives, is better than moving to midpoints, where
+   eta(dot) has to be interpolated.
+     The high-level key ideas are as follows. Let _ref denote quantities on the
+   reference grid; _arr (for "arrival point"), quantities on the vertically
+   Lagrangian grid.
+     Interpolate (hdot_ref, etadot_ref) at (h_arr, midpoint(eta_arr)) to get
+   hdot_arr. In this step, eta_arr is interpolated to midpoints. But this step
+   does not compute etadot_arr.
+     Next, we need to interpolate for etadot_arr. The simplest procedure would
+   be to compute extra full trajectories at interfaces. But this would be
+   expensive. Instead, we reuse the midpoint trajectories as follows.
+     At each interface k, interpolate for etadot_k_ref at (h_(k-1)_arr,
+   eta_k_arr) and (h_k_arr, eta_k_arr). Then combine these two values to get the
+   final etadot_k_arr. The key here is that only interface etadot_ref values are
+   used in computing these steps; a derived midpoint etadot_arr is never used.
+     This calculation is a bit subtle, but it's set up such that it adds very
+   little cost to the much simpler calculation that uses eta(dot) at
+   midpoints. It adds two additional entries to the communicated arrays and a
+   small amount of extra computation.
+     Again, the conceptually simplest and most accurate approach would double
+   the number of trajectories to compute. The approach above is one
+   approximation to this. Other approximations are possible, but the one above
+   seems to use the least computation and communication among the available
+   approximations.
+     The following notes provide precise implementation details about this
+   calculation, outlining the code in the rest of this file.
+
+     Level arrangement. x is horizontal position. A suffixed 'd' means 'dot'.
+          0 i etai(0), etaid(0) = 0
+          0 m x or xdot (= xd)
+          1 i e or ed
+          1 m x(d)
+          2 i e(d)
+          2 m x(d)
+       nlev i
+     Algorithm:
+       x[k] is the horizontal position at midpoint k
+       analyze_dep_points for x[k]
+       send/recv p[k] = (x[k], e[k], e[k+1])
+         where e is at interfaces
+       compute (rx,ry) using x[k]
+       interp3d for xd[k]   at (x[k], (e[k] + e[k+1])/2)
+         using eta_ref_mid
+       interp3d for ed[k]   at (x[k], e[k])
+         using eta_ref_int if k > 0, else 0
+       interp3d for ed[k+1] at (x[k], e[k+1])
+         using eta_ref_int if k+1 < nlev, else 0
+       send/recv pd[k] = (xd[k], ed[k], ed[k+1])
+       vdep[k,0:3] = pd[k].xd[k]
+       vdep[k,3] = linterp(eta_ref_mid[k-1:k+1],
+                           (pd[k-1].ed[k], pd[k].ed[k]),
+                           eta_ref_int[k])
+         if k > 0, else 0
+ */
+
 #include "compose_slmm_islmpi.hpp"
 #include "compose_slmm_islmpi_interpolate.hpp"
 #include "compose_slmm_islmpi_buf.hpp"
@@ -7,40 +90,81 @@ namespace islmpi {
 
 template <typename T> using CA4 = ko::View<T****, ko::LayoutRight, ko::HostSpace>;
 
-template <Int np, typename EtamT, typename VnodeT> SLMM_KF void
-interpolate_vertical (const Int nlev, const Real etai_beg, const Real etai_end,
-                      const EtamT& etam, const VnodeT& vnode,
-                      const Int src_lid, const Int lev, const Real eta_dep,
-                      const Real rx[np], const Real ry[np], Real* const v_tgt) {
-  slmm_kernel_assert(eta_dep > etai_beg && eta_dep < etai_end);
+// Interpolate at eta interfaces, rather than midpoints as below.
+template <Int np, typename EtaT, typename VnodeT> SLMM_KF
+Real interpolate_at_vertical_interfaces (
+  const Int nlev, const EtaT& etai, const VnodeT& vnode, const Int src_lid,
+  const Int lev, const Real rx[np], const Real ry[np], Real etai_dep)
+{
+  if (etai_dep < etai[0   ]) etai_dep = etai[0   ];
+  if (etai_dep > etai[nlev]) etai_dep = etai[nlev];
+
+  // Search for the eta interface values that support etai_dep.
+  Int lev_dep = lev;
+  if (etai_dep != etai(lev)) {
+    if (etai_dep < etai(lev)) {
+      for (lev_dep = lev-1; lev_dep >= 0; --lev_dep)
+        if (etai_dep >= etai(lev_dep))
+          break;
+    } else {
+      for (lev_dep = lev; lev_dep < nlev; ++lev_dep)
+        if (etai_dep < etai(lev_dep+1))
+          break;
+    }
+  }
+  slmm_kernel_assert(lev_dep >= 0 and lev_dep <= nlev);
+  if (lev_dep == nlev) return 0; // etai_dep == etai_end => eta_dot = 0
+  const Real a = (etai_dep - etai(lev_dep)) / (etai(lev_dep+1) - etai(lev_dep));
+  // Linear interp coefficients.
+  const Real alpha[] = {1-a, a};
+  const int dim = 3;
+  Real etai_dot = 0;
+  for (int i = 0; i < 2; ++i) {
+    if (lev_dep+i == 0 or lev_dep+i == nlev)
+      continue; // vel_nodes[:] = 0
+    slmm_kernel_assert(lev_dep+i >= 0 and lev_dep+i < nlev);
+    Real vel_nodes[np*np];
+    for (int k = 0; k < np*np; ++k)
+      vel_nodes[k] = vnode(src_lid,lev_dep+i,k,dim);
+    etai_dot += alpha[i]*calc_q_tgt(rx, ry, vel_nodes);
+  }
+  return etai_dot;
+}
+
+template <Int np, typename EtaT, typename VnodeT> SLMM_KF void
+interpolate_vertical (const Int nlev, const EtaT& etai, const EtaT& etam,
+                      const VnodeT& vnode, const Int src_lid, const Int lev,
+                      const Real etai_lev, const Real etai_levp1,
+                      const Real rx[np], const Real ry[np],
+                      Real* const v_tgt) {
+  Real eta_mid_dep = (etai_lev + etai_levp1)/2;
+  if (eta_mid_dep < etai[0   ]) eta_mid_dep = etai[0   ];
+  if (eta_mid_dep > etai[nlev]) eta_mid_dep = etai[nlev];
   
   // Search for the eta midpoint values that support the departure point's eta
   // value.
   Int lev_dep = lev;
-  if (eta_dep != etam(lev)) {
-    if (eta_dep < etam(lev)) {
+  if (eta_mid_dep != etam(lev)) {
+    if (eta_mid_dep < etam(lev)) {
       for (lev_dep = lev-1; lev_dep >= 0; --lev_dep)
-        if (eta_dep >= etam(lev_dep))
+        if (eta_mid_dep >= etam(lev_dep))
           break;
     } else {
       for (lev_dep = lev; lev_dep < nlev-1; ++lev_dep)
-        if (eta_dep < etam(lev_dep+1))
+        if (eta_mid_dep < etam(lev_dep+1))
           break;
     }
   }
-  slmm_kernel_assert(lev_dep >= -1 && lev_dep < nlev);
-  slmm_kernel_assert(lev_dep == -1 || eta_dep >= etam(lev_dep));
+  slmm_kernel_assert(lev_dep >= -1 and lev_dep < nlev);
+  slmm_kernel_assert(lev_dep == -1 or eta_mid_dep >= etam(lev_dep));
   Real a;
-  bool bdy = false;
   if (lev_dep == -1) {
     lev_dep = 0;
     a = 0;
-    bdy = true;
   } else if (lev_dep == nlev-1) {
     a = 0;
-    bdy = true;
   } else {
-    a = ((eta_dep - etam(lev_dep)) /
+    a = ((eta_mid_dep - etam(lev_dep)) /
          (etam(lev_dep+1) - etam(lev_dep)));
   }
   // Linear interp coefficients.
@@ -50,33 +174,30 @@ interpolate_vertical (const Int nlev, const Real etai_beg, const Real etai_end,
     v_tgt[d] = 0;
   for (int i = 0; i < 2; ++i) {
     if (alpha[i] == 0) continue;
-    for (int d = 0; d < 4; ++d) {
+    const int ndim = 3;
+    for (int d = 0; d < ndim; ++d) {
       Real vel_nodes[np*np];
       for (int k = 0; k < np*np; ++k)
         vel_nodes[k] = vnode(src_lid,lev_dep+i,k,d);
       v_tgt[d] += alpha[i]*calc_q_tgt(rx, ry, vel_nodes);
     }
   }
-  // Treat eta_dot specially since eta_dot goes to 0 at the boundaries.
-  if (bdy) {
-    slmm_kernel_assert(etam(0) > etai_beg);
-    slmm_kernel_assert(etam(nlev-1) < etai_end);
-    if (lev_dep == 0)
-      v_tgt[3] *= (eta_dep - etai_beg)/(etam(0) - etai_beg);
-    else
-      v_tgt[3] *= (etai_end - eta_dep)/(etai_end - etam(nlev-1));
-  }
+
+  v_tgt[3] = interpolate_at_vertical_interfaces<np>(
+    nlev, etai, vnode, src_lid, lev, rx, ry, etai_lev);
+  v_tgt[4] = interpolate_at_vertical_interfaces<np>(
+    nlev, etai, vnode, src_lid, lev, rx, ry, etai_levp1);
 }
 
 template <Int np, typename VnodeT, typename MT>
 void calc_v (const IslMpi<MT>& cm, const VnodeT& vnode,
              const Int src_lid, const Int lev,
-             const Real* const dep_point, Real* const v_tgt) {
+             const Real* const dep, Real* const v_tgt) {
   // Horizontal interpolation.
   Real rx[np], ry[np]; {
     Real ref_coord[2];
     const auto& m = cm.advecter->local_mesh(src_lid);
-    cm.advecter->s2r().calc_sphere_to_ref(src_lid, m, dep_point,
+    cm.advecter->s2r().calc_sphere_to_ref(src_lid, m, dep,
                                           ref_coord[0], ref_coord[1]);
     interpolate<MT>(cm.advecter->alg(), ref_coord, rx, ry);
   }
@@ -93,8 +214,8 @@ void calc_v (const IslMpi<MT>& cm, const VnodeT& vnode,
 
   // Vertical Interpolation.
   slmm_kernel_assert(cm.dep_points_ndim == 4);
-  interpolate_vertical<np>(cm.nlev, cm.etai_beg, cm.etai_end, cm.etam, vnode,
-                           src_lid, lev, dep_point[3], rx, ry, v_tgt);
+  interpolate_vertical<np>(cm.nlev, cm.etai, cm.etam, vnode,
+                           src_lid, lev, dep[3], dep[4], rx, ry, v_tgt);
 }
 
 template <typename MT>
@@ -106,8 +227,7 @@ struct CalcVData {
   const bool traj_3d;
   const int dep_points_ndim;
   const int nlev;
-  const Real etai_beg, etai_end;
-  const typename IslMpi<MT>::template ArrayD<Real*> etam;
+  const typename IslMpi<MT>::template ArrayD<Real*> etai, etam;
 
   CalcVData (const IslMpi<MT>& cm)
     : local_meshes(cm.advecter->local_meshes()),
@@ -116,20 +236,19 @@ struct CalcVData {
       traj_3d(cm.traj_3d),
       dep_points_ndim(cm.dep_points_ndim),
       nlev(cm.nlev),
-      etai_beg(cm.etai_beg), etai_end(cm.etai_end),
-      etam(cm.etam)
+      etai(cm.etai), etam(cm.etam)
   {}
 };
 
 template <Int np, typename VnodeT, typename MT> SLMM_KF
 void calc_v (const CalcVData<MT>& cvd, const VnodeT& vnode,
              const Int src_lid, const Int lev,
-             const Real* const dep_point, Real* const v_tgt) {
+             const Real* const dep, Real* const v_tgt) {
   // Horizontal interpolation.
   Real rx[np], ry[np]; {
     Real ref_coord[2];
     const auto& m = cvd.local_meshes(src_lid);
-    cvd.s2r.calc_sphere_to_ref(src_lid, m, dep_point,
+    cvd.s2r.calc_sphere_to_ref(src_lid, m, dep,
                                ref_coord[0], ref_coord[1]);
     interpolate<MT>(cvd.interp_alg, ref_coord, rx, ry);
   }
@@ -146,14 +265,14 @@ void calc_v (const CalcVData<MT>& cvd, const VnodeT& vnode,
 
   // Vertical Interpolation.
   slmm_kernel_assert(cvd.dep_points_ndim == 4);
-  interpolate_vertical<np>(cvd.nlev, cvd.etai_beg, cvd.etai_end, cvd.etam, vnode,
-                           src_lid, lev, dep_point[3], rx, ry, v_tgt);
+  interpolate_vertical<np>(cvd.nlev, cvd.etai, cvd.etam, vnode,
+                           src_lid, lev, dep[3], dep[4], rx, ry, v_tgt);
 }
 
 template <int np, typename VnodeT, typename MT>
 void traj_calc_rmt_next_step (IslMpi<MT>& cm, const VnodeT& vnode) {
   calc_rmt_q_pass1(cm, true);
-  const auto ndim = cm.dep_points_ndim;
+  const auto xsz = cm.traj_msg_sz;
   const auto& rmt_xs = cm.rmt_xs;
   const auto& sendbuf = cm.sendbuf;
   const auto& recvbuf = cm.recvbuf;
@@ -170,7 +289,7 @@ void traj_calc_rmt_next_step (IslMpi<MT>& cm, const VnodeT& vnode) {
   {
     const Int
       ri = rmt_xs(5*it), lid = rmt_xs(5*it + 1), lev = rmt_xs(5*it + 2),
-      xos = rmt_xs(5*it + 3), vos = ndim*rmt_xs(5*it + 4);
+      xos = rmt_xs(5*it + 3), vos = xsz*rmt_xs(5*it + 4);
     const auto&& xs = recvbuf(ri);
     auto&& v = sendbuf(ri);
     calc_v<np>(cvd, vnode, lid, lev, &xs(xos), &v(vos));
@@ -183,7 +302,9 @@ void traj_calc_rmt_next_step (IslMpi<MT>& cm, const VnodeT& vnode) {
 template <int np, typename VnodeT, typename VdepT, typename MT>
 void traj_calc_own_next_step (IslMpi<MT>& cm, const DepPoints<MT>& dep_points,
                               const VnodeT& vnode, const VdepT& vdep) {
+  const auto xsz = cm.traj_msg_sz;
   const auto ndim = cm.dep_points_ndim;
+  const auto etai_end = cm.etai_end;
 #ifdef COMPOSE_PORT
   const auto& ed_d = cm.ed_d;
   const auto& own_dep_list = cm.own_dep_list;
@@ -194,9 +315,14 @@ void traj_calc_own_next_step (IslMpi<MT>& cm, const DepPoints<MT>& dep_points,
     const Int tgt_k = own_dep_list(it,2);
     const auto& ed = ed_d(tci);
     const Int slid = ed.nbrs(ed.src(tgt_lev, tgt_k)).lid_on_rank;
-    Real v_tgt[4];
-    calc_v<np>(cvd, vnode, slid, tgt_lev, &dep_points(tci,tgt_lev,tgt_k,0), v_tgt);
-    for (int d = 0; d < ndim; ++d)
+    Real dep[5], v_tgt[5];
+    for (Int d = 0; d < ndim; ++d)
+      dep[d] = dep_points(tci,tgt_lev,tgt_k,d);
+    dep[ndim] = (tgt_lev+1 == cvd.nlev ?
+                 etai_end :
+                 dep_points(tci,tgt_lev+1,tgt_k,ndim-1));
+    calc_v<np>(cvd, vnode, slid, tgt_lev, dep, v_tgt);
+    for (int d = 0; d < xsz; ++d)
       vdep(tci,tgt_lev,tgt_k,d) = v_tgt[d];
   };
   ko::parallel_for(
@@ -212,9 +338,14 @@ void traj_calc_own_next_step (IslMpi<MT>& cm, const DepPoints<MT>& dep_points,
     for (Int idx = 0; idx < ned; ++idx) {
       const auto& e = ed.own(idx);
       const Int slid = ed.nbrs(ed.src(e.lev, e.k)).lid_on_rank;
-      Real v_tgt[4];
-      calc_v<np>(cm, vnode, slid, e.lev, &dep_points(tci,e.lev,e.k,0), v_tgt);
-      for (int d = 0; d < ndim; ++d)
+      Real dep[5], v_tgt[5];
+      for (Int d = 0; d < ndim; ++d)
+        dep[d] = dep_points(tci,e.lev,e.k,d);
+      dep[ndim] = (e.lev+1 == cm.nlev ?
+                   etai_end :
+                   dep_points(tci,e.lev+1,e.k,ndim-1));
+      calc_v<np>(cm, vnode, slid, e.lev, dep, v_tgt);
+      for (int d = 0; d < xsz; ++d)
         vdep(tci,e.lev,e.k,d) = v_tgt[d];
     }
   }
@@ -226,7 +357,7 @@ void traj_copy_next_step (IslMpi<MT>& cm, const VdepT& vdep) {
 #ifndef NDEBUG
   const auto myrank = cm.p->rank();
 #endif
-  const auto ndim = cm.dep_points_ndim;
+  const auto xsz = cm.traj_msg_sz;
 #ifdef COMPOSE_PORT
   const auto& mylid_with_comm = cm.mylid_with_comm_d;
   const auto& ed_d = cm.ed_d;
@@ -242,7 +373,7 @@ void traj_copy_next_step (IslMpi<MT>& cm, const VdepT& vdep) {
     slmm_kernel_assert(ed.nbrs(ed.src(e.lev, e.k)).rank != myrank);
     const Int ri = ed.nbrs(ed.src(e.lev, e.k)).rank_idx;
     const auto&& recvbuf = recvbufs(ri);
-    for (int d = 0; d < ndim; ++d)
+    for (int d = 0; d < xsz; ++d)
       vdep(tci,e.lev,e.k,d) = recvbuf(e.q_ptr + d);
   };
   ko::parallel_for(ko::RangePolicy<typename MT::DES>(0, nlid*np2*nlev), f);
@@ -257,7 +388,7 @@ void traj_copy_next_step (IslMpi<MT>& cm, const VdepT& vdep) {
       slmm_assert(ed.nbrs(ed.src(e.lev, e.k)).rank != myrank);
       const Int ri = ed.nbrs(ed.src(e.lev, e.k)).rank_idx;
       const auto&& recvbuf = cm.recvbuf(ri);
-      for (int d = 0; d < ndim; ++d)
+      for (int d = 0; d < xsz; ++d)
         vdep(tci,e.lev,e.k,d) = recvbuf(e.q_ptr + d);
     }
   }
@@ -270,9 +401,9 @@ void traj_copy_next_step (IslMpi<MT>& cm, const VdepT& vdep) {
 // 3D Cartesian representation of the horizontal velocity; dim = 3 is for
 // eta_dot.
 template <typename MT> void
-calc_v_departure (IslMpi<MT>& cm, const Int nets, const Int nete,
-                  const Int step, const Real dtsub,
-                  Real* dep_points_r, const Real* vnode_r, Real* vdep_r)
+interp_v_update (IslMpi<MT>& cm, const Int nets, const Int nete,
+                 const Int step, const Real dtsub,
+                 Real* dep_points_r, const Real* vnode_r, Real* vdep_r)
 {
   const int np = 4;
 
@@ -280,24 +411,25 @@ calc_v_departure (IslMpi<MT>& cm, const Int nets, const Int nete,
   slmm_assert((cm.traj_3d and cm.dep_points_ndim == 4) or
               (not cm.traj_3d and cm.dep_points_ndim == 3));
 #ifdef COMPOSE_PORT
-  slmm_assert(nets == 0 && nete+1 == cm.nelemd);
+  slmm_assert(nets == 0 and nete+1 == cm.nelemd);
 #endif
 
   // If step = 0, the departure points are at the nodes and no interpolation is
-  // needed. calc_v_departure should not have been called; rather, the calling
+  // needed. interp_v_update should not have been called; rather, the calling
   // routine should use vnode instead of vdep in subsequent calculations.
   slmm_assert(step > 0);
 
+  const auto ndim = cm.dep_points_ndim;
+
 #ifdef COMPOSE_PORT
   const auto& vnode = cm.tracer_arrays->vnode;
   const auto& vdep  = cm.tracer_arrays->vdep;
 #else
-  const auto ndim = cm.dep_points_ndim;
   CA4<const Real> vnode(vnode_r, cm.nelemd, cm.nlev, cm.np2, ndim);
-  CA4<      Real> vdep (vdep_r , cm.nelemd, cm.nlev, cm.np2, ndim);
+  CA4<      Real> vdep (vdep_r , cm.nelemd, cm.nlev, cm.np2, ndim+1);
 #endif
-  slmm_assert(vnode.extent_int(3) == cm.dep_points_ndim);
-  slmm_assert(vdep .extent_int(3) == cm.dep_points_ndim);
+  slmm_assert(vnode.extent_int(3) == ndim);
+  slmm_assert(vdep .extent_int(3) == ndim+1);
 
 #ifdef COMPOSE_PORT
   const auto& dep_points = cm.tracer_arrays->dep_points;
@@ -326,7 +458,7 @@ calc_v_departure (IslMpi<MT>& cm, const Int nets, const Int nete,
   wait_on_send(cm, true /* skip_if_empty */);
 }
 
-template void calc_v_departure(
+template void interp_v_update(
   IslMpi<ko::MachineTraits>&, const Int, const Int, const Int, const Real,
   Real*, const Real*, Real*);
 
diff --git a/components/homme/src/share/compose/compose_slmm_islmpi_pack.cpp b/components/homme/src/share/compose/compose_slmm_islmpi_pack.cpp
index 213226b6c3b8..981e2db4a431 100644
--- a/components/homme/src/share/compose/compose_slmm_islmpi_pack.cpp
+++ b/components/homme/src/share/compose/compose_slmm_islmpi_pack.cpp
@@ -36,7 +36,7 @@ void pack_dep_points_sendbuf_pass1_scan (IslMpi<MT>& cm, const bool trajectory)
   const auto& blas = cm.bla;
   const auto nlev = cm.nlev;
   const Int nrmtrank = static_cast<Int>(cm.ranks.size()) - 1;
-  const Int ndim = trajectory ? cm.dep_points_ndim : 3;
+  const Int xsz = trajectory ? cm.traj_msg_sz : 3;
   for (Int ri = 0; ri < nrmtrank; ++ri) {
     const Int lid_on_rank_n = cm.lid_on_rank_h(ri).n();
     const auto f = COMPOSE_LAMBDA (const int idx, Accum& a, const bool fin) {
@@ -66,9 +66,9 @@ void pack_dep_points_sendbuf_pass1_scan (IslMpi<MT>& cm, const bool trajectory)
       if (nx > 0) {
         const auto dos = setbuf(sendbuf, a.mos, lid_on_rank(lidi), lev, nx, fin);
         a.mos += dos;
-        a.sendcount += dos + ndim*nx;
+        a.sendcount += dos + xsz*nx;
         if (fin) t.xptr = a.xos;
-        a.xos += ndim*nx;
+        a.xos += xsz*nx;
         a.qos += trajectory ? nx : 2 + nx;
       }
     };
@@ -112,7 +112,7 @@ void pack_dep_points_sendbuf_pass1_noscan (IslMpi<MT>& cm, const bool trajectory
   deep_copy(cm.bla_h, cm.bla);
 #endif
   const Int nrmtrank = static_cast<Int>(cm.ranks.size()) - 1;
-  const Int ndim = trajectory ? cm.dep_points_ndim : 3;
+  const Int xsz = trajectory ? cm.traj_msg_sz : 3;
 #ifdef COMPOSE_HORIZ_OPENMP
 # pragma omp for
 #endif
@@ -149,9 +149,9 @@ void pack_dep_points_sendbuf_pass1_noscan (IslMpi<MT>& cm, const bool trajectory
         slmm_assert_high(nx > 0);
         const auto dos = setbuf(sendbuf, mos, lev, nx);
         mos += dos;
-        sendcount += dos + ndim*nx;
+        sendcount += dos + xsz*nx;
         t.xptr = xos;
-        xos += ndim*nx;
+        xos += xsz*nx;
         qos += trajectory ? nx : 2 + nx;
         nx_in_lid -= nx;
       }
@@ -214,6 +214,8 @@ void pack_dep_points_sendbuf_pass2 (IslMpi<MT>& cm, const DepPoints<MT>& dep_poi
   {
     ConstExceptGnu Int np2 = cm.np2, nlev = cm.nlev, qsize = cm.qsize;
     ConstExceptGnu Int ndim = trajectory ? cm.dep_points_ndim : 3;
+    ConstExceptGnu Int xsz = trajectory ? ndim+1 : ndim;
+    ConstExceptGnu auto etai_end = cm.etai_end;
     const auto& ed_d = cm.ed_d;
     const auto& mylid_with_comm_d = cm.mylid_with_comm_d;
     const auto& sendbuf = cm.sendbuf;
@@ -251,7 +253,7 @@ void pack_dep_points_sendbuf_pass2 (IslMpi<MT>& cm, const DepPoints<MT>& dep_poi
         ++t.cnt;
 #endif
         qptr = t.qptr;
-        xptr = x_bulkdata_offset(ri) + t.xptr + ndim*cnt;
+        xptr = x_bulkdata_offset(ri) + t.xptr + xsz*cnt;
       }
 #ifdef COMPOSE_HORIZ_OPENMP
       if (horiz_openmp) omp_unset_lock(lock);
@@ -259,9 +261,13 @@ void pack_dep_points_sendbuf_pass2 (IslMpi<MT>& cm, const DepPoints<MT>& dep_poi
       slmm_kernel_assert_high(xptr > 0);
       for (Int i = 0; i < ndim; ++i)
         sb(xptr + i) = dep_points(tci,lev,k,i);
+      if (trajectory)
+        sb(xptr + ndim) = (lev+1 == nlev ?
+                           etai_end :
+                           dep_points(tci,lev+1,k,ndim-1));
       auto& item = ed.rmt.atomic_inc_and_return_next();
       if (trajectory) {
-        item.q_extrema_ptr = item.q_ptr = ndim*(qptr + cnt);
+        item.q_extrema_ptr = item.q_ptr = xsz*(qptr + cnt);
       } else {
         item.q_extrema_ptr = qsize * qptr;
         item.q_ptr = item.q_extrema_ptr + qsize*(2 + cnt);
diff --git a/components/homme/src/share/compose/compose_slmm_islmpi_q.cpp b/components/homme/src/share/compose/compose_slmm_islmpi_q.cpp
index a9af5cfcd476..105c04cce279 100644
--- a/components/homme/src/share/compose/compose_slmm_islmpi_q.cpp
+++ b/components/homme/src/share/compose/compose_slmm_islmpi_q.cpp
@@ -286,7 +286,7 @@ void calc_rmt_q_pass1_scan (IslMpi<MT>& cm, const bool trajectory) {
   const auto& rmt_xs = cm.rmt_xs;
   const auto& rmt_qs_extrema = cm.rmt_qs_extrema;
   const Int nrmtrank = static_cast<Int>(cm.ranks.size()) - 1;
-  const Int ndim = trajectory ? cm.dep_points_ndim : 3;
+  const Int xsz = trajectory ? cm.traj_msg_sz : 3;
   Int cnt = 0, qcnt = 0;
   for (Int ri = 0; ri < nrmtrank; ++ri) {
     const auto get_xos = COMPOSE_LAMBDA (const Int, Int& xos) {
@@ -326,18 +326,18 @@ void calc_rmt_q_pass1_scan (IslMpi<MT>& cm, const bool trajectory) {
           rmt_xs(5*cnt_tot + 3) = xos + a.xos;
           rmt_xs(5*cnt_tot + 4) = a.qos;
           a.cnt += 1;
-          a.xos += ndim;
+          a.xos += xsz;
           a.qos += 1;
         }
       } else {
         a.cnt += nx;
-        a.xos += ndim*nx;
+        a.xos += xsz*nx;
         a.qos += nx;
       }
     };
     Accum a;
     ko::parallel_scan(ko::RangePolicy<typename MT::DES>(0, xos/nreal_per_2int - 1), f, a);
-    cm.sendcount_h(ri) = (trajectory ? ndim : cm.qsize)*a.qos;
+    cm.sendcount_h(ri) = (trajectory ? xsz : cm.qsize)*a.qos;
     cnt += a.cnt;
     qcnt += a.qcnt;
   }
@@ -412,7 +412,7 @@ void calc_rmt_q_pass2 (IslMpi<MT>& cm) {
 template <typename MT>
 void calc_rmt_q_pass1_noscan (IslMpi<MT>& cm, const bool trajectory) {
   const Int nrmtrank = static_cast<Int>(cm.ranks.size()) - 1;
-  const Int ndim = trajectory ? cm.dep_points_ndim : 3;
+  const Int xsz = trajectory ? cm.traj_msg_sz : 3;
 #ifdef COMPOSE_PORT_SEPARATE_VIEWS
 #ifdef COMPOSE_HORIZ_OPENMP
 # pragma omp for
@@ -454,7 +454,7 @@ void calc_rmt_q_pass1_noscan (IslMpi<MT>& cm, const bool trajectory) {
           Int lev, nx;
           mos += getbuf(xs, mos, lev, nx);
           slmm_assert(nx > 0);
-          if ( ! trajectory) {
+          if (not trajectory) {
             cm.rmt_qs_extrema_h(4*qcnt + 0) = ri;
             cm.rmt_qs_extrema_h(4*qcnt + 1) = lid;
             cm.rmt_qs_extrema_h(4*qcnt + 2) = lev;
@@ -469,7 +469,7 @@ void calc_rmt_q_pass1_noscan (IslMpi<MT>& cm, const bool trajectory) {
             cm.rmt_xs_h(5*cnt + 3) = xos;
             cm.rmt_xs_h(5*cnt + 4) = qos;
             ++cnt;
-            xos += ndim;
+            xos += xsz;
             ++qos;
           }
           nx_in_lid -= nx;
@@ -480,7 +480,7 @@ void calc_rmt_q_pass1_noscan (IslMpi<MT>& cm, const bool trajectory) {
         if (nx_in_rank == 0) break;
       }
       slmm_assert(nx_in_rank == 0);
-      cm.sendcount_h(ri) = (trajectory ? ndim : cm.qsize)*qos;
+      cm.sendcount_h(ri) = (trajectory ? xsz : cm.qsize)*qos;
     }
     cm.nrmt_xs = cnt;
     cm.nrmt_qs_extrema = trajectory ? 0 : qcnt;
diff --git a/components/homme/src/share/compose_mod.F90 b/components/homme/src/share/compose_mod.F90
index 6085eaf1238d..813b0e6a68b2 100644
--- a/components/homme/src/share/compose_mod.F90
+++ b/components/homme/src/share/compose_mod.F90
@@ -188,14 +188,13 @@ subroutine slmm_check_ref2sphere(ie, sphere_cart_coord) bind(c)
        type(cartesian3D_t), intent(in) :: sphere_cart_coord
      end subroutine slmm_check_ref2sphere
 
-     subroutine slmm_set_hvcoord(etai_beg, etai_end, etam) bind(c)
+     subroutine slmm_set_hvcoord(etai, etam) bind(c)
        use iso_c_binding, only: c_double
-       use dimensions_mod, only : nlev
-       real(kind=c_double), value, intent(in) :: etai_beg, etai_end
-       real(kind=c_double), intent(in) :: etam(nlev)
+       use dimensions_mod, only : nlevp, nlev
+       real(kind=c_double), intent(in) :: etai(nlevp), etam(nlev)
      end subroutine slmm_set_hvcoord
 
-     subroutine slmm_calc_v_departure(nets, nete, step, dtsub, dep_points, &
+     subroutine slmm_interp_v_update(nets, nete, step, dtsub, dep_points, &
           dep_points_ndim, vnode, vdep, info) bind(c)
        use iso_c_binding, only: c_int, c_double
        use dimensions_mod, only : np, nlev, nelemd, qsize
@@ -206,7 +205,7 @@ subroutine slmm_calc_v_departure(nets, nete, step, dtsub, dep_points, &
        real(kind=c_double), intent(in) :: vnode(dep_points_ndim,np,np,nlev,nelemd)
        real(kind=c_double), intent(out) :: vdep(dep_points_ndim,np,np,nlev,nelemd)
        integer(kind=c_int), intent(out) :: info
-     end subroutine slmm_calc_v_departure
+     end subroutine slmm_interp_v_update
 
      subroutine slmm_csl_set_elem_data(ie, metdet, qdp, n0_qdp, dp, q, nelem_in_patch, &
           h2d, d2h) bind(c)
@@ -278,7 +277,7 @@ subroutine compose_init(par, elem, GridVertex, init_kokkos)
          ! These are for non-scalable grid initialization, still used for RRM.
          sc2gci(:), sc2rank(:)        ! space curve index -> (GID, rank)
     integer :: lid2gid(nelemd), lid2facenum(nelemd)
-    integer :: i, j, k, sfc, gid, igv, sc, geometry_type, sl_traj_3d
+    integer :: i, j, k, sfc, gid, igv, sc, geometry_type, sl_traj_3d, nsub
     ! To map SFC index to IDs and ranks
     logical(kind=c_bool) :: use_sgi, owned, independent_time_steps, hard_zero
     integer, allocatable :: owned_ids(:)
@@ -305,7 +304,9 @@ subroutine compose_init(par, elem, GridVertex, init_kokkos)
        ! related to the dynamics' tstep, dt_tracer_factor is meaningful,
        ! implying:
        semi_lagrange_halo = (dt_tracer_factor + 2) / 3
-       if (semi_lagrange_halo < 1) semi_lagrange_halo = 1
+       ! Set the lower bound to 2. 1 is also reasonable, but 2 is a conservative
+       ! choice with no performance impact.
+       if (semi_lagrange_halo < 2) semi_lagrange_halo = 2
     end if
 
     geometry_type = 0 ! sphere
@@ -396,10 +397,11 @@ subroutine compose_init(par, elem, GridVertex, init_kokkos)
        nirptr(nelemd+1) = k - 1
        sl_traj_3d = 0
        if (independent_time_steps) sl_traj_3d = 1
+       nsub = semi_lagrange_trajectory_nsubstep
        call slmm_init_impl(par%comm, transport_alg, np, nlev, qsize, qsize_d, &
             nelem, nelemd, cubed_sphere_map, geometry_type, lid2gid, lid2facenum, &
             nbr_id_rank, nirptr, semi_lagrange_halo, sl_traj_3d, &
-            semi_lagrange_trajectory_nsubstep, semi_lagrange_nearest_point_lev, &
+            nsub, semi_lagrange_nearest_point_lev, &
             size(lid2gid), size(lid2facenum), size(nbr_id_rank), size(nirptr))
        if (geometry_type == 1) call slmm_init_plane(Sx, Sy, Lx, Ly)
        deallocate(nbr_id_rank, nirptr)
diff --git a/components/homme/src/share/compose_test_mod.F90 b/components/homme/src/share/compose_test_mod.F90
index 263c4ec86099..e8623ccee961 100644
--- a/components/homme/src/share/compose_test_mod.F90
+++ b/components/homme/src/share/compose_test_mod.F90
@@ -78,7 +78,7 @@ end subroutine compose_stt_finish
 contains
 
   ! For comprehensive testing.
-  subroutine compose_test(par, hvcoord, dom_mt, elem, eval)
+  subroutine compose_test(par, hvcoord, dom_mt, elem, nerr, eval)
     use kinds, only: real_kind
     use parallel_mod, only: parallel_t
     use domain_mod, only: domain1d_t
@@ -101,11 +101,12 @@ subroutine compose_test(par, hvcoord, dom_mt, elem, eval)
     type (hvcoord_t) , intent(in) :: hvcoord
     type (domain1d_t), pointer, intent(in) :: dom_mt(:)
     type (element_t), intent(inout) :: elem(:)
+    integer, optional, intent(out) :: nerr
     real (real_kind), optional, intent(out) :: eval(:)
 
     type (hybrid_t) :: hybrid
     type (derivative_t) :: deriv
-    integer :: ithr, nets, nete, nerr
+    integer :: ithr, nets, nete, ne, nesum
 
 #ifdef HOMME_ENABLE_COMPOSE
     if (transport_alg == 19) then
@@ -120,10 +121,12 @@ subroutine compose_test(par, hvcoord, dom_mt, elem, eval)
 
     ! 1. Unit tests.
     call compose_unittest()
-    call sl_unittest(par, hvcoord)
-    nerr = 0
-    call cedr_unittest(par%comm, nerr)
-    if (nerr /= 0) print *, 'cedr_unittest returned', nerr
+    call sl_unittest(par, hvcoord, ne)
+    nesum = ne
+    call cedr_unittest(par%comm, ne)
+    if (ne /= 0) print *, 'cedr_unittest returned', ne
+    nesum = nesum + ne
+    if (present(nerr)) nerr = nesum
 
 #if (defined HORIZ_OPENMP)
     !$omp parallel num_threads(hthreads), default(SHARED), private(ithr,nets,nete,hybrid)
diff --git a/components/homme/src/share/control_mod.F90 b/components/homme/src/share/control_mod.F90
index e94029abba3f..8d2c2c8ea64a 100644
--- a/components/homme/src/share/control_mod.F90
+++ b/components/homme/src/share/control_mod.F90
@@ -643,7 +643,7 @@ end subroutine test_timestep_make_parameters_consistent
 
 subroutine set_planar_defaults()
 
-use physical_constants, only: Lx, Ly, Sx, Sy
+use physical_constants, only: Lx, Ly, Sx, Sy, dd_pi, rearth
  
 !since defaults here depend on test, they cannot be set before ctl_nl is read, unlike some other parameters, bubble_*, etc.        
 !if true, most likely lx,ly,sx,sy weren't set in ctl_nl
@@ -705,7 +705,11 @@ subroutine set_planar_defaults()
 !       Ly = 5000.0D0 * 1000.0D0
 !       Sx = 0.0D0
 !       Sy = 0.0D0
-
+    else if (test_case(1:16) == 'planar_transport') then
+       Lx = 2*dd_pi*rearth
+       Ly = Lx
+       Sx = -Lx/2
+       Sy = Sx
     endif
     endif !if lx,ly,sx,sy are not set in nl
 
diff --git a/components/homme/src/share/cxx/ComposeTransport.cpp b/components/homme/src/share/cxx/ComposeTransport.cpp
index 8d7349093282..6433fb39fdc1 100644
--- a/components/homme/src/share/cxx/ComposeTransport.cpp
+++ b/components/homme/src/share/cxx/ComposeTransport.cpp
@@ -55,6 +55,11 @@ void ComposeTransport::init_boundary_exchanges () {
   m_compose_impl->init_boundary_exchanges();
 }
 
+void ComposeTransport::observe_velocity (const TimeLevel& tl, const int step) {
+  assert(is_setup);
+  m_compose_impl->observe_velocity(tl, step);
+}
+
 void ComposeTransport::run (const TimeLevel& tl, const Real dt) {
   assert(is_setup);
   m_compose_impl->run(tl, dt);
@@ -69,13 +74,11 @@ std::vector<std::pair<std::string, int> >
 ComposeTransport::run_unit_tests () {
   assert(is_setup);
   std::vector<std::pair<std::string, int> > fails;
-  int ne, nerr = 0;
+  int ne;
   ne = m_compose_impl->run_trajectory_unit_tests();
-  if (ne) fails.push_back(std::make_pair("run_trajectory_unit_tests", nerr));
-  nerr += ne;
+  if (ne) fails.push_back(std::make_pair("run_trajectory_unit_tests", ne));
   ne = m_compose_impl->run_enhanced_trajectory_unit_tests();
-  if (ne) fails.push_back(std::make_pair("run_enhanced_trajectory_unit_tests", nerr));
-  nerr += ne;
+  if (ne) fails.push_back(std::make_pair("run_enhanced_trajectory_unit_tests", ne));
   return fails;
 }
 
diff --git a/components/homme/src/share/cxx/ComposeTransport.hpp b/components/homme/src/share/cxx/ComposeTransport.hpp
index cbdbe0f9715a..8068f65f47d0 100644
--- a/components/homme/src/share/cxx/ComposeTransport.hpp
+++ b/components/homme/src/share/cxx/ComposeTransport.hpp
@@ -15,10 +15,10 @@
 
 namespace Homme {
 
-class FunctorsBuffersManager;
-class ComposeTransportImpl;
-class SimulationParams;
-class TimeLevel;
+struct FunctorsBuffersManager;
+struct ComposeTransportImpl;
+struct SimulationParams;
+struct TimeLevel;
 
 class ComposeTransport {
 public:
@@ -38,6 +38,7 @@ class ComposeTransport {
   void init_buffers(const FunctorsBuffersManager& fbm);
   void init_boundary_exchanges();
 
+  void observe_velocity(const TimeLevel& tl, const int step);
   void run(const TimeLevel& tl, const Real dt);
   void remap_q(const TimeLevel& tl);
 
diff --git a/components/homme/src/share/cxx/ComposeTransportImpl.hpp b/components/homme/src/share/cxx/ComposeTransportImpl.hpp
index bcdff306e511..dea144818d8a 100644
--- a/components/homme/src/share/cxx/ComposeTransportImpl.hpp
+++ b/components/homme/src/share/cxx/ComposeTransportImpl.hpp
@@ -72,16 +72,17 @@ struct ComposeTransportImpl {
   typedef typename ViewConst<S2Nlev>::type CS2Nlev;
   typedef typename ViewConst<R2Nlev>::type CR2Nlev;
 
-  using  DpSlot = ExecViewUnmanaged<      Scalar**   [NP][NP][NUM_LEV]>;
-  using   VSlot = ExecViewUnmanaged<      Scalar**[2][NP][NP][NUM_LEV]>;
-  using CDpSlot = ExecViewUnmanaged<const Scalar**   [NP][NP][NUM_LEV]>;
-  using  CVSlot = ExecViewUnmanaged<const Scalar**[2][NP][NP][NUM_LEV]>;
+  using DpSnaps = ExecViewManaged<Scalar**   [NP][NP][NUM_LEV]>;
+  using  VSnaps = ExecViewManaged<Scalar**[2][NP][NP][NUM_LEV]>;
+
   struct VelocityRecord;
 
   struct Data {
     int nelemd, qsize, limiter_option, cdr_check, hv_q, hv_subcycle_q;
     int geometry_type; // 0: sphere, 1: plane
     int trajectory_nsubstep; // 0: original alg, >= 1: enhanced
+    int trajectory_nvelocity;
+    int diagnostics;
     Real nu_q, hv_scaling, dp_tol, deta_tol;
     bool independent_time_steps;
 
@@ -94,8 +95,12 @@ struct ComposeTransportImpl {
 
     ExecView<Scalar[NUM_LEV]> hydetai; // diff(etai)
     ExecView<Real[NUM_INTERFACE_LEV]> hydetam_ref;
+    ExecView<Scalar[NUM_LEV_P]> db_deta_i; // B_eta at interfaces
 
+    // Persistent, allocated memory, depending on options.
     DeparturePoints dep_pts, vnode, vdep; // (ie,lev,i,j,d)
+    DpSnaps dp_extra_snapshots; // (ie,snapshot,i,j,lev)
+    VSnaps vel_extra_snapshots; // (ie,snapshot,d,i,j,lev)
 
     std::shared_ptr<VelocityRecord> vrec;
 
@@ -133,17 +138,18 @@ struct ComposeTransportImpl {
   }
 
   void set_dp_tol();
-  void setup_enhanced_trajectory();
+  void setup_enhanced_trajectory(const SimulationParams& params, const int num_elems);
   void reset(const SimulationParams& params);
   int requested_buffer_size() const;
   void init_buffers(const FunctorsBuffersManager& fbm);
   void init_boundary_exchanges();
 
+  void observe_velocity(const TimeLevel& tl, const int step);
   void run(const TimeLevel& tl, const Real dt);
   void remap_q(const TimeLevel& tl);
 
   void calc_trajectory(const int np1, const Real dt);
-  void calc_enhanced_trajectory(const int np1, const Real dt);
+  void calc_enhanced_trajectory(const int nstep, const int np1, const Real dt);
   void remap_v(const ExecViewUnmanaged<const Scalar*[NUM_TIME_LEVELS][NP][NP][NUM_LEV]>& dp3d,
                const int np1, const ExecViewUnmanaged<const Scalar*[NP][NP][NUM_LEV]>& dp,
                const ExecViewUnmanaged<Scalar*[2][NP][NP][NUM_LEV]>& v);
@@ -334,20 +340,39 @@ struct ComposeTransportImpl {
     }
   }
 
-  // Form a 3rd-degree Lagrange polynomial over (x(k-1:k+1), y(k-1:k+1)) and set
+  // Form a 2nd-degree Lagrange polynomial over (x(k-1:k+1), y(k-1:k+1)) and set
   // yi(k) to its derivative at x(k). yps(:,:,0) is not written.
+  //   This is equivalent to a weighted average of 1-sided finite differences:
+  //      dx1 = xk - xkm1, dx2 = xkp1 - xk
+  //      w = dx2/(dx1 + dx2)
+  //      return w (yk - ykm1)/dx1 + (1-w) (ykp1 - yk)/dx2.
+  // This impl is retained for BFBness in the original trajectory algorithm with
+  // the F90. The next impl is preferred in practice.
+  template <typename Real>
+  KOKKOS_FUNCTION static Real approx_derivative1 (
+    const Real& xkm1, const Real& xk, const Real& xkp1,
+    const Real& ykm1, const Real& yk, const Real& ykp1)
+  {
+    return (ykm1*((1/(xkm1 - xk))*((xk - xkp1)/(xkm1 - xkp1))) +
+            yk  *(1/(xk - xkm1) + 1/(xk - xkp1)) +
+            ykp1*((1/(xkp1 - xk))*((xk - xkm1)/(xkp1 - xkm1))));
+  }
+
+  // In infinite precision, same as above. Impl as the weighted average of
+  // 1-sided finite differences to reduce ops.
   template <typename Real>
   KOKKOS_FUNCTION static Real approx_derivative (
     const Real& xkm1, const Real& xk, const Real& xkp1,
     const Real& ykm1, const Real& yk, const Real& ykp1)
   {
-    return (ykm1*((         1 /(xkm1 - xk  ))*((xk - xkp1)/(xkm1 - xkp1))) +
-            yk  *((         1 /(xk   - xkm1))*((xk - xkp1)/(xk   - xkp1)) +
-                  ((xk - xkm1)/(xk   - xkm1))*(         1 /(xk   - xkp1))) +
-            ykp1*(((xk - xkm1)/(xkp1 - xkm1))*(         1 /(xkp1 - xk  ))));
+    const auto
+      dx1 = xk - xkm1,
+      dx2 = xkp1 - xk,
+      w = dx2/(dx1 + dx2);
+    return w*(yk - ykm1)/dx1 + (1-w)*(ykp1 - yk)/dx2;
   }
 
-  KOKKOS_INLINE_FUNCTION static void approx_derivative (
+  KOKKOS_INLINE_FUNCTION static void approx_derivative1 (
     const KernelVariables& kv, const CSNlevp& xs, const CSNlevp& ys,
     const SNlev& yps) // yps(:,:,0) is undefined
   {
@@ -356,8 +381,8 @@ struct ComposeTransportImpl {
     RNlev yp(pack2real(yps));
     const auto f = [&] (const int i, const int j, const int k) {
       if (k == 0) return;
-      yp(i,j,k) = approx_derivative(x(i,j,k-1), x(i,j,k), x(i,j,k+1),
-                                    y(i,j,k-1), y(i,j,k), y(i,j,k+1));
+      yp(i,j,k) = approx_derivative1(x(i,j,k-1), x(i,j,k), x(i,j,k+1),
+                                     y(i,j,k-1), y(i,j,k), y(i,j,k+1));
     };
     loop_ijk<num_phys_lev>(kv, f);
   }
diff --git a/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectory.cpp b/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectory.cpp
index 62bd190bc305..379a32a9ccaa 100644
--- a/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectory.cpp
+++ b/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectory.cpp
@@ -89,17 +89,17 @@
    (calc_etadotmid_from_etadotdpdnint).
      Use eta_dot, the horizontal velocity data, and the update formula described
    in the comment for calc_nodal_velocities to compute the velocity term in the
-   update formula at the Eulerian vertical-midpoint nodes. Call the result V
+   update formula at the Eulerian vertical-midpoint nodes. Call the result V_upd
    (calc_vel_horiz_formula_node_ref_mid, calc_eta_dot_formula_node_ref_mid).
      In general, the trajectory arrival point at t1 is not on the grid, but it
-   is in the first substep. If it is not on the grid, interpolate V to the
-   arrival points to produce V_dep (calc_v_departure). A detail here is we
+   is in the first substep. If it is not on the grid, interpolate V_upd to the
+   arrival points to produce V_upd' (interp_v_update). A detail here is we
    should actually think of the original velocity data as being interpolated,
-   and then V_dep is computed from the interpolated data. But the formula to
-   compute V is linear in these data, so we can defer interpolation to the end
-   and do it just once.
+   and then V_upd' is computed from the interpolated data. But the formula to
+   compute V_upd is linear in these data, so we can defer interpolation to the
+   end and do it just once.
      Integrate the trajectory backward from t1 at the arrival point to t0 at the
-   departure point using V_dep (update_dep_points).
+   departure point using V_upd or V_upd' (update_dep_points).
      The algorithm up to this point can be substepped, running multiple times to
    go backward from t1 to t0 in multiple steps.
      After substepping is finished, there is one final part.
@@ -129,13 +129,151 @@
 #include "ComposeTransportImplEnhancedTrajectoryImpl.hpp"
 
 namespace Homme {
+
+using CTI = ComposeTransportImpl;
+
+Real& CTI::VelocityRecord::t_vel (const int i) {
+  assert(_nvel > 2);
+  assert(i >= 0 and i < _nvel);
+  return _t_vel[i];
+}
+
+int& CTI::VelocityRecord::obs_slots (const int n, const int k) {
+  assert(_nvel > 2);
+  assert(n >= 0 and n <= _dtf);
+  assert(k >= 0 and k <= 1);
+  return _obs_slots[2*n+k];
+}
+
+Real& CTI::VelocityRecord::obs_wts (const int n, const int k) {
+  assert(_nvel > 2);
+  assert(n >= 0 and n <= _dtf);
+  assert(k >= 0 and k <= 1);
+  return _obs_wts[2*n+k];  
+}
+
+int& CTI::VelocityRecord::run_step (const int i) {
+  assert(_nvel > 2);
+  assert(i >= 0 and i <= _nsub);
+  return _run_step[i];
+}
+
+Real CTI::VelocityRecord::t_vel (const int i) const {
+  return const_cast<VelocityRecord*>(this)->t_vel(i);
+}
+
+int CTI::VelocityRecord::obs_slots (const int n, const int k) const {
+  return const_cast<VelocityRecord*>(this)->obs_slots(n,k);
+}
+
+Real CTI::VelocityRecord::obs_wts (const int n, const int k) const {
+  return const_cast<VelocityRecord*>(this)->obs_wts(n,k);
+}
+
+int CTI::VelocityRecord::run_step (const int i) const {
+  return const_cast<VelocityRecord*>(this)->run_step(i);
+}
+
+void CTI::VelocityRecord
+::init (const int dtf, const int drf_param, const int nsub, const int nvel_param) {
+  const int
+    drf = drf_param == 0 ? 1 : drf_param,
+    navail = dtf/drf + 1,
+    nvel = std::min(nvel_param == -1 ?
+                    (2 + (nsub-1)/2) :  // default value
+                    nvel_param,
+                    std::min(nsub+1,    // can't use more than this
+                             navail));  // this is the max available
+
+  _dtf = dtf; _drf = drf; _nsub = nsub; _nvel = nvel;
+
+  // nsub <= 1: No substepping.
+  // nvel <= 2: Save velocity only at endpoints, as always occurs.
+  if (nsub <= 1 or nvel <= 2) {
+    _nvel = 2;
+    return;
+  }
+
+  _t_vel.resize(nvel);
+  _obs_slots.resize(2*dtf); _obs_wts.resize(2*dtf);
+  _run_step.resize(nsub+1);
+
+  // Times at which velocity data are available.
+  std::vector<int> t_avail(navail); {
+    t_avail[0] = 0;
+    int i = 1;
+    for (int n = 0; n < dtf; ++n) {
+      if ((n+1) % drf != 0) continue;
+      t_avail[i] = n+1;
+      i = i + 1;
+    }
+    assert(i == navail);
+    assert(t_avail[navail-1] == dtf);
+  }
+
+  // Times to which we associate velocity data.
+  for (int n = 0; n < nvel; ++n) {
+    t_vel(n) = ((n*dtf) % (nvel-1) == 0 ?
+                /**/ (n*dtf) / (nvel-1) :
+                Real (n*dtf) / (nvel-1));
+    assert(t_vel(n) >= 0 and t_vel(n) <= dtf);
+    assert(n == 0 or t_vel(n) > t_vel(n-1));
+  }
+
+  // Build the tables mapping n in 0:dtf-1 to velocity slots to accumulate into.
+  for (int n = 0; n < dtf; ++n) {
+    for (int k = 0; k < 2; ++k) {
+      obs_slots(n,k) = -1;
+      obs_wts(n,k) = 0;
+    }
+    if (n == dtf-1) continue;
+    if ((n+1) % drf != 0) continue;
+    const int time = n+1;
+    int iav = -1;
+    for (int i = 1; i <= navail-1; ++i)
+      if (time == t_avail[i]) {
+        iav = i;
+        break;
+      }
+    assert(iav > 0 and iav < navail-1);
+    for (int i = 1; i < nvel-1; ++i) {
+      if (t_avail[iav-1] < t_vel(i) and time > t_vel(i)) {
+        obs_slots(n,0) = i;
+        obs_wts(n,0) = ((t_vel(i) - t_avail[iav-1]) /
+                        (t_avail[iav] - t_avail[iav-1]));
+      }
+      if (time <= t_vel(i) and t_avail[iav+1] > t_vel(i)) {
+        obs_slots(n,1) = i;
+        obs_wts(n,1) = ((t_avail[iav+1] - t_vel(i)) /
+                        (t_avail[iav+1] - t_avail[iav]));
+      }
+    }
+  }
+
+  // Build table mapping n to interval to use. The trajectories go backward in
+  // time, and this table reflects that.
+  run_step(0) = nvel-1;
+  run_step(nsub) = 1;
+  for (int n = 1; n < nsub; ++n) {
+    const auto time = Real((nsub-n)*dtf)/nsub;
+    int ifnd = -1;
+    for (int i = 0; i < nvel-1; ++i)
+      if (t_vel(i) <= time and time <= t_vel(i+1)) {
+        ifnd = i;
+        break;
+      }
+    assert(ifnd >= 0 and ifnd < nvel-1);
+    run_step(n) = ifnd + 1;
+  }
+}
+
 namespace {
 
 // Set dep_points_all to level-midpoint arrival points.
 void init_dep_points (const CTI& c, const cti::DeparturePoints& dep_pts) {
   const auto independent_time_steps = c.m_data.independent_time_steps;
   const auto& sphere_cart = c.m_geometry.m_sphere_cart;
-  const CRNV<NUM_PHYSICAL_LEV> hyetam(cti::cpack2real(c.m_hvcoord.etam));
+  const auto& etai = c.m_hvcoord.etai;
   assert(not independent_time_steps or dep_pts.extent_int(4) == 4);
   const auto f = KOKKOS_LAMBDA (const int idx) {
     int ie, lev, i, j;
@@ -143,7 +281,7 @@ void init_dep_points (const CTI& c, const cti::DeparturePoints& dep_pts) {
     for (int d = 0; d < 3; ++d)
       dep_pts(ie,lev,i,j,d) = sphere_cart(ie,i,j,d);
     if (independent_time_steps)
-      dep_pts(ie,lev,i,j,3) = hyetam(lev);
+      dep_pts(ie,lev,i,j,3) = etai(lev);
   };
   c.launch_ie_physlev_ij(f);
 }
@@ -177,6 +315,26 @@ void update_dep_points (
   c.launch_ie_physlev_ij(f);
 }
 
+// Interpolate eta_dot at interfaces. The support data are not midpoint data,
+// though; rather, they're interface data collected at different horizontal
+// points. Thus, to be clear, this is not midpoint-to-interface interpolation of
+// eta_dot.
+void interp_etadot_at_interfaces (const CTI& c, const cti::DeparturePoints& vdep) {
+  assert(vdep.extent_int(4) == 5);
+  const auto& etai = c.m_hvcoord.etai;
+  const CRNV<NUM_PHYSICAL_LEV> etam(cti::cpack2real(c.m_hvcoord.etam));
+  const auto f = KOKKOS_LAMBDA (const int idx) {
+    int ie, lev, i, j;
+    cti::idx_ie_physlev_ij(idx, ie, lev, i, j);
+    if (lev == 0) return;
+    const auto a = (etai(lev) - etam(lev-1)) / (etam(lev) - etam(lev-1));
+    // Safe to write to this slot in parallel b/c only this slot reads from it.
+    vdep(ie,lev,i,j,3) = ((1-a)*vdep(ie,lev-1,i,j,4) +
+                          (  a)*vdep(ie,lev  ,i,j,3));
+  };
+  c.launch_ie_physlev_ij(f);
+}
+
 /* Evaluate a formula to provide an estimate of nodal velocities that are use to
    create a 2nd-order update to the trajectory. The fundamental formula for the
    update in position p from arrival point p1 to departure point p0 is
@@ -184,10 +342,9 @@ void update_dep_points (
    Here we compute the velocity estimate at the nodes:
        1/2 (v(p1,t0) + v(p1,t1) - dt v(p1,t1) grad v(p1,t0)).
 */
+template <typename Snapshots>
 void calc_nodal_velocities (
-  const CTI& c, const Real dtsub, const Real halpha[2],
-  const cti::CVSlot& v1, const cti::CDpSlot& dp1, const int idx1,
-  const cti::CVSlot& v2, const cti::CDpSlot& dp2, const int idx2,
+  const CTI& c, const Real dtsub, const Snapshots& snaps,
   const cti::DeparturePoints& vnode)
 {
   using Kokkos::ALL;
@@ -201,14 +358,14 @@ void calc_nodal_velocities (
   const auto& hybi = h.hybrid_bi_packed;
   const auto& hydai = h.hybrid_ai_delta;
   const auto& hydbi = h.hybrid_bi_delta;
-  const auto& hyetam = h.etam;
+  const auto& db_deta_i = d.db_deta_i;
   const auto& hyetai = h.etai;
+  const auto& hyetam = h.etam;
   const auto& hydetai = d.hydetai;
   const auto& buf1a = d.buf1o[0]; const auto& buf1b = d.buf1o[1];
   const auto& buf1c = d.buf1o[2]; const auto& buf1d = d.buf1o[3];
   const auto& buf2a = d.buf2 [0]; const auto& buf2b = d.buf2 [1];
   const auto& buf2c = d.buf2 [2]; const auto& buf2d = d.buf2 [3];
-  const auto alpha0 = halpha[0], alpha1 = halpha[1];
   const auto f = KOKKOS_LAMBDA (const cti::MT& team) {
     KernelVariables kv(team);
     const int ie = kv.ie;
@@ -216,22 +373,17 @@ void calc_nodal_velocities (
     const auto  wrk2 = Homme::subview(buf1b, kv.team_idx);
     const auto vwrk1 = Homme::subview(buf2a, kv.team_idx);
     const auto vwrk2 = Homme::subview(buf2b, kv.team_idx);
-    const auto v1_ie = Homme::subview(v1, ie, idx1);
-    const auto v2_ie = Homme::subview(v2, ie, idx2);
-    const Real alpha[] = {alpha0, alpha1};
     CSelNlevp eta_dot[] = {Homme::subview(buf1c, kv.team_idx),
                            Homme::subview(buf1d, kv.team_idx)};
     {
       SelNlevp eta_dot[] = {Homme::subview(buf1c, kv.team_idx),
                             Homme::subview(buf1d, kv.team_idx)};
       if (independent_time_steps) {
-        const auto dp1_ie = Homme::subview(dp1, ie, idx1);
-        const auto dp2_ie = Homme::subview(dp2, ie, idx2);
-        calc_eta_dot_ref_mid(kv, sphere_ops,
-                             ps0, hyai0, hybi, hydai, hydbi, hydetai,
-                             alpha, v1_ie, dp1_ie, v2_ie, dp2_ie,
-                             wrk1, wrk2, vwrk1,
-                             eta_dot);
+        calc_eta_dot_ref(kv, sphere_ops, snaps,
+                         ps0, hyai0, hybi,
+                         hydai, hydbi, hydetai, db_deta_i,
+                         wrk1, wrk2, vwrk1,
+                         eta_dot);
       } else {
         for (int t = 0; t < 2; ++t) {
           const auto& ed = eta_dot[t];
@@ -242,19 +394,18 @@ void calc_nodal_velocities (
         }
       }
     }
-    // Collect the horizontal nodal velocities. v1,2 are on Eulerian levels. v1
-    // is from time t1 < t2.
+    // Collect the horizontal nodal velocities.
     auto* vm1 = Homme::subview(buf2c, kv.team_idx).data();
     auto* vm2 = Homme::subview(buf2d, kv.team_idx).data();
     CS2elNlev vsph[] = {CS2elNlev(vm1), CS2elNlev(vm2)};
     {
       S2elNlev vsph[] = {S2elNlev(vm1), S2elNlev(vm2)};
+      const auto e = snaps.get_element(ie);
       for (int t = 0; t < 2; ++t) {
         const auto& v = vsph[t];
         for (int d = 0; d < 2; ++d) {
           const auto f = [&] (const int i, const int j, const int k) {
-            v(d,i,j,k) = ((1 - alpha[t])*v1_ie(d,i,j,k) +
-                          /**/ alpha[t] *v2_ie(d,i,j,k));
+            v(d,i,j,k) = e.combine_v(t, d, i, j, k);
           };
           cti::loop_ijk<cti::num_lev_pack>(kv, f);
         }
@@ -273,7 +424,7 @@ void calc_nodal_velocities (
                                         vnode_ie);
     if (independent_time_steps) {
       kv.team_barrier();
-      calc_eta_dot_formula_node_ref_mid(kv, sphere_ops,
+      calc_eta_dot_formula_node_ref_int(kv, sphere_ops,
                                         hyetai, hyetam,
                                         dtsub, vsph, eta_dot,
                                         wrk1, vwrk1,
@@ -287,7 +438,7 @@ void calc_nodal_velocities (
 // grid's arrival midpoints, where the floating levels are those that evolve
 // over the course of the full tracer time step. Also compute divdp, which holds
 // the floating levels' dp values for later use in vertical remap.
-void interp_departure_points_to_floating_level_midpoints (const CTI& c, const int np1) {
+int interp_departure_points_to_floating_level_midpoints (const CTI& c, const int np1) {
   using Kokkos::ALL;
   const int nlev = NUM_PHYSICAL_LEV, nlevp = nlev+1;
   const auto is_sphere = c.m_data.geometry_type == 0;
@@ -298,14 +449,14 @@ void interp_departure_points_to_floating_level_midpoints (const CTI& c, const in
   const auto& hybi = h.hybrid_bi;
   const auto& hyetai = h.etai;
   const CRNV<NUM_PHYSICAL_LEV> hyetam(cti::cpack2real(h.etam));
-  const auto& detam_ref = d.hydetam_ref;
+  const CRNV<NUM_PHYSICAL_LEV> detai(cti::cpack2real(d.hydetai));
   const auto deta_tol = d.deta_tol;
   const auto& dep_pts = d.dep_pts;
   const auto& dp3d = c.m_state.m_dp3d;
   const auto& buf1a = d.buf1e[0]; const auto& buf1b = d.buf1e[1];
   const auto& buf1c = d.buf1e[2]; const auto& buf1d = d.buf1e[3];
   const auto& buf2a = d.buf2[0];
-  const auto f = KOKKOS_LAMBDA (const cti::MT& team) {
+  const auto f = KOKKOS_LAMBDA (const cti::MT& team, int& limcnt) {
     KernelVariables kv(team);
     const int ie = kv.ie;
     const auto wrk1 = Homme::subview(buf1a, kv.team_idx);
@@ -313,27 +464,26 @@ void interp_departure_points_to_floating_level_midpoints (const CTI& c, const in
     const auto wrk3 = Homme::subview(buf1c, kv.team_idx);
     const auto wrk4 = Homme::subview(buf1d, kv.team_idx);
     const auto vwrk = Homme::subview(buf2a, kv.team_idx);
-    // Reconstruct Lagrangian levels at t1 on arrival column:
-    //     eta_arr_int = I[eta_ref_mid([eta(0),eta_dep_mid,eta(1)])](eta_ref_int)
-    const auto etam = p2rel(wrk3.data(), nlev);
+    // Reconstruct Lagrangian levels at t1 on arrival column.
+    const auto eta = p2rel(wrk3.data(), nlev);
     const auto f = [&] (const int i, const int j, const int k) {
-      etam(i,j,k) = dep_pts(ie,k,i,j,3);
+      eta(i,j,k) = dep_pts(ie,k,i,j,3);
     };
     cti::loop_ijk<cti::num_phys_lev>(kv, f);
     kv.team_barrier();
-    limit_etam(kv, nlev,
-               hyetai, detam_ref, deta_tol,
-               p2rel(wrk1.data(), nlevp), p2rel(wrk2.data(), nlevp),
-               etam);
+    limcnt += limit_etai(kv, nlev,
+                         hyetai, detai, deta_tol,
+                         p2rel(wrk1.data(), nlev), p2rel(wrk2.data(), nlev),
+                         eta);
     kv.team_barrier();
     {
-      // Compute eta_arr_int.
+      // Compute Lagrangian level interfaces at t1 on arrival column.
       const auto etai_arr = p2rel(wrk4.data(), nlevp);
-      eta_interp_eta(kv, nlev,
-                     hyetai,
-                     etam, hyetam,
-                     p2rel(wrk1.data(), nlev+2), RnV(cti::pack2real(wrk2), nlev+2),
-                     nlevp-2, hyetai, etai_arr, 1);
+      // eta_arr_int = I[eta_ref_int(eta_dep_int)](eta_ref_int)
+      eta_interp_eta(kv, nlev, hyetai,
+                     nlevp-2, 1, eta, hyetai,
+                     p2rel(wrk1.data(), nlev+1), RnV(cti::pack2real(wrk2), nlev+1),
+                     nlevp-2, 1, hyetai, etai_arr);
       const auto f = [&] (const int i, const int j) {
         etai_arr(i,j,0) = hyetai(0);
         etai_arr(i,j,nlev) = hyetai(nlev);
@@ -354,14 +504,13 @@ void interp_departure_points_to_floating_level_midpoints (const CTI& c, const in
                       NP, NP, NUM_LEV*VECTOR_SIZE));
       kv.team_barrier();
     }
-    // Compute Lagrangian level midpoints at t1 on arrival column:
-    //     eta_arr_mid = I[eta_ref_mid([eta(0),eta_dep_mid,eta(1)])](eta_ref_mid)
+    // Compute Lagrangian level midpoints at t1 on arrival column.
     const auto etam_arr = p2rel(wrk4.data(), nlev);
-    eta_interp_eta(kv, nlev,
-                   hyetai,
-                   etam, hyetam,
-                   p2rel(wrk1.data(), nlev+2), RnV(cti::pack2real(wrk2), nlev+2),
-                   nlev, hyetam, etam_arr);
+    // eta_arr_mid = I[eta_ref_int(eta_dep_int)](eta_ref_mid)
+    eta_interp_eta(kv, nlev, hyetai,
+                   nlevp-2, 1, eta, hyetai,
+                   p2rel(wrk1.data(), nlev+1), RnV(cti::pack2real(wrk2), nlev+1),
+                   nlev, 0, hyetam, etam_arr);
     kv.team_barrier();
     // Compute departure horizontal points corresponding to arrival
     // Lagrangian level midpoints:
@@ -399,7 +548,9 @@ void interp_departure_points_to_floating_level_midpoints (const CTI& c, const in
       }
     }
   };
-  Kokkos::parallel_for(c.m_tp_ne, f);
+  int limcnt = 0;
+  Kokkos::parallel_reduce(c.m_tp_ne, f, limcnt);
+  return limcnt;
 }
 
 void dss_vnode (const CTI& c, const cti::DeparturePoints& vnode) {
@@ -432,7 +583,10 @@ void dss_vnode (const CTI& c, const cti::DeparturePoints& vnode) {
 } // namespace anon
 
 // For limit_etam.
-void ComposeTransportImpl::setup_enhanced_trajectory () {
+void ComposeTransportImpl
+::setup_enhanced_trajectory (const SimulationParams& params, const int num_elems) {
+  assert(params.dt_tracer_factor > 0);
+
   const auto etai = cmvdc(m_hvcoord.etai);
   const Real deta_ave = (etai(num_phys_lev) - etai(0)) / num_phys_lev;
   m_data.deta_tol = 10*std::numeric_limits<Real>::epsilon()*deta_ave;
@@ -448,20 +602,84 @@ void ComposeTransportImpl::setup_enhanced_trajectory () {
   const auto etamp = cmvdc(m_hvcoord.etam);
   HostViewUnmanaged<Real[NUM_PHYSICAL_LEV]> etam(pack2real(etamp));
   
-  // hydetam_ref.
+  // hydetam_ref
   m_data.hydetam_ref = decltype(m_data.hydetam_ref)("hydetam_ref");
-  const auto m = Kokkos::create_mirror_view(m_data.hydetam_ref);
-  const int nlev = num_phys_lev;
-  m(0) = etam(0) - etai(0);
-  for (int k = 1; k < nlev; ++k) m(k) = etam(k) - etam(k-1);
-  m(nlev) = etai(nlev) - etam(nlev-1);
-  Kokkos::deep_copy(m_data.hydetam_ref, m);
+  {
+    const auto m = Kokkos::create_mirror_view(m_data.hydetam_ref);
+    const int nlev = num_phys_lev;
+    m(0) = etam(0) - etai(0);
+    for (int k = 1; k < nlev; ++k) m(k) = etam(k) - etam(k-1);
+    m(nlev) = etai(nlev) - etam(nlev-1);
+    Kokkos::deep_copy(m_data.hydetam_ref, m);
+  }
 
   // etam
-  homme::compose::set_hvcoord(etai(0), etai(num_phys_lev), etam.data());
+  homme::compose::set_hvcoord(etai.data(), etam.data());
+
+  // Initialization for semi_lagrange_trajectory_nvelocity > 2.
+  m_data.vrec = std::make_shared<VelocityRecord>(
+    params.dt_tracer_factor, params.dt_remap_factor, m_data.trajectory_nsubstep,
+    m_data.trajectory_nvelocity);
+  const auto nv = m_data.vrec->nvel();
+  if (nv > 2) {
+    m_data.dp_extra_snapshots = DpSnaps( "dp_extra_snapshots", num_elems, nv-2);
+    m_data.vel_extra_snapshots = VSnaps("vel_extra_snapshots", num_elems, nv-2);
+  }
+
+  // B_eta at interfaces
+  m_data.db_deta_i = decltype(m_data.db_deta_i)("db_deta_i");
+  {
+    const auto m_p = Kokkos::create_mirror_view(m_data.db_deta_i);
+    HostViewUnmanaged<Real[NUM_INTERFACE_LEV]> m(pack2real(m_p));
+    m(0) = m(NUM_INTERFACE_LEV-1) = 0; // unused
+    const auto hybi = cmvdc(m_hvcoord.hybrid_bi);
+    for (int k = 2; k < NUM_PHYSICAL_LEV-1; ++k)
+      m(k) = approx_derivative(etai(k-1), etai(k), etai(k+1),
+                               hybi(k-1), hybi(k), hybi(k+1));
+    Kokkos::deep_copy(m_data.db_deta_i, m_p);
+  }
+}
+
+void ComposeTransportImpl::observe_velocity (const TimeLevel& tl, const int step) {
+  // Optionally observe the dynamics velocity snapshot available at this step.
+
+  if (not m_data.vrec or m_data.vrec->nvel() == 2) return;
+
+  const auto& v = *m_data.vrec;
+  assert((step + 1) % v.drf() == 0);
+
+  if (step + 1 == v.drf()) {
+    // This is either (1) the first vertical remap step in the tracer step or
+    // (2) the first dynamics step and we're running vertically Eulerian. In
+    // either case, zero the quantities accumulated over the step.
+    Kokkos::deep_copy(m_data.dp_extra_snapshots, 0);
+    Kokkos::deep_copy(m_data.vel_extra_snapshots, 0);
+  }
+
+  const auto& state = Context::singleton().get<ElementsState>();
+  const auto np1 = tl.np1;
+  for (int t = 0; t < 2; ++t) {
+    const auto slot = v.obs_slots(step, t);
+    if (slot == -1) continue;
+    assert(slot > 0 and slot < v.nvel()-1);
+    const auto wt = v.obs_wts(step, t);
+    const auto& dp_snap = m_data.dp_extra_snapshots;
+    const auto& v_snap = m_data.vel_extra_snapshots;
+    const auto& dp3d = state.m_dp3d;
+    const auto& v = state.m_v;
+    const auto f = KOKKOS_LAMBDA (const int idx) {
+      int ie, igp, jgp, ilev;
+      idx_ie_ij_nlev<num_lev_pack>(idx, ie, igp, jgp, ilev);
+      dp_snap(ie,slot-1,igp,jgp,ilev) += wt * dp3d(ie,np1,igp,jgp,ilev);
+      for (int d = 0; d < 2; ++d)
+        v_snap(ie,slot-1,d,igp,jgp,ilev) += wt * v(ie,np1,d,igp,jgp,ilev);
+    };
+    launch_ie_ij_nlev<num_lev_pack>(f);
+  }
 }
 
-void ComposeTransportImpl::calc_enhanced_trajectory (const int np1, const Real dt) {
+void ComposeTransportImpl
+::calc_enhanced_trajectory (const int nstep, const int np1, const Real dt) {
   GPTLstart("compose_calc_enhanced_trajectory");
 
   const auto& dep_pts = m_data.dep_pts;
@@ -477,15 +695,23 @@ void ComposeTransportImpl::calc_enhanced_trajectory (const int np1, const Real d
     {
       Kokkos::fence();
       GPTLstart("compose_vnode");
-      const Real alpha[] = {Real(nsubstep-step-1)/nsubstep,
-                            Real(nsubstep-step  )/nsubstep};
-      const CVSlot v1(m_derived.m_vstar.data(), nelemd, 1);
-      const CDpSlot dp1(m_derived.m_dp.data(), nelemd, 1);
+      const CVSnap v1(m_derived.m_vstar.data(), nelemd, 1);
+      const CDpSnap dp1(m_derived.m_dp.data(), nelemd, 1);
       const auto& v2 = m_state.m_v;
       const auto& dp2 = m_state.m_dp3d;
-      calc_nodal_velocities(*this, dtsub, alpha,
-                            v1, dp1, 0, v2, dp2, np1,
-                            vnode);
+      if (m_data.vrec->nvel() == 2) {
+        const Real alpha[] = {Real(nsubstep-step-1)/nsubstep,
+                              Real(nsubstep-step  )/nsubstep};
+        EndpointSnapshots snaps(alpha, dp1, v1, 0, dp2, v2, np1);
+        calc_nodal_velocities(*this, dtsub, snaps, vnode);
+      } else {
+        ManySnapshots snaps(*m_data.vrec,
+                            dp1, v1, 0, dp2, v2, np1,
+                            m_data.dp_extra_snapshots, m_data.vel_extra_snapshots,
+                            nsubstep, step);
+        calc_nodal_velocities(*this, dtsub, snaps, vnode);
+      }
+        
       Kokkos::fence();
       GPTLstop("compose_vnode");
     }
@@ -499,10 +725,12 @@ void ComposeTransportImpl::calc_enhanced_trajectory (const int np1, const Real d
       update_dep_points(*this, dtsub, vnode, dep_pts);
     } else {
       GPTLstart("compose_vdep");
-      homme::compose::calc_v_departure(step, dtsub);
+      homme::compose::interp_v_update(step, dtsub);
       Kokkos::fence();
       GPTLstop("compose_vdep");
 
+      interp_etadot_at_interfaces(*this, vdep);
+
       update_dep_points(*this, dtsub, vdep, dep_pts);
     }
   }
@@ -510,9 +738,19 @@ void ComposeTransportImpl::calc_enhanced_trajectory (const int np1, const Real d
 
   if (m_data.independent_time_steps) {
     GPTLstart("compose_floating_dep_pts");
-    interp_departure_points_to_floating_level_midpoints(*this, np1);
+    const int limcnt =
+      interp_departure_points_to_floating_level_midpoints(*this, np1);
     Kokkos::fence();
     GPTLstop("compose_floating_dep_pts");
+    if (m_data.diagnostics & 1) {
+      const auto& c = Context::singleton();
+      const auto& comm = c.get<Comm>();
+      int glimcnt;
+      MPI_Allreduce(&limcnt, &glimcnt, 1, MPI_INT, MPI_SUM, comm.mpi_comm());
+      if (glimcnt > 0 and comm.root())
+        printf("COMPOSE> nstep %10d limiter_active_count %10d\n",
+               nstep, glimcnt);
+    }
   }
 
   GPTLstop("compose_calc_enhanced_trajectory");
diff --git a/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryImpl.hpp b/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryImpl.hpp
index aa797ef8cff9..6773699d7ea1 100644
--- a/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryImpl.hpp
+++ b/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryImpl.hpp
@@ -15,6 +15,57 @@
 #include "compose_hommexx.hpp"
 
 namespace Homme {
+
+// Organize and interpolate velocity snapshots from the dynamics. The result is
+// a set of snapshots to be used in computing the SL trajectories. To
+// distinguish between these types of snapshots, we refer to "dynamics
+// snapshots" (available at vertical remap time steps) and "internal snapshots"
+// (interpolated to tracer trajectory substeps).
+//
+// Parameter short names:
+//   dtf = dt_tracer_factor
+//   drf = dt_remap_factor
+//   nsub = semi_lagrange_trajectory_nsubstep
+//   nvel = semi_lagrange_trajectory_nvelocity
+struct ComposeTransportImpl::VelocityRecord {
+  VelocityRecord () {}
+  VelocityRecord (const int dtf, const int drf, const int nsub, const int nvel)
+  { init(dtf, drf, nsub, nvel); }
+
+  void init(const int dtf, const int drf, const int nsub, const int nvel);
+
+  int dtf () const { return _dtf; }
+  int drf () const { return _drf; }
+  int nsub () const { return _nsub; }
+  int nvel () const { return _nvel; }
+
+  // Times to which velocity slots i in 0:nvel-1 correspond, in reference time
+  // [0,dtf].
+  Real t_vel(const int i) const;
+
+  // For n = 0:dtf-1, obs_slots(n,0:1) = [slot1, slot2], -1 if unused. These are
+  // the slots to which velocity sample n contributes. obs_slots(dtf-1,:) is
+  // always -1.
+  int obs_slots(const int n, const int k) const;
+
+  // For n = 0:dtf-1, obs_wts(n,0:1) = [wt1, wt2], 0 if unused.
+  Real obs_wts(const int n, const int k) const;
+
+  // Substep end point i in 0:nsub uses velocity slots run_step(i),
+  // run_step(i)-1.
+  int run_step(const int i) const;
+
+private:
+  int _dtf = -1, _drf = -1, _nsub = -1, _nvel = -1;
+  std::vector<int> _obs_slots, _run_step;
+  std::vector<Real> _t_vel, _obs_wts;
+
+  Real& t_vel(const int i);
+  int& obs_slots(const int n, const int k);
+  Real& obs_wts(const int n, const int k);
+  int& run_step(const int i);
+};
+
 namespace { // anon
 
 using cti = ComposeTransportImpl;
@@ -112,6 +163,161 @@ void assert_eln (const CSelnV& a, const int nlev) {
   assert(calc_nscal(a.extent_int(2)) >= nlev);
 }
 
+// Structs to manage access to internal velocity snapshots at the end points of
+// a substep interval.
+//   dp1,2 and v1,2 are on Eulerian levels. dp,v1 is from time t1 < t2.
+
+using  DpSnap   = ExecViewUnmanaged<      Scalar**    [NP][NP][NUM_LEV]>;
+using   VSnap   = ExecViewUnmanaged<      Scalar** [2][NP][NP][NUM_LEV]>;
+using CDpSnap   = ExecViewUnmanaged<const Scalar**    [NP][NP][NUM_LEV]>;
+using  CVSnap   = ExecViewUnmanaged<const Scalar** [2][NP][NP][NUM_LEV]>;
+using CDpSnapEl = ExecViewUnmanaged<const Scalar      [NP][NP][NUM_LEV]>;
+using  CVSnapEl = ExecViewUnmanaged<const Scalar   [2][NP][NP][NUM_LEV]>;
+
+// This is the simple case, for nvelocity = 2. We have velocity snapshots only
+// at the tracer time step end points; dynamics and internal are the same. We
+// linearly interpolate them to give values anywhere within the time step. In
+// particular, for a particular substep's interval, we linearly interpolate
+// using alpha[t] for t = 0,1 the start and end of the substep interval.
+struct EndpointSnapshots {
+  const Real alpha[2];
+  const int idxs[2];
+  const CDpSnap dps[2];
+  const CVSnap vs[2];
+
+  // Use subview access for efficiency.
+  struct Element {
+    const Real alpha[2];
+    const CDpSnapEl dps[2];
+    const CVSnapEl vs[2];
+
+    KOKKOS_INLINE_FUNCTION
+    Element (const EndpointSnapshots& s, const int ie)
+      : alpha{s.alpha[0], s.alpha[1]},
+        dps{Homme::subview(s.dps[0], ie, s.idxs[0]), Homme::subview(s.dps[1], ie, s.idxs[1])},
+        vs{Homme::subview(s.vs[0], ie, s.idxs[0]), Homme::subview(s.vs[1], ie, s.idxs[1])}
+    {}
+
+    // Direct access.
+
+    KOKKOS_INLINE_FUNCTION
+    Real get_dp_real (const int t, const int i, const int j, const int k) const {
+      return dps[t](i,j, k / VECTOR_SIZE)[k % VECTOR_SIZE];
+    }
+
+    // Linear combinations.
+
+    KOKKOS_INLINE_FUNCTION
+    Scalar combine_v (const int t, const int d, const int i, const int j, const int k) const {
+      return (1 - alpha[t])*vs[0](d,i,j,k) + alpha[t]*vs[1](d,i,j,k);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Scalar combine_vdp (const int t, const int d, const int i, const int j, const int k) const {
+      return ((1 - alpha[t])*vs[0](d,i,j,k)*dps[0](i,j,k)
+              +    alpha[t] *vs[1](d,i,j,k)*dps[1](i,j,k));
+    }
+  };
+
+  EndpointSnapshots (const Real alpha_[2],
+                     const CDpSnap& dp1, const CVSnap& v1, const int idx1,
+                     const CDpSnap& dp2, const CVSnap& v2, const int idx2)
+    : alpha{alpha_[0], alpha_[1]}, idxs{idx1, idx2}, dps{dp1, dp2}, vs{v1, v2}
+  {}
+
+  KOKKOS_INLINE_FUNCTION Real get_alpha (const int t) const { return alpha[t]; }
+
+  KOKKOS_INLINE_FUNCTION Element get_element (const int ie) const {
+    return Element(*this, ie);
+  }
+};
+
+// This is the more complicated case: we have dynamics velocity snapshots at the
+// time step end points, plus internal snapshots that may be interpolated from
+// multiple dynamics snapshots. alpha is no longer needed; values are trivially
+// set to 0 or 1. Interpolation of snapshots here takes the place of alpha.
+struct ManySnapshots {
+  Real beta[2];
+  int idxs[4];
+  CDpSnap dps[4];
+  CVSnap vs[4];
+
+  struct Element {
+    const Real beta[2];
+    const CDpSnapEl dps[4];
+    const CVSnapEl vs[4];
+
+    KOKKOS_INLINE_FUNCTION
+    Element (const ManySnapshots& s, const int ie)
+      : beta{s.beta[0], s.beta[1]},
+        dps{Homme::subview(s.dps[0], ie, s.idxs[0]), Homme::subview(s.dps[1], ie, s.idxs[1]),
+            Homme::subview(s.dps[2], ie, s.idxs[2]), Homme::subview(s.dps[3], ie, s.idxs[3])},
+        vs{Homme::subview(s.vs[0], ie, s.idxs[0]), Homme::subview(s.vs[1], ie, s.idxs[1]),
+           Homme::subview(s.vs[2], ie, s.idxs[2]), Homme::subview(s.vs[3], ie, s.idxs[3])}
+    {}
+
+    KOKKOS_INLINE_FUNCTION
+    Real get_dp_real (const int t, const int i, const int j, const int k) const {
+      const int os = 2*t, kp = k / VECTOR_SIZE, ks = k % VECTOR_SIZE;
+      return ((1 - beta[t])*dps[os  ](i,j,kp)[ks]
+              +    beta[t] *dps[os+1](i,j,kp)[ks]);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Scalar combine_v (const int t, const int d, const int i, const int j, const int k) const {
+      const int os = 2*t;
+      return ((1 - beta[t])*vs[os  ](d,i,j,k)
+              +    beta[t] *vs[os+1](d,i,j,k));
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Scalar combine_vdp (const int t, const int d, const int i, const int j, const int k) const {
+      const int os = 2*t;
+      return ((1 - beta[t])*vs[os  ](d,i,j,k)*dps[os  ](i,j,k)
+              +    beta[t] *vs[os+1](d,i,j,k)*dps[os+1](i,j,k));
+    }
+  };
+
+  ManySnapshots (const CTI::VelocityRecord& vrec,
+                 // endpoint data
+                 const CDpSnap& dp1, const CVSnap& v1, const int idx1,
+                 const CDpSnap& dp2, const CVSnap& v2, const int idx2,
+                 // interior data
+                 const CDpSnap& dps_int, const CVSnap& vs_int,
+                 // current substep
+                 const int nsubstep, const int step) {
+    const int end = vrec.nvel() - 1;
+    for (int t = 0; t < 2; ++t) {
+      const int substep_idx = nsubstep - (step+1) + t;
+      const Real time = (substep_idx * vrec.t_vel(end))/nsubstep;
+      const int k = vrec.run_step(step);
+      const int os = 2*t;
+      // Get the two relevant dynamics snapshots.
+      idxs[os]   = k == 1   ? idx1 : (k-2);
+      dps [os]   = k == 1   ? dp1  : dps_int;
+      vs  [os]   = k == 1   ? v1   : vs_int;
+      idxs[os+1] = k == end ? idx2 : (k-1);
+      dps[os+1]  = k == end ? dp2  : dps_int;
+      vs[os+1]   = k == end ? v2   : vs_int;
+      assert(idxs[os] >= 0 and idxs[os+1] >= 0);
+      assert(idxs[os] < dps[os].extent_int(1) and idxs[os+1] < dps[os+1].extent_int(1));
+      // Parameter for the linear combination of the two dynamics snapshots to
+      // make an internal snapshot.
+      beta[t] = (time - vrec.t_vel(k-1))/(vrec.t_vel(k) - vrec.t_vel(k-1));
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION Real get_alpha (const int t) const {
+    return t == 0 ? 0 : 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION Element get_element (const int ie) const {
+    return Element(*this, ie);
+  }
+};
+
+// Routines to interpolate in the vertical direction.
+
 // Find the support for the linear interpolant.
 //   For sorted ascending x[0:n] and x in [x[0], x[n-1]] with hint xi_idx,
 // return i such that x[i] <= xi <= x[i+1].
@@ -162,9 +368,8 @@ linterp (const Range& range,
          const int x_idx_offset = 0, const char* const caller = nullptr) {
 #ifndef NDEBUG
   if (xi[0] < x[0] or xi[ni-1] > x[n-1]) {
-    if (caller)
-      Kokkos::printf("linterp: xi out of bounds: %s %1.15e %1.15e %1.15e %1.15e\n",
-                     caller ? caller : "NONE", x[0], xi[0], xi[ni-1], x[n-1]);
+    Kokkos::printf("linterp: xi out of bounds: %s %1.15e %1.15e %1.15e %1.15e\n",
+                   caller ? caller : "NONE", x[0], xi[0], xi[ni-1], x[n-1]);
     assert(false);
   }
 #endif
@@ -176,49 +381,51 @@ linterp (const Range& range,
   Kokkos::parallel_for(range, f);
 }
 
-// Compute Lagrangian level midpoints at t1 on arrival column:
-//     eta_arr_mid = I[eta_ref_mid([eta(0),eta_dep_mid,eta(1)])](eta_ref_mid).
+// Compute Lagrangian levels at t1 on arrival column:
+//     yi(i_os:) = I[y([eta(0),x,eta(1)])](xi(i_os:)),
+// where both x and y are eta values located at midpoints or interfaces and on
+// the reference or departure grids.
 KOKKOS_FUNCTION void
-eta_interp_eta (const KernelVariables& kv, const int nlev,
-                const CRnV& hy_etai, const CRelnV& x, const CRnV& y,
+eta_interp_eta (const KernelVariables& kv, const int nlev, const CRnV& hy_etai,
+                const int n, const int os, const CRelnV& x, const CRnV& y,
                 const RelnV& xwrk, const RnV& ywrk,
                 // Use xi(i_os:), yi(i,j,i_os:).
-                const int ni, const CRnV& xi, const RelnV& yi, const int i_os = 0) {
+                const int ni, const int i_os, const CRnV& xi, const RelnV& yi) {
   const auto& xbdy = xwrk;
   const auto& ybdy = ywrk;
   assert(hy_etai.extent_int(0) >= nlev+1);
-  assert_eln(x, nlev);
-  assert(y.extent_int(0) >= nlev);
-  assert_eln(xbdy, nlev+2);
-  assert(ybdy.extent_int(0) >= nlev+2);
+  assert_eln(x, os + n);
+  assert(y.extent_int(0) >= os + n);
+  assert_eln(xbdy, n+2);
+  assert(ybdy.extent_int(0) >= n+2);
   assert(xi.extent_int(0) >= i_os + ni);
   assert_eln(yi, i_os + ni);
   const auto ttr = Kokkos::TeamThreadRange(kv.team, NP*NP);
   const auto tvr_ni = Kokkos::ThreadVectorRange(kv.team, ni);
-  const auto tvr_nlevp2 = Kokkos::ThreadVectorRange(kv.team, nlev+2);
+  const auto tvr_np2 = Kokkos::ThreadVectorRange(kv.team, n+2);
   const auto f_y = [&] (const int k) {
-    ybdy(k) = (k == 0      ? hy_etai(0) :
-               k == nlev+1 ? hy_etai(nlev) :
-               /**/          y(k-1));
+    ybdy(k) = (k == 0   ? hy_etai(0) :
+               k == n+1 ? hy_etai(nlev) :
+               /**/       y(os+k-1));
   };
-  Kokkos::parallel_for(Kokkos::TeamVectorRange(kv.team, nlev+2), f_y);
+  Kokkos::parallel_for(Kokkos::TeamVectorRange(kv.team, n+2), f_y);
   kv.team_barrier();
   const auto f_x = [&] (const int idx) {
     const int i = idx / NP, j = idx % NP;
     const auto g = [&] (const int k) {
-      xbdy(i,j,k) = (k == 0      ? hy_etai(0) :
-                     k == nlev+1 ? hy_etai(nlev) :
-                     /**/          x(i,j,k-1));
+      xbdy(i,j,k) = (k == 0   ? hy_etai(0) :
+                     k == n+1 ? hy_etai(nlev) :
+                     /**/       x(i,j,os+k-1));
     };
-    Kokkos::parallel_for(tvr_nlevp2, g);
+    Kokkos::parallel_for(tvr_np2, g);
   };
   Kokkos::parallel_for(ttr, f_x);
   kv.team_barrier();
   const auto f_linterp = [&] (const int idx) {
     const int i = idx / NP, j = idx % NP;
     linterp(tvr_ni,
-            nlev+2, getcolc(xbdy,i,j), ybdy,
-            ni,     xi.data() + i_os, getcol(yi,i,j).data() + i_os,
+            n+2, getcolc(xbdy,i,j), ybdy,
+            ni, xi.data() + i_os, getcol(yi,i,j).data() + i_os,
             1, "eta_interp_eta");
   };
   Kokkos::parallel_for(ttr, f_linterp);
@@ -345,7 +552,7 @@ eta_to_dp (const KernelVariables& kv, const int nlev,
  */
 template <typename Range>
 KOKKOS_FUNCTION void
-deta_caas (const KernelVariables& kv, const Range& tvr_nlevp,
+deta_caas (const KernelVariables& kv, const Range& tvr,
            const CRnV& deta_ref, const Real low, const RnV& w,
            const RnV& deta) {
   const auto g1 = [&] (const int k, Kokkos::Real2& sums) {
@@ -363,7 +570,7 @@ deta_caas (const KernelVariables& kv, const Range& tvr_nlevp,
     w(k) = wk;
   };
   Kokkos::Real2 sums;
-  Dispatch<>::parallel_reduce(kv.team, tvr_nlevp, g1, sums);
+  Dispatch<>::parallel_reduce(kv.team, tvr, g1, sums);
   const Real wneeded = sums.v[0];
   if (wneeded == 0) return;
   // Remove what is needed from the donors.
@@ -371,49 +578,32 @@ deta_caas (const KernelVariables& kv, const Range& tvr_nlevp,
   const auto g2 = [&] (const int k) {
     deta(k) += wneeded*(w(k)/wavail);
   };
-  Kokkos::parallel_for(tvr_nlevp, g2);
+  Kokkos::parallel_for(tvr, g2);
 }
 
-// Wrapper to above.
-KOKKOS_FUNCTION void
-deta_caas (const KernelVariables& kv, const int nlevp, const CRnV& deta_ref,
-           const Real low, const RelnV& wrk, const RelnV& deta) {
-  assert(deta_ref.extent_int(0) >= nlevp);
-  assert_eln(wrk, nlevp);
-  assert_eln(deta, nlevp);
-  const auto ttr = Kokkos::TeamThreadRange(kv.team, NP*NP);
-  const auto tvr = Kokkos::ThreadVectorRange(kv.team, nlevp);
-  const auto f = [&] (const int idx) {
-    const int i = idx / NP, j = idx % NP;
-    deta_caas(kv, tvr, deta_ref, low, getcol(wrk,i,j), getcol(deta,i,j));
-  };
-  Kokkos::parallel_for(ttr, f);
-}
-
-// Wrapper to deta_caas. On input and output, eta contains the midpoint eta
-// values. On output, deta_caas has been applied, if necessary, to
-// diff(eta(i,j,:)).
-KOKKOS_FUNCTION void
-limit_etam (const KernelVariables& kv, const int nlev, const CRnV& hy_etai,
+// Wrapper to deta_caas. On input and output, eta contains the interface eta
+// values, excluding the last one. On output, deta_caas has been applied, if
+// necessary, to diff(eta(i,j,:)).
+KOKKOS_FUNCTION int
+limit_etai (const KernelVariables& kv, const int nlev, const CRnV& hy_etai,
             const CRnV& deta_ref, const Real deta_tol, const RelnV& wrk1,
             const RelnV& wrk2, const RelnV& eta) {
   assert(hy_etai.extent_int(0) >= nlev+1);
-  assert(deta_ref.extent_int(0) >= nlev+1);
+  assert(deta_ref.extent_int(0) >= nlev);
   const auto deta = wrk2;
-  assert_eln(wrk1, nlev+1);
-  assert_eln(deta, nlev+1);
-  assert_eln(eta , nlev  );
+  assert_eln(wrk1, nlev);
+  assert_eln(deta, nlev);
+  assert_eln(eta , nlev);
   const auto ttr = Kokkos::TeamThreadRange(kv.team, NP*NP);
-  const auto tvr = Kokkos::ThreadVectorRange(kv.team, nlev+1);
+  const auto tvr = Kokkos::ThreadVectorRange(kv.team, nlev);
   // eta -> deta; limit deta if needed.
   const auto f1 = [&] (const int idx) {
     const int i = idx / NP, j = idx % NP;
     const auto  etaij = getcolc( eta,i,j);
-    const auto detaij = getcol(deta,i,j);
+    const auto detaij = getcol (deta,i,j);
     const auto g1 = [&] (const int k, int& nbad) {
-      const auto d = (k == 0    ? etaij(0) - hy_etai(0) :
-                      k == nlev ? hy_etai(nlev) - etaij(nlev-1) :
-                      /**/        etaij(k) - etaij(k-1));
+      const auto d = (k+1 == nlev ? hy_etai(nlev) - etaij(nlev-1) :
+                      /**/          etaij(k+1) - etaij(k));
       const bool ok = d >= deta_tol;
       if (not ok) ++nbad;
       detaij(k) = d;
@@ -424,26 +614,29 @@ limit_etam (const KernelVariables& kv, const int nlev, const CRnV& hy_etai,
       // Signal this column is fine.
       Kokkos::single(Kokkos::PerThread(kv.team), [&] () { detaij(0) = -1; });
       return;
-    };
+    }
     deta_caas(kv, tvr, deta_ref, deta_tol, getcol(wrk1,i,j), detaij);
   };
   Kokkos::parallel_for(ttr, f1);
   kv.team_barrier();
   // deta -> eta; ignore columns where limiting wasn't needed.
-  const auto f2 = [&] (const int idx) {
+  const auto f2 = [&] (const int idx, int& cnt) {
     const int i = idx / NP, j = idx % NP;
-    const auto  etaij = getcol( eta,i,j);
-    const auto detaij = getcol(deta,i,j);
+    const auto detaij = getcolc(deta,i,j);
     if (detaij(0) == -1) return;
+    ++cnt;
+    const auto etaij = getcol(eta,i,j);
     const auto g = [&] (const int k, Real& accum, const bool final) {
       assert(k != 0 or accum == 0);
-      const Real d = k == 0 ? hy_etai(0) + detaij(0) : detaij(k);
+      const Real d = k == 0 ? etaij(0) + detaij(0) : detaij(k);
       accum += d;
-      if (final) etaij(k) = accum;
+      if (final) etaij(k+1) = accum;
     };
-    Dispatch<>::parallel_scan(kv.team, nlev, g);
+    Dispatch<>::parallel_scan(kv.team, nlev-1, g);
   };
-  Kokkos::parallel_for(ttr, f2);
+  int cnt = 0;
+  Kokkos::parallel_reduce(ttr, f2, cnt);
+  return cnt;
 }
 
 // Compute surface pressure ps = ai(0) ps0 + sum dp.
@@ -470,22 +663,23 @@ KOKKOS_FUNCTION void calc_ps (
 
 // Compute the surface pressure ps[i] at time point i corresponding to
 // dp[i] = (1-alpha[i]) dp1 + alpha[i] dp2.
+template <typename Snapshots>
 KOKKOS_FUNCTION void calc_ps (
   const KernelVariables& kv, const int nlev,
   const Real& ps0, const Real& hyai0,
-  const Real alpha[2], const CSelnV& dp1, const CSelnV& dp2,
+  const Snapshots& snaps,
   const ExecViewUnmanaged<Real[2][NP][NP]>& ps)
 {
-  assert_eln(dp1, nlev);
-  assert_eln(dp2, nlev);
+  const auto ie = kv.ie;
   const auto ttr = Kokkos::TeamThreadRange(kv.team, NP*NP);
   const auto tvr_snlev = Kokkos::ThreadVectorRange(kv.team, nlev);
-  const CRelnV dps[] = {elp2r(dp1), elp2r(dp2)};
   const auto f1 = [&] (const int idx) {
     const int i = idx / NP, j = idx % NP;
     for (int t = 0; t < 2; ++t) {
-      const auto& dp = dps[t];
-      const auto g = [&] (int k, Real& sum) { sum += dp(i,j,k); };
+      const auto e = snaps.get_element(ie);
+      const auto g = [&] (int k, Real& sum) {
+        sum += e.get_dp_real(t,i,j,k);
+      };
       Real sum;
       Dispatch<>::parallel_reduce(kv.team, tvr_snlev, g, sum);
       Kokkos::single(Kokkos::PerThread(kv.team), [&] { ps(t,i,j) = sum; });
@@ -497,10 +691,12 @@ KOKKOS_FUNCTION void calc_ps (
     const int i = idx / NP, j = idx % NP;
     const auto g = [&] () {
       Real vals[2];
-      for (int t = 0; t < 2; ++t)
+      for (int t = 0; t < 2; ++t) {
+        const auto alpha = snaps.get_alpha(t);
         vals[t] = (hyai0*ps0 +
-                   (1 - alpha[t])*ps(0,i,j) +
-                   /**/ alpha[t] *ps(1,i,j));
+                   (1 - alpha)*ps(0,i,j) +
+                   /**/ alpha *ps(1,i,j));
+      }
       for (int t = 0; t < 2; ++t)
         ps(t,i,j) = vals[t];
     };
@@ -509,57 +705,40 @@ KOKKOS_FUNCTION void calc_ps (
   Kokkos::parallel_for(ttr, f2);
 }
 
-// Transform eta_dot_dpdn at interfaces to eta_dot at midpoints using the
-// formula
-//     eta_dot = eta_dot_dpdn/(A_eta p0 + B_eta ps).
-//            a= eta_dot_dpdn diff(eta)/(diff(A) p0 + diff(B) ps).
-KOKKOS_FUNCTION void calc_etadotmid_from_etadotdpdnint (
+// Transform eta_dot_dpdn to eta_dot, both at interfaces, using the formula
+//     eta_dot = eta_dot_dpdn/(A_eta p0 + B_eta ps)
+//             = eta_dot_dpdn/(p0 + B_eta (ps - p0)).
+KOKKOS_FUNCTION void calc_etadotint_from_etadotdpdnint (
   const KernelVariables& kv, const int nlev,
-  const Real& ps0, const CSnV& hydai, const CSnV& hydbi,
-  const CSnV& hydetai, const CRelV& ps, const SelnV& wrk,
+  const Real ps0, const CSnV& db_deta_i, const CRelV& ps,
   //  in: eta_dot_dpdn at interfaces
-  // out: eta_dot at midpoints, final slot unused
+  // out: eta_dot at interfaces
   const SelnV& ed)
 {
-  assert(calc_nscal(hydai.extent_int(0)) >= nlev);
-  assert(calc_nscal(hydbi.extent_int(0)) >= nlev);
-  assert(calc_nscal(hydetai.extent_int(0)) >= nlev);
-  assert_eln(wrk, nlev+1);
+  assert(calc_nscal(db_deta_i.extent_int(0)) >= nlev+1);
   assert_eln(ed, nlev+1);
-  const auto& edd_mid = wrk;
-  {
-    const CRelnV edd(elp2r(ed));
-    const RelnV tmp(elp2r(wrk));
-    const auto f = [&] (const int i, const int j, const int k) {
-      tmp(i,j,k) = (edd(i,j,k) + edd(i,j,k+1))/2;
-    };
-    cti::loop_ijk(nlev, kv, f);
-  }
-  kv.team_barrier();
-  {
-    const auto f = [&] (const int i, const int j, const int kp) {
-      ed(i,j,kp) = (edd_mid(i,j,kp)
-                    * hydetai(kp)
-                    / (hydai(kp)*ps0 + hydbi(kp)*ps(i,j)));
-    };
-    cti::loop_ijk(calc_npack(nlev), kv, f);
-  }
+  const auto f = [&] (const int i, const int j, const int k) {
+    ed(i,j,k) = ed(i,j,k) / (ps0 + db_deta_i(k)*(ps(i,j) - ps0));
+  };
+  cti::loop_ijk(calc_npack(nlev), kv, f); // final level is unused
 }
 
-// Compute eta_dot at midpoint nodes at the start and end of the substep.
-KOKKOS_FUNCTION void calc_eta_dot_ref_mid (
-  const KernelVariables& kv, const SphereOperators& sphere_ops,
+// Compute eta_dot at midpoint or interface nodes at the start and end of the
+// substep.
+template <typename Snapshots>
+KOKKOS_FUNCTION void calc_eta_dot_ref (
+  const KernelVariables& kv, const SphereOperators& sphops, const Snapshots& snaps,
   const Real& ps0, const Real& hyai0, const CSNV<NUM_LEV_P>& hybi,
   const CSNV<NUM_LEV>& hydai, const CSNV<NUM_LEV>& hydbi, // delta ai, bi
   const CSNV<NUM_LEV>& hydetai, // delta etai
-  const Real alpha[2],
-  const CS2elNlev& v1, const CSelNlev& dp1, const CS2elNlev& v2, const CSelNlev& dp2,
+  const CSNV<NUM_LEV_P>& db_deta_i,
   const SelNlevp& wrk1, const SelNlevp& wrk2, const S2elNlevp& vwrk1,
   // Holds interface levels as intermediate data but is midpoint data on output,
   // with final slot unused.
   const SelNlevp eta_dot[2])
 {
   using Kokkos::ALL;
+  const auto ie = kv.ie;
   const int nlev = NUM_PHYSICAL_LEV;
   const SelNlev divdp(wrk1.data());
   const S2elNlev vdp(vwrk1.data());
@@ -567,19 +746,19 @@ KOKKOS_FUNCTION void calc_eta_dot_ref_mid (
   // Calc surface pressure for use at the end.
   calc_ps(kv, nlev,
           ps0, hyai0,
-          alpha, dp1, dp2,
+          snaps,
           ps);
   kv.team_barrier();
   for (int t = 0; t < 2; ++t) {
     // Compute divdp.
+    const auto e = snaps.get_element(ie);
     const auto f = [&] (const int i, const int j, const int kp) {
       for (int d = 0; d < 2; ++d)
-        vdp(d,i,j,kp) = ((1 - alpha[t])*v1(d,i,j,kp)*dp1(i,j,kp) +
-                         /**/ alpha[t] *v2(d,i,j,kp)*dp2(i,j,kp));
+        vdp(d,i,j,kp) = e.combine_vdp(t,d,i,j,kp);
     };
     cti::loop_ijk<cti::num_lev_pack>(kv, f);
     kv.team_barrier();
-    sphere_ops.divergence_sphere(kv, vdp, divdp);
+    sphops.divergence_sphere(kv, vdp, divdp);
     kv.team_barrier();
     // Compute eta_dot_dpdn at interface nodes.
     const auto& edd = eta_dot[t];
@@ -590,11 +769,8 @@ KOKKOS_FUNCTION void calc_eta_dot_ref_mid (
                            divdps, edd,
                            edds);
     kv.team_barrier();
-    calc_etadotmid_from_etadotdpdnint(kv, nlev,
-                                      ps0, hydai, hydbi, hydetai,
-                                      Kokkos::subview(ps,t,ALL,ALL),
-                                      wrk1,
-                                      edd);
+    const auto pst = Kokkos::subview(ps,t,ALL,ALL);
+    calc_etadotint_from_etadotdpdnint(kv, nlev, ps0, db_deta_i, pst, edd);
     // No team_barrier: wrk1 is protected in second iteration.
   }
 }
@@ -643,7 +819,11 @@ KOKKOS_FUNCTION void calc_vel_horiz_formula_node_ref_mid (
             etams(k-1), etams(k), etams(k+1),
             vsph1s(d,i,j,k-1), vsph1s(d,i,j,k), vsph1s(d,i,j,k+1));
         }
-        vfsphs(d,i,j,k) = (vfsphs(d,i,j,k) - dtsub*eds(i,j,k)*deriv)/2;
+        // Interpolate eta_dot at interfaces to midpoints. Note that this is the
+        // only time this is done, and it's used only in a term of the formula
+        // that contains dtsub.
+        const auto eta_dot = (eds(i,j,k) + eds(i,j,k+1))/2;
+        vfsphs(d,i,j,k) = (vfsphs(d,i,j,k) - dtsub*eta_dot*deriv)/2;
       };
       cti::loop_ijk<cti::num_phys_lev>(kv, f);
     }
@@ -659,10 +839,10 @@ KOKKOS_FUNCTION void calc_vel_horiz_formula_node_ref_mid (
   }
 }
 
-// Given the vertical and horizontal nodal velocities at time endpoints,
-// evaluate the velocity estimate formula, providing the final vertical velocity
-// estimates at midpoint nodes.
-KOKKOS_FUNCTION void calc_eta_dot_formula_node_ref_mid (
+// Given the vertical (interface) and horizontal (midpoint) nodal velocities at
+// time endpoints, evaluate the velocity estimate formula, providing the final
+// vertical velocity estimates at interface nodes.
+KOKKOS_FUNCTION void calc_eta_dot_formula_node_ref_int (
   const KernelVariables& kv, const SphereOperators& sphere_ops,
   const CRNV<NUM_INTERFACE_LEV>& hyetai, const CSNV<NUM_LEV>& hyetam,
   // Velocities are at midpoints. Final eta_dot entry is ignored.
@@ -670,46 +850,39 @@ KOKKOS_FUNCTION void calc_eta_dot_formula_node_ref_mid (
   const SelNlevp& wrk1, const S2elNlevp& vwrk1,
   const ExecViewUnmanaged<Real****>& vnode)
 {
-  const SelNlev ed1_vderiv(wrk1.data());
-  {
-    const CRNV<NUM_PHYSICAL_LEV> etams(cti::cpack2real(hyetam));
+  const RelNlev ed1_vderiv(cti::pack2real(wrk1));
+  const CRNV<NUM_INTERFACE_LEV> etai(hyetai);
+  { // \dot{eta}_eta evaluated at interfaces
     const CRelNlevp ed1s(cti::cpack2real(eta_dot[0]));
-    const RelNlev ed1_vderiv_s(cti::pack2real(ed1_vderiv));
-    const auto f = [&] (const int i, const int j, const int k) {
-      Real deriv;
-      if (k == 0 or k+1 == NUM_PHYSICAL_LEV) {
-        deriv = cti::approx_derivative(
-          k == 0 ? hyetai(0) : etams(k-1),
-          etams(k),
-          k+1 == NUM_PHYSICAL_LEV ? hyetai(NUM_PHYSICAL_LEV) : etams(k+1),
-          k == 0 ? 0 : ed1s(i,j,k-1),
-          ed1s(i,j,k),
-          k+1 == NUM_PHYSICAL_LEV ? 0 : ed1s(i,j,k+1));
-      } else {
-        deriv = cti::approx_derivative(
-          etams(k-1), etams(k), etams(k+1),
+    const auto f = [&] (const int i, const int j, const int km1) {
+      const auto k = km1 + 1;
+      ed1_vderiv(i,j,k) =
+        cti::approx_derivative(
+          etai(k-1), etai(k), etai(k+1),
           ed1s(i,j,k-1), ed1s(i,j,k), ed1s(i,j,k+1));
-      }
-      ed1_vderiv_s(i,j,k) = deriv;
     };
-    cti::loop_ijk<cti::num_phys_lev>(kv, f);
+    cti::loop_ijk<cti::num_phys_lev-1>(kv, f);
   }
   kv.team_barrier();
-  const S2elNlev ed1_hderiv(vwrk1.data());
-  sphere_ops.gradient_sphere(kv, eta_dot[0], ed1_hderiv, NUM_LEV);
+  const S2elNlev ed1_hderiv_p(vwrk1.data());
+  sphere_ops.gradient_sphere(kv, eta_dot[0], ed1_hderiv_p, NUM_LEV);
+  const CR2elNlev ed1_hderiv(cti::pack2real(ed1_hderiv_p));
+  const CRNV<NUM_PHYSICAL_LEV> etam(cti::cpack2real(hyetam));
   {
-    const auto& vsph2 = vsph[1];
-    const auto& ed1 = eta_dot[0];
-    const auto& ed2 = eta_dot[1];
-    const auto f = [&] (const int i, const int j, const int k) {
-      const auto v = (ed1(i,j,k) + ed2(i,j,k)
-                      - dtsub*(  vsph2(0,i,j,k)*ed1_hderiv(0,i,j,k)
-                               + vsph2(1,i,j,k)*ed1_hderiv(1,i,j,k)
-                               +   ed2(  i,j,k)*ed1_vderiv(  i,j,k)))/2;
-      for (int s = 0; s < VECTOR_SIZE; ++s)
-        vnode(VECTOR_SIZE*k+s, i,j,3) = v[s];
+    const CR2elNlev vsph2(cti::cpack2real(vsph[1]));
+    const CRelNlevp ed1(cti::cpack2real(eta_dot[0]));
+    const CRelNlevp ed2(cti::cpack2real(eta_dot[1]));
+    const auto f = [&] (const int i, const int j, const int km1) {
+      const auto k = km1 + 1;
+      // Linearly interp horiz velocity to interfaces.
+      const auto a = (etai(k) - etam(k-1)) / (etam(k) - etam(k-1));
+      vnode(k,i,j,3) =
+        (ed1(i,j,k) + ed2(i,j,k)
+         - dtsub*(  ((1-a)*vsph2(0,i,j,k-1) + a*vsph2(0,i,j,k))*ed1_hderiv(0,i,j,k)
+                  + ((1-a)*vsph2(1,i,j,k-1) + a*vsph2(1,i,j,k))*ed1_hderiv(1,i,j,k)
+                  + ed2(i,j,k)*ed1_vderiv(i,j,k)))/2;
     };
-    cti::loop_ijk<cti::num_lev_pack>(kv, f);
+    cti::loop_ijk<cti::num_phys_lev-1>(kv, f);
   }
 }
 
diff --git a/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryTests.cpp b/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryTests.cpp
index 49074df5e52a..10c41a137350 100644
--- a/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryTests.cpp
+++ b/components/homme/src/share/cxx/ComposeTransportImplEnhancedTrajectoryTests.cpp
@@ -75,6 +75,33 @@ struct ElData {
 
 const Real TestData::eps = std::numeric_limits<Real>::epsilon();
 
+int test_deriv (const TestData& td) {
+  int ne = 0;
+  { // The two impls are equivalent in infinite precision.
+    const int n = 3;
+    const Real x[n] = {-0.3, 0.1, 1.1};
+    Real y[n] = {-0.1, 0.1, 0};
+    for (int t = 0; t < 10; ++t) {
+      y[2] = 0.2*(t - 9.5);
+      Real g1, g2;
+      g1 = cti::approx_derivative1(x[0], x[1], x[2], y[0], y[1], y[2]);
+      g2 = cti::approx_derivative (x[0], x[1], x[2], y[0], y[1], y[2]);
+      if (std::abs(g1 - g2) > 100*td.eps) ++ne;
+    }
+  }
+  { // Exactly recovers quadratic.
+    const int n = 3;
+    const Real x[n] = {-0.3, 0.1, 1.1};
+    Real y[n];
+    for (int i = 0; i < n; ++i)
+      y[i] = 0.7*x[i]*x[i] - 1.2*x[i] + 0.7;
+    const Real g_true = 1.4*x[1] - 1.2;
+    const Real g_est = cti::approx_derivative(x[0], x[1], x[2], y[0], y[1], y[2]);
+    if (std::abs(g_est - g_true) > 100*td.eps) ++ne;
+  }
+  return ne;
+}
+
 int test_find_support (TestData&) {
   int ne = 0;
   const int n = 97;
@@ -233,6 +260,23 @@ int make_random_deta (TestData& td, const Real deta_tol, const RelnV& deta) {
   return nerr;
 }
 
+// Wrapper to main deta_caas routine; this wrapper is used in the unit test of
+// the main routine.
+KOKKOS_FUNCTION void
+deta_caas (const KernelVariables& kv, const int nlevp, const CRnV& deta_ref,
+           const Real low, const RelnV& wrk, const RelnV& deta) {
+  assert(deta_ref.extent_int(0) >= nlevp);
+  assert_eln(wrk, nlevp);
+  assert_eln(deta, nlevp);
+  const auto ttr = Kokkos::TeamThreadRange(kv.team, NP*NP);
+  const auto tvr = Kokkos::ThreadVectorRange(kv.team, nlevp);
+  const auto f = [&] (const int idx) {
+    const int i = idx / NP, j = idx % NP;
+    deta_caas(kv, tvr, deta_ref, low, getcol(wrk,i,j), getcol(deta,i,j));
+  };
+  Kokkos::parallel_for(ttr, f);
+}
+
 int test_deta_caas (TestData& td) {
   int nerr = 0;
   const Real tol = 100*td.eps;
@@ -371,7 +415,7 @@ int test_deta_caas (TestData& td) {
 
 struct HybridLevels {
   Real ps0, a_eta, b_eta;
-  std::vector<Real> ai, dai, bi, dbi, am, bm, etai, detai, etam, detam;
+  std::vector<Real> ai, dai, bi, dbi, am, bm, etai, etam, detai, detam;
 };
 
 // Follow DCMIP2012 3D tracer transport specification for a, b, eta.
@@ -431,64 +475,67 @@ void fill (HybridLevels& h, const int n) {
   h.detam[n] = h.etai[n] - h.etam[n-1];
 }
 
-int test_limit_etam (TestData& td) {
+int test_limit_etai (TestData& td) {
   int nerr = 0;
   const Real tol = 100*td.eps;
 
   for (const int nlev : {143, 128, 81}) {
     const Real deta_tol = 1e5*td.eps/nlev;
 
-    ExecView<Real*> hy_etai("hy_etai",nlev+1), detam("detam",nlev+1);
-    ExecView<Real***> wrk1("wrk1",NP,NP,nlev+1), wrk2("wrk2",NP,NP,nlev+1);
-    ExecView<Real***> etam("etam",NP,NP,nlev);
+    ExecView<Real*> hy_etai("hy_etai",nlev+1), detai("detai",nlev);
+    ExecView<Real***> wrk1("wrk1",NP,NP,nlev), wrk2("wrk2",NP,NP,nlev);
+    ExecView<Real***> etai("etai",NP,NP,nlev);
 
     HybridLevels h;
     fill(h, nlev);
     todev(h.etai, hy_etai);
-    todev(h.detam, detam);
+    todev(h.detai, detai);
 
-    const auto he = Kokkos::create_mirror_view(etam);
+    const auto he = Kokkos::create_mirror_view(etai);
 
     const auto policy = get_test_team_policy(1, nlev);
     const auto run = [&] () {
-      Kokkos::deep_copy(etam, he);
+      Kokkos::deep_copy(etai, he);
       const auto f = KOKKOS_LAMBDA(const cti::MT& team) {
         KernelVariables kv(team);
-        limit_etam(kv, nlev, hy_etai, detam, deta_tol, wrk1, wrk2, etam);
+        limit_etai(kv, nlev, hy_etai, detai, deta_tol, wrk1, wrk2, etai);
       };
       Kokkos::parallel_for(policy, f);
       Kokkos::fence();
-      Kokkos::deep_copy(he, etam);
+      Kokkos::deep_copy(he, etai);
     };
 
-    fillcols(h.etam.size(), h.etam.data(), he);
-    // Col 0 should be untouched. Cols 1 and 2 should have very specific changes.
-    const int col1_idx = static_cast<int>(0.25*nlev);
+    fillcols(h.etai.size()-1, h.etai.data(), he);
+    // Col 0 should be untouched. Cols 1 and 2 should have very specific
+    // changes. etai[0] is intended to be constant, so don't perturb that entry.
+    const int col1_idx = std::max(1, static_cast<int>(0.25*nlev));
     he(0,1,col1_idx) += 0.3;
-    const int col2_idx = static_cast<int>(0.8*nlev);
+    const int col2_idx = std::max(1, static_cast<int>(0.8*nlev));
     he(0,2,col2_idx) -= 5.3;
     // The rest of the columns get wild changes.
     for (int idx = 3; idx < NP*NP; ++idx) {
       const int i = idx / NP, j = idx % NP;
-      for (int k = 0; k < nlev; ++k)
-        he(i,j,k) += td.urand(-1, 1)*(h.etai[k+1] - h.etai[k]);
+      for (int k = 1; k < nlev; ++k)
+        he(i,j,k) += td.urand(-1, 1)*(h.etam[k] - h.etam[k-1]);
     }
     run();
     bool ok = true;
     for (int k = 0; k < nlev; ++k)
-      if (he(0,0,k) != h.etam[k]) ok = false;
+      if (he(0,0,k) != h.etai[k]) ok = false;
+    if (he(0,1,0) != h.etai[0]) ok = false;
     for (int k = 0; k < nlev; ++k) {
       if (k == col1_idx) continue;
-      if (std::abs(he(0,1,k) - h.etam[k]) > tol) ok = false;
+      if (std::abs(he(0,1,k) - h.etai[k]) > tol) ok = false;
     }
+    if (he(0,2,0) != h.etai[0]) ok = false;
     for (int k = 0; k < nlev; ++k) {
       if (k == col2_idx) continue;
-      if (std::abs(he(0,2,k) - h.etam[k]) > tol) ok = false;
+      if (std::abs(he(0,2,k) - h.etai[k]) > tol) ok = false;
     }
     Real mingap = 1;
     for (int i = 0; i < NP; ++i)
       for (int j = 0; j < NP; ++j) {
-        mingap = std::min(mingap, he(i,j,0) - h.etai[0]);
+        if (he(i,j,0) != h.etai[0]) ok = false;
         for (int k = 1; k < nlev; ++k)
           mingap = std::min(mingap, he(i,j,k) - he(i,j,k-1));
         mingap = std::min(mingap, h.etai[nlev] - he(i,j,nlev-1));
@@ -528,9 +575,9 @@ int test_eta_interp (TestData& td) {
       const auto f = KOKKOS_LAMBDA(const cti::MT& team) {
         KernelVariables kv(team);
         eta_interp_eta(kv, nlev, hy_etai,
-                       x, getcolc(y,0,0),
+                       nlev, 0, x, getcolc(y,0,0),
                        xwrk, getcol(ywrk,0,0),
-                       ni, getcolc(xi,0,0), yi);
+                       ni, 0, getcolc(xi,0,0), yi);
       };
       Kokkos::parallel_for(policy, f);
       Kokkos::fence();
@@ -695,6 +742,36 @@ int test_eta_to_dp (TestData& td) {
   return nerr;
 }
 
+struct Snapshots {
+  const Real alpha[2];
+  const ExecViewUnmanaged<Scalar***> dps[2];
+
+  struct Element {
+    const ExecViewUnmanaged<Scalar***> dps[2];
+
+    KOKKOS_INLINE_FUNCTION
+    Element (const Snapshots& s, const int ie)
+      : dps{s.dps[0], s.dps[1]}
+    {}
+
+    KOKKOS_INLINE_FUNCTION
+    Real get_dp_real(const int t, const int i, const int j, const int k) const {
+      return dps[t](i,j, k / VECTOR_SIZE)[k % VECTOR_SIZE];
+    }
+  };
+
+  Snapshots (const Real as[2], const ExecViewUnmanaged<Scalar***>& dp1,
+             const ExecViewUnmanaged<Scalar***>& dp2)
+    : alpha{as[0], as[1]}, dps{dp1, dp2}
+  {}
+
+  KOKKOS_INLINE_FUNCTION Real get_alpha (const int t) const { return alpha[t]; }
+
+  KOKKOS_INLINE_FUNCTION Element get_element (const int ie) const {
+    return Element(*this, ie);
+  }
+};
+
 int test_calc_ps (TestData& td) {
   int nerr = 0;
   const Real tol = 100*td.eps;
@@ -719,9 +796,10 @@ int test_calc_ps (TestData& td) {
     ExecView<Real[NP][NP]> ps("ps");
     ExecView<Real[2][NP][NP]> ps2("ps2");
     const auto policy = get_test_team_policy(1, nlev);
+    Snapshots snaps(alpha, dp1.d, dp2.d);
     const auto f = KOKKOS_LAMBDA(const cti::MT& team) {
       KernelVariables kv(team);
-      calc_ps(kv, nlev, ps0, hyai0, alpha, dp1.d, dp2.d, ps2);
+      calc_ps(kv, nlev, ps0, hyai0, snaps, ps2);
       calc_ps(kv, nlev, ps0, hyai0, dp1.d, ps);
     };
     Kokkos::parallel_for(policy, f);
@@ -749,7 +827,46 @@ int test_calc_ps (TestData& td) {
   return nerr;
 }
 
-int test_calc_etadotmid_from_etadotdpdnint (TestData& td) {
+// Transform eta_dot_dpdn at interfaces to eta_dot at midpoints using the
+// formula
+//     eta_dot = eta_dot_dpdn/(A_eta p0 + B_eta ps)
+//            a= eta_dot_dpdn diff(eta)/(diff(A) p0 + diff(B) ps).
+// I'm keeping this because it's unit tested and might be used in the future.
+// But it's not currently used, so it's in this unit-test file.
+KOKKOS_FUNCTION void calc_etadotmid_from_etadotdpdnint (
+  const KernelVariables& kv, const int nlev,
+  const Real& ps0, const CSnV& hydai, const CSnV& hydbi,
+  const CSnV& hydetai, const CRelV& ps, const SelnV& wrk,
+  //  in: eta_dot_dpdn at interfaces
+  // out: eta_dot at midpoints, final slot unused
+  const SelnV& ed)
+{
+  assert(calc_nscal(hydai.extent_int(0)) >= nlev);
+  assert(calc_nscal(hydbi.extent_int(0)) >= nlev);
+  assert(calc_nscal(hydetai.extent_int(0)) >= nlev);
+  assert_eln(wrk, nlev+1);
+  assert_eln(ed, nlev+1);
+  const auto& edd_mid = wrk;
+  {
+    const CRelnV edd(elp2r(ed));
+    const RelnV tmp(elp2r(wrk));
+    const auto f = [&] (const int i, const int j, const int k) {
+      tmp(i,j,k) = (edd(i,j,k) + edd(i,j,k+1))/2;
+    };
+    cti::loop_ijk(nlev, kv, f);
+  }
+  kv.team_barrier();
+  {
+    const auto f = [&] (const int i, const int j, const int kp) {
+      ed(i,j,kp) = (edd_mid(i,j,kp)
+                    * hydetai(kp)
+                    / (hydai(kp)*ps0 + hydbi(kp)*ps(i,j)));
+    };
+    cti::loop_ijk(calc_npack(nlev), kv, f);
+  }
+}
+
+int test_calc_etadot_from_etadotdpdnint (TestData& td) {
   int nerr = 0;
   const Real tol = 100*td.eps;
 
@@ -770,46 +887,207 @@ int test_calc_etadotmid_from_etadotdpdnint (TestData& td) {
     ElData wrk("wrk",nlev+1), ed("ed",nlev+1);
     ExecView<Real[NP][NP]> ps("ps");
     const Real ps0 = h.ps0;
+    ExecView<Scalar*> db_deta_i("db_deta_i", calc_npack(nlev+1));
+    Kokkos::deep_copy(db_deta_i, h.b_eta);
 
-    const auto ps_m = Kokkos::create_mirror_view(ps);
-    for (int i = 0; i < NP; ++i)
-      for (int j = 0; j < NP; ++j) {
-        ps_m(i,j) = td.urand(0.5, 1.2)*ps0;
-        for (int k = 0; k < nlev; ++k) {
-          hydai.r[k] = h.dai[k];
-          hydbi.r[k] = h.dbi[k];
-          hydetai.r[k] = h.detai[k];
+    for (int trial = 0; trial < 2; ++trial) {
+      const auto ps_m = Kokkos::create_mirror_view(ps);
+      for (int i = 0; i < NP; ++i)
+        for (int j = 0; j < NP; ++j) {
+          ps_m(i,j) = td.urand(0.5, 1.2)*ps0;
+          for (int k = 0; k < nlev; ++k) {
+            hydai.r[k] = h.dai[k];
+            hydbi.r[k] = h.dbi[k];
+            hydetai.r[k] = h.detai[k];
+          }
+          for (int k = 0; k <= nlev; ++k)
+            ed.r(i,j,k) = (i-j)*h.etai[k] + 0.3;
+          if (trial == 1) {
+            ed.r(i,j,0) = 0;
+            ed.r(i,j,nlev) = 0;
+          }
         }
-        for (int k = 0; k <= nlev; ++k)
-          ed.r(i,j,k) = (i-j)*h.etai[k] + 0.3;
-      }
-    Kokkos::deep_copy(ps, ps_m);
-    hydai.h2d(); hydbi.h2d(); hydetai.h2d();
-    ed.h2d();
+      Kokkos::deep_copy(ps, ps_m);
+      hydai.h2d(); hydbi.h2d(); hydetai.h2d();
+      ed.h2d();
 
-    const auto policy = get_test_team_policy(1, nlev);
-    const auto f = KOKKOS_LAMBDA(const cti::MT& team) {
-      KernelVariables kv(team);
-      calc_etadotmid_from_etadotdpdnint(
-        kv, nlev, ps0, hydai.d, hydbi.d, hydetai.d, ps, wrk.d, ed.d);
-    };
-    Kokkos::parallel_for(policy, f);
-    Kokkos::fence();
-    ed.d2h();
+      const auto policy = get_test_team_policy(1, nlev);
+      const auto fmid = KOKKOS_LAMBDA(const cti::MT& team) {
+        KernelVariables kv(team);
+        calc_etadotmid_from_etadotdpdnint(
+          kv, nlev, ps0, hydai.d, hydbi.d, hydetai.d, ps, wrk.d, ed.d);
+      };
+      const auto fint = KOKKOS_LAMBDA(const cti::MT& team) {
+        KernelVariables kv(team);
+        calc_etadotint_from_etadotdpdnint(
+          kv, nlev, ps0, db_deta_i, ps, ed.d);
+      };
+      if (trial == 0)
+        Kokkos::parallel_for(policy, fmid);
+      else
+        Kokkos::parallel_for(policy, fint);
+      Kokkos::fence();
+      ed.d2h();
 
-    for (int i = 0; i < NP; ++i)
-      for (int j = 0; j < NP; ++j) {
-        const auto den = h.a_eta*h.ps0 + h.b_eta*ps_m(i,j);
-        for (int k = 0; k < nlev; ++k) {
-          const auto ed_true = ((i-j)*h.etam[k] + 0.3)/den;
-          if (std::abs(ed.r(i,j,k) - ed_true) > tol*(10/den)) ++nerr;
+      for (int i = 0; i < NP; ++i)
+        for (int j = 0; j < NP; ++j) {
+          const auto den = h.a_eta*h.ps0 + h.b_eta*ps_m(i,j);
+          for (int k = 0; k <= nlev; ++k) {
+            if (trial == 0 and k == nlev) continue;
+            if (trial == 1 and (k == 0 or k == nlev)) {
+              if (ed.r(i,j,k) != 0) ++nerr;
+              continue;
+            }
+            const auto eta = trial == 0 ? h.etam[k] : h.etai[k];
+            const auto ed_true = ((i-j)*eta + 0.3)/den;
+            if (std::abs(ed.r(i,j,k) - ed_true) > tol*(10/den)) ++nerr;
+          }
         }
-      }
+    }
   }
 
   return nerr;
 }
 
+int test1_init_velocity_record (
+  const int dtf, const int drf, const int nsub, const int nvel)
+{
+  const auto eps = std::numeric_limits<Real>::epsilon();
+  int e = 0;
+
+  if (dtf % drf != 0) {
+    printf("Testing erro: dtf %% drf == 0 is required: %d %d\n", dtf, drf);
+    ++e; 
+  }
+
+  const CTI::VelocityRecord v(dtf, drf, nsub, nvel);
+  if (v.dtf() != dtf) ++e;
+  if (v.nsub() != nsub) ++e;
+
+  // Check that t_vel is monotonically increasing.
+  for (int n = 1; n < v.nvel(); ++n)
+    if (v.t_vel(n) <= v.t_vel(n-1))
+      ++e;
+
+  // Check that obs_slots does not reference end points. This should not happen
+  // b/c nvel <= navail and observations are uniformly spaced.
+  for (int n = 0; n < dtf; ++n)
+    for (int i = 0; i < 2; ++i)
+      if (v.obs_slots(n,i) == 0 or v.obs_slots(n,i) == dtf)
+        ++e;
+
+  // Check that weights sum to 1.
+  std::vector<Real> ys(dtf);
+  for (int n = 0; n < dtf; ++n)
+    for (int i = 0; i < 2; ++i)
+      if (v.obs_slots(n,i) >= 0)
+        ys[v.obs_slots(n,i)] += v.obs_wts(n,i);
+  for (int i = 1; i < v.nvel()-1; ++i)
+    if (std::abs(ys[i] - 1) > 1e3*eps)
+      ++e;
+
+  // Test for exact interp of an affine function.
+  const auto tfn = [] (const Real x) { return 7.1*x - 11.5; };
+  //   Observe data forward in time.
+  Real endslots[2];
+  endslots[0] = tfn(0);
+  endslots[1] = tfn(dtf);
+  ys[0] = -1000; // unused
+  for (int i = 1; i < dtf; ++i) ys[i] = 0;
+  for (int n = 0; n < dtf; ++n) {
+    if ((n+1) % drf != 0) continue;
+    const Real y = tfn(n+1);
+    for (int i = 0; i < 2; ++i) {
+      if (v.obs_slots(n,i) < 0) continue;
+      ys[v.obs_slots(n,i)] += v.obs_wts(n,i)*y;
+    }
+  }
+  //   Use the data backward in time.
+  for (int n = 0; n < nsub; ++n) {
+    // Each segment orders the data forward in time. Thus, data are always
+    // ordered forward in time but used backward.
+    Real xsup[2], ysup[2];
+    for (int i = 0; i < 2; ++i) {
+      int k = nsub - (n+1) + i;
+      xsup[i] = (k*v.t_vel(v.nvel()-1))/nsub;
+      k = v.run_step(n);
+      const Real
+        y0 = k == 1 ? endslots[0] : ys[k-1],
+        y1 = k == v.nvel()-1 ? endslots[1] : ys[k];
+      ysup[i] = (((v.t_vel(k) - xsup[i])*y0 + (xsup[i] - v.t_vel(k-1))*y1) /
+                 (v.t_vel(k) - v.t_vel(k-1)));
+    }
+    for (int i = 0; i <= 10; ++i) {
+      const Real
+        a = Real(i)/10,
+        x = (1-a)*xsup[0] + a*xsup[1],
+        y = (1-a)*ysup[0] + a*ysup[1];
+      if (std::abs(y - tfn(x)) > 1e3*eps) {
+        printf("n %d i %2d x %7.3f y %7.3f t %7.3f\n", n, i, x, y, tfn(x));
+        ++e;
+      }
+    }
+  }
+  
+  if (e) {
+    printf("ERROR e %d\n", e);
+    printf("dtf %d drf %d nsub %d nvel %d v.nvel %d\n",
+           dtf, drf, nsub, nvel, v.nvel());
+    printf("  t_vel:");
+    for (int i = 0; i < v.nvel(); ++i) printf(" %1.3f", v.t_vel(i));
+    printf("\n  obs:\n");
+    for (int n = 0; n <= dtf; ++n)
+      printf("    %2d %2d %2d %1.3f %1.3f\n",
+             n, v.obs_slots(n,0), v.obs_slots(n,1), v.obs_wts(n,0),
+             v.obs_wts(n,1));
+    printf("  run_step:\n");
+    for (int n = 0; n <= nsub; ++n) printf("    %2d %2d\n", n, v.run_step(n));
+  }
+
+  return e;
+}
+
+int test_init_velocity_record (TestData& td) {
+  int dtf, drf, nsub, nvel, error;
+
+  error = 0;
+
+  const auto f = [&] () {
+    const int e = test1_init_velocity_record(dtf, drf, nsub, nvel);
+    if (e > 0) error = 1;
+  };
+
+  error = 0;
+  dtf = 6;
+  drf = 2;
+  nsub = 3;
+  nvel = 4;
+  f();
+  nvel = 3;
+  f();
+  drf = 3;
+  nvel = 6;
+  f();
+  drf = 1;
+  nsub = 5;
+  f();
+  dtf = 12;
+  drf = 2;
+  nsub = 3;
+  nvel = -1;
+  f();
+  nsub = 5;
+  nvel = 5;
+  f();
+  dtf = 27;
+  drf = 3;
+  nsub = 51;
+  nvel = 99;
+  f();
+
+  return error;
+}
+
 } // namespace anon
 
 #define comunittest(f) do {                     \
@@ -821,14 +1099,16 @@ int test_calc_etadotmid_from_etadotdpdnint (TestData& td) {
 int ComposeTransportImpl::run_enhanced_trajectory_unit_tests () {
   int nerr = 0, ne;
   TestData td(*this);
+  comunittest(test_deriv);
   comunittest(test_find_support);
   comunittest(test_linterp);
   comunittest(test_eta_interp);
   comunittest(test_eta_to_dp);
   comunittest(test_deta_caas);
-  comunittest(test_limit_etam);
+  comunittest(test_limit_etai);
   comunittest(test_calc_ps);
-  comunittest(test_calc_etadotmid_from_etadotdpdnint);
+  comunittest(test_calc_etadot_from_etadotdpdnint);
+  comunittest(test_init_velocity_record);
   return nerr;
 }
 
diff --git a/components/homme/src/share/cxx/ComposeTransportImplGeneral.cpp b/components/homme/src/share/cxx/ComposeTransportImplGeneral.cpp
index bbffbeb8e48c..8fc353c66654 100644
--- a/components/homme/src/share/cxx/ComposeTransportImplGeneral.cpp
+++ b/components/homme/src/share/cxx/ComposeTransportImplGeneral.cpp
@@ -13,7 +13,8 @@
 extern "C" void
 sl_get_params(double* nu_q, double* hv_scaling, int* hv_q, int* hv_subcycle_q,
               int* limiter_option, int* cdr_check, int* geometry_type,
-              int* trajectory_nsubstep);
+              int* trajectory_nsubstep, int* trajectory_nvelocity,
+              int* diagnostics);
 
 namespace Homme {
 
@@ -47,7 +48,6 @@ void ComposeTransportImpl::setup () {
   m_sphere_ops = Context::singleton().get<SphereOperators>();
   
   set_dp_tol();
-  setup_enhanced_trajectory();
   
   nslot = calc_nslot(m_geometry.num_elems());
 }
@@ -59,10 +59,11 @@ void ComposeTransportImpl::reset (const SimulationParams& params) {
 
   sl_get_params(&m_data.nu_q, &m_data.hv_scaling, &m_data.hv_q, &m_data.hv_subcycle_q,
                 &m_data.limiter_option, &m_data.cdr_check, &m_data.geometry_type,
-                &m_data.trajectory_nsubstep);
+                &m_data.trajectory_nsubstep, &m_data.trajectory_nvelocity,
+                &m_data.diagnostics);
 
-  if (independent_time_steps != m_data.independent_time_steps ||
-      m_data.nelemd != num_elems || m_data.qsize != params.qsize) {
+  if (independent_time_steps != m_data.independent_time_steps or
+      m_data.nelemd != num_elems or m_data.qsize != params.qsize) {
     const auto& g = m_geometry;
     const auto& t = m_tracers;
     const auto& s = m_state;
@@ -76,7 +77,9 @@ void ComposeTransportImpl::reset (const SimulationParams& params) {
     if (m_data.trajectory_nsubstep > 0)
       m_data.vnode = DeparturePoints("vnode", nel, num_phys_lev, np, np, ndim);
     if (m_data.trajectory_nsubstep > 1)
-      m_data.vdep  = DeparturePoints("vdep" , nel, num_phys_lev, np, np, ndim);
+      m_data.vdep  = DeparturePoints("vdep" , nel, num_phys_lev, np, np, ndim+1);
+    if (m_data.trajectory_nsubstep > 0)
+      setup_enhanced_trajectory(params, num_elems);
     homme::compose::set_views(
       g.m_spheremp,
       homme::compose::SetView<Real****>  (reinterpret_cast<Real*>(d.m_dp.data()),
@@ -91,7 +94,7 @@ void ComposeTransportImpl::reset (const SimulationParams& params) {
         nel, t.qdp.extent_int(1), t.qdp.extent_int(2), np, np, nlev),
       homme::compose::SetView<Real*****> (reinterpret_cast<Real*>(t.Q.data()),
                                           nel, t.Q.extent_int(1), np, np, nlev),
-      m_data.dep_pts, m_data.vnode, m_data.vdep, ndim);
+      m_data.dep_pts, m_data.vnode, ndim, m_data.vdep, m_data.vdep.extent_int(4));
   }
 
   m_data.independent_time_steps = independent_time_steps;
@@ -212,7 +215,7 @@ void ComposeTransportImpl::run (const TimeLevel& tl, const Real dt) {
   if (m_data.trajectory_nsubstep == 0)
     calc_trajectory(tl.np1, dt);
   else
-    calc_enhanced_trajectory(tl.np1, dt);
+    calc_enhanced_trajectory(tl.nstep, tl.np1, dt);
   
   GPTLstart("compose_isl");
   homme::compose::advect(tl.np1, tl.n0_qdp, tl.np1_qdp);
diff --git a/components/homme/src/share/cxx/ComposeTransportImplTrajectory.cpp b/components/homme/src/share/cxx/ComposeTransportImplTrajectory.cpp
index 7ed513e086f4..83586fb7c56b 100644
--- a/components/homme/src/share/cxx/ComposeTransportImplTrajectory.cpp
+++ b/components/homme/src/share/cxx/ComposeTransportImplTrajectory.cpp
@@ -201,7 +201,7 @@ KOKKOS_FUNCTION static void calc_vertically_lagrangian_levels (
   // Gradient of eta_dot_dpdn = p_eta deta/dt at final time w.r.t. p at initial
   // time.
   const auto& ptp0 = dprecon;
-  cti::approx_derivative(kv, pref, *eta_dot_dpdn[1], ptp0);
+  cti::approx_derivative1(kv, pref, *eta_dot_dpdn[1], ptp0);
   kv.team_barrier();
 
   {
@@ -437,7 +437,7 @@ static int test_approx_derivative () {
   { // Run approx_derivative.
     const auto f = KOKKOS_LAMBDA (const cti::MT& team) {
       KernelVariables kv(team);
-      cti::approx_derivative(kv, xp, yp, yip);
+      cti::approx_derivative1(kv, xp, yp, yip);
     };
     Kokkos::fence();
     Kokkos::parallel_for(policy, f);
diff --git a/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp b/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp
index 20623b8c57da..31455e35daae 100644
--- a/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp
+++ b/components/homme/src/share/cxx/prim_advec_tracers_remap.cpp
@@ -14,11 +14,29 @@
 namespace Homme
 {
 
+static void prim_advec_tracers_observe_velocity_compose (const int step);
 static void prim_advec_tracers_remap_RK2 (const Real dt);
 static void prim_advec_tracers_remap_compose (const Real dt);
 
 // ----------- IMPLEMENTATION ---------- //
 
+void prim_advec_tracers_observe_velocity (const int step) {
+  SimulationParams& params = Context::singleton().get<SimulationParams>();
+
+  if (params.transport_alg > 0)
+    prim_advec_tracers_observe_velocity_compose(step);
+}
+
+static void prim_advec_tracers_observe_velocity_compose (const int step) {
+#if defined MODEL_THETA_L && defined HOMME_ENABLE_COMPOSE
+  GPTLstart("tl-at prim_advec_observe_velocity");
+  const auto& tl = Context::singleton().get<TimeLevel>();
+  auto& ct = Context::singleton().get<ComposeTransport>();
+  ct.observe_velocity(tl, step);
+  GPTLstop("tl-at prim_advec_observe_velocity");
+#endif  
+}
+
 void prim_advec_tracers_remap (const Real dt) {
   SimulationParams& params = Context::singleton().get<SimulationParams>();
 
@@ -97,7 +115,6 @@ static void prim_advec_tracers_remap_compose (const Real dt) {
   auto& tl = Context::singleton().get<TimeLevel>();
   tl.update_tracers_levels(params.dt_tracer_factor);
   auto& ct = Context::singleton().get<ComposeTransport>();
-  ct.reset(params);
   ct.run(tl, dt);
   GPTLstop("tl-at prim_advec_tracers_compose");
 #else
diff --git a/components/homme/src/share/cxx/prim_step.cpp b/components/homme/src/share/cxx/prim_step.cpp
index c5c75e54a354..9ef386132480 100644
--- a/components/homme/src/share/cxx/prim_step.cpp
+++ b/components/homme/src/share/cxx/prim_step.cpp
@@ -17,6 +17,7 @@ namespace Homme
 {
 
 void prim_advance_exp (TimeLevel& tl, const Real dt, const bool compute_diagnostics);
+void prim_advec_tracers_observe_velocity (const int step);
 void prim_advec_tracers_remap (const Real);
 void vertical_remap (const Real);
 
@@ -169,10 +170,12 @@ void prim_step_flexible (const Real dt, const bool compute_diagnostics) {
     prim_advance_exp(tl, dt, compute_diagnostics);
     tl.tevolve += dt;
 
+    bool observe = false;
     if (params.dt_remap_factor == 0) {
       // Since dt_remap == 0, the only part of vertical_remap that is active is
       // the updates to ps_v(:,:,np1) and dp3d(:,:,:,np1).
       vertical_remap(dt_remap);
+      observe = true;
     } else if ((n+1) % params.dt_remap_factor == 0) {
       if (compute_diagnostics)
         context.get<Diagnostics>().run_diagnostics(false, 3);
@@ -187,7 +190,10 @@ void prim_step_flexible (const Real dt, const bool compute_diagnostics) {
         vertical_remap(dt_remap);
         GPTLstop("tl-sc vertical_remap");
       }
+      observe = true;
     }
+    if (observe)
+      prim_advec_tracers_observe_velocity(n);
   }
 
   if (params.qsize > 0)
diff --git a/components/homme/src/share/cxx/vector/vector_pragmas.hpp b/components/homme/src/share/cxx/vector/vector_pragmas.hpp
index fc58b819a293..76901f05ee15 100644
--- a/components/homme/src/share/cxx/vector/vector_pragmas.hpp
+++ b/components/homme/src/share/cxx/vector/vector_pragmas.hpp
@@ -13,7 +13,6 @@
 
 #define VECTOR_IVDEP_LOOP _Pragma("ivdep")
 #define ALWAYS_VECTORIZE_LOOP _Pragma("vector always")
-#define VECTOR_SIMD_LOOP _Pragma("omp simd")
 #if HOMMEXX_VECTOR_SIZE == 1
 # define VECTOR_SIMD_LOOP
 #else
diff --git a/components/homme/src/share/namelist_mod.F90 b/components/homme/src/share/namelist_mod.F90
index 4f775bdbb17f..a9393cab85e5 100644
--- a/components/homme/src/share/namelist_mod.F90
+++ b/components/homme/src/share/namelist_mod.F90
@@ -460,18 +460,7 @@ subroutine readnl(par)
     ne              = 0
     ne_x              = 0
     ne_y              = 0
-    transport_alg = 0
-    semi_lagrange_cdr_alg = 3
-    semi_lagrange_cdr_check = .false.
-    semi_lagrange_hv_q = 1
-    semi_lagrange_nearest_point_lev = 256
-    semi_lagrange_halo = 2
-    semi_lagrange_trajectory_nsubstep = 0
-    semi_lagrange_trajectory_nvelocity = -1
-    semi_lagrange_diagnostics = 0
     disable_diagnostics = .false.
-    se_fv_phys_remap_alg = 1
-    internal_diagnostics_level = 0
     planar_slice = .false.
 
     theta_hydrostatic_mode = .true.    ! for preqx, this must be .true.
@@ -582,20 +571,7 @@ subroutine readnl(par)
            test_case(1:13)== "jw_baroclinic"  .or. &
            test_case(1:5) == "dcmip"          .or. &
            test_case(1:5) == "mtest"          .or. &
-           test_case      == "planar_hydro_gravity_wave"            .or. &
-           test_case      == "planar_nonhydro_gravity_wave"           .or. &
-           test_case      == "planar_hydro_mtn_wave"            .or. &
-           test_case      == "planar_nonhydro_mtn_wave"           .or. &
-           test_case      == "planar_schar_mtn_wave"            .or. &
-           test_case      == "planar_rising_bubble"             .or. &
-           test_case      == "planar_rising_bubble_pg2"         .or. &
-           test_case      == "planar_density_current"             .or. &
-           test_case      == "planar_baroclinic_instab"             .or. &
-           test_case      == "planar_moist_rising_bubble"            .or. &
-           test_case      == "planar_moist_density_current"            .or. &
-           test_case      == "planar_moist_baroclinic_instab"            .or. &
-           test_case      == "planar_tropical_cyclone"             .or. &
-           test_case      == "planar_supercell"             .or. &
+           test_case(1:6) == "planar"         .or. &
            test_case(1:4) == "asp_")  then
          write(iulog,*) "reading vertical namelist..."
 
diff --git a/components/homme/src/share/sl_advection.F90 b/components/homme/src/share/sl_advection.F90
index 3e29c6a017d2..71585219335a 100644
--- a/components/homme/src/share/sl_advection.F90
+++ b/components/homme/src/share/sl_advection.F90
@@ -36,7 +36,7 @@ module sl_advection
   integer :: dep_points_ndim
 
   ! For use in make_positive. Set at initialization to a function of hvcoord%dp0.
-  real(kind=real_kind) :: dp_tol, deta_tol
+  real(kind=real_kind) :: dp_tol, deta_tol, db_deta(nlevp)
 
   public :: prim_advec_tracers_observe_velocity_ALE, prim_advec_tracers_remap_ALE, &
        &    sl_init1, sl_vertically_remap_tracers, sl_unittest
@@ -54,8 +54,8 @@ module sl_advection
   real(kind=real_kind), dimension(:,:,:,:,:), allocatable :: minq, maxq  ! (np,np,nlev,qsize,nelemd)
 
   ! Trajectory velocity data.
-  real(kind=real_kind), dimension(:,:,:,:,:), allocatable :: vnode, vdep ! (ndim,np,np,nlev,nelemd)
-  real(kind=real_kind), allocatable :: dep_points_all(:,:,:,:,:)         ! (ndim,np,np,nlev,nelemd)
+  real(kind=real_kind), dimension(:,:,:,:,:), allocatable :: vnode, vdep ! (ndim[+1],np,np,nlev,nelemd)
+  real(kind=real_kind), allocatable :: dep_points_all(:,:,:,:,:)         ! (ndim,    np,np,nlev,nelemd)
 
   type :: velocity_record_t
      integer :: nvel
@@ -119,7 +119,7 @@ subroutine sl_init1(par, elem)
     use control_mod,            only : transport_alg, semi_lagrange_cdr_alg, &
          nu_q, semi_lagrange_hv_q, semi_lagrange_trajectory_nsubstep, &
          semi_lagrange_trajectory_nvelocity, geometry, dt_remap_factor, dt_tracer_factor, &
-         semi_lagrange_halo
+         semi_lagrange_halo, semi_lagrange_diagnostics
     use element_state,          only : timelevels
     use coordinate_systems_mod, only : cartesian3D_t
     use perf_mod, only: t_startf, t_stopf
@@ -139,7 +139,9 @@ subroutine sl_init1(par, elem)
        is_sphere = trim(geometry) /= 'plane'
        enhanced_trajectory = semi_lagrange_trajectory_nsubstep > 0
        dep_points_ndim = 3
-       if (enhanced_trajectory .and. independent_time_steps) dep_points_ndim = 4
+       if (enhanced_trajectory .and. independent_time_steps) then
+          dep_points_ndim = 4
+       end if
        nslots = nlev*qsize
        do ie = 1, size(elem)
           ! Provide a point inside the target element.
@@ -164,8 +166,8 @@ subroutine sl_init1(par, elem)
        allocate(minq(np,np,nlev,qsize,size(elem)), maxq(np,np,nlev,qsize,size(elem)), &
             &   dep_points_all(dep_points_ndim,np,np,nlev,size(elem)))
        if (enhanced_trajectory) then
-          allocate(vnode(dep_points_ndim,np,np,nlev,size(elem)), &
-               &   vdep (dep_points_ndim,np,np,nlev,size(elem)))
+          allocate(vnode(dep_points_ndim,  np,np,nlev,size(elem)), &
+               &   vdep (dep_points_ndim+1,np,np,nlev,size(elem)))
        end if
        call init_velocity_record(size(elem), dt_tracer_factor, dt_remap_factor, &
             semi_lagrange_trajectory_nsubstep, semi_lagrange_trajectory_nvelocity, &
@@ -181,7 +183,7 @@ subroutine sl_init1(par, elem)
              write(iulog,'(a,i3,i3,i3)') &
                   'COMPOSE> dt_tracer_factor, dt_remap_factor, halo:', &
                   dt_tracer_factor, dt_remap_factor, semi_lagrange_halo
-             write(iulog,'(a,i3,i3)') &
+             write(iulog,'(a,i3,i3,i3)') &
                   'COMPOSE> use enhanced trajectory; nsub, nvel:', &
                   semi_lagrange_trajectory_nsubstep, vrec%nvel
           end if
@@ -192,14 +194,15 @@ subroutine sl_init1(par, elem)
   end subroutine sl_init1
 
   subroutine sl_get_params(nu_q_out, hv_scaling, hv_q, hv_subcycle_q, limiter_option_out, &
-       cdr_check, geometry_type, trajectory_nsubstep) bind(c)
+       cdr_check, geometry_type, trajectory_nsubstep, trajectory_nvelocity, diagnostics) bind(c)
     use control_mod, only: semi_lagrange_hv_q, hypervis_subcycle_q, semi_lagrange_cdr_check, &
-         nu_q, hypervis_scaling, limiter_option, geometry, semi_lagrange_trajectory_nsubstep
+         nu_q, hypervis_scaling, limiter_option, geometry, semi_lagrange_trajectory_nsubstep, &
+         semi_lagrange_trajectory_nvelocity, semi_lagrange_diagnostics
     use iso_c_binding, only: c_int, c_double
 
     real(c_double), intent(out) :: nu_q_out, hv_scaling
     integer(c_int), intent(out) :: hv_q, hv_subcycle_q, limiter_option_out, cdr_check, &
-         geometry_type, trajectory_nsubstep
+         geometry_type, trajectory_nsubstep, trajectory_nvelocity, diagnostics
 
     nu_q_out = nu_q
     hv_scaling = hypervis_scaling
@@ -211,6 +214,8 @@ subroutine sl_get_params(nu_q_out, hv_scaling, hv_q, hv_subcycle_q, limiter_opti
     geometry_type = 0 ! sphere
     if (trim(geometry) == "plane") geometry_type = 1
     trajectory_nsubstep = semi_lagrange_trajectory_nsubstep
+    trajectory_nvelocity = semi_lagrange_trajectory_nvelocity
+    diagnostics = semi_lagrange_diagnostics
   end subroutine sl_get_params
 
   subroutine init_velocity_record(nelemd, dtf, drf_param, nsub, nvel_param, v, error)
@@ -1343,12 +1348,18 @@ subroutine calc_enhanced_trajectory(elem, deriv, hvcoord, hybrid, dt, tl, &
     logical, intent(in) :: independent_time_steps
 
 #ifdef HOMME_ENABLE_COMPOSE
-    integer :: step, ie, info, limiter_active_count
-    real(real_kind) :: alpha(2), dtsub
+    integer :: step, ie, info, limiter_active_count, k, i, j
+    real(real_kind) :: alpha(2), dtsub, a, p(3)
+    real(real_kind), allocatable :: ptmp(:,:,:,:,:), vtmp(:,:,:,:,:)
 
     call t_startf('SLMM_trajectory')
 
-    call slmm_set_hvcoord(hvcoord%etai(1), hvcoord%etai(nlevp), hvcoord%etam)
+    if (deta_tol < 0) then
+       ! Benign write race. Constants are only written, and at least one thread
+       ! must enter this block.
+       call init_constants(hvcoord)
+    end if
+    call slmm_set_hvcoord(hvcoord%etai, hvcoord%etam)
 
     ! Set dep_points_all to level-midpoint arrival points.
     call init_dep_points_all(elem, hvcoord, nets, nete, independent_time_steps)
@@ -1378,8 +1389,21 @@ subroutine calc_enhanced_trajectory(elem, deriv, hvcoord, hybrid, dt, tl, &
           call update_dep_points_all(independent_time_steps, dtsub, nets, nete, vnode)
        else
           ! Fill vdep.
-          call slmm_calc_v_departure(nets, nete, step, dtsub, dep_points_all, &
-               &                     dep_points_ndim, vnode, vdep, info)
+          call slmm_interp_v_update(nets, nete, step, dtsub, dep_points_all, &
+               &                    dep_points_ndim, vnode, vdep, info)
+
+          ! Interpolate eta_dot at interfaces. The support data are not midpoint
+          ! data, though; rather, they're interface data collected at different
+          ! horizontal points. Thus, to be clear, this is not
+          ! midpoint-to-interface interpolation of eta_dot.
+          do ie = nets, nete
+             do k = 2,nlev
+                a =  (hvcoord%etai(k) - hvcoord%etam(k-1)) / &
+                     (hvcoord%etam(k) - hvcoord%etam(k-1))
+                vdep(4,:,:,k,ie) = (1-a)*vdep(5,:,:,k-1,ie) + &
+                     &                a *vdep(4,:,:,k  ,ie)
+             end do
+          end do
 
           ! Using vdep, update dep_points_all to departure points.
           call update_dep_points_all(independent_time_steps, dtsub, nets, nete, vdep)
@@ -1394,8 +1418,8 @@ subroutine calc_enhanced_trajectory(elem, deriv, hvcoord, hybrid, dt, tl, &
        if (iand(semi_lagrange_diagnostics, 1) /= 0) then
           limiter_active_count = ParallelSum(limiter_active_count, hybrid)
           if (limiter_active_count > 0 .and. hybrid%masterthread) then
-             write(iulog, '(a,i11)') 'COMPOSE> limiter_active_count', &
-                  limiter_active_count
+             write(iulog, '(a,i11,i11)') 'COMPOSE> nstep, limiter_active_count:', &
+                  tl%nstep, limiter_active_count
           end if
        end if
     end if
@@ -1426,8 +1450,9 @@ subroutine init_dep_points_all(elem, hvcoord, nets, nete, independent_time_steps
                 dep_points_all(1:3,i,j,k,ie) = dep_points_all(1:3,i,j,1,ie)
              end do
              if (independent_time_steps) then
+                ! hvcoord%etai(k), k = 1 and nlevp, are not used.
                 do k = 1, nlev
-                   dep_points_all(4,i,j,k,ie) = hvcoord%etam(k)
+                   dep_points_all(4,i,j,k,ie) = hvcoord%etai(k)
                 end do
              end if
           end do
@@ -1510,8 +1535,8 @@ subroutine calc_nodal_velocities(elem, deriv, hvcoord, tl, &
     integer :: t
 
     if (independent_time_steps) then
-       call calc_eta_dot_ref_mid(elem, deriv, tl, hvcoord, alpha, &
-            &                    v1, dp1, v2, dp2, eta_dot)
+       call calc_eta_dot_ref(elem, deriv, tl, hvcoord, alpha, &
+            &                v1, dp1, v2, dp2, eta_dot)
     else
        eta_dot = zero
     end if
@@ -1528,14 +1553,12 @@ subroutine calc_nodal_velocities(elem, deriv, hvcoord, tl, &
     call calc_vel_horiz_formula_node_ref_mid( &
          &  elem, deriv, hvcoord, dtsub, vsph, eta_dot, vnode)
     if (independent_time_steps) then
-       call calc_eta_dot_formula_node_ref_mid( &
+       call calc_eta_dot_formula_node_ref_int( &
             elem, deriv, hvcoord, dtsub, vsph, eta_dot, vnode)
     end if
   end subroutine calc_nodal_velocities
 
-  subroutine calc_eta_dot_ref_mid(elem, deriv, tl, hvcoord, alpha, v1, dp1, v2, dp2, eta_dot)
-    ! Compute eta_dot at midpoint nodes at the start and end of the substep.
-
+  subroutine calc_eta_dot_ref(elem, deriv, tl, hvcoord, alpha, v1, dp1, v2, dp2, eta_dot)
     type (element_t), intent(in) :: elem
     type (derivative_t), intent(in) :: deriv
     type (TimeLevel_t), intent(in) :: tl
@@ -1564,22 +1587,20 @@ subroutine calc_eta_dot_ref_mid(elem, deriv, tl, hvcoord, alpha, v1, dp1, v2, dp
        do k = 2,nlev
           eta_dot(:,:,k,t) = hvcoord%hybi(k)*w1 - eta_dot(:,:,k,t)
        end do
-       ! Transform eta_dot_dpdn at interfaces to eta_dot at midpoints using the
-       ! formula
-       !     eta_dot = eta_dot_dpdn/(A_eta p0 + B_eta ps)
-       !            a= eta_dot_dpdn diff(eta)/(diff(A) p0 + diff(B) ps).
-       !   Compute ps.
+       ! Compute ps.
        w1 = hvcoord%hyai(1)*hvcoord%ps0 + &
             &    (1 - alpha(t))*sum(dp1, 3) + &
             &         alpha(t) *sum(dp2, 3)
-       do k = 1,nlev
-          eta_dot(:,:,k,t) = half*(eta_dot(:,:,k,t) + eta_dot(:,:,k+1,t)) &
-               &             * (hvcoord%etai(k+1) - hvcoord%etai(k)) &
-               &             / (  (hvcoord%hyai(k+1) - hvcoord%hyai(k))*hvcoord%ps0 &
-               &                + (hvcoord%hybi(k+1) - hvcoord%hybi(k))*w1)
+       ! Transform eta_dot_dpdn at interfaces to eta_dot at midpoints using the
+       ! formula
+       !     eta_dot = eta_dot_dpdn/(A_eta p0 + B_eta ps).
+       ! Use p_eta = A_eta p0 + B_eta ps = p0 + B_eta (ps - p0).
+       do k = 2,nlev
+          eta_dot(:,:,k,t) = eta_dot(:,:,k,t) / &
+               &             (hvcoord%ps0 + db_deta(k)*(w1 - hvcoord%ps0))
        end do
     end do
-  end subroutine calc_eta_dot_ref_mid
+  end subroutine calc_eta_dot_ref
 
   subroutine calc_vel_horiz_formula_node_ref_mid( &
        elem, deriv, hvcoord, dtsub, vsph, eta_dot, vnode)
@@ -1616,7 +1637,8 @@ subroutine calc_vel_horiz_formula_node_ref_mid( &
              w2 = hvcoord%etam(k) ! derivative at this eta value
              call eval_lagrange_poly_derivative(3, w3, vsph(:,:,d,k-1:k+1,t0), w2, w1)
           end if
-          vfsph(:,:,d) = vfsph(:,:,d) - dtsub*eta_dot(:,:,k,t1)*w1
+          vfsph(:,:,d) = vfsph(:,:,d) - &
+               dtsub*half*(eta_dot(:,:,k,t1) + eta_dot(:,:,k+1,t1))*w1
        end do
        ! Finish the formula.
        vfsph = half*vfsph
@@ -1627,7 +1649,7 @@ subroutine calc_vel_horiz_formula_node_ref_mid( &
     end do
   end subroutine calc_vel_horiz_formula_node_ref_mid
 
-  subroutine calc_eta_dot_formula_node_ref_mid( &
+  subroutine calc_eta_dot_formula_node_ref_int( &
        elem, deriv, hvcoord, dtsub, vsph, eta_dot, vnode)
 
     type (element_t), intent(in) :: elem
@@ -1638,43 +1660,28 @@ subroutine calc_eta_dot_formula_node_ref_mid( &
 
     integer, parameter :: t0 = 1, t1 = 2
     
-    real(real_kind) :: w1(np,np), w2(np,np), w3(np,np,3), w4(np,np,3)
+    real(real_kind) :: w1(np,np), w2(np,np), w3(np,np,3), w4(np,np,2), a
     integer :: k, i, k1, k2
 
-    do k = 1, nlev
-       w2 = hvcoord%etam(k)
-       if (k == 1 .or. k == nlev) then
-          if (k == 1) then
-             w3(:,:,1) = hvcoord%etai(1)
-             w4(:,:,1) = zero
-             do i = 1, 2
-                w3(:,:,i+1) = hvcoord%etam(i)
-                w4(:,:,i+1) = eta_dot(:,:,i,t0)
-             end do
-          else
-             do i = 1, 2
-                w3(:,:,i) = hvcoord%etam(nlev-2+i)
-                w4(:,:,i) = eta_dot(:,:,nlev-2+i,t0)
-             end do
-             w3(:,:,3) = hvcoord%etai(nlevp)
-             w4(:,:,3) = zero
-          end if
-          call eval_lagrange_poly_derivative(3, w3, w4, w2, w1)
-       else
-          k1 = k-1
-          k2 = k+1
-          do i = 1, 3
-             w3(:,:,i) = hvcoord%etam(k1-1+i)
-          end do
-          call eval_lagrange_poly_derivative(k2-k1+1, w3, eta_dot(:,:,k1:k2,t0), w2, w1)
-       end if
+    vnode(4,:,:,1) = zero
+    do k = 2, nlev
+       w2 = hvcoord%etai(k)
+       k1 = k-1
+       k2 = k+1
+       do i = 1, 3
+          w3(:,:,i) = hvcoord%etai(k1-1+i)
+       end do
+       call eval_lagrange_poly_derivative(3, w3, eta_dot(:,:,k1:k2,t0), w2, w1)
        w3(:,:,1:2) = gradient_sphere(eta_dot(:,:,k,t0), deriv, elem%Dinv)
+       ! Linearly interp horiz velocity to interfaces.
+       a = (hvcoord%etai(k) - hvcoord%etam(k-1)) / (hvcoord%etam(k) - hvcoord%etam(k-1))
+       w4 = (1 - a)*vsph(:,:,:,k-1,t1) + a*vsph(:,:,:,k,t1)
        vnode(4,:,:,k) = &
             half*(eta_dot(:,:,k,t0) + eta_dot(:,:,k,t1) &
-            &     - dtsub*(vsph(:,:,1,k,t1)*w3(:,:,1) + vsph(:,:,2,k,t1)*w3(:,:,2) &
+            &     - dtsub*(w4(:,:,1)*w3(:,:,1) + w4(:,:,2)*w3(:,:,2) &
             &              + eta_dot(:,:,k,t1)*w1))
     end do
-  end subroutine calc_eta_dot_formula_node_ref_mid
+  end subroutine calc_eta_dot_formula_node_ref_int
 
   subroutine update_dep_points_all(independent_time_steps, dtsub, nets, nete, vdep)
     ! Determine the departure points corresponding to the reference grid's
@@ -1687,7 +1694,7 @@ subroutine update_dep_points_all(independent_time_steps, dtsub, nets, nete, vdep
     integer, intent(in) :: nets, nete
     real(real_kind), intent(in) :: vdep(:,:,:,:,:)
 
-    real(real_kind) :: norm, p(3)
+    real(real_kind) :: norm, p(3), eta_dot_kp1
     integer :: ie, k, j, i
 
     do ie = nets, nete
@@ -1728,34 +1735,37 @@ subroutine interp_departure_points_to_floating_level_midpoints( &
     real(real_kind), intent(inout) :: dep_points_all(:,:,:,:,:)
     integer, intent(inout) :: limcnt
 
-    real(real_kind) :: deta_ref(nlevp), w1(np,np), v1(np,np,nlev), &
-         &             v2(np,np,nlevp), p(3)
+    real(real_kind) :: detam_ref(nlevp), detai_ref(nlev), w1(np,np), &
+         &             v1(np,np,nlev), v2(np,np,nlevp), p(3)
     integer :: ie, i, j, k, d
 
-    call set_deta_tol(hvcoord)
-
-    deta_ref(1) = hvcoord%etam(1) - hvcoord%etai(1)
+    detam_ref(1) = hvcoord%etam(1) - hvcoord%etai(1)
     do k = 2, nlev
-       deta_ref(k) = hvcoord%etam(k) - hvcoord%etam(k-1)
+       detam_ref(k) = hvcoord%etam(k) - hvcoord%etam(k-1)
+    end do
+    detam_ref(nlevp) = hvcoord%etai(nlevp) - hvcoord%etam(nlev)
+    do k = 1, nlev
+       detai_ref(k) = hvcoord%etai(k+1) - hvcoord%etai(k)
     end do
-    deta_ref(nlevp) = hvcoord%etai(nlevp) - hvcoord%etam(nlev)
 
     do ie = nets, nete
        ! Surface pressure.
        w1 = hvcoord%hyai(1)*hvcoord%ps0 + sum(elem(ie)%state%dp3d(:,:,:,tl%np1), 3)
 
        ! Reconstruct Lagrangian levels at t1 on arrival column:
-       !     eta_arr_int = I[eta_ref_mid([0,eta_dep_mid,1])](eta_ref_int)
-       call limit_etam(hvcoord, deta_ref, dep_points_all(4,:,:,:,ie), v1, limcnt)
+       !     eta_arr_int = I[eta_ref_int(eta_dep_int)](eta_ref_int)
+       call limit_etai(hvcoord, detai_ref, dep_points_all(4,:,:,:,ie), v1, limcnt)
        v2(:,:,1) = hvcoord%etai(1)
        v2(:,:,nlevp) = hvcoord%etai(nlevp)
-       call eta_interp_eta(hvcoord, v1, hvcoord%etam, &
+       call eta_interp_eta(hvcoord, &
+            &              nlevp-2, v1(:,:,2:nlev), hvcoord%etai(2:nlev), &
             &              nlevp-2, hvcoord%etai(2:nlev), v2(:,:,2:nlev))
        call eta_to_dp(hvcoord, w1, v2, elem(ie)%derived%divdp)
 
        ! Compute Lagrangian level midpoints at t1 on arrival column:
-       !     eta_arr_mid = I[eta_ref_mid([0,eta_dep_mid,1])](eta_ref_mid)
-       call eta_interp_eta(hvcoord, v1, hvcoord%etam, &
+       !     eta_arr_mid = I[eta_ref_int(eta_dep_int)](eta_ref_mid)
+       call eta_interp_eta(hvcoord, &
+            &              nlevp-2, v1(:,:,2:nlev), hvcoord%etai(2:nlev), &
             &              nlev, hvcoord%etam, v2(:,:,1:nlev))
        dep_points_all(4,:,:,:,ie) = v2(:,:,1:nlev)
 
@@ -1782,7 +1792,7 @@ subroutine interp_departure_points_to_floating_level_midpoints( &
     end do
   end subroutine interp_departure_points_to_floating_level_midpoints
 
-  subroutine set_deta_tol(hvcoord)
+  subroutine init_constants(hvcoord)
     type (hvcoord_t), intent(in) :: hvcoord
 
     real(real_kind) :: deta_ave
@@ -1791,36 +1801,59 @@ subroutine set_deta_tol(hvcoord)
 
     ! Benign write race condition. A thread might see eta_tol < 0 and set it
     ! here even as another thread does the same. But because there is no read
-    ! and only one value to write, the redundant writes don't matter.
+    ! and only one value to write, the redundant writes don't matter. At least
+    ! one thread must see eta_tol < 0.
 
     deta_ave = (hvcoord%etai(nlev+1) - hvcoord%etai(1)) / nlev
     deta_tol = 10_real_kind*eps*deta_ave
-  end subroutine set_deta_tol
 
-  subroutine limit_etam(hvcoord, deta_ref, eta, eta_lim, cnt)
+    call estimate_derivative(nlevp, hvcoord%etai, hvcoord%hybi, db_deta)
+  end subroutine init_constants
+
+  subroutine estimate_derivative(n, x, y, y_x)
+    ! Weighted average of the two 1-sided finite differences. In infinite
+    ! precision, the values for indices 2:n-1 are the same as
+    ! eval_lagrange_poly_derivative with three support points.
+    
+    integer, intent(in) :: n
+    real(real_kind), intent(in) :: x(n), y(n)
+    real(real_kind), intent(out) :: y_x(n)
+
+    integer :: k
+    real(real_kind) :: dx1, dx2, a
+
+    y_x(1) = (y(2) - y(1)) / (x(2) - x(1))
+    do k = 2, n-1
+       dx1 = x(k) - x(k-1)
+       dx2 = x(k+1) - x(k)
+       a = dx2/(dx1 + dx2)
+       y_x(k) = a*(y(k) - y(k-1))/dx1 + (1-a)*(y(k+1) - y(k))/dx2
+    end do
+    y_x(n) = (y(n) - y(n-1)) / (x(n) - x(n-1))
+  end subroutine estimate_derivative
+
+  subroutine limit_etai(hvcoord, deta_ref, eta, eta_lim, cnt)
     type (hvcoord_t), intent(in) :: hvcoord
-    real(real_kind), intent(in) :: deta_ref(nlevp), eta(np,np,nlev)
+    real(real_kind), intent(in) :: deta_ref(nlev), eta(np,np,nlev)
     real(real_kind), intent(out) :: eta_lim(np,np,nlev)
     integer, intent(inout) :: cnt
 
-    real(real_kind) :: deta(nlevp)
+    real(real_kind) :: deta(nlev)
     integer :: i, j, k
     logical :: ok
 
     do j = 1, np
        do i = 1, np
-          ! Check nonmonotonicity in eta.
-          ok = eta(i,j,1) - hvcoord%etai(1) >= deta_tol
-          if (ok) then
-             do k = 2, nlev
-                if (eta(i,j,k) - eta(i,j,k-1) < deta_tol) then
-                   ok = .false.
-                   exit
-                end if
-             end do
-             if (ok) then
-                ok = hvcoord%etai(nlevp) - eta(i,j,nlev) >= deta_tol
+          ! Check for nonmonotonicity in eta.
+          ok = .true.
+          do k = 2, nlev
+             if (eta(i,j,k) - eta(i,j,k-1) < deta_tol) then
+                ok = .false.
+                exit
              end if
+          end do
+          if (ok) then
+             ok = hvcoord%etai(nlevp) - eta(i,j,nlev) >= deta_tol
           end if
           ! eta is monotonically increasing, so don't need to do anything
           ! further.
@@ -1829,33 +1862,30 @@ subroutine limit_etam(hvcoord, deta_ref, eta, eta_lim, cnt)
              cycle
           end if
           
-          deta(1) = eta(i,j,1) - hvcoord%etai(1)
-          do k = 2, nlev
-             deta(k) = eta(i,j,k) - eta(i,j,k-1)
+          do k = 1, nlev-1
+             deta(k) = eta(i,j,k+1) - eta(i,j,k)
           end do
-          deta(nlevp) = hvcoord%etai(nlevp) - eta(i,j,nlev)
-          ! [0, etam(1)] and [etam(nlev),1] are half levels, but deta_tol is so
-          ! small there's no reason not to use it as a lower bound for these.
+          deta(nlev) = hvcoord%etai(nlevp) - eta(i,j,nlev)
           cnt = cnt + 1
-          call deta_caas(nlevp, deta_ref, deta_tol, deta)
-          eta_lim(i,j,1) = hvcoord%etai(1) + deta(1)
-          do k = 2, nlev
-             eta_lim(i,j,k) = eta_lim(i,j,k-1) + deta(k)
+          call deta_caas(nlev, deta_ref, deta_tol, deta)
+          eta_lim(i,j,1) = eta(i,j,1)
+          do k = 1, nlev-1
+             eta_lim(i,j,k+1) = eta_lim(i,j,k) + deta(k)
           end do
        end do
     end do
-  end subroutine limit_etam
+  end subroutine limit_etai
 
-  subroutine deta_caas(nlp, deta_ref, lo, deta)
-    integer, intent(in) :: nlp
-    real(real_kind), intent(in) :: deta_ref(nlp), lo
-    real(real_kind), intent(inout) :: deta(nlp)
+  subroutine deta_caas(nl, deta_ref, lo, deta)
+    integer, intent(in) :: nl
+    real(real_kind), intent(in) :: deta_ref(nl), lo
+    real(real_kind), intent(inout) :: deta(nl)
 
-    real(real_kind) :: nerr, w(nlp)
+    real(real_kind) :: nerr, w(nl)
     integer :: k
 
     nerr = zero
-    do k = 1, nlp
+    do k = 1, nl
        if (deta(k) < lo) then
           nerr = nerr + (deta(k) - lo)
           deta(k) = lo
@@ -1906,25 +1936,25 @@ subroutine linterp(n, x, y, ni, xi, yi, caller)
     end do
   end subroutine linterp
 
-  subroutine eta_interp_eta(hvcoord, x, y, ni, xi, yi)
+  subroutine eta_interp_eta(hvcoord, n, x, y, ni, xi, yi)
     type (hvcoord_t), intent(in) :: hvcoord
-    real(real_kind), intent(in) :: x(np,np,nlev), y(nlev)
-    integer, intent(in) :: ni
+    integer, intent(in) :: n, ni
+    real(real_kind), intent(in) :: x(np,np,n), y(n)
     real(real_kind), intent(in) :: xi(ni)
     real(real_kind), intent(out) :: yi(np,np,ni)
 
-    real(real_kind) :: x01(nlev+2), y01(nlev+2)
+    real(real_kind) :: x01(n+2), y01(n+2)
     integer :: i, j
 
     x01(1) = hvcoord%etai(1)
-    x01(nlev+2) = hvcoord%etai(nlevp)
+    x01(n+2) = hvcoord%etai(nlevp)
     y01(1) = hvcoord%etai(1)
-    y01(2:nlev+1) = y
-    y01(nlev+2) = hvcoord%etai(nlevp)
+    y01(2:n+1) = y
+    y01(n+2) = hvcoord%etai(nlevp)
     do j = 1, np
        do i = 1, np
-          x01(2:nlev+1) = x(i,j,:)
-          call linterp(nlev+2, x01, y01, &
+          x01(2:n+1) = x(i,j,:)
+          call linterp(n+2, x01, y01, &
                &       ni, xi, yi(i,j,:), &
                &       'eta_interp_eta')
        end do
@@ -2070,6 +2100,37 @@ function assert(b, msg) result(nerr)
     nerr = 1
   end function assert
 
+  function test_estimate_derivative() result (nerr)
+    integer :: nerr
+
+    integer, parameter :: n = 3
+    real(real_kind), parameter :: x(3) = (/ -0.3, 0.1, 1.1 /)
+
+    real(real_kind) :: y(n), y_x_true(n), y_x_est(n)
+
+    integer :: k
+
+    nerr = 0
+
+    ! Linear.
+    do k = 1, n
+       y(k) = -1.2*x(k) + 0.7
+       y_x_true(k) = -1.2
+    end do
+    call estimate_derivative(n, x, y, y_x_est)
+    do k = 1, n
+       if (abs(y_x_est(k) - y_x_true(k)) > 10*eps) nerr = nerr + 1
+    end do
+
+    ! Quadratic.
+    do k = 1, n
+       y(k) = 0.7*x(k)**2 - 1.2*x(k) + 0.7
+       y_x_true(k) = 1.4*x(k) - 1.2
+    end do
+    call estimate_derivative(n, x, y, y_x_est)
+    if (abs(y_x_est(2) - y_x_true(2)) > 10*eps) nerr = nerr + 1
+  end function test_estimate_derivative
+
   function test_linterp() result (nerr)
     integer, parameter :: n = 128, ni = 111
 
@@ -2344,13 +2405,14 @@ subroutine cleanup(v)
     end subroutine cleanup
   end function test_init_velocity_record
 
-  subroutine sl_unittest(par, hvcoord)
+  subroutine sl_unittest(par, hvcoord, nerr)
     use kinds, only: iulog
 
     type (parallel_t), intent(in) :: par
     type (hvcoord_t), intent(in) :: hvcoord
+    integer, intent(out) :: nerr
 
-    integer :: n(6)
+    integer :: n(7)
 
     n(1) = test_lagrange()
     n(2) = test_reconstruct_and_limit_dp()
@@ -2358,9 +2420,11 @@ subroutine sl_unittest(par, hvcoord)
     n(4) = test_linterp()
     n(5) = test_eta_to_dp(hvcoord)
     n(6) = test_init_velocity_record()
+    n(7) = test_estimate_derivative()
 
-    if (sum(n) > 0 .and. par%masterproc) then
-       write(iulog,'(a,6i2)') 'COMPOSE> sl_unittest FAIL ', n
+    nerr = sum(n)
+    if (nerr > 0 .and. par%masterproc) then
+       write(iulog,'(a,7i2)') 'COMPOSE> sl_unittest FAIL ', n
     end if
   end subroutine sl_unittest
 
diff --git a/components/homme/src/test_mod.F90 b/components/homme/src/test_mod.F90
index 0f5220282cce..2a888f5ecc28 100644
--- a/components/homme/src/test_mod.F90
+++ b/components/homme/src/test_mod.F90
@@ -32,6 +32,7 @@ module test_mod
 use dry_planar_tests,     only: planar_rising_bubble_init, planar_density_current_init, planar_baroclinic_instab_init
 use moist_planar_tests,   only: planar_moist_rising_bubble_init, planar_moist_density_current_init, planar_moist_baroclinic_instab_init
 use moist_planar_tests,   only: planar_tropical_cyclone_init, planar_supercell_init
+use planar_transport_tests, only: test_conv_planar_advection
 
 implicit none
 
@@ -69,7 +70,7 @@ subroutine set_test_initial_conditions(elem, deriv, hybrid, hvcoord, tl, nets, n
     case('baroclinic');
     case('dcmip2012_test1_1');
     case('dcmip2012_test1_3a_conv', 'dcmip2012_test1_3b_conv', 'dcmip2012_test1_3c_conv', &
-         'dcmip2012_test1_3d_conv', 'dcmip2012_test1_3e_conv', 'dcmip2012_test1_3f_conv')
+         'dcmip2012_test1_3d_conv', 'dcmip2012_test1_3e_conv', 'dcmip2012_test1_3f_conv');
     case('dcmip2012_test1_2');
     case('dcmip2012_test1_3');
     case('dcmip2012_test2_0');
@@ -103,7 +104,8 @@ subroutine set_test_initial_conditions(elem, deriv, hybrid, hvcoord, tl, nets, n
     case('planar_moist_density_current');
     case('planar_moist_baroclinic_instab'); 
     case('planar_tropical_cyclone'); 
-    case('planar_supercell'); 
+    case('planar_supercell');
+    case('planar_transport_a');
     case default;               call abortmp('unrecognized test case')
   endselect
 
@@ -158,6 +160,10 @@ subroutine set_test_initial_conditions(elem, deriv, hybrid, hvcoord, tl, nets, n
       case('planar_moist_baroclinic_instab');       call planar_moist_baroclinic_instab_init(elem,hybrid,hvcoord,nets,nete)
       case('planar_tropical_cyclone');              call planar_tropical_cyclone_init(elem,hybrid,hvcoord,nets,nete)
       case('planar_supercell');                     call planar_supercell_init(elem,hybrid,hvcoord,nets,nete)
+      case('planar_transport_a')
+         midpoint_eta_dot_dpdn = .true.
+         call test_conv_planar_advection( &
+              test_case,elem,hybrid,hvcoord,deriv,nets,nete,0.0d0,1,timelevels)
       case default;               call abortmp('unrecognized test case')
 
     endselect
@@ -204,6 +210,8 @@ subroutine set_test_prescribed_wind(elem, deriv, hybrid, hvcoord, dt, tl, nets,
        call dcmip2012_test1_conv(test_case,elem,hybrid,hvcoord,deriv,nets,nete,time,np1,np1)
     case('dcmip2012_test1_2'); call dcmip2012_test1_2(elem,hybrid,hvcoord,nets,nete,time,np1,np1)
     case('dcmip2012_test1_3'); call dcmip2012_test1_3(elem,hybrid,hvcoord,nets,nete,time,np1,np1,deriv)
+    case('planar_transport_a')
+       call test_conv_planar_advection(test_case,elem,hybrid,hvcoord,deriv,nets,nete,time,np1,np1)
   endselect
 
 end subroutine
@@ -355,6 +363,7 @@ subroutine set_prescribed_wind(elem,deriv,hybrid,hv,dt,tl,nets,nete,eta_ave_w)
   subroutine print_test_results(elem, tl, hvcoord, par)
     use parallel_mod, only: parallel_t
     use dcmip12_wrapper, only: dcmip2012_print_test1_conv_results
+    use planar_transport_tests, only: print_conv_planar_advection_results
 
     type(element_t), intent(in) :: elem(:)
     type(timelevel_t), intent(in) :: tl
@@ -365,6 +374,8 @@ subroutine print_test_results(elem, tl, hvcoord, par)
     case('dcmip2012_test1_3a_conv', 'dcmip2012_test1_3b_conv', 'dcmip2012_test1_3c_conv', &
          'dcmip2012_test1_3d_conv', 'dcmip2012_test1_3e_conv', 'dcmip2012_test1_3f_conv')
        call dcmip2012_print_test1_conv_results(test_case, elem, tl, hvcoord, par, 1)
+    case('planar_transport_a')
+       call print_conv_planar_advection_results(test_case, elem, tl, hvcoord, par)
     end select
   end subroutine print_test_results
 
diff --git a/components/homme/src/test_src/dcmip2012_test1_conv_mod.F90 b/components/homme/src/test_src/dcmip2012_test1_conv_mod.F90
index b8805e4486ee..4026ebb1df3d 100644
--- a/components/homme/src/test_src/dcmip2012_test1_conv_mod.F90
+++ b/components/homme/src/test_src/dcmip2012_test1_conv_mod.F90
@@ -312,12 +312,17 @@ subroutine test1_conv_advection_orography( &
           tau_topo = tau_h
        end if
        u_topo_fac = -u0_topo/two
+       ! The mountain center moves with velocity
+       !    u(time) = cos(pi time/tau_topo) u_topo_fac
+       ! Then the position is
+       !    p(time) = int_0^time u(t) dt
+       !            = sin(pi time/tau_topo) (tau_topo/pi) u_topo_fac
        lambdam_t = lambdam_t + &
-            &      sin(pi*time/tau_topo)*(tau_topo/pi)*u_topo_fac & ! integral of u at lat = 0
+            &      sin(pi*time/tau_topo)*(tau_topo/pi)*u_topo_fac &
             &      /a ! to radians
     end if
     r = great_circle_dist(lambdam_t, phim, lon, lat)
-    if (r .lt. Rm) then
+    if (r < Rm) then
        zs = (h0/2.d0)*(one+cos(pi*r/Rm))*cos(pi*r/zetam)**2.d0
     else
        zs = zero
@@ -381,8 +386,9 @@ subroutine test1_conv_advection_orography( &
           fz_z = -3*sin(gz)**2*cos(gz)*gz_z
        end if
        c0 = w0_h*(rho0/rho)*cos(pi*time/tau_h)
-       w =    c0*(cos(lat)*fl_lat - 2*sin(lat)*fl)*fz
-       v = -a*c0*(cos(lat)*fl                    )*fz_z
+       ! Don't use this because !use_w:
+       !   w = c0*(cos(lat)*fl_lat - 2*sin(lat)*fl)*fz
+       v = -a*c0*(cos(lat)*fl)*fz_z
     case default
        call abortmp('test1_conv_advection_orography: invalid case')
     end select
@@ -470,6 +476,7 @@ subroutine test1_conv_print_results(test_case, elem, tl, hvcoord, par, subnum)
     use parallel_mod, only: global_shared_buf, global_shared_sum
     use global_norms_mod, only: wrap_repro_sum
     use physical_constants, only: Rd => Rgas, p0
+    use kinds, only: iulog
 
     character(len=*), intent(in) :: test_case
     type(element_t), intent(in) :: elem(:)
@@ -534,13 +541,14 @@ subroutine test1_conv_print_results(test_case, elem, tl, hvcoord, par, subnum)
     end do
     
     if (par%masterproc) then
-       print '(a)', 'test1_conv>                          l2                    linf'
+       write(iulog, '(a)') &
+            'test1_conv>                          l2                    linf'
        do iq = 1,qsize
           a = global_shared_sum(2*iq-1)
           b = global_shared_sum(2*iq)
           reldif = sqrt(a/b)
-          print '(a,i2,es24.16,es24.16)', 'test1_conv> Q', &
-               iq, reldif, linf_num(iq)/linf_den(iq)
+          write(iulog, '(a,i2,es24.16,es24.16)') &
+               'test1_conv> Q', iq, reldif, linf_num(iq)/linf_den(iq)
        end do
     end if
   end subroutine test1_conv_print_results
diff --git a/components/homme/src/test_src/planar_transport.F90 b/components/homme/src/test_src/planar_transport.F90
new file mode 100644
index 000000000000..97f156b6f911
--- /dev/null
+++ b/components/homme/src/test_src/planar_transport.F90
@@ -0,0 +1,241 @@
+#ifndef CAM
+#include "config.h"
+
+module planar_transport_tests
+  use kinds, only: rl=>real_kind, iulog
+  use element_mod, only: element_t
+  use parallel_mod, only: parallel_t, abortmp
+  use hybrid_mod, only: hybrid_t
+  use hybvcoord_mod, only: hvcoord_t, set_layer_locations
+  use derivative_mod, only: derivative_t, gradient_sphere
+  use element_ops, only: set_state, set_state_i
+  ! Planar geometry parameters.
+  use physical_constants, only: Lx, Ly, dd_pi, &
+       &                        Rgas, g, cp, pi => dd_pi, p0
+  use dimensions_mod, only: ne_x, ne_y, qsize, qsize_d, nlev, nlevp, np, nelemd
+  ! Test problem tools.
+  use dcmip12_wrapper, only: get_evenly_spaced_z, set_hybrid_coefficients, &
+       &                     pressure_thickness
+
+  implicit none
+  private
+
+  public :: test_conv_planar_advection, print_conv_planar_advection_results
+
+  real(rl), parameter :: &
+       tau     = 12.d0 * 86400.d0, & ! period of motion 12 days
+       T0      = 300.d0,           & ! temperature (K)
+       ztop    = 12000.d0,         & ! model top (m)
+       H       = Rgas * T0 / g       ! scale height
+
+  character :: tc_major, tc_minor
+  real(rl) :: zi(nlevp), zm(nlev)
+
+contains  
+
+  subroutine test_conv_planar_advection( &
+       test_case, elem, hybrid, hvcoord, deriv, nets, nete, time, n0, n1)
+    character(len=*), intent(in):: test_case
+    type (element_t), intent(inout), target :: elem(:)
+    type (hybrid_t), intent(in):: hybrid
+    type (hvcoord_t), intent(inout) :: hvcoord
+    type (derivative_t), intent(in):: deriv
+    integer, intent(in):: nets, nete, n0, n1
+    real(rl), intent(in):: time
+
+    integer :: ie, k, j, i, qi
+    real(rl) :: x, y, u, v, w, T, ps, phis, p, dp, z, q(qsize)
+    real(rl):: grad_p(np,np,2), p_i(np,np), u_i(np,np), v_i(np,np)
+
+    if (time <= 0.d0) then
+       call init(test_case, hybrid, hvcoord)
+    end if
+
+    w = 0
+    do ie = nets,nete
+       do k = 1,nlevp
+          do j = 1,np
+             do i = 1,np
+                x = elem(ie)%spherep(i,j)%lon
+                y = elem(ie)%spherep(i,j)%lat
+                if (k < nlevp) then
+                   call get_values(time, x, y, hvcoord%hyam(k), hvcoord%hybm(k), &
+                        &          ps, phis, p, z, T, u, v, q)
+                   dp = pressure_thickness(ps,k,hvcoord)
+                   if (time <= 0.d0) then
+                      do qi = 1, qsize
+                         elem(ie)%state%Q(i,j,k,qi) = q(qi)
+                         elem(ie)%state%Qdp(i,j,k,qi,:) = elem(ie)%state%Q(i,j,k,qi) * dp
+                      end do
+                   end if
+                   call set_state(u, v, w, T, ps, phis, p, dp, z, g, i, j, k, elem(ie), n0, n1)
+                end if
+                call get_values(time, x, y, hvcoord%hyai(k), hvcoord%hybi(k), &
+                     &          ps, phis, p, z, T, u, v, q)
+                call set_state_i(u, v, w, T, ps, phis, p, z, g, i, j, k, elem(ie), n0, n1)
+                p_i(i,j) = p
+                u_i(i,j) = u
+                v_i(i,j) = v
+             end do
+          end do
+          ! Get vertical mass flux. This is not used in the case of interest: 3D
+          ! transport. But include it for completeness.
+          grad_p = gradient_sphere(p_i,deriv,elem(ie)%Dinv)
+          elem(ie)%derived%eta_dot_dpdn_prescribed(:,:,k) = &
+               -u_i*grad_p(:,:,1) - v_i*grad_p(:,:,2)
+       end do
+       elem(ie)%derived%eta_dot_dpdn_prescribed(:,:,1)     = 0
+       elem(ie)%derived%eta_dot_dpdn_prescribed(:,:,nlevp) = 0       
+    end do
+  end subroutine test_conv_planar_advection
+
+  subroutine get_values(time, x, y, hya, hyb, &
+       &                ps, phis, p, z, T, u, v, q)
+    real(rl), intent(in) :: time, x, y, hya, hyb
+    real(rl), intent(out) :: ps, phis, p, z, T, u, v, q(qsize)
+
+    real(rl), parameter :: &
+         xm = 0.25, &
+         distm = 3.d0/8.d0, &
+         h0 = 2000.d0, &
+         ztop_t = 2000.d0, &
+         z1_h = ztop_t + 1000.d0, &
+         z2_h = ztop_t + 6000.d0, &
+         z0_h = (z1_h + z2_h)/2
+
+    real(rl) :: u_topo_fac, xm_t, dist, zs, ztaper
+
+    zs = 0
+    xm_t = xm*Lx
+    ! The mountain center moves in time.
+    u_topo_fac = -Lx/tau/2.d0
+    xm_t = xm_t + sin(pi*time/tau)*(tau/pi)*u_topo_fac
+    ! Mountain shape.
+    dist = min(min(abs(x - xm_t), &
+         &         abs(x - (xm_t - Lx))), &
+         &         abs(x - (xm_t + Lx)))
+    dist = dist/Lx
+    if (dist < distm) then
+       zs = h0*(1.d0 + cos(pi*(dist/distm)))*cos(pi*(6.d0*dist))**2.d0
+    end if
+
+    zs = -zs
+    phis = g*zs
+    ps = p0 * exp(-zs/H)
+    p = hya*p0 + hyb*ps
+    z = H * log(p0/p)
+
+    if (z <= 0) then
+       ztaper = 0
+    elseif (z >= ztop_t) then
+       ztaper = 1
+    else
+       ztaper = (1 + cos(pi*(1 + z/ztop_t)))/2
+    end if
+
+    ! Simple translation in x direction.
+    u = (Lx/tau)*ztaper
+    ! Account for moving ps.
+    u = u + cos(pi*time/tau)*u_topo_fac*(1 - ztaper)
+
+    v = 0
+    T = T0
+
+    if (time > 0.d0) return
+
+    if (qsize == 0) return
+    q = 0
+    if (z < z2_h .and. z > z1_h) then
+       q(1) = 0.5d0 * (1 + cos(2.d0*pi*(z-z0_h)/(z2_h-z1_h)))
+    end if
+    if (qsize > 2) q(3) = q(1)
+    q(1) = q(1)*(1 + cos(6.d0*pi*(x/Lx)))
+    q(2) = q(1)
+    if (qsize > 3) q(3:qsize) = q(1)
+  end subroutine get_values
+  
+  subroutine print_conv_planar_advection_results(test_case, elem, tl, hvcoord, par)
+    use time_mod, only: timelevel_t
+    use parallel_mod, only: global_shared_buf, global_shared_sum, pmax_1d
+    use global_norms_mod, only: wrap_repro_sum
+
+    character(len=*), intent(in) :: test_case
+    type(element_t), intent(in) :: elem(:)
+    type(timelevel_t), intent(in) :: tl
+    type(hvcoord_t), intent(in) :: hvcoord
+    type(parallel_t), intent(in) :: par
+
+    integer :: ie, k, j, i, iq
+    real(rl) :: time, x, y, ps, phis, p, z, T, u, v, q(np,np,qsize), &
+         &      reldif, linf_num(qsize), linf_den(qsize), a, b
+    
+    ! Set time to 0 to get the initial conditions.
+    time = 0
+
+    linf_num = 0
+    linf_den = 0
+    do ie = 1,nelemd
+       global_shared_buf(ie,:2*qsize) = 0
+       do k = 1,nlev
+          do j = 1,np
+             do i = 1,np
+                x = elem(ie)%spherep(i,j)%lon
+                y = elem(ie)%spherep(i,j)%lat
+                call get_values(time, x, y, hvcoord%hyam(k), hvcoord%hybm(k), &
+                     &          ps, phis, p, z, T, u, v, q(i,j,:))
+             end do
+          end do
+          do iq = 1,qsize
+             global_shared_buf(ie,2*iq-1) = global_shared_buf(ie,2*iq-1) + &
+                  sum(elem(ie)%spheremp*(elem(ie)%state%Q(:,:,k,iq) - q(:,:,iq))**2)
+             global_shared_buf(ie,2*iq) = global_shared_buf(ie,2*iq) + &
+                  sum(elem(ie)%spheremp*q(:,:,iq)**2)
+             linf_num(iq) = max(linf_num(iq), &
+                  maxval(abs(elem(ie)%state%Q(:,:,k,iq) - q(:,:,iq))))
+             linf_den(iq) = max(linf_den(iq), &
+                  maxval(abs(q(:,:,iq))))
+          end do
+       end do
+    end do
+
+    call wrap_repro_sum(nvars=2*qsize, comm=par%comm)
+    do iq = 1, qsize
+       linf_num(iq) = pmax_1d(linf_num(iq:iq), par)
+       linf_den(iq) = pmax_1d(linf_den(iq:iq), par)
+    end do
+    
+    if (par%masterproc) then
+       write(iulog, '(a)') &
+            'planar_conv>                          l2                    linf'
+       do iq = 1,qsize
+          a = global_shared_sum(2*iq-1)
+          b = global_shared_sum(2*iq)
+          reldif = sqrt(a/b)
+          write(iulog, '(a,i2,es24.16,es24.16)') &
+               'planar_conv> Q', iq, reldif, linf_num(iq)/linf_den(iq)
+       end do
+    end if
+  end subroutine print_conv_planar_advection_results
+
+  subroutine init(test_case, hybrid, hvcoord)
+    character(len=*), intent(in):: test_case
+    type (hybrid_t), intent(in):: hybrid
+    type (hvcoord_t), intent(inout) :: hvcoord
+
+    !$omp barrier
+    !$omp master
+    ! Major and minor test case codes.
+    tc_major = test_case(17:17)
+    tc_minor = test_case(18:18) ! currently unused
+    ! Vertical dimension.
+    call get_evenly_spaced_z(zi, zm, 0.d0, ztop)
+    hvcoord%etai = exp(-zi/H)
+    call set_hybrid_coefficients(hvcoord, hybrid, hvcoord%etai(1), 1.d0)
+    call set_layer_locations(hvcoord, .true., hybrid%masterthread)
+    !$omp end master
+    !$omp barrier
+  end subroutine init
+  
+end module planar_transport_tests
+
+#endif
diff --git a/components/homme/src/theta-l_kokkos/CMakeLists.txt b/components/homme/src/theta-l_kokkos/CMakeLists.txt
index 06257590abe1..37f5bb9b604b 100644
--- a/components/homme/src/theta-l_kokkos/CMakeLists.txt
+++ b/components/homme/src/theta-l_kokkos/CMakeLists.txt
@@ -122,6 +122,7 @@ MACRO(THETAL_KOKKOS_SETUP)
     ${TEST_SRC_DIR}/dcmip2016-tropical-cyclone.F90
     ${TEST_SRC_DIR}/held_suarez_mod.F90
     ${TEST_SRC_DIR}/dry_planar.F90
+    ${TEST_SRC_DIR}/planar_transport.F90
     ${TEST_SRC_DIR}/moist_planar.F90
     ${TEST_SRC_DIR}/mtests.F90
   )
diff --git a/components/homme/test/reg_test/namelists/planar-transport-a-etm.nl b/components/homme/test/reg_test/namelists/planar-transport-a-etm.nl
new file mode 100644
index 000000000000..1fd010d68b60
--- /dev/null
+++ b/components/homme/test/reg_test/namelists/planar-transport-a-etm.nl
@@ -0,0 +1,62 @@
+! Prescribed-wind planar transport with spatially and temporally varying ps.
+
+&ctl_nl
+  nthreads          = 1
+  partmethod        = 4
+  topology          = "plane"
+  geometry          = "plane"
+  test_case         = "planar_transport_a"
+  prescribed_wind   = 1
+  theta_hydrostatic_mode = .true.
+  vert_remap_q_alg  = 10
+  moisture          = 'moist'
+  qsize             = 3
+  statefreq         = 1000
+  restartfreq       = -1
+  runtype           = 0
+  integration       = 'explicit'
+  tstep_type        = 5
+  ne_x              = 128
+  ne_y              = 5
+  planar_slice      = .true.
+  ndays             = 12
+  hypervis_order    = 2
+  hypervis_scaling  = 3.2
+  nu                = 0.01
+  nu_top            = 0.0
+  nu_p              = 42 ! to satisfy kokkos exe
+  hypervis_order    = 2                         ! 2 = hyperviscosity
+  hypervis_subcycle = 1                         ! 1 = no hypervis subcycling
+  hypervis_subcycle_tom = 1
+  limiter_option    = 9
+  tstep             = 1800
+  transport_alg       = 12
+  dt_remap_factor     = 0
+  dt_tracer_factor    = 6
+  hypervis_subcycle_q = 6
+  semi_lagrange_hv_q  = 1
+  semi_lagrange_trajectory_nsubstep  = 4        ! >0: Enhanced trajectory method
+  semi_lagrange_trajectory_nvelocity = 3        ! >2: Extra velocity snapshots
+  semi_lagrange_halo = 3
+  semi_lagrange_nearest_point_lev = 0
+/
+&vert_nl
+  vanalytic         = 1                         ! set vcoords in initialization routine
+  vtop              = 2.73919e-1                ! vertical coordinate at top of atm (z=10000m)
+/
+&analysis_nl
+  output_dir        = "./movies/"               ! destination dir for netcdf file
+  output_timeunits  = 1,                        ! 1=days, 2=hours, 0=timesteps
+  output_frequency  = 6,
+  output_varnames1  = 'Q','Q2','Q3','ps'
+  interp_type       = 0                         ! 0=native grid, 1=bilinear
+  output_type       = 'netcdf'                  ! netcdf or pnetcdf
+  num_io_procs      = 1
+  interp_nlat       = 128
+  interp_nlon       = 256
+  interp_gridtype   = 2                         ! gauss grid
+/
+&prof_inparm
+  profile_outpe_num   = 100
+  profile_single_file	= .true.
+/
diff --git a/components/homme/test/reg_test/run_tests/planar-transport-a-etm-kokkos.cmake b/components/homme/test/reg_test/run_tests/planar-transport-a-etm-kokkos.cmake
new file mode 100644
index 000000000000..17249c1e4f4d
--- /dev/null
+++ b/components/homme/test/reg_test/run_tests/planar-transport-a-etm-kokkos.cmake
@@ -0,0 +1,10 @@
+set(TEST_NAME planar-transport-a-etm-kokkos)
+set(EXEC_NAME theta-l-nlev128-native-kokkos)
+set(NAMELIST_FILES ${HOMME_ROOT}/test/reg_test/namelists/planar-transport-a-etm.nl)
+set(NUM_CPUS 16)
+
+set(NC_OUTPUT_FILES planar_transport_a1.nc)
+
+check_transport_error_norms(
+  ${TEST_NAME} "planar_conv" "${TEST_NAME}_1.out"
+  "3.9e-3;3.9e-3;4.1e-3")
diff --git a/components/homme/test/reg_test/run_tests/planar-transport-a-etm.cmake b/components/homme/test/reg_test/run_tests/planar-transport-a-etm.cmake
new file mode 100644
index 000000000000..b95c774d6c55
--- /dev/null
+++ b/components/homme/test/reg_test/run_tests/planar-transport-a-etm.cmake
@@ -0,0 +1,10 @@
+set(TEST_NAME planar-transport-a-etm)
+set(EXEC_NAME theta-l-nlev128-native)
+set(NAMELIST_FILES ${HOMME_ROOT}/test/reg_test/namelists/planar-transport-a-etm.nl)
+set(NUM_CPUS 16)
+
+set(NC_OUTPUT_FILES planar_transport_a1.nc)
+
+check_transport_error_norms(
+  ${TEST_NAME} "planar_conv" "${TEST_NAME}_1.out"
+  "3.9e-3;3.9e-3;4.1e-3")
diff --git a/components/homme/test/reg_test/run_tests/test-list.cmake b/components/homme/test/reg_test/run_tests/test-list.cmake
index c53deb98cede..09607d387dcb 100644
--- a/components/homme/test/reg_test/run_tests/test-list.cmake
+++ b/components/homme/test/reg_test/run_tests/test-list.cmake
@@ -43,10 +43,13 @@ SET(HOMME_TESTS
 
 IF (HOMME_ENABLE_COMPOSE)
   LIST(APPEND HOMME_TESTS
+    thetanh-moist-bubble-sl.cmake
+    thetanh-moist-bubble-sl-pg2.cmake
     thetah-sl-test11conv-r1t2-cdr20.cmake
     thetah-sl-test11conv-r0t1-cdr30-rrm.cmake
     thetah-sl-dcmip16_test1pg2.cmake
-    thetah-sl-testconv-3e.cmake)
+    thetah-sl-testconv-3e.cmake
+    planar-transport-a-etm.cmake)
 ENDIF()
 
 SET(HOMME_RUN_TESTS_DIR ${HOMME_SOURCE_DIR}/test/reg_test/run_tests)
@@ -92,34 +95,34 @@ IF (BUILD_HOMME_THETA_KOKKOS)
   # Various one-off tests.
   IF (HOMME_ENABLE_COMPOSE)
     LIST(APPEND HOMME_TESTS
-      thetah-sl-test11conv-r0t1-cdr30-rrm-kokkos.cmake
-      thetah-sl-testconv-3e-kokkos.cmake)
+      thetah-sl-testconv-3e-kokkos.cmake
+      planar-transport-a-etm-kokkos.cmake)
     IF (HOMMEXX_BFB_TESTING)
+      LIST(APPEND HOMME_TESTS
+        thetah-sl-test11conv-r0t1-cdr30-rrm-kokkos.cmake
+        thetanh-moist-bubble-sl-kokkos.cmake
+        thetanh-moist-bubble-sl-pg2-kokkos.cmake)
       LIST(APPEND HOMME_ONEOFF_CVF_TESTS
-        thetah-sl-test11conv-r0t1-cdr30-rrm)
+        thetah-sl-test11conv-r0t1-cdr30-rrm
+        thetanh-moist-bubble-sl
+        thetanh-moist-bubble-sl-pg2)
     ENDIF()
   ENDIF()
-  LIST(APPEND HOMME_TESTS
-    thetanh-moist-bubble-kokkos.cmake
-    thetanh-dry-bubble-kokkos.cmake
-    thetah-nhgw-kokkos.cmake
-    thetanh-nhgw-kokkos.cmake
-    thetah-nhgw-slice-kokkos.cmake
-    thetanh-nhgw-slice-kokkos.cmake
-    thetanh-moist-bubble-sl.cmake
-    thetanh-moist-bubble-sl-kokkos.cmake
-    thetanh-moist-bubble-sl-pg2.cmake
-    thetanh-moist-bubble-sl-pg2-kokkos.cmake)
   IF (HOMMEXX_BFB_TESTING)
+    LIST(APPEND HOMME_TESTS
+      thetanh-moist-bubble-kokkos.cmake
+      thetanh-dry-bubble-kokkos.cmake
+      thetah-nhgw-kokkos.cmake
+      thetanh-nhgw-kokkos.cmake
+      thetah-nhgw-slice-kokkos.cmake
+      thetanh-nhgw-slice-kokkos.cmake)
     LIST(APPEND HOMME_ONEOFF_CVF_TESTS
       thetanh-moist-bubble
       thetanh-dry-bubble
       thetah-nhgw
       thetanh-nhgw
       thetah-nhgw-slice
-      thetanh-nhgw-slice
-      thetanh-moist-bubble-sl
-      thetanh-moist-bubble-sl-pg2)
+      thetanh-nhgw-slice)
   ENDIF()
 
   #cmake/namelist will be built with create-... script
diff --git a/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e-kokkos.cmake b/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e-kokkos.cmake
index 4d87d00d6552..aabd4aaf228b 100644
--- a/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e-kokkos.cmake
+++ b/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e-kokkos.cmake
@@ -9,3 +9,7 @@ SET(NAMELIST_FILES ${HOMME_ROOT}/test/reg_test/namelists/thetah-sl-testconv-3e.n
 
 # compare all of these files against baselines:
 SET(NC_OUTPUT_FILES dcmip2012_test1_3e_conv1.nc)
+
+check_transport_error_norms(
+  ${TEST_NAME} "test1_conv" "${TEST_NAME}_1.out"
+  "9.1e-2;1.1e-1;4.0e-2;8.1e-2")
diff --git a/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e.cmake b/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e.cmake
index 6eaff42f94d2..0e97b59daa0f 100644
--- a/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e.cmake
+++ b/components/homme/test/reg_test/run_tests/thetah-sl-testconv-3e.cmake
@@ -9,3 +9,8 @@ SET(NAMELIST_FILES ${HOMME_ROOT}/test/reg_test/namelists/thetah-sl-testconv-3e.n
 
 # compare all of these files against baselines:
 SET(NC_OUTPUT_FILES dcmip2012_test1_3e_conv1.nc)
+
+check_transport_error_norms(
+  ${TEST_NAME} "test1_conv" "${TEST_NAME}_1.out"
+  "9.1e-2;1.1e-1;4.0e-2;8.1e-2")
+ 
\ No newline at end of file
diff --git a/components/homme/test_execs/CMakeLists.txt b/components/homme/test_execs/CMakeLists.txt
index b67f286cd024..01a74e98766b 100644
--- a/components/homme/test_execs/CMakeLists.txt
+++ b/components/homme/test_execs/CMakeLists.txt
@@ -210,7 +210,7 @@ IF(${BUILD_HOMME_THETA})
 #  ADD_SUBDIRECTORY(theta-l-nlev30-native)
 #  ADD_SUBDIRECTORY(theta-l-nlev64-native)
 #  ADD_SUBDIRECTORY(theta-l-nlev100-native)
-#  ADD_SUBDIRECTORY(theta-l-nlev128-native)
+  ADD_SUBDIRECTORY(theta-l-nlev128-native)
 #  ADD_SUBDIRECTORY(theta-l-nlev140-native)
 #  ADD_SUBDIRECTORY(theta-l-nlev150-native)
 #  ADD_SUBDIRECTORY(theta-l-nlev200-native)
@@ -237,6 +237,7 @@ IF (${BUILD_HOMME_THETA_KOKKOS})
     # For doubly periodic tests.
     ADD_SUBDIRECTORY(theta-l-nlev20-native-kokkos)
     ADD_SUBDIRECTORY(theta-l-nlev10-native-kokkos)
+    ADD_SUBDIRECTORY(theta-l-nlev128-native-kokkos)
 
     # This one we build it only if both theta and theta_kokkos are requested,
     # since it's just needed for perf/correctness comparisons
@@ -271,7 +272,7 @@ INCLUDE(${HOMME_SOURCE_DIR}/test/reg_test/run_tests/test-list.cmake)
 SET(TEST_DIR_LIST "")
 createTests(HOMME_TESTS)
 
-IF (${BUILD_HOMME_PREQX_KOKKOS})
+IF (HOMMEXX_BFB_TESTING AND BUILD_HOMME_PREQX_KOKKOS)
   MESSAGE("-- Generating preqx tests no more intensive than profile ${HOMME_TESTING_PROFILE}.\n"
     "   Example ctest lines:\n"
     "     ctest -LE \"nightly\": All tests except nightly.\n"
@@ -283,7 +284,7 @@ IF (${BUILD_HOMME_PREQX_KOKKOS})
     CREATE_CXX_VS_F90_TESTS_WITH_PROFILE(PREQX_COMPARE_F_C_TEST ${p})
   ENDFOREACH ()
 ENDIF()
-IF (${BUILD_HOMME_THETA_KOKKOS})
+IF (HOMMEXX_BFB_TESTING AND BUILD_HOMME_THETA_KOKKOS)
   MESSAGE("-- Generating theta tests no more intensive than profile ${HOMME_TESTING_PROFILE}.\n"
     "   Example ctest lines:\n"
     "     ctest -LE \"nightly\": All tests except nightly.\n"
diff --git a/components/homme/test_execs/theta-l-nlev128-native-kokkos/CMakeLists.txt b/components/homme/test_execs/theta-l-nlev128-native-kokkos/CMakeLists.txt
new file mode 100644
index 000000000000..351b1037afa1
--- /dev/null
+++ b/components/homme/test_execs/theta-l-nlev128-native-kokkos/CMakeLists.txt
@@ -0,0 +1,3 @@
+THETAL_KOKKOS_SETUP()
+#              name                         target         NP NC PLEV USE_PIO WITH_ENERGY QSIZE_D
+createTestExec(theta-l-nlev128-native-kokkos theta-l_kokkos  4  4   128    TRUE       FALSE       3)
diff --git a/components/homme/test_execs/thetal_kokkos_ut/compose_interface.F90 b/components/homme/test_execs/thetal_kokkos_ut/compose_interface.F90
index 4e11b0c54183..4a9e708b8261 100644
--- a/components/homme/test_execs/thetal_kokkos_ut/compose_interface.F90
+++ b/components/homme/test_execs/thetal_kokkos_ut/compose_interface.F90
@@ -43,8 +43,8 @@ subroutine init_compose_f90(ne, hyai, hybi, hyam, hybm, ps0, dvv, mp, qsize_in,
     vert_remap_q_alg = 10
     qsplit = 1
     rsplit = 1
-    dt_tracer_factor = -1
-    dt_remap_factor = -1
+    dt_tracer_factor = 1
+    dt_remap_factor = 1
     theta_hydrostatic_mode = .true.
     semi_lagrange_nearest_point_lev = -1
     if (nearest_point) semi_lagrange_nearest_point_lev = 100000
@@ -144,7 +144,7 @@ subroutine cleanup_compose_f90() bind(c)
     call cleanup_f90()
   end subroutine cleanup_compose_f90
 
-  subroutine run_compose_standalone_test_f90(nmax_out, eval) bind(c)
+  subroutine run_compose_standalone_test_f90(nmax_out, eval, nerr) bind(c)
     use thetal_test_interface, only: deriv, hvcoord
     use compose_test_mod, only: compose_test
     use domain_mod, only: domain1d_t
@@ -155,6 +155,7 @@ subroutine run_compose_standalone_test_f90(nmax_out, eval) bind(c)
 
     integer(c_int), intent(inout) :: nmax_out
     real(c_double), intent(out) :: eval((nlev+1)*qsize)
+    integer(c_int), intent(out) :: nerr
 
     type (domain1d_t), pointer :: dom_mt(:)
     real(real_kind) :: buf((nlev+1)*qsize)
@@ -174,7 +175,7 @@ subroutine run_compose_standalone_test_f90(nmax_out, eval) bind(c)
        nmax = nmax_out
     end if
     statefreq = 2*ne
-    call compose_test(par, hvcoord, dom_mt, elem, buf)
+    call compose_test(par, hvcoord, dom_mt, elem, nerr, buf)
     do i = 1,size(buf)
        eval(i) = buf(i)
     end do
diff --git a/components/homme/test_execs/thetal_kokkos_ut/compose_ut.cpp b/components/homme/test_execs/thetal_kokkos_ut/compose_ut.cpp
index 85a300567665..22e235c38576 100644
--- a/components/homme/test_execs/thetal_kokkos_ut/compose_ut.cpp
+++ b/components/homme/test_execs/thetal_kokkos_ut/compose_ut.cpp
@@ -41,7 +41,7 @@ extern "C" {
                         bool nearest_point, int halo, int traj_nsubstep);
   void init_geometry_f90();
   void cleanup_compose_f90();
-  void run_compose_standalone_test_f90(int* nmax, Real* eval);
+  void run_compose_standalone_test_f90(int* nmax, Real* eval, int* nerr);
   void run_trajectory_f90(Real t0, Real t1, bool independent_time_steps, Real* dep,
                           Real* dprecon);
   void run_sl_vertical_remap_bfb_f90(Real* diagnostic);
@@ -136,8 +136,8 @@ struct Session {
     p.remap_alg = RemapAlg::PPM_LIMITED_EXTRAP;
     p.qsplit = 1;
     p.rsplit = 1;
-    p.dt_tracer_factor = -1;
-    p.dt_remap_factor = -1;
+    p.dt_tracer_factor = 1;
+    p.dt_remap_factor = 1;
     p.params_set = true;
     p.theta_hydrostatic_mode = true;
     p.scale_factor = is_sphere ? PhysicalConstants::rearth0 : 1;
@@ -394,9 +394,10 @@ TEST_CASE ("compose_transport_testing") {
   do { // breakable
 
   if (s.run_only_advection_test) {
-    int nmax = s.nmax;
+    int nmax = s.nmax, nerr;
     std::vector<Real> eval_f((s.nlev+1)*s.qsize);
-    run_compose_standalone_test_f90(&nmax, eval_f.data());
+    run_compose_standalone_test_f90(&nmax, eval_f.data(), &nerr);
+    REQUIRE(nerr == 0);
     break;
   }
 
@@ -451,9 +452,10 @@ TEST_CASE ("compose_transport_testing") {
   }
 
   { // 2D SL BFB
-    int nmax = s.nmax;
+    int nmax = s.nmax, nerr;
     std::vector<Real> eval_f((s.nlev+1)*s.qsize), eval_c(eval_f.size());
-    run_compose_standalone_test_f90(&nmax, eval_f.data());
+    run_compose_standalone_test_f90(&nmax, eval_f.data(), &nerr);
+    REQUIRE(nerr == 0);
     for (const bool bfb : {false, true}) {
 #if ! defined HOMMEXX_BFB_TESTING
       if (bfb) continue;