diff --git a/CMakeLists.txt b/CMakeLists.txt
index d79d4547e..be26cdd1c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,6 +100,17 @@ if (TARGET MADworld)
   message(STATUS "MADNESS_FOUND=1")
 endif(TARGET MADworld)
 
+
+##########################
+#### CUDA
+##########################
+check_language(CUDA)
+if(CMAKE_CUDA_COMPILER)
+    enable_language(CUDA)
+endif(CMAKE_CUDA_COMPILER)
+set(TTG_HAVE_CUDA ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if TTG supports compiling .cu files")
+
+
 ##########################
 #### Examples
 ##########################
diff --git a/examples/madness/mrattg.cc b/examples/madness/mrattg.cc
index 47bce1ce9..2b2d6c113 100644
--- a/examples/madness/mrattg.cc
+++ b/examples/madness/mrattg.cc
@@ -124,6 +124,46 @@ auto make_project(functorT& f,
   return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"});
 }
 
+
+/// Returns an std::unique_ptr to the object
+template <typename functorT, typename T, size_t K, Dimension NDIM>
+auto make_project_device(functorT& f,
+                         const T thresh,  /// should be scalar value not complex
+                         ctlEdge<NDIM>& ctl, rnodeEdge<T, K, NDIM>& result, const std::string& name = "project") {
+  auto F = [f, thresh](const Key<NDIM>& key, std::tuple<ctlOut<NDIM>, rnodeOut<T, K, NDIM>>& out) {
+    FunctionReconstructedNode<T, K, NDIM> node(key);  // Our eventual result
+    auto& coeffs = node.coeffs;                       // Need to clean up OO design
+    bool is_leaf;
+
+    if (key.level() < initial_level(f)) {
+      for (auto child : children(key)) ttg::sendk<0>(child, out);
+      coeffs = T(1e7);  // set to obviously bad value to detect incorrect use
+      is_leaf = false;
+    } else if (is_negligible<functorT, T, NDIM>(f, Domain<NDIM>::template bounding_box<T>(key),
+                                                truncate_tol(key, thresh))) {
+      coeffs = T(0.0);
+      is_leaf = true;
+    } else {
+      auto node_view  = ttg::make_view(node, ttg::ViewScope::Out); // no need to move node onto the device
+      auto is_leaf_view = ttg::make_view(is_leaf, ttg::ViewScope::Out);
+      co_await ttg::device::wait_views{};
+      fcoeffs<functorT, T, K>(f, key, thresh,
+                              node_view.get_device_ptr<0>(),
+                              is_leaf_view.get_device_ptr<0>());  // cannot deduce K
+      co_await ttg::device::wait_kernel{};
+      if (!is_leaf) {
+        for (auto child : children(key)) ttg::sendk<0>(child, out);  // should be broadcast ?
+      }
+    }
+    node.is_leaf = is_leaf;
+    ttg::send<1>(key, node, out);  // always produce a result
+  };
+  ctlEdge<NDIM> refine("refine");
+  return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"});
+}
+
+
+
 namespace detail {
   template <typename T, size_t K, Dimension NDIM>
   struct tree_types {};
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 47dc543e6..73d8a152a 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -2,16 +2,18 @@ include(AddTTGExecutable)
 
 # TT unit test: core TTG ops
 set(ut_src
-        fibonacci.cc
-        device_coro.cc
-        ranges.cc
-        tt.cc
-        unit_main.cpp)
+        #fibonacci.cc
+        #ranges.cc
+        #tt.cc
+        unit_main.cpp
+    )
 set(ut_libs Catch2::Catch2)
-if (TARGET std::coroutine)
-    list(APPEND ut_src fibonacci-coro.cc)
+#if (TARGET std::coroutine)
+    #list(APPEND ut_src fibonacci-coro.cc)
+    list(APPEND ut_src device_coro.cc)
+    list(APPEND ut_src cuda_kernel.cu)
     list(APPEND ut_libs std::coroutine)
-endif()
+#endif()
 add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}")
 
 
diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc
index 1e35c4586..11ed77d91 100644
--- a/tests/unit/device_coro.cc
+++ b/tests/unit/device_coro.cc
@@ -3,12 +3,154 @@
 #include "ttg.h"
 #include "ttg/view.h"
 
+#include "cuda_kernel.h"
+
+struct value_t {
+  ttg::buffer<double> db; // TODO: rename
+  int quark;
+
+  template<typename Archive>
+  void ttg_serialize(Archive& ar) {
+    ar& quark;
+    ar& db; // input:
+  }
+};
+
+/* devicebuf is non-POD so provide serialization
+ * information for members not a devicebuf */
+namespace madness::archive {
+  template <class Archive>
+  struct ArchiveSerializeImpl<Archive, value_t> {
+    static inline void serialize(const Archive& ar, value_t& obj) { ar& obj.quark & obj.db; };
+  };
+}  // namespace madness::archive
+
+
 
 TEST_CASE("Device", "coro") {
+
+  SECTION("devicebuf") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device_task {
+      ttg::print("device_task key ", key);
+      /* wait for the view to be available on the device */
+      co_await ttg::to_device(val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(val.db.current_device_ptr() != nullptr);
+
+      /* NO KERNEL */
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::wait_kernel();
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 1) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        ttg::send<0>(key+1, std::move(val));
+      }
+    };
+
+    //ptr.get_view<ttg::ExecutionSpace::CUDA>(device_id);
+
+    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
+                                                      "device_task", {"edge_in"}, {"edge_out"});
+    make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("scratch") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device_task {
+      double scratch = 0.0;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncOut);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::to_device(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr()  != nullptr);
+
+      /* call a kernel */
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::wait_kernel();
+
+      /* buffer is increment once per task, so it should be the same as key */
+      CHECK(static_cast<int>(scratch) == key);
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        ttg::send<0>(key+1, std::move(val));
+      }
+    };
+
+    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
+                                                      "device_task", {"edge_in"}, {"edge_out"});
+    make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("ptr") {
+
+    ttg::Edge<int, value_t> edge;
+    ttg::Ptr<value_t> ptr;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device_task {
+      double scratch = 1.0;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncOut);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::to_device(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr()  != nullptr);
+
+      /* KERNEL */
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+
+      /* here we suspend to wait for a kernel and the out-transfer to complete */
+      co_await ttg::wait_kernel_out(val.db);
+
+      /* buffer is increment once per task, so it should be the same as key */
+      CHECK(static_cast<int>(scratch) == key);
+      CHECK(static_cast<int>(*val.db.host_ptr()) == key);
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10 || scratch < 0.0) {
+        ttg::send<0>(key+1, std::move(val));
+      } else {
+        /* exfiltrate the value */
+        /* TODO: what consistency do we expect from get_ptr? */
+        ptr = ttg::get_ptr(val);
+      }
+    };
+
+    //ptr.get_view<ttg::ExecutionSpace::CUDA>(device_id);
+
+    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
+                                                      "device_task", {"edge_in"}, {"edge_out"});
+    make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+    CHECK(ptr.is_valid());
+
+    /* feed the ptr back into a graph */
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(11, ptr);
+    ttg::ttg_fence(ttg::default_execution_context());
+
+    ptr.reset();
+  }
+
+
+#if 0
+
   SECTION("device_task") {
     ttg::Edge<int, double> edge;
     auto fn = [&](const int& key, double&& val) -> ttg::device_task {
-      ttg::View<double, double> view = ttg::make_view(val, ttg::ViewScope::SyncInOut);
+      ttg::View<double> view = ttg::make_view(val, ttg::ViewScope::SyncInOut);
       ttg::print("device_task key ", key, ", value ", val);
       /* wait for the view to be available on the device */
       co_yield view;
@@ -27,7 +169,7 @@ TEST_CASE("Device", "coro") {
 
       /* we're back, the kernel executed and we can send */
       if (key < 10) {
-        ttg::send<0>(key+1, val);
+        ttg::send<0>(key+1, std::move(val));
       }
     };
     auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
@@ -37,7 +179,154 @@ TEST_CASE("Device", "coro") {
     ttg::ttg_fence(ttg::default_execution_context());
   }
 
+  SECTION("get_ptr") {
+    ttg::Edge<int, double> edge;
+    ttg::ptr<double> ptr;
+    auto fn = [&](const int& key, double&& val) -> ttg::device_task {
+      ttg::View<double> view = ttg::make_view(val, ttg::ViewScope::SyncInOut);
+      ttg::print("device_task key ", key, ", value ", val);
+      /* wait for the view to be available on the device */
+      co_yield view;
+      // co_yield std::tie(view1, view2);
+      // TTG_WAIT_VIEW(view);
+      /* once we're back here the data has been transferred */
+      //CHECK(view.get_rw_device_ptr<0>()  != nullptr);
+      CHECK(view.get_device_ptr<0>()  != nullptr);
+      CHECK(view.get_device_size<0>() == sizeof(val));
+      CHECK(&view.get_host_object() == &val);
+
+      ttg::print("device_task key ", key, ", device pointer ", view.get_device_ptr<0>());
+
+      /* here we suspend to wait for a kernel to complete */
+      co_yield ttg::device_op_wait_kernel{};
+      // TTG_WAIT_KERNEL();
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        ttg::send<0>(key+1, std::move(val));
+      } else {
+        /* exfiltrate the value */
+        ptr = ttg::get_ptr(val);
+      }
+    };
+
+    //ptr.get_view<ttg::ExecutionSpace::CUDA>(device_id);
+
+    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
+                                                      "device_task", {"edge_in"}, {"edge_out"});
+    make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, 0.0);
+    ttg::ttg_fence(ttg::default_execution_context());
+
 #if 0
+    /* feed the host-side value back into the graph */
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, pview.get_ptr());
+    ttg::ttg_fence(ttg::default_execution_context());
+
+    /* feed the device-side value back into the graph */
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, pview);
+    ttg::ttg_fence(ttg::default_execution_context());
+#endif // 0
+  }
+
+
+
+
+  SECTION("device_task") {
+    ttg::Edge<int, double> edge;
+    auto fn = [&](const int& key, double&& val) -> ttg::device_task {
+      // will schedule the view for transfer to and from the device
+      ttg::View<double> view = ttg::make_view(val, ttg::ViewScope::SyncInOut);
+      ttg::print("device_task key ", key, ", value ", val);
+      /* wait for the view to be available on the device */
+      co_await ttg::device_task_wait_views{};
+
+      /* once we're back here the data has been transferred */
+      CHECK(view.get_device_ptr<0>()  != nullptr);
+      CHECK(view.get_device_size<0>() == sizeof(val));
+      CHECK(&view.get_host_object() == &val);
+
+      ttg::print("device_task key ", key, ", device pointer ", view.get_device_ptr<0>());
+
+      while (val < 10.0) {
+
+        view.set_scope(ttg::ViewScope::SyncOut);
+
+        /* <submit kernel here> */
+
+        /* here we suspend to wait for a kernel to complete */
+        co_await ttg::device_task_wait_kernel{};
+
+        // TTG_WAIT_KERNEL();
+      }
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        ttg::send<0>(key+1, val);
+      }
+    };
+    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
+                                                      "device_task", {"edge_in"}, {"edge_out"});
+    make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, 0.0);
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  struct A {
+    double norm;
+    std::vector<double> d;
+  };
+
+  SECTION("device_task") {
+    ttg::Edge<int, double> edge;
+    auto fn = [&](const int& key, ttg::ptr<A>&& a) -> ttg::device_task {
+      // will schedule the view for transfer to and from the device
+      View<double> norm_view = a.to_host(&A::norm);
+
+      View<A::norm> norm_view{a};
+      co_await ttg::device::wait_transfer{};
+
+
+
+
+      if (val)
+      val += 1.0;
+      ptr.sync_to_device();
+      /* wait for the view to be available on the device */
+      co_await ttg::device_task_wait_views{};
+
+      /* once we're back here the data has been transferred */
+      CHECK(view.get_device_ptr<0>()  != nullptr);
+      CHECK(view.get_device_size<0>() == sizeof(val));
+      CHECK(&view.get_host_object() == &val);
+
+      ttg::print("device_task key ", key, ", device pointer ", view.get_device_ptr<0>());
+
+      while (val < 10.0) {
+
+        view.set_scope(ttg::ViewScope::SyncOut);
+
+        /* <submit kernel here> */
+
+        /* here we suspend to wait for a kernel to complete */
+        co_await ttg::device_task_wait_kernel{};
+
+        // TTG_WAIT_KERNEL();
+      }
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        ttg::send<0>(key+1, val);
+      }
+    };
+    auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
+                                                      "device_task", {"edge_in"}, {"edge_out"});
+    make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, 0.0);
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
   struct A {
     int a[10];
     double b[10];
@@ -48,7 +337,7 @@ TEST_CASE("Device", "coro") {
     ttg::Edge<int, double> edge;
     auto fn = [](const int& key, A&& val) -> ttg::device_task {
       auto view = ttg::make_view(val, {val.a, 10, ttg::ViewScope::SyncIn},
-                                      {val.b, 10, ttg::ViewScope::SyncIn|ttg::ViewScope::SyncOut});
+                                      {val.b, 10, ttg::ViewScope::SyncIn});
       /* wait for the view to be available on the device */
       co_yield view;
       // co_yield std::tie(view1, view2);
diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index 05231ef6f..d30b68da4 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -44,12 +44,16 @@ configure_file(
 )
 set(ttg-impl-headers
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/execution.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/func.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/fwd.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/impl_selector.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/tt.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ttg/ptr.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/reduce.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/run.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/runtimes.h
@@ -203,8 +207,13 @@ endif(TARGET MADworld)
 ########################
 if (TARGET PaRSEC::parsec)
   set(ttg-parsec-headers
+          ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/buffer.h
+          ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/devicescratch.h
           ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/fwd.h
           ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/import.h
+          ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ptr.h
+          ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/task.h
+          ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/thread_local.h
           ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg.h
           ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg_data_copy.h
           )
diff --git a/ttg/ttg.h b/ttg/ttg.h
index e0fa9a702..09ccfaf90 100644
--- a/ttg/ttg.h
+++ b/ttg/ttg.h
@@ -27,6 +27,11 @@
 
 #include "ttg/edge.h"
 
+#include "ttg/ptr.h"
+#include "ttg/buffer.h"
+#include "ttg/devicescratch.h"
+#include "ttg/devicescope.h"
+
 #if defined(TTG_USE_PARSEC)
 #include "ttg/parsec/ttg.h"
 #elif defined(TTG_USE_MADNESS)
diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h
new file mode 100644
index 000000000..1868b7e0c
--- /dev/null
+++ b/ttg/ttg/buffer.h
@@ -0,0 +1,31 @@
+#ifndef TTG_BUFFER_H
+#define TTG_BUFFER_H
+
+#include <memory>
+#include "ttg/impl_selector.h"
+
+
+namespace ttg {
+
+template<typename T>
+using buffer = TTG_IMPL_NS::buffer<T>;
+
+namespace detail {
+  template<typename T>
+  struct is_buffer : std::false_type
+  { };
+
+  template<typename T>
+  struct is_buffer<ttg::buffer<T>> : std::true_type
+  { };
+
+  template<typename T>
+  constexpr bool is_buffer_v = is_buffer<T>::value;
+
+  static_assert(is_buffer_v<ttg::buffer<double>>);
+  static_assert(is_buffer_v<TTG_IMPL_NS::buffer<double>>);
+} // namespace detail
+
+} // namespace ttg
+
+#endif // TTG_buffer_H
\ No newline at end of file
diff --git a/ttg/ttg/devicescope.h b/ttg/ttg/devicescope.h
new file mode 100644
index 000000000..a8427fb56
--- /dev/null
+++ b/ttg/ttg/devicescope.h
@@ -0,0 +1,15 @@
+#ifndef TTG_DEVICESCOPE_H
+#define TTG_DEVICESCOPE_H
+
+namespace ttg {
+  enum class scope {
+    Allocate     = 0x0,  //< memory allocated as scratch, but not moved in or out
+    SyncIn       = 0x2,  //< data will be allocated on and transferred to device
+                         //< if latest version resides on the device (no previous sync-out) the data will
+                         //< not be transferred again
+    SyncOut      = 0x4,  //< value will be transferred from device to host after kernel completes
+    SyncInOut    = 0x8,  //< data will be moved in and synchronized back out after the kernel completes
+  };
+} // namespace ttg
+
+#endif // TTG_DEVICESCOPE_H
\ No newline at end of file
diff --git a/ttg/ttg/devicescratch.h b/ttg/ttg/devicescratch.h
new file mode 100644
index 000000000..1478d82cf
--- /dev/null
+++ b/ttg/ttg/devicescratch.h
@@ -0,0 +1,19 @@
+#ifndef TTG_DEVICESCRATCH_H
+#define TTG_DEVICESCRATCH_H
+
+#include "ttg/devicescope.h"
+#include "ttg/impl_selector.h"
+
+namespace ttg {
+
+template<typename T>
+using devicescratch = TTG_IMPL_NS::devicescratch<T>;
+
+template<typename T>
+auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) {
+  return devicescratch<T>(val, scope, 1);
+}
+
+} // namespace ttg
+
+#endif // TTG_DEVICESCRATCH_H
\ No newline at end of file
diff --git a/ttg/ttg/fwd.h b/ttg/ttg/fwd.h
index df32505d0..f9b8d1c0f 100644
--- a/ttg/ttg/fwd.h
+++ b/ttg/ttg/fwd.h
@@ -47,6 +47,7 @@ namespace ttg {
   template <typename... RestOfArgs>
   void initialize(int argc, char **argv, int num_threads = -1, RestOfArgs &&...);
   void finalize();
+  [[noreturn]]
   void abort();
   World default_execution_context();
   void execute(ttg::World world);
diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
new file mode 100644
index 000000000..6d1a0c16f
--- /dev/null
+++ b/ttg/ttg/parsec/buffer.h
@@ -0,0 +1,388 @@
+#ifndef TTG_PARSEC_BUFFER_H
+#define TTG_PARSEC_BUFFER_H
+
+// TODO: replace with short vector
+#define TTG_PARSEC_MAX_NUM_DEVICES 4
+
+#include <array>
+#include <parsec.h>
+#include <parsec/data_internal.h>
+#include <parsec/mca/device/device.h>
+#include "ttg/parsec/ttg_data_copy.h"
+
+namespace ttg_parsec {
+
+
+namespace detail {
+  // fwd decl
+  template<typename T>
+  parsec_data_t* get_parsec_data(const ttg_parsec::buffer<T>& db);
+} // namespace detail
+
+/**
+ * A buffer that is mirrored between host memory
+ * and different devices. The runtime is free to
+ * move data between device and host memory based
+ * on where the tasks are executing.
+ *
+ * Note that a buffer is movable and should not
+ * be shared between two objects (e.g., through a pointer)
+ * in order for TTG to properly facilitate ownership
+ * tracking of the containing object.
+ */
+template<typename T>
+struct buffer {
+
+  using element_type = std::decay_t<T>;
+
+  static_assert(std::is_trivially_copyable_v<element_type>,
+                "Only trivially copyable types are supported for devices.");
+  static_assert(std::is_default_constructible_v<element_type>,
+                "Only default constructible types are supported for devices.");
+
+private:
+  using delete_fn_t = std::add_pointer_t<void(element_type*)>;
+
+  using parsec_data_ptr = std::unique_ptr<parsec_data_t, decltype(&parsec_data_destroy)>;
+  using host_data_ptr   = std::unique_ptr<element_type[], delete_fn_t>;
+  parsec_data_ptr m_data;
+  host_data_ptr m_host_data;
+  std::size_t m_count = 0;
+  detail::ttg_data_copy_t *m_ttg_copy = nullptr;
+
+  static void delete_owned(element_type *ptr) {
+    delete[] ptr;
+  }
+
+  static void delete_non_owned(element_type *ptr) {
+    // nothing to be done, we don't own the memory
+  }
+
+  static void delete_parsec_data(parsec_data_t *data) {
+    std::cout << "delete parsec_data " << data << std::endl;
+    parsec_data_destroy(data);
+  }
+
+  static void delete_null_parsec_data(parsec_data_t *) {
+    // nothing to be done, only used for nullptr
+  }
+
+  void create_host_copy() {
+    /* create a new copy for the host object */
+    parsec_data_copy_t* copy;
+    copy = parsec_data_copy_new(m_data.get(), 0, parsec_datatype_int8_t, PARSEC_DATA_FLAG_PARSEC_MANAGED);
+    copy->device_private = m_host_data.get();
+    copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
+    copy->version = 1; // this version is valid
+    m_data->nb_elts = sizeof(element_type)*m_count;
+    m_data->owner_device = 0;
+    /* register the new data with the host copy */
+    if (nullptr != m_ttg_copy) {
+      m_ttg_copy->add_device_data(m_data.get());
+    }
+  }
+
+  void reset() {
+    if (m_data) {
+      if (nullptr != m_ttg_copy) {
+        m_ttg_copy->remove_device_data(m_data.get());
+      }
+      m_data.reset();
+      m_count = 0;
+    }
+  }
+
+  friend parsec_data_t* detail::get_parsec_data<T>(const ttg_parsec::buffer<T>&);
+
+public:
+
+  /* The device ID of the CPU. */
+  static constexpr int cpu_device = 0;
+
+  buffer() : buffer(1)
+  { }
+
+  buffer(std::size_t count)
+  : m_data(parsec_data_new(), &delete_parsec_data)
+  , m_host_data(new element_type[count](), &delete_owned)
+  , m_count(count)
+  , m_ttg_copy(detail::ttg_data_copy_container())
+  {
+    create_host_copy();
+  }
+
+  /* Constructing a buffer using application-managed memory.
+   * The memory pointed to by ptr must be accessible during
+   * the life-time of the buffer. */
+  buffer(element_type* ptr, std::size_t count = 1)
+  : m_data(parsec_data_new(), &parsec_data_destroy)
+  , m_host_data(ptr, &delete_non_owned)
+  , m_count(count)
+  , m_ttg_copy(detail::ttg_data_copy_container())
+  {
+    create_host_copy();
+  }
+
+  ~buffer() {
+    unpin(); // make sure the copies are not pinned
+    /* remove the tracked copy */
+    if (nullptr != m_ttg_copy && m_data) {
+      m_ttg_copy->remove_device_data(m_data.get());
+    }
+  }
+
+  /* allow moving device buffers */
+  buffer(buffer&& db)
+  : m_data(std::move(db.m_data))
+  , m_host_data(std::move(db.m_host_data))
+  , m_count(db.m_count)
+  {
+    db.m_count = 0;
+    /* don't update the ttg_copy, we keep the connection */
+  }
+
+  /* copy the host data but leave the devices untouched */
+  buffer(const buffer& db)
+  : m_data(db.m_count ? parsec_data_new() : nullptr,
+           db.m_count ? &parsec_data_destroy : &delete_null_parsec_data)
+  , m_host_data(db.m_count ? new element_type[db.m_count] : nullptr,
+                db.m_count ? &delete_owned : delete_non_owned)
+  , m_count(db.m_count)
+  , m_ttg_copy(detail::ttg_data_copy_container())
+  {
+    /* copy host data */
+    std::copy(db.m_host_data.get(),
+              db.m_host_data.get() + m_count,
+              m_host_data.get());
+    /* create the host copy with the allocated memory */
+    create_host_copy();
+  }
+
+  /* allow moving device buffers */
+  buffer& operator=(buffer&& db) {
+    m_data = std::move(db.m_data);
+    m_host_data = std::move(db.m_host_data);
+    m_count = db.m_count;
+    db.m_count = 0;
+    /* don't update the ttg_copy, we keep the connection */
+  }
+
+  /* copy the host buffer content but leave the devices untouched */
+  buffer& operator=(const buffer& db) {
+    if (db.m_count == 0) {
+      m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data);
+      m_host_data = host_data_ptr(nullptr, &delete_non_owned);
+    } else {
+      m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy);
+      m_host_data = host_data_ptr(new element_type[db.m_count], &delete_owned);
+      /* copy host data */
+      std::copy(db.m_host_data.get(),
+                db.m_host_data.get() + db.m_count,
+                m_host_data.get());
+      /* create the host copy with the allocated memory */
+      create_host_copy();
+    }
+    m_count = db.m_count;
+  }
+
+  /* set the current device, useful when a device
+   * buffer was modified outside of a TTG */
+  void set_current_device(int device_id) {
+    assert(is_valid());
+    /* make sure it's a valid device */
+    assert(parsec_nb_devices > device_id);
+    /* make sure it's a valid copy */
+    assert(m_data->device_copies[device_id] != nullptr);
+    m_data->owner_device = device_id;
+  }
+
+  /* get the current device ID, i.e., the last updated
+   * device buffer.  */
+  int get_current_device() const {
+    assert(is_valid());
+    return m_data->owner_device;
+  }
+
+  /* get the current device pointer */
+  element_type* current_device_ptr() {
+    assert(is_valid());
+    return static_cast<element_type*>(m_data->device_copies[m_data->owner_device]->device_private);
+  }
+
+  /* get the current device pointer */
+  const element_type* current_device_ptr() const {
+    assert(is_valid());
+    return static_cast<element_type*>(m_data->device_copies[m_data->owner_device]->device_private);
+  }
+
+  /* get the device pointer at the given device
+   * \sa cpu_device
+   */
+  element_type* device_ptr_on(int device_id) {
+    assert(is_valid());
+    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
+  }
+
+  /* get the device pointer at the given device
+   * \sa cpu_device
+   */
+  const element_type* device_ptr_on(int device_id) const {
+    assert(is_valid());
+    return static_cast<element_type*>(parsec_data_get_ptr(m_data.get(), device_id));
+  }
+
+  element_type* host_ptr() {
+    return device_ptr_on(cpu_device);
+  }
+
+  const element_type* host_ptr() const {
+    return device_ptr_on(cpu_device);
+  }
+
+  bool is_valid_on(int device_id) const {
+    assert(is_valid());
+    return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr);
+  }
+
+  void allocate_on(int device_id) {
+    /* TODO: need exposed PaRSEC memory allocator */
+  }
+
+  /* TODO: can we do this automatically?
+   * Pin the memory on all devices we currently track.
+   * Pinned memory won't be released by PaRSEC and can be used
+   * at any time.
+   */
+  void pin() {
+    for (int i = 1; i < parsec_nb_devices; ++i) {
+      pin_on(i);
+    }
+  }
+
+  /* Unpin the memory on all devices we currently track. */
+  void unpin() {
+    if (!is_valid()) return;
+    for (int i = 1; i < parsec_nb_devices; ++i) {
+      unpin_on(i);
+    }
+  }
+
+  /* Pin the memory on a given device */
+  void pin_on(int device_id) {
+    /* TODO: how can we pin memory on a device? */
+  }
+
+  /* Pin the memory on a given device */
+  void unpin_on(int device_id) {
+    /* TODO: how can we unpin memory on a device? */
+  }
+
+  bool is_valid() const {
+    return !!m_data;
+  }
+
+  operator bool() const {
+    return is_valid();
+  }
+
+  std::size_t size() const {
+    return m_count;
+  }
+
+  /* Reallocate the buffer with count elements */
+  void reset(std::size_t count) {
+    /* TODO: can we resize if count is smaller than m_count? */
+    /* drop the current data and reallocate */
+    reset();
+    if (count == 0) {
+      m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data);
+      m_host_data = host_data_ptr(nullptr, &delete_non_owned);
+    } else {
+      m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy);
+      m_host_data = host_data_ptr(new element_type[count], &delete_owned);
+      /* create the host copy with the allocated memory */
+      create_host_copy();
+    }
+    m_count = count;
+    /* don't touch the ttg_copy, we still belong to the same container */
+  }
+
+  /* Reset the buffer to use the ptr to count elements */
+  void reset(T* ptr, std::size_t count = 1) {
+    /* TODO: can we resize if count is smaller than m_count? */
+    /* drop the current data and reallocate */
+    reset();
+    if (nullptr == ptr) {
+      m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data);
+      m_host_data = host_data_ptr(nullptr, &delete_non_owned);
+      m_count = 0;
+    } else {
+      m_data = parsec_data_ptr(parsec_data_new(), &parsec_data_destroy);
+      m_host_data = host_data_ptr(ptr, &delete_non_owned);
+      /* create the host copy with the allocated memory */
+      create_host_copy();
+      m_count = count;
+    }
+    /* don't touch the ttg_copy, we still belong to the same container */
+  }
+
+  /* serialization support */
+
+#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL
+  template <class Archive>
+  std::enable_if_t<std::is_base_of_v<cereal::detail::InputArchiveBase, Archive> ||
+                    std::is_base_of_v<cereal::detail::OutputArchiveBase, Archive>>
+  serialize(Archive& ar) {
+    if constexpr (ttg::detail::is_output_archive_v<Archive>)
+      std::size_t s = size();
+      ar(s);
+    else {
+      std::size_t s;
+      ar(s);
+      reset(s);
+    }
+    ar(value);
+  }
+#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL
+
+#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS
+  template <typename Archive>
+  std::enable_if_t<std::is_base_of_v<madness::archive::BufferInputArchive, Archive> ||
+                   std::is_base_of_v<madness::archive::BufferOutputArchive, Archive>>
+  serialize(Archive& ar) {
+    if constexpr (ttg::detail::is_output_archive_v<Archive>)
+      ar& size();
+    else {
+      std::size_t s;
+      ar & s;
+      /* initialize internal pointers and then reset */
+
+      reset(s);
+    }
+  }
+#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS
+
+
+};
+
+template<typename T>
+struct is_buffer : std::false_type
+{ };
+
+template<typename T>
+struct is_buffer<buffer<T>> : std::true_type
+{ };
+
+template<typename T>
+constexpr static const bool is_buffer_v = is_buffer<T>::value;
+
+namespace detail {
+  template<typename T>
+  parsec_data_t* get_parsec_data(const ttg_parsec::buffer<T>& db) {
+    return const_cast<parsec_data_t*>(db.m_data.get());
+  }
+} // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_BUFFER_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h
new file mode 100644
index 000000000..1c0487cdf
--- /dev/null
+++ b/ttg/ttg/parsec/devicescratch.h
@@ -0,0 +1,139 @@
+#ifndef TTG_PARSEC_DEVICESCRATCH_H
+#define TTG_PARSEC_DEVICESCRATCH_H
+
+// TODO: replace with short vector
+#define TTG_PARSEC_MAX_NUM_DEVICES 4
+
+#include <array>
+#include <parsec.h>
+#include <parsec/data_internal.h>
+#include <parsec/mca/device/device.h>
+#include <ttg/devicescope.h>
+
+namespace ttg_parsec {
+
+namespace detail {
+  // fwd decl
+  template<typename T>
+  parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch<T>&);
+} // namespace detail
+
+/**
+ * Scratch-space for task-local variables.
+ * TTG will allocate memory on the device
+ * and transfer data in and out based on the scope.
+ */
+template<typename T>
+struct devicescratch {
+
+  using element_type = std::decay_t<T>;
+
+  static_assert(std::is_trivially_copyable_v<element_type>,
+                "Only trivially copyable types are supported for devices.");
+  static_assert(std::is_default_constructible_v<element_type>,
+                "Only default constructible types are supported for devices.");
+
+private:
+
+  parsec_data_t* m_data = nullptr;
+  parsec_data_copy_t m_data_copy;
+  ttg::scope m_scope;
+
+  void create_host_copy(element_type *ptr, std::size_t count) {
+    /* TODO: is the construction call necessary? */
+    /* TODO: handle the scope */
+    PARSEC_OBJ_CONSTRUCT(&m_data_copy, parsec_data_copy_t);
+    m_data_copy.device_index    = 0;
+    //m_data_copy.original        = &m_data;
+    //m_data_copy.older           = NULL;
+    m_data_copy.flags           = PARSEC_DATA_FLAG_PARSEC_MANAGED;
+    m_data_copy.dtt             = parsec_datatype_int8_t;
+    m_data_copy.version         = 1;
+    m_data_copy.device_private  = ptr;
+    m_data_copy.coherency_state = PARSEC_DATA_COHERENCY_SHARED;
+
+    m_data->nb_elts              = count * sizeof(element_type);
+    m_data->owner_device         = 0;
+    parsec_data_copy_attach(m_data, &m_data_copy, 0);
+  }
+
+  friend parsec_data_t* detail::get_parsec_data<T>(const ttg_parsec::devicescratch<T>&);
+
+public:
+
+  /* Constructing a devicescratch using application-managed memory.
+   * The memory pointed to by ptr must be accessible during
+   * the life-time of the devicescratch. */
+  devicescratch(element_type* ptr, ttg::scope scope = ttg::scope::SyncIn, std::size_t count = 1)
+  : m_data(parsec_data_new())
+  , m_scope(scope) {
+    create_host_copy(ptr, count);
+  }
+
+  /* don't allow moving */
+  devicescratch(devicescratch&&) = delete;
+
+  /* don't allow copying */
+  devicescratch(const devicescratch& db) = delete;
+
+  /* don't allow moving */
+  devicescratch& operator=(devicescratch&&) = delete;
+
+  /* don't allow copying */
+  devicescratch& operator=(const devicescratch& db) = delete;
+
+  ~devicescratch() {
+    PARSEC_OBJ_DESTRUCT(&m_data_copy);
+    parsec_data_destroy(m_data);
+    m_data = nullptr;
+  }
+
+  /* get the current device pointer */
+  element_type* device_ptr() {
+    assert(is_valid());
+    return static_cast<element_type*>(m_data->device_copies[m_data->owner_device]->device_private);
+  }
+
+  /* get the current device pointer */
+  const element_type* device_ptr() const {
+    assert(is_valid());
+    return static_cast<element_type*>(m_data->device_copies[m_data->owner_device]->device_private);
+  }
+
+  bool is_valid() const {
+    // TODO: how to get the current device
+    // return (m_data->owner_device == parsec_current_device);
+    return true;
+  }
+
+  ttg::scope scope() const {
+    return m_scope;
+  }
+
+  std::size_t size() const {
+    return (m_data->nb_elts / sizeof(element_type));
+  }
+
+};
+
+template<typename T>
+struct is_devicescratch : std::false_type
+{ };
+
+template<typename T>
+struct is_devicescratch<devicescratch<T>> : std::true_type
+{ };
+
+template<typename T>
+constexpr static const bool is_devicescratch_v = is_devicescratch<T>::value;
+
+namespace detail {
+  template<typename T>
+  parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch<T>& scratch) {
+    return const_cast<parsec_data_t*>(scratch.m_data);
+  }
+} // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_DEVICESCRATCH_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h
index ece8c42fb..06c4dadf4 100644
--- a/ttg/ttg/parsec/fwd.h
+++ b/ttg/ttg/parsec/fwd.h
@@ -13,6 +13,20 @@ namespace ttg_parsec {
   template <typename keyT, typename output_terminalsT, typename derivedT, typename input_valueTs = ttg::typelist<>>
   class TT;
 
+  template<typename T>
+  struct ptr;
+
+  template<typename T>
+  struct buffer;
+  template<typename T>
+  struct devicescratch;
+
+  template<typename... Views>
+  inline bool register_device_memory(std::tuple<Views&...> &views);
+
+  template<typename... Buffer>
+  inline void mark_device_out(std::tuple<Buffer&...> &b);
+
   /// \internal the OG name
   template <typename keyT, typename output_terminalsT, typename derivedT, typename... input_valueTs>
   using Op [[deprecated("use TT instead")]] = TT<keyT, output_terminalsT, derivedT, ttg::typelist<input_valueTs...>>;
@@ -58,6 +72,17 @@ namespace ttg_parsec {
     void  free(int did, void *ptr);
     ttg::ExecutionSpace space(int did);
   }
+  
+#if 0
+  template<typename... Args>
+  inline std::pair<bool, std::tuple<ptr<std::decay_t<Args>>...>> get_ptr(Args&&... args);
+#endif
+  template<typename T>
+  inline ptr<std::decay_t<T>> get_ptr(T&& obj);
+
+  template<typename T, typename... Args>
+  inline ptr<T> make_ptr(Args&&... args);
+
 
 }  // namespace ttg_parsec
 
diff --git a/ttg/ttg/parsec/ptr.h b/ttg/ttg/parsec/ptr.h
new file mode 100644
index 000000000..6499e050b
--- /dev/null
+++ b/ttg/ttg/parsec/ptr.h
@@ -0,0 +1,282 @@
+#ifndef TTG_PARSEC_PTR_H
+#define TTG_PARSEC_PTR_H
+
+#include <unordered_map>
+#include <mutex>
+
+#include "ttg/parsec/ttg_data_copy.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/task.h"
+
+namespace ttg_parsec {
+
+  // fwd decl
+  template<typename T>
+  struct ptr;
+
+  namespace detail {
+    /* fwd decl */
+    template <typename Value>
+    inline ttg_data_copy_t *create_new_datacopy(Value &&value);
+
+    struct ptr {
+      using copy_type = detail::ttg_data_copy_t;
+
+    private:
+      static inline std::unordered_map<ptr*, bool> m_ptr_map;
+      static inline std::mutex m_ptr_map_mtx;
+
+      copy_type *m_copy = nullptr;
+
+      void drop_copy() {
+        std::cout << "ptr drop_copy " << m_copy << " ref " << m_copy->num_ref() << std::endl;
+        if (nullptr != m_copy && 1 == m_copy->drop_ref()) {
+          delete m_copy;
+        }
+        m_copy = nullptr;
+      }
+
+      void register_self() {
+        /* insert ourselves from the list of ptr */
+        std::lock_guard {m_ptr_map_mtx};
+        m_ptr_map.insert(std::pair{this, true});
+      }
+
+      void deregister_self() {
+        /* remove ourselves from the list of ptr */
+        std::lock_guard _{m_ptr_map_mtx};
+        if (m_ptr_map.contains(this)) {
+          m_ptr_map.erase(this);
+        }
+      }
+
+    public:
+      ptr(copy_type *copy)
+      : m_copy(copy)
+      {
+        register_self();
+        m_copy->add_ref();
+        std::cout << "ptr copy_obj ref " << m_copy->num_ref() << std::endl;
+      }
+
+      copy_type* get_copy() const {
+        return m_copy;
+      }
+
+      ptr(const ptr& p)
+      : m_copy(p.m_copy)
+      {
+        register_self();
+        m_copy->add_ref();
+        std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl;
+      }
+
+      ptr(ptr&& p)
+      : m_copy(p.m_copy)
+      {
+        register_self();
+        p.m_copy = nullptr;
+        std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl;
+      }
+
+      ~ptr() {
+        deregister_self();
+        drop_copy();
+      }
+
+      ptr& operator=(const ptr& p)
+      {
+        drop_copy();
+        m_copy = p.m_copy;
+        m_copy->add_ref();
+        std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl;
+        return *this;
+      }
+
+      ptr& operator=(ptr&& p) {
+        drop_copy();
+        m_copy = p.m_copy;
+        p.m_copy = nullptr;
+        std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl;
+        return *this;
+      }
+
+      bool is_valid() const {
+        return (nullptr != m_copy);
+      }
+
+      void reset() {
+        drop_copy();
+      }
+
+      /* drop all currently registered ptr
+       * \note this function is not thread-safe
+       *       and should only be called at the
+       *       end of the execution, e.g., during finalize.
+       */
+      static void drop_all_ptr() {
+        for(auto it : m_ptr_map) {
+          it.first->drop_copy();
+        }
+      }
+    };
+
+
+    template<typename T>
+    ttg_parsec::detail::ttg_data_copy_t* get_copy(ttg_parsec::ptr<T>& p);
+  } // namespace detail
+
+  template<typename T, typename... Args>
+  ptr<T> ttg_parsec::make_ptr(Args&&... args);
+
+  template<typename T>
+  ptr<std::decay_t<T>> ttg_parsec::get_ptr(T&& obj);
+
+  template<typename T>
+  struct ptr {
+
+    using value_type = std::decay_t<T>;
+
+  private:
+    using copy_type = detail::ttg_data_value_copy_t<value_type>;
+
+    std::unique_ptr<detail::ptr> m_ptr;
+
+    /* only PaRSEC backend functions are allowed to touch our private parts */
+    template<typename... Args>
+    friend ptr<T> make_ptr(Args&&... args);
+    template<typename S>
+    friend ptr<std::decay_t<S>> get_ptr(S&& obj);
+    template<typename S>
+    friend detail::ttg_data_copy_t* detail::get_copy(ptr<S>& p);
+    friend ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC>;
+
+    /* only accessible by get_ptr and make_ptr */
+    ptr(detail::ptr::copy_type *copy)
+    : m_ptr(new detail::ptr(copy))
+    { }
+
+    copy_type* get_copy() const {
+      return static_cast<copy_type*>(m_ptr->get_copy());
+    }
+
+  public:
+
+    ptr() = default;
+
+    ptr(const ptr& p)
+    : ptr(p.get_copy())
+    { }
+
+    ptr(ptr&& p) = default;
+
+    ~ptr() = default;
+
+    ptr& operator=(const ptr& p) {
+      m_ptr.reset(new detail::ptr(p.get_copy()));
+      return *this;
+    }
+
+    ptr& operator=(ptr&& p) = default;
+
+    value_type& operator*() const {
+      return **static_cast<copy_type*>(m_ptr->get_copy());
+    }
+
+    value_type& operator->() const {
+      return **static_cast<copy_type*>(m_ptr->get_copy());
+    }
+
+    bool is_valid() const {
+      return m_ptr && m_ptr->is_valid();
+    }
+
+    void reset() {
+      m_ptr.reset();
+    }
+  };
+
+#if 0
+  namespace detail {
+    template<typename Arg>
+    inline auto get_ptr(Arg&& obj) {
+
+      for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) {
+        detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i];
+        if (nullptr != copy) {
+          if (copy->get_ptr() == &obj) {
+            bool is_ready = true;
+            /* TODO: how can we force-sync host and device? Current data could be on either. */
+#if 0
+            /* check all tracked device data for validity */
+            for (auto it : copy) {
+              parsec_data_t *data = *it;
+              for (int i = 0; i < parsec_nb_devices; ++i) {
+                if (nullptr != data->device_copies[i]) {
+
+                } else {
+                  is_ready = false;
+                }
+              }
+            }
+#endif // 0
+            return std::make_pair(is_ready, std::tuple{ttg_parsec::ptr<std::decay_t<Arg>>(copy)});
+          }
+        }
+      }
+
+      throw std::runtime_error("ttg::get_ptr called on an unknown object!");
+    }
+  }
+
+  template<typename... Args>
+  inline std::pair<bool, std::tuple<ptr<std::decay_t<Args>>...>> get_ptr(Args&&... args) {
+    if (nullptr == detail::parsec_ttg_caller) {
+      throw std::runtime_error("ttg::get_ptr called outside of a task!");
+    }
+
+    bool ready = true;
+    auto fn = [&](auto&& arg){
+      auto pair = get_ptr(std::forward<decltype(arg)>(arg));
+      ready &= pair.first;
+      return std::move(pair.second);
+    };
+    std::tuple<ptr<std::decay_t<Args>>...> tpl = {(fn(std::forward<Args>(args)))...};
+    return {ready, std::move(tpl)};
+  }
+#endif // 0
+
+  template<typename T>
+  inline ptr<std::decay_t<T>> get_ptr(T&& obj) {
+    using ptr_type = ptr<std::decay_t<T>>;
+    if (nullptr != detail::parsec_ttg_caller) {
+      for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) {
+        detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i];
+        if (nullptr != copy) {
+          if (copy->get_ptr() == &obj) {
+            return ptr_type(copy);
+          }
+        }
+      }
+    }
+    /* object not tracked, make a new ptr that is now tracked */
+    detail::ttg_data_copy_t *copy = detail::create_new_datacopy(obj);
+    return ptr_type(copy);
+  }
+
+  template<typename T, typename... Args>
+  inline ptr<T> make_ptr(Args&&... args) {
+    detail::ttg_data_copy_t *copy = detail::create_new_datacopy(T(std::forward<Args>(args)...));
+    return ptr<T>(copy);
+  }
+
+  namespace detail {
+    template<typename T>
+    detail::ttg_data_copy_t* get_copy(ttg_parsec::ptr<T>& p) {
+      return p.get_copy();
+    }
+  } // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_PTR_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
new file mode 100644
index 000000000..c8465b81c
--- /dev/null
+++ b/ttg/ttg/parsec/task.h
@@ -0,0 +1,207 @@
+#ifndef TTG_PARSEC_TASK_H
+#define TTG_PARSEC_TASK_H
+
+#include "ttg/parsec/ttg_data_copy.h"
+
+#include <parsec/parsec_internal.h>
+#include <parsec/mca/device/device_gpu.h>
+
+namespace ttg_parsec {
+
+  namespace detail {
+
+    struct device_ptr_t {
+      parsec_gpu_task_t* gpu_task = nullptr;
+      parsec_flow_t* flows = nullptr;
+    };
+
+    template<bool SupportDevice>
+    struct device_state_t
+    {
+      static constexpr bool support_device = false;
+      static constexpr size_t num_flows = 0;
+      device_state_t(parsec_task_t *parsec_task)
+      { }
+      static constexpr device_ptr_t* dev_ptr() {
+        return nullptr;
+      }
+    };
+
+    template<>
+    struct device_state_t<true> {
+      static constexpr bool support_device = false;
+      static constexpr size_t num_flows = MAX_PARAM_COUNT;
+      parsec_flow_t m_flows[num_flows];
+      device_ptr_t m_dev_ptr = {nullptr, &m_flows[0]}; // gpu_task will be allocated in each task
+      device_ptr_t* dev_ptr() {
+        return &m_dev_ptr;
+      }
+    };
+
+    typedef parsec_hook_return_t (*parsec_static_op_t)(void *);  // static_op will be cast to this type
+
+    struct parsec_ttg_task_base_t {
+      parsec_task_t parsec_task;
+      int32_t in_data_count = 0;   //< number of satisfied inputs
+      int32_t data_count = 0;      //< number of data elements in the copies array
+      ttg_data_copy_t **copies;    //< pointer to the fixed copies array of the derived task
+      parsec_hash_table_item_t tt_ht_item = {};
+      parsec_static_op_t function_template_class_ptr[ttg::runtime_traits<ttg::Runtime::PaRSEC>::num_execution_spaces] =
+          {nullptr};
+
+      typedef struct {
+        std::size_t goal;
+        std::size_t size;
+      } size_goal_t;
+
+      typedef void (release_task_fn)(parsec_ttg_task_base_t*);
+      /* Poor-mans virtual function
+       * We cannot use virtual inheritance or private visibility because we
+       * need offsetof for the mempool and scheduling.
+       */
+      release_task_fn* release_task_cb = nullptr;
+      device_ptr_t* dev_ptr;
+      bool remove_from_hash = true;
+      bool is_dummy = false;
+      bool defer_writer = TTG_PARSEC_DEFER_WRITER; // whether to defer writer instead of creating a new copy
+
+
+      /*
+      virtual void release_task() = 0;
+      */
+    //public:
+      void release_task() {
+        release_task_cb(this);
+      }
+
+     protected:
+      /**
+       * Protected constructors: this class should not be instantiated directly
+       * but always be use through parsec_ttg_task_t.
+       */
+
+      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
+                             int data_count, ttg_data_copy_t **copies, device_ptr_t *dev_ptr,
+                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
+          : data_count(data_count)
+          , copies(copies)
+          , dev_ptr(dev_ptr)
+          , defer_writer(defer_writer) {
+        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
+        parsec_task.mempool_owner = mempool;
+        parsec_task.task_class = task_class;
+        parsec_task.priority = 0;
+      }
+
+      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
+                             parsec_taskpool_t *taskpool, int32_t priority,
+                             int data_count, ttg_data_copy_t **copies, device_ptr_t *dev_ptr,
+                             release_task_fn *release_fn,
+                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
+          : data_count(data_count)
+          , copies(copies)
+          , release_task_cb(release_fn)
+          , dev_ptr(dev_ptr)
+          , defer_writer(defer_writer) {
+        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
+        parsec_task.mempool_owner = mempool;
+        parsec_task.task_class = task_class;
+        parsec_task.status = PARSEC_TASK_STATUS_HOOK;
+        parsec_task.taskpool = taskpool;
+        parsec_task.priority = priority;
+        parsec_task.chore_mask = 1<<0;
+      }
+
+    public:
+      void set_dummy(bool d) { is_dummy = d; }
+      bool dummy() { return is_dummy; }
+    };
+
+    template <typename TT, bool KeyIsVoid = ttg::meta::is_void_v<typename TT::key_type>>
+    struct parsec_ttg_task_t : public parsec_ttg_task_base_t {
+      using key_type = typename TT::key_type;
+      static constexpr size_t num_streams = TT::numins;
+      TT* tt;
+      key_type key;
+      size_goal_t stream[num_streams] = {};
+#ifdef TTG_HAS_COROUTINE
+      void* suspended_task_address = nullptr;  // if not null the function is suspended
+#endif
+      ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
+                                                             // +1 for the copy needed during send/bcast
+      device_state_t<TT::derived_has_cuda_op()> dev_state;
+
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
+          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies, dev_state.dev_ptr()) {
+        tt_ht_item.key = pkey();
+
+        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
+        *(uintptr_t*)&(parsec_task.locals[0]) = 0; //there is no key
+        *(uintptr_t*)&(parsec_task.locals[2]) = 0; //there is no key
+      }
+
+      parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool,
+                        parsec_task_class_t *task_class, parsec_taskpool_t *taskpool,
+                        TT *tt_ptr, int32_t priority)
+          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
+                                   num_streams, copies, dev_state.dev_ptr(),
+                                   &release_task, tt_ptr->m_defer_writer)
+          , tt(tt_ptr), key(key) {
+        tt_ht_item.key = pkey();
+
+        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
+        uint64_t hv = ttg::hash<std::decay_t<decltype(key)>>{}(key);
+        *(uintptr_t*)&(parsec_task.locals[0]) = hv;
+        *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast<uintptr_t>(&this->key);
+      }
+
+      static void release_task(parsec_ttg_task_base_t* task_base) {
+        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
+        TT *tt = task->tt;
+        tt->release_task(task);
+      }
+
+      parsec_key_t pkey() { return reinterpret_cast<parsec_key_t>(&key); }
+    };
+
+    template <typename TT>
+    struct parsec_ttg_task_t<TT, true> : public parsec_ttg_task_base_t {
+      static constexpr size_t num_streams = TT::numins;
+      TT* tt;
+      size_goal_t stream[num_streams] = {};
+#ifdef TTG_HAS_COROUTINE
+      void* suspended_task_address = nullptr;  // if not null the function is suspended
+#endif
+      ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
+                                                             // +1 for the copy needed during send/bcast
+      device_state_t<TT::derived_has_cuda_op()> dev_state;
+
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
+          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies, dev_state.dev_ptr()) {
+        tt_ht_item.key = pkey();
+      }
+
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
+                        parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority)
+          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
+                                   num_streams, copies, dev_state.dev_ptr(),
+                                   &release_task, tt_ptr->m_defer_writer)
+          , tt(tt_ptr) {
+        tt_ht_item.key = pkey();
+      }
+
+      static void release_task(parsec_ttg_task_base_t* task_base) {
+        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
+        TT *tt = task->tt;
+        tt->release_task(task);
+      }
+
+      parsec_key_t pkey() { return 0; }
+    };
+
+
+  } // namespace detail
+
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_TASK_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/thread_local.h b/ttg/ttg/parsec/thread_local.h
new file mode 100644
index 000000000..54b98885e
--- /dev/null
+++ b/ttg/ttg/parsec/thread_local.h
@@ -0,0 +1,22 @@
+#ifndef TTG_PARSEC_THREAD_LOCAL_H
+#define TTG_PARSEC_THREAD_LOCAL_H
+
+namespace ttg_parsec {
+
+namespace detail {
+
+  // fwd decls
+  struct parsec_ttg_task_base_t;
+  struct ttg_data_copy_t;
+
+  inline thread_local parsec_ttg_task_base_t *parsec_ttg_caller = nullptr;
+
+  inline ttg_data_copy_t*& ttg_data_copy_container() {
+    static thread_local ttg_data_copy_t *ptr = nullptr;
+    return ptr;
+  }
+
+} // namespace detail
+} // namespace ttg_parsec
+
+#endif // TTG_PARSEC_THREAD_LOCAL_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 01e88d8a2..77991551c 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -7,6 +7,11 @@
 #define TTG_USE_PARSEC 1
 #endif  // !defined(TTG_IMPL_NAME)
 
+/* Whether to defer a potential writer if there are readers.
+ * This may avoid extra copies in exchange for concurrency.
+ * This may cause deadlocks, so use with caution. */
+#define TTG_PARSEC_DEFER_WRITER false
+
 #include "ttg/impl_selector.h"
 
 /* include ttg header to make symbols available in case this header is included directly */
@@ -31,8 +36,15 @@
 
 #include "ttg/serialization/data_descriptor.h"
 
+#include "ttg/view.h"
+
 #include "ttg/parsec/fwd.h"
 
+#include "ttg/parsec/buffer.h"
+#include "ttg/parsec/devicescratch.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/devicefunc.h"
+
 #include <algorithm>
 #include <array>
 #include <cassert>
@@ -80,6 +92,9 @@
 #include <cstring>
 
 #include "ttg/parsec/ttg_data_copy.h"
+#include "ttg/parsec/thread_local.h"
+#include "ttg/parsec/ptr.h"
+#include "ttg/parsec/task.h"
 
 #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES
 
@@ -87,11 +102,6 @@
 #include <unordered_set>
 #endif
 
-/* Whether to defer a potential writer if there are readers.
- * This may avoid extra copies in exchange for concurrency.
- * This may cause deadlocks, so use with caution. */
-#define TTG_PARSEC_DEFER_WRITER false
-
 /* PaRSEC function declarations */
 extern "C" {
 void parsec_taskpool_termination_detected(parsec_taskpool_t *tp);
@@ -100,42 +110,6 @@ int parsec_add_fetch_runtime_task(parsec_taskpool_t *tp, int tasks);
 
 #include "ttg/view.h"
 
-namespace test::meta {
-
-    template <class...>
-    using void_t = void;
-  namespace detail {
-      template <class Default, class Enabler, template <class...> class TT, class... Args>
-      struct detector {
-        using value_t = std::false_type;
-        using type = Default;
-      };
-
-      template <class Default, template <class...> class TT, class... Args>
-      struct detector<Default, void_t<TT<Args...>>, TT, Args...> {
-        using value_t = std::true_type;
-        using type = TT<Args...>;
-      };
-
-  }
-
-      template <template <class...> class TT, class... Args>
-      using is_detected = typename detail::detector<ttg::meta::nonesuch, void, TT, Args...>::value_t;
-
-    template <template <class...> class TT, class... Args>
-    using is_detected = typename detail::detector<ttg::meta::nonesuch, void, TT, Args...>::value_t;
-
-    template <template <class...> class TT, class... Args>
-    using detected_t = typename detail::detector<ttg::meta::nonesuch, void, TT, Args...>::type;
-
-    template <class Default, template <class...> class TT, class... Args>
-    using detected_or = detail::detector<Default, void, TT, Args...>;
-
-    template <template <class...> class TT, class... Args>
-    constexpr bool is_detected_v = is_detected<TT, Args...>::value;
-
-}
-
 namespace ttg_parsec {
   inline thread_local parsec_execution_stream_t *parsec_ttg_es;
 
@@ -302,14 +276,12 @@ namespace ttg_parsec {
       parsec_taskpool_reserve_id(tpool);
 
       tpool->devices_index_mask = 0;
-      // TODO DEBUG: reenable CPU incarnations
-      for(int i = 2; i < (int)parsec_nb_devices; i++) {
+      for(int i = 0; i < (int)parsec_nb_devices; i++) {
           parsec_device_module_t *device = parsec_mca_device_get(i);
           if( NULL == device ) continue;
           tpool->devices_index_mask |= (1 << device->device_index);
       }
 
-
 #ifdef TTG_USE_USER_TERMDET
       parsec_termdet_open_module(tpool, "user_trigger");
 #else   // TTG_USE_USER_TERMDET
@@ -558,8 +530,6 @@ namespace ttg_parsec {
 
   namespace detail {
 
-    typedef parsec_hook_return_t (*parsec_static_op_t)(void *);  // static_op will be cast to this type
-
     const parsec_symbol_t parsec_taskclass_param0 = {
       .flags = PARSEC_SYMBOL_IS_STANDALONE|PARSEC_SYMBOL_IS_GLOBAL,
       .name = "HASH0",
@@ -593,159 +563,6 @@ namespace ttg_parsec {
       .expr_inc = nullptr,
       .cst_inc = 0 };
 
-    struct parsec_ttg_task_base_t {
-      parsec_task_t parsec_task;
-      int32_t in_data_count = 0;  //< number of satisfied inputs
-      int32_t data_count = 0;     //< number of data elements in parsec_task.data
-      ttg_data_copy_t **copies;    //< pointer to the fixed copies array of the derived task
-      parsec_hash_table_item_t tt_ht_item = {};
-      parsec_static_op_t function_template_class_ptr[ttg::runtime_traits<ttg::Runtime::PaRSEC>::num_execution_spaces] =
-          {nullptr};
-      bool is_dummy = false;
-      bool defer_writer = TTG_PARSEC_DEFER_WRITER; // whether to defer writer instead of creating a new copy
-
-      typedef void (release_task_fn)(parsec_ttg_task_base_t*);
-
-      typedef struct {
-        std::size_t goal;
-        std::size_t size;
-      } size_goal_t;
-
-      /* Poor-mans virtual function
-       * We cannot use virtual inheritance or private visibility because we
-       * need offsetof for the mempool and scheduling.
-       */
-      release_task_fn* release_task_cb = nullptr;
-      bool remove_from_hash = true;
-
-      /*
-      virtual void release_task() = 0;
-      */
-    //public:
-      void release_task() {
-        release_task_cb(this);
-      }
-
-     protected:
-      /**
-       * Protected constructors: this class should not be instantiated directly
-       * but always be use through parsec_ttg_task_t.
-       */
-
-      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                             int data_count, ttg_data_copy_t **copies,
-                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
-          : data_count(data_count)
-          , copies(copies)
-          , defer_writer(defer_writer) {
-        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
-        parsec_task.mempool_owner = mempool;
-        parsec_task.task_class = task_class;
-        parsec_task.priority = 0;
-      }
-
-      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                             parsec_taskpool_t *taskpool, int32_t priority,
-                             int data_count, ttg_data_copy_t **copies,
-                             release_task_fn *release_fn,
-                             bool defer_writer = TTG_PARSEC_DEFER_WRITER)
-          : data_count(data_count)
-          , copies(copies)
-          , defer_writer(defer_writer)
-          , release_task_cb(release_fn) {
-            int32_t p = priority;
-        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
-        parsec_task.mempool_owner = mempool;
-        parsec_task.task_class = task_class;
-        parsec_task.status = PARSEC_TASK_STATUS_HOOK;
-        parsec_task.taskpool = taskpool;
-        parsec_task.priority = priority;
-        parsec_task.chore_mask = 1<<0;
-      }
-
-    public:
-      void set_dummy(bool d) { is_dummy = d; }
-      bool dummy() { return is_dummy; }
-    };
-
-    template <typename TT, bool KeyIsVoid = ttg::meta::is_void_v<typename TT::key_type>>
-    struct parsec_ttg_task_t : public parsec_ttg_task_base_t {
-      using key_type = typename TT::key_type;
-      static constexpr size_t num_streams = TT::numins;
-      TT* tt;
-      key_type key;
-      size_goal_t stream[num_streams] = {};
-#ifdef TTG_HAS_COROUTINE
-      void* suspended_task_address = nullptr;  // if not null the function is suspended
-#endif
-      ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
-                                                             // +1 for the copy needed during send/bcast
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) {
-        tt_ht_item.key = pkey();
-
-        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
-        *(uintptr_t*)&(parsec_task.locals[0]) = 0; //there is no key
-        *(uintptr_t*)&(parsec_task.locals[2]) = 0; //there is no key
-      }
-
-      parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool,
-                        parsec_task_class_t *task_class, parsec_taskpool_t *taskpool,
-                        TT *tt_ptr, int32_t priority)
-          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
-                                   num_streams, copies, &release_task, tt_ptr->m_defer_writer)
-          , tt(tt_ptr), key(key) {
-        tt_ht_item.key = pkey();
-
-        // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
-        uint64_t hv = ttg::hash<std::decay_t<decltype(key)>>{}(key);
-        *(uintptr_t*)&(parsec_task.locals[0]) = hv;
-        *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast<uintptr_t>(&this->key);
-      }
-
-      static void release_task(parsec_ttg_task_base_t* task_base) {
-        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
-        TT *tt = task->tt;
-        tt->release_task(task);
-      }
-
-      parsec_key_t pkey() { return reinterpret_cast<parsec_key_t>(&key); }
-    };
-
-    template <typename TT>
-    struct parsec_ttg_task_t<TT, true> : public parsec_ttg_task_base_t {
-      static constexpr size_t num_streams = TT::numins;
-      TT* tt;
-      size_goal_t stream[num_streams] = {};
-#ifdef TTG_HAS_COROUTINE
-      void* suspended_task_address = nullptr;  // if not null the function is suspended
-#endif
-      ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
-                                                             // +1 for the copy needed during send/bcas
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) {
-        tt_ht_item.key = pkey();
-      }
-
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
-                        parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority)
-          : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority,
-                                   num_streams, copies, &release_task, tt_ptr->m_defer_writer)
-          , tt(tt_ptr) {
-        tt_ht_item.key = pkey();
-      }
-
-      static void release_task(parsec_ttg_task_base_t* task_base) {
-        parsec_ttg_task_t *task = static_cast<parsec_ttg_task_t*>(task_base);
-        TT *tt = task->tt;
-        tt->release_task(task);
-      }
-
-      parsec_key_t pkey() { return 0; }
-    };
-
     inline ttg_data_copy_t *find_copy_in_task(parsec_ttg_task_base_t *task, const void *ptr) {
       ttg_data_copy_t *res = nullptr;
       if (task == nullptr || ptr == nullptr) {
@@ -753,7 +570,7 @@ namespace ttg_parsec {
       }
       for (int i = 0; i < task->data_count; ++i) {
         auto copy = static_cast<ttg_data_copy_t *>(task->copies[i]);
-        if (NULL != copy && copy->device_private == ptr) {
+        if (NULL != copy && copy->get_ptr() == ptr) {
           res = copy;
           break;
         }
@@ -768,7 +585,7 @@ namespace ttg_parsec {
       }
       for (i = 0; i < task->data_count; ++i) {
         auto copy = static_cast<ttg_data_copy_t *>(task->copies[i]);
-        if (NULL != copy && copy->device_private == ptr) {
+        if (NULL != copy && copy->get_ptr() == ptr) {
           return i;
         }
       }
@@ -944,7 +761,7 @@ namespace ttg_parsec {
     }
 
     inline void release_data_copy(ttg_data_copy_t *copy) {
-      if (copy->is_mutable()) {
+      if (copy->is_mutable() && nullptr == copy->get_next_task()) {
         /* current task mutated the data but there are no consumers so prepare
         * the copy to be freed below */
         copy->reset_readers();
@@ -954,19 +771,24 @@ namespace ttg_parsec {
       if (readers > 1) {
         /* potentially more than one reader, decrement atomically */
         readers = copy->decrement_readers();
-      }
-      /* if there was only one reader (the current task) we release the copy */
-      if (1 == readers) {
-        if (nullptr != copy->push_task) {
+      } else if (readers == 1) {
+        /* make sure readers drop to zero */
+        readers = copy->decrement_readers<false>();
+      }
+      /* if there was only one reader (the current task) or
+       * a mutable copy and a successor, we release the copy */
+      if (1 == readers || copy->is_mutable()) {
+        if (nullptr != copy->get_next_task()) {
           /* Release the deferred task.
-          * The copy was mutable and will be mutated by the released task,
-          * so simply transfer ownership.
-          */
-          parsec_task_t *push_task = copy->push_task;
-          copy->push_task = nullptr;
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)push_task;
+           * The copy was mutable and will be mutated by the released task,
+           * so simply transfer ownership.
+           */
+          parsec_task_t *next_task = copy->get_next_task();
+          copy->set_next_task(nullptr);
+          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)next_task;
           deferred_op->release_task();
-        } else {
+        } else if ((1 == copy->num_ref()) || (1 == copy->drop_ref())) {
+          /* we are the last reference, delete the copy */
 #if defined(TTG_PARSEC_DEBUG_TRACK_DATA_COPIES)
           {
             const std::lock_guard<std::mutex> lock(pending_copies_mutex);
@@ -1002,10 +824,10 @@ namespace ttg_parsec {
       }
 
       if (readers == copy_in->mutable_tag) {
-        if (copy_res->push_task != nullptr) {
+        if (copy_res->get_next_task() != nullptr) {
           if (readonly) {
-            parsec_ttg_task_base_t *push_task = reinterpret_cast<parsec_ttg_task_base_t *>(copy_res->push_task);
-            if (push_task->defer_writer) {
+            parsec_ttg_task_base_t *next_task = reinterpret_cast<parsec_ttg_task_base_t *>(copy_res->get_next_task());
+            if (next_task->defer_writer) {
               /* there is a writer but it signalled that it wants to wait for readers to complete */
               return copy_res;
             }
@@ -1040,13 +862,13 @@ namespace ttg_parsec {
            * of the task
            */
           copy_in->mark_mutable();
-          assert(nullptr == copy_in->push_task);
+          assert(nullptr == copy_in->get_next_task());
           assert(nullptr != task);
-          copy_in->push_task = &task->parsec_task;
+          copy_in->set_next_task(&task->parsec_task);
         } else {
-          if (task->defer_writer && nullptr == copy_in->push_task) {
+          if (task->defer_writer && nullptr == copy_in->get_next_task()) {
             /* we're the first writer and want to wait for all readers to complete */
-            copy_res->push_task = &task->parsec_task;
+            copy_res->set_next_task(&task->parsec_task);
           } else {
             /* there are writers and/or waiting already of this copy already, make a copy that we can mutate */
             copy_res = NULL;
@@ -1055,10 +877,10 @@ namespace ttg_parsec {
       }
 
       if (NULL == copy_res) {
-        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->device_private));
-        if (replace && nullptr != copy_in->push_task) {
+        ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast<Value *>(copy_in->get_ptr()));
+        if (replace && nullptr != copy_in->get_next_task()) {
           /* replace the task that was deferred */
-          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->push_task;
+          parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->get_next_task();
           new_copy->mark_mutable();
           /* replace the copy in the deferred task */
           for (int i = 0; i < deferred_op->data_count; ++i) {
@@ -1067,7 +889,7 @@ namespace ttg_parsec {
               break;
             }
           }
-          copy_in->push_task = nullptr;
+          copy_in->set_next_task(nullptr);
           deferred_op->release_task();
           copy_in->reset_readers();            // set the copy back to being read-only
           copy_in->increment_readers<false>(); // register as reader
@@ -1084,8 +906,6 @@ namespace ttg_parsec {
 
   }  // namespace detail
 
-  inline thread_local detail::parsec_ttg_task_base_t *parsec_ttg_caller;
-
   inline void ttg_initialize(int argc, char **argv, int num_threads, parsec_context_t *ctx) {
     if (detail::initialized_mpi()) throw std::runtime_error("ttg_parsec::ttg_initialize: can only be called once");
 
@@ -1113,6 +933,7 @@ namespace ttg_parsec {
     if(0 == ttg::default_execution_context().rank())
       ttg::default_execution_context().impl().final_task();
     ttg::detail::set_default_world(ttg::World{});  // reset the default world
+    detail::ptr::drop_all_ptr();
     ttg::detail::destroy_worlds<ttg_parsec::WorldImpl>();
     if (detail::initialized_mpi()) MPI_Finalize();
   }
@@ -1217,17 +1038,16 @@ namespace ttg_parsec {
     static constexpr int numouts = std::tuple_size_v<output_terminalsT>;       // number of outputs
     static constexpr int numflows = std::max(numins, numouts);                 // max number of flows
 
+   public:
     /// @return true if derivedT::have_cuda_op exists and is defined to true
     static constexpr bool derived_has_cuda_op() {
-      //if constexpr (test::meta::is_detected_v<have_cuda_op_non_type_t, derivedT>) {
-      if constexpr (derivedT::have_cuda_op) {
+      if constexpr (ttg::meta::is_detected_v<have_cuda_op_non_type_t, derivedT>) {
         return derivedT::have_cuda_op;
       } else {
         return false;
       }
     }
 
-   public:
     using ttT = TT;
     using key_type = keyT;
     using input_terminals_type = ttg::detail::input_terminals_tuple_t<keyT, input_tuple_type>;
@@ -1401,7 +1221,7 @@ namespace ttg_parsec {
     static input_refs_tuple_type make_tuple_of_ref_from_array(task_t *task, std::index_sequence<IS...>) {
       return input_refs_tuple_type{static_cast<std::tuple_element_t<IS, input_refs_tuple_type>>(
           *reinterpret_cast<std::remove_reference_t<std::tuple_element_t<IS, input_refs_tuple_type>> *>(
-              task->copies[IS]->device_private))...};
+              task->copies[IS]->get_ptr()))...};
     }
 
     /**
@@ -1424,6 +1244,7 @@ namespace ttg_parsec {
       /* we should still be waiting for the transfer to complete */
       assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER);
 
+#if 0
       /* update the device pointers in the device views */
       int i = 0;
       for (auto& view : dev_data) {
@@ -1433,6 +1254,7 @@ namespace ttg_parsec {
           ++i;
         }
       }
+#endif // 0
 
       /* Here we call back into the coroutine again after the transfers have completed */
       static_op<Space>(&task->parsec_task);
@@ -1449,33 +1271,36 @@ namespace ttg_parsec {
     template <ttg::ExecutionSpace Space>
     static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
       static_assert(derived_has_cuda_op());
+
+      int dev_index;
+      double ratio = 1.0;
+
       task_t *task = (task_t*)parsec_task;
       parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
 
       std::cout << "device_static_op: task " << parsec_task << std::endl;
 
-      int dev_index;
-      double ratio = 1.0;
 
       /* set up a device task */
-      /* TODO: take them from a free-list */
       parsec_gpu_task_t *gpu_task;
-      gpu_task = (parsec_gpu_task_t *) calloc(1, sizeof(parsec_gpu_task_t));
+      /* PaRSEC wants to free the gpu_task, because F***K ownerships */
+      gpu_task = static_cast<parsec_gpu_task_t*>(std::calloc(1, sizeof(*gpu_task)));
       PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
-
       gpu_task->ec = parsec_task;
-      /* come back into this function
-        * TODO: have a specialized function that checks
-        */
-      gpu_task->submit = &TT::device_static_submit<Space>;
       gpu_task->task_type = 0; // user task
       gpu_task->load = 1.0;    // TODO: can we do better?
       gpu_task->last_data_check_epoch = -1; // used internally
       gpu_task->pushout = 0;
+      gpu_task->submit = &TT::device_static_submit<Space>;
+
+      /* set the gpu_task so it's available in register_device_memory */
+      task->dev_ptr->gpu_task = gpu_task;
 
       // first invocation of the coroutine to get the coroutine handle
       static_op<Space>(parsec_task);
 
+      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+
       // get the device task from the coroutine handle
       auto dev_task = ttg::device_task_handle_type::from_address(task->suspended_task_address);
 
@@ -1485,6 +1310,17 @@ namespace ttg_parsec {
       /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
       assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER);
 
+      /* TODO: is this the right place to set the mask? */
+      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+      /* get a device and come back if we need another one */
+      dev_index = parsec_get_best_device(parsec_task, ratio);
+      assert(dev_index >= 0);
+      if (dev_index < 2) {
+          return PARSEC_HOOK_RETURN_NEXT; /* Fall back */
+      }
+
+
+#if 0
       // manage the gpu_task flows
 
       // set the input flows
@@ -1512,7 +1348,7 @@ namespace ttg_parsec {
           /* try to find the view in the task and allocate a new copy if needed */
           for (int i = 0; nullptr == copy && i < numins; ++i) {
             ttg_parsec::detail::ttg_data_copy_t* obj_copy = task->copies[i];
-            if (obj_copy->device_private == host_obj) {
+            if (obj_copy->get_ptr() == host_obj) {
               for (auto& dev_copy : *obj_copy) {
                 if (view_span.data() == dev_copy->device_private) {
                   copy = dev_copy;
@@ -1563,12 +1399,7 @@ namespace ttg_parsec {
         task->parsec_task.data[i].data_in = nullptr;
         task->parsec_task.data[i].source_repo_entry = NULL;
       }
-
-      dev_index = parsec_get_best_device(parsec_task, ratio);
-      assert(dev_index >= 0);
-      if (dev_index < 2) {
-          return PARSEC_HOOK_RETURN_NEXT; /* Fall back */
-      }
+#endif // 0
 
       parsec_device_module_t *device = parsec_mca_device_get(dev_index);
       assert(NULL != device);
@@ -1593,30 +1424,10 @@ namespace ttg_parsec {
       return PARSEC_HOOK_RETURN_DONE; // will not be reacehed
     }
 
-    class A {
-    public:
-      static constexpr bool have_cuda_op = true;
-    };
-
     template <ttg::ExecutionSpace Space>
     static parsec_hook_return_t static_op(parsec_task_t *parsec_task) {
 
-      //static_assert(ttg::meta::is_detected_v<have_cuda_op_non_type_t, A>);
-      //static_assert(ttg::meta::is_detected_v<have_cuda_op_non_type_t, derivedT>);
-      //static_assert(ttg::meta::is_detected<have_cuda_op_non_type_t, derivedT>::value);
-      // TODO: this does not compile
-      //static_assert(ttg::meta::detail::detector<ttg::meta::nonesuch, void, have_cuda_op_non_type_t, derivedT>::value_t::value);
-      // TODO: this does compile. WTF?!
-      //static_assert(test::meta::detail::detector<ttg::meta::nonesuch, void, have_cuda_op_non_type_t, derivedT>::value_t::value);
-      //static_assert(test::meta::is_detected<have_cuda_op_non_type_t, derivedT>::value);
-      //static_assert(derivedT::have_cuda_op);
-      //static_assert(detector<ttg::meta::nonesuch, void, have_cuda_op_non_type_t, derivedT>::value_t::value);
-      using t = ttg::meta::void_t<have_cuda_op_non_type_t<derivedT>>;
-
       task_t *task = (task_t*)parsec_task;
-      have_cuda_op_non_type_t<derivedT> tmp = 0;
-      std::cout << "static_op derived_has_cuda_op " << derived_has_cuda_op() << " detected " << test::meta::is_detected_v<have_cuda_op_non_type_t, derivedT>
-                << " have_cuda_op " << derivedT::have_cuda_op << " detected " << ttg::meta::detail::detector<ttg::meta::nonesuch, void, have_cuda_op_non_type_t, derivedT>::value_t::value << std::endl;
       void* suspended_task_address =
 #ifdef TTG_HAS_COROUTINE
         task->suspended_task_address;  // non-null = need to resume the task
@@ -1628,8 +1439,8 @@ namespace ttg_parsec {
 
         ttT *baseobj = task->tt;
         derivedT *obj = static_cast<derivedT *>(baseobj);
-        assert(parsec_ttg_caller == nullptr);
-        parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
+        assert(detail::parsec_ttg_caller == nullptr);
+        detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
         if (obj->tracing()) {
           if constexpr (!ttg::meta::is_void_v<keyT>)
             ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing");
@@ -1650,12 +1461,12 @@ namespace ttg_parsec {
         } else {
           ttg::abort();
         }
-        parsec_ttg_caller = nullptr;
+        detail::parsec_ttg_caller = nullptr;
       }
       else {  // resume the suspended coroutine
         auto coro = static_cast<ttg::device_task>(ttg::device_task_handle_type::from_address(suspended_task_address));
-        assert(parsec_ttg_caller == nullptr);
-        parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
+        assert(detail::parsec_ttg_caller == nullptr);
+        detail::parsec_ttg_caller = static_cast<detail::parsec_ttg_task_base_t*>(task);
         // TODO: unify the outputs tls handling
         auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor();
         task->tt->set_outputs_tls_ptr();
@@ -1665,7 +1476,7 @@ namespace ttg_parsec {
           suspended_task_address = nullptr;
         }
         task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
-        parsec_ttg_caller = nullptr;
+        detail::parsec_ttg_caller = nullptr;
 #if 0
 #ifdef TTG_HAS_COROUTINE
         auto ret = static_cast<ttg::resumable_task>(ttg::coroutine_handle<>::from_address(suspended_task_address));
@@ -1736,15 +1547,15 @@ namespace ttg_parsec {
       if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
         ttT *baseobj = (ttT *)task->object_ptr;
         derivedT *obj = (derivedT *)task->object_ptr;
-        assert(parsec_ttg_caller == NULL);
-        parsec_ttg_caller = task;
+        assert(detail::parsec_ttg_caller == NULL);
+        detail::parsec_ttg_caller = task;
         if constexpr (!ttg::meta::is_void_v<keyT>) {
           TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(task->key, obj->output_terminals));
         } else if constexpr (ttg::meta::is_void_v<keyT>) {
           TTG_PROCESS_TT_OP_RETURN(suspended_task_address, baseobj->template op<Space>(obj->output_terminals));
         } else  // unreachable
           ttg:: abort();
-        parsec_ttg_caller = NULL;
+        detail::parsec_ttg_caller = NULL;
       }
       else {
 #ifdef TTG_HAS_COROUTINE
@@ -1871,13 +1682,13 @@ namespace ttg_parsec {
       dummy->parsec_task.taskpool = world.impl().taskpool();
 
       /* save the current task and set the dummy task */
-      auto parsec_ttg_caller_save = parsec_ttg_caller;
-      parsec_ttg_caller = dummy;
+      auto parsec_ttg_caller_save = detail::parsec_ttg_caller;
+      detail::parsec_ttg_caller = dummy;
 
       /* iterate over the keys and have them use the copy we made */
       parsec_task_t *task_ring = nullptr;
       for (auto &&key : keylist) {
-        set_arg_local_impl<i>(key, *reinterpret_cast<valueT *>(copy->device_private), copy, &task_ring);
+        set_arg_local_impl<i>(key, *reinterpret_cast<valueT *>(copy->get_ptr()), copy, &task_ring);
       }
 
       if (nullptr != task_ring) {
@@ -1886,7 +1697,7 @@ namespace ttg_parsec {
       }
 
       /* restore the previous task */
-      parsec_ttg_caller = parsec_ttg_caller_save;
+      detail::parsec_ttg_caller = parsec_ttg_caller_save;
 
       /* release the dummy task */
       complete_task_and_release(es, &dummy->parsec_task);
@@ -1925,7 +1736,7 @@ namespace ttg_parsec {
           using decvalueT = std::decay_t<valueT>;
           if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
             detail::ttg_data_copy_t *copy = detail::create_new_datacopy(decvalueT{});
-            unpack(*static_cast<decvalueT *>(copy->device_private), msg->bytes, pos);
+            unpack(*static_cast<decvalueT *>(copy->get_ptr()), msg->bytes, pos);
 
             set_arg_from_msg_keylist<i, decvalueT>(ttg::span<keyT>(&keylist[0], num_keys), copy);
           } else {
@@ -1967,7 +1778,7 @@ namespace ttg_parsec {
                     set_arg_from_msg_keylist<i, decvalueT>(keylist, copy);
                     this->world.impl().decrement_inflight_msg();
                   });
-              auto &val = *static_cast<decvalueT *>(copy->device_private);
+              auto &val = *static_cast<decvalueT *>(copy->get_ptr());
 
               using ActivationT = std::decay_t<decltype(*activation)>;
 
@@ -2208,8 +2019,8 @@ namespace ttg_parsec {
 
       if( world_impl.dag_profiling() ) {
 #if defined(PARSEC_PROF_GRAPHER)
-        if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) {
-          int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value);
+        if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->dummy()) {
+          int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value);
           char orig_str[32];
           char dest_str[32];
           if(orig_index >= 0) {
@@ -2222,7 +2033,7 @@ namespace ttg_parsec {
                               .flow_index = 0, .flow_datatype_mask = ~0 };
           parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW,
                               .flow_index = 0, .flow_datatype_mask = ~0 };
-          parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest);
+          parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest);
         }
 #endif
       }
@@ -2242,7 +2053,7 @@ namespace ttg_parsec {
             copy = detail::create_new_datacopy(std::forward<Value>(value));
             task->copies[i] = copy;
           } else {
-            reducer(*reinterpret_cast<std::decay_t<valueT> *>(copy->device_private), value);
+            reducer(*reinterpret_cast<std::decay_t<valueT> *>(copy->get_ptr()), value);
           }
         } else {
           reducer();  // even if this was a control input, must execute the reducer for possible side effects
@@ -2263,8 +2074,8 @@ namespace ttg_parsec {
           }
 
           detail::ttg_data_copy_t *copy = copy_in;
-          if (nullptr == copy_in && nullptr != parsec_ttg_caller) {
-            copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+          if (nullptr == copy_in && nullptr != detail::parsec_ttg_caller) {
+            copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
           }
 
           if (nullptr != copy) {
@@ -2273,10 +2084,23 @@ namespace ttg_parsec {
           } else {
             copy = detail::create_new_datacopy(std::forward<Value>(value));
           }
+
+          /* if this is a host task make sure tracked buffers get copied to the host */
+          if constexpr(!derived_has_cuda_op()) {
+            int c = 0;
+            for (auto it : *copy) {
+              parsec_data_t *data = *it;
+              if (data->owner_device != 0) {
+                task->parsec_task.data[c].data_in = data->device_copies[0];
+                task->parsec_task.data[c].source_repo_entry = NULL;
+                ++c;
+              }
+            }
+          }
           /* if we registered as a writer and were the first to register with this copy
            * we need to defer the release of this task to give other tasks a chance to
            * make a copy of the original data */
-          release = (copy->push_task != &task->parsec_task);
+          release = (copy->get_next_task() != &task->parsec_task);
           task->copies[i] = copy;
         }
       }
@@ -2365,7 +2189,7 @@ namespace ttg_parsec {
 
     // Used to set the i'th argument
     template <std::size_t i, typename Key, typename Value>
-    void set_arg_impl(const Key &key, Value &&value) {
+    void set_arg_impl(const Key &key, Value &&value, detail::ttg_data_copy_t *copy_in = nullptr) {
       int owner;
 
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
@@ -2380,9 +2204,9 @@ namespace ttg_parsec {
         owner = keymap();
       if (owner == world.rank()) {
         if constexpr (!ttg::meta::is_void_v<keyT>)
-          set_arg_local<i, keyT, Value>(key, std::forward<Value>(value));
+          set_arg_local_impl<i, keyT, Value>(key, std::forward<Value>(value), copy_in);
         else
-          set_arg_local<i, keyT, Value>(std::forward<Value>(value));
+          set_arg_local_impl<i, keyT, Value>(std::forward<Value>(value), copy_in);
 #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
           if(world.impl().profiling()) {
             parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_set_arg_end, 0, 0, NULL);
@@ -2410,11 +2234,13 @@ namespace ttg_parsec {
         if constexpr (!ttg::has_split_metadata<decvalueT>::value) {
           pos = pack(value, msg->bytes, pos);
         } else {
-          detail::ttg_data_copy_t *copy;
-          copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+          detail::ttg_data_copy_t *copy = copy_in;
           if (nullptr == copy) {
-            // We need to create a copy for this data, as it does not exist yet.
-            copy = detail::create_new_datacopy(std::forward<Value>(value));
+            copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
+            if (nullptr == copy) {
+              // We need to create a copy for this data, as it does not exist yet.
+              copy = detail::create_new_datacopy(std::forward<Value>(value));
+            }
           }
           copy = detail::register_data_copy<decvalueT>(copy, nullptr, true);
 
@@ -2429,7 +2255,7 @@ namespace ttg_parsec {
           std::memcpy(msg->bytes + pos, &rank, sizeof(rank));
           pos += sizeof(rank);
 
-          auto iovecs = descr.get_data(*static_cast<decvalueT *>(copy->device_private));
+          auto iovecs = descr.get_data(*static_cast<decvalueT *>(copy->get_ptr()));
 
           int32_t num_iovs = std::distance(std::begin(iovecs), std::end(iovecs));
           std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs));
@@ -2486,8 +2312,8 @@ namespace ttg_parsec {
       }
 #endif
 #if defined(PARSEC_PROF_GRAPHER)
-      if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) {
-        int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value);
+      if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->dummy()) {
+        int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value);
         char orig_str[32];
         char dest_str[32];
         if(orig_index >= 0) {
@@ -2501,7 +2327,7 @@ namespace ttg_parsec {
         parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW,
                             .flow_index = 0, .flow_datatype_mask = ~0 };
         task_t *task = create_new_task(key);
-        parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest);
+        parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest);
         delete task;
       }
 #endif
@@ -2516,8 +2342,8 @@ namespace ttg_parsec {
 #endif
       parsec_task_t *task_ring = nullptr;
       detail::ttg_data_copy_t *copy = nullptr;
-      if (nullptr != parsec_ttg_caller) {
-        copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+      if (nullptr != detail::parsec_ttg_caller) {
+        copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
       }
 
       for (auto it = begin; it != end; ++it) {
@@ -2661,7 +2487,7 @@ namespace ttg_parsec {
         size_t metadata_size = sizeof(metadata);
 
         detail::ttg_data_copy_t *copy;
-        copy = detail::find_copy_in_task(parsec_ttg_caller, &value);
+        copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value);
         assert(nullptr != copy);
 
         parsec_taskpool_t *tp = world_impl.taskpool();
@@ -3347,7 +3173,7 @@ namespace ttg_parsec {
       self.release_task = &parsec_release_task_to_mempool_update_nbtasks;
       self.complete_execution = complete_task_and_release;
 
-      for (i = 0; i < numins; i++) {
+      for (i = 0; i < MAX_PARAM_COUNT; i++) {
         parsec_flow_t *flow = new parsec_flow_t;
         flow->name = strdup((std::string("flow in") + std::to_string(i)).c_str());
         flow->sym_type = PARSEC_SYM_INOUT;
@@ -3356,13 +3182,13 @@ namespace ttg_parsec {
         flow->dep_in[0] = NULL;
         flow->dep_out[0] = NULL;
         flow->flow_index = i;
-        flow->flow_datatype_mask = (1 << i);
+        flow->flow_datatype_mask = ~0;
         *((parsec_flow_t **)&(self.in[i])) = flow;
       }
-      *((parsec_flow_t **)&(self.in[i])) = NULL;
-      initialize_flows<input_terminals_type>(self.in);
+      //*((parsec_flow_t **)&(self.in[i])) = NULL;
+      //initialize_flows<input_terminals_type>(self.in);
 
-      for (i = 0; i < numouts; i++) {
+      for (i = 0; i < MAX_PARAM_COUNT; i++) {
         parsec_flow_t *flow = new parsec_flow_t;
         flow->name = strdup((std::string("flow out") + std::to_string(i)).c_str());
         flow->sym_type = PARSEC_SYM_INOUT;
@@ -3373,7 +3199,7 @@ namespace ttg_parsec {
         flow->flow_datatype_mask = (1 << i);
         *((parsec_flow_t **)&(self.out[i])) = flow;
       }
-      *((parsec_flow_t **)&(self.out[i])) = NULL;
+      //*((parsec_flow_t **)&(self.out[i])) = NULL;
 
       self.flags = 0;
       self.dependencies_goal = numins; /* (~(uint32_t)0) >> (32 - numins); */
@@ -3556,6 +3382,48 @@ namespace ttg_parsec {
         TTBase::invoke();
     }
 
+  private:
+    template<typename Key, typename Arg, typename... Args, std::size_t I, std::size_t... Is>
+    void invoke_arglist(std::index_sequence<I, Is...>, const Key& key, Arg&& arg, Args&&... args) {
+      using arg_type = std::decay_t<Arg>;
+      if constexpr (ttg::detail::is_ptr_v<arg_type>) {
+        /* add a reference to the object */
+        auto copy = ttg_parsec::detail::get_copy(arg);
+        copy->add_ref();
+        /* reset readers so that the value can flow without copying */
+        copy->reset_readers();
+        auto& val = *arg;
+        set_arg_impl<I>(key, val, copy);
+        if constexpr (std::is_rvalue_reference_v<Arg>) {
+          /* if the ptr was moved in we reset it */
+          arg.reset();
+        }
+      } else if constexpr (!ttg::detail::is_ptr_v<arg_type>) {
+        set_arg<I>(key, std::forward<Arg>(arg));
+      }
+      if constexpr (sizeof...(Is) > 0) {
+        /* recursive next argument */
+        invoke_arglist(std::index_sequence<Is...>{}, key, std::forward<Args>(args)...);
+      }
+    }
+
+  public:
+    // Manual injection of a task with all input arguments specified as a tuple
+    template <typename Key = keyT, typename Arg, typename... Args>
+    std::enable_if_t<!ttg::meta::is_void_v<Key> && !ttg::meta::is_empty_tuple_v<input_values_tuple_type>, void> invoke(
+        const Key &key, Arg&& arg, Args&&... args) {
+      static_assert(sizeof...(Args)+1 == std::tuple_size_v<actual_input_tuple_type>,
+                    "Number of arguments to invoke must match the number of task inputs.");
+      TTG_OP_ASSERT_EXECUTABLE();
+      /* trigger non-void inputs */
+      invoke_arglist(ttg::meta::nonvoid_index_seq<actual_input_tuple_type>{}, key,
+                     std::forward<Arg>(arg), std::forward<Args>(args)...);
+      //set_args(ttg::meta::nonvoid_index_seq<actual_input_tuple_type>{}, key, args);
+      /* trigger void inputs */
+      using void_index_seq = ttg::meta::void_index_seq<actual_input_tuple_type>;
+      set_args(void_index_seq{}, key, ttg::detail::make_void_tuple<void_index_seq::size()>());
+    }
+
     void set_defer_writer(bool value) {
       m_defer_writer = value;
     }
@@ -3719,31 +3587,34 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
  private:
   ttg_parsec::detail::ttg_data_copy_t *copy_to_remove = nullptr;
 
+
+
  public:
   ~value_copy_handler() {
     if (nullptr != copy_to_remove) {
-      ttg_parsec::detail::remove_data_copy(copy_to_remove, parsec_ttg_caller);
+      ttg_parsec::detail::remove_data_copy(copy_to_remove, ttg_parsec::detail::parsec_ttg_caller);
       ttg_parsec::detail::release_data_copy(copy_to_remove);
     }
   }
 
   template <typename Value>
   inline Value &&operator()(Value &&value) {
-    if (nullptr == parsec_ttg_caller) {
+    if (nullptr == ttg_parsec::detail::parsec_ttg_caller) {
       ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n");
     }
+    using value_type = std::remove_reference_t<Value>;
     ttg_parsec::detail::ttg_data_copy_t *copy;
-    copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value);
-    Value *value_ptr = &value;
+    copy = ttg_parsec::detail::find_copy_in_task(ttg_parsec::detail::parsec_ttg_caller, &value);
+    value_type *value_ptr = &value;
     if (nullptr == copy) {
       /**
        * the value is not known, create a copy that we can track
        * depending on Value, this uses either the copy or move constructor
        */
       copy = ttg_parsec::detail::create_new_datacopy(std::forward<Value>(value));
-      bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller);
+      bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller);
       assert(inserted);
-      value_ptr = reinterpret_cast<Value *>(copy->device_private);
+      value_ptr = reinterpret_cast<value_type *>(copy->get_ptr());
       copy_to_remove = copy;
     } else {
       /* this copy won't be modified anymore so mark it as read-only */
@@ -3754,11 +3625,11 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
 
   template <typename Value>
   inline const Value &operator()(const Value &value) {
-    if (nullptr == parsec_ttg_caller) {
+    if (nullptr == ttg_parsec::detail::parsec_ttg_caller) {
       ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n");
     }
     ttg_parsec::detail::ttg_data_copy_t *copy;
-    copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value);
+    copy = ttg_parsec::detail::find_copy_in_task(ttg_parsec::detail::parsec_ttg_caller, &value);
     const Value *value_ptr = &value;
     if (nullptr == copy) {
       /**
@@ -3766,30 +3637,103 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
        * depending on Value, this uses either the copy or move constructor
        */
       copy = ttg_parsec::detail::create_new_datacopy(value);
-      bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller);
+      bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller);
       assert(inserted);
-      value_ptr = reinterpret_cast<Value *>(copy->device_private);
+      value_ptr = reinterpret_cast<Value *>(copy->get_ptr());
       copy_to_remove = copy;
     }
     return *value_ptr;
   }
 
+#if 0
   /* we have to make a copy of non-const data as the user may modify it after
    * send/broadcast */
   template <typename Value, typename Enabler = std::enable_if_t<!std::is_const_v<Value>>>
   inline Value &operator()(Value &value) {
-    if (nullptr == parsec_ttg_caller) {
+    if (nullptr == ttg_parsec::detail::parsec_ttg_caller) {
       ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n");
     }
     /* the value is not known, create a copy that we can track */
     ttg_parsec::detail::ttg_data_copy_t *copy;
     copy = ttg_parsec::detail::create_new_datacopy(value);
-    bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller);
+    bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller);
     assert(inserted);
-    Value *value_ptr = reinterpret_cast<Value *>(copy->device_private);
+    Value *value_ptr = reinterpret_cast<Value *>(copy->get_ptr());
     copy_to_remove = copy;
     return *value_ptr;
   }
+
+  /**
+   * Overload for PersistentView objects
+   *
+   * TODO: make sure the device copy is current?!
+   */
+
+  template <typename Value>
+  inline Value &operator()(ttg::PersistentView<Value> &&ptr) {
+    Value& value_ref = *ptr;
+
+    /* register the copy with the task so that the TT can find it */
+    ttg_parsec::detail::ttg_data_copy_t *copy;
+    copy = ptr.get_copy();
+    bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller);
+    if (inserted) {
+      copy_to_remove = copy;
+      /* if this is the first time we see this copy again we add the TTG reference */
+      if (0 == copy->num_readers()) {
+        copy->add_ref();
+      }
+      /* steal the copy from the ptr */
+      ptr.reset();
+    }
+
+    return value_ref;
+  }
+
+  template <typename Value>
+  inline Value &operator()(ttg::PersistentView<Value> &ptr) {
+    Value& value_ref = *ptr;
+
+    /* register the copy with the task so that the TT can find it */
+    ttg_parsec::detail::ttg_data_copy_t *copy;
+    copy = ptr.get_copy();
+    bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller);
+    if (inserted) {
+      copy_to_remove = copy;
+      /* if this is the first time we see this copy again we add the TTG reference */
+      if (0 == copy->readers) {
+        copy->add_ref();
+      }
+    }
+    return value_ref;
+  }
+
+  template <typename Value>
+  inline const Value &operator()(const ttg::PersistentView<Value> &ptr) {
+    const Value& value_ref = *ptr;
+    /* register the copy with the task so that the TT can find it */
+    ttg_parsec::detail::ttg_data_copy_t *copy;
+    copy = ptr.get_copy();
+    bool inserted = ttg_parsec::detail::add_copy_to_task(copy, ttg_parsec::detail::parsec_ttg_caller);
+    if (inserted) {
+      copy_to_remove = copy;
+      /* if this is the first time we see this copy again we add the TTG reference */
+      if (0 == copy->num_readers()) {
+        copy->add_ref();
+      }
+    }
+    return value_ref;
+  }
+
+  /**
+   * Overload for ttg::ptr objects
+   *
+   * TODO: implement :)
+   *
+   * TODO: make sure the host copies are current?!
+   */
+#endif // 0
+
 };
 
 #endif  // PARSEC_TTG_H_INCLUDED
diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h
index 88d5531f3..66c907356 100644
--- a/ttg/ttg/parsec/ttg_data_copy.h
+++ b/ttg/ttg/parsec/ttg_data_copy.h
@@ -5,36 +5,102 @@
 #include <limits>
 #include <vector>
 #include <iterator>
+#include <atomic>
+#include <type_traits>
 
 #include <parsec.h>
 
+#include "ttg/parsec/thread_local.h"
+
 
 namespace ttg_parsec {
 
   namespace detail {
 
-    /* Extension of PaRSEC's data copy. Note that we use the readers field
-    * to facilitate the ref-counting of the data copy.
-    * TODO: create abstractions for all fields in parsec_data_copy_t that we access.
-    */
-    struct ttg_data_copy_t : public parsec_data_copy_t {
-#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
-      int64_t size;
-      int64_t uid;
-#endif
+    /* Non-owning copy-tracking wrapper, accounting for N readers or 1 writer.
+     * Also counts external references, which are not treated as
+     * readers or writers but merely prevent the object from being
+     * destroyed once no readers/writers exist.
+     */
+    struct ttg_data_copy_t {
 
       /* special value assigned to parsec_data_copy_t::readers to mark the copy as
       * mutable, i.e., a task will modify it */
       static constexpr int mutable_tag = std::numeric_limits<int>::min();
 
+      ttg_data_copy_t()
+      {
+        /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */
+        ttg_data_copy_container() = this;
+      }
+
+      ttg_data_copy_t(const ttg_data_copy_t& c) {
+        /* we allow copying but do not copy any data over from the original
+         * device copies will have to be allocated again
+         * and it's a new object to reference */
+
+        /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */
+        ttg_data_copy_container() = this;
+      }
+
+      ttg_data_copy_t(ttg_data_copy_t&& c)
+      : m_ptr(c.m_ptr)
+      , m_next_task(c.m_next_task)
+      , m_readers(c.m_readers)
+      , m_refs(c.m_refs.load(std::memory_order_relaxed))
+      , m_dev_data(std::move(c.m_dev_data))
+      , m_single_dev_data(c.m_single_dev_data)
+      , m_num_dev_data(c.m_num_dev_data)
+      {
+        c.m_num_dev_data = 0;
+        c.m_readers = 0;
+        c.m_single_dev_data = nullptr;
+
+        /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */
+        ttg_data_copy_container() = this;
+      }
+
+      ttg_data_copy_t& operator=(ttg_data_copy_t&& c)
+      {
+        m_ptr = c.m_ptr;
+        c.m_ptr = nullptr;
+        m_next_task = c.m_next_task;
+        c.m_next_task = nullptr;
+        m_readers = c.m_readers;
+        c.m_readers = 0;
+        m_refs.store(c.m_refs.load(std::memory_order_relaxed), std::memory_order_relaxed);
+        c.m_refs.store(0, std::memory_order_relaxed);
+        m_dev_data = std::move(c.m_dev_data);
+        m_single_dev_data = c.m_single_dev_data;
+        c.m_single_dev_data = nullptr;
+        m_num_dev_data = c.m_num_dev_data;
+        c.m_num_dev_data = 0;
+        /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */
+        ttg_data_copy_container() = this;
+        return *this;
+      }
+
+      ttg_data_copy_t& operator=(const ttg_data_copy_t& c) {
+        /* we allow copying but do not copy any data over from the original
+         * device copies will have to be allocated again
+         * and it's a new object to reference */
+
+        /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */
+        ttg_data_copy_container() = this;
+        return *this;
+      }
+
+      /* mark destructor as virtual */
+      virtual ~ttg_data_copy_t() = default;
+
       /* Returns true if the copy is mutable */
       bool is_mutable() const {
-        return this->readers == mutable_tag;
+        return m_readers == mutable_tag;
       }
 
       /* Mark the copy as mutable */
       void mark_mutable() {
-        this->readers = mutable_tag;
+        m_readers = mutable_tag;
       }
 
       /* Increment the reader counter and return previous value
@@ -43,9 +109,11 @@ namespace ttg_parsec {
       template<bool Atomic = true>
       int increment_readers() {
         if constexpr(Atomic) {
-          return parsec_atomic_fetch_inc_int32(&this->readers);
+          //return parsec_atomic_fetch_inc_int32(&m_readers);
+          std::atomic_ref<int32_t> a{m_readers};
+          return a.fetch_add(1, std::memory_order_relaxed);
         } else {
-          return this->readers++;
+          return m_readers++;
         }
       }
 
@@ -53,7 +121,7 @@ namespace ttg_parsec {
       * Reset the number of readers to read-only with a single reader.
       */
       void reset_readers() {
-        this->readers = 1;
+        m_readers = 1;
       }
 
       /* Decrement the reader counter and return previous value.
@@ -62,84 +130,122 @@ namespace ttg_parsec {
       template<bool Atomic = true>
       int decrement_readers() {
         if constexpr(Atomic) {
-          return parsec_atomic_fetch_dec_int32(&this->readers);
+          //return parsec_atomic_fetch_dec_int32(&m_readers);
+          std::atomic_ref<int32_t> a{m_readers};
+          return a.fetch_sub(1, std::memory_order_relaxed);
         } else {
-          return this->readers--;
+          return m_readers--;
         }
       }
 
       /* Returns the number of readers if the copy is immutable, or \c mutable_tag
       * if the copy is mutable */
       int num_readers() const {
-        return this->readers;
+        return m_readers;
       }
 
-      ttg_data_copy_t()
-      {
-        /* TODO: do we need this construction? */
-        PARSEC_OBJ_CONSTRUCT(this, parsec_data_copy_t);
-        this->readers = 1;
-        this->push_task = nullptr;
+      void *get_ptr() const {
+        return m_ptr;
       }
 
-      /* mark destructor as virtual */
-      virtual ~ttg_data_copy_t() = default;
+      parsec_task_t* get_next_task() const {
+        return m_next_task;
+      }
 
+      void set_next_task(parsec_task_t* task) {
+        m_next_task = task;
+      }
+
+      int32_t add_ref() {
+        return m_refs.fetch_add(1, std::memory_order_relaxed);
+      }
+
+      int32_t drop_ref() {
+        return m_refs.fetch_sub(1, std::memory_order_relaxed);
+      }
+
+      bool has_ref() {
+        return (m_refs.load(std::memory_order_relaxed) != 0);
+      }
 
+      int32_t num_ref() const {
+        return m_refs.load(std::memory_order_relaxed);
+      }
 
       /* manage device copies owned by this object
        * we only touch the vector if we have more than one copies to track
        * and otherwise use the single-element member.
        */
-      using iterator = parsec_data_copy_t**;
+      using iterator = parsec_data_t**;
 
-      void add_device_copy(parsec_data_copy_t* copy) {
+      void add_device_data(parsec_data_t* data) {
         // TODO: properly release again!
-        PARSEC_OBJ_RETAIN(copy);
-        switch (m_num_dev_copies) {
+        PARSEC_OBJ_RETAIN(data);
+        switch (m_num_dev_data) {
           case 0:
-            m_single_dev_copy = copy;
+            m_single_dev_data = data;
             break;
           case 1:
             /* move single copy into vector and add new copy below */
-            m_dev_copies.push_back(m_single_dev_copy);
+            m_dev_data.push_back(m_single_dev_data);
             /* fall-through */
           default:
             /* store in multi-copy vector */
-            m_dev_copies.push_back(copy);
+            m_dev_data.push_back(data);
             break;
         }
-        m_num_dev_copies++;
+        m_num_dev_data++;
       }
 
-      int num_dev_copies() const {
-        return m_num_dev_copies;
+      void remove_device_data(parsec_data_t* data) {
+        if (m_num_dev_data == 1) {
+          m_single_dev_data = nullptr;
+        } else if (m_num_dev_data > 1) {
+          auto it = std::find(m_dev_data.begin(), m_dev_data.end(), data);
+          if (it != m_dev_data.end()) {
+            m_dev_data.erase(it);
+          }
+        }
+        --m_num_dev_data;
+      }
+
+      int num_dev_data() const {
+        return m_num_dev_data;
       }
 
       iterator begin() {
-        switch(m_num_dev_copies) {
+        switch(m_num_dev_data) {
           // no device copies
           case 0: return end();
-          case 1: return &m_single_dev_copy;
-          default: return m_dev_copies.data();
+          case 1: return &m_single_dev_data;
+          default: return m_dev_data.data();
         }
       }
 
       iterator end() {
-        switch(m_num_dev_copies) {
+        switch(m_num_dev_data) {
           case 0:
           case 1:
-            return &(m_single_dev_copy) + 1;
+            return &(m_single_dev_data) + 1;
           default:
-            return m_dev_copies.data() + m_dev_copies.size();
+            return m_dev_data.data() + m_dev_data.size();
         }
       }
 
-    private:
-      std::vector<parsec_data_copy_t*> m_dev_copies;   //< used if there are multiple device copies
-                                                       //  that belong to this object
-      parsec_data_copy_t* m_single_dev_copy;           //< used if there is a single device copy
-      int m_num_dev_copies = 0;                        //< number of device copies
+#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND)
+      int64_t size;
+      int64_t uid;
+#endif
+    protected:
+      void          *m_ptr;
+      parsec_task_t *m_next_task = nullptr;
+      int32_t        m_readers  = 1;
+      std::atomic<int32_t>  m_refs = 1; // number of entities referencing this copy (TTGs, external)
+
+      std::vector<parsec_data_t*> m_dev_data;   //< used if there are multiple device copies
+                                                  //  that belong to this object
+      parsec_data_t *m_single_dev_data;           //< used if there is a single device copy
+      int m_num_dev_data = 0;                   //< number of device copies
     };
 
 
@@ -150,14 +256,57 @@ namespace ttg_parsec {
     */
     template<typename ValueT>
     struct ttg_data_value_copy_t final : public ttg_data_copy_t {
-      using value_type = std::decay_t<ValueT>;
+      using value_type = ValueT;
       value_type m_value;
 
       template<typename T>
       ttg_data_value_copy_t(T&& value)
-      : ttg_data_copy_t(), m_value(std::forward<T>(value))
+      : ttg_data_copy_t()
+      , m_value(std::forward<T>(value))
+      {
+        this->m_ptr = const_cast<value_type*>(&m_value);
+        /* reset the container tracker */
+        ttg_data_copy_container() = nullptr;
+      }
+
+      ttg_data_value_copy_t(ttg_data_value_copy_t&& c)
+        noexcept(std::is_nothrow_move_constructible_v<value_type>)
+      : ttg_data_copy_t(std::move(c))
+      , m_value(std::move(c.m_value))
       {
-        this->device_private = const_cast<value_type*>(&m_value);
+        /* reset the container tracker */
+        ttg_data_copy_container() = nullptr;
+      }
+
+      ttg_data_value_copy_t(const ttg_data_value_copy_t& c)
+        noexcept(std::is_nothrow_copy_constructible_v<value_type>)
+      : ttg_data_copy_t(c)
+      , m_value(c.m_value)
+      {
+        /* reset the container tracker */
+        ttg_data_copy_container() = nullptr;
+      }
+
+      ttg_data_value_copy_t& operator=(ttg_data_value_copy_t&& c)
+        noexcept(std::is_nothrow_move_assignable_v<value_type>)
+      {
+        ttg_data_copy_t::operator=(std::move(c));
+        m_value = std::move(c.m_value);
+        /* reset the container tracker */
+        ttg_data_copy_container() = nullptr;
+      }
+
+      ttg_data_value_copy_t& operator=(const ttg_data_value_copy_t& c)
+        noexcept(std::is_nothrow_copy_assignable_v<value_type>)
+      {
+        ttg_data_copy_t::operator=(c);
+        m_value = c.m_value;
+        /* reset the container tracker */
+        ttg_data_copy_container() = nullptr;
+      }
+
+      value_type& operator*() {
+        return m_value;
       }
 
       /* will destruct the value */
@@ -168,4 +317,4 @@ namespace ttg_parsec {
 
 } // namespace ttg_parsec
 
-#endif // TTG_DATA_COPY_H
+#endif // TTG_DATA_COPY_H
\ No newline at end of file
diff --git a/ttg/ttg/ptr.h b/ttg/ttg/ptr.h
new file mode 100644
index 000000000..3cc699f60
--- /dev/null
+++ b/ttg/ttg/ptr.h
@@ -0,0 +1,118 @@
+#ifndef TTG_PTR_H
+#define TTG_PTR_H
+
+#include "ttg/impl_selector.h"
+
+namespace ttg {
+
+template<typename T>
+using Ptr = TTG_IMPL_NS::ptr<T>;
+
+template<typename T, typename... Args>
+Ptr<T> make_ptr(Args&&... args) {
+  return TTG_IMPL_NS::make_ptr(std::forward<Args>(args)...);
+}
+
+template<typename T>
+auto get_ptr(T&& obj) {
+  return TTG_IMPL_NS::get_ptr(std::forward<T>(obj));
+}
+
+namespace detail {
+    template<typename T>
+    struct is_ptr : std::false_type
+    { };
+
+    template<typename T>
+    struct is_ptr<ttg::Ptr<T>> : std::true_type
+    { };
+
+    template<typename T>
+    constexpr bool is_ptr_v = is_ptr<T>::value;
+
+} // namespace detail
+
+#if 0
+namespace detail {
+
+    /* awaiter for ttg::get_ptr with multiple arguments
+     * operator co_wait will return the tuple of ttg::Ptr
+     */
+    template<typename... Ts>
+    struct get_ptr_tpl_t {
+    private:
+      std::tuple<ttg::Ptr<Ts>...> m_ptr_tuple;
+      bool m_is_ready = false;
+    public:
+      get_ptr_tpl_t(bool is_ready, std::tuple<ttg::ptr<Ts>...>&& ptrs)
+      : m_ptr_tuple(std::forward<std::tuple<ttg::Ptr<Ts>...>>(ptrs))
+      , m_is_ready(is_ready)
+      { }
+
+      bool await_ready() const noexcept {
+        return m_is_ready;
+      }
+
+      constexpr void await_suspend( std::coroutine_handle<> ) const noexcept {
+        /* TODO: anything to be done here? */
+      }
+
+      auto await_resume() const noexcept {
+        return std::move(m_ptr_tuple);
+      }
+    };
+
+
+    /* awaiter for ttg::get_ptr for a single argument */
+    template<typename T>
+    struct get_ptr_t {
+    private:
+      ttg::Ptr<T> m_ptr;
+      bool m_is_ready = false;
+    public:
+      get_ptr_t(bool is_ready, ttg::Ptr<T>&& ptr)
+      : m_ptr(std::forward<ttg::Ptr<T>>(ptr))
+      , m_is_ready(is_ready)
+      { }
+
+      bool await_ready() const noexcept {
+        return m_is_ready;
+      }
+
+      constexpr void await_suspend( std::coroutine_handle<> ) const noexcept {
+        /* TODO: anything to be done here? */
+      }
+
+      auto await_resume() const noexcept {
+        return std::move(m_ptr);
+      }
+    };
+  } // namespace detail
+
+  /**
+   * Get an awaiter that results in a ttg::Ptr to a task argument.
+   * Must only be called inside a task on a value that was passed
+   * to the task and has not yet been moved on.
+   * Should be used in conjunction with co_await, e.g.,
+   * ttg::Ptr<double> ptr = co_await ttg::get_ptr(val);
+   *
+   * Multiple value can be passed, which results in a tuple of ptr:
+   * ttg::Ptr<double> ptr1, ptr2;
+   * std::tie(ptr1, ptr2) = co_await ttg::get_ptr(val1, val2);
+   */
+  template<typename Arg, typename... Args>
+  auto get_ptr(Arg&& arg, Args&&... args) {
+    bool is_ready;
+    using tpl_type    = std::tuple<ttg::Ptr<std::decay_t<Arg>, std::decay<Args>...>>;
+    using result_type = std::pair<bool, tpl_type>;
+    result_type p = TTG_IMPL_NS::get_ptr(std::forward<Arg>(arg), std::forward<Args>(args)...);
+    if constexpr (sizeof...(Args) > 0) {
+      return detail::get_ptr_tpl_t<std::decay_t<Arg>, std::decay_t<Args>...>(p.first, std::move(p.second));
+    } else if constexpr (sizeof...(Args) == 0) {
+      return detail::get_ptr_t<std::decay_t<Arg>>(p.first, std::move(std::get<0>(p.second)));
+    }
+  }
+#endif // 0
+} // namespace ttg
+
+#endif // TTG_PTR_H
\ No newline at end of file
diff --git a/ttg/ttg/run.h b/ttg/ttg/run.h
index 21ec337e8..b4dc593de 100644
--- a/ttg/ttg/run.h
+++ b/ttg/ttg/run.h
@@ -57,6 +57,7 @@ namespace ttg {
   inline void finalize() { TTG_IMPL_NS::ttg_finalize(); }
 
   /// Aborts the TTG program using the default backend's `ttg_abort` method
+  [[noreturn]]
   inline void abort() { TTG_IMPL_NS::ttg_abort(); }
 
   /// Accesses the default backend's default execution context
diff --git a/ttg/ttg/view.h b/ttg/ttg/view.h
index 11cb3dcde..4275b3fa0 100644
--- a/ttg/ttg/view.h
+++ b/ttg/ttg/view.h
@@ -5,13 +5,19 @@
 #include <type_traits>
 #include <span>
 
+#include "ttg/ptr.h"
+
 namespace ttg {
 
   enum class ViewScope {
-    Allocate  = 0x0,
-    SyncIn    = 0x1,
-    SyncOut   = 0x2,
-    SyncInOut = 0x3
+    Allocate     = 0x0,  //< memory allocated as scratch, but not moved in or out
+    Available    = 0x1,  //< data will be reused on device if available, transferred to device otherwise
+    SyncIn       = 0x2,  //< data will be allocated on and transferred to device
+                         //< if latest version resides on the device (no previous sync-out) the data will
+                         //< not be transferred again
+    SyncOut      = 0x4,  //< value will be transferred from device to host after kernel completes
+    SyncInOut    = 0x8,  //< data will be moved in and synchronized back out after the kernel completes
+    AvailableOut = 0x16, //< similar to Available and data is transferred back to device after kernel completes
   };
 
   /**
@@ -20,17 +26,17 @@ namespace ttg {
    * to remove the type and convert to void pointers instead.
    */
   template<typename T, typename = void>
-  struct ViewSpan;
+  struct ViewPart;
 
   template<>
-  struct ViewSpan<void, void> {
+  struct ViewPart<void, void> {
 
     using element_type  = void;
     using value_type    = void;
-    using viewspan_type = ViewSpan<value_type>;
+    using view_part_type = ViewPart<value_type>;
 
-    constexpr ViewSpan() = default;
-    constexpr ViewSpan(void* ptr, std::size_t size, ViewScope scope = ViewScope::SyncIn)
+    constexpr ViewPart() = default;
+    constexpr ViewPart(void* ptr, std::size_t size, ViewScope scope = ViewScope::SyncIn)
     : m_data_ptr(ptr)
     , m_size(size)
     , m_scope(scope)
@@ -72,15 +78,15 @@ namespace ttg {
 
 
   template<typename T>
-  struct ViewSpan<T, void> : public ViewSpan<void, void> {
+  struct ViewPart<T, void> : public ViewPart<void, void> {
 
     using element_type  = T;
     using value_type    = std::remove_cv_t<T>;
-    using viewspan_type = ViewSpan<value_type>;
+    using view_part_type = ViewPart<value_type>;
 
-    constexpr ViewSpan() = default;
-    constexpr ViewSpan(T* ptr, std::size_t size, ViewScope scope = ViewScope::SyncIn)
-    : ViewSpan<void, void>(ptr, size, scope)
+    constexpr ViewPart() = default;
+    constexpr ViewPart(T* ptr, std::size_t size, ViewScope scope = ViewScope::SyncIn)
+    : ViewPart<void, void>(ptr, size, scope)
     { }
 
     constexpr T* data() const {
@@ -89,13 +95,22 @@ namespace ttg {
 
   };
 
-  template<typename HostT, typename... DevTypeTs>
+  namespace detail {
+    template<typename T>
+    struct view_trait
+    {
+      static constexpr bool is_view = false;
+      static constexpr bool is_persistent = false;
+    };
+  } // namespace detail
+
+  template<typename HostT, typename DevT = HostT, typename... DevTypeTs>
   struct View {
 
-    using span_tuple_type  = std::tuple<ttg::ViewSpan<DevTypeTs>...>;
+    using span_tuple_type  = std::tuple<ttg::ViewPart<DevT>, ttg::ViewPart<DevTypeTs>...>;
     using host_type = HostT;
 
-    using view_type = View<HostT, DevTypeTs...>;
+    using view_type = View<HostT, DevT, DevTypeTs...>;
 
     constexpr static std::size_t num_spans = std::tuple_size_v<span_tuple_type>;
 
@@ -106,20 +121,36 @@ namespace ttg {
     , m_spans({std::get<Is>(spans)...})
     { }
 
+    View(HostT& obj, span_tuple_type spans)
+    : View(obj, spans, std::make_index_sequence<sizeof...(DevTypeTs)+1>{})
+    {
+      /* TODO: let the runtime handle the view */
+      //ttg::detail::register_view(*this);
+    }
+
+    /* hidden so that users cannot create views outside of a task */
+    template<typename T, typename... ArgsT>
+    friend auto make_view(T& obj, ViewPart<ArgsT>... spans);
+
   public:
 
-    constexpr View() = default;
+    constexpr View() = delete;
 
-    View(HostT& obj, span_tuple_type spans)
-    : View(obj, spans, std::index_sequence_for<DevTypeTs...>{})
-    { }
+    /* move ctor deleted to prevent moving out of a task */
+    View(view_type&&) = delete;
 
-    View(view_type&&) = default;
+    /* copy ctor deleted to prevent copying out of a task */
+    View(const view_type&) = delete;
 
-    View(const view_type&) = default;
+    ~View() {
+      /* TODO: let the runtime remove the view */
+      //ttg::detail::drop_view(*this);
+    }
 
-    view_type& operator=(view_type&&) = default;
-    view_type& operator=(const view_type&) = default;
+    /* move operator deleted to prevent moving out of a task */
+    view_type& operator=(view_type&&) = delete;
+    /* copy operator deleted to prevent moving out of a task */
+    view_type& operator=(const view_type&) = delete;
 
     template<std::size_t i>
     auto get_device_ptr() {
@@ -137,7 +168,7 @@ namespace ttg {
     }
 
     template<std::size_t i>
-    auto& get_viewspan() const {
+    auto& get_ViewPart() const {
       return std::get<i>(m_spans);
     }
 
@@ -158,29 +189,38 @@ namespace ttg {
       return num_spans;
     }
 
-    /* return a std::span of type-punned ViewSpans */
-    std::span<ViewSpan<void>> view_spans() {
+    /* return a std::span of type-punned ViewParts */
+    std::span<ViewPart<void>> view_spans() {
       return {m_spans.begin(), m_spans.end()};
     }
 
   private:
     HostT* m_obj = nullptr;
     /* type-punned storage, cast to actual types in get_device_ptr */
-    std::array<ViewSpan<void>, num_spans> m_spans;
+    std::array<ViewPart<void>, num_spans> m_spans;
     //span_tuple_type m_spans{};
   };
 
-  template<typename HostT, typename... ViewSpanTs>
-  auto make_view(HostT& obj, ViewSpan<ViewSpanTs>... spans) {
+  template<typename HostT, typename... ViewPartTs>
+  auto make_view(HostT& obj, ViewPart<ViewPartTs>... spans) {
     return View(obj, std::make_tuple(std::move(spans)...));
   }
 
   /* overload for trivially-copyable host objects */
   template<typename HostT, typename = std::enable_if_t<std::is_trivially_copyable_v<HostT>>>
   auto make_view(HostT& obj, ViewScope scope = ViewScope::SyncIn) {
-    return View<HostT, HostT>(obj, std::make_tuple(ViewSpan<HostT>(&obj, sizeof(HostT), scope)));
+    return make_view(obj, ViewPart<HostT>(&obj, sizeof(HostT), scope));
   }
 
+  namespace detail {
+    template<typename HostT, typename... DevTs>
+    struct view_trait<View<HostT, DevTs...>>
+    {
+      static constexpr bool is_view = true;
+      static constexpr bool is_persistent = false;
+    };
+  } // namespace detail
+
   /* yielded when waiting on a kernel to complete */
   struct device_op_wait_kernel
   { };
@@ -196,7 +236,7 @@ namespace ttg {
   /* type-punned version of the View, providing access to the object
    * pointer and a std::span over the views of that object */
   struct device_obj_view {
-    using span_type = std::span<ViewSpan<void>>;
+    using span_type = std::span<ViewPart<void>>;
     using iterator = typename span_type::iterator;
 
     device_obj_view(void *obj, span_type&& span)
@@ -221,6 +261,190 @@ namespace ttg {
     span_type m_span;
   };
 
+
+  /**
+   * A view that is persistent and can be copied in and out of the TTG
+   */
+  template<typename HostT, typename DevT = HostT, typename... DevTypeTs>
+  struct PersistentView {
+
+    using span_tuple_type  = std::tuple<ttg::ViewPart<DevT>, ttg::ViewPart<DevTypeTs>...>;
+    using host_type = HostT;
+
+    using view_type = PersistentView<HostT, DevT, DevTypeTs...>;
+
+    using ptr_type = ttg::ptr<host_type>;
+
+    constexpr static std::size_t num_spans = std::tuple_size_v<span_tuple_type>;
+
+  private:
+    template<std::size_t... Is>
+    PersistentView(ptr_type&& ptr, span_tuple_type& spans, std::index_sequence<Is...>)
+    : m_ptr(std::forward<ptr_type>(ptr))
+    , m_spans({std::get<Is>(spans)...})
+    { }
+
+  public:
+
+    constexpr PersistentView() = default;
+
+    PersistentView(ptr_type ptr, span_tuple_type spans)
+    : PersistentView(std::move(ptr), spans, std::make_index_sequence<sizeof...(DevTypeTs)+1>{})
+    {
+      /* TODO: let the runtime handle the view */
+      //ttg::detail::register_view(*this);
+    }
+
+    PersistentView(view_type&&) = default;
+
+    PersistentView(const view_type&) = default;
+
+    ~PersistentView() {
+      /* TODO: let the runtime remove the view */
+      //ttg::detail::drop_view(*this);
+    }
+
+    view_type& operator=(view_type&&) = default;
+    view_type& operator=(const view_type&) = default;
+
+    template<std::size_t i>
+    auto get_device_ptr() {
+      return static_cast<std::tuple_element_t<i, span_tuple_type>::value_type*>(std::get<i>(m_spans).data());
+    }
+
+    template<std::size_t i>
+    const auto get_device_ptr() const {
+      return static_cast<std::tuple_element_t<i, span_tuple_type>::value_type>(std::get<i>(m_spans).data());
+    }
+
+    template<std::size_t i>
+    std::size_t get_device_size() const {
+      return std::get<i>(m_spans).size();
+    }
+
+    template<std::size_t i>
+    auto& get_ViewPart() const {
+      return std::get<i>(m_spans);
+    }
+
+    HostT& get_host_object() {
+      return *m_ptr;
+    }
+
+    const HostT& get_host_object() const {
+      return *m_ptr;
+    }
+
+    template<std::size_t i>
+    ViewScope get_scope() const {
+      return std::get<i>(m_spans).scope();
+    }
+
+    constexpr static std::size_t size() {
+      return num_spans;
+    }
+
+    ptr_type get_ptr() const {
+      return m_ptr;
+    }
+
+    /* return a std::span of type-punned ViewParts */
+    std::span<ViewPart<void>> view_spans() {
+      return {m_spans.begin(), m_spans.end()};
+    }
+
+  private:
+    ptr_type m_ptr;
+    /* type-punned storage, cast to actual types in get_device_ptr */
+    std::array<ViewPart<void>, num_spans> m_spans;
+    //span_tuple_type m_spans{};
+  };
+
+  template<typename HostT, typename... ViewPartTs>
+  auto make_persistent_view(HostT& obj, ViewPart<ViewPartTs>... spans) {
+    return PersistentView(obj, std::make_tuple(std::move(spans)...));
+  }
+
+  /* overload for trivially-copyable host objects */
+  template<typename HostT, typename = std::enable_if_t<std::is_trivially_copyable_v<HostT>>>
+  auto make_persistent_view(HostT& obj, ViewScope scope = ViewScope::SyncIn) {
+    return PersistentView<HostT, HostT>(obj, std::make_tuple(ViewPart<HostT>(&obj, sizeof(HostT), scope)));
+  }
+
+
+  namespace detail {
+    template<typename HostT, typename... DevTs>
+    struct view_trait<PersistentView<HostT, DevTs...>>
+    {
+      static constexpr bool is_view = true;
+      static constexpr bool is_persistent = true;
+    };
+  } // namespace detail
+
+
+  namespace detail {
+    /* TODO: is this still needed? */
+    template<typename... Ts>
+    struct await_t {
+      std::tuple<Ts&...> ties;
+    };
+  }
+
+  template<typename... Args>
+  inline auto make_await(Args&&... args) {
+    return detail::await_t{std::tie(std::forward<Args>(args)...)};
+  }
+
+  namespace detail {
+    template<typename... Ts>
+    struct to_device_t {
+      std::tuple<Ts&...> ties;
+    };
+  } // namespace detail
+
+  template<typename... Args>
+  inline auto to_device(Args&&... args) {
+    return detail::to_device_t{std::tie(std::forward<Args>(args)...)};
+  }
+
+  namespace detail {
+    template<typename... Ts>
+    struct to_host_t {
+      std::tuple<Ts&...> ties;
+    };
+  } // namespace detail
+
+  template<typename... Args>
+  inline auto to_host(Args&&... args) {
+    return detail::to_host_t{std::tie(std::forward<Args>(args)...)};
+  }
+
+  namespace detail {
+    template<typename... T>
+    struct wait_kernel_t;
+    template<>
+    struct wait_kernel_t<>
+    { };
+    template<typename T, typename... Ts>
+    struct wait_kernel_t<T, Ts...> {
+      std::tuple<T&, Ts&...> ties;
+    };
+  } // namespace detail
+
+  /* Wait for the kernel to complete */
+  inline auto wait_kernel() {
+    return detail::wait_kernel_t<>{};
+  }
+
+  /* Wait for kernel to complete and provided ttg::buffer
+   * to be transferred back to host */
+  template<typename... Buffers>
+  inline auto wait_kernel_out(Buffers&&... args) {
+    static_assert((ttg::detail::is_buffer_v<std::decay_t<Buffers>>&&...),
+                  "Only ttg::buffer can be explicitly waited on!");
+    return detail::wait_kernel_t<std::decay_t<Buffers>...>{std::tie(std::forward<Buffers>(args)...)};
+  }
+
   struct device_task_promise_type;
 
   using device_task_handle_type = TTG_CXX_COROUTINE_NAMESPACE::coroutine_handle<device_task_promise_type>;
@@ -302,6 +526,13 @@ namespace ttg {
       return yield_value(tmp_tuple);
     }
 
+    /* convenience-function to yield a single view */
+    template<typename HostT, typename... DeviceViewTs>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always yield_value(PersistentView<HostT, DeviceViewTs...> &view) {
+      auto tmp_tuple = std::tie(view);
+      return yield_value(tmp_tuple);
+    }
+
     /* waiting for the kernel to complete should always suspend */
     TTG_CXX_COROUTINE_NAMESPACE::suspend_always yield_value(device_op_wait_kernel) {
       std::cout << "yield_value: device_op_wait_kernel" << std::endl;
@@ -309,6 +540,71 @@ namespace ttg {
       return {};
     }
 
+    /* Allow co_await on a tuple */
+    template<typename... Views>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(std::tuple<Views&...> &views) {
+      return yield_value(views);
+    }
+
+    /* convenience-function to await a single view */
+    template<typename HostT, typename... DeviceViewTs>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(View<HostT, DeviceViewTs...> &view) {
+      auto tmp_tuple = std::tie(view);
+      return yield_value(tmp_tuple);
+    }
+
+    /* convenience-function to await a single view */
+    template<typename HostT, typename... DeviceViewTs>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(PersistentView<HostT, DeviceViewTs...> &view) {
+      auto tmp_tuple = std::tie(view);
+      return yield_value(tmp_tuple);
+    }
+
+    /* co_await for the kernel to complete should always suspend */
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(device_op_wait_kernel) {
+      std::cout << "yield_value: device_op_wait_kernel" << std::endl;
+      m_state = TTG_DEVICE_CORO_WAIT_KERNEL;
+      return {};
+    }
+
+    template<typename... Ts>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(detail::await_t<Ts...>&& a) {
+      bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
+      /* TODO: are we allowed to not suspend here and launch the kernel directly? */
+      m_state = TTG_DEVICE_CORO_WAIT_TRANSFER;
+      return {};
+    }
+
+    template<typename... Ts>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(detail::to_device_t<Ts...>&& a) {
+      bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
+      /* TODO: are we allowed to not suspend here and launch the kernel directly? */
+      m_state = TTG_DEVICE_CORO_WAIT_TRANSFER;
+      return {};
+    }
+
+    template<typename... Ts>
+    TTG_CXX_COROUTINE_NAMESPACE::suspend_always await_transform(detail::wait_kernel_t<ttg::buffer<Ts>...>&& a) {
+      std::cout << "yield_value: wait_kernel_t" << std::endl;
+      if constexpr (sizeof...(Ts) > 0) {
+        TTG_IMPL_NS::mark_device_out(a.ties);
+      }
+      m_state = TTG_DEVICE_CORO_WAIT_KERNEL;
+      return {};
+    }
+
+#if 0
+    template<typename... Ts>
+    auto await_transform(ttg::detail::get_ptr_tpl_t<Ts...>&& a) {
+      return a;
+    }
+
+    template<typename T>
+    auto await_transform(ttg::detail::get_ptr_t<T>&& a) {
+      return a;
+    }
+#endif // 0
+
     void return_void() {
       m_state = TTG_DEVICE_CORO_COMPLETE;
     }
@@ -346,47 +642,8 @@ namespace ttg {
 
   bool device_task::completed() { return base_type::promise().state() == TTG_DEVICE_CORO_COMPLETE; }
 
-  /// std::span mirrored between host and device memory
-  struct HDSpan {
-    HDSpan() = default;
-    HDSpan(std::byte* ptr, std::size_t nbytes) {
-      ptrs_[0] = ptr;
-      nbytes_ = nbytes;
-      last_touched_space_ = 0;
-    }
-
-    std::size_t nbytes() const { return nbytes_; }
-
-    const std::byte* host_data() const { return ptrs_[0]; }
-
-    std::byte* host_data() {
-      last_touched_space_ = 0;
-      return ptrs_[0];
-    }
-
-    const std::byte* device_data() const { return ptrs_[1]; }
-
-    std::byte* device_data() {
-      last_touched_space_ = 1;
-      return ptrs_[1];
-    }
-
-    void mark_synched() { last_touched_space_ = 2; }
-
-   private:
-    std::array<std::byte*, 2> ptrs_ = {nullptr, nullptr};
-    std::size_t nbytes_ = 0;
-    std::size_t last_touched_space_ = 2;
-  };
-
-  /// set of std::span's mirrored between host and device memory
-  template <std::size_t N = 10>
-  struct HDSpans {
-    HDSpans() = default;
-
-   private:
-    std::array<HDSpan, N> spans_;
-  };
+  struct device_wait_kernel
+  { };
 
 }  // namespace ttg