diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py
index b0e48b29d..93499bbab 100644
--- a/pyop2/gpu/cuda.py
+++ b/pyop2/gpu/cuda.py
@@ -85,10 +85,21 @@ def _kernel_args_(self):
 
 class Arg(Arg):
     """
-    Arg for GPU
+    Arg for GPU; adds nothing of its own to the base ``Arg`` it subclasses.
     """
 
 
+class ExtrudedSet(ExtrudedSet):
+    """
+    ExtrudedSet for GPU; ``_kernel_args_`` is a device copy of ``layers_array``.
+    """
+    @cached_property
+    def _kernel_args_(self):
+        layers_gpu = cuda.mem_alloc(int(self.layers_array.nbytes))
+        cuda.memcpy_htod(layers_gpu, self.layers_array)  # host -> device
+        return (layers_gpu,)
+
+
 class Dat(petsc_Dat):
     """
     Dat for GPU.
@@ -373,11 +384,9 @@ def argtypes(self):
     @cached_property
     def argshapes(self):
         argshapes = ((), ())
-        # argtypes += self._iterset._argtypes_
         if self._iterset._argtypes_:
-            raise NotImplementedError("Do not know what to do when"
-                    " self._iterset._argtypes is not empty, is this the case"
-                    " when we have extruded mesh")
+            # TODO: verify that this bogus value doesn't affect anyone.
+            argshapes += ((), )
 
         for arg in self._args:
             argshapes += (arg.data.shape, )
diff --git a/pyop2/gpu/snpt.py b/pyop2/gpu/snpt.py
index f22b1f184..66c72550f 100644
--- a/pyop2/gpu/snpt.py
+++ b/pyop2/gpu/snpt.py
@@ -3,7 +3,9 @@
 
 def snpt_transform(kernel, block_size):
     """
-    SNPT := Single 'n' Per Thread transformation.
+    SNPT := Single 'n' Per Thread.
+
+    Implements outer-loop parallelization strategy.
 
     PyOP2 uses 'n' as the outer loop iname. In Firedrake 'n' might denote
     either a cell or a DOF.