diff --git a/ndarray/Makefile b/ndarray/Makefile deleted file mode 100644 index dbfc673..0000000 --- a/ndarray/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -all: pygpu_ndarray.so - -PYTHONVERSION ?= $(shell python -c "import sys; print '%d.%d'%(sys.version_info[0], sys.version_info[1]")) -CUDA_ROOT ?= /opt/lisa/os/cuda -THEANO_ROOT ?= /u/bastienf/repos/Theano - - -CFLAGS=-g -DDEBUG -DOFFSET -# By default enable the OFFSET usage. Otherwise some test fail. -CFLAGS=-g -DOFFSET -#BINDIR=--compiler-bindir ${HOME}/.theano.nvcc-bindir - -#NPY_PATH!=python -c "import numpy;print numpy.__path__" -#NPY_INCLUDE=-I${NPY_PATH}/core/include -CUDA_INCLUDE=-I${CUDA_ROOT}/include -PYTHON_INCLUDE=-I$(shell python -c "import distutils.sysconfig;print distutils.sysconfig.get_python_inc()") -INCLUDES=${CUDA_INCLUDE} ${PYTHON_INCLUDE} -CUDA_FLAGS=-Xlinker -rpath,${CUDA_ROOT}/lib64 -Xlinker -rpath,${CUDA_ROOT}/lib - -pygpu_language_cuda.o: pygpu_language_cuda.cu pygpu_language.h - nvcc -c ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ $< - -pygpu_ndarray.so: pygpu_ndarray.cpp pygpu_ndarray.h pygpu_language_cuda.o pygpu_ndarray_object.h - nvcc -shared ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ pygpu_language_cuda.o $< -lpython${PYTHONVERSION} -lcublas -lcudart - -clean: - rm -f pygpu_ndarray.so core.* *.o *~ - rm -rf build - -cleantmp: - rm -f core.* *.o *~ \ No newline at end of file diff --git a/ndarray/__init__.py b/ndarray/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/ndarray/gen_elemwise.py b/ndarray/gen_elemwise.py deleted file mode 100644 index d471eef..0000000 --- a/ndarray/gen_elemwise.py +++ /dev/null @@ -1,1911 +0,0 @@ -""" -This file implement 1 version of the elemwise op on the gpu. - -The elemwise fct are also used with scalar operation! So it can happen -that ndim is 0 as with all scalar type. -""" - - -import numpy -import pygpu_ndarray as gpu_ndarray -import StringIO - - -_CL_MODE = hasattr(gpu_ndarray, "set_opencl_context") - - -if _CL_MODE: - # THIS IS NOT FINISHED - import pyopencl as cl - import pyopencl.array as cl_array - - # import pyopencl._mymako as mako - from pyopencl._cluda import CLUDA_PREAMBLE - from pyopencl.tools import dtype_to_ctype - - # TODO: use mako to get rid of the %if - CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455] - CLUDA_PREAMBLE += """ -#define LDIM_0 get_local_size(0) -#define LDIM_1 get_local_size(1) -#define LDIM_2 get_local_size(2) - -#define GDIM_0 get_num_groups(0) -#define GDIM_1 get_num_groups(1) -#define GDIM_2 get_num_groups(2) - """ - # TODO, reuse the same context as the use used to create the memory. 
- ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) -else: - import pycuda.autoinit - import pycuda.driver as driver - - # import pycuda._mymako as mako - from pycuda._cluda import CLUDA_PREAMBLE - from pycuda.compiler import SourceModule - from pycuda.tools import dtype_to_ctype - CLUDA_PREAMBLE += """ -#define LDIM_0 blockDim.x -#define LDIM_1 blockDim.y -#define LDIM_2 blockDim.z - -#define GDIM_0 gridDim.x -#define GDIM_1 gridDim.y -#define GDIM_2 gridDim.z - """ - -import logging - -import theano -from theano import Apply, scalar -from theano.tensor import TensorType - - -_logger_name = 'compyte.gen_elemwise' -_logger = logging.getLogger(_logger_name) -_logger.setLevel(logging.INFO) -_logger.addHandler(logging.StreamHandler()) # TO REMOVE - - -def warning(*msg): - _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg)) - - -def info(*msg): - _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg)) - - -def debug(*msg): - _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg)) - - -if _CL_MODE: - gpu_ndarray.set_opencl_context(ctx.obj_ptr) - - -cast_int = numpy.intc -cast_uint = numpy.uintc - - -def _logical_scalar(x): - return numpy.all(x.type.broadcastable) - - -def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value', - data_str='ii_i%i_data[0]'): - l = [] - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - l += [value_str % ipos] - else: - l += [data_str % ipos] - return l - - -class WrapOpenCLFunction: - def __init__(self, fct): - self.fct = fct - - def _param_wrap(self, p): - if isinstance(p, MyGpuNdArray): - p = p.gpu_nd_array - if isinstance(p, gpu_ndarray.GpuNdArrayObject): - p = cl.MemoryObject.from_cl_mem_as_int(p.bytes) - return p - - def set_block_shape(self, *shape): - self.local_size = shape - - def param_set(self, *param): - self.param = [self._param_wrap(p) for p in param] - - def launch_grid(self, *global_shape): - global_size = global_shape + (1,) - - d = {"g_times_l": True} - return self.fct(queue, global_size, self.local_size, - *self.param, **d) - - -def compile_gpu_code(code, fct_name): - if _CL_MODE: - # Compile the gpu function with pyopencl - prg = cl.Program(ctx, code).build() - fct2 = getattr(prg, fct_name) - - fct = WrapOpenCLFunction(fct2) - else: - # Compile the gpu function with pycuda - mod = SourceModule(code) - fct = mod.get_function(fct_name) - return fct - - -class ElemwiseAlgo: - verbose = 0 # 1, 2 or 3 for more verbose output. - cache_version = () - cache_version = ('debug', 14, verbose) - - def __init__(self, scalar_op, inplace_pattern={}): - """ - :param scalar_op: the scalar operation to execute on each element. - """ - self.scalar_op = scalar_op - self.inplace_pattern = inplace_pattern - - def task_code(self, inputs, outputs, sio, - nodename, iname=None, oname=None): - if iname == None: - iname = get_str_list_logical_scalar(inputs) - if oname == None: - oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)] - print(self.scalar_op.c_code( - Apply(self.scalar_op, - [scalar.Scalar(dtype=input.type.dtype)() - for input in inputs], - [scalar.Scalar(dtype=output.type.dtype)() - for output in outputs]), - nodename + '_scalar_', - iname, - oname, - sub=dict(fail='return;')), file=sio) # TODO: set a failure code somehow!!! 
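# Illustrative usage sketch (kernel name, buffers and sizes are made up for
# the example): whichever backend is active, compile_gpu_code() returns an
# object that is driven through the same three calls call_elemwise() uses
# further down in this file.
#
#   fct = compile_gpu_code(kernel_source, "kernel_MyNode_1")
#   fct.set_block_shape(32, 1, 1)            # CUDA block / OpenCL local size
#   fct.param_set(cast_uint(num_els), cast_int(dim0),
#                 in0_buf, cast_int(in0_stride0),
#                 out0_buf, cast_int(out0_stride0))
#   fct.launch_grid(n_blocks, 1)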
- - def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"): - sio = StringIO.StringIO() - #print 'C_SRC_KERNEL', sio.getvalue() - - for ipos, i in enumerate(inputs): - print("// Input ", ipos, str(i.type), file=sio) - for ipos, i in enumerate(outputs): - print("// Output ", ipos, str(i.type), file=sio) - print(static, ( - f"KERNEL void kernel_{nodename}_{nd}(unsigned int numEls"), file=sio) - if (nd): - print("\t,", ", ".join("const int dim%i" % i - for i in range(nd)), file=sio) - #declare inputs - for ipos, i in enumerate(inputs): - s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % ( - dtype_to_ctype(i.dtype), ipos)] + - list("int i%i_str_%i" % (ipos, d) - for d in range(nd))) - print("\t,", s, file=sio) - #declare outputs - for ipos, i in enumerate(outputs): - s = ", ".join(["GLOBAL_MEM %s * o%i_data" % ( - dtype_to_ctype(i.dtype), ipos)] - + list("int o%i_str_%i" % (ipos, d) - for d in range(nd))) - print("\t,", s, file=sio) - #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) - # for d in xrange(nd)) - #print >> sio, "\t,", "float * o%i_data" % ipos - print("\t)\n{", file=sio) - print(" const int idx = GID_0 * LDIM_0 + LID_0;", file=sio) - print(" const int numThreads = LDIM_0 * GDIM_0;", file=sio) - - # For each input that is a scalar which has been broadcasted - # to a tensor, load it into a local variable - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - print(" const %s ii_i%i_value = i%i_data[0];" % ( - dtype_to_ctype(i.dtype), ipos, ipos), file=sio) - - #loop over the elements to be treated by this kernel call - print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio) - # calculate the data pointers for all arguments - print(" int ii = i;", file=sio) - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print((" GLOBAL_MEM const " - "%s * ii_i%i_data = i%i_data;" % ( - dtype_to_ctype(i.dtype), ipos, ipos)), file=sio) - for ipos, i in enumerate(outputs): - print(" GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % ( - dtype_to_ctype(i.dtype), ipos, ipos), file=sio) - for d in range(nd - 1, -1, -1): - if d > 0: - print(" int pos%i = ii %% dim%i;" % (d, d), file=sio) - print(" ii = ii / dim%i;" % d, file=sio) - else: - print(" int pos%i = ii;" % d, file=sio) - - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print((" ii_i" - "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)), file=sio) - for ipos, i in enumerate(outputs): - print(" ii_o%i_data += pos%i * o%i_str_%i;" % ( - ipos, d, ipos, d), file=sio) - - # perform the scalar operation on the input and output references - #TODO: What if the scalar_op needs support_code?? 
- self.task_code(inputs, outputs, sio, nodename) - print(" }", file=sio) - - #indent = " "*(4*d+7) - #for ipos, i in enumerate(inputs): - #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' - print("}", file=sio) - - #print sio.getvalue() - return sio.getvalue() - - def c_src_kernel_Ccontiguous(self, inputs, outputs, - nodename, static="static"): - nd = outputs[0].type.ndim - sio = StringIO.StringIO() - #print 'C_SRC_KERNEL', sio.getvalue() - - for ipos, i in enumerate(inputs): - print("// Input ", ipos, str(i.type), file=sio) - for ipos, i in enumerate(outputs): - print("// Output ", ipos, str(i.type), file=sio) - print(static, ("KERNEL void kernel_%s_Ccontiguous" - " (unsigned int numEls" % (nodename)), file=sio) - #declare inputs - for ipos, i in enumerate(inputs): - print("\t,", "GLOBAL_MEM const %s * i%i_data" % ( - dtype_to_ctype(i.dtype), ipos), file=sio) - #declare outputs - for ipos, i in enumerate(outputs): - print("\t,", "GLOBAL_MEM %s * o%i_data" % ( - dtype_to_ctype(i.dtype), ipos), file=sio) - print("\t)\n{", file=sio) - print(" const int idx = GID_0 * LDIM_0 + LID_0;", file=sio) - print(" const int numThreads = LDIM_0 * GDIM_0;", file=sio) - - # For each input that is a scalar which has been broadcasted - # to a tensor, load it into a local variable - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - print(" const %s ii_i%i_value = i%i_data[0];" % ( - dtype_to_ctype(i.dtype), ipos, ipos), file=sio) - - #loop over the elements to be treated by this kernel call - print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio) - # perform the scalar operation on the input and output references - #TODO: What if the scalar_op needs support_code?? - self.task_code(inputs, outputs, sio, nodename, - iname=get_str_list_logical_scalar( - inputs, data_str='i%i_data[i]'), - oname=['o%i_data[i]' % ipos - for ipos, i in enumerate(outputs)]) - print(" }", file=sio) - print("}", file=sio) - - #print sio.getvalue() - return sio.getvalue() - - def c_src_callkernel(self, inputs, outputs, nodename): - # - # This function serves three main goals: - # - # The first is stride unpacking: - # it accepts input and output arguments as - # float * , int* - # pairs, and it constructs a kernel function call where inputs - # and arguments are named like - # float *, int, int, int ... - # - # The second is to recognize when any dimensions can be collapsed as - # being contiguous. That mean that we can merge that dimensions with - # another one for all inputs/outputs and have the same retusuls - # (confusing... read code) - # - # The thrid is to make a special case for scalar element. We allow - # the collapsing of them. In the ccontiguous and not contiguous case, - # we use registers to lower the number of memory access. - - # TODO: make a special case for broadcasting, to store the - # data in shared memory. - - nd = outputs[0].type.ndim - nb_inputs = len(inputs) - nb_outputs = len(outputs) - d = dict() - # input_params and output_params go into the function - # declaration/definition - input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % ( - dtype_to_ctype(inputs[i].dtype), ipos, ipos) - for ipos in range(len(inputs))) - output_params = ", ".join("%s * o%i_data, const int * o%i_str" % ( - dtype_to_ctype(outputs[i].dtype), - ipos, ipos) - for ipos in range(len(outputs))) - - #input_args and output_args go into the recursive call. 
- input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos) - for ipos in range(len(inputs))) - output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos) - for ipos in range(len(outputs))) - - prod_dims = '*'.join(["dims[%i]" % di for di in range(nd)] + ['1']) - - sio = StringIO.StringIO() - print(""" - static void can_collapse_%(nodename)s(int nd, const int * dims, - const int * strides, - int collapse[]) - { - //can we collapse dims[i] and dims[i-1] - for(int i=nd-1;i>0;i--){ - if(strides[i]*dims[i]==strides[i-1]){ - //the dims nd-1 are not strided again dimension nd - collapse[i]=1; - }else collapse[i]=0; - } - } - """ % locals(), file=sio) - print(""" - static int callkernel_%(nodename)s(unsigned int numEls, const int d, - const int * dims, - %(input_params)s, - %(output_params)s) - { - numEls = %(prod_dims)s; - """ % locals(), file=sio) - if self.verbose: - print(""" - std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; - """ % locals(), file=sio) - print('std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di - for di in range(nd)) + ["'\\n';"]), file=sio) - if self.verbose > 1: - for ipos in range(len(inputs)): - print(""" - std::cerr << " %(ipos)s data strides" << - """ % locals() + " << ' ' << ".join(["i%s_data" % ipos] - + list("i%s_str[%i]" % (ipos, di) - for di in range(nd))) + ''' << "\\n"; ''', file=sio) - - for ipos in range(len(outputs)): - print(""" - std::cerr << " %(ipos)s data strides" << - """ % locals() + " << ' ' << ".join(["o%s_data" % ipos] - + list("o%s_str[%i]" % (ipos, di) - for di in range(nd))) + ''' << "\\n"; ''', file=sio) - # collapse dimension that are broadcast in all inputs. - # need to be done before contiguous collapse as it will break it. - # do the dimensions and the strides - print(""" - int local_dims[%(nd)s]; - int local_str[%(nb_inputs)s][%(nd)s]; - int local_ostr[%(nb_inputs)s][%(nd)s]; - int nd_collapse = %(nd)s; - for(int i=0;i<%(nd)s;i++){//init new dim - local_dims[i]=dims[i]; - } - """ % locals(), file=sio) - for ipos in range(len(inputs)): - print(""" - for(int i=0;i<%(nd)s;i++){//init new strides - local_str[%(ipos)s][i]=i%(ipos)s_str[i]; - } - """ % locals(), file=sio) - for ipos in range(len(outputs)): - print(""" - for(int i=0;i<%(nd)s;i++){//init new strides - local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; - } - """ % locals(), file=sio) - if self.verbose > 2: - print('std::cerr <<"before broadcast collapse\\n";', file=sio) - print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio) - print('std::cerr << "local_dims";', file=sio) - for d in range(nd): - print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio) - print('std::cerr << "\\n";', file=sio) - - for ipos in range(len(inputs)): - print('std::cerr << " local_str inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - for ipos in range(len(outputs)): - print('std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - - print(""" - for(int id=0;id 2: - print('std::cerr <<"after broadcast collapse\\n";', file=sio) - print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio) - print('std::cerr << "local_dims";', file=sio) - for d in range(nd): - print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio) - print('std::cerr << "\\n";', file=sio) - - for ipos in range(len(inputs)): - print('std::cerr << " local_str 
%(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - for ipos in range(len(outputs)): - print('std::cerr << " local_ostr %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) - # this is a good idea because we make less index calculation in the gpu. - - print("int nd_collapse_[%(nd)s] = {" % locals() +','.join(['1' for x in range(nd)]) +"};", file=sio) - for ipos in range(len(inputs)): - if not _logical_scalar(inputs[ipos]): - print(""" - int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() +','.join(['1' for x in range(nd)]) +"};", file=sio) - print(""" -can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); -for(int i=0;i 1: - print(""" - std::cerr<< "nd_collapse_%(ipos)s "<< - """ % locals(), file=sio) - print(' << " " << '.join( - ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]" - for i in range(nd)]), file=sio) - print('<< "\\n";', file=sio) - print(""" - std::cerr<< "nd_collapse_ "<< - """ % locals(), file=sio) - print(' << " " << '.join( - ["nd_collapse_[" % locals() + str(i) + "]" - for i in range(nd)]), file=sio) - print('<< "\\n";', file=sio) - - # update the local stride. - for ipos in range(len(inputs)): - print(""" - for(int i=nd_collapse-1;i>0;i--){ - if(nd_collapse_[i]==1){ - local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides - for(int j=i+1;j0;i--){ - if(nd_collapse_[i]==1){ - local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides - for(int j=i+1;j0;i--){ - if(nd_collapse_[i]==1){ - local_dims[i-1]*=local_dims[i];//set new dims - for(int j=i+1;j 0: - print(" && ", " && ".join(l), file=sio) - print("""){nd_collapse=0;} """, file=sio) - - if self.verbose: - print('std::cerr <<"after can_collapse\\n";', file=sio) - print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio) - if self.verbose > 1: - for d in range(nd): - print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio) - print('std::cerr << "\\n";', file=sio) - - for ipos in range(len(inputs)): - print(('std::cerr << " local_str %(ipos)s: " <<' % - locals() + ' << " " << '.join( - ["local_str[%(ipos)s][%(x)s]" % locals() - for x in range(nd)]) + '<<"\\n";'), file=sio) - for ipos in range(len(outputs)): - print(('std::cerr << " local_ostr %(ipos)s: " <<' % - locals() + ' << " " << '.join( - ["local_ostr[%(ipos)s][%(x)s]" % locals() - for x in range(nd)]) + '<<"\\n";'), file=sio) - - def launch_Ccontiguous(nodename, scalar_op): - kernel_call_args = ["numEls"] - for ipos in range(len(inputs)): - kernel_call_args.append("i%i_data" % ipos) - for ipos in range(len(outputs)): - kernel_call_args.append("o%i_data" % ipos) - kernel_call_args = ", ".join(kernel_call_args) - verb = "" - if self.verbose: - verb = 'std::cerr << " Running ccontiguous version\\n";' - print(""" - //first use at least a full warp - int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE - - //next start adding multiprocessors - int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS - - // next start adding more warps per multiprocessor - if (threads_per_block * n_blocks < numEls) - threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - 
kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s); - - //std::cerr << "calling callkernel returned\\n"; - """ % locals(), file=sio) - - print(""" - CNDA_THREAD_SYNC; - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) - { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", - "GpuElemwise %(nodename)s", cudaGetErrorString(err), - n_blocks, threads_per_block, - "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); - return -1; - - } - %(verb)s - return 0; - """ % locals(), file=sio) - - def launch_General(nodename, scalar_op, force_nd): - # kernel_call_args are used to invoke the cuda kernel - local = "local_" - kernel_call_args = ["numEls"] - kernel_call_args.extend(local + "dims[%i]" % di - for di in range(force_nd)) - for ipos in range(len(inputs)): - kernel_call_args += ["i%i_data" % ipos] + list( - local + "str[%i][%i]" % (ipos, di) - for di in range(force_nd)) - #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) - #kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) - for ipos in range(len(outputs)): - kernel_call_args += ["o%i_data" % ipos] + list( - local + "ostr[%i][%i]" % (ipos, di) - for di in range(force_nd)) - #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) - #kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) - if self.verbose: - print(""" - std::cerr << " Running general version with %(force_nd)s dims\\n"; - """ % locals(), file=sio) - print("std::cerr << "+ ' << " " << '.join( - kernel_call_args)+' << "\\n";', file=sio) - #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; - - kernel_call_args = ", ".join(kernel_call_args) - - print(""" - //first use at least a full warp - int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE - - //next start adding multiprocessors - int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS - - // next start adding more warps per multiprocessor - if (threads_per_block * n_blocks < numEls) - threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - - kernel_%(nodename)s_%(force_nd)s<<>>(%(kernel_call_args)s); - """ % locals(), file=sio) - print(""" - CNDA_THREAD_SYNC; - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) - { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", - "GpuElemwise %(nodename)s", cudaGetErrorString(err), - n_blocks, threads_per_block, - "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); - return -1; - - } - return 0; - """ % locals(), file=sio) - - print("if(numEls==0) return 0;", file=sio) - print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals(), file=sio) - print("case 0: {", file=sio) - launch_Ccontiguous(nodename, scalar_op) - print(" } break;", file=sio) - for i in range(1, nd + 1): - print("case " + str(i) + ": {", file=sio) - launch_General(nodename, scalar_op, i) - print(" } break;", file=sio) - - print("}", file=sio) # end case - print("return -2;", file=sio) # should not get to this point - print("}", file=sio) # end fct - - #N.B. 
cudaGetLastError is called by c_code - return sio.getvalue() - - def c_support_code_apply(self, inputs, outputs, nodename): - nd = outputs[0].type.ndim - return "".join( - CLUDA_PREAMBLE, - [self.c_src_kernel(inputs, outputs, nodename, x) - for x in range(1, nd + 1)] + - [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename), - self.c_src_callkernel(inputs, outputs, nodename), - ]) - - def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub): - d = dict(sub) - nd = noutputs[0].type.ndim - d.update(locals()) - sio = StringIO.StringIO() - nin = len(inputs) - nout = len(outputs) - fail = sub['fail'] - opname = str(self.scalar_op) - initial_dims = ','.join('1' for i in range(nd)) - if 1 or self.scalar_op == scalar.pow: - print(""" - //std::cerr << "C_CODE %(opname)s START\\n"; - //standard elemwise size checks - """ % locals(), file=sio) - print(""" - int dims[%(nd)s] = {%(initial_dims)s}; - """ % locals(), file=sio) - - #check that all inputs have valid dimensions - emitted_inames = {} - for id, iname in enumerate(inputs): - if iname in emitted_inames: - assert emitted_inames[iname] is ninputs[id] - continue - broadcasts = ', '.join(map(str, list(map(int, - ninputs[id].broadcastable)))) - nd = ninputs[id].ndim - print(""" - int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; -""" % locals(), file=sio) - emitted_inames[iname] = ninputs[id] - #check that all inputs have valid dimensions - emitted_inames = {} - for id, iname in enumerate(inputs): - if iname in emitted_inames: - continue - print(""" - //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; - if (%(nd)s != %(iname)s->nd) - { - PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd); - %(fail)s; - } - for (int i = 0; i< %(nd)s; ++i) - { - dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i]; - if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1))&& (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i])) - { - //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; - PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. 
One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.", - i, - CudaNdarray_HOST_DIMS(%(iname)s)[i], - dims[i] - ); - %(fail)s; - } - } - """ % locals(), file=sio) - emitted_inames[iname] = True - - #check that all outputs have valid dimensions - for idx, oname in enumerate(outputs): - if idx not in list(self.inplace_pattern.keys()): - print(""" - for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { - if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) - { - Py_DECREF(%(oname)s); - %(oname)s = NULL; - } - } - if (NULL == %(oname)s) - { - %(oname)s = (CudaNdarray*)CudaNdarray_New(); - if (!%(oname)s) - { - //error string already set - %(fail)s; - } - if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims)) - { - //error string already set - Py_DECREF(%(oname)s); - %(oname)s = NULL; - %(fail)s; - } - } - //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; - //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; - """ % locals(), file=sio) - else: - input_idx = self.inplace_pattern[idx] - iname = inputs[input_idx] - print(""" - Py_XDECREF(%(oname)s); - %(oname)s = %(iname)s; - Py_INCREF(%(oname)s); - for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { - if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) - { - Py_DECREF(%(oname)s); - %(oname)s = NULL; - %(fail)s; - } - } - //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; - //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; - """ % locals(), file=sio) - - print(""" - { - //new block so that failure gotos don't skip over variable initialization - //std::cerr << "calling callkernel\\n"; - if (callkernel_%(nodename)s(1, 0, dims - """ % locals(), file=sio) - for iname in inputs: - print(""" - , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s) - """ % locals(), file=sio) - for oname in outputs: - print(""" - , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s) - """ % locals(), file=sio) - print(""" - )) - { - // error - """, file=sio) - for oname in outputs: - print(""" - Py_DECREF(%(oname)s); - %(oname)s = NULL; - """ % locals(), file=sio) - print(""" - %(fail)s; - } - else // no error - { - } - } - //std::cerr << "C_CODE %(opname)s END\\n"; - """ % locals(), file=sio) - #print sio.getvalue() - return sio.getvalue() - - def c_support_code(self): - return """ - #define INTDIV_POW2(a, b) (a >> b) - #define INTMOD_POW2(a, b) (a & ((1<> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) - #print >> sio, "\t,", "float * o%i_data" % ipos - print("\t)\n{", file=sio) - - # For each input that is a scalar which has been broadcasted to a tensor, - # load it into a local variable - print(" __shared__ float value0[%i];" % len(inputs), file=sio) - print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio) - #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" - print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio) - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - print(" value0[%i] = i%i_data[0];" % (ipos, - ipos), file=sio) - for ipos in range(nd): - print(" shared_dims[%i] = dim%i;" % (ipos, ipos), file=sio) - print(" }", file=sio) - print(" __syncthreads();", file=sio) - - if (nd == 4): - print(""" - for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x) - { - for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y) - { - //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x) - for (int pos2 = 
threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y) - { - //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y) - for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x) - { - """, file=sio) - else: - raise NotImplementedError() - - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio) - for ipos, i in enumerate(outputs): - print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio) - for d in range(nd): - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio) - for ipos, i in enumerate(outputs): - print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio) - - # perform the scalar operation on the input and output references - #TODO: What if the scalar_op needs support_code?? - self.task_code(inputs, outputs, sio, nodename, - iname=get_str_list_logical_scalar( - inputs, value_str='value0[%i]')) - print(" }" * nd, file=sio) - - #TODO: insert runtime stride checks that select the best loop order either here, or in - # the host code that launched the kernel (host code probably better spot) - - #indent = " "*(4*d+7) - #for ipos, i in enumerate(inputs): - #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' - print("}", file=sio) - - print(sio.getvalue()) - return sio.getvalue() - - def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename): - """ The kernel applies to problems with <= 5 dimensions """ - - nd = outputs[0].type.ndim - n_in = len(inputs) - n_out = len(outputs) - sio = StringIO.StringIO() - - if nd not in (2,): - return sio.getvalue() - - # print some leading comments to make the code easier to read - for ipos, i in enumerate(inputs): - print("// Input ", ipos, str(i.type), file=sio) - for ipos, i in enumerate(outputs): - print("// Output ", ipos, str(i.type), file=sio) - print("static __global__ void kernel_%s_%s(unsigned int numEls" %( - nodename, - 'tiling%i_less_registers'%nd), file=sio) - if (nd): - print("\t,", ", ".join("const int dim%i" % i - for i in range(nd)), file=sio) - #declare inputs - for ipos, i in enumerate(inputs): - s = ", ".join(["const float * i%i_data_0" % ipos] + list( - "int i%i_str_%i" % (ipos, d) for d in range(nd))) - print("\t,", s, file=sio) - #declare outputs - for ipos, i in enumerate(outputs): - s = ", ".join(["float * o%i_data_0" % ipos] + list( - "int o%i_str_%i" % (ipos, d) for d in range(nd))) - print("\t,", s, file=sio) - #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) - #print >> sio, "\t,", "float * o%i_data" % ipos - print("\t)\n{", file=sio) - - # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. 
- use_shared_stride = False - use_shared_limits = False - - def decl_limits(nd): - if use_shared_limits: - print("__shared__ float * limits[%(nd)s];" % locals(), file=sio) - - def stride(io, p, d): - if use_shared_stride: - return "s%s_str[%i][%i]" % (io, p, d) - else: - return "%s%i_str_%i" % (io, p, d) - - def limits(d): - if use_shared_limits: - return "limits[%i]" % d - else: - return "limits%i" % d - - def decl_shared_stride(nin, nout, nd): - if not use_shared_stride: - return - print(""" - __shared__ int si_str[%(nin)s][%(nd)s]; - __shared__ int so_str[%(nout)s][%(nd)s]; - if ((threadIdx.x == 0) && (threadIdx.y == 0)) { - """ % locals(), file=sio) - for i in range(nin): - for d in range(nd): - print("si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals(), file=sio) - for i in range(n_out): - for d in range(nd): - print("so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals(), file=sio) - print("} __syncthreads();", file=sio) - - def calc_limit(d): - s = stride('o', 0, d) - lname = limits(d) - if use_shared_limits: - print("if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio) - if d == 0: - print("%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio) - else: - dm1 = d - 1 - print("%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio) - print("} __syncthreads();", file=sio) - else: - if d == 0: - print("const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio) - else: - dm1 = d - 1 - print("const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio) - - def decl_ptrs(d, offset): - dm1 = d - 1 - assert dm1 >= 0 - for i in range(n_in): - s = stride('i', i, d) - print("const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio) - for i in range(n_out): - s = stride('o', i, d) - print("float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio) - - def inc_ptrs(d, amt): - for i in range(n_in): - s = stride('i', i, d) - print("i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio) - for i in range(n_out): - s = stride('o', i, d) - print("o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio) - - def while_limit(d): - lname = limits(d) - print("while (o0_data_%(d)s < %(lname)s) { " % locals(), file=sio) - - def end_while(d): - print("}", file=sio) - - def task_code(d): - self.task_code(inputs, outputs, sio, nodename, - iname=['i%i_data_%i[0]' % (ipos, d) - for ipos, i in enumerate(inputs)], - oname=['o%i_data_%i[0]' % (ipos, d) - for ipos, i in enumerate(outputs)]) - - if nd == 4: - decl_shared_stride(n_in, n_out, nd) - decl_limits(nd) - calc_limit(0) - inc_ptrs(0, 'blockIdx.x') - while_limit(0) - if 1: - calc_limit(1) - decl_ptrs(1, 'blockIdx.y') - while_limit(1) - if 1: - calc_limit(2) - decl_ptrs(2, 'threadIdx.y') - while_limit(2) - if 1: - calc_limit(3) - decl_ptrs(3, 'threadIdx.x') - while_limit(3) - if 1: - task_code(3) - inc_ptrs(3, 'blockDim.x') - end_while(3) - inc_ptrs(2, 'blockDim.y') - end_while(2) - inc_ptrs(1, 'gridDim.y') - end_while(1) - inc_ptrs(0, 'gridDim.x') - end_while(0) - - print("}", file=sio) - print(sio.getvalue()) - return sio.getvalue() - - -def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0): - """ - This collapse dimensions that are not needed when computing - elemwise. This is usefull as it lower the indexing computation - that is heavier on gpu then on cpu. - - This is a generic version. It collapse dimensions at any place in - the shape. It handle broadcasted dimensions correctly. 
- - There is no special handling needed for broadcasted scalar at this level. - - @return: ndims, tuple(dims, strides) after collapsing. - """ - in_out = inputs + outputs - del inputs - if out_shape is not None: - local_dims = tuple(out_shape) - else: - # TODO, use the right algo here or make the parameter not optional - # We should always have the same shape for all outputs - # If there is more then one outputs - local_dims = tuple(outputs[0].shape) - del outputs - nd_orig = len(local_dims) - if nd_orig == 1: - # This have a lower overhead - all_c_contig = True - for inp in in_out: - if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims: - all_c_contig = False - break - if all_c_contig: - return 0, (local_dims, []) - - collapsable = [1] * nd_orig - - local_str = [None] * len(in_out) - nd_collapse = nd_orig - for ipos in range(len(in_out)): - inp = in_out[ipos] - assert len(inp.shape) == nd_orig, "All inputs/outputs must have the same number of dimensions. You must broadcast before calling elemwise_collapse" - local_str[ipos] = list(inp.strides) - # We set the strides of broacastable dims to 0 - # This make indexing in gpu simpler and is needed - # For collapsing the dimensions. - for dim_pos in range(inp.ndim): - if inp.shape[dim_pos] == 1: - local_str[ipos][dim_pos] = 0 - - if nd_orig == 1: - # We already covered the contiguous case before - # So we are sure it is not contiguous - # TODO: Add a test that f contiguous are also collapsed by the first case. - # I think that for 1d array when the flags f contiguous is true, c contiguous is also true. - return 1, (local_dims, local_str) - - if verbose > 2: - print("before broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - for ipos in range(len(local_str)): - print(" local_str inputs", ipos, local_str[ipos]) - local_dims = list(local_dims) - # Collapse dimension that are broadcast in all inputs. - # need to be done before contiguous collapse as it will break it. - # Update the dimensions and the strides - for id in range(nd_collapse): - if local_dims[id] == 1: - # remove dims i from the array - for j in range(id + 1, nd_collapse): - local_dims[j - 1] = local_dims[j] - # remove dims i from the array - for input_id in range(len(in_out)): - for j in range(id + 1, nd_collapse): - local_str[input_id][j - 1] = local_str[input_id][j] - nd_collapse -= 1 - id -= 1 # TODO: what is this? How this work? - - if verbose > 2: - print("after broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - for ipos in range(len(local_str)): - print(" local_str inputs", ipos, local_str[ipos]) - - nd_collapse_ = [1] * nd_orig - for ipos in range(len(local_str)): - # Can we collapse dims[i] and dims[i-1]? - strides = local_str[ipos] - for i in range(nd_collapse - 1, 0, -1): - if strides[i] * local_dims[i] != strides[i - 1]: - # The dims nd-1 are not strided again dimension nd - nd_collapse_[i] = 0 - - if verbose > 1: - print("nd_collapse_", nd_collapse_) - - nd_collapse2 = nd_collapse - for i in range(nd_collapse - 1, 0, -1): - if nd_collapse_[i] == 1: - # update the local dims. - local_dims[i - 1] *= local_dims[i] - for j in range(i + 1, nd_collapse): - local_dims[j - 1] = local_dims[j] - - # update the local stride. 
- for ipos in range(len(local_str)): - local_str[ipos][i - 1] = local_str[ipos][i] # set new strides - # remove stride i from the array - for j in range(i + 1, nd_collapse): - local_str[ipos][j - 1] = local_str[ipos][j] - - # update the new number of dim - nd_collapse2 -= 1 - nd_collapse = nd_collapse2 - - if nd_collapse == 1: - l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize - for ipos in range(len(local_str))] - if all(l): - nd_collapse = 0 - - if verbose: - print("end collapsing") - print(" nd_collapse", nd_collapse) - if verbose > 1: - print(" local_dims", local_dims) - for ipos in range(len(local_str)): - print(" local_str inputs", ipos, local_str[ipos]) - - return nd_collapse, (local_dims, local_str) - - -def reduction_collapses(inout, axis, verbose=0): - """ - This collapse dimensions that are not needed when computing - reduction. This is usefull as it lower the indexing computation - that is heavier on gpu then on cpu. - - This is a generic version. It collapse dimensions at any place in - the shape. - @param: inout: tuple(input, output) - @param: axis: None, interger, list of 1 interger - The axis over witch we will do reduction. - @return: (ndims, (input dims, input strides, input pattern), out strides) - after collapsing. - - :note: we suppose that we can always collapse the output dimensions. - """ - input = inout[0] - out = inout[1] - # Some quick check. It is faster then the full version. - if axis is None: - # The output size is always 1, so we don't care about this strides - if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']): - return 0, ((input.size,), (input.itemsize,), axis), (0,) - if input.ndim == 1: - assert axis == [0] or axis == 0 or axis is None - # not c contiguous as the first if should have catched it. - return 1, (input.shape, input.strides, axis), (0,) - - if not isinstance(axis, (list, tuple)): - local_axis = [axis] - else: - local_axis = list(axis) - - # This is needed for the computing of the output strides - assert axis is None or len(local_axis) == 1 - - local_dims = list(input.shape) - local_str = list(input.strides) - out_strides = list(out.strides) - - nd_orig = len(local_dims) - collapsable = [1] * nd_orig - nd_collapse = nd_orig - - if verbose > 2: - print("before broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - print(" local_str inputs", local_str) - print(" local_axis", local_axis) - - # Collapse dimension that are broadcast in all inputs. - # need to be done before contiguous collapse as it will break it. - # Update the dimensions and the strides - for id in range(nd_collapse): - if local_dims[id] == 1: - for j in range(id + 1, nd_collapse): - # remove dims i from the array - local_dims[j - 1] = local_dims[j] - # remove strides i from the array - local_str[j - 1] = local_str[j] - # remove output strides i from the array - if axis is not None: - out_strides[j - 2] = out_strides[j - 1] - if id in local_axis: - local_axis.remove(id) - for axis_pos in range(len(local_axis)): - if local_axis[axis_pos] > id: - local_axis[axis_pos] -= 1 - - nd_collapse -= 1 - id -= 1 # TODO: how this work? - - if verbose > 2: - print("after broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - print(" local_str inputs", local_str) - print(" local_axis", local_axis) - print(" out_strides", out_strides) - - nd_collapse_ = [1] * nd_orig - # Can we collapse dims[i] and dims[i-1]? 
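# Worked example (illustrative): summing a C-contiguous float32 array of
# shape (4, 5, 6) over axis=2 starts from local_dims = [4, 5, 6] and byte
# strides [120, 24, 4].  Dim 1 can be merged into dim 0 (24 * 5 == 120 and
# neither is the reduced axis), while dim 2 must stay separate because it is
# the axis being summed, so the reduction kernel ends up seeing a (20, 6)
# problem reduced over its last axis (the '01' pattern handled in sum()
# further down).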
- for i in range(nd_collapse - 1, 0, -1): - if (local_str[i] * local_dims[i] != local_str[i - 1]): - # The dims nd-1 are not strided again dimension nd - nd_collapse_[i] = 0 - elif (i in local_axis) != ((i - 1) in local_axis): - nd_collapse_[i] = 0 - - if verbose > 1: - print("nd_collapse_", nd_collapse_) - - nd_collapse2 = nd_collapse - for i in range(nd_collapse - 1, 0, -1): - if nd_collapse_[i] == 1: - # update the local dims. - local_dims[i - 1] *= local_dims[i] - # set new strides - local_str[i - 1] = local_str[i] - #remove the old dims and strides - for j in range(i + 1, nd_collapse): - local_dims[j - 1] = local_dims[j] - local_str[j - 1] = local_str[j] - if axis is not None: - out_strides[j - 2] = out_strides[j - 1] - - if i in local_axis: - local_axis.remove(i) - for axis_pos in range(len(local_axis)): - if local_axis[axis_pos] > i: - local_axis[axis_pos] -= 1 - - # update the new number of dim - nd_collapse2 -= 1 - - nd_collapse = nd_collapse2 - - if nd_collapse == 1: - if local_str[nd_collapse - 1] == input.itemsize: - nd_collapse = 0 - - if verbose: - print("end collapsing") - print(" nd_collapse", nd_collapse) - if verbose > 1: - print(" local_dims", local_dims) - print(" local_str inputs", local_str) - print(" local_axis", local_axis) - print(" out_strides", out_strides) - - #print input.shape, input.strides - #print nd_collapse, (local_dims, local_str, local_axis) - local_dims = local_dims[:nd_collapse] - local_str = local_str[:nd_collapse] - out_strides = out_strides[:nd_collapse] - return nd_collapse, (local_dims, local_str, local_axis), out_strides - - -def call_elemwise(fct, input_vals, block=None, grid=None, out=None, - out_shape=None, - strides=None): - """ Call an elemwise gpu function with gived inputs and block size. - - :param fct: The gpu function to call - :param input_vals: a list of inputs to pass to fct - :param block: int, the size of the block wanted - :param grid: int, the size of the grid wanted - :param out: Optional, the preallocated output. Must have the right shape - and dtype. - - :param out_shape: Optional, if provided, we will suppose that the output, - have this shape event if it is not true. - :param strides: Optional, if provided, we will use those strides for - the inputs and outputs. - - :note: param out_shape and strides are used for the collapsing of - dimensions. - """ - inp = input_vals[0] - - # Get the output and output shape to us - if out_shape is None and out is None: - out_shape = list(inp.shape) - for i in input_vals[1:]: - # dtype checked by pycuda before gpu call - for s_i in range(len(inp.shape)): - assert (inp.shape[s_i] == i.shape[s_i] - or inp.shape[s_i] == 1 - or i.shape[s_i] == 1) - out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i], - i.shape[s_i]) - if out is None: - out = gpu_ndarray.empty(out_shape, dtype=inp.dtype) - elif out_shape is None: - out_shape = out.shape - - # Arg: nb element - args = [cast_uint(out.size)] - # Arg: output shape to the arguments. - for i in range(len(out_shape)): - args.append(cast_int(out_shape[i])) - - # for each inputs and the output - # add its ptr and strides - nd = len(out_shape) - idx = 0 - for i in list(input_vals) + [out]: - itemsize = i.dtype.itemsize - args.append(i) - for j in range(nd): - # We force a stride of 0 for broadcastable dimensions - # This lower the index computation in the kernel. - if strides is not None: - # strides should have a strides of 0 for broadcasting. 
- args.append(cast_int(strides[idx][j] / itemsize)) - elif i.shape[j] == 1: - args.append(cast_int(0)) - else: - args.append(cast_int(i.strides[j] / itemsize)) - idx += 1 - out_size = out.size - # First use at least a full warp - if block is None: - block_ = min(32, out_size) - else: - block_ = block - # Next start adding multiprocessors - if grid is None: - grid_ = min(out_size / block_ + (out_size % block_ != 0), 60) - else: - grid_ = grid - # Next start adding more warps per multiprocessor - if block is None: - if block_ * grid_ < out_size: - block_ = min(out_size / grid_, 512) - - # We bypass the pycuda wrapper gpu function call. - # by calling directly the gpu function. - # This is faster and lower the overhead. - # Here is code that allow you to use the pycuda fct call. - # d = {"block":(block_,1,1), "grid":(grid_,1)} - # fct(*args, **d) - fct.set_block_shape(block_, 1, 1) # time_kernel - fct.param_set(*args) - fct.launch_grid(grid_, 1) - return out - - -class MyGpuNdArray(): - _compiled_fct = {} - - def __init__(self, gpu_nd_array): - #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject) - self.gpu_nd_array = gpu_nd_array - self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype) - - @staticmethod - def gen_fct(op, inputs, nd, nodename="TestNodeName", - collapse=True): - if _CL_MODE: - npy_ty = "typedef float npy_float32;\n" - else: - npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n" - - # Generate the gpu functions - nb_in = len(inputs) - fcts = [None] - for nd in range(1, nd + 1): # 1 to nd - out = op(*[TensorType(i.gpu_nd_array.dtype, - (False,) * nd)() for i in inputs]) - out_dtype = out.dtype - node = out.owner - elemwise_algo = ElemwiseAlgo(node.op.scalar_op) - - code = (CLUDA_PREAMBLE + - npy_ty + - elemwise_algo.c_src_kernel(node.inputs, - node.outputs, - nodename, nd, - static="")) - fct_name = "kernel_%s_%d" % (nodename, nd) - fct = compile_gpu_code(code, fct_name) - fcts.append(fct) - - # All inputs/outputs C contiguous case - code = (npy_ty + - CLUDA_PREAMBLE + - elemwise_algo.c_src_kernel_Ccontiguous( - node.inputs, node.outputs, nodename, static="")) - fct_name = "kernel_%s_Ccontiguous" % nodename - fcts[0] = compile_gpu_code(code, fct_name) - - def call_fct2(inputs, out=None): - " Do dimensions collapsing before call the gpu code " - assert len(inputs) == nb_in - # dtype checked by pycuda - # TODO: assert nb dim? - - inp = inputs[0] - - # Compute the output shape. - out_shape = list(inp.shape) - for i in inputs[1:]: - for s_i in range(len(inp.shape)): - assert (inp.shape[s_i] == i.shape[s_i] - or inp.shape[s_i] == 1 - or i.shape[s_i] == 1) - out_shape[s_i] = max(out_shape[s_i], i.shape[s_i]) - # Create the output object - if (out is None - or out.dtype != out_dtype - or out.shape != tuple(out_shape)): - out = MyGpuNdArray(gpu_ndarray.empty(out_shape, - dtype=out_dtype)) - - if collapse: - # Do the collapsing. 
- nd_col, info = elemwise_collapses(list(inputs), [out]) - # The two next line are usefull to force a call to the - # c contiguous version: - #nd_col = 0 - #info = [[],[]] - out = call_elemwise(fcts[nd_col], inputs, - out=out, out_shape=info[0][:nd_col], - strides=info[1]) - else: - out = call_elemwise(fcts[-1], inputs, out=out, - out_shape=out_shape) - return out - return call_fct2 - - def __elemwise2__(self, other, name, op): - """ Call this code on this op with 2 inputs """ - nd = len(self.gpu_nd_array.shape) # self.gpu_nd_array.ndim - assert nd == len(other.gpu_nd_array.shape) # ndim - tag = (name + '_' + str(self.gpu_nd_array.dtype) - + str(self.gpu_nd_array.ndim)) - tag += ('_' + str(other.gpu_nd_array.dtype) - + str(other.gpu_nd_array.ndim)) - fct = self._compiled_fct.get(tag, None) - if fct is None: -# print "compile", tag - fct = MyGpuNdArray.gen_fct(op, [self, other], nd) - self._compiled_fct[tag] = fct - return fct((self, other)) - - @classmethod - def __elemwise__(cls, inputs, name, op, out=None): - """ Call this code on this op with * inputs """ - nd = len(inputs[0].gpu_nd_array.shape) # self.gpu_nd_array.ndim - for i in inputs[1:]: - assert nd == len(i.gpu_nd_array.shape) # ndim - nb = len(inputs) - tag = name + "_".join([str(i.gpu_nd_array.dtype) + - str(i.gpu_nd_array.ndim) for i in inputs]) - fct = cls._compiled_fct.get(tag, None) - if fct is None: -# print "compile", tag - fct = MyGpuNdArray.gen_fct(op, inputs, nd) - cls._compiled_fct[tag] = fct - return fct(inputs, out=out) - - base = property(lambda self: self.gpu_nd_array.base) - bytes = property(lambda self: self.gpu_nd_array.bytes) - dtype = property(lambda self: self.gpu_nd_array.dtype) - flags = property(lambda self: self.gpu_nd_array.flags) - itemsize = property(lambda self: self.gpu_nd_array.itemsize) - ndim = property(lambda self: self.gpu_nd_array.ndim, - doc="number of dimensions") - offset = property(lambda self: self.gpu_nd_array.offset) - shape = property(lambda self: self.gpu_nd_array.shape) - size = property(lambda self: self.gpu_nd_array.size) - strides = property(lambda self: self.gpu_nd_array.strides) - - def __array__(self): - return numpy.asarray(self.gpu_nd_array) - - def copy(self): - return MyGpuNdArray(self.gpu_nd_array.copy()) - - def view(self): - return MyGpuNdArray(self.gpu_nd_array.view()) - - def __copy__(self): - return MyGpuNdArray(self.gpu_nd_array.__copy__()) - - def __deepcopy__(self): - return MyGpuNdArray(self.gpu_nd_array.__deepcopy__()) - - @property - def gpudata(self): - # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes - # attributes. Without this assert old code that don't support - # strides can receive as input object that are strided and no - # error will be gived - - #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or - # self.gpu_nd_array.flags['F_CONTIGUOUS']) - - # TODO: find a way to pass to a pycuda/pyopencl function the - # bytes + offset directly. 
- return self.bytes + self.offset - - def __getitem__(self, *inputs): - return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs)) - - def __add__(self, other): - return self.__elemwise2__(other, "add", theano.tensor.add) - - def __sub__(self, other): - return self.__elemwise2__(other, "sub", theano.tensor.sub) - - def __mul__(self, other): - return self.__elemwise2__(other, "mul", theano.tensor.mul) - - def __div__(self, other): - assert (str(self.gpu_nd_array.dtype).startswith("float") or - str(other.gpu_nd_array.dtype).startswith("float")) - return self.__elemwise2__(other, "true_div", theano.tensor.true_div) - - @classmethod - def add(cls, x, y, out=None): - """ add all inputs togethers element-wise """ - return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out) - - @classmethod - def adds(cls, *inputs): - """ add all inputs togethers element-wise """ - return cls.__elemwise__(inputs, "add", theano.tensor.add) - - @classmethod - def multiplys(cls, *inputs): - """ multiply all inputs togethers element-wise """ - return cls.__elemwise__(inputs, "mul", theano.tensor.mul) - - def sum(self, axis=None, collapse=True): - from . import gen_reduction - max_thread_per_block = 512 - max_block = 4096 - if isinstance(axis, (list, tuple)): - if len(axis) == 1: - axis = axis[0] - else: - assert len(axis) == self.ndim - axis.sort() - assert axis == list(range(self.ndim)) - axis = None - - # TODO: Why this? - if self.size == 0: - make_out = gpu_ndarray.zeros - else: - make_out = gpu_ndarray.empty - - if axis is None: - out = make_out((), self.dtype) - out = MyGpuNdArray(out) - else: - out_shape = [self.shape[i] for i in range(self.ndim) - if i != axis] - out = make_out(out_shape, self.dtype) - out = MyGpuNdArray(out) - - if self.size == 0: - return out - - args_set = False - - if collapse: - coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = ( - reduction_collapses([self, out], axis)) - else: - coll_ndim = self.ndim - coll_shape = self.shape - coll_strides = self.strides - coll_axis = [axis] - coll_out_str = out.strides - - if axis is not None: - coll_axis = coll_axis[0] - - args_set = False - - if coll_ndim == 0: - sum_op = gen_reduction.GpuSum([1], self.dtype) - c_code = sum_op.c_support_code_apply("nodename", contig=True) - fctname = "kernel_reduce_sum_ccontig_nodename" - fct = compile_gpu_code(c_code, fctname) - block_ = min(coll_shape[0], max_thread_per_block) - block = (block_, 1, 1) - - grid = (1, 1) - shared_ = self.dtype.itemsize * block_ - args = [cast_int(coll_shape[0]), self, out] - args_set = True - elif axis is None: - pattern = [1] * coll_ndim - str_pattern = [str(i) for i in pattern] - sum_op = gen_reduction.GpuSum(pattern, self.dtype) - c_code = sum_op.c_support_code_apply("nodename") - if not c_code: - raise NotImplementedError( - "GpuNdArray sum case not implemented") - fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename" - fct = compile_gpu_code(c_code, fctname) - if coll_ndim == 1: - bx = min(max_thread_per_block, coll_shape[0]) - block = (bx, 1, 1) - block_ = bx - elif coll_ndim == 2: - bx = min(max_thread_per_block, coll_shape[1]) - by = min(max_thread_per_block // coll_shape[1], coll_shape[0]) - by = max(by, 1) - block = (bx, by, 1) - block_ = bx * by - elif coll_ndim == 3: - bx = min(max_thread_per_block, coll_shape[2]) - by = min(max_thread_per_block // bx, coll_shape[1]) - bz = min(max_thread_per_block // (bx * by), coll_shape[0]) - by = max(by, 1) - bz = min(max(bz, 1), 64) - block = (bx, by, bz) - block_ = bx * by * bz - elif coll_ndim 
== 4: - bx = min(max_thread_per_block, coll_shape[3]) - by = min(max_thread_per_block // bx, coll_shape[2]) - bz = min(max_thread_per_block // (bx * by), coll_shape[1]) - by = max(by, 1) - bz = min(max(bz, 1), 64) - block = (bx, by, bz) - block_ = bx * by * bz - grid = (1, 1) - shared_ = self.dtype.itemsize * block_ - elif coll_ndim in [1, 2, 3]: - if coll_ndim == 1: - assert coll_axis == 0 - # pattern 1 - sum_op = gen_reduction.GpuSum([1], self.dtype) - fctname = "kernel_reduce_sum_1_nodename" - - grid = (1, 1) - - block_ = min(max_thread_per_block, coll_shape[0]) - block = (block_, 1, 1) - elif coll_ndim == 3 and coll_axis == 0: - # pattern 100 - sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype) - fctname = "kernel_reduce_sum_100_nodename" - - gx = min(coll_shape[1], max_block) - gy = min(max_block // (gx * coll_shape[2]), coll_shape[2]) - gy = max(gy, 1) - grid = (gx, gy) - - block_ = min(max_thread_per_block, coll_shape[0]) - block = (block_, 1, 1) - elif coll_ndim == 3 and coll_axis == 1: - # pattern 010 - sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype) - fctname = "kernel_reduce_sum_010_AD_nodename" - - A = coll_shape[0] - B = coll_shape[1] - C = coll_shape[2] - D = C / 32 - if (32 * D < C): - D += 1 - assert ((C <= 32 * D) and (32 * D < C + 32)) - shared_ = 0 - - gx = min(A, max_block) - gy = min(max_block // (D * A), D) - gy = max(gy, 1) - grid = (gx, gy) - - block = (32, 1, 1) - block_ = 32 - - args_set = True - # input shape - args = [cast_int(A), cast_int(B), - cast_int(C), cast_int(D)] - # input - args.append(self) - # input strides - args += [cast_int(i / self.dtype.itemsize) - for i in coll_strides] - # output - args.append(out) - # output strides - args.append(cast_int(coll_out_str[0] / out.dtype.itemsize)) - args.append(cast_int(coll_out_str[1] / out.dtype.itemsize)) - elif coll_ndim == 3 and coll_axis == 2: - # pattern 001 - sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype) - fctname = "kernel_reduce_sum_001_nodename" - - gx = min(coll_shape[0], max_block) - gy = min(max_block // (gx * coll_shape[1]), coll_shape[1]) - gy = max(gy, 1) - grid = (gx, gy) - - block_ = min(max_thread_per_block, coll_shape[2]) - block = (block_, 1, 1) - elif coll_axis == 0: - # pattern 10 - sum_op = gen_reduction.GpuSum([1, 0], self.dtype) - fctname = "kernel_reduce_sum_010_nodename" - block_ = min(coll_shape[1], max_thread_per_block) - block = (block_, 1, 1) - grid = (1, coll_shape[0]) - args_set = True - # input shape - args = [cast_int(1)] - args += [cast_int(i) for i in coll_shape] - # input - args.append(self) - # input strides - args.append(cast_int(1)) - args += [cast_int(i / self.dtype.itemsize) - for i in coll_strides] - # output - args.append(out) - # output strides - args.append(cast_int(1)) - # We must take the last dimensions in the case of - # dimensions collapsing. 
- args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize)) - elif coll_axis == 1: - # pattern 01 - sum_op = gen_reduction.GpuSum([0, 1], self.dtype) - fctname = "kernel_reduce_sum_01_nodename" - block_ = min(coll_shape[1], max_thread_per_block) - block = (block_, 1, 1) - grid = (1, min(coll_shape[0], max_block)) - else: - raise Exception("Bad axis") - - c_code = sum_op.c_support_code_apply("nodename") - fct = compile_gpu_code(c_code, fctname) - - shared_ = self.dtype.itemsize * block_ - else: - raise Exception("Not implemented") - - if not args_set: - # input shape - args = [cast_int(i) for i in coll_shape] - # input - args.append(self) - # input strides - args += [cast_int(i / self.dtype.itemsize) - for i in coll_strides] - # output - args.append(out) - # output strides - args += [cast_int(i / self.dtype.itemsize) - for i in coll_out_str] - - pycuda._driver.Context.synchronize() - #print fctname, block, grid, shared_, axis - #print self.ndim, self.shape, self.strides, axis, out.strides - #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str - #print args - - if False: - d = {"block": block, - "shared": shared_, - "grid": grid} - fct(*args, **d) - else: - # We bypass the pycuda wrapper gpu function call. - # by calling directly the gpu function. - # This is faster and lower the overhead. - fct.set_block_shape(*block) - fct.set_shared_size(shared_) - fct.param_set(*args) - fct.launch_grid(*grid) - return out diff --git a/ndarray/gen_reduction.py b/ndarray/gen_reduction.py deleted file mode 100644 index d89d69d..0000000 --- a/ndarray/gen_reduction.py +++ /dev/null @@ -1,1515 +0,0 @@ -import numpy -import StringIO - - -_CL_MODE = False # "pyopencl" in __name__ - - -if _CL_MODE: - # THIS IS NOT FINISHED - import pyopencl as cl - import pyopencl.array as cl_array - - # import pyopencl._mymako as mako - from pyopencl._cluda import CLUDA_PREAMBLE - from pyopencl.tools import dtype_to_ctype - - # TODO: use mako to get rid of the %if - CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455] - CLUDA_PREAMBLE += """ -#define LDIM_0 get_local_id(0) -#define LDIM_1 get_local_id(1) -#define LDIM_2 get_local_id(2) - -#define GDIM_0 get_global_id(0) -#define GDIM_1 get_global_id(1) -#define GDIM_2 get_global_id(2) - """ - # TODO, reuse the same context as the use used to create the memory. - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) -else: - import pycuda.autoinit - import pycuda.driver as driver - - # import pycuda._mymako as mako - from pycuda._cluda import CLUDA_PREAMBLE - from pycuda.compiler import SourceModule - from pycuda.tools import dtype_to_ctype - CLUDA_PREAMBLE += """ -#define LDIM_0 blockDim.x -#define LDIM_1 blockDim.y -#define LDIM_2 blockDim.z - -#define GDIM_0 gridDim.x -#define GDIM_1 gridDim.y -#define GDIM_2 gridDim.z - """ - -import logging - -import theano -from theano import Apply, scalar -from theano.sandbox.cuda import CudaNdarrayType -from theano.tensor import TensorType - - -_logger_name = 'compyte.gen_reduction' -_logger = logging.getLogger(_logger_name) -_logger.setLevel(logging.INFO) -_logger.addHandler(logging.StreamHandler()) # TO REMOVE - - -def warning(*msg): - _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg)) - - -def info(*msg): - _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg)) - - -def debug(*msg): - _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg)) - - -import pygpu_ndarray as gpu_ndarray - - -class GpuSum: - """GpuSum is a Reduction along some dimensions by summation. 
- - The dimensions along which to sum is specified by the - `reduce_mask` that you pass to the constructor. The `reduce_mask` - is a tuple of booleans (actually integers 0 or 1) that specify for - each input dimension, whether to reduce it (1) or not (0). - - For example: - - - reduce_mask == (1,) sums a vector to a scalar - - - reduce_mask == (1,0) computes the sum of each column in a matrix - - - reduce_mask == (0,1) computes the sum of each row in a matrix - - - reduce_mask == (1,1,1) computes the sum of all elements in a - 3-tensor. - - :note: any reduce_mask of all zeros is a sort of 'copy', and may - be removed during graph optimization - - """ - def __init__(self, reduce_mask, dtype): - self.reduce_mask = tuple(reduce_mask) - # input, output and accumulator dtype - self.dtype = dtype_to_ctype(dtype) - - def __eq__(self, other): - return (type(self) == type(other) and - self.reduce_mask == other.reduce_mask) - - def __hash__(self): - return hash(type(self)) ^ hash(self.reduce_mask) - - def __str__(self): - return "GpuSum{%s}" % ','.join(str(i) for i in self.reduce_mask) - - def make_node(self, x): - if (x.type.ndim != len(self.reduce_mask)): - raise TypeError("x must have rank %i" % len(self.reduce_mask)) - o_broadcast = [x.type.broadcastable[i] - for i in range(x.type.ndim) if not self.reduce_mask[i]] - return Apply(self, [x], [CudaNdarrayType(o_broadcast)()]) - - def perform(self, node, inp, out): - x, = inp - z, = out - z[0] = x.reduce_sum(self.reduce_mask) - - def c_code(self, node, name, inp, out, sub): - x, = inp - z, = out - - nd_in = node.inputs[0].type.ndim - nd_out = node.outputs[0].type.ndim - - assert nd_in - nd_out == sum(self.reduce_mask) - - sio = StringIO.StringIO() - fail = sub['fail'] - - #check input - print(""" - if (%(x)s->nd != %(nd_in)s) - { - PyErr_Format(PyExc_TypeError, - "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd); - %(fail)s; - } - """ % locals(), file=sio) - - # - # alloc an output if we need one - # - - # check the basics of out output - print(""" - if ( !%(z)s - || (%(z)s->nd != %(nd_out)s) - """ % locals(), file=sio) - - #ensure that the output has the right non-reduced dimensions - j = 0 - for i in range(nd_in): - if not self.reduce_mask[i]: - print((" || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !=" - "CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " % - locals()), file=sio) - j += 1 - - print(""" - ) - { - """ % locals(), file=sio) - print("int new_dims[%(nd_out)s]; " % locals(), file=sio) - - j = 0 - for i in range(nd_in): - if not self.reduce_mask[i]: - print(('new_dims[%(j)s] = CudaNdarray_HOST_DIMS' - '(%(x)s)[%(i)s];' % locals()), file=sio) - j += 1 - - print(""" - Py_XDECREF(%(z)s); - %(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims); - if (NULL == %(z)s) - { - PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); - %(fail)s; - } - } - """ % locals(), file=sio) - - # \begin bracket the reduction in a check that there is - # actually work to do - print(""" - if (CudaNdarray_SIZE(%(z)s)) - { - """ % locals(), file=sio) - - # - # Now perform the reduction - # - - if all(i == 1 for i in self.reduce_mask): - #check if the tensor is ccontiguous, if true, use the - #c_c0de_reduce_ccontig code. - #TODO: check if we are ccontiguous when we un-dimshuffle - #TODO: if only some dims are ccontiguous, call version - # with less dims. 
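A note on the reduce_mask convention documented in the docstring above: it maps directly onto NumPy's axis argument, with a 1 in position i meaning "sum over dimension i". The following NumPy sketch is illustrative only; reduce_mask_to_axes is a made-up helper and not part of the deleted file.

    import numpy as np

    def reduce_mask_to_axes(reduce_mask):
        # A 1 in position i means "reduce (sum) over dimension i".
        return tuple(i for i, m in enumerate(reduce_mask) if m)

    x = np.arange(12, dtype=np.float32).reshape(3, 4)

    # reduce_mask == (1, 0): sum each column -> shape (4,)
    print(np.allclose(x.sum(axis=reduce_mask_to_axes((1, 0))), x.sum(axis=0)))
    # reduce_mask == (0, 1): sum each row -> shape (3,)
    print(np.allclose(x.sum(axis=reduce_mask_to_axes((0, 1))), x.sum(axis=1)))
    # reduce_mask == (1, 1): sum everything -> a scalar
    print(x.sum(axis=reduce_mask_to_axes((1, 1))) == x.sum())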
- - print('if(CudaNdarray_is_c_contiguous(%(x)s)){' % locals(), file=sio) - self.c_code_reduce_ccontig(sio, node, name, x, z, fail) - print("}else{", file=sio) - getattr(self, 'c_code_reduce_%s' % (''.join( - str(i) for i in self.reduce_mask)))(sio, node, name, - x, z, fail) - print("}", file=sio) - else: - getattr(self, 'c_code_reduce_%s' % (''.join( - str(i) for i in self.reduce_mask)))(sio, node, name, - x, z, fail) - - # \end bracket the reduction ... - print(""" - } - """ % locals(), file=sio) - - return sio.getvalue() - - def _makecall(self, node, name, x, z, fail, pattern=None): - """Return a string for making a kernel call. - - The return value looks something like: - - .. code-block:: c - - if (verbose) - printf("running kernel_reduce_sum_10_%(name)s\\n"); - int n_shared = sizeof(%(dtype)s) * n_threads.x; - kernel_reduce_sum_10_%(name)s<<>>( - CudaNdarray_HOST_DIMS(%(x)s)[0], - CudaNdarray_HOST_DIMS(%(x)s)[1], - CudaNdarray_DEV_DATA(%(x)s), - CudaNdarray_HOST_STRIDES(%(x)s)[0], - CudaNdarray_HOST_STRIDES(%(x)s)[1], - CudaNdarray_DEV_DATA(%(z)s), - CudaNdarray_HOST_STRIDES(%(z)s)[0] - ); - CNDA_THREAD_SYNC; - if (cudaSuccess != cudaGetLastError()) - { - PyErr_Format(PyExc_RuntimeError, "Cuda error: ... ); - %(fail)s; - } - """ - sio = StringIO.StringIO() - if pattern is None: - pattern = ''.join(str(c) for c in self.reduce_mask) - ndim = len(self.reduce_mask) - nd_out = ndim - sum(self.reduce_mask) - print(""" - if (verbose) - printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); - int n_shared = sizeof(%(dtype)s) * n_threads.x * - n_threads.y * n_threads.z; - if (verbose>1) - printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d," - " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d," - " nb_block=%%d, n_shared=%%d\\n", - n_threads.x,n_threads.y,n_threads.z, - n_threads.x*n_threads.y*n_threads.z, - n_blocks.x,n_blocks.y, - n_blocks.x*n_blocks.y, n_shared); - kernel_reduce_sum_%(pattern)s_%(name)s<<>>( - """ % locals(), file=sio) - for i in range(ndim): - print(""" - CudaNdarray_HOST_DIMS(%(x)s)[%(i)s], - """ % locals(), file=sio) - print(""" - CudaNdarray_DEV_DATA(%(x)s) - """ % locals(), file=sio) - for i in range(ndim): - print(""" - ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s] - """ % locals(), file=sio) - print(""" - ,CudaNdarray_DEV_DATA(%(z)s) - """ % locals(), file=sio) - for i in range(nd_out): - print(""" - ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] - """ % locals(), file=sio) - print(""" - ); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_%(pattern)s_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - """ % locals(), file=sio) - return sio.getvalue() - - def _k_decl(self, nodename, - pattern=None, ndim=None, reduce_mask=None): - """Return a string to declare a kernel function - - .. 
code-block:: c - - __global__ void kernel_reduce_sum_110_%(nodename)s( - const int d0, - const int d1, - const int d2, - const %(dtype)s *A, - const int sA0, - const int sA1, - const int sA2, - %(dtype)s * Z, - const int sZ0) - - """ - dtype = self.dtype - if reduce_mask is None: - reduce_mask = self.reduce_mask - if ndim is None: - ndim = len(reduce_mask) - if pattern is None: - pattern = ''.join(str(i) for i in reduce_mask) - sio = StringIO.StringIO() - - print(""" - __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s( - """ % locals(), file=sio) - - for i in range(ndim): - print("""const int d%(i)s,""" % locals(), file=sio) - - print("""const %(dtype)s *A,""" % locals(), file=sio) - - for i in range(ndim): - print("""const int sA%(i)s,""" % locals(), file=sio) - - print("""%(dtype)s * Z""" % locals(), file=sio) - - for i in range(ndim - sum(reduce_mask)): - print(""", const int sZ%(i)s""" % locals(), file=sio) - - print(")", file=sio) - - return sio.getvalue() - - def _k_init(self, *args): - dtype = self.dtype - return """ - const int threadCount = blockDim.x * blockDim.y * blockDim.z; - const int threadNum = threadIdx.z * blockDim.x * blockDim.y - + threadIdx.y * blockDim.x + threadIdx.x; - extern __shared__ %(dtype)s buf[]; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32){ //TODO: set error code - Z[0] = 666; - return; - } - - """ % locals() - - def _k_reduce_buf(self, z_pos): - return """ - __syncthreads(); // some kernel do multiple reduction. - buf[threadNum] = mysum; - __syncthreads(); - - // rest of function is handled by one warp - if (threadNum < warpSize) - { - //round up all the partial sums into the first `warpSize` elements - for (int i = threadNum + warpSize; i < threadCount; i += warpSize) - { - mysum += buf[i]; - } - buf[threadNum] = mysum; - if (threadNum < 16) - { - //reduce so that threadNum 0 has the sum of everything - if(threadNum + 16 < threadCount) - buf[threadNum] += buf[threadNum+16]; - if(threadNum + 8 < threadCount) - buf[threadNum] += buf[threadNum+8]; - if(threadNum + 4 < threadCount) - buf[threadNum] += buf[threadNum+4]; - if(threadNum + 2 < threadCount) - buf[threadNum] += buf[threadNum+2]; - if(threadNum + 1 < threadCount) - buf[threadNum] += buf[threadNum+1]; - if (threadNum == 0) - { - %(z_pos)s = buf[0]; - } - } - } - """ % locals() - return """ - __syncthreads(); // some kernel do multiple reduction. - buf[threadNum] = mysum; - __syncthreads(); - - // rest of function is handled by one warp - if (threadNum < warpSize) - { - //round up all the partial sums into the first `warpSize` elements - for (int i = threadNum + warpSize; i < threadCount; i += warpSize) - { - mysum += buf[i]; - } - buf[threadNum] = mysum; -/*Comment this optimization as it don't work on Fermi GPU. 
-TODO: find why it don't work or put the GPU compute capability into the version - // no sync because only one warp is running - if(threadCount >32) - { - buf[threadNum] += buf[threadNum+16]; - buf[threadNum] += buf[threadNum+8]; - buf[threadNum] += buf[threadNum+4]; - buf[threadNum] += buf[threadNum+2]; - buf[threadNum] += buf[threadNum+1]; - if (threadNum == 0) - { - %(z_pos)s = buf[0]; - } - - } - else */ - if (threadNum < 16) - { - //reduce so that threadNum 0 has the sum of everything - if(threadNum + 16 < threadCount) - buf[threadNum] += buf[threadNum+16]; - if(threadNum + 8 < threadCount) - buf[threadNum] += buf[threadNum+8]; - if(threadNum + 4 < threadCount) - buf[threadNum] += buf[threadNum+4]; - if(threadNum + 2 < threadCount) - buf[threadNum] += buf[threadNum+2]; - if(threadNum + 1 < threadCount) - buf[threadNum] += buf[threadNum+1]; - if (threadNum == 0) - { - %(z_pos)s = buf[0]; - } - } - } - """ % locals() - - # Threads must be organized as: threadNum%nb_reduce correspond to - # the same sum - # nb_reduce<=warpSize - def _k_reduce_buf_multiple(self, z_pos, nb_reduce): - return """ - __syncthreads(); // some kernel do multiple reduction. - buf[threadNum] = mysum; - __syncthreads(); - - // rest of function is handled by one warp - if (threadNum < %(nb_reduce)s) - { - //round up all the partial sums into the first `nb_reduce` elements - for (int i = threadNum + %(nb_reduce)s; - i < threadCount; i += %(nb_reduce)s) - { - mysum += buf[i]; - } - %(z_pos)s = mysum; - } - """ % locals() - - def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): - print(""" - { - if(CudaNdarray_SIZE(%(x)s)==0){ - cudaMemset(CudaNdarray_DEV_DATA(%(z)s),0,sizeof(%(dtype)s)); - }else{ - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_SIZE(%(x)s), - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(1); - if (verbose) - printf("running kernel_reduce_sum_ccontig_%(name)s" - " n_threads.x=%%d, size=%%d, ndim=%%d\\n", - n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd); - int n_shared = sizeof(%(dtype)s) * n_threads.x; - kernel_reduce_sum_ccontig_%(name)s<<>>( - CudaNdarray_SIZE(%(x)s), - CudaNdarray_DEV_DATA(%(x)s), - CudaNdarray_DEV_DATA(%(z)s)); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, - "Cuda error: %%s: %%s. 
(grid: %%i x %%i;" - " block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_ccontig_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - } - """ % locals(), file=sio) - - def c_code_reduce_1(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_11(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.y * n_threads.x <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - ++n_threads.y; - n_threads.y -= 1; - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) - n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0]; - - dim3 n_blocks(1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): - """ - :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011, - N=3 ->0111 Work for N=1,2,3 - """ - assert N in [1, 2, 3] - makecall = self._makecall(node, name, x, z, fail) - N_pattern = ''.join(['1'] * N) - param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals() - for i in range(N + 1)]) - strides_dim = ",".join( - ["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]" % locals() - for i in range(N + 1)]) - threads_y = """ - //get as many y threads as we can fit - while (n_threads.x * (n_threads.y+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-1]) - n_threads.y += 1; - else - break; - } -""" % locals() - threads_z = """ - //get as many z threads as we can fit - while (n_threads.x * n_threads.y * (n_threads.z+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.z < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-2]) - n_threads.z += 1; - else - break; - } -""" % locals() - if len(self.reduce_mask) == 2: - threads_y = '' - threads_z = '' - if len(self.reduce_mask) == 3: - threads_z = '' - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[%(N)s], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - %(threads_y)s - %(threads_z)s - dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS)); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_01(self, sio, node, name, x, z, fail): - self.c_code_reduce_01X(sio, node, name, x, z, fail, 1) - - def c_code_reduce_011(self, sio, node, name, x, z, fail): - self.c_code_reduce_01X(sio, node, name, x, z, fail, 2) - - def c_code_reduce_0111(self, sio, node, name, x, z, fail): - self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) - - def c_code_reduce_10(self, sio, node, name, x, z, fail): - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(1, - std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - NUM_VECTOR_OP_BLOCKS)); - if (verbose) { - fprintf(stderr, - "running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n", - n_blocks.x, - n_blocks.y); - } - assert(CudaNdarray_HOST_DIMS(%(x)s)[1] == - CudaNdarray_HOST_DIMS(%(z)s)[0]); - int n_shared = sizeof(%(dtype)s) * n_threads.x; - kernel_reduce_sum_010_%(name)s<<>>( - 1, - CudaNdarray_HOST_DIMS(%(x)s)[0], - CudaNdarray_HOST_DIMS(%(x)s)[1], - 
CudaNdarray_DEV_DATA(%(x)s), - 1, - CudaNdarray_HOST_STRIDES(%(x)s)[0], - CudaNdarray_HOST_STRIDES(%(x)s)[1], - CudaNdarray_DEV_DATA(%(z)s), - 1, - CudaNdarray_HOST_STRIDES(%(z)s)[0] - ); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_010_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - """ % locals(), file=sio) - - def c_code_reduce_010(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - makecall_inner = self._makecall(node, name, x, z, - fail, pattern="010_inner") - pattern = ''.join(str(i) for i in self.reduce_mask) - print(""" - { - - // if the alternative is less buggy, consider not using this branch - if (1) - { - // If there are a lot of summations to do, then we can use - // simple parallelization - use each thread to do one sum. - // we might as well launch blocks of 32 threads because that's - // the warp size. we could schedule more threads if we were - // maxing out the gridsize below, but the gridsize is way more - // than the physical hardware and I think 32 threads - // on a huge grid is enough to fully use the hardware. - dim3 n_threads(32,1,1); - - // We kindof reshape the input implicitly to something 4D: - // the shape A,B,C -> A, B, D, E - // where C <= D*E < C+32 - // where E==32 - - int A = CudaNdarray_HOST_DIMS(%(x)s)[0]; - int B = CudaNdarray_HOST_DIMS(%(x)s)[1]; - int C = CudaNdarray_HOST_DIMS(%(x)s)[2]; - int D = C/32; - if (32*D < C) D+= 1; - assert ((C <= 32*D) && (32*D < C+32)); - - // The gridsize would ideally be (A, D). But we do the - // following logic to make sure we don't ask for a grid that - // is too big. - dim3 n_blocks(A,D); - if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) - n_blocks.x = NUM_VECTOR_OP_BLOCKS; - if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) - n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x; - int n_shared = 0; - kernel_reduce_sum_010_AD_%(name)s<<>>( - A,B,C,D, - CudaNdarray_DEV_DATA(%(x)s), - CudaNdarray_HOST_STRIDES(%(x)s)[0], - CudaNdarray_HOST_STRIDES(%(x)s)[1], - CudaNdarray_HOST_STRIDES(%(x)s)[2], - CudaNdarray_DEV_DATA(%(z)s), - CudaNdarray_HOST_STRIDES(%(z)s)[0], - CudaNdarray_HOST_STRIDES(%(z)s)[1] - ); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. 
(grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_010_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - else - { - int verbose = 2; - - dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2])); - while((n_threads.x*(n_threads.y+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - && (n_threads.y1) - printf("n_block.x.1=%%d, n_block.x.2=%%d," - " n_block.y.1=%%d, n_block.y.2=%%d,\\n", - CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS, - ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2], - (int)n_threads.x), - (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x)); - assert(n_threads.x<=32); - %(makecall_inner)s - }else{ - n_threads.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - (int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - n_blocks.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - (int)NUM_VECTOR_OP_BLOCKS); - n_blocks.y = std::min( - CudaNdarray_HOST_DIMS(%(x)s)[2], - (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x) - ); - %(makecall)s - } - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_%(pattern)s_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - } - """ % locals(), file=sio) - - def c_code_reduce_0101(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.x * n_threads.y <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; - n_threads.y += 1; - } - n_threads.y -= 1; - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], - CudaNdarray_HOST_DIMS(%(x)s)[2]); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_100(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - # use threadIdx.x for i0 - # use blockIdx.x for i1 - # use blockIdx.y for i2 - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); - while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS - && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2]) - { - n_blocks.y += 1; - } - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_110(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) - break; - n_threads.y += 1; - } - n_threads.y -= 1; - - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_001(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS)); - while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS) - { - if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) - break; - n_blocks.y += 1; - } - 
n_blocks.y -= 1; - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_111(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - - //get as many y threads as we can fit - while (n_threads.x * n_threads.y <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) - break; - n_threads.y += 1; - } - n_threads.y -= 1; - - //get as many z threads as we can fit - while (n_threads.x * n_threads.y * n_threads.z <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) - break; - n_threads.z += 1; - } - n_threads.z -= 1; - - dim3 n_blocks(1,1,1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_0011(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - - dim3 n_blocks( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS)); - - while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS && - n_blocks.y < CudaNdarray_HOST_DIMS(%(x)s)[1]) - { - n_blocks.y += 1; - } - - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK - && n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[2] - && n_threads.x * n_threads.y * sizeof(%(dtype)s) <= - (15 * 1024 - 200)) - { - n_threads.y += 1; - } - - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_1111(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - - //get as many y threads as we can fit - while (n_threads.x * n_threads.y <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) - break; - n_threads.y += 1; - } - n_threads.y -= 1; - - //get as many z threads as we can fit - while (n_threads.x * n_threads.y * n_threads.z <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) - break; - n_threads.z += 1; - } - n_threads.z -= 1; - - dim3 n_blocks(1,1,1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_1011(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - - while (n_threads.x * (n_threads.y+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - ++n_threads.y; - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2]) - n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2]; - - while (n_threads.x * n_threads.y * (n_threads.z+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - ++n_threads.z; - if (n_threads.z > 64) - n_threads.z = 64; - if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) - n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0]; - - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_cache_version(self): - return (21,) - - def c_support_code_apply(self, nodename, contig=False): - sio = StringIO.StringIO() - nd_in = len(self.reduce_mask) - dtype = self.dtype - if contig: # all(i == 1 for i in self.reduce_mask): - #this kernel is ok for up to a few thousand elements, but - # it only runs on ONE multiprocessor - reducebuf = 
self._k_reduce_buf('Z[0]') - print(""" - __global__ void kernel_reduce_sum_ccontig_%(nodename)s( - const int d0, - const %(dtype)s *A, - %(dtype)s * Z) - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - extern __shared__ %(dtype)s buf[]; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) - { - mysum += A[i0]; - } - %(reducebuf)s - } - """ % locals(), file=sio) - if self.reduce_mask == (1,): - #this kernel is ok for up to a few thousand elements, but - # it only runs on ONE multiprocessor - reducebuf = self._k_reduce_buf('Z[0]') - decl = self._k_decl(nodename) - print(""" - %(decl)s - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - extern __shared__ %(dtype)s buf[]; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) - { - %(dtype)s Ai = A[i0 * sA0]; - mysum += Ai; - } - %(reducebuf)s - } - """ % locals(), file=sio) - if self.reduce_mask == (1, 1): - #this kernel is ok for up to a few thousand elements, but - # it only runs on ONE multiprocessor - reducebuf = self._k_reduce_buf('Z[0]') - decl = self._k_decl(nodename) - init = self._k_init(nodename) - print(decl, file=sio) - print(" { ", file=sio) - print(init, file=sio) - print(""" - for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) - { - for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) - { - %(dtype)s Ai = A[i0 * sA0 + i1 * sA1]; - mysum += Ai; - } - } - """ % locals(), file=sio) - print(reducebuf, file=sio) - print(" } ", file=sio) - - #01, 011, 0111 - if (0 == self.reduce_mask[0] and - all(self.reduce_mask[1:]) and nd_in in[2, 3, 4]): - # this kernel uses one block for each row. - # threads per block for each element per row. - - N_pattern = ''.join(['1'] * (nd_in - 1)) - if nd_in == 2: - for_i1 = "for(int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)" - for_i2 = "int i2=0, sA2=0;" - for_i3 = "int i3=0, sA3=0;" - if nd_in == 3: - for_i1 = "for(int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)" - for_i2 = "for(int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)" - for_i3 = "int i3=0, sA3=0;" - if nd_in == 4: - for_i1 = "for(int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)" - for_i2 = "for(int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)" - for_i3 = "for(int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" - - reducebuf = self._k_reduce_buf('Z[i0 * sZ0]') - param_dim = ",".join(["const int d%(i)s" % locals() - for i in range(nd_in)]) - param_strides = ",".join(["const int sA%(i)s" % locals() - for i in range(nd_in)]) - decl = self._k_decl(nodename) - init = self._k_init(nodename) - print(""" - %(decl)s{ - %(init)s - for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ - mysum = 0; - %(for_i1)s{ - %(for_i2)s{ - %(for_i3)s{ - %(dtype)s Ai = A[i3 * sA3 + i2 * sA2 + - i1 * sA1 + i0 * sA0]; - mysum += Ai; - } - } - } - %(reducebuf)s - } - } - """ % locals(), file=sio) - if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): - # this kernel uses one block for each column, - # threads per block for each element per column. - - #TODO: This kernel is pretty inefficient in terms of - # reading, because if A is c_contiguous (typical - # case) then each warp is accessing non-contigous - # memory (a segment of a column). 
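The kernels emitted above all share one accumulation shape: each thread walks the reduced dimension with a stride of blockDim, keeps a private mysum, and the per-thread partials are then folded through the shared buf buffer. A small pure-Python emulation of the strided-accumulation step (illustrative only; strided_partial_sums is not part of the deleted file) shows why the partials recover the full sum:

    import numpy as np

    def strided_partial_sums(a, block_dim):
        # Emulates: for (i = threadIdx.x; i < d0; i += blockDim.x) mysum += A[i];
        # "Thread" t accumulates every block_dim-th element starting at t.
        return [a[t::block_dim].sum() for t in range(block_dim)]

    a = np.arange(1000, dtype=np.float64)
    partials = strided_partial_sums(a, block_dim=32)
    # Folding the per-thread partials (what the shared-memory reduction in
    # _k_reduce_buf does on the GPU) recovers the full reduction.
    assert np.isclose(sum(partials), a.sum())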
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]') - print(""" - __global__ void kernel_reduce_sum_010_%(nodename)s( - const int d0, - const int d1, - const int d2, - const %(dtype)s *A, const int sA0, - const int sA1, const int sA2, - %(dtype)s * Z, const int sZ0, const int sZ1) - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - extern __shared__ %(dtype)s buf[]; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - - for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) - { - for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) - { - %(dtype)s mysum = 0.0f; - for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) - { - mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; - } - %(reducebuf)s - } - } - - } - """ % locals(), file=sio) - if self.reduce_mask == (0, 1, 0): - print(""" - __global__ void kernel_reduce_sum_010_AD_%(nodename)s( - const int A, - const int B, - const int C, - const int D, - //const int E, // THIS is 32 - const %(dtype)s *X, const int sX0, - const int sX1, const int sX2, - %(dtype)s * Z, const int sZ0, const int sZ1) - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - for (int a = blockIdx.x; a < A; a += gridDim.x) - { - for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y) - { - int c = i2_D * 32 + threadIdx.x; - if (c < C) - { - mysum = 0; - for (int b = 0; b < B; ++b) - { - mysum += X[a * sX0 + b * sX1 + c * sX2]; - } - Z[a * sZ0 + c * sZ1] = mysum; - } - } - } - - } - """ % locals(), file=sio) - if self.reduce_mask == (0, 1, 0): - # - # This kernel is optimized when the inner most dimensions - # have the smallest stride. - - # this kernel uses one block for multiple column(up to 32TODO), - # threads per block for each element per column. - -#thread.x = dim 2 contiguous -#thread.y = dim 1 -#block.x = dim 0 -#block.y = dim 1 rest - init = self._k_init(nodename) - decl = self._k_decl(nodename, pattern="010_inner") - reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', - 'blockDim.x') - reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', - 'blockDim.x') - print(""" - %(decl)s - { - if(warpSize -//#include - -#include "pygpu_ndarray_object.h" - -///////////////////////// -// Alloc and Free -///////////////////////// -//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. -#define COMPUTE_GPU_MEM_USED 0 -#define VERBOSE_ALLOC_FREE 0 -//If true, we fill with NAN allocated device memory. -#define ALLOC_MEMSET 0 - -static int _outstanding_mallocs[] = {0,0}; - -#ifdef DEBUG -#define DPRINTF(args...) fprintf(stderr, args) -#else -#define DPRINTF(...) -#endif - -#if COMPUTE_GPU_MEM_USED -int _allocated_size = 0; -const int TABLE_SIZE = 10000; -struct table_struct{ - void* ptr; - int size; -}; -table_struct _alloc_size_table[TABLE_SIZE]; -#endif - -/** - * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. - * - * device_malloc will set the Python error message before returning None. 
- * device_free will return nonzero on failure (after setting the python error message) - */ -void * device_malloc(size_t size); -int device_free(void * ptr); -static PyObject * -outstanding_mallocs(PyObject* self, PyObject * args) -{ - return PyInt_FromLong(_outstanding_mallocs[0]); -} - -int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast = false); - -/** - * PyGpuNdArray_alloc_contiguous - * - * Allocate storage space for a tensor of rank 'nd' and given dimensions. - * - * Note: PyGpuNdArray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions - */ -template -int PyGpuNdArray_alloc_contiguous(PyGpuNdArrayObject *self, const int nd, const inttype * dim, NPY_ORDER order=NPY_CORDER) -{ - DPRINTF("PyGpuNdArray_alloc_contiguous: start nd=%i descr=%p\n", nd, self); - - if (!PyGpuNdArray_DESCR(self)){ - PyErr_SetString(PyExc_ValueError, - "PyGpuNdArray_alloc_contiguous: The array don't have a type! We can't allocate it!\n"); - return -1; - } - - // allocate an empty ndarray with c_contiguous access - // return 0 on success - int size = 1; //set up the strides for contiguous tensor - assert (nd >= 0); - if (PyGpuNdArray_set_nd(self, nd)) - { - return -1; - } - //TODO: check if by any chance our current dims are correct, - // and strides already contiguous - // in that case we can return right here. - DPRINTF("PyGpuNdArray_alloc_contiguous: before itemsize descr=%p elsize=%i\n", self->descr, self->descr->elsize); - int elsize = PyGpuNdArray_ITEMSIZE((PyObject*)self); - DPRINTF("PyGpuNdArray_alloc_contiguous: set_nd %d! elsize=%i\n", nd, elsize); - if(order != NPY_FORTRANORDER){ - DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_CORDER\n"); - for (int i = nd-1; i >= 0; --i){ - if (size == 0) - PyGpuNdArray_STRIDE(self, i) = elsize; - else - PyGpuNdArray_STRIDE(self,i) = size * elsize; - PyGpuNdArray_DIM(self,i) = dim[i]; - size = size * dim[i]; - } - }else if (nd>0){ - DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_FORTRANORDER\n"); - size = dim[0]; - PyGpuNdArray_STRIDE(self, 0) = elsize; - PyGpuNdArray_DIM(self, nd-1) = dim[nd-1]; - for (int i = 1; i < nd; ++i){ - if (size == 0) - PyGpuNdArray_STRIDE(self, i) = elsize; - else - PyGpuNdArray_STRIDE(self, i) = PyGpuNdArray_STRIDE(self, i-1) * dim[i-1]; - PyGpuNdArray_DIM(self, nd-i-1) = dim[nd-i-1]; - size = size * dim[i]; - } - } - - if (self->data_allocated != size) - { - // If self is a view, do not try to free its memory - if (self->data_allocated && device_free(PyGpuNdArray_DATA(self))) { - // Does this ever happen?? Do we need to set data_allocated or devdata to 0? - PyGpuNdArray_DATA(self) = NULL; - self->data_allocated = 0; - return -1; - } - - assert(size>0); - DPRINTF("PyGpuNdArray_alloc_contiguous: will allocate for size=%d elements\n", size); - - PyGpuNdArray_DATA(self) = (char*)device_malloc(size * PyGpuNdArray_ITEMSIZE((PyObject *)self)); - if (!PyGpuNdArray_DATA(self)) - { - PyGpuNdArray_set_nd(self,-1); - self->data_allocated = 0; - PyGpuNdArray_DATA(self) = 0; - return -1; - } - - // The structure of self will be reused with newly allocated memory. - // If self was a view, we should remove the reference to its base. - // (If base was already NULL, the following has no effect.) 
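The C-order branch of PyGpuNdArray_alloc_contiguous above builds the strides by walking the dimensions from last to first and accumulating the element count. A compact Python sketch of that stride computation (c_contiguous_strides is illustrative, not part of the deleted sources) matches what NumPy reports for a freshly allocated C-contiguous array:

    import numpy as np

    def c_contiguous_strides(dims, elsize):
        # stride[i] = elsize * prod(dims[i+1:]), with the size-0 case
        # falling back to elsize, exactly as in the C loop above.
        strides = [0] * len(dims)
        size = 1
        for i in range(len(dims) - 1, -1, -1):
            strides[i] = elsize if size == 0 else size * elsize
            size *= dims[i]
        return tuple(strides)

    dims = (2, 3, 5)
    print(c_contiguous_strides(dims, 4))              # (60, 20, 4)
    print(np.empty(dims, dtype=np.float32).strides)   # (60, 20, 4)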
- Py_XDECREF(self->base); - self->base = NULL; - - self->data_allocated = size; - self->gpu_ndarray.flags = NPY_DEFAULT; - PyGpuNdArray_FLAGS(self) |= NPY_WRITEABLE; - PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; - if (nd == 0) { - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - if (order != NPY_FORTRANORDER) { - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - } - - }else if(nd == 1){//set c and f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else if(order != NPY_FORTRANORDER){//set c contiguous - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else{//set f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; - } - PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; - }else if(size == 0){ - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; - if (nd == 0) { - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - if (order != NPY_FORTRANORDER) { - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - } - - }else if(nd == 1){//set c and f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else if(order != NPY_FORTRANORDER){//set c contiguous - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else{//set f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; - } - PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; - return 0; - }else{ - // How to check for the flags? Need to check if already contiguous. - PyErr_Format(PyExc_RuntimeError, - "PyGpuNdArray_alloc_contiguous: self->data_allocated=%d, size=%d, cmp=%d", - self->data_allocated, size, self->data_allocated != size - ); - return -1; - } - - if (order != NPY_FORTRANORDER) { - assert(PyGpuNdArray_is_c_contiguous(self)); - } else { - assert(PyGpuNdArray_is_f_contiguous(self)); - } - DPRINTF("PyGpuNdArray_alloc_contiguous: end\n"); - return 0; -} - -enum PyGpuTransfert { PyGpuHostToDevice, PyGpuDeviceToHost }; -int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction); - -int PyGpuMemset(void * dst, int data, size_t bytes); -#endif diff --git a/ndarray/pygpu_language_cuda.cu b/ndarray/pygpu_language_cuda.cu deleted file mode 100644 index 1270086..0000000 --- a/ndarray/pygpu_language_cuda.cu +++ /dev/null @@ -1,622 +0,0 @@ -#include -#include - -#include - -#ifdef __DEVICE_EMULATION__ -#define NUM_VECTOR_OP_BLOCKS 4096 -#define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up -#else -#define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10) -#define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10) -#endif - -#if 0 -// Do not wait after every kernel & transfer. 
-#define CNDA_THREAD_SYNC -#else -// This is useful for using normal profiling tools -#define CNDA_THREAD_SYNC cudaThreadSynchronize(); -#endif - -#ifndef SHARED_SIZE -#define SHARED_SIZE (16*1024) -#endif - - -char * -cublasGetErrorString(cublasStatus err) -{ - if (err == CUBLAS_STATUS_NOT_INITIALIZED) { - return "CUBLAS_STATUS_NOT_INITIALIZED"; - } else if (err == CUBLAS_STATUS_ALLOC_FAILED){ - return "CUBLAS_STATUS_ALLOC_FAILED"; - } else if (err == CUBLAS_STATUS_INVALID_VALUE){ - return "CUBLAS_STATUS_INVALID_VALUE"; - } else if (err == CUBLAS_STATUS_MAPPING_ERROR){ - return "CUBLAS_STATUS_MAPPING_ERROR"; - } else if (err == CUBLAS_STATUS_EXECUTION_FAILED){ - return "CUBLAS_STATUS_EXECUTION_FAILED"; - } else if (err == CUBLAS_STATUS_INTERNAL_ERROR){ - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } else { - return "UNKNOW ERROR"; - } - -} - -///////////////////////// -// Alloc and Free -///////////////////////// -void * device_malloc(size_t size) -{ - void * rval=NULL; - cudaError_t err = cudaMalloc(&rval, size); - if (cudaSuccess != err){ -#if COMPUTE_GPU_MEM_USED - fprintf(stderr, "Error allocating %li bytes of device memory (%s). %d already allocated\n", - (long)size, cudaGetErrorString(err),_allocated_size); -#else - fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", - (long)size, cudaGetErrorString(err)); -#endif - PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", - (long)size, cudaGetErrorString(err)); - return NULL; - } - _outstanding_mallocs[0] += (rval != NULL); -#if COMPUTE_GPU_MEM_USED - for(int i=0;i __device__ T unary_copy(T a) { return a; } -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_float, unary_copy, float) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_double, unary_copy, double) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint8, unary_copy, uint8_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int8, unary_copy, int8_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint16, unary_copy, uint16_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int16, unary_copy, int16_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint32, unary_copy, uint32_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int32, unary_copy, int32_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint64, unary_copy, uint64_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int64, unary_copy, int64_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex64, unary_copy, npy_complex64) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex128, unary_copy, npy_complex128) - -//template __device__ T unary_exp(T a) { return exp(a); } -//decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp) - -template -static __global__ void k_copy_1d(const int N, const T * x, const ssize_t sx, T * y, const ssize_t sy) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x) { - y[i*sy] = x[i*sx]; - } -} - -//copy from other into self -//don't allocated memory -int -PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast) -{ - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray start nd=%d\n", PyGpuNdArray_NDIM(self)); - assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); - assert(PyGpuNdArray_ISWRITEABLE(self)); - //standard elemwise size checks - if (PyGpuNdArray_NDIM(self) 
== -1) { - PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuNdArrayObject"); - return -1; - } - if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); - return -1; - } - //standard elemwise dim checks (also compute total size) - unsigned int size = 1; - unsigned int size_source = 1; - for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { - if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) - && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { - PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", - i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); - return -1; - } - size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; - size_source *= (unsigned int) PyGpuNdArray_DIMS(other)[i]; - } - if (0 == size) { - return 0; //nothing to copy, we're done. - } - - //cublas don't support negative stride - bool pos_stride = true; - for (int i = 0; i < PyGpuNdArray_NDIM(other); ++i) - if (PyGpuNdArray_STRIDE(other,i)<0) - pos_stride = false; - - void * other_data = PyGpuNdArray_DATA(other) + PyGpuNdArray_OFFSET(other); - void * self_data = PyGpuNdArray_DATA(self) + PyGpuNdArray_OFFSET(self); - - //Try to transfer with cublas(we suppose it is faster) - if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && - size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT32 && - pos_stride - ) { - cublasScopy(size, (float*) other_data, 1, (float*) self_data, 1); - CNDA_THREAD_SYNC; - if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { - PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); - return -1; - } - - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: cublasScopy end\n"); - return 0; - } - if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && - size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT64 && - pos_stride) { - cublasDcopy(size, (double*) other_data, 1, (double*) self_data, 1); - CNDA_THREAD_SYNC; - if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { - PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); - return -1; - } - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray cublasDcopy end\n"); - return 0; - } - - //TODO: rewrite these copy operations to be more efficient - // See, for example the transpose example in the cuda_sdk. 
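For orientation on the generic copy branch below: the kernel is handed the logical dims plus the source and destination strides (in elements), and each thread turns a flat row-major index into a strided offset on both sides. The macro body of the copy kernels is not shown in this hunk, so the following Python sketch of the index arithmetic is a hedged illustration of the general technique, not the actual kernel:

    def rowmajor_offset(flat_index, dims, strides):
        # Decompose a flat C-order index into a multi-index, then dot it
        # with the (element-unit) strides to get the memory offset.
        offset = 0
        for d, s in zip(reversed(dims), reversed(strides)):
            offset += (flat_index % d) * s
            flat_index //= d
        return offset

    dims = (2, 3)
    # A C-contiguous layout has element strides (3, 1) ...
    assert [rowmajor_offset(i, dims, (3, 1)) for i in range(6)] == [0, 1, 2, 3, 4, 5]
    # ... while a transposed view of the same buffer has strides (1, 2).
    assert [rowmajor_offset(i, dims, (1, 2)) for i in range(6)] == [0, 2, 4, 1, 3, 5]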
- switch (PyGpuNdArray_NDIM(self)) { - case 0: // scalar - { - // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS - assert(0); - }; break; - case 1: // vector - { - assert(PyGpuNdArray_ISALIGNED(self)); - assert(PyGpuNdArray_ISALIGNED(other)); - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying non-contiguous vector\n"); - unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); - unsigned int n_threads = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - - if (PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { - const int elsize = sizeof(float); - k_copy_1d<<>>(size, - (float*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (float*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { - const int elsize = sizeof(double); - k_copy_1d<<>>(size, - (double*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (double*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT8) { - const int elsize = sizeof(int8_t); - k_copy_1d<<>>(size, - (int8_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int8_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT16) { - const int elsize = sizeof(int16_t); - k_copy_1d<<>>(size, - (int16_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int16_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT32) { - const int elsize = sizeof(int32_t); - k_copy_1d<<>>(size, - (int32_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int32_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT64) { - const int elsize = sizeof(int64_t); - k_copy_1d<<>>(size, - (int64_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int64_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT8) { - const int elsize = sizeof(uint8_t); - k_copy_1d<<>>(size, - (uint8_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint8_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT16) { - const int elsize = sizeof(uint16_t); - k_copy_1d<<>>(size, - (uint16_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint16_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT32) { - const int elsize = sizeof(uint32_t); - k_copy_1d<<>>(size, - (uint32_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint32_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT64) { - const int elsize = sizeof(uint64_t); - k_copy_1d<<>>(size, - (uint64_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint64_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { - const int elsize = sizeof(npy_complex64); - k_copy_1d<<>>(size, - (npy_complex64*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (npy_complex64*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { - const int elsize = sizeof(npy_complex128); - k_copy_1d<<>>(size, - (npy_complex128*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (npy_complex128*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else { - PyErr_Format(PyExc_NotImplementedError, 
"PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); - return -1; - } - - CNDA_THREAD_SYNC; - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads); - return -1; - } - }; break; - default: - { - assert (cudaSuccess == cudaGetLastError()); - assert(PyGpuNdArray_ISALIGNED(self)); - assert(PyGpuNdArray_ISALIGNED(other)); - - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying with default version unbroadcast=%d\n", unbroadcast); - // Identigy the dim of the output memory. - PyGpuNdArrayObject * cuda_dims = other; - if(unbroadcast) - cuda_dims = self; - - // Move the dim and strides information on the gpu memory - int ndim = PyGpuNdArray_NDIM(other); - void * strides_dev = device_malloc(sizeof(ssize_t)*ndim*3); - ssize_t * strides_dev_p = (ssize_t *) strides_dev; - cudaError_t err = cudaMemcpy(strides_dev, PyGpuNdArray_DIMS(cuda_dims), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory1: %s", cudaGetErrorString(err)); - return -1; - } - err = cudaMemcpy((void*)(strides_dev_p+ndim), PyGpuNdArray_STRIDES(other), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory2: %s", cudaGetErrorString(err)); - return -1; - } - err = cudaMemcpy((void*)(strides_dev_p+(ndim*2)), PyGpuNdArray_STRIDES(self), ndim*sizeof(ssize_t), cudaMemcpyHostToDevice); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory3: %s", cudaGetErrorString(err)); - return -1; - } - void * strides_host = malloc(sizeof(ssize_t)*ndim*3); - err = cudaMemcpy(strides_host, strides_dev, ndim*3*sizeof(ssize_t),cudaMemcpyDeviceToHost); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory4: %s", cudaGetErrorString(err)); - return -1; - } -#ifdef DEBUG - for(int i=0;i<3*ndim;i++) - DPRINTF(" %ld", ((ssize_t *)strides_host)[i]); - DPRINTF("\n"); -#endif - CNDA_THREAD_SYNC; - if(cudaSuccess != cudaGetLastError()){ - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: error before copy\n"); - return -1; - } - - // call worker routine - unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); - unsigned int threads_per_block = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - - if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { - k_elemwise_unary_rowmajor_copy_float<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const float*)other_data, - strides_dev_p+ndim, - (float*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { - k_elemwise_unary_rowmajor_copy_double<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const double*)other_data, - strides_dev_p+ndim, - (double*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_INT8) { - k_elemwise_unary_rowmajor_copy_int8<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int8_t*)other_data, - strides_dev_p+ndim, - (int8_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_INT16) { - k_elemwise_unary_rowmajor_copy_int16<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int16_t*)other_data, - strides_dev_p+ndim, - (int16_t*) self_data, - strides_dev_p+(ndim*2)); - 
} else if ( PyGpuNdArray_TYPE(self) == NPY_INT32) { - k_elemwise_unary_rowmajor_copy_int32<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int32_t*)other_data, - strides_dev_p+ndim, - (int32_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_INT64) { - k_elemwise_unary_rowmajor_copy_int64<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int64_t*)other_data, - strides_dev_p+ndim, - (int64_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT8) { - k_elemwise_unary_rowmajor_copy_uint8<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint8_t*)other_data, - strides_dev_p+ndim, - (uint8_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT16) { - k_elemwise_unary_rowmajor_copy_uint16<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint16_t*)other_data, - strides_dev_p+ndim, - (uint16_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT32) { - k_elemwise_unary_rowmajor_copy_uint32<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint32_t*)other_data, - strides_dev_p+ndim, - (uint32_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT64) { - k_elemwise_unary_rowmajor_copy_uint64<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint64_t*)other_data, - strides_dev_p+ndim, - (uint64_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { - k_elemwise_unary_rowmajor_copy_complex64<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const npy_complex64*)other_data, - strides_dev_p+ndim, - (npy_complex64*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { - k_elemwise_unary_rowmajor_copy_complex128<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const npy_complex128*)other_data, - strides_dev_p+ndim, - (npy_complex128*) self_data, - strides_dev_p+(ndim*2)); - } else { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); - return -1; - } - CNDA_THREAD_SYNC; - err = cudaGetLastError(); - if( cudaSuccess != err) { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block); - return -1; - } - device_free(strides_dev); - free(strides_host); - } - }; - // Set flags - if (false && PyGpuNdArray_NDIM(self) == 0) { - //Numpy 1.4.1 is not consistent here - //When we create a new numpy ndarray of 0 dim, it is not f contiguous - //But when we take a subtensor that is of 0 dim, it is f contiguous! - //We make as them for now... 
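The flag bookkeeping that follows mirrors NumPy's own contiguity rules (0-d and 1-d arrays count as both C- and F-contiguous; a transposed 2-d array is F-contiguous only). As a quick orientation check with present-day NumPy, which no longer shows the 1.4.1 inconsistency mentioned in the comment above:

    import numpy as np

    x = np.zeros((3, 4), dtype=np.float32)
    print(x.flags['C_CONTIGUOUS'], x.flags['F_CONTIGUOUS'])      # True False
    print(x.T.flags['C_CONTIGUOUS'], x.T.flags['F_CONTIGUOUS'])  # False True

    v = np.zeros(5)                  # 1-d: both flags set
    print(v.flags['C_CONTIGUOUS'], v.flags['F_CONTIGUOUS'])      # True True

    z = np.zeros(())                 # 0-d: both flags set in current NumPy
    print(z.flags['C_CONTIGUOUS'], z.flags['F_CONTIGUOUS'])      # True True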
- PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - } else { - if (PyGpuNdArray_is_c_contiguous(self)) { - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; - } - if (PyGpuNdArray_is_f_contiguous(self)) { - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - } - } - - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray end\n"); - return 0; -} - -int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, - PyGpuTransfert direction){ - DPRINTF("PyGpuMemcpy: start\n"); - cudaMemcpyKind dir; - const char * ssrc; - const char * ddst; - if (direction == PyGpuDeviceToHost){ - dir = cudaMemcpyDeviceToHost; - ssrc = (char*)src+dev_offset; - ddst = (char*)dst; - } else if (direction == PyGpuHostToDevice) { - dir = cudaMemcpyHostToDevice; - ssrc = (char*)src; - ddst = (char*)dst + dev_offset; - } else { - PyErr_Format(PyExc_ValueError, - "PyGpuMemcpy: Received wrong direction %d!\n", - direction); - return -1; - } - cudaError_t err = cudaMemcpy((void*)ddst, (void*)ssrc, bytes, dir); - CNDA_THREAD_SYNC; - if (cudaSuccess != err) { - PyErr_Format(PyExc_RuntimeError, "PyGpuMemcpy: cudaMemcpy: error copying data to host (%s)", - cudaGetErrorString(err)); - return -1; - } - DPRINTF("PyGpuMemcpy: end\n"); - return 0; -} - -int PyGpuMemset(void * dst, int data, size_t bytes){ - DPRINTF("PyGpuMemset: start\n"); - cudaError_t err = cudaMemset(dst, data, bytes); - CNDA_THREAD_SYNC; - if (cudaSuccess != err) { - PyErr_Format(PyExc_MemoryError, "PyGpuMemset: Error memsetting %ld bytes of device memory(%s). %p", - bytes, cudaGetErrorString(err), PyGpuNdArray_DATA(dst)); - DPRINTF("PyGpuMemset: end error\n"); - return -1; - } - DPRINTF("PyGpuMemset: end\n"); - return 0; -} - -/* - Local Variables: - mode:c++ - c-basic-offset:4 - c-file-style:"stroustrup" - c-file-offsets:((innamespace . 0)(inline-open . 
0)) - indent-tabs-mode:nil - fill-column:79 - End: -*/ -// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : diff --git a/ndarray/pygpu_language_opencl.cpp b/ndarray/pygpu_language_opencl.cpp deleted file mode 100644 index a237aff..0000000 --- a/ndarray/pygpu_language_opencl.cpp +++ /dev/null @@ -1,317 +0,0 @@ -#include -#include -#include - -#include -#include - -#ifdef __APPLE__ - -#include - -#else - -#include - -#endif - -cl_context ctx = NULL; -cl_device_id dev; -cl_command_queue q; - -void setup_context(cl_context c); - -static void -init_context(void) -{ - cl_int err; - cl_uint n; - cl_platform_id *plats; - cl_context_properties props[3]; - cl_context c; - - if (ctx != NULL) return; - - err = clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) return; - - plats = (cl_platform_id *)calloc(n, sizeof(cl_platform_id)); - if (plats == NULL) return; - - err = clGetPlatformIDs(n, plats, NULL); - if (err != CL_SUCCESS) goto fail_id; - - props[0] = CL_CONTEXT_PLATFORM; - props[1] = (cl_context_properties)plats[0]; - props[2] = 0; - - c = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); - if (err != CL_SUCCESS) { - fprintf(stderr, "Could not create context, will fail later (%d)!\n", err); - /* error - error - error */ - /* but we do nothing */ - goto fail_id; - } - - free(plats); - - setup_context(c); - clReleaseContext(c); - - return; - fail_id: - free(plats); -} - -void -setup_context(cl_context c) { - cl_int err; - cl_device_id *devs; - size_t sz; - - if (ctx != NULL) { - clReleaseContext(ctx); - clReleaseCommandQueue(q); - } - ctx = c; - clRetainContext(ctx); - - err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz); - if (err != CL_SUCCESS) { - fprintf(stderr, "clGetContextInfo = %d\n", err); - goto fail; - } - - devs = (cl_device_id *)malloc(sz); - if (devs == NULL) goto fail; - - err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, devs, NULL); - if (err != CL_SUCCESS) goto fail_dev; - - dev = devs[0]; - free(devs); - - q = clCreateCommandQueue(ctx, dev, NULL, &err); - if (err != CL_SUCCESS) { - fprintf(stderr, "clCreateCommandQueue = %d", err); - goto fail; - } - - return; - fail_dev: - free(devs); - fail: - clReleaseContext(ctx); - ctx = NULL; -} - -void * -device_malloc(size_t size) -{ - cl_int err; - cl_mem res; - - init_context(); - - DPRINTF("malloc size = %zu\n", size); - - /* OpenCL devices do not always support byte-addressable storage - therefore make sure we have at least 4 bytes in buffers */ - if (size < 4) size = 4; - - res = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err); - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_MemoryError, "Could not allocate device memory (%d)", err); - return NULL; - } - - return res; -} - -int -device_free(void * ptr) -{ - cl_int err; - - if ((err = clReleaseMemObject((cl_mem)ptr)) != CL_SUCCESS) { - PyErr_Format(PyExc_MemoryError, "Could not free device memory (%d)", err); - return -1; - } - return 0; -} - -int -PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, - PyGpuNdArrayObject * other, - bool unbroadcast) -{ - size_t size = 1; - cl_event ev; - cl_int err; - - assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); - assert(PyGpuNdArray_ISWRITEABLE(self)); - if (PyGpuNdArray_NDIM(self) == -1) { - PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuN\ -dArrayObject"); - return -1; - } - - if (!(PyGpuNdArray_ISONESEGMENT(self) && PyGpuNdArray_ISONESEGMENT(other))) { - PyErr_Format(PyExc_NotImplementedError, 
"PyGpuNdArray_CopyFromPyGpuNdArray: only contiguous arrays are supported"); - return -1; - } - - if ((PyGpuNdArray_ISCONTIGUOUS(self) != PyGpuNdArray_ISCONTIGUOUS(other)) || - (PyGpuNdArray_ISFORTRAN(self) != PyGpuNdArray_ISFORTRAN(other)) - ) { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: the input and output don't have the same c/f contiguous memory layout. This isnot supported now."); - return -1; - } - - if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); - return -1; - } - - for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { - if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) - && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { - PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", - i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); - return -1; - } - size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; - } - - if (0 == size) { - return 0; //nothing to copy, we're done. - } - size *= PyGpuNdArray_ITEMSIZE(self); - - if ((err = clEnqueueCopyBuffer(q, (cl_mem)PyGpuNdArray_DATA(other), - (cl_mem)PyGpuNdArray_DATA(self), - PyGpuNdArray_OFFSET(other), - PyGpuNdArray_OFFSET(self), - size, 0, NULL, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create copy command (%d)", err); - return -1; - } - if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not copy data (%d)", err); - clReleaseEvent(ev); - return -1; - } - clReleaseEvent(ev); - - return 0; -} - -int -PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, - PyGpuTransfert direction) -{ - cl_int err; - cl_event ev; - - switch (direction) - { - case PyGpuHostToDevice: - err = clEnqueueWriteBuffer(q, (cl_mem)dst, CL_FALSE, dev_offset, bytes, - src, 0, NULL, &ev); - break; - case PyGpuDeviceToHost: - err = clEnqueueReadBuffer(q, (cl_mem)src, CL_FALSE, dev_offset, bytes, - dst, 0, NULL, &ev); - break; - default: - PyErr_Format(PyExc_ValueError, "Unknown direction %d", direction); - return -1; - } - - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create memcpy command (%d)", err); - return -1; - } - - if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not memcpy data (%d)", err); - clReleaseEvent(ev); - return -1; - } - clReleaseEvent(ev); - - return 0; -} - -int -PyGpuMemset(void * dst, int data, size_t bytes) -{ - /* This should be at least one byte over the formatted string below */ - char local_kern[92]; - const char *rlk[1]; - size_t sz; - int r, res = -1; - - cl_int err; - cl_event ev; - cl_program p; - cl_kernel k; - - bytes = (bytes+3)/4; - - if (bytes == 0) - return 0; - - unsigned char val = (unsigned)data; - unsigned int pattern = (unsigned int)val & (unsigned int)val >> 8 & (unsigned int)val >> 16 & (unsigned int)val >> 24; - - r = snprintf(local_kern, sizeof(local_kern), - "__kernel void memset(__global unsigned int *mem) { mem[get_global_id(0)] = %u; }", pattern); - /* If this assert fires, increase the size of local_kern above. 
*/ - assert(r >= sizeof(local_kern)); - - - sz = strlen(local_kern); - rlk[0] = local_kern; - p = clCreateProgramWithSource(ctx, 1, rlk, &sz, &err); - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create program (%d)", err); - return -1; - } - - if ((err = clBuildProgram(p, 1, &dev, NULL, NULL, NULL)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not build program (%d)", err); - goto fail_prog; - } - - k = clCreateKernel(p, "memset", &err); - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create kernel (%d)", err); - goto fail_prog; - } - - if ((err = clSetKernelArg(k, 0, sizeof(cl_mem), &dst)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not set kernel arg (%d)", err); - goto fail_kern; - } - - if ((err = clEnqueueNDRangeKernel(q, k, 1, NULL, &bytes, NULL, 0, NULL, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not enqueue kernel (%d)", err); - goto fail_kern; - } - - if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not memset (%d)", err); - } - - /* success! */ - res = 0; - - clReleaseEvent(ev); - fail_kern: - clReleaseKernel(k); - fail_prog: - clReleaseProgram(p); - return res; -} diff --git a/ndarray/pygpu_ndarray.cpp b/ndarray/pygpu_ndarray.cpp deleted file mode 100644 index 31b60dc..0000000 --- a/ndarray/pygpu_ndarray.cpp +++ /dev/null @@ -1,1546 +0,0 @@ -#include -#include - -#include -#include - -#include "pygpu_ndarray.h" -#include "pygpu_language.h" - -///////////////////////// -// Static helper methods -///////////////////////// - -static void -PyGpuNdArray_null_init(PyGpuNdArrayObject *self) -{ - DPRINTF("PyGpuNdArrayObject_null_init\n"); - - PyGpuNdArray_DATA(self) = NULL; - PyGpuNdArray_OFFSET(self) = 0; - PyGpuNdArray_NDIM(self) = -1; - self->base = NULL; - PyGpuNdArray_DIMS(self) = NULL; - PyGpuNdArray_STRIDES(self) = NULL; - PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; - self->descr = NULL; - - self->data_allocated = 0; -} - - - -///////////////////////////// -// Satisfying reqs to be Type -///////////////////////////// - -//DON'T use directly(if their is other PyGpuNdArrayObject that point to it, it will cause problem)! use Py_DECREF() instead -static void -PyGpuNdArrayObject_dealloc(PyGpuNdArrayObject* self) -{ - DPRINTF("PyGpuNdArrayObject_dealloc\n"); - DPRINTF("PyGpuNdArrayObject dealloc %p %d %p\n", self, self->data_allocated, PyGpuNdArray_DATA(self)); - - if(self->ob_refcnt>1) - printf("WARNING:PyGpuNdArrayObject_dealloc called when their is still active reference to it.\n"); - - if (self->data_allocated){ - assert(PyGpuNdArray_DATA(self)); - if (PyGpuNdArray_DATA(self)){ - if (device_free(PyGpuNdArray_DATA(self))){ - fprintf(stderr, - "!!!! error freeing device memory %p (self=%p)\n", - PyGpuNdArray_DATA(self), self); - } - PyGpuNdArray_DATA(self) = NULL; - } - } - PyGpuNdArray_OFFSET(self) = 0; - PyGpuNdArray_NDIM(self) = -1; - Py_XDECREF(self->base); - self->base = NULL; - if (PyGpuNdArray_DIMS(self)){ - free(PyGpuNdArray_DIMS(self)); - PyGpuNdArray_DIMS(self) = NULL; - } - if (PyGpuNdArray_STRIDES(self)){ - free(PyGpuNdArray_STRIDES(self)); - PyGpuNdArray_STRIDES(self) = NULL; - } - PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; - //Py_XDECREF(self->descr);//TODO: How to handle the refcont on this object? 
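The TODO above concerns ownership of self->descr. One conventional CPython pattern, offered here as an assumption rather than something the original code settles, is for the object to hold exactly one strong reference to the descriptor (PyArray_DescrFromType already returns a new reference at init time) and to drop it during deallocation:

    /* Sketch only: valid if init keeps exactly one owned reference to descr. */
    Py_CLEAR(self->descr);   /* decref and set to NULL */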
- self->descr = NULL; - self->data_allocated = 0; - - self->ob_type->tp_free((PyObject*)self); - --_outstanding_mallocs[1]; - DPRINTF("device_malloc_counts: (device) %i (obj) %i\n", - _outstanding_mallocs[0], - _outstanding_mallocs[1]); - DPRINTF("PyGpuNdArrayObject_dealloc end\n"); -} - -static PyObject * -PyGpuNdArray_new(PyTypeObject *type, PyObject *args, PyObject *kwds) -{ - DPRINTF("PyGpuNdArray_new\n"); - PyGpuNdArrayObject *self; - - self = (PyGpuNdArrayObject *)type->tp_alloc(type, 0); - if (self != NULL){ - PyGpuNdArray_null_init(self); - ++_outstanding_mallocs[1]; - } - DPRINTF("PyGpuNdArray_new end %p\n", self); - return (PyObject *)self; -} - -static int -PyGpuNdArray_init(PyGpuNdArrayObject *self, PyObject *args, PyObject *kwds) -{ - DPRINTF("PyGpuNdArray_init\n"); - PyObject *arr=NULL; - - if (! PyArg_ParseTuple(args, "O", &arr)) - return -1; - if (! PyArray_Check(arr)){ - PyErr_SetString(PyExc_TypeError, "PyGpuNdArrayObject_init: PyArray or PyGpuNdArrayObject arg required"); - return -1; - } - - // TODO: We must create a new copy of the PyArray_Descr(or this only increment the refcount?) or still the reference? - PyArray_Descr * type = PyArray_DescrFromType(PyArray_TYPE(arr)); - self->descr = type; - Py_XINCREF(self->descr);//TODO: How to handle the refcont on this object? - int rval = PyGpuNdArray_CopyFromArray(self, (PyArrayObject*)arr); - DPRINTF("PyGpuNdArray_init: end %p type=%p\n", self, self->descr); - return rval; -} - - -int -PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj) -{ - DPRINTF("PyGpuNdArray_CopyFromArray: start descr=%p\n", self->descr); - //modif done to the new array won't be updated! - assert(!PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - //Aligned are not tested, so don't allow it for now - assert(PyGpuNdArray_CHKFLAGS(self, NPY_ALIGNED)); - - int typenum = PyArray_TYPE(obj); - PyObject * py_src = NULL; - if (PyArray_ISONESEGMENT(obj)) { - Py_INCREF(obj); - py_src = (PyObject *) obj; - }else{ - py_src = PyArray_ContiguousFromAny((PyObject*)obj, typenum, - PyArray_NDIM(obj), - PyArray_NDIM(obj)); - } - DPRINTF("PyGpuNdArray_CopyFromArray: contiguous!\n"); - if (!py_src) { - return -1; - } - - int err; - if(PyArray_ISFORTRAN(obj) && ! PyArray_ISCONTIGUOUS(obj)){ - DPRINTF("PyGpuNdArray_CopyFromArray: fortran!\n"); - err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions, - NPY_FORTRANORDER); - }else{ - err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions); - } - if (err) { - return err; - } - - //check that the flag are the same - if (PyArray_ISCONTIGUOUS(py_src) != PyGpuNdArray_ISCONTIGUOUS(self) && - PyArray_ISFORTRAN(obj) && 0) { - PyErr_Format(PyExc_RuntimeError, "ISCONTIGUOUS %d %d\n", PyArray_ISCONTIGUOUS(py_src), PyGpuNdArray_ISCONTIGUOUS(self)); - return -1; - } - assert(PyArray_ISCONTIGUOUS(py_src) == PyGpuNdArray_ISCONTIGUOUS(self) || - PyArray_ISFORTRAN(obj)); - assert(PyArray_ISFORTRAN(py_src) == PyGpuNdArray_ISFORTRAN(self)); - assert(PyArray_ISALIGNED(py_src) == PyGpuNdArray_ISALIGNED(self)); - - // New memory, so we should own it. 
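    // Note on the transfer a few lines below: PyGpuMemcpy takes a byte count
    // (element count times itemsize) and applies dev_offset to whichever
    // pointer lives on the device -- here the destination, since the
    // direction is PyGpuHostToDevice.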
- assert(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); - // New memory, so it should be writable - assert(PyGpuNdArray_ISWRITEABLE(self)); - - err = PyGpuMemcpy(PyGpuNdArray_DATA(self), - PyArray_DATA(py_src), - PyGpuNdArray_OFFSET(self), - PyArray_SIZE(py_src) * PyArray_ITEMSIZE(py_src), - PyGpuHostToDevice); - if (err) { - Py_DECREF(py_src); - return -1; - } - Py_DECREF(py_src); - DPRINTF("PyGpuNdArray_CopyFromArray: end\n"); - return 0; -} - -static PyObject * PyGpuNdArray_copy(PyObject * self, PyObject *args, - PyObject *kargs) -{ - DPRINTF("PyGpuNdArray_copy start\n"); - static const char *kwlist[] = {"order", NULL}; - NPY_ORDER order = PyArray_CORDER; - - if(!PyGpuNdArray_Check(self)){ - PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_copy: expected a PyGpuNdArrayObject."); - return NULL; - } - - DPRINTF("PyGpuNdArray_copy before parse inputs\n"); - if (!PyArg_ParseTupleAndKeywords(args, kargs, "|O&", - (char**)kwlist, - PyArray_OrderConverter, - &order)) { - DPRINTF("PyGpuNdArray_copy start1.2\n"); - return NULL; - } - DPRINTF("PyGpuNdArray_copy after parse inputs\n"); - - DPRINTF("PyGpuNdArray_copy before copy\n"); - PyObject *ret = PyGpuNdArray_Copy((PyGpuNdArrayObject*)self, order); - DPRINTF("PyGpuNdArray_copy end\n"); - return ret; -} - -static PyObject * PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order) -{ - DPRINTF("PyGpuNdArray_Copy start\n"); - PyObject * rval = PyGpuNdArray_New(); - //TODO find how to refcount descr. - PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); - if ((!rval) || (-1 == PyGpuNdArray_NDIM(self))) { - return rval; - } - if (PyGpuNdArray_alloc_contiguous((PyGpuNdArrayObject*)rval, - PyGpuNdArray_NDIM(self), - PyGpuNdArray_DIMS(self), - order)) { - Py_DECREF(rval); - return NULL; - } - - if (PyGpuNdArray_CopyFromPyGpuNdArray((PyGpuNdArrayObject*)rval, self)) { - Py_DECREF(rval); - return NULL; - } - if (order == NPY_F_CONTIGUOUS) - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - -#ifdef DEBUG - PyGpuNdArray_fprint(stderr, self); - PyGpuNdArray_fprint(stderr, (PyGpuNdArrayObject *)rval); -#endif - DPRINTF("PyGpuNdArray_Copy end\n"); - return rval; -} - -PyObject * PyGpuNdArray_DeepCopy(PyGpuNdArrayObject * self, PyObject * memo) -{ - assert(PyDict_Check(memo)); - PyObject * selfkey = PyInt_FromLong((long)self); - assert(selfkey); - - if (PyDict_Contains(memo, selfkey)) { - PyObject * rval = PyDict_GetItem(memo, selfkey); - Py_DECREF(selfkey); - Py_XINCREF(rval); - return rval; - } else { - DPRINTF("PyGpuNdArray_DeepCopy: startd deepcopy\n"); - PyObject * rval = PyGpuNdArray_Copy(self); - if (NULL == rval) { - Py_DECREF(selfkey); - return NULL; - } - - DPRINTF("DeepCopy created %p\n", rval); - DPRINTF("DeepCopy created %p %p\n", PyGpuNdArray_DESCR(rval), PyGpuNdArray_DATA(rval)); - if (PyDict_SetItem(memo, selfkey, rval)) { - Py_DECREF(rval); - Py_DECREF(selfkey); - return NULL; - } - Py_DECREF(selfkey); - DPRINTF("PyGpuNdArray_DeepCopy: startd end\n"); - return rval; - } -} - -PyObject * PyGpuNdArray_View(PyGpuNdArrayObject * self) -{ - PyGpuNdArrayObject * rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); - if (!rval || PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), - (PyObject *)self, PyGpuNdArray_OFFSET(self))) { - Py_XDECREF(rval); - DPRINTF("PyGpuNdArray_View: no rval or PyGpuNdArray_set_data " - "failed: self=%p, rval=%p rval_base=%p\n", - self, rval, rval->base); - return NULL; - } else { - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { - PyGpuNdArray_DIM(rval, i) = PyGpuNdArray_DIMS(self)[i]; - 
PyGpuNdArray_STRIDE(rval, i) = PyGpuNdArray_STRIDES(self)[i]; - } - } - DPRINTF("PyGpuNdArray_View: self=%p, self->base=%p" - " rval=%p rval->base=%p\n", - self, self->base, rval, rval->base); - //TODO: find how to refcount on the descr! - //Py_INCREF(PyGpuNdArray_DESCR(self)); - PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); - PyGpuNdArray_FLAGS(rval) = PyGpuNdArray_FLAGS(self); - PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; - - - return (PyObject*)rval; -} - -//updated for offset -PyObject * PyGpuNdArray_CreateArrayObj(PyGpuNdArrayObject * self) -{ - DPRINTF("PyGpuNdArray_CreateArrayObj\n"); - - if(PyGpuNdArray_NDIM(self)>=0 && PyGpuNdArray_SIZE(self)==0){ - npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); - assert (npydims); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); - - // Numpy will do a decref on the description. - Py_INCREF(PyGpuNdArray_DESCR(self)); - - // We can't use PyArray_{Empty,EMPTY} as they segfault when size == 0 - PyObject * rval = PyArray_NewFromDescr(&PyArray_Type, - PyGpuNdArray_DESCR(self), - PyGpuNdArray_NDIM(self), - npydims, - NULL, - NULL, - 0, - NULL); - - free(npydims); - if (!rval){ - return NULL; - } - assert (PyArray_ITEMSIZE(rval) == PyGpuNdArray_ITEMSIZE(self)); - return rval; - } - if ((PyGpuNdArray_NDIM(self) < 0) || (PyGpuNdArray_DATA(self) == 0)) { - PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized PyGpuNdArray"); - return NULL; - } - PyGpuNdArrayObject * contiguous_self = NULL; - bool pos_stride = true; - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - if (PyGpuNdArray_STRIDE(self,i)<0) - pos_stride = false; - if (PyGpuNdArray_ISONESEGMENT(self) && pos_stride) { - contiguous_self = self; - Py_INCREF(contiguous_self); - DPRINTF("PyGpuNdArray_CreateArrayObj: gpu array already contiguous %p\n", contiguous_self); - //}else if(PyGpuNdArray_ISONESEGMENT(self)){ - //TODO implement special object handling to speed up transfer - // DPRINTF("CreateArrayObj one segment, with special handling %p\n", contiguous_self); - //PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_CreateArrayObj: Need PyGpuNdArray_Copy or some other nd array mandling to transfer contiguous bloc with negative stride."); - //return NULL; - } else { - contiguous_self = (PyGpuNdArrayObject*)PyGpuNdArray_Copy(self); - DPRINTF("CreateArrayObj created contiguous %p\n", contiguous_self); - } - if (!contiguous_self) { - return NULL; - } - - npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); - assert (npydims); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); - Py_INCREF(PyGpuNdArray_DESCR(self)); - PyObject * rval = PyArray_Empty(PyGpuNdArray_NDIM(self), - npydims, - PyGpuNdArray_DESCR(self), - PyGpuNdArray_ISFORTRAN(self)); - free(npydims); - if (!rval) { - Py_DECREF(contiguous_self); - return NULL; - } - - int err = PyGpuMemcpy(PyArray_DATA(rval), - PyGpuNdArray_DATA(contiguous_self), - PyGpuNdArray_OFFSET(contiguous_self), - PyArray_SIZE(rval) * PyArray_ITEMSIZE(rval), - PyGpuDeviceToHost); - if (err) { - Py_DECREF(contiguous_self); - Py_DECREF(rval); - rval = NULL; - } - Py_DECREF(contiguous_self); - return rval; -} - -static PyObject * -PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) -{ - DPRINTF("PyGpuNdArray_Empty: start!\n"); - PyGpuNdArrayObject* rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(); - PyGpuNdArray_DESCR(rval) = dtype; - if (!rval) { - 
DPRINTF("PyGpuNdArray_Empty: fail!\n"); - return NULL; - } - NPY_ORDER order = NPY_CORDER; - if (fortran!=0) - order = NPY_FORTRANORDER; - - if (PyGpuNdArray_alloc_contiguous(rval, nd, dims, order)) { - Py_DECREF(rval); - return NULL; - } - - DPRINTF("PyGpuNdArray_Empty: end!\n"); - return (PyObject*) rval; -} - -//DONE: dtype, offset not needed, flags -static PyObject * -PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) -{ - DPRINTF("PyGpuNdArray_Zeros: start!\n"); - PyObject * rval = PyGpuNdArray_Empty(nd, dims, dtype, fortran); - if (!rval) { - return rval; - } - - int total_elements = 1; - for(int i=0;ielsize; - - // Fill with zeros - int err = PyGpuMemset(PyGpuNdArray_DATA(rval), 0, total_size); - if (err) { - Py_DECREF(rval); - return NULL; - } - - DPRINTF("PyGpuNdArray_Zeros: end!\n"); - return (PyObject*) rval; -} - -// declared as a static method (hence "dummy" is not used) -// numpy.zeros(shape, dtype=float, order='C') -static PyObject * -PyGpuNdArray_zeros(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */ - PyArray_Descr *typecode = NULL; - PyObject * shape = NULL; - NPY_ORDER order = PyArray_CORDER; - bool fortran = false; - PyObject *ret = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", - (char**)kwlist, - &shape, - PyArray_DescrConverter, - &typecode, - PyArray_OrderConverter, - &order)) { - Py_XDECREF(typecode); - Py_XDECREF(shape); - return ret; - } - if (order == PyArray_FORTRANORDER) { - fortran = true; - } - else { - fortran = false; - } - - if(!PySequence_Check(shape)) - { - PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); - return NULL; - } - - if (!typecode) - typecode = PyArray_DescrFromType(NPY_FLOAT64); - - int shplen = PySequence_Length(shape); - - if (shplen == 0) - { - return PyGpuNdArray_Zeros(0, NULL, typecode, fortran); - } - - npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); - - if (!newdims) - { - PyErr_SetString(PyExc_MemoryError, - "PyGpuNdArray_Zeros: Failed to allocate temporary space"); - return NULL; - } - - // start from the end to compute strides - for (int i = shplen-1; i >= 0; --i) - { - PyObject* shp_el_obj = PySequence_GetItem(shape, i); - if(shp_el_obj == NULL) - { - // shouldn't happen since we checked length before... - PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_Zeros: Index out of bound in sequence"); - free(newdims); - return NULL; - } - - int shp_el = PyInt_AsLong(shp_el_obj); - Py_DECREF(shp_el_obj); - - newdims[i] = shp_el; - } - - PyObject* rval = PyGpuNdArray_Zeros(shplen, newdims, typecode, fortran); - - free(newdims); - - return (PyObject*)rval; -} - -// declared as a static method (hence "dummy" is not used) -// numpy.empty(shape, dtype=float, order='C') -static PyObject * -PyGpuNdArray_empty(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? 
*/ - PyArray_Descr *typecode = NULL; - PyObject * shape = NULL; - NPY_ORDER order = PyArray_CORDER; - bool fortran = false; - PyObject *ret = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", - (char **)kwlist, - &shape, - PyArray_DescrConverter, - &typecode, - PyArray_OrderConverter, - &order)) { - Py_XDECREF(typecode); - Py_XDECREF(shape); - return ret; - } - if (order == PyArray_FORTRANORDER) { - fortran = true; - } - else { - fortran = false; - } - - if(!PySequence_Check(shape)) - { - PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); - return NULL; - } - - if (!typecode) - typecode = PyArray_DescrFromType(NPY_FLOAT64); - - int shplen = PySequence_Length(shape); - - if (shplen == 0) - { - return PyGpuNdArray_Empty(0, NULL, typecode, fortran); - } - - npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); - - if (!newdims) - { - PyErr_SetString(PyExc_MemoryError, - "PyGpuNdArray_empty: Failed to allocate temporary space"); - return NULL; - } - - // start from the end to compute strides - for (int i = shplen-1; i >= 0; --i) - { - PyObject* shp_el_obj = PySequence_GetItem(shape, i); - if(shp_el_obj == NULL) - { - // shouldn't happen since we checked length before... - PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_empty: Index out of bound in sequence"); - free(newdims); - return NULL; - } - - int shp_el = PyInt_AsLong(shp_el_obj); - Py_DECREF(shp_el_obj); - - newdims[i] = shp_el; - } - - PyObject* rval = PyGpuNdArray_Empty(shplen, newdims, typecode, fortran); - - free(newdims); - - return (PyObject*)rval; -} - -static PyMethodDef PyGpuNdArray_methods[] = -{ - {"__array__", - (PyCFunction)PyGpuNdArray_CreateArrayObj, METH_NOARGS, - "Copy from the device to a numpy ndarray"}, - {"copy", - (PyCFunction)PyGpuNdArray_copy, METH_VARARGS|METH_KEYWORDS, - "Create a deep copy of this object."}, - {"view", - (PyCFunction)PyGpuNdArray_View, METH_NOARGS, - "Create a view of this object."}, - {"__copy__", - (PyCFunction)PyGpuNdArray_Copy, METH_NOARGS, - "Create a copy of this object as numpy does. 
Why numpy do a copy of the data when the object is a view?"}, - {"__deepcopy__", - (PyCFunction)PyGpuNdArray_DeepCopy, METH_O, - "Create a copy of this object"}, -/* - {"reduce_sum", - (PyCFunction)PyGpuNdArray_ReduceSum, METH_O, - "Reduce over the given dimensions by summation"}, - {"exp", - (PyCFunction)PyGpuNdArray_Exp, METH_NOARGS, - "Return the exponential of all elements"}, - {"reshape", - (PyCFunction)PyGpuNdArray_Reshape, METH_O, - "Return a reshaped view (or copy) of this ndarray\n\ - The required argument is a tuple of integers specifying the shape of the new ndarray."}, - {"_set_stride", - (PyCFunction)PyGpuNdArray_SetStride, METH_VARARGS, - "For integer arguments (i, s), set the 'i'th stride to 's'"}, - {"_set_shape_i", - (PyCFunction)PyGpuNdArray_SetShapeI, METH_VARARGS, - "For integer arguments (i, s), set the 'i'th shape to 's'"}, - */ - {NULL, NULL, NULL, NULL} /* Sentinel */ -}; - -//PyArray_CopyInto(PyArrayObject* dest, PyArrayObject* src)¶ -//PyObject* PyArray_NewCopy(PyArrayObject* old, NPY_ORDER order)¶ - - -static PyObject * -PyGpuNdArray_get_shape(PyGpuNdArrayObject *self, void *closure) -{ - DPRINTF("PyGpuNdArray_get_shape\n"); - - if (PyGpuNdArray_NDIM(self) < 0) - { - PyErr_SetString(PyExc_ValueError, "PyGpuNdArray not initialized"); - return NULL; - } - PyObject * rval = PyTuple_New(PyGpuNdArray_NDIM(self)); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - { - if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_DIMS(self)[i]))) - { - Py_XDECREF(rval); - return NULL; - } - - } - return rval; -} - -static int -PyGpuNdArray_set_shape(PyGpuNdArrayObject *self, PyObject *value, void *closure) -{ - PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape"); - return -1; -} - -static PyObject * -PyGpuNdArray_get_strides(PyGpuNdArrayObject *self, void *closure) -{ - if ( PyGpuNdArray_NDIM(self) < 0){ - PyErr_SetString(PyExc_ValueError, "PyGpuNdArrayObject not initialized"); - return NULL; - } - PyObject * rval = PyTuple_New( PyGpuNdArray_NDIM(self)); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i){ - if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_STRIDES(self)[i]))){ - Py_XDECREF(rval); - return NULL; - } - } - return rval; -} - -static PyObject * -PyGpuNdArray_get_data(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_DATA(self)); -} - -static PyObject * -PyGpuNdArray_get_flags(PyGpuNdArrayObject *self, void *closure) -{ - PyObject * dict = PyDict_New(); - - PyObject * str= PyString_FromString("C_CONTIGUOUS"); - PyObject * i = PyBool_FromLong(PyGpuNdArray_ISCONTIGUOUS(self)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("F_CONTIGUOUS"); - i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("WRITEABLE"); - i = PyBool_FromLong(PyGpuNdArray_ISWRITEABLE(self)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("ALIGNED"); - i = PyBool_FromLong(PyGpuNdArray_ISALIGNED(self)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("UPDATEIFCOPY"); - i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("OWNDATA"); - i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - 
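This getter builds every flag entry with the same four-step create/set/decref pattern. A small helper would shrink it considerably; the sketch below is an editor's suggestion (the name set_flag is not part of the original API) and, like the original, does not report PyDict_SetItem failures.

    static void
    set_flag(PyObject * dict, const char * name, long value)
    {
        PyObject * key = PyString_FromString(name);
        PyObject * val = PyBool_FromLong(value);
        if (key && val)
            PyDict_SetItem(dict, key, val);  /* does not steal references */
        Py_XDECREF(key);
        Py_XDECREF(val);
    }

With it each entry becomes a single call, e.g. set_flag(dict, "C_CONTIGUOUS", PyGpuNdArray_ISCONTIGUOUS(self)).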
Py_DECREF(i); - - return dict; -} -static PyObject * -PyGpuNdArray_get_ndim(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_NDIM(self)); -} -static PyObject * -PyGpuNdArray_get_offset(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_OFFSET(self)); -} -static PyObject * -PyGpuNdArray_get_data_allocated(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) self->data_allocated); -} -static PyObject * -PyGpuNdArray_get_size(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_SIZE(self)); -} - -static PyObject * -PyGpuNdArray_get_base(PyGpuNdArrayObject *self, void *closure) -{ - if (!PyGpuNdArray_BASE(self)){ - Py_INCREF(Py_None); - return Py_None; - } - PyObject * ret = PyGpuNdArray_BASE(self); - Py_INCREF(ret); - return ret; -} - -static PyObject * -PyGpuNdArray_get_dtype(PyArrayObject *self) -{ - Py_INCREF(PyGpuNdArray_DESCR(self)); - PyObject * ret = (PyObject *)PyGpuNdArray_DESCR(self); - return ret; -} - -static PyObject * -PyGpuNdArray_get_itemsize(PyArrayObject *self) -{ - return (PyObject *)PyInt_FromLong(PyGpuNdArray_ITEMSIZE(self)); -} - -static PyGetSetDef PyGpuNdArray_getset[] = { - {(char*)"base", - (getter)PyGpuNdArray_get_base, - NULL, - (char*)"Return the object stored in the base attribute", - NULL}, - {(char*)"bytes", - (getter)PyGpuNdArray_get_data, - NULL, - (char*)"device data pointer", - NULL}, - {(char*)"shape", - (getter)PyGpuNdArray_get_shape, - (setter)PyGpuNdArray_set_shape, - (char*)"shape of this ndarray (tuple)", - NULL}, - {(char*)"strides", - (getter)PyGpuNdArray_get_strides, - NULL,//(setter)PyGpuNdArray_set_strides, - (char*)"data pointer strides (in elements)", - NULL}, - {(char*)"ndim", - (getter)PyGpuNdArray_get_ndim, - NULL, - (char*)"The number of dimensions in this object", - NULL}, - {(char*)"offset", - (getter)PyGpuNdArray_get_offset, - NULL, - (char*)"Return the offset value", - NULL}, - {(char*)"size", - (getter)PyGpuNdArray_get_size, - NULL, - (char*)"The number of elements in this object.", - NULL}, - {(char*)"data_allocated", - (getter)PyGpuNdArray_get_data_allocated, - NULL, - (char*)"The size of the allocated memory on the device.", - NULL}, - {(char*)"itemsize", - (getter)PyGpuNdArray_get_itemsize, - NULL, - (char*)"The size of the base element.", - NULL}, - {(char*)"dtype", - (getter)PyGpuNdArray_get_dtype, - NULL, - (char*)"The dtype of the element", - NULL}, - {(char*)"flags", - (getter)PyGpuNdArray_get_flags, - NULL, - (char*)"Return the flags as a dictionary", - NULL}, - {NULL, NULL, NULL, NULL} /* Sentinel */ -}; - -// Will by called by __len__ in Python -static Py_ssize_t -PyGpuNdArray_len(PyObject * py_self) -{ - PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; - if (PyGpuNdArray_NDIM(self) <= 0) - { - return (Py_ssize_t) 0; - } - else - { - return (Py_ssize_t) PyGpuNdArray_DIMS(self)[0]; - } -} - -static int -PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset) -{ - DPRINTF("PyGpuNdArray_add_offset: %p %d\n", self, offset); - -#if OFFSET - PyGpuNdArray_OFFSET(self) += offset; -#else - PyGpuNdArray_DATA(self) += offset; -#endif - return 0; -} - - -static int -PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset) -{ - DPRINTF("PyGpuNdArray_set_data: %p %p %p %d\n", self, data, base, offset); - if (self->data_allocated) - { - assert(PyGpuNdArray_DATA(self)); - if (device_free(PyGpuNdArray_DATA(self))) - { - PyGpuNdArray_DATA(self) 
= NULL; - self->data_allocated = 0; - DPRINTF("PyGpuNdArray_set_data: device_free failed!\n"); - PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_set_data: device_free failed"); - return -1; - } - } - - // Get the original base object (base.base.base...) - // TODO: check that base is indeed a CudaNdarray? - PyObject * orig_base = base; - // base is not always a PyGpuNdArrayObject. It can be a GpuArray from pycuda, ... - while (orig_base && PyGpuNdArray_Check(orig_base) && ((PyGpuNdArrayObject*) orig_base)->base) - { - // base_base is itself a view - orig_base = ((PyGpuNdArrayObject*) orig_base)->base; - } - - //N.B. XDECREF and XINCREF are no-ops for NULL pointers - if (PyGpuNdArray_BASE(self) != orig_base) - { - Py_XDECREF(PyGpuNdArray_BASE(self)); - PyGpuNdArray_BASE(self) = orig_base; - Py_XINCREF(PyGpuNdArray_BASE(self)); - } - self->data_allocated = 0; -#if OFFSET - PyGpuNdArray_DATA(self) = data; - PyGpuNdArray_OFFSET(self) = offset; -#else - PyGpuNdArray_DATA(self) = data + offset; -#endif - - return 0; -} - -// Will by called by __getitem__ in Python -static PyObject * -PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key) -{ - DPRINTF("Subscript start\n"); - PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; - PyObject * py_rval = NULL; - PyGpuNdArrayObject * rval = NULL; - PyObject * intobj = NULL; - - //PyObject_Print(key, stderr, 0); - - if (key == Py_Ellipsis) - { - DPRINTF("Subscript with ellipse \n"); - Py_INCREF(py_self); - DPRINTF("Subscript with ellipse end\n"); - return py_self; - } - if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER - { -#ifdef DEBUG - PyGpuNdArray_fprint(stderr, self); -#endif - DPRINTF("Subscript with int \n"); - - int d_idx = PyInt_AsLong(intobj); - Py_DECREF(intobj); intobj=NULL; - - DPRINTF("Subscript with int 1\n"); - if (PyGpuNdArray_NDIM(self) == 0) { - PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed"); - return NULL; - }else if (PyGpuNdArray_NDIM(self)< 0){ - PyErr_SetString(PyExc_IndexError, "nd arrays must have a number of dim > 0!"); - return NULL; - } - int d_dim = PyGpuNdArray_DIMS(self)[0]; - int offset = 0; - DPRINTF("Subscript with int 2\n"); - - if ((d_idx >= 0) && (d_idx < d_dim)) { - //normal indexing - offset += d_idx * PyGpuNdArray_STRIDES(self)[0]; - } - else if ((d_idx < 0) && (d_idx >= -d_dim)) { - //end-based indexing - // d_idx is negative - offset += (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[0]; - } else { - PyErr_SetString(PyExc_IndexError, "index out of bounds"); - return NULL; - } - DPRINTF("Subscript with int 3\n"); - - //Add the original offset - offset += PyGpuNdArray_OFFSET(self); - - //allocate our subtensor view - py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self) - 1); - rval = (PyGpuNdArrayObject*) py_rval; - if (!rval) return NULL; - - //TODO: find how to refcount on the descr! - PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); - - DPRINTF("Subscript with int 4\n"); - //initialize the view's data pointer to our own. 
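For reference, the offset handed to PyGpuNdArray_set_data just below collapses the normal and end-based cases computed above into one expression; this single line is illustrative, not part of the original file.

    // offset = base offset + (wrapped index) * stride of dimension 0
    int view_offset = PyGpuNdArray_OFFSET(self)
                    + (d_idx < 0 ? d_idx + d_dim : d_idx) * PyGpuNdArray_STRIDES(self)[0];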
- assert (0 == rval->data_allocated); - if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), (PyObject *) self, offset)){ - Py_DECREF(rval); - return NULL; - } - DPRINTF("Subscript with int 5\n"); - - for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { - PyGpuNdArray_STRIDE(rval, d-1) = PyGpuNdArray_STRIDES(self)[d]; - PyGpuNdArray_DIM(rval, d-1) = PyGpuNdArray_DIMS(self)[d]; - } - } - else { - PyErr_Clear(); - } - if (PySlice_Check(key)) //INDEXING BY SLICE - { - DPRINTF("Subscript with slice \n"); - if (PyGpuNdArray_NDIM(self) == 0) - { - PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array"); - return NULL; - } - - int d_dim = PyGpuNdArray_DIMS(self)[0]; - Py_ssize_t start, stop, step, slen; - if (PySlice_GetIndicesEx((PySliceObject*)key, d_dim, &start, &stop, &step, &slen)) { - return NULL; - } - - DPRINTF("start %zd\nstop %zd\n step %zd\n slen %zd\n", - start, stop, step, slen); - - //allocate our subtensor view - py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); - rval = (PyGpuNdArrayObject*) py_rval; - if (!rval) return NULL; - - //TODO: find how to refcount on the descr! - PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); - assert (0 == rval->data_allocated); - if (PyGpuNdArray_set_data(rval, - PyGpuNdArray_DATA(self), - py_self, - start * PyGpuNdArray_STRIDE(self, 0) - + PyGpuNdArray_OFFSET(self))) { - Py_DECREF(rval); - return NULL; - } - - //initialize dimension 0 of rval - PyGpuNdArray_STRIDE(rval, 0) = step * PyGpuNdArray_STRIDES(self)[0]; - PyGpuNdArray_DIM(rval, 0) = slen; - DPRINTF("rval stride %zd\n", PyGpuNdArray_STRIDES(rval)[0]); - // initialize dimensions > 0 of rval - for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { - PyGpuNdArray_STRIDE(rval, d) = PyGpuNdArray_STRIDES(self)[d]; - PyGpuNdArray_DIM(rval, d) = PyGpuNdArray_DIMS(self)[d]; - } - } - if (PyTuple_Check(key)) //INDEXING BY TUPLE - { - DPRINTF("Subscript with tuple \n"); - //elements of the tuple can be either integers or slices - //the dimensionality of the view we will return is diminished for each slice in the tuple - int tuple_start_index = 0; - if (PyTuple_Size(key) > PyGpuNdArray_NDIM(self)) - { - if (PyTuple_GetItem(key, 0) == Py_Ellipsis && - PyTuple_Size(key) == PyGpuNdArray_NDIM(self) + 1) - { - tuple_start_index = 1; - DPRINTF("Subscript with tuple staring with an extra ellipse" - " at the start.\n"); - } - else{ - PyErr_SetString(PyExc_IndexError, - "index error, specified more dimensions then" - " the number of existing dimensions"); - return NULL; - } - } - - //calculate the number of dimensions in the return value - int rval_nd = PyGpuNdArray_NDIM(self); - for (int tuple_d = tuple_start_index; tuple_d < PyTuple_Size(key); - ++tuple_d) - { - //On some paltform PyInt_Check() return true, other it return false. - //So we use PyArray_IsAnyScalar that should covert everything. - rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, tuple_d)); - } - - //allocate our subtensor view - py_rval = PyGpuNdArray_New(rval_nd); - rval = (PyGpuNdArrayObject*) py_rval; - if (!rval) return NULL; - assert (0 == rval->data_allocated); - - //TODO: find how to refcount on the descr! - PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); - - //initialize the view's data pointer to our own. - if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), - py_self, PyGpuNdArray_OFFSET(self))) - { - Py_DECREF(rval); - return NULL; - } - - // rval_d will refer to the current dimension in the rval. 
- // It will not be incremented for integer keys, but will be incremented for slice - // keys - int rval_d = 0; - - for (int self_d = 0, tuple_d = tuple_start_index; - self_d < PyGpuNdArray_NDIM(self); ++self_d, ++tuple_d) - { - // keys can be shorter than PyGpuNdArray_NDIM(self). - // when that happens, it means that the remaining dimensions are "full slices" - if (tuple_d >= PyTuple_Size(key)) - { - PyGpuNdArray_STRIDE(rval, rval_d) = - PyGpuNdArray_STRIDES(self)[tuple_d]; - PyGpuNdArray_DIM(rval, rval_d) = - PyGpuNdArray_DIMS(self)[tuple_d]; - ++rval_d; - DPRINTF("Subscript extra dims to append %zd %zd\n", - PyGpuNdArray_STRIDE(rval, rval_d), - PyGpuNdArray_DIM(rval, rval_d)); - } - else - { - PyObject * key_d = PyTuple_GetItem(key, tuple_d); - - if (PySlice_Check(key_d)) - { - Py_ssize_t start, stop, step, slen; - if (PySlice_GetIndicesEx((PySliceObject*)key_d, - PyGpuNdArray_DIMS(self)[self_d], - &start, &stop, &step, &slen)) - { - Py_DECREF(rval); - return NULL; - } - PyGpuNdArray_add_offset(rval, start * PyGpuNdArray_STRIDES(self)[self_d]); - PyGpuNdArray_STRIDE(rval, rval_d) = step * PyGpuNdArray_STRIDES(self)[self_d]; - PyGpuNdArray_DIM(rval, rval_d) = slen; - - DPRINTF("rval_d %d self_d %d\n start %zd\nstop %zd\n step %zd\n slen %zd\n", - rval_d, self_d, start, stop, step, slen); - ++rval_d; - } - else if ((intobj=PyNumber_Int(key_d))) - { - assert(PyArray_IsAnyScalar(key_d)); - int d_idx = PyInt_AsLong(intobj); - Py_DECREF(intobj); - intobj = NULL; - int d_dim = PyGpuNdArray_DIMS(self)[self_d]; - - if ((d_idx >= 0) && (d_idx < d_dim)) - { - //normal indexing - PyGpuNdArray_add_offset(rval, d_idx * PyGpuNdArray_STRIDES(self)[self_d]); - } - else if ((d_idx < 0) && (d_idx >= -d_dim)) - { - //end-based indexing - PyGpuNdArray_add_offset(rval, (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[self_d]); - } - else - { - PyErr_SetString(PyExc_IndexError, "index out of bounds"); - Py_DECREF(rval); - return NULL; - } - } - else if (key_d == Py_Ellipsis) - { - if (self_d != 0){ - PyErr_Format(PyExc_IndexError, - "Ellipsis supported only at the start of" - " the tuple"); - Py_DECREF(rval); - return NULL; - } - DPRINTF("Substript with tuple with the first element an ellipse\n"); - for( ; self_d < (rval_nd - PyTuple_Size(key) + 1); self_d++) - { - PyGpuNdArray_STRIDE(rval, rval_d) = - PyGpuNdArray_STRIDES(self)[self_d]; - PyGpuNdArray_DIM(rval, rval_d) = - PyGpuNdArray_DIMS(self)[self_d]; - DPRINTF("Ellipse append dimensions self_%d with %zd %zd\n", - self_d, - PyGpuNdArray_STRIDE(rval, rval_d), - PyGpuNdArray_DIM(rval, rval_d)); - ++rval_d; - } - tuple_start_index = 1; - self_d--; - } - else - { - PyErr_Clear(); // clear the error set by PyNumber_Int - PyErr_Format(PyExc_IndexError, - "index must be either int or slice. 
Got %s", - PyString_AsString(PyObject_Str(key_d))); - Py_DECREF(rval); - return NULL; - } - } - } - } - if (py_rval) - { -#ifdef DEBUG - PyGpuNdArray_fprint(stderr, self); - PyGpuNdArray_fprint(stderr, rval); -#endif - } - else - { - PyErr_SetString(PyExc_NotImplementedError, "Unknown key type"); - return NULL; - } - - // Set flags - if (PyGpuNdArray_ISWRITEABLE(self)) { - PyGpuNdArray_FLAGS(rval) |= NPY_WRITEABLE; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_WRITEABLE; - } - PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; - if (PyGpuNdArray_ISALIGNED(self)) { - PyGpuNdArray_FLAGS(rval) |= NPY_ALIGNED; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_ALIGNED; - } - PyGpuNdArray_FLAGS(rval) &= ~NPY_UPDATEIFCOPY; - - if (false && PyGpuNdArray_NDIM(rval) == 0) { - //Numpy is not consistent here - //When we create a new numpy ndarray of 0 dim, it is not f contiguous - //But when we take a subtensor that is of 0 dim, it is f contiguous! - //We make as them for now... - PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; - } else { - if (PyGpuNdArray_is_c_contiguous(rval)) { - PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_C_CONTIGUOUS; - } - if (PyGpuNdArray_is_f_contiguous(rval)) { - PyGpuNdArray_FLAGS(rval) |= NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; - } - } - - DPRINTF("Subscript end\n"); - return py_rval; -} - -PyMappingMethods PyGpuNdArrayMappingMethods = { - PyGpuNdArray_len, //lenfunc mp_length; __len__ - PyGpuNdArray_Subscript, //binaryfunc mp_subscript; __getitem__ - 0 //PyGpuNdArray_setitem //objobjargproc mp_ass_subscript; __setitem__ -}; - -static PyTypeObject PyGpuNdArrayType = -{ - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "GpuNdArray", /*tp_name*/ - sizeof(PyGpuNdArrayObject), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)PyGpuNdArrayObject_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, //&PyGpuNdArrayObjectNumberMethods, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - &PyGpuNdArrayMappingMethods,/*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/ - "PyGpuNdArrayObject objects", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - PyGpuNdArray_methods, /* tp_methods */ - 0, //PyGpuNdArray_members, /* tp_members */ //TODO - PyGpuNdArray_getset, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)PyGpuNdArray_init,/* tp_init */ - 0, /* tp_alloc */ - PyGpuNdArray_new, /* tp_new */ -}; - -////////////////////////////////////// -// -// C API FOR PyGpuNdArrayObject -// -////////////////////////////////////// -PyObject * -PyGpuNdArray_New(int nd) -{ - DPRINTF("PyGpuNdArray_New start\n"); - PyGpuNdArrayObject *self = (PyGpuNdArrayObject *)PyGpuNdArrayType.tp_alloc(&PyGpuNdArrayType, 0); - if (self == NULL) { - PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_New failed to allocate self"); - return NULL; - } - PyGpuNdArray_null_init(self); - - if (nd == 0) { - PyGpuNdArray_NDIM(self) = 0; - } - else if (nd > 0) { - if (PyGpuNdArray_set_nd(self, nd)) { - Py_DECREF(self); - return NULL; - } - } - ++_outstanding_mallocs[1]; - DPRINTF("PyGpuNdArray_New 
end\n"); - return (PyObject *)self; -} - -int -PyGpuNdArray_Check(const PyObject * ob) -{ - DPRINTF("PyGpuNdArray_Check\n"); - //TODO: doesn't work with inheritance - return PyGpuNdArray_CheckExact(ob); -} -int -PyGpuNdArray_CheckExact(const PyObject * ob) -{ - DPRINTF("PyGpuNdArray_CheckExact\n"); - return ((ob->ob_type == &PyGpuNdArrayType) ? 1 : 0); -} - -static PyObject * -PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - DPRINTF("PyGpuNdArray_as_c_contiguous:start\n"); - static const char *kwlist[] = {"a", "dtype", NULL}; - PyArray_Descr *typecode = NULL; - PyObject *self_ = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", - (char **)kwlist, - &self_, - PyArray_DescrConverter, - &typecode)) { - Py_XDECREF(typecode); - Py_XDECREF(self_); - return NULL; - } - assert(typecode == NULL); - if (!PyGpuNdArray_Check(self_)){ - PyErr_SetString(PyExc_TypeError, - "PyGpuNdArray_as_c_contiguous:" - " PyGpuNdArrayObject required"); - return NULL; - } - - PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; - if (PyGpuNdArray_is_c_contiguous(self)){ - Py_INCREF(self); - if (PyGpuNdArray_NDIM(self) == 0){ - //numpy.ascontiguous() always return object with 1d. - DPRINTF("PyGpuNdArray_as_c_contiguous: upcast to 1d tensor end\n"); - PyObject * rval = PyGpuNdArray_View(self); - if (!rval) - return NULL; - PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); - PyGpuNdArray_DIM(rval, 0) = 1; - PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); - return rval; - } - DPRINTF("PyGpuNdArray_as_c_contiguous: no copy end\n"); - return (PyObject*)self; - } - - PyObject * ret = PyGpuNdArray_Copy(self); - DPRINTF("PyGpuNdArray_as_c_contiguous: copy end\n"); - return ret; -} -static PyObject * -PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - DPRINTF("PyGpuNdArray_as_f_contiguous:start\n"); - static const char *kwlist[] = {"a", "dtype", NULL}; - PyArray_Descr *typecode = NULL; - PyObject *self_ = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", - (char **)kwlist, - &self_, - PyArray_DescrConverter, - &typecode)) { - Py_XDECREF(typecode); - Py_XDECREF(self_); - return NULL; - } - assert(typecode == NULL); - if (!PyGpuNdArray_Check(self_)){ - PyErr_SetString(PyExc_TypeError, - "PyGpuNdArray_as_f_contiguous:" - " PyGpuNdArrayObject required"); - return NULL; - } - - PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; - if (PyGpuNdArray_is_f_contiguous(self)){ - Py_INCREF(self); - if (PyGpuNdArray_NDIM(self) == 0){ - //numpy.ascontiguous() always return object with 1d. 
- PyObject * rval = PyGpuNdArray_View(self); - if (!rval) - return NULL; - PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); - PyGpuNdArray_DIM(rval, 0) = 1; - PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); - DPRINTF("PyGpuNdArray_as_f_contiguous: upcast to 1d tensor end\n"); - return rval; - } - DPRINTF("PyGpuNdArray_as_f_contiguous: no copy end\n"); - return (PyObject*)self; - } - - PyObject * ret = PyGpuNdArray_Copy(self, NPY_FORTRANORDER); - DPRINTF("PyGpuNdArray_as_f_contiguous: copy end\n"); - return ret; -} - -#ifdef WITH_OPENCL -#ifdef __APPLE__ -#include -#else -#include -#endif -extern void setup_context(cl_context c); - -PyObject * -PyGpuNdArray_set_opencl_context(PyObject *mod, PyObject *ctx) { - Py_ssize_t v; - - v = PyInt_AsSsize_t(ctx); - if (v == -1 && PyErr_Occurred()) - return NULL; - - setup_context((cl_context)v); - - Py_INCREF(Py_None); - return Py_None; -} -#endif - -static PyMethodDef module_methods[] = { - //{"dimshuffle", PyGpuNdArray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a PyGpuNdArray."}, - {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, - {"zeros", - (PyCFunction)PyGpuNdArray_zeros, METH_VARARGS|METH_KEYWORDS, - "Create a new PyGpuNdArray with specified shape, filled with zeros."}, - {"empty", - (PyCFunction)PyGpuNdArray_empty, METH_VARARGS|METH_KEYWORDS, - "Create a new PyGpuNdArray with specified shape, filled with zeros."}, - {"ascontiguousarray", - (PyCFunction)PyGpuNdArray_as_c_contiguous, METH_VARARGS|METH_KEYWORDS, - "If the array is not c contiguous, copy it to a new c contiguous region."}, - {"asfortranarray", - (PyCFunction)PyGpuNdArray_as_f_contiguous, METH_VARARGS|METH_KEYWORDS, - "If the array is not f contiguous, copy it to a new c contiguous region."}, -#ifdef WITH_OPENCL - {"set_opencl_context", - PyGpuNdArray_set_opencl_context, METH_O, - "Set the OpenCL context to use for allocations and work."}, -#endif - {NULL, NULL, NULL, NULL} /* Sentinel */ -}; - -#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ -#define PyMODINIT_FUNC void -#endif -PyMODINIT_FUNC -initpygpu_ndarray(void) -{ - import_array(); - - PyObject* m; - - if (PyType_Ready(&PyGpuNdArrayType) < 0) - return; - - m = Py_InitModule3("pygpu_ndarray", module_methods, - "Example module that creates an extension type."); - - if (m == NULL) - return; - - Py_INCREF(&PyGpuNdArrayType); - PyModule_AddObject(m, "GpuNdArrayObject", (PyObject *)&PyGpuNdArrayType); -#if COMPUTE_GPU_MEM_USED - for(int i=0;i -//#include -#include -#include - -#include "pygpu_ndarray_object.h" -#include "gpu_ndarray.h" -#include "pygpu_language.h" - -/* - * Return a PyGpuNdArray whose 'nd' dimensions are all 0. - * if nd==-1, it is not initialized. - */ -PyObject * PyGpuNdArray_New(int nd=-1); - -/** - * Return 1 for a PyGpuNdArrayObject otw 0 - */ -int -PyGpuNdArray_Check(const PyObject * ob); - -/** - * Return 1 for a PyGpuNdArrayObject otw 0 - */ -int -PyGpuNdArray_CheckExact(const PyObject * ob); - -/** - * Transfer the contents of numpy array `obj` to `self`. - * - * self is reallocated to have the correct dimensions if necessary. 
- */ -int PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj); - -static int -PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset); - -static int -PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset=0); - -static PyObject * -PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key); - -static PyObject * -PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order=NPY_CORDER); - -static PyObject * -PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); - -static PyObject * -PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); - -#endif - -/* - Local Variables: - mode:c++ - c-basic-offset:4 - c-file-style:"stroustrup" - c-file-offsets:((innamespace . 0)(inline-open . 0)) - indent-tabs-mode:nil - fill-column:79 - End: -*/ -// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : diff --git a/ndarray/pygpu_ndarray_object.h b/ndarray/pygpu_ndarray_object.h deleted file mode 100644 index aa0f71a..0000000 --- a/ndarray/pygpu_ndarray_object.h +++ /dev/null @@ -1,232 +0,0 @@ -/** - * struct PyGPUArrayObject - * - * This is a Python type. - * - */ -#ifndef _PYGPU_NDARRAY_OBJECT_H -#define _PYGPU_NDARRAY_OBJECT_H - -#include -#include -#include "gpu_ndarray.h" - -typedef struct PyGpuNdArrayObject{ - PyObject_HEAD - - GpuNdArray gpu_ndarray; //no pointer, just inlined. - PyObject * base; - PyArray_Descr * descr; // for numpy-like desc - int data_allocated; //the number of bytes allocated for devdata -} PyGpuNdArrayObject; - -#define PyGpuNdArray_NDIM(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.nd) -#define PyGpuNdArray_DATA(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) -#define PyGpuNdArray_BYTES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) -#define PyGpuNdArray_OFFSET(obj) (((PyGpuNdArrayObject *)(obj))->gpu_ndarray.offset) -#define PyGpuNdArray_DIMS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.dimensions) -#define PyGpuNdArray_STRIDES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.strides) -#define PyGpuNdArray_DIM(obj,n) (PyGpuNdArray_DIMS(obj)[n]) -#define PyGpuNdArray_STRIDE(obj,n) (PyGpuNdArray_STRIDES(obj)[n]) -#define PyGpuNdArray_BASE(obj) (((PyGpuNdArrayObject *)obj)->base) -#define PyGpuNdArray_DESCR(obj) (((PyGpuNdArrayObject *)obj)->descr) -#define PyGpuNdArray_FLAGS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.flags) -#define PyGpuNdArray_ITEMSIZE(obj) (((PyGpuNdArrayObject *)obj)->descr->elsize) -#define PyGpuNdArray_TYPE(obj) (((PyGpuNdArrayObject *)(obj))->descr->type_num) - -#define PyGpuNdArray_SIZE(obj) PyArray_MultiplyList(PyGpuNdArray_DIMS(obj),PyGpuNdArray_NDIM(obj)) -//npy_intp PyGpuNdArray_Size(PyObject* obj); -//npy_intp PyGpuNdArray_NBYTES(PyObject* arr); - -/* - Flags accessor - */ -#define PyGpuNdArray_CHKFLAGS(m, FLAGS) \ - ((((PyGpuNdArrayObject *)(m))->gpu_ndarray.flags & (FLAGS)) == (FLAGS)) - -#define PyGpuNdArray_ISCONTIGUOUS(m) PyGpuNdArray_CHKFLAGS(m, NPY_CONTIGUOUS) -#define PyGpuNdArray_ISFORTRAN(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS) && \ - PyGpuNdArray_NDIM(m) > 1) -#define PyGpuNdArray_FORTRAN_IF(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS)? 
\ - NPY_F_CONTIGUOUS : 0) -#define PyGpuNdArray_ISONESEGMENT(m) (PyGpuNdArray_NDIM(m) == 0 || \ - PyGpuNdArray_ISCONTIGUOUS(m) || \ - PyGpuNdArray_ISFORTRAN(m)) -#define PyGpuNdArray_ISWRITEABLE(m) PyGpuNdArray_CHKFLAGS(m, NPY_WRITEABLE) -#define PyGpuNdArray_ISALIGNED(m) PyGpuNdArray_CHKFLAGS(m, NPY_ALIGNED) - -#define PyGpuNdArray_ISNBO(arg) ((arg) != NPY_OPPBYTE) -// THE NEXT ONE SEEM BAD... -#define PyGpuNdArray_IsNativeByteOrder PyArray_ISNBO -#define PyGpuNdArray_ISNOTSWAPPED(m) PyArray_ISNBO(PyArray_DESCR(m)->byteorder) -#define PyGpuNdArray_FLAGSWAP(m, flags) (PyGpuNdArray_CHKFLAGS(m, flags) && PyGpuNdArray_ISNOTSWAPPED(m)) - -#define PyGpuNdArray_ISCARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY) -#define PyGpuNdArray_ISCARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY_RO) -#define PyGpuNdArray_ISFARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY) -#define PyGpuNdArray_ISFARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY_RO) -#define PyGpuNdArray_ISBEHAVED(m) PyGpuNdArray_FLAGSWAP(m, NPY_BEHAVED) -#define PyGpuNdArray_ISBEHAVED_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_ALIGNED) - -static -void PyGpuNdArray_fprint(FILE * fd, const PyGpuNdArrayObject *self) -{ - fprintf(fd, "PyGpuNdArrayObject <%p, %p> nd=%i data_allocated=%d\n", - self, PyGpuNdArray_DATA(self), PyGpuNdArray_NDIM(self), self->data_allocated); - fprintf(fd, "\tITEMSIZE: %d\n", PyGpuNdArray_ITEMSIZE(self)); - fprintf(fd, "\tTYPENUM: %d\n", PyGpuNdArray_TYPE(self)); - fprintf(fd, "\tRefcount: %ld\n", (long int)self->ob_refcnt); - fprintf(fd, "\tBASE: %p\n", PyGpuNdArray_BASE(self)); - fprintf(fd, "\tHOST_DIMS: "); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyGpuNdArray_DIMS(self)[i]); - } - fprintf(fd, "\n\tHOST_STRIDES: "); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyGpuNdArray_STRIDES(self)[i]); - } - fprintf(fd, "\n\tFLAGS: "); - fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyGpuNdArray_ISCONTIGUOUS(self)); - fprintf(fd, "\n\t\tPyGpuNdArray_ISFORTRAN: %d PyGpuNdArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", - PyGpuNdArray_ISFORTRAN(self), PyGpuNdArray_FORTRAN_IF(self), PyGpuNdArray_CHKFLAGS(self, NPY_FORTRAN)); - fprintf(fd, "\n\t\tOWNDATA: %d", PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); - fprintf(fd, "\n\t\tWRITEABLE: %d", PyGpuNdArray_ISWRITEABLE(self)); - fprintf(fd, "\n\t\tALIGNED: %d", PyGpuNdArray_ISALIGNED(self)); - fprintf(fd, "\n\t\tUPDATEIFCOPY: %d", PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - fprintf(fd, "\n"); - -} -static -void PyArray_fprint(FILE * fd, const PyArrayObject *self) -{ - fprintf(fd, "PyArrayObject <%p, %p> nd=%i\n", - self, PyArray_DATA(self), PyArray_NDIM(self)); - fprintf(fd, "\tITEMSIZE: %d\n", PyArray_ITEMSIZE(self)); - fprintf(fd, "\tTYPENUM: %d\n", PyArray_TYPE(self)); - fprintf(fd, "\tHOST_DIMS: "); - for (int i = 0; i < PyArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyArray_DIMS(self)[i]); - } - fprintf(fd, "\n\tHOST_STRIDES: "); - for (int i = 0; i < PyArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyArray_STRIDES(self)[i]); - } - fprintf(fd, "\n\tFLAGS: "); - fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyArray_ISCONTIGUOUS(self)); - fprintf(fd, "\n\t\tPyArray_ISFORTRAN: %d PyArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", - PyArray_ISFORTRAN(self), PyArray_FORTRAN_IF(self), PyArray_CHKFLAGS(self, NPY_FORTRAN)); - fprintf(fd, "\n\t\tOWNDATA: %d", PyArray_CHKFLAGS(self, NPY_OWNDATA)); - fprintf(fd, "\n\t\tWRITEABLE: %d", PyArray_ISWRITEABLE(self)); - fprintf(fd, "\n\t\tALIGNED: %d", PyArray_ISALIGNED(self)); - fprintf(fd, "\n\t\tUPDATEIFCOPY: 
%d", PyArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - fprintf(fd, "\n"); - -} - -template -static T ceil_intdiv(T a, T b) -{ - return (a/b) + ((a % b) ? 1: 0); -} - - -//Compute if the resulting array is c contiguous -static bool -PyGpuNdArray_is_c_contiguous(const PyGpuNdArrayObject * self) -{ - bool c_contiguous = true; - int size = PyGpuNdArray_ITEMSIZE(self); - for (int i = PyGpuNdArray_NDIM(self)-1; (i >= 0) && c_contiguous; --i) { - if (PyGpuNdArray_STRIDE(self, i) != size) { - c_contiguous = false; - } - size = size * PyGpuNdArray_DIM(self, i); - } - return c_contiguous; -} - -//Compute if the resulting array is f contiguous -static bool -PyGpuNdArray_is_f_contiguous(const PyGpuNdArrayObject * self) -{ - bool f_contiguous = true; - int size = PyGpuNdArray_ITEMSIZE(self); - for (int i = 0; i < PyGpuNdArray_NDIM(self) && f_contiguous; ++i) { - if (PyGpuNdArray_STRIDE(self, i) != size) { - f_contiguous = false; - } - size = size * PyGpuNdArray_DIM(self, i); - } - return f_contiguous; -} - -static PyObject * -PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); -static PyObject * -PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); - -/** - * [Re]allocate a PyGpuNdArrayObject with access to 'nd' dimensions. - * - * Note: This does not allocate storage for data. - */ -static -int PyGpuNdArray_set_nd(PyGpuNdArrayObject * self, const int nd) -{ - if (nd != PyGpuNdArray_NDIM(self)) - { - if(0) fprintf(stderr, "PyGpuNdArray_set_nd: modif nd=%i to nd=%i\n", PyGpuNdArray_NDIM(self), nd); - - if (PyGpuNdArray_DIMS(self)){ - free(PyGpuNdArray_DIMS(self)); - PyGpuNdArray_DIMS(self) = NULL; - PyGpuNdArray_NDIM(self) = -1; - } - if (PyGpuNdArray_STRIDES(self)){ - free(PyGpuNdArray_STRIDES(self)); - PyGpuNdArray_STRIDES(self) = NULL; - PyGpuNdArray_NDIM(self) = -1; - } - if (nd == -1) return 0; - - PyGpuNdArray_DIMS(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); - if (NULL == PyGpuNdArray_DIMS(self)) - { - PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate dimensions"); - return -1; - } - PyGpuNdArray_STRIDES(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); - if (NULL == PyGpuNdArray_STRIDES(self)) - { - PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate str"); - return -1; - } - //initialize all dimensions and strides to 0 - for (int i = 0; i < nd; ++i) - { - PyGpuNdArray_DIM(self, i) = 0; - PyGpuNdArray_STRIDES(self)[i] = 0; - } - - PyGpuNdArray_NDIM(self) = nd; - if(0) fprintf(stderr, "PyGpuNdArray_set_nd: end\n"); - } - return 0; -} - -#endif -/* - Local Variables: - mode:c++ - c-basic-offset:4 - c-file-style:"stroustrup" - c-file-offsets:((innamespace . 0)(inline-open . 
0)) - indent-tabs-mode:nil - fill-column:79 - End: -*/ -// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : diff --git a/ndarray/setup_opencl.py b/ndarray/setup_opencl.py deleted file mode 100644 index 0087cf2..0000000 --- a/ndarray/setup_opencl.py +++ /dev/null @@ -1,102 +0,0 @@ -import os -from distutils.command.build_ext import build_ext -from distutils.core import Extension, setup -from distutils.dep_util import newer - -import numpy as np - - -class build_ext_nvcc(build_ext): - user_options = build_ext.user_options - user_options.extend([ - ('cuda-root=', None, "The cuda root directory")]) - - def initialize_options(self): - build_ext.initialize_options(self) - self.cuda_root = None - - def finalize_options(self): - build_ext.finalize_options(self) - if self.cuda_root is None: - self.cuda_root = os.getenv('CUDA_ROOT', None) - if self.cuda_root is not None: - self._nvcc_bin = os.path.join(self.cuda_root, 'bin', 'nvcc') - else: - self._nvcc_bin = 'nvcc' - - def cuda_process(self, source, include_args): - target = source + '.cpp' - if newer(source, target): - self.spawn([self._nvcc_bin, '--cuda', source, '-o', target] + \ - include_args) - return target - - def cuda_extension(self, ext): - includes = self.distribution.include_dirs + ext.include_dirs - include_args = ['-I' + i for i in includes] - new_sources = [] - anycuda = False - for src in ext.sources: - if src.endswith('.cu'): - new_sources.append(self.cuda_process(src, include_args)) - anycuda = True - else: - new_sources.append(src) - if anycuda: - ext.sources = new_sources - if self.cuda_root is not None: - lib = os.path.join(self.cuda_root, 'lib') - lib64 = os.path.join(self.cuda_root, 'lib64') - if os.path.isdir(lib): - ext.library_dirs.append(lib) - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append('-rpath') - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append(lib) - if os.path.isdir(lib64): - ext.library_dirs.append(lib64) -# ext.extra_link_args.append('-rpath') -# ext.extra_link_args.append(lib64) - if 'cudart' not in ext.libraries: - ext.libraries.append('cudart') - - if self.cuda_root: - include = os.path.join(self.cuda_root, 'include') - if os.path.isdir(include): - ext.extra_compile_args.append('-I' + include) - if os.path.isfile('/usr/lib/nvidia-current/libOpenCL.so'): - ext.extra_link_args.append('-L/usr/lib/nvidia-current') - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append('-rpath') - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append('/usr/lib/nvidia-current') - - def build_extensions(self): - self.check_extensions_list(self.extensions) - - for ext in self.extensions: - self.cuda_extension(ext) - # uncomment this + inherit from the cython version of build_ext - # work with cuda and cython sources - #ext.sources = self.cython_sources(ext.sources, ext) - self.build_extension(ext) - -import sys - - -if sys.platform == 'darwin': - libcl_args = {'extra_link_args': ['-framework', 'OpenCL']} -else: - libcl_args = {'libraries': ['OpenCL']} - - -setup(name='compyte', - cmdclass={'build_ext': build_ext_nvcc}, - include_dirs=[np.get_include(), '.'], - ext_modules=[Extension('pygpu_ndarray', - define_macros=[('OFFSET', '1'), ('WITH_OPENCL', '')], - sources=['pygpu_language_opencl.cpp', - 'pygpu_ndarray.cpp'], - **libcl_args) - ] -) diff --git a/ndarray/test_gpu_elemwise.py b/ndarray/test_gpu_elemwise.py deleted file mode 100644 index 6b42dbe..0000000 --- a/ndarray/test_gpu_elemwise.py +++ /dev/null @@ -1,410 +0,0 @@ -# TODO: 
test other dtype -from functools import reduce - -import numpy -import pygpu_ndarray as gpu_ndarray -import theano - -from .gen_elemwise import MyGpuNdArray, elemwise_collapses -from .test_gpu_ndarray import dtypes_all, enable_double, gen_gpu_nd_array, product - - -def rand(shape, dtype): - r = numpy.random.randn(*shape) * 10 - if dtype.startswith("u"): - r = numpy.absolute(r) - return r.astype(dtype) - - -# numpy.allclose seam to have problem with int8... -def all_close(x, y): - return (numpy.allclose(x, y) or - numpy.absolute(x - y).max() == 0) - - -def test_elemwise_collapse(): - """ Test collapsing under many broadcast and strided pattern """ - - for dtype1 in ["int16", "float32", "int8"]: - for dtype2 in ["int16", "float32", "int8"]: - - for shape1_, shape2_, expected in [ - # 1d to test this special case - ((40,), (40,), 0), - ((40,), (1,), 1), - # No broadcastable dimensions - ((4, 5, 6, 9), (4, 5, 6, 9), 0), - # All inputs have one(and the same) broadcastable dimension - ((1, 4, 5, 9), (1, 4, 5, 9), 0), - ((4, 1, 5, 9), (4, 1, 5, 9), 0), - ((4, 5, 1, 9), (4, 5, 1, 9), 0), - ((4, 5, 9, 1), (4, 5, 9, 1), 0), - # One inputs have one broadcastable dimension - ((1, 5, 6, 9), (4, 5, 6, 9), 2), - ((4, 1, 6, 9), (4, 5, 6, 9), 3), - ((4, 5, 1, 9), (4, 5, 6, 9), 3), - ((4, 5, 6, 1), (4, 5, 6, 9), 2), - # One inputs have two broadcastable dimension - ((1, 1, 6, 9), (4, 5, 6, 9), 2), - ((1, 5, 1, 9), (4, 5, 6, 9), 4), - ((1, 5, 6, 1), (4, 5, 6, 9), 3), - ((4, 1, 1, 9), (4, 5, 6, 9), 3), - ((4, 1, 6, 1), (4, 5, 6, 9), 4), - ((4, 5, 1, 1), (4, 5, 6, 9), 2), - # One inputs have tree broadcastable dimension - ((1, 1, 1, 9), (4, 5, 6, 9), 2), - ((1, 1, 6, 1), (4, 5, 6, 9), 3), - ((1, 5, 1, 1), (4, 5, 6, 9), 3), - ((4, 1, 1, 1), (4, 5, 6, 9), 2), - # One scalar - ((1, 1, 1, 1), (4, 5, 6, 9), 1), - # One scalar, the other 1 broadcast dims - ((1, 1, 1, 1), (4, 5, 6, 1), 1), - ]: - scalar_cpu = rand((1,) * len(shape1_), dtype=dtype1) - scalar_gpu = gpu_ndarray.GpuNdArrayObject(scalar_cpu) - scalar_gpu1 = MyGpuNdArray(scalar_gpu) - for shape1, shape2 in [(shape1_, shape2_), (shape2_, shape1_)]: - a_cpu = rand(shape1, dtype=dtype1) - a = gpu_ndarray.GpuNdArrayObject(a_cpu) - a1 = MyGpuNdArray(a) - - b_cpu = rand(shape2, dtype=dtype2) - b = gpu_ndarray.GpuNdArrayObject(b_cpu) - b1 = MyGpuNdArray(b) - - assert len(shape1) == len(shape2) - o_shape = [] - for i in range(len(shape1)): - o_shape.append(max(shape1[i], shape2[i])) - o = gpu_ndarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype) - - # 1.1 Check direct collapse - nd_collaps, info = elemwise_collapses([a, b], [o]) - assert nd_collaps == expected, (shape1, shape2, - nd_collaps, expected, info) - - # 1.2 Check computation are still valid - f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], - len(shape1)) - out = f([a1, b1]) - out2 = f([a1, b1], out=out) - assert out is out2 - assert numpy.allclose(numpy.asarray(f([a1, b1])), - a_cpu + b_cpu) - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(a1, b1)), a_cpu + b_cpu) - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.add(a1, b1)), a_cpu + b_cpu) - assert MyGpuNdArray.add(a1, b1, out=out2) is out2 - - # 1.3 Check work without collaping - f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], - len(shape1), collapse=False) - out = f([a1, b1]) - out2 = f([a1, b1], out=out) - assert out is out2 - assert numpy.allclose(numpy.asarray(f([a1, b1])), - a_cpu + b_cpu) - assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( - a1, b1)), a_cpu + b_cpu) - assert 
numpy.allclose(numpy.asarray(MyGpuNdArray.add( - a1, b1)), a_cpu + b_cpu) - assert MyGpuNdArray.add(a1, b1, out=out2) is out2 - - # 2.1 What if we add a scalar? - nd_collaps, info = elemwise_collapses( - [a, b, scalar_gpu], [o]) - if expected == 0: - expected2 = 1 - else: - expected2 = expected - assert nd_collaps == expected2, (shape1, shape2, - nd_collaps, expected, - info) - # 2.2 Check computation - assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( - a1, b1, scalar_gpu1)), - a_cpu + b_cpu + scalar_cpu) - - # 3.1 What if one of the dimensions is strided? - broadcast = any([True for i in a.shape + b.shape - if i == 1]) - if expected == 0: - expected2 = 2 - else: - expected2 = expected - - if len(shape1_) != 4: - continue - - if a.shape[0] != 1: - shape = list(shape1) - shape[0] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == expected2, err - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - if a.shape[1] != 1: - shape = list(shape1) - shape[1] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == expected2, err - pass - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - if a.shape[2] != 1: - shape = list(shape1) - shape[2] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == expected2, err - pass - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - if a.shape[3] != 1: - shape = list(shape1) - shape[3] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, - ::, ::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == 1, err - pass - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - -def test_elemwise_mixed_dtype(): - to_cpu = numpy.asarray - - for dtype1 in ["int16", "float32", "int8"]: - for dtype2 in ["int16", "float32", "int8"]: - dtypeo = str((numpy.zeros(1, dtype=dtype1) + - numpy.zeros(1, dtype=dtype2)).dtype) - #print "dtypes", dtype1, dtype2, "o dtype", dtypeo - - #print " Test inside a wrapping python object 2 inputs" - for shape in [(500,), (50, 5), (5, 6, 7)]: - input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2]] - del dtype - gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) - for i in input_vals] - assert all([numpy.allclose(to_cpu(ig), i) - for ig, i in zip(gpu_vals, input_vals)]) - - gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] - out = gpu_vals[0] + gpu_vals[1] - assert numpy.allclose(to_cpu(out), - input_vals[0] + input_vals[1]) - out = gpu_vals[0] - 
gpu_vals[1] - assert numpy.allclose(to_cpu(out), - input_vals[0] - input_vals[1]) - out = gpu_vals[0] * gpu_vals[1] - assert all_close(to_cpu(out), - input_vals[0] * input_vals[1]) - if dtypeo.startswith("float"): - # TODO: execute for all dtype - out = gpu_vals[0] / gpu_vals[1] - assert numpy.allclose(to_cpu(out), - input_vals[0] / input_vals[1]) - - nb_in = 4 - #print " Test inside a wrapping python object %d inputs"%nb_in - for shape in [(500,), (50, 5), (5, 6, 7)]: - input_vals = [rand(shape, dtype) - for dtype in [dtype1, dtype2, dtype1, dtype2]] - gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) - for i in input_vals] - assert all([numpy.allclose(to_cpu(ig), i) - for ig, i in zip(gpu_vals, input_vals)]) - - gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] - out = MyGpuNdArray.adds(*gpu_vals) - assert numpy.allclose(to_cpu(out), - reduce(numpy.add, input_vals)) - - out = MyGpuNdArray.multiplys(*gpu_vals) - assert all_close(to_cpu(out), - reduce(numpy.multiply, input_vals)) - - #print " Test broadcasting" - for shapes in [((1, 5), (4, 5)), ((33, 10), (33, 1)), - ((33, 1, 5), (33, 10, 1)), - ((33, 1, 5), (33, 10, 1), ((1, 10, 5))), - ]: - input_vals = [rand(shape, dtype) for shape, dtype - in zip(shapes, [dtype1, dtype2])] - gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) - for i in input_vals] - assert all([numpy.allclose(to_cpu(ig), i) - for ig, i in zip(gpu_vals, input_vals)]) - - gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] - out = MyGpuNdArray.adds(*gpu_vals) - assert numpy.allclose(to_cpu(out), - reduce(numpy.add, input_vals)) - - out = MyGpuNdArray.multiplys(*gpu_vals) - assert all_close(to_cpu(out), - reduce(numpy.multiply, input_vals)) - - -def test_sum(): - to_cpu = numpy.asarray - dtypes = list(dtypes_all) - # I remove *int8 as currently the output have the same dtype - # And this cause overflow - dtypes.remove("int8") - dtypes.remove("uint8") - # I need to find how pycuda handle complexe in c. - # I probably just need to add an header. - dtypes.remove("complex64") - if enable_double: - dtypes.remove("complex128") - for shape in [ - # need something bigger then 32, 1024 or 4096. - # Those are corner case. 
- - # 1d, take only a few seconds on a GTX470 - (0,), (5,), (31,), (32,), (33,), - (1023,), (1024,), (1025,), - (4095,), (4096,), (4097,), - (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,), - - # 2d, take 2 minutes on a GTX 470 - (0, 0), (1, 0), (0, 1,), (5, 4), - (31, 31), (31, 32), (31, 33), - (32, 31), (32, 32), (32, 33), - (33, 31), (33, 32), (33, 33), - (1024, 32), (1025, 32), - (1024, 33), (1025, 33), - (4096, 32), (32, 4096), (4096, 33), (33, 4096), - (4097, 32), (32, 4097), (4097, 33), (33, 4097), - - # 3d, take 2 minutes on a GTX 470 - (0, 0, 0), (0, 1, 0), (0, 0, 1), - (5, 4, 3), (5, 4, 3), (5, 4, 3), - (4096, 2, 33), (2, 4096, 33), (33, 2, 4096), - (4097, 2, 33), (2, 4097, 33), (33, 2, 4097), - (4096, 33, 2), (33, 4096, 2), (2, 33, 4096), - (4097, 33, 2), (33, 4097, 2), (2, 33, 4097), - - # 4d, take 1 minutes on a GTX 470 - (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), - (0, 0, 1, 0), (0, 0, 0, 1), - (5, 4, 3, 2), - (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32), - (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32), - (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33), - (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33), - (4100, 4, 3, 2), (4, 4100, 3, 2), - (4, 3, 4100, 2), (4, 3, 2, 4100), - - # 5d, work only if c contiguous - (5, 4, 3, 10, 11), - ]: - - for dtype, off_o, off_i, sliced, order in product( - *([dtypes] + - [[False, True]] + - [[False, True]] + - [[-1, 2, -2, 1]] + - [['f', 'c']])): - - cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o, - off_i, sliced, order) - - if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or - gpu_val.flags["F_CONTIGUOUS"]): - continue - gpu_val = MyGpuNdArray(gpu_val) - cpu_sum = cpu_val.sum() -# print dtype, shape, off_o, off_i, sliced, order -# print (cpu_val.strides, -# cpu_val.flags["C_CONTIGUOUS"], -# cpu_val.flags["F_CONTIGUOUS"]) -# print (gpu_val.strides, -# gpu_val.flags["C_CONTIGUOUS"], -# gpu_val.flags["F_CONTIGUOUS"]) - gpu_sum = to_cpu(gpu_val.sum()) - - def get_rtol(orig, after_reduction): - if after_reduction.size == 0: - return 0 - if orig.size // after_reduction.size > 500000: - rtols = {"float32": 4.3e-5} - elif orig.size // after_reduction.size > 100000: - rtols = {"float32": 3e-5} - elif orig.size // after_reduction.size > 50000: - rtols = {"float32": 2e-5} - else: - rtols = {"float32": 1e-5} - if dtype in rtols: - rtol = rtols[dtype] - else: - rtol = 1e-8 - return rtol - rtol = get_rtol(gpu_val, gpu_sum) - cpu_sum = cpu_sum.astype(dtype) - if not (dtype.endswith("int16") and numpy.prod(shape) > 20000): - assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or - cpu_sum == gpu_sum), ( - dtype, shape, cpu_sum, gpu_sum, - (cpu_sum - gpu_sum) / cpu_sum) - - # Test pattern 10 and 01 - # Test pattern 100, 010 and 001 - if len(shape) in [2, 3]: - for axis in range(len(shape)): - gpu_sum = to_cpu(gpu_val.sum(axis=[axis])) - cpu_sum = cpu_val.sum(axis=axis) - rtol = get_rtol(gpu_val, gpu_sum) - if cpu_sum.size > 0: - argmax = numpy.absolute(cpu_sum - gpu_sum).argmax() - cpu_max = cpu_sum.flatten()[argmax] - gpu_max = gpu_sum.flatten()[argmax] - assert numpy.allclose(cpu_sum, gpu_sum), ( - "axis=%d" % axis, dtype, shape, cpu_sum.shape, - cpu_sum, gpu_sum, - cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max) diff --git a/ndarray/test_gpu_ndarray.py b/ndarray/test_gpu_ndarray.py deleted file mode 100644 index 7e88165..0000000 --- a/ndarray/test_gpu_ndarray.py +++ /dev/null @@ -1,488 +0,0 @@ -import copy - -import numpy -import pygpu_ndarray as gpu_ndarray - - -enable_double = True -enable_double = False - 
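The get_rtol helper in test_sum above loosens the comparison tolerance as more float32 elements are folded into each output value, since accumulation error grows with the reduction ratio. A minimal, detached sketch of that idea in plain numpy follows; the reduction_rtol name and the 1024x33 shape are illustrative (not from the deleted tests), while the thresholds mirror get_rtol:

```python
import numpy

def reduction_rtol(orig_size, reduced_size, dtype):
    """Pick an rtol that grows with the reduction ratio (thresholds as in get_rtol)."""
    if reduced_size == 0:
        return 0.0
    ratio = orig_size // reduced_size
    if dtype == "float32":
        if ratio > 500000:
            return 4.3e-5
        if ratio > 100000:
            return 3e-5
        if ratio > 50000:
            return 2e-5
        return 1e-5
    return 1e-8

a = numpy.random.rand(1024, 33).astype("float32")
ref = a.astype("float64").sum(axis=0)   # high-precision reference
got = a.sum(axis=0)                     # float32 result under test
rtol = reduction_rtol(a.size, got.size, "float32")
assert numpy.allclose(ref, got, rtol=rtol)
```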
-dtypes_all = ["float32", - "int8", "int16", "int32", "int64", - "uint8", "uint16", "uint32", "uint64", - "complex64", - ] - -dtypes_no_complex = ["float32", - "int8", "int16", "int32", "int64", - "uint8", "uint16", "uint32", "uint64", - ] -if enable_double: - dtypes_all += ["float64", "complex128"] - dtypes_no_complex += ["float64"] - - -def check_flags(x, y): - assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] - assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] - assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] - assert x.flags["OWNDATA"] == y.flags["OWNDATA"] - assert x.flags["ALIGNED"] == y.flags["ALIGNED"] - assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] - - -def check_meta(x, y): - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - check_flags(x, y) - - -def check_all(x, y): - check_meta(x, y) - assert numpy.allclose(numpy.asarray(x), numpy.asarray(y)) - - -def gen_gpu_nd_array(shape_orig, dtype='float32', offseted_outer=False, - offseted_inner=False, sliced=1, order='c'): - if sliced is True: - sliced = 2 - elif sliced is False: - sliced = 1 - shape = numpy.asarray(shape_orig).copy() - if sliced != 1 and len(shape) > 0: - shape[0] *= numpy.absolute(sliced) - if offseted_outer and len(shape) > 0: - shape[0] += 1 - if offseted_inner and len(shape) > 0: - shape[-1] += 1 - - a = numpy.random.rand(*shape) * 10 - if dtype.startswith("u"): - a = numpy.absolute(a) - a = numpy.asarray(a, dtype=dtype) - assert order in ['c', 'f'] - if order == 'f' and len(shape) > 0: - a = numpy.asfortranarray(a) - b = gpu_ndarray.GpuNdArrayObject(a) - if order == 'f' and len(shape) > 0 and b.size > 1: - assert b.flags['F_CONTIGUOUS'] - - if offseted_outer and len(shape) > 0: - b = b[1:] - a = a[1:] - assert b.offset != 0 - if offseted_inner and len(shape) > 0: - # The b[..., 1:] act as the test for this subtensor case. 
- b = b[..., 1:] - a = a[..., 1:] - assert b.offset != 0 - if sliced != 1 and len(shape) > 0: - a = a[::sliced] - b = b[::sliced] - - if False and shape_orig == (): - assert a.shape == (1,) - assert b.shape == (1,) - else: - assert a.shape == shape_orig, (a.shape, shape_orig) - assert b.shape == shape_orig, (b.shape, shape_orig) - - assert numpy.allclose(a, numpy.asarray(b)) - - return a, b - - -def product(*args, **kwds): - # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy - # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 - pools = list(map(tuple, args)) * kwds.get('repeat', 1) - result = [[]] - for pool in pools: - result = [x + [y] for x in result for y in pool] - for prod in result: - yield tuple(prod) - - -def test_transfer(): - for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [True, False]: - a, b = gen_gpu_nd_array(shp, dtype, offseted) - c = numpy.asarray(b) - - assert numpy.allclose(c, a) - assert a.shape == b.shape == c.shape - assert a.strides == b.strides == c.strides - assert a.dtype == b.dtype == c.dtype == dtype - assert c.flags.c_contiguous - - -def test_transfer_not_contiguous(): - """ - Test transfer when the input on the CPU is not contiguous - TODO: test when the input on the gpu is not contiguous - """ - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - a = numpy.random.rand(*shp) * 10 - a = a[::-1] - b = gpu_ndarray.GpuNdArrayObject(a) - c = numpy.asarray(b) - - assert numpy.allclose(c, a) - assert a.shape == b.shape == c.shape - # We copy a to a c contiguous array before the transfer - assert (-a.strides[0],) + a.strides[1:] == b.strides == c.strides - assert a.dtype == b.dtype == c.dtype - assert c.flags.c_contiguous - - -def test_transfer_fortran(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - a = numpy.random.rand(*shp) * 10 - a_ = numpy.asfortranarray(a) - if len(shp) > 1: - assert a_.strides != a.strides - a = a_ - b = gpu_ndarray.GpuNdArrayObject(a) - c = numpy.asarray(b) - - assert a.shape == b.shape == c.shape - assert a.dtype == b.dtype == c.dtype - assert a.flags.f_contiguous - assert c.flags.f_contiguous - assert a.strides == b.strides == c.strides - assert numpy.allclose(c, a) - - -def test_ascontiguousarray(): - for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted_o in [True, False]: - for offseted_i in [True, True]: - for sliced in [1, 2, -1, -2]: - for order in ['f', 'c']: - #print shp, dtype, offseted_o, offseted_i, - #print sliced, order - cpu, gpu = gen_gpu_nd_array(shp, dtype, offseted_o, - offseted_i, - sliced, order) - - a = numpy.ascontiguousarray(cpu) - b = gpu_ndarray.ascontiguousarray(gpu) - - # numpy upcast with a view to 1d scalar. 
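The product helper defined above is a pure-Python re-implementation of itertools.product (its docstring examples are the ones from the itertools documentation). A quick equivalence check, assuming nothing beyond the standard library:

```python
import itertools

def product(*args, **kwds):
    # Same expansion as the deleted helper: grow the result pool by pool.
    pools = list(map(tuple, args)) * kwds.get('repeat', 1)
    result = [[]]
    for pool in pools:
        result = [x + [y] for x in result for y in pool]
    for prod in result:
        yield tuple(prod)

assert list(product('ABCD', 'xy')) == list(itertools.product('ABCD', 'xy'))
assert list(product(range(2), repeat=3)) == list(itertools.product(range(2), repeat=3))
```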
- if (sliced != 1 or shp == () or - (offseted_i and len(shp) > 1)): - assert b is not gpu - if sliced == 1 and not offseted_i: - assert ((a.data is cpu.data) == - (b.bytes is gpu.bytes)) - else: - assert b is gpu - - assert a.shape == b.shape - assert a.dtype == b.dtype - assert a.flags.c_contiguous - assert b.flags['C_CONTIGUOUS'] - assert a.strides == b.strides - assert numpy.allclose(cpu, a) - assert numpy.allclose(cpu, b) - - -def test_asfortranarray(): - for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted_outer in [True, False]: - for offseted_inner in [True, False]: - for sliced in [1, 2, -1, -2]: - for order in ['f', 'c']: -#print shp, dtype, offseted_outer, offseted_inner, sliced, order - cpu, gpu = gen_gpu_nd_array(shp, dtype, - offseted_outer, - offseted_inner, - sliced, - order) - - a = numpy.asfortranarray(cpu) - b = gpu_ndarray.asfortranarray(gpu) - - # numpy upcast with a view to 1d scalar. - if (sliced != 1 or shp == () or - (offseted_outer and len(shp) > 1) or - (order != 'f' and len(shp) > 1)): - assert b is not gpu - if (sliced == 1 and not offseted_outer and - order != 'c'): - assert ((a.data is cpu.data) == - (b.bytes is gpu.bytes)) - else: - assert b is gpu - pass - - assert a.shape == b.shape - assert a.dtype == b.dtype - assert a.flags.f_contiguous - if shp != (): - assert b.flags['F_CONTIGUOUS'] - assert a.strides == b.strides - assert numpy.allclose(cpu, a) - assert numpy.allclose(cpu, b) - - -def test_zeros(): - for shp in [(), (0,), (5,), - (0, 0), (1, 0), (0, 1), (6, 7), - (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), - (4, 8, 9), (1, 8, 9)]: - for order in ["C", "F"]: - for dtype in dtypes_all: - x = numpy.zeros(shp, dtype, order) - y = gpu_ndarray.zeros(shp, dtype, order) - check_all(x, y) - x = gpu_ndarray.zeros(()) # no dtype and order param - y = numpy.zeros(()) - check_meta(x, y) - - try: - gpu_ndarray.zeros() - assert False - except TypeError: - pass - - -def test_empty(): - for shp in [(), (0,), (5,), - (0, 0), (1, 0), (0, 1), (6, 7), - (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), - (4, 8, 9), (1, 8, 9)]: - for order in ["C", "F"]: - for dtype in dtypes_all: - x = numpy.empty(shp, dtype, order) - y = gpu_ndarray.empty(shp, dtype, order) - check_meta(x, y) - x = gpu_ndarray.empty(()) # no dtype and order param - y = numpy.empty(()) - check_meta(x, y) - try: - gpu_ndarray.empty() - assert False - except TypeError: - pass - - -def test_mapping_getitem_ellipsis(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [True, False]: - a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) - - b = a_gpu[...] - assert b.bytes == a_gpu.bytes - assert b.strides == a.strides - assert b.shape == a.shape - b_cpu = numpy.asarray(b) - assert numpy.allclose(a, b_cpu) - - -def test_copy_view(): - from ..array import may_share_memory - - def check_memory_region(a, a_op, b, b_op): - assert numpy.may_share_memory(a, a_op) == may_share_memory(b, b_op) - - if a_op.base is None: - assert b_op.base is None - else: - assert a_op.base is a - if b.base: - # We avoid having a series of object connected by base. - # This is to don't bloc the garbage collection. 
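test_ascontiguousarray and test_asfortranarray check when the GPU calls hand back the same object versus a fresh contiguous copy, mirroring numpy's behaviour (the GPU side compares b.bytes where numpy compares the .data buffer). A small plain-numpy sketch of that copy-versus-no-copy rule, with shapes chosen only for illustration:

```python
import numpy

a = numpy.arange(12, dtype="float32").reshape(3, 4)
assert numpy.ascontiguousarray(a) is a        # already C-contiguous: no copy made

view = a[:, ::2]                              # strided view, not C-contiguous
fixed = numpy.ascontiguousarray(view)
assert fixed is not view                      # a fresh, packed copy
assert fixed.flags["C_CONTIGUOUS"]
assert numpy.allclose(fixed, view)

f = numpy.asfortranarray(a)                   # column-major copy of a C-ordered array
assert f is not a and f.flags["F_CONTIGUOUS"]
assert numpy.asfortranarray(f) is f           # already F-contiguous: returned as-is
```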
- assert b_op.base is b.base - else: - assert b_op.base is b - - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [False, True]: - # order1 is the order of the original data - for order1 in ['c', 'f']: - # order2 is the order wanted after copy - for order2 in ['c', 'f']: - print(shp, dtype, offseted, order1, order2) - #TODO test copy unbroadcast! - a, b = gen_gpu_nd_array(shp, dtype, offseted, - order=order1) - - assert numpy.allclose(a, numpy.asarray(b)) - check_flags(a, b) - - c = b.copy(order2) - assert numpy.allclose(a, numpy.asarray(c)) - check_flags(c, a.copy(order2)) - check_memory_region(a, a.copy(order2), b, c) - - d = copy.copy(b) - assert numpy.allclose(a, numpy.asarray(d)) - check_flags(d, copy.copy(a)) - check_memory_region(a, copy.copy(a), b, d) - - e = b.view() - assert numpy.allclose(a, numpy.asarray(e)) - check_flags(e, a.view()) - check_memory_region(a, a.view(), b, e) - - f = copy.deepcopy(b) - assert numpy.allclose(a, numpy.asarray(f)) - check_flags(f, copy.deepcopy(a)) - check_memory_region(a, copy.deepcopy(a), b, f) - - g = copy.copy(b.view()) - assert numpy.allclose(a, numpy.asarray(g)) - check_memory_region(a, copy.copy(a.view()), b, g) - check_flags(g, copy.copy(a.view())) - - -def test_len(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [True, False]: - a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) - assert len(a_gpu) == shp[0] - - -def test_mapping_getitem_w_int(): - def _cmp(x, y): - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] - assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] - if x.flags["WRITEABLE"] != y.flags["WRITEABLE"]: - assert x.ndim == 0 - assert not x.flags["OWNDATA"] - assert y.flags["OWNDATA"] - else: - assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] - assert x.flags["OWNDATA"] == y.flags["OWNDATA"] - assert x.flags["ALIGNED"] == y.flags["ALIGNED"] - assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] - x = numpy.asarray(x) - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - if not numpy.all(x == y): - print(x) - print(y) - assert numpy.all(x == y), (x, y) - - def _cmpNs(x, y): - """ - Don't compare the stride after the transfer - There is a copy that have been made on the gpu before the transfer - """ - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] - assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] - assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] - assert x.flags["ALIGNED"] == y.flags["ALIGNED"] - assert x.flags["OWNDATA"] == y.flags["OWNDATA"] - assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] - x_ = numpy.asarray(x) - assert x_.shape == y.shape - assert x_.dtype == y.dtype - if not numpy.all(x_ == y): - print(x_) - print(y) - assert numpy.all(x_ == y), (x_, y) - pass - - def _cmpf(x, *y): - try: - x.__getitem__(y) - except IndexError: - pass - else: - raise Exception("Did not generate out or bound error") - - def _cmpfV(x, *y): - try: - if len(y) == 1: - x.__getitem__(*y) - else: - x.__getitem__(y) - except ValueError: - pass - else: - raise Exception("Did not generate value error") - - for dtype in dtypes_all: - for offseted in [True, False]: - # test vector - dim = (2,) - a, _a = gen_gpu_nd_array(dim, dtype, offseted) - - import sys - init_ref_count = 
sys.getrefcount(_a) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - - _cmp(_a[-1], a[-1]) - _cmp(_a[1], a[1]) - _cmp(_a[0], a[0]) - _cmp(_a[::1], a[::1]) - _cmpNs(_a[::-1], a[::-1]) - _cmp(_a[...], a[...]) - _cmpf(_a, 2) - - # test scalar - dim = () - a, _a = gen_gpu_nd_array(dim, dtype, offseted) - _cmp(_a[...], a[...]) - _cmpf(_a, 0) - _cmpfV(_a, slice(1)) - - # test 4d-tensor - dim = (5, 4, 3, 2) - a, _a = gen_gpu_nd_array(dim, dtype, offseted) - _cmpf(_a, slice(-1), slice(-1), 10, -10) - _cmpf(_a, slice(-1), slice(-1), -10, slice(-1)) - _cmpf(_a, 0, slice(0, -1, -20), -10) - _cmpf(_a, 10) - _cmpf(_a, (10, 0, 0, 0)) - _cmpf(_a, -10) - - #test with integer - _cmp(_a[1], a[1]) - _cmp(_a[-1], a[-1]) - _cmp(_a[numpy.int64(1)], a[numpy.int64(1)]) - _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)]) - - #test with slice - _cmp(_a[1:], a[1:]) - _cmp(_a[1:2], a[1:2]) - _cmp(_a[-1:1], a[-1:1]) - - #test with tuple (mix slice, integer, numpy.int64) - _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1]) - _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1]) - _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1]) - _cmpNs(_a[:, :, ::-1, ::-1], a[:, :, ::-1, ::-1]) - _cmpNs(_a[:, :, ::-10, ::-10], a[:, :, ::-10, ::-10]) - _cmpNs(_a[:, :, 1, -1], a[:, :, 1, -1]) - _cmpNs(_a[:, :, -1, :], a[:, :, -1, :]) - _cmpNs(_a[:, ::-2, -1, :], a[:, ::-2, -1, :]) - _cmpNs(_a[:, ::-20, -1, :], a[:, ::-20, -1, :]) - _cmpNs(_a[:, ::-2, -1], a[:, ::-2, -1]) - _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1]) - _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2]) - - #test ellipse - _cmp(_a[...], a[...])
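test_mapping_getitem_w_int records sys.getrefcount(_a) before exercising __getitem__ repeatedly; the visible snippet only stores that baseline, but the natural use of such a counter is to catch reference leaks from repeated indexing. A hedged plain-Python sketch of the technique, with numpy standing in for the GPU array and the loop and final assert being illustrative rather than taken from the deleted test:

```python
import sys
import numpy

a = numpy.arange(10)
baseline = sys.getrefcount(a)

for _ in range(100):
    view = a[::-1]      # each view keeps `a` alive through its .base reference
    del view            # dropping the view must release that reference

assert sys.getrefcount(a) == baseline   # no references leaked by __getitem__
```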