diff --git a/ndarray/Makefile b/ndarray/Makefile deleted file mode 100644 index dbfc673..0000000 --- a/ndarray/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -all: pygpu_ndarray.so - -PYTHONVERSION ?= $(shell python -c "import sys; print '%d.%d'%(sys.version_info[0], sys.version_info[1]")) -CUDA_ROOT ?= /opt/lisa/os/cuda -THEANO_ROOT ?= /u/bastienf/repos/Theano - - -CFLAGS=-g -DDEBUG -DOFFSET -# By default enable the OFFSET usage. Otherwise some test fail. -CFLAGS=-g -DOFFSET -#BINDIR=--compiler-bindir ${HOME}/.theano.nvcc-bindir - -#NPY_PATH!=python -c "import numpy;print numpy.__path__" -#NPY_INCLUDE=-I${NPY_PATH}/core/include -CUDA_INCLUDE=-I${CUDA_ROOT}/include -PYTHON_INCLUDE=-I$(shell python -c "import distutils.sysconfig;print distutils.sysconfig.get_python_inc()") -INCLUDES=${CUDA_INCLUDE} ${PYTHON_INCLUDE} -CUDA_FLAGS=-Xlinker -rpath,${CUDA_ROOT}/lib64 -Xlinker -rpath,${CUDA_ROOT}/lib - -pygpu_language_cuda.o: pygpu_language_cuda.cu pygpu_language.h - nvcc -c ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ $< - -pygpu_ndarray.so: pygpu_ndarray.cpp pygpu_ndarray.h pygpu_language_cuda.o pygpu_ndarray_object.h - nvcc -shared ${CFLAGS} -m64 -Xcompiler -fPIC,-m64 ${CUDA_FLAGS} ${INCLUDES} ${BINDIR} -o $@ pygpu_language_cuda.o $< -lpython${PYTHONVERSION} -lcublas -lcudart - -clean: - rm -f pygpu_ndarray.so core.* *.o *~ - rm -rf build - -cleantmp: - rm -f core.* *.o *~ \ No newline at end of file diff --git a/ndarray/__init__.py b/ndarray/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/ndarray/gen_elemwise.py b/ndarray/gen_elemwise.py deleted file mode 100644 index d471eef..0000000 --- a/ndarray/gen_elemwise.py +++ /dev/null @@ -1,1911 +0,0 @@ -""" -This file implement 1 version of the elemwise op on the gpu. - -The elemwise fct are also used with scalar operation! So it can happen -that ndim is 0 as with all scalar type. -""" - - -import numpy -import pygpu_ndarray as gpu_ndarray -import StringIO - - -_CL_MODE = hasattr(gpu_ndarray, "set_opencl_context") - - -if _CL_MODE: - # THIS IS NOT FINISHED - import pyopencl as cl - import pyopencl.array as cl_array - - # import pyopencl._mymako as mako - from pyopencl._cluda import CLUDA_PREAMBLE - from pyopencl.tools import dtype_to_ctype - - # TODO: use mako to get rid of the %if - CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455] - CLUDA_PREAMBLE += """ -#define LDIM_0 get_local_size(0) -#define LDIM_1 get_local_size(1) -#define LDIM_2 get_local_size(2) - -#define GDIM_0 get_num_groups(0) -#define GDIM_1 get_num_groups(1) -#define GDIM_2 get_num_groups(2) - """ - # TODO, reuse the same context as the use used to create the memory. 
- ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) -else: - import pycuda.autoinit - import pycuda.driver as driver - - # import pycuda._mymako as mako - from pycuda._cluda import CLUDA_PREAMBLE - from pycuda.compiler import SourceModule - from pycuda.tools import dtype_to_ctype - CLUDA_PREAMBLE += """ -#define LDIM_0 blockDim.x -#define LDIM_1 blockDim.y -#define LDIM_2 blockDim.z - -#define GDIM_0 gridDim.x -#define GDIM_1 gridDim.y -#define GDIM_2 gridDim.z - """ - -import logging - -import theano -from theano import Apply, scalar -from theano.tensor import TensorType - - -_logger_name = 'compyte.gen_elemwise' -_logger = logging.getLogger(_logger_name) -_logger.setLevel(logging.INFO) -_logger.addHandler(logging.StreamHandler()) # TO REMOVE - - -def warning(*msg): - _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg)) - - -def info(*msg): - _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg)) - - -def debug(*msg): - _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg)) - - -if _CL_MODE: - gpu_ndarray.set_opencl_context(ctx.obj_ptr) - - -cast_int = numpy.intc -cast_uint = numpy.uintc - - -def _logical_scalar(x): - return numpy.all(x.type.broadcastable) - - -def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value', - data_str='ii_i%i_data[0]'): - l = [] - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - l += [value_str % ipos] - else: - l += [data_str % ipos] - return l - - -class WrapOpenCLFunction: - def __init__(self, fct): - self.fct = fct - - def _param_wrap(self, p): - if isinstance(p, MyGpuNdArray): - p = p.gpu_nd_array - if isinstance(p, gpu_ndarray.GpuNdArrayObject): - p = cl.MemoryObject.from_cl_mem_as_int(p.bytes) - return p - - def set_block_shape(self, *shape): - self.local_size = shape - - def param_set(self, *param): - self.param = [self._param_wrap(p) for p in param] - - def launch_grid(self, *global_shape): - global_size = global_shape + (1,) - - d = {"g_times_l": True} - return self.fct(queue, global_size, self.local_size, - *self.param, **d) - - -def compile_gpu_code(code, fct_name): - if _CL_MODE: - # Compile the gpu function with pyopencl - prg = cl.Program(ctx, code).build() - fct2 = getattr(prg, fct_name) - - fct = WrapOpenCLFunction(fct2) - else: - # Compile the gpu function with pycuda - mod = SourceModule(code) - fct = mod.get_function(fct_name) - return fct - - -class ElemwiseAlgo: - verbose = 0 # 1, 2 or 3 for more verbose output. - cache_version = () - cache_version = ('debug', 14, verbose) - - def __init__(self, scalar_op, inplace_pattern={}): - """ - :param scalar_op: the scalar operation to execute on each element. - """ - self.scalar_op = scalar_op - self.inplace_pattern = inplace_pattern - - def task_code(self, inputs, outputs, sio, - nodename, iname=None, oname=None): - if iname == None: - iname = get_str_list_logical_scalar(inputs) - if oname == None: - oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)] - print(self.scalar_op.c_code( - Apply(self.scalar_op, - [scalar.Scalar(dtype=input.type.dtype)() - for input in inputs], - [scalar.Scalar(dtype=output.type.dtype)() - for output in outputs]), - nodename + '_scalar_', - iname, - oname, - sub=dict(fail='return;')), file=sio) # TODO: set a failure code somehow!!! 
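# Illustrative usage sketch (kernel name, buffers and sizes are made up for
# the example): whichever backend is active, compile_gpu_code() returns an
# object that is driven through the same three calls call_elemwise() uses
# further down in this file.
#
#   fct = compile_gpu_code(kernel_source, "kernel_MyNode_1")
#   fct.set_block_shape(32, 1, 1)            # CUDA block / OpenCL local size
#   fct.param_set(cast_uint(num_els), cast_int(dim0),
#                 in0_buf, cast_int(in0_stride0),
#                 out0_buf, cast_int(out0_stride0))
#   fct.launch_grid(n_blocks, 1)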
- - def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"): - sio = StringIO.StringIO() - #print 'C_SRC_KERNEL', sio.getvalue() - - for ipos, i in enumerate(inputs): - print("// Input ", ipos, str(i.type), file=sio) - for ipos, i in enumerate(outputs): - print("// Output ", ipos, str(i.type), file=sio) - print(static, ( - f"KERNEL void kernel_{nodename}_{nd}(unsigned int numEls"), file=sio) - if (nd): - print("\t,", ", ".join("const int dim%i" % i - for i in range(nd)), file=sio) - #declare inputs - for ipos, i in enumerate(inputs): - s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % ( - dtype_to_ctype(i.dtype), ipos)] + - list("int i%i_str_%i" % (ipos, d) - for d in range(nd))) - print("\t,", s, file=sio) - #declare outputs - for ipos, i in enumerate(outputs): - s = ", ".join(["GLOBAL_MEM %s * o%i_data" % ( - dtype_to_ctype(i.dtype), ipos)] - + list("int o%i_str_%i" % (ipos, d) - for d in range(nd))) - print("\t,", s, file=sio) - #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) - # for d in xrange(nd)) - #print >> sio, "\t,", "float * o%i_data" % ipos - print("\t)\n{", file=sio) - print(" const int idx = GID_0 * LDIM_0 + LID_0;", file=sio) - print(" const int numThreads = LDIM_0 * GDIM_0;", file=sio) - - # For each input that is a scalar which has been broadcasted - # to a tensor, load it into a local variable - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - print(" const %s ii_i%i_value = i%i_data[0];" % ( - dtype_to_ctype(i.dtype), ipos, ipos), file=sio) - - #loop over the elements to be treated by this kernel call - print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio) - # calculate the data pointers for all arguments - print(" int ii = i;", file=sio) - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print((" GLOBAL_MEM const " - "%s * ii_i%i_data = i%i_data;" % ( - dtype_to_ctype(i.dtype), ipos, ipos)), file=sio) - for ipos, i in enumerate(outputs): - print(" GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % ( - dtype_to_ctype(i.dtype), ipos, ipos), file=sio) - for d in range(nd - 1, -1, -1): - if d > 0: - print(" int pos%i = ii %% dim%i;" % (d, d), file=sio) - print(" ii = ii / dim%i;" % d, file=sio) - else: - print(" int pos%i = ii;" % d, file=sio) - - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print((" ii_i" - "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)), file=sio) - for ipos, i in enumerate(outputs): - print(" ii_o%i_data += pos%i * o%i_str_%i;" % ( - ipos, d, ipos, d), file=sio) - - # perform the scalar operation on the input and output references - #TODO: What if the scalar_op needs support_code?? 
- self.task_code(inputs, outputs, sio, nodename) - print(" }", file=sio) - - #indent = " "*(4*d+7) - #for ipos, i in enumerate(inputs): - #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' - print("}", file=sio) - - #print sio.getvalue() - return sio.getvalue() - - def c_src_kernel_Ccontiguous(self, inputs, outputs, - nodename, static="static"): - nd = outputs[0].type.ndim - sio = StringIO.StringIO() - #print 'C_SRC_KERNEL', sio.getvalue() - - for ipos, i in enumerate(inputs): - print("// Input ", ipos, str(i.type), file=sio) - for ipos, i in enumerate(outputs): - print("// Output ", ipos, str(i.type), file=sio) - print(static, ("KERNEL void kernel_%s_Ccontiguous" - " (unsigned int numEls" % (nodename)), file=sio) - #declare inputs - for ipos, i in enumerate(inputs): - print("\t,", "GLOBAL_MEM const %s * i%i_data" % ( - dtype_to_ctype(i.dtype), ipos), file=sio) - #declare outputs - for ipos, i in enumerate(outputs): - print("\t,", "GLOBAL_MEM %s * o%i_data" % ( - dtype_to_ctype(i.dtype), ipos), file=sio) - print("\t)\n{", file=sio) - print(" const int idx = GID_0 * LDIM_0 + LID_0;", file=sio) - print(" const int numThreads = LDIM_0 * GDIM_0;", file=sio) - - # For each input that is a scalar which has been broadcasted - # to a tensor, load it into a local variable - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - print(" const %s ii_i%i_value = i%i_data[0];" % ( - dtype_to_ctype(i.dtype), ipos, ipos), file=sio) - - #loop over the elements to be treated by this kernel call - print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio) - # perform the scalar operation on the input and output references - #TODO: What if the scalar_op needs support_code?? - self.task_code(inputs, outputs, sio, nodename, - iname=get_str_list_logical_scalar( - inputs, data_str='i%i_data[i]'), - oname=['o%i_data[i]' % ipos - for ipos, i in enumerate(outputs)]) - print(" }", file=sio) - print("}", file=sio) - - #print sio.getvalue() - return sio.getvalue() - - def c_src_callkernel(self, inputs, outputs, nodename): - # - # This function serves three main goals: - # - # The first is stride unpacking: - # it accepts input and output arguments as - # float * , int* - # pairs, and it constructs a kernel function call where inputs - # and arguments are named like - # float *, int, int, int ... - # - # The second is to recognize when any dimensions can be collapsed as - # being contiguous. That mean that we can merge that dimensions with - # another one for all inputs/outputs and have the same retusuls - # (confusing... read code) - # - # The thrid is to make a special case for scalar element. We allow - # the collapsing of them. In the ccontiguous and not contiguous case, - # we use registers to lower the number of memory access. - - # TODO: make a special case for broadcasting, to store the - # data in shared memory. - - nd = outputs[0].type.ndim - nb_inputs = len(inputs) - nb_outputs = len(outputs) - d = dict() - # input_params and output_params go into the function - # declaration/definition - input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % ( - dtype_to_ctype(inputs[i].dtype), ipos, ipos) - for ipos in range(len(inputs))) - output_params = ", ".join("%s * o%i_data, const int * o%i_str" % ( - dtype_to_ctype(outputs[i].dtype), - ipos, ipos) - for ipos in range(len(outputs))) - - #input_args and output_args go into the recursive call. 
- input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos) - for ipos in range(len(inputs))) - output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos) - for ipos in range(len(outputs))) - - prod_dims = '*'.join(["dims[%i]" % di for di in range(nd)] + ['1']) - - sio = StringIO.StringIO() - print(""" - static void can_collapse_%(nodename)s(int nd, const int * dims, - const int * strides, - int collapse[]) - { - //can we collapse dims[i] and dims[i-1] - for(int i=nd-1;i>0;i--){ - if(strides[i]*dims[i]==strides[i-1]){ - //the dims nd-1 are not strided again dimension nd - collapse[i]=1; - }else collapse[i]=0; - } - } - """ % locals(), file=sio) - print(""" - static int callkernel_%(nodename)s(unsigned int numEls, const int d, - const int * dims, - %(input_params)s, - %(output_params)s) - { - numEls = %(prod_dims)s; - """ % locals(), file=sio) - if self.verbose: - print(""" - std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; - """ % locals(), file=sio) - print('std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di - for di in range(nd)) + ["'\\n';"]), file=sio) - if self.verbose > 1: - for ipos in range(len(inputs)): - print(""" - std::cerr << " %(ipos)s data strides" << - """ % locals() + " << ' ' << ".join(["i%s_data" % ipos] - + list("i%s_str[%i]" % (ipos, di) - for di in range(nd))) + ''' << "\\n"; ''', file=sio) - - for ipos in range(len(outputs)): - print(""" - std::cerr << " %(ipos)s data strides" << - """ % locals() + " << ' ' << ".join(["o%s_data" % ipos] - + list("o%s_str[%i]" % (ipos, di) - for di in range(nd))) + ''' << "\\n"; ''', file=sio) - # collapse dimension that are broadcast in all inputs. - # need to be done before contiguous collapse as it will break it. - # do the dimensions and the strides - print(""" - int local_dims[%(nd)s]; - int local_str[%(nb_inputs)s][%(nd)s]; - int local_ostr[%(nb_inputs)s][%(nd)s]; - int nd_collapse = %(nd)s; - for(int i=0;i<%(nd)s;i++){//init new dim - local_dims[i]=dims[i]; - } - """ % locals(), file=sio) - for ipos in range(len(inputs)): - print(""" - for(int i=0;i<%(nd)s;i++){//init new strides - local_str[%(ipos)s][i]=i%(ipos)s_str[i]; - } - """ % locals(), file=sio) - for ipos in range(len(outputs)): - print(""" - for(int i=0;i<%(nd)s;i++){//init new strides - local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; - } - """ % locals(), file=sio) - if self.verbose > 2: - print('std::cerr <<"before broadcast collapse\\n";', file=sio) - print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio) - print('std::cerr << "local_dims";', file=sio) - for d in range(nd): - print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio) - print('std::cerr << "\\n";', file=sio) - - for ipos in range(len(inputs)): - print('std::cerr << " local_str inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - for ipos in range(len(outputs)): - print('std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - - print(""" - for(int id=0;id 2: - print('std::cerr <<"after broadcast collapse\\n";', file=sio) - print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio) - print('std::cerr << "local_dims";', file=sio) - for d in range(nd): - print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio) - print('std::cerr << "\\n";', file=sio) - - for ipos in range(len(inputs)): - print('std::cerr << " local_str 
%(ipos)s: " <<' % locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - for ipos in range(len(outputs)): - print('std::cerr << " local_ostr %(ipos)s: " <<' % locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";', file=sio) - # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) - # this is a good idea because we make less index calculation in the gpu. - - print("int nd_collapse_[%(nd)s] = {" % locals() +','.join(['1' for x in range(nd)]) +"};", file=sio) - for ipos in range(len(inputs)): - if not _logical_scalar(inputs[ipos]): - print(""" - int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() +','.join(['1' for x in range(nd)]) +"};", file=sio) - print(""" -can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); -for(int i=0;i 1: - print(""" - std::cerr<< "nd_collapse_%(ipos)s "<< - """ % locals(), file=sio) - print(' << " " << '.join( - ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]" - for i in range(nd)]), file=sio) - print('<< "\\n";', file=sio) - print(""" - std::cerr<< "nd_collapse_ "<< - """ % locals(), file=sio) - print(' << " " << '.join( - ["nd_collapse_[" % locals() + str(i) + "]" - for i in range(nd)]), file=sio) - print('<< "\\n";', file=sio) - - # update the local stride. - for ipos in range(len(inputs)): - print(""" - for(int i=nd_collapse-1;i>0;i--){ - if(nd_collapse_[i]==1){ - local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides - for(int j=i+1;j0;i--){ - if(nd_collapse_[i]==1){ - local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides - for(int j=i+1;j0;i--){ - if(nd_collapse_[i]==1){ - local_dims[i-1]*=local_dims[i];//set new dims - for(int j=i+1;j 0: - print(" && ", " && ".join(l), file=sio) - print("""){nd_collapse=0;} """, file=sio) - - if self.verbose: - print('std::cerr <<"after can_collapse\\n";', file=sio) - print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio) - if self.verbose > 1: - for d in range(nd): - print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio) - print('std::cerr << "\\n";', file=sio) - - for ipos in range(len(inputs)): - print(('std::cerr << " local_str %(ipos)s: " <<' % - locals() + ' << " " << '.join( - ["local_str[%(ipos)s][%(x)s]" % locals() - for x in range(nd)]) + '<<"\\n";'), file=sio) - for ipos in range(len(outputs)): - print(('std::cerr << " local_ostr %(ipos)s: " <<' % - locals() + ' << " " << '.join( - ["local_ostr[%(ipos)s][%(x)s]" % locals() - for x in range(nd)]) + '<<"\\n";'), file=sio) - - def launch_Ccontiguous(nodename, scalar_op): - kernel_call_args = ["numEls"] - for ipos in range(len(inputs)): - kernel_call_args.append("i%i_data" % ipos) - for ipos in range(len(outputs)): - kernel_call_args.append("o%i_data" % ipos) - kernel_call_args = ", ".join(kernel_call_args) - verb = "" - if self.verbose: - verb = 'std::cerr << " Running ccontiguous version\\n";' - print(""" - //first use at least a full warp - int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE - - //next start adding multiprocessors - int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS - - // next start adding more warps per multiprocessor - if (threads_per_block * n_blocks < numEls) - threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - 
kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s); - - //std::cerr << "calling callkernel returned\\n"; - """ % locals(), file=sio) - - print(""" - CNDA_THREAD_SYNC; - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) - { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", - "GpuElemwise %(nodename)s", cudaGetErrorString(err), - n_blocks, threads_per_block, - "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); - return -1; - - } - %(verb)s - return 0; - """ % locals(), file=sio) - - def launch_General(nodename, scalar_op, force_nd): - # kernel_call_args are used to invoke the cuda kernel - local = "local_" - kernel_call_args = ["numEls"] - kernel_call_args.extend(local + "dims[%i]" % di - for di in range(force_nd)) - for ipos in range(len(inputs)): - kernel_call_args += ["i%i_data" % ipos] + list( - local + "str[%i][%i]" % (ipos, di) - for di in range(force_nd)) - #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) - #kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) - for ipos in range(len(outputs)): - kernel_call_args += ["o%i_data" % ipos] + list( - local + "ostr[%i][%i]" % (ipos, di) - for di in range(force_nd)) - #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) - #kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) - if self.verbose: - print(""" - std::cerr << " Running general version with %(force_nd)s dims\\n"; - """ % locals(), file=sio) - print("std::cerr << "+ ' << " " << '.join( - kernel_call_args)+' << "\\n";', file=sio) - #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; - - kernel_call_args = ", ".join(kernel_call_args) - - print(""" - //first use at least a full warp - int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE - - //next start adding multiprocessors - int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS - - // next start adding more warps per multiprocessor - if (threads_per_block * n_blocks < numEls) - threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - - kernel_%(nodename)s_%(force_nd)s<<>>(%(kernel_call_args)s); - """ % locals(), file=sio) - print(""" - CNDA_THREAD_SYNC; - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) - { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", - "GpuElemwise %(nodename)s", cudaGetErrorString(err), - n_blocks, threads_per_block, - "kernel_%(nodename)s_Ccontiguous<<>>(%(kernel_call_args)s)"); - return -1; - - } - return 0; - """ % locals(), file=sio) - - print("if(numEls==0) return 0;", file=sio) - print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals(), file=sio) - print("case 0: {", file=sio) - launch_Ccontiguous(nodename, scalar_op) - print(" } break;", file=sio) - for i in range(1, nd + 1): - print("case " + str(i) + ": {", file=sio) - launch_General(nodename, scalar_op, i) - print(" } break;", file=sio) - - print("}", file=sio) # end case - print("return -2;", file=sio) # should not get to this point - print("}", file=sio) # end fct - - #N.B. 
cudaGetLastError is called by c_code - return sio.getvalue() - - def c_support_code_apply(self, inputs, outputs, nodename): - nd = outputs[0].type.ndim - return "".join( - CLUDA_PREAMBLE, - [self.c_src_kernel(inputs, outputs, nodename, x) - for x in range(1, nd + 1)] + - [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename), - self.c_src_callkernel(inputs, outputs, nodename), - ]) - - def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub): - d = dict(sub) - nd = noutputs[0].type.ndim - d.update(locals()) - sio = StringIO.StringIO() - nin = len(inputs) - nout = len(outputs) - fail = sub['fail'] - opname = str(self.scalar_op) - initial_dims = ','.join('1' for i in range(nd)) - if 1 or self.scalar_op == scalar.pow: - print(""" - //std::cerr << "C_CODE %(opname)s START\\n"; - //standard elemwise size checks - """ % locals(), file=sio) - print(""" - int dims[%(nd)s] = {%(initial_dims)s}; - """ % locals(), file=sio) - - #check that all inputs have valid dimensions - emitted_inames = {} - for id, iname in enumerate(inputs): - if iname in emitted_inames: - assert emitted_inames[iname] is ninputs[id] - continue - broadcasts = ', '.join(map(str, list(map(int, - ninputs[id].broadcastable)))) - nd = ninputs[id].ndim - print(""" - int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; -""" % locals(), file=sio) - emitted_inames[iname] = ninputs[id] - #check that all inputs have valid dimensions - emitted_inames = {} - for id, iname in enumerate(inputs): - if iname in emitted_inames: - continue - print(""" - //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; - if (%(nd)s != %(iname)s->nd) - { - PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd); - %(fail)s; - } - for (int i = 0; i< %(nd)s; ++i) - { - dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i]; - if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1))&& (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i])) - { - //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; - PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. 
One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.", - i, - CudaNdarray_HOST_DIMS(%(iname)s)[i], - dims[i] - ); - %(fail)s; - } - } - """ % locals(), file=sio) - emitted_inames[iname] = True - - #check that all outputs have valid dimensions - for idx, oname in enumerate(outputs): - if idx not in list(self.inplace_pattern.keys()): - print(""" - for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { - if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) - { - Py_DECREF(%(oname)s); - %(oname)s = NULL; - } - } - if (NULL == %(oname)s) - { - %(oname)s = (CudaNdarray*)CudaNdarray_New(); - if (!%(oname)s) - { - //error string already set - %(fail)s; - } - if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims)) - { - //error string already set - Py_DECREF(%(oname)s); - %(oname)s = NULL; - %(fail)s; - } - } - //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; - //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; - """ % locals(), file=sio) - else: - input_idx = self.inplace_pattern[idx] - iname = inputs[input_idx] - print(""" - Py_XDECREF(%(oname)s); - %(oname)s = %(iname)s; - Py_INCREF(%(oname)s); - for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { - if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) - { - Py_DECREF(%(oname)s); - %(oname)s = NULL; - %(fail)s; - } - } - //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; - //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; - """ % locals(), file=sio) - - print(""" - { - //new block so that failure gotos don't skip over variable initialization - //std::cerr << "calling callkernel\\n"; - if (callkernel_%(nodename)s(1, 0, dims - """ % locals(), file=sio) - for iname in inputs: - print(""" - , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s) - """ % locals(), file=sio) - for oname in outputs: - print(""" - , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s) - """ % locals(), file=sio) - print(""" - )) - { - // error - """, file=sio) - for oname in outputs: - print(""" - Py_DECREF(%(oname)s); - %(oname)s = NULL; - """ % locals(), file=sio) - print(""" - %(fail)s; - } - else // no error - { - } - } - //std::cerr << "C_CODE %(opname)s END\\n"; - """ % locals(), file=sio) - #print sio.getvalue() - return sio.getvalue() - - def c_support_code(self): - return """ - #define INTDIV_POW2(a, b) (a >> b) - #define INTMOD_POW2(a, b) (a & ((1<> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) - #print >> sio, "\t,", "float * o%i_data" % ipos - print("\t)\n{", file=sio) - - # For each input that is a scalar which has been broadcasted to a tensor, - # load it into a local variable - print(" __shared__ float value0[%i];" % len(inputs), file=sio) - print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio) - #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" - print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio) - for ipos, i in enumerate(inputs): - if _logical_scalar(i): - print(" value0[%i] = i%i_data[0];" % (ipos, - ipos), file=sio) - for ipos in range(nd): - print(" shared_dims[%i] = dim%i;" % (ipos, ipos), file=sio) - print(" }", file=sio) - print(" __syncthreads();", file=sio) - - if (nd == 4): - print(""" - for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x) - { - for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y) - { - //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x) - for (int pos2 = 
threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y) - { - //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y) - for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x) - { - """, file=sio) - else: - raise NotImplementedError() - - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio) - for ipos, i in enumerate(outputs): - print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio) - for d in range(nd): - for ipos, i in enumerate(inputs): - if not _logical_scalar(i): - print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio) - for ipos, i in enumerate(outputs): - print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio) - - # perform the scalar operation on the input and output references - #TODO: What if the scalar_op needs support_code?? - self.task_code(inputs, outputs, sio, nodename, - iname=get_str_list_logical_scalar( - inputs, value_str='value0[%i]')) - print(" }" * nd, file=sio) - - #TODO: insert runtime stride checks that select the best loop order either here, or in - # the host code that launched the kernel (host code probably better spot) - - #indent = " "*(4*d+7) - #for ipos, i in enumerate(inputs): - #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' - print("}", file=sio) - - print(sio.getvalue()) - return sio.getvalue() - - def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename): - """ The kernel applies to problems with <= 5 dimensions """ - - nd = outputs[0].type.ndim - n_in = len(inputs) - n_out = len(outputs) - sio = StringIO.StringIO() - - if nd not in (2,): - return sio.getvalue() - - # print some leading comments to make the code easier to read - for ipos, i in enumerate(inputs): - print("// Input ", ipos, str(i.type), file=sio) - for ipos, i in enumerate(outputs): - print("// Output ", ipos, str(i.type), file=sio) - print("static __global__ void kernel_%s_%s(unsigned int numEls" %( - nodename, - 'tiling%i_less_registers'%nd), file=sio) - if (nd): - print("\t,", ", ".join("const int dim%i" % i - for i in range(nd)), file=sio) - #declare inputs - for ipos, i in enumerate(inputs): - s = ", ".join(["const float * i%i_data_0" % ipos] + list( - "int i%i_str_%i" % (ipos, d) for d in range(nd))) - print("\t,", s, file=sio) - #declare outputs - for ipos, i in enumerate(outputs): - s = ", ".join(["float * o%i_data_0" % ipos] + list( - "int o%i_str_%i" % (ipos, d) for d in range(nd))) - print("\t,", s, file=sio) - #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) - #print >> sio, "\t,", "float * o%i_data" % ipos - print("\t)\n{", file=sio) - - # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. 
- use_shared_stride = False - use_shared_limits = False - - def decl_limits(nd): - if use_shared_limits: - print("__shared__ float * limits[%(nd)s];" % locals(), file=sio) - - def stride(io, p, d): - if use_shared_stride: - return "s%s_str[%i][%i]" % (io, p, d) - else: - return "%s%i_str_%i" % (io, p, d) - - def limits(d): - if use_shared_limits: - return "limits[%i]" % d - else: - return "limits%i" % d - - def decl_shared_stride(nin, nout, nd): - if not use_shared_stride: - return - print(""" - __shared__ int si_str[%(nin)s][%(nd)s]; - __shared__ int so_str[%(nout)s][%(nd)s]; - if ((threadIdx.x == 0) && (threadIdx.y == 0)) { - """ % locals(), file=sio) - for i in range(nin): - for d in range(nd): - print("si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals(), file=sio) - for i in range(n_out): - for d in range(nd): - print("so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals(), file=sio) - print("} __syncthreads();", file=sio) - - def calc_limit(d): - s = stride('o', 0, d) - lname = limits(d) - if use_shared_limits: - print("if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio) - if d == 0: - print("%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio) - else: - dm1 = d - 1 - print("%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio) - print("} __syncthreads();", file=sio) - else: - if d == 0: - print("const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio) - else: - dm1 = d - 1 - print("const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio) - - def decl_ptrs(d, offset): - dm1 = d - 1 - assert dm1 >= 0 - for i in range(n_in): - s = stride('i', i, d) - print("const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio) - for i in range(n_out): - s = stride('o', i, d) - print("float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio) - - def inc_ptrs(d, amt): - for i in range(n_in): - s = stride('i', i, d) - print("i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio) - for i in range(n_out): - s = stride('o', i, d) - print("o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio) - - def while_limit(d): - lname = limits(d) - print("while (o0_data_%(d)s < %(lname)s) { " % locals(), file=sio) - - def end_while(d): - print("}", file=sio) - - def task_code(d): - self.task_code(inputs, outputs, sio, nodename, - iname=['i%i_data_%i[0]' % (ipos, d) - for ipos, i in enumerate(inputs)], - oname=['o%i_data_%i[0]' % (ipos, d) - for ipos, i in enumerate(outputs)]) - - if nd == 4: - decl_shared_stride(n_in, n_out, nd) - decl_limits(nd) - calc_limit(0) - inc_ptrs(0, 'blockIdx.x') - while_limit(0) - if 1: - calc_limit(1) - decl_ptrs(1, 'blockIdx.y') - while_limit(1) - if 1: - calc_limit(2) - decl_ptrs(2, 'threadIdx.y') - while_limit(2) - if 1: - calc_limit(3) - decl_ptrs(3, 'threadIdx.x') - while_limit(3) - if 1: - task_code(3) - inc_ptrs(3, 'blockDim.x') - end_while(3) - inc_ptrs(2, 'blockDim.y') - end_while(2) - inc_ptrs(1, 'gridDim.y') - end_while(1) - inc_ptrs(0, 'gridDim.x') - end_while(0) - - print("}", file=sio) - print(sio.getvalue()) - return sio.getvalue() - - -def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0): - """ - This collapse dimensions that are not needed when computing - elemwise. This is usefull as it lower the indexing computation - that is heavier on gpu then on cpu. - - This is a generic version. It collapse dimensions at any place in - the shape. It handle broadcasted dimensions correctly. 
- - There is no special handling needed for broadcasted scalar at this level. - - @return: ndims, tuple(dims, strides) after collapsing. - """ - in_out = inputs + outputs - del inputs - if out_shape is not None: - local_dims = tuple(out_shape) - else: - # TODO, use the right algo here or make the parameter not optional - # We should always have the same shape for all outputs - # If there is more then one outputs - local_dims = tuple(outputs[0].shape) - del outputs - nd_orig = len(local_dims) - if nd_orig == 1: - # This have a lower overhead - all_c_contig = True - for inp in in_out: - if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims: - all_c_contig = False - break - if all_c_contig: - return 0, (local_dims, []) - - collapsable = [1] * nd_orig - - local_str = [None] * len(in_out) - nd_collapse = nd_orig - for ipos in range(len(in_out)): - inp = in_out[ipos] - assert len(inp.shape) == nd_orig, "All inputs/outputs must have the same number of dimensions. You must broadcast before calling elemwise_collapse" - local_str[ipos] = list(inp.strides) - # We set the strides of broacastable dims to 0 - # This make indexing in gpu simpler and is needed - # For collapsing the dimensions. - for dim_pos in range(inp.ndim): - if inp.shape[dim_pos] == 1: - local_str[ipos][dim_pos] = 0 - - if nd_orig == 1: - # We already covered the contiguous case before - # So we are sure it is not contiguous - # TODO: Add a test that f contiguous are also collapsed by the first case. - # I think that for 1d array when the flags f contiguous is true, c contiguous is also true. - return 1, (local_dims, local_str) - - if verbose > 2: - print("before broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - for ipos in range(len(local_str)): - print(" local_str inputs", ipos, local_str[ipos]) - local_dims = list(local_dims) - # Collapse dimension that are broadcast in all inputs. - # need to be done before contiguous collapse as it will break it. - # Update the dimensions and the strides - for id in range(nd_collapse): - if local_dims[id] == 1: - # remove dims i from the array - for j in range(id + 1, nd_collapse): - local_dims[j - 1] = local_dims[j] - # remove dims i from the array - for input_id in range(len(in_out)): - for j in range(id + 1, nd_collapse): - local_str[input_id][j - 1] = local_str[input_id][j] - nd_collapse -= 1 - id -= 1 # TODO: what is this? How this work? - - if verbose > 2: - print("after broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - for ipos in range(len(local_str)): - print(" local_str inputs", ipos, local_str[ipos]) - - nd_collapse_ = [1] * nd_orig - for ipos in range(len(local_str)): - # Can we collapse dims[i] and dims[i-1]? - strides = local_str[ipos] - for i in range(nd_collapse - 1, 0, -1): - if strides[i] * local_dims[i] != strides[i - 1]: - # The dims nd-1 are not strided again dimension nd - nd_collapse_[i] = 0 - - if verbose > 1: - print("nd_collapse_", nd_collapse_) - - nd_collapse2 = nd_collapse - for i in range(nd_collapse - 1, 0, -1): - if nd_collapse_[i] == 1: - # update the local dims. - local_dims[i - 1] *= local_dims[i] - for j in range(i + 1, nd_collapse): - local_dims[j - 1] = local_dims[j] - - # update the local stride. 
- for ipos in range(len(local_str)): - local_str[ipos][i - 1] = local_str[ipos][i] # set new strides - # remove stride i from the array - for j in range(i + 1, nd_collapse): - local_str[ipos][j - 1] = local_str[ipos][j] - - # update the new number of dim - nd_collapse2 -= 1 - nd_collapse = nd_collapse2 - - if nd_collapse == 1: - l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize - for ipos in range(len(local_str))] - if all(l): - nd_collapse = 0 - - if verbose: - print("end collapsing") - print(" nd_collapse", nd_collapse) - if verbose > 1: - print(" local_dims", local_dims) - for ipos in range(len(local_str)): - print(" local_str inputs", ipos, local_str[ipos]) - - return nd_collapse, (local_dims, local_str) - - -def reduction_collapses(inout, axis, verbose=0): - """ - This collapse dimensions that are not needed when computing - reduction. This is usefull as it lower the indexing computation - that is heavier on gpu then on cpu. - - This is a generic version. It collapse dimensions at any place in - the shape. - @param: inout: tuple(input, output) - @param: axis: None, interger, list of 1 interger - The axis over witch we will do reduction. - @return: (ndims, (input dims, input strides, input pattern), out strides) - after collapsing. - - :note: we suppose that we can always collapse the output dimensions. - """ - input = inout[0] - out = inout[1] - # Some quick check. It is faster then the full version. - if axis is None: - # The output size is always 1, so we don't care about this strides - if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']): - return 0, ((input.size,), (input.itemsize,), axis), (0,) - if input.ndim == 1: - assert axis == [0] or axis == 0 or axis is None - # not c contiguous as the first if should have catched it. - return 1, (input.shape, input.strides, axis), (0,) - - if not isinstance(axis, (list, tuple)): - local_axis = [axis] - else: - local_axis = list(axis) - - # This is needed for the computing of the output strides - assert axis is None or len(local_axis) == 1 - - local_dims = list(input.shape) - local_str = list(input.strides) - out_strides = list(out.strides) - - nd_orig = len(local_dims) - collapsable = [1] * nd_orig - nd_collapse = nd_orig - - if verbose > 2: - print("before broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - print(" local_str inputs", local_str) - print(" local_axis", local_axis) - - # Collapse dimension that are broadcast in all inputs. - # need to be done before contiguous collapse as it will break it. - # Update the dimensions and the strides - for id in range(nd_collapse): - if local_dims[id] == 1: - for j in range(id + 1, nd_collapse): - # remove dims i from the array - local_dims[j - 1] = local_dims[j] - # remove strides i from the array - local_str[j - 1] = local_str[j] - # remove output strides i from the array - if axis is not None: - out_strides[j - 2] = out_strides[j - 1] - if id in local_axis: - local_axis.remove(id) - for axis_pos in range(len(local_axis)): - if local_axis[axis_pos] > id: - local_axis[axis_pos] -= 1 - - nd_collapse -= 1 - id -= 1 # TODO: how this work? - - if verbose > 2: - print("after broadcast collapse") - print(" nd_collapse", nd_collapse) - print(" local_dims", local_dims) - print(" local_str inputs", local_str) - print(" local_axis", local_axis) - print(" out_strides", out_strides) - - nd_collapse_ = [1] * nd_orig - # Can we collapse dims[i] and dims[i-1]? 
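# Worked example (illustrative): summing a C-contiguous float32 array of
# shape (4, 5, 6) over axis=2 starts from local_dims = [4, 5, 6] and byte
# strides [120, 24, 4].  Dim 1 can be merged into dim 0 (24 * 5 == 120 and
# neither is the reduced axis), while dim 2 must stay separate because it is
# the axis being summed, so the reduction kernel ends up seeing a (20, 6)
# problem reduced over its last axis (the '01' pattern handled in sum()
# further down).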
- for i in range(nd_collapse - 1, 0, -1): - if (local_str[i] * local_dims[i] != local_str[i - 1]): - # The dims nd-1 are not strided again dimension nd - nd_collapse_[i] = 0 - elif (i in local_axis) != ((i - 1) in local_axis): - nd_collapse_[i] = 0 - - if verbose > 1: - print("nd_collapse_", nd_collapse_) - - nd_collapse2 = nd_collapse - for i in range(nd_collapse - 1, 0, -1): - if nd_collapse_[i] == 1: - # update the local dims. - local_dims[i - 1] *= local_dims[i] - # set new strides - local_str[i - 1] = local_str[i] - #remove the old dims and strides - for j in range(i + 1, nd_collapse): - local_dims[j - 1] = local_dims[j] - local_str[j - 1] = local_str[j] - if axis is not None: - out_strides[j - 2] = out_strides[j - 1] - - if i in local_axis: - local_axis.remove(i) - for axis_pos in range(len(local_axis)): - if local_axis[axis_pos] > i: - local_axis[axis_pos] -= 1 - - # update the new number of dim - nd_collapse2 -= 1 - - nd_collapse = nd_collapse2 - - if nd_collapse == 1: - if local_str[nd_collapse - 1] == input.itemsize: - nd_collapse = 0 - - if verbose: - print("end collapsing") - print(" nd_collapse", nd_collapse) - if verbose > 1: - print(" local_dims", local_dims) - print(" local_str inputs", local_str) - print(" local_axis", local_axis) - print(" out_strides", out_strides) - - #print input.shape, input.strides - #print nd_collapse, (local_dims, local_str, local_axis) - local_dims = local_dims[:nd_collapse] - local_str = local_str[:nd_collapse] - out_strides = out_strides[:nd_collapse] - return nd_collapse, (local_dims, local_str, local_axis), out_strides - - -def call_elemwise(fct, input_vals, block=None, grid=None, out=None, - out_shape=None, - strides=None): - """ Call an elemwise gpu function with gived inputs and block size. - - :param fct: The gpu function to call - :param input_vals: a list of inputs to pass to fct - :param block: int, the size of the block wanted - :param grid: int, the size of the grid wanted - :param out: Optional, the preallocated output. Must have the right shape - and dtype. - - :param out_shape: Optional, if provided, we will suppose that the output, - have this shape event if it is not true. - :param strides: Optional, if provided, we will use those strides for - the inputs and outputs. - - :note: param out_shape and strides are used for the collapsing of - dimensions. - """ - inp = input_vals[0] - - # Get the output and output shape to us - if out_shape is None and out is None: - out_shape = list(inp.shape) - for i in input_vals[1:]: - # dtype checked by pycuda before gpu call - for s_i in range(len(inp.shape)): - assert (inp.shape[s_i] == i.shape[s_i] - or inp.shape[s_i] == 1 - or i.shape[s_i] == 1) - out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i], - i.shape[s_i]) - if out is None: - out = gpu_ndarray.empty(out_shape, dtype=inp.dtype) - elif out_shape is None: - out_shape = out.shape - - # Arg: nb element - args = [cast_uint(out.size)] - # Arg: output shape to the arguments. - for i in range(len(out_shape)): - args.append(cast_int(out_shape[i])) - - # for each inputs and the output - # add its ptr and strides - nd = len(out_shape) - idx = 0 - for i in list(input_vals) + [out]: - itemsize = i.dtype.itemsize - args.append(i) - for j in range(nd): - # We force a stride of 0 for broadcastable dimensions - # This lower the index computation in the kernel. - if strides is not None: - # strides should have a strides of 0 for broadcasting. 
- args.append(cast_int(strides[idx][j] / itemsize)) - elif i.shape[j] == 1: - args.append(cast_int(0)) - else: - args.append(cast_int(i.strides[j] / itemsize)) - idx += 1 - out_size = out.size - # First use at least a full warp - if block is None: - block_ = min(32, out_size) - else: - block_ = block - # Next start adding multiprocessors - if grid is None: - grid_ = min(out_size / block_ + (out_size % block_ != 0), 60) - else: - grid_ = grid - # Next start adding more warps per multiprocessor - if block is None: - if block_ * grid_ < out_size: - block_ = min(out_size / grid_, 512) - - # We bypass the pycuda wrapper gpu function call. - # by calling directly the gpu function. - # This is faster and lower the overhead. - # Here is code that allow you to use the pycuda fct call. - # d = {"block":(block_,1,1), "grid":(grid_,1)} - # fct(*args, **d) - fct.set_block_shape(block_, 1, 1) # time_kernel - fct.param_set(*args) - fct.launch_grid(grid_, 1) - return out - - -class MyGpuNdArray(): - _compiled_fct = {} - - def __init__(self, gpu_nd_array): - #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject) - self.gpu_nd_array = gpu_nd_array - self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype) - - @staticmethod - def gen_fct(op, inputs, nd, nodename="TestNodeName", - collapse=True): - if _CL_MODE: - npy_ty = "typedef float npy_float32;\n" - else: - npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n" - - # Generate the gpu functions - nb_in = len(inputs) - fcts = [None] - for nd in range(1, nd + 1): # 1 to nd - out = op(*[TensorType(i.gpu_nd_array.dtype, - (False,) * nd)() for i in inputs]) - out_dtype = out.dtype - node = out.owner - elemwise_algo = ElemwiseAlgo(node.op.scalar_op) - - code = (CLUDA_PREAMBLE + - npy_ty + - elemwise_algo.c_src_kernel(node.inputs, - node.outputs, - nodename, nd, - static="")) - fct_name = "kernel_%s_%d" % (nodename, nd) - fct = compile_gpu_code(code, fct_name) - fcts.append(fct) - - # All inputs/outputs C contiguous case - code = (npy_ty + - CLUDA_PREAMBLE + - elemwise_algo.c_src_kernel_Ccontiguous( - node.inputs, node.outputs, nodename, static="")) - fct_name = "kernel_%s_Ccontiguous" % nodename - fcts[0] = compile_gpu_code(code, fct_name) - - def call_fct2(inputs, out=None): - " Do dimensions collapsing before call the gpu code " - assert len(inputs) == nb_in - # dtype checked by pycuda - # TODO: assert nb dim? - - inp = inputs[0] - - # Compute the output shape. - out_shape = list(inp.shape) - for i in inputs[1:]: - for s_i in range(len(inp.shape)): - assert (inp.shape[s_i] == i.shape[s_i] - or inp.shape[s_i] == 1 - or i.shape[s_i] == 1) - out_shape[s_i] = max(out_shape[s_i], i.shape[s_i]) - # Create the output object - if (out is None - or out.dtype != out_dtype - or out.shape != tuple(out_shape)): - out = MyGpuNdArray(gpu_ndarray.empty(out_shape, - dtype=out_dtype)) - - if collapse: - # Do the collapsing. 
- nd_col, info = elemwise_collapses(list(inputs), [out]) - # The two next line are usefull to force a call to the - # c contiguous version: - #nd_col = 0 - #info = [[],[]] - out = call_elemwise(fcts[nd_col], inputs, - out=out, out_shape=info[0][:nd_col], - strides=info[1]) - else: - out = call_elemwise(fcts[-1], inputs, out=out, - out_shape=out_shape) - return out - return call_fct2 - - def __elemwise2__(self, other, name, op): - """ Call this code on this op with 2 inputs """ - nd = len(self.gpu_nd_array.shape) # self.gpu_nd_array.ndim - assert nd == len(other.gpu_nd_array.shape) # ndim - tag = (name + '_' + str(self.gpu_nd_array.dtype) - + str(self.gpu_nd_array.ndim)) - tag += ('_' + str(other.gpu_nd_array.dtype) - + str(other.gpu_nd_array.ndim)) - fct = self._compiled_fct.get(tag, None) - if fct is None: -# print "compile", tag - fct = MyGpuNdArray.gen_fct(op, [self, other], nd) - self._compiled_fct[tag] = fct - return fct((self, other)) - - @classmethod - def __elemwise__(cls, inputs, name, op, out=None): - """ Call this code on this op with * inputs """ - nd = len(inputs[0].gpu_nd_array.shape) # self.gpu_nd_array.ndim - for i in inputs[1:]: - assert nd == len(i.gpu_nd_array.shape) # ndim - nb = len(inputs) - tag = name + "_".join([str(i.gpu_nd_array.dtype) + - str(i.gpu_nd_array.ndim) for i in inputs]) - fct = cls._compiled_fct.get(tag, None) - if fct is None: -# print "compile", tag - fct = MyGpuNdArray.gen_fct(op, inputs, nd) - cls._compiled_fct[tag] = fct - return fct(inputs, out=out) - - base = property(lambda self: self.gpu_nd_array.base) - bytes = property(lambda self: self.gpu_nd_array.bytes) - dtype = property(lambda self: self.gpu_nd_array.dtype) - flags = property(lambda self: self.gpu_nd_array.flags) - itemsize = property(lambda self: self.gpu_nd_array.itemsize) - ndim = property(lambda self: self.gpu_nd_array.ndim, - doc="number of dimensions") - offset = property(lambda self: self.gpu_nd_array.offset) - shape = property(lambda self: self.gpu_nd_array.shape) - size = property(lambda self: self.gpu_nd_array.size) - strides = property(lambda self: self.gpu_nd_array.strides) - - def __array__(self): - return numpy.asarray(self.gpu_nd_array) - - def copy(self): - return MyGpuNdArray(self.gpu_nd_array.copy()) - - def view(self): - return MyGpuNdArray(self.gpu_nd_array.view()) - - def __copy__(self): - return MyGpuNdArray(self.gpu_nd_array.__copy__()) - - def __deepcopy__(self): - return MyGpuNdArray(self.gpu_nd_array.__deepcopy__()) - - @property - def gpudata(self): - # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes - # attributes. Without this assert old code that don't support - # strides can receive as input object that are strided and no - # error will be gived - - #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or - # self.gpu_nd_array.flags['F_CONTIGUOUS']) - - # TODO: find a way to pass to a pycuda/pyopencl function the - # bytes + offset directly. 
- return self.bytes + self.offset - - def __getitem__(self, *inputs): - return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs)) - - def __add__(self, other): - return self.__elemwise2__(other, "add", theano.tensor.add) - - def __sub__(self, other): - return self.__elemwise2__(other, "sub", theano.tensor.sub) - - def __mul__(self, other): - return self.__elemwise2__(other, "mul", theano.tensor.mul) - - def __div__(self, other): - assert (str(self.gpu_nd_array.dtype).startswith("float") or - str(other.gpu_nd_array.dtype).startswith("float")) - return self.__elemwise2__(other, "true_div", theano.tensor.true_div) - - @classmethod - def add(cls, x, y, out=None): - """ add all inputs togethers element-wise """ - return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out) - - @classmethod - def adds(cls, *inputs): - """ add all inputs togethers element-wise """ - return cls.__elemwise__(inputs, "add", theano.tensor.add) - - @classmethod - def multiplys(cls, *inputs): - """ multiply all inputs togethers element-wise """ - return cls.__elemwise__(inputs, "mul", theano.tensor.mul) - - def sum(self, axis=None, collapse=True): - from . import gen_reduction - max_thread_per_block = 512 - max_block = 4096 - if isinstance(axis, (list, tuple)): - if len(axis) == 1: - axis = axis[0] - else: - assert len(axis) == self.ndim - axis.sort() - assert axis == list(range(self.ndim)) - axis = None - - # TODO: Why this? - if self.size == 0: - make_out = gpu_ndarray.zeros - else: - make_out = gpu_ndarray.empty - - if axis is None: - out = make_out((), self.dtype) - out = MyGpuNdArray(out) - else: - out_shape = [self.shape[i] for i in range(self.ndim) - if i != axis] - out = make_out(out_shape, self.dtype) - out = MyGpuNdArray(out) - - if self.size == 0: - return out - - args_set = False - - if collapse: - coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = ( - reduction_collapses([self, out], axis)) - else: - coll_ndim = self.ndim - coll_shape = self.shape - coll_strides = self.strides - coll_axis = [axis] - coll_out_str = out.strides - - if axis is not None: - coll_axis = coll_axis[0] - - args_set = False - - if coll_ndim == 0: - sum_op = gen_reduction.GpuSum([1], self.dtype) - c_code = sum_op.c_support_code_apply("nodename", contig=True) - fctname = "kernel_reduce_sum_ccontig_nodename" - fct = compile_gpu_code(c_code, fctname) - block_ = min(coll_shape[0], max_thread_per_block) - block = (block_, 1, 1) - - grid = (1, 1) - shared_ = self.dtype.itemsize * block_ - args = [cast_int(coll_shape[0]), self, out] - args_set = True - elif axis is None: - pattern = [1] * coll_ndim - str_pattern = [str(i) for i in pattern] - sum_op = gen_reduction.GpuSum(pattern, self.dtype) - c_code = sum_op.c_support_code_apply("nodename") - if not c_code: - raise NotImplementedError( - "GpuNdArray sum case not implemented") - fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename" - fct = compile_gpu_code(c_code, fctname) - if coll_ndim == 1: - bx = min(max_thread_per_block, coll_shape[0]) - block = (bx, 1, 1) - block_ = bx - elif coll_ndim == 2: - bx = min(max_thread_per_block, coll_shape[1]) - by = min(max_thread_per_block // coll_shape[1], coll_shape[0]) - by = max(by, 1) - block = (bx, by, 1) - block_ = bx * by - elif coll_ndim == 3: - bx = min(max_thread_per_block, coll_shape[2]) - by = min(max_thread_per_block // bx, coll_shape[1]) - bz = min(max_thread_per_block // (bx * by), coll_shape[0]) - by = max(by, 1) - bz = min(max(bz, 1), 64) - block = (bx, by, bz) - block_ = bx * by * bz - elif coll_ndim 
== 4: - bx = min(max_thread_per_block, coll_shape[3]) - by = min(max_thread_per_block // bx, coll_shape[2]) - bz = min(max_thread_per_block // (bx * by), coll_shape[1]) - by = max(by, 1) - bz = min(max(bz, 1), 64) - block = (bx, by, bz) - block_ = bx * by * bz - grid = (1, 1) - shared_ = self.dtype.itemsize * block_ - elif coll_ndim in [1, 2, 3]: - if coll_ndim == 1: - assert coll_axis == 0 - # pattern 1 - sum_op = gen_reduction.GpuSum([1], self.dtype) - fctname = "kernel_reduce_sum_1_nodename" - - grid = (1, 1) - - block_ = min(max_thread_per_block, coll_shape[0]) - block = (block_, 1, 1) - elif coll_ndim == 3 and coll_axis == 0: - # pattern 100 - sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype) - fctname = "kernel_reduce_sum_100_nodename" - - gx = min(coll_shape[1], max_block) - gy = min(max_block // (gx * coll_shape[2]), coll_shape[2]) - gy = max(gy, 1) - grid = (gx, gy) - - block_ = min(max_thread_per_block, coll_shape[0]) - block = (block_, 1, 1) - elif coll_ndim == 3 and coll_axis == 1: - # pattern 010 - sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype) - fctname = "kernel_reduce_sum_010_AD_nodename" - - A = coll_shape[0] - B = coll_shape[1] - C = coll_shape[2] - D = C / 32 - if (32 * D < C): - D += 1 - assert ((C <= 32 * D) and (32 * D < C + 32)) - shared_ = 0 - - gx = min(A, max_block) - gy = min(max_block // (D * A), D) - gy = max(gy, 1) - grid = (gx, gy) - - block = (32, 1, 1) - block_ = 32 - - args_set = True - # input shape - args = [cast_int(A), cast_int(B), - cast_int(C), cast_int(D)] - # input - args.append(self) - # input strides - args += [cast_int(i / self.dtype.itemsize) - for i in coll_strides] - # output - args.append(out) - # output strides - args.append(cast_int(coll_out_str[0] / out.dtype.itemsize)) - args.append(cast_int(coll_out_str[1] / out.dtype.itemsize)) - elif coll_ndim == 3 and coll_axis == 2: - # pattern 001 - sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype) - fctname = "kernel_reduce_sum_001_nodename" - - gx = min(coll_shape[0], max_block) - gy = min(max_block // (gx * coll_shape[1]), coll_shape[1]) - gy = max(gy, 1) - grid = (gx, gy) - - block_ = min(max_thread_per_block, coll_shape[2]) - block = (block_, 1, 1) - elif coll_axis == 0: - # pattern 10 - sum_op = gen_reduction.GpuSum([1, 0], self.dtype) - fctname = "kernel_reduce_sum_010_nodename" - block_ = min(coll_shape[1], max_thread_per_block) - block = (block_, 1, 1) - grid = (1, coll_shape[0]) - args_set = True - # input shape - args = [cast_int(1)] - args += [cast_int(i) for i in coll_shape] - # input - args.append(self) - # input strides - args.append(cast_int(1)) - args += [cast_int(i / self.dtype.itemsize) - for i in coll_strides] - # output - args.append(out) - # output strides - args.append(cast_int(1)) - # We must take the last dimensions in the case of - # dimensions collapsing. 
- args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize)) - elif coll_axis == 1: - # pattern 01 - sum_op = gen_reduction.GpuSum([0, 1], self.dtype) - fctname = "kernel_reduce_sum_01_nodename" - block_ = min(coll_shape[1], max_thread_per_block) - block = (block_, 1, 1) - grid = (1, min(coll_shape[0], max_block)) - else: - raise Exception("Bad axis") - - c_code = sum_op.c_support_code_apply("nodename") - fct = compile_gpu_code(c_code, fctname) - - shared_ = self.dtype.itemsize * block_ - else: - raise Exception("Not implemented") - - if not args_set: - # input shape - args = [cast_int(i) for i in coll_shape] - # input - args.append(self) - # input strides - args += [cast_int(i / self.dtype.itemsize) - for i in coll_strides] - # output - args.append(out) - # output strides - args += [cast_int(i / self.dtype.itemsize) - for i in coll_out_str] - - pycuda._driver.Context.synchronize() - #print fctname, block, grid, shared_, axis - #print self.ndim, self.shape, self.strides, axis, out.strides - #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str - #print args - - if False: - d = {"block": block, - "shared": shared_, - "grid": grid} - fct(*args, **d) - else: - # We bypass the pycuda wrapper gpu function call. - # by calling directly the gpu function. - # This is faster and lower the overhead. - fct.set_block_shape(*block) - fct.set_shared_size(shared_) - fct.param_set(*args) - fct.launch_grid(*grid) - return out diff --git a/ndarray/gen_reduction.py b/ndarray/gen_reduction.py deleted file mode 100644 index d89d69d..0000000 --- a/ndarray/gen_reduction.py +++ /dev/null @@ -1,1515 +0,0 @@ -import numpy -import StringIO - - -_CL_MODE = False # "pyopencl" in __name__ - - -if _CL_MODE: - # THIS IS NOT FINISHED - import pyopencl as cl - import pyopencl.array as cl_array - - # import pyopencl._mymako as mako - from pyopencl._cluda import CLUDA_PREAMBLE - from pyopencl.tools import dtype_to_ctype - - # TODO: use mako to get rid of the %if - CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455] - CLUDA_PREAMBLE += """ -#define LDIM_0 get_local_id(0) -#define LDIM_1 get_local_id(1) -#define LDIM_2 get_local_id(2) - -#define GDIM_0 get_global_id(0) -#define GDIM_1 get_global_id(1) -#define GDIM_2 get_global_id(2) - """ - # TODO, reuse the same context as the use used to create the memory. - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) -else: - import pycuda.autoinit - import pycuda.driver as driver - - # import pycuda._mymako as mako - from pycuda._cluda import CLUDA_PREAMBLE - from pycuda.compiler import SourceModule - from pycuda.tools import dtype_to_ctype - CLUDA_PREAMBLE += """ -#define LDIM_0 blockDim.x -#define LDIM_1 blockDim.y -#define LDIM_2 blockDim.z - -#define GDIM_0 gridDim.x -#define GDIM_1 gridDim.y -#define GDIM_2 gridDim.z - """ - -import logging - -import theano -from theano import Apply, scalar -from theano.sandbox.cuda import CudaNdarrayType -from theano.tensor import TensorType - - -_logger_name = 'compyte.gen_reduction' -_logger = logging.getLogger(_logger_name) -_logger.setLevel(logging.INFO) -_logger.addHandler(logging.StreamHandler()) # TO REMOVE - - -def warning(*msg): - _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg)) - - -def info(*msg): - _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg)) - - -def debug(*msg): - _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg)) - - -import pygpu_ndarray as gpu_ndarray - - -class GpuSum: - """GpuSum is a Reduction along some dimensions by summation. 
- - The dimensions along which to sum is specified by the - `reduce_mask` that you pass to the constructor. The `reduce_mask` - is a tuple of booleans (actually integers 0 or 1) that specify for - each input dimension, whether to reduce it (1) or not (0). - - For example: - - - reduce_mask == (1,) sums a vector to a scalar - - - reduce_mask == (1,0) computes the sum of each column in a matrix - - - reduce_mask == (0,1) computes the sum of each row in a matrix - - - reduce_mask == (1,1,1) computes the sum of all elements in a - 3-tensor. - - :note: any reduce_mask of all zeros is a sort of 'copy', and may - be removed during graph optimization - - """ - def __init__(self, reduce_mask, dtype): - self.reduce_mask = tuple(reduce_mask) - # input, output and accumulator dtype - self.dtype = dtype_to_ctype(dtype) - - def __eq__(self, other): - return (type(self) == type(other) and - self.reduce_mask == other.reduce_mask) - - def __hash__(self): - return hash(type(self)) ^ hash(self.reduce_mask) - - def __str__(self): - return "GpuSum{%s}" % ','.join(str(i) for i in self.reduce_mask) - - def make_node(self, x): - if (x.type.ndim != len(self.reduce_mask)): - raise TypeError("x must have rank %i" % len(self.reduce_mask)) - o_broadcast = [x.type.broadcastable[i] - for i in range(x.type.ndim) if not self.reduce_mask[i]] - return Apply(self, [x], [CudaNdarrayType(o_broadcast)()]) - - def perform(self, node, inp, out): - x, = inp - z, = out - z[0] = x.reduce_sum(self.reduce_mask) - - def c_code(self, node, name, inp, out, sub): - x, = inp - z, = out - - nd_in = node.inputs[0].type.ndim - nd_out = node.outputs[0].type.ndim - - assert nd_in - nd_out == sum(self.reduce_mask) - - sio = StringIO.StringIO() - fail = sub['fail'] - - #check input - print(""" - if (%(x)s->nd != %(nd_in)s) - { - PyErr_Format(PyExc_TypeError, - "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd); - %(fail)s; - } - """ % locals(), file=sio) - - # - # alloc an output if we need one - # - - # check the basics of out output - print(""" - if ( !%(z)s - || (%(z)s->nd != %(nd_out)s) - """ % locals(), file=sio) - - #ensure that the output has the right non-reduced dimensions - j = 0 - for i in range(nd_in): - if not self.reduce_mask[i]: - print((" || (CudaNdarray_HOST_DIMS(%(z)s)[%(j)s] !=" - "CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]) " % - locals()), file=sio) - j += 1 - - print(""" - ) - { - """ % locals(), file=sio) - print("int new_dims[%(nd_out)s]; " % locals(), file=sio) - - j = 0 - for i in range(nd_in): - if not self.reduce_mask[i]: - print(('new_dims[%(j)s] = CudaNdarray_HOST_DIMS' - '(%(x)s)[%(i)s];' % locals()), file=sio) - j += 1 - - print(""" - Py_XDECREF(%(z)s); - %(z)s = (CudaNdarray*) CudaNdarray_NewDims(%(nd_out)s, new_dims); - if (NULL == %(z)s) - { - PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); - %(fail)s; - } - } - """ % locals(), file=sio) - - # \begin bracket the reduction in a check that there is - # actually work to do - print(""" - if (CudaNdarray_SIZE(%(z)s)) - { - """ % locals(), file=sio) - - # - # Now perform the reduction - # - - if all(i == 1 for i in self.reduce_mask): - #check if the tensor is ccontiguous, if true, use the - #c_c0de_reduce_ccontig code. - #TODO: check if we are ccontiguous when we un-dimshuffle - #TODO: if only some dims are ccontiguous, call version - # with less dims. 
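A note on the reduce_mask convention documented in the docstring above: it maps directly onto NumPy's axis argument, with a 1 in position i meaning "sum over dimension i". The following NumPy sketch is illustrative only; reduce_mask_to_axes is a made-up helper and not part of the deleted file.

    import numpy as np

    def reduce_mask_to_axes(reduce_mask):
        # A 1 in position i means "reduce (sum) over dimension i".
        return tuple(i for i, m in enumerate(reduce_mask) if m)

    x = np.arange(12, dtype=np.float32).reshape(3, 4)

    # reduce_mask == (1, 0): sum each column -> shape (4,)
    print(np.allclose(x.sum(axis=reduce_mask_to_axes((1, 0))), x.sum(axis=0)))
    # reduce_mask == (0, 1): sum each row -> shape (3,)
    print(np.allclose(x.sum(axis=reduce_mask_to_axes((0, 1))), x.sum(axis=1)))
    # reduce_mask == (1, 1): sum everything -> a scalar
    print(x.sum(axis=reduce_mask_to_axes((1, 1))) == x.sum())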
- - print('if(CudaNdarray_is_c_contiguous(%(x)s)){' % locals(), file=sio) - self.c_code_reduce_ccontig(sio, node, name, x, z, fail) - print("}else{", file=sio) - getattr(self, 'c_code_reduce_%s' % (''.join( - str(i) for i in self.reduce_mask)))(sio, node, name, - x, z, fail) - print("}", file=sio) - else: - getattr(self, 'c_code_reduce_%s' % (''.join( - str(i) for i in self.reduce_mask)))(sio, node, name, - x, z, fail) - - # \end bracket the reduction ... - print(""" - } - """ % locals(), file=sio) - - return sio.getvalue() - - def _makecall(self, node, name, x, z, fail, pattern=None): - """Return a string for making a kernel call. - - The return value looks something like: - - .. code-block:: c - - if (verbose) - printf("running kernel_reduce_sum_10_%(name)s\\n"); - int n_shared = sizeof(%(dtype)s) * n_threads.x; - kernel_reduce_sum_10_%(name)s<<>>( - CudaNdarray_HOST_DIMS(%(x)s)[0], - CudaNdarray_HOST_DIMS(%(x)s)[1], - CudaNdarray_DEV_DATA(%(x)s), - CudaNdarray_HOST_STRIDES(%(x)s)[0], - CudaNdarray_HOST_STRIDES(%(x)s)[1], - CudaNdarray_DEV_DATA(%(z)s), - CudaNdarray_HOST_STRIDES(%(z)s)[0] - ); - CNDA_THREAD_SYNC; - if (cudaSuccess != cudaGetLastError()) - { - PyErr_Format(PyExc_RuntimeError, "Cuda error: ... ); - %(fail)s; - } - """ - sio = StringIO.StringIO() - if pattern is None: - pattern = ''.join(str(c) for c in self.reduce_mask) - ndim = len(self.reduce_mask) - nd_out = ndim - sum(self.reduce_mask) - print(""" - if (verbose) - printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); - int n_shared = sizeof(%(dtype)s) * n_threads.x * - n_threads.y * n_threads.z; - if (verbose>1) - printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d," - " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d," - " nb_block=%%d, n_shared=%%d\\n", - n_threads.x,n_threads.y,n_threads.z, - n_threads.x*n_threads.y*n_threads.z, - n_blocks.x,n_blocks.y, - n_blocks.x*n_blocks.y, n_shared); - kernel_reduce_sum_%(pattern)s_%(name)s<<>>( - """ % locals(), file=sio) - for i in range(ndim): - print(""" - CudaNdarray_HOST_DIMS(%(x)s)[%(i)s], - """ % locals(), file=sio) - print(""" - CudaNdarray_DEV_DATA(%(x)s) - """ % locals(), file=sio) - for i in range(ndim): - print(""" - ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s] - """ % locals(), file=sio) - print(""" - ,CudaNdarray_DEV_DATA(%(z)s) - """ % locals(), file=sio) - for i in range(nd_out): - print(""" - ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] - """ % locals(), file=sio) - print(""" - ); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_%(pattern)s_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - """ % locals(), file=sio) - return sio.getvalue() - - def _k_decl(self, nodename, - pattern=None, ndim=None, reduce_mask=None): - """Return a string to declare a kernel function - - .. 
code-block:: c - - __global__ void kernel_reduce_sum_110_%(nodename)s( - const int d0, - const int d1, - const int d2, - const %(dtype)s *A, - const int sA0, - const int sA1, - const int sA2, - %(dtype)s * Z, - const int sZ0) - - """ - dtype = self.dtype - if reduce_mask is None: - reduce_mask = self.reduce_mask - if ndim is None: - ndim = len(reduce_mask) - if pattern is None: - pattern = ''.join(str(i) for i in reduce_mask) - sio = StringIO.StringIO() - - print(""" - __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s( - """ % locals(), file=sio) - - for i in range(ndim): - print("""const int d%(i)s,""" % locals(), file=sio) - - print("""const %(dtype)s *A,""" % locals(), file=sio) - - for i in range(ndim): - print("""const int sA%(i)s,""" % locals(), file=sio) - - print("""%(dtype)s * Z""" % locals(), file=sio) - - for i in range(ndim - sum(reduce_mask)): - print(""", const int sZ%(i)s""" % locals(), file=sio) - - print(")", file=sio) - - return sio.getvalue() - - def _k_init(self, *args): - dtype = self.dtype - return """ - const int threadCount = blockDim.x * blockDim.y * blockDim.z; - const int threadNum = threadIdx.z * blockDim.x * blockDim.y - + threadIdx.y * blockDim.x + threadIdx.x; - extern __shared__ %(dtype)s buf[]; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32){ //TODO: set error code - Z[0] = 666; - return; - } - - """ % locals() - - def _k_reduce_buf(self, z_pos): - return """ - __syncthreads(); // some kernel do multiple reduction. - buf[threadNum] = mysum; - __syncthreads(); - - // rest of function is handled by one warp - if (threadNum < warpSize) - { - //round up all the partial sums into the first `warpSize` elements - for (int i = threadNum + warpSize; i < threadCount; i += warpSize) - { - mysum += buf[i]; - } - buf[threadNum] = mysum; - if (threadNum < 16) - { - //reduce so that threadNum 0 has the sum of everything - if(threadNum + 16 < threadCount) - buf[threadNum] += buf[threadNum+16]; - if(threadNum + 8 < threadCount) - buf[threadNum] += buf[threadNum+8]; - if(threadNum + 4 < threadCount) - buf[threadNum] += buf[threadNum+4]; - if(threadNum + 2 < threadCount) - buf[threadNum] += buf[threadNum+2]; - if(threadNum + 1 < threadCount) - buf[threadNum] += buf[threadNum+1]; - if (threadNum == 0) - { - %(z_pos)s = buf[0]; - } - } - } - """ % locals() - return """ - __syncthreads(); // some kernel do multiple reduction. - buf[threadNum] = mysum; - __syncthreads(); - - // rest of function is handled by one warp - if (threadNum < warpSize) - { - //round up all the partial sums into the first `warpSize` elements - for (int i = threadNum + warpSize; i < threadCount; i += warpSize) - { - mysum += buf[i]; - } - buf[threadNum] = mysum; -/*Comment this optimization as it don't work on Fermi GPU. 
-TODO: find why it don't work or put the GPU compute capability into the version - // no sync because only one warp is running - if(threadCount >32) - { - buf[threadNum] += buf[threadNum+16]; - buf[threadNum] += buf[threadNum+8]; - buf[threadNum] += buf[threadNum+4]; - buf[threadNum] += buf[threadNum+2]; - buf[threadNum] += buf[threadNum+1]; - if (threadNum == 0) - { - %(z_pos)s = buf[0]; - } - - } - else */ - if (threadNum < 16) - { - //reduce so that threadNum 0 has the sum of everything - if(threadNum + 16 < threadCount) - buf[threadNum] += buf[threadNum+16]; - if(threadNum + 8 < threadCount) - buf[threadNum] += buf[threadNum+8]; - if(threadNum + 4 < threadCount) - buf[threadNum] += buf[threadNum+4]; - if(threadNum + 2 < threadCount) - buf[threadNum] += buf[threadNum+2]; - if(threadNum + 1 < threadCount) - buf[threadNum] += buf[threadNum+1]; - if (threadNum == 0) - { - %(z_pos)s = buf[0]; - } - } - } - """ % locals() - - # Threads must be organized as: threadNum%nb_reduce correspond to - # the same sum - # nb_reduce<=warpSize - def _k_reduce_buf_multiple(self, z_pos, nb_reduce): - return """ - __syncthreads(); // some kernel do multiple reduction. - buf[threadNum] = mysum; - __syncthreads(); - - // rest of function is handled by one warp - if (threadNum < %(nb_reduce)s) - { - //round up all the partial sums into the first `nb_reduce` elements - for (int i = threadNum + %(nb_reduce)s; - i < threadCount; i += %(nb_reduce)s) - { - mysum += buf[i]; - } - %(z_pos)s = mysum; - } - """ % locals() - - def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): - print(""" - { - if(CudaNdarray_SIZE(%(x)s)==0){ - cudaMemset(CudaNdarray_DEV_DATA(%(z)s),0,sizeof(%(dtype)s)); - }else{ - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_SIZE(%(x)s), - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(1); - if (verbose) - printf("running kernel_reduce_sum_ccontig_%(name)s" - " n_threads.x=%%d, size=%%d, ndim=%%d\\n", - n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd); - int n_shared = sizeof(%(dtype)s) * n_threads.x; - kernel_reduce_sum_ccontig_%(name)s<<>>( - CudaNdarray_SIZE(%(x)s), - CudaNdarray_DEV_DATA(%(x)s), - CudaNdarray_DEV_DATA(%(z)s)); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, - "Cuda error: %%s: %%s. 
(grid: %%i x %%i;" - " block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_ccontig_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - } - """ % locals(), file=sio) - - def c_code_reduce_1(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_11(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.y * n_threads.x <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - ++n_threads.y; - n_threads.y -= 1; - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) - n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[0]; - - dim3 n_blocks(1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): - """ - :param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011, - N=3 ->0111 Work for N=1,2,3 - """ - assert N in [1, 2, 3] - makecall = self._makecall(node, name, x, z, fail) - N_pattern = ''.join(['1'] * N) - param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals() - for i in range(N + 1)]) - strides_dim = ",".join( - ["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]" % locals() - for i in range(N + 1)]) - threads_y = """ - //get as many y threads as we can fit - while (n_threads.x * (n_threads.y+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-1]) - n_threads.y += 1; - else - break; - } -""" % locals() - threads_z = """ - //get as many z threads as we can fit - while (n_threads.x * n_threads.y * (n_threads.z+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.z < CudaNdarray_HOST_DIMS(%(x)s)[%(N)s-2]) - n_threads.z += 1; - else - break; - } -""" % locals() - if len(self.reduce_mask) == 2: - threads_y = '' - threads_z = '' - if len(self.reduce_mask) == 3: - threads_z = '' - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[%(N)s], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - %(threads_y)s - %(threads_z)s - dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS)); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_01(self, sio, node, name, x, z, fail): - self.c_code_reduce_01X(sio, node, name, x, z, fail, 1) - - def c_code_reduce_011(self, sio, node, name, x, z, fail): - self.c_code_reduce_01X(sio, node, name, x, z, fail, 2) - - def c_code_reduce_0111(self, sio, node, name, x, z, fail): - self.c_code_reduce_01X(sio, node, name, x, z, fail, 3) - - def c_code_reduce_10(self, sio, node, name, x, z, fail): - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(1, - std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - NUM_VECTOR_OP_BLOCKS)); - if (verbose) { - fprintf(stderr, - "running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n", - n_blocks.x, - n_blocks.y); - } - assert(CudaNdarray_HOST_DIMS(%(x)s)[1] == - CudaNdarray_HOST_DIMS(%(z)s)[0]); - int n_shared = sizeof(%(dtype)s) * n_threads.x; - kernel_reduce_sum_010_%(name)s<<>>( - 1, - CudaNdarray_HOST_DIMS(%(x)s)[0], - CudaNdarray_HOST_DIMS(%(x)s)[1], - 
CudaNdarray_DEV_DATA(%(x)s), - 1, - CudaNdarray_HOST_STRIDES(%(x)s)[0], - CudaNdarray_HOST_STRIDES(%(x)s)[1], - CudaNdarray_DEV_DATA(%(z)s), - 1, - CudaNdarray_HOST_STRIDES(%(z)s)[0] - ); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_010_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - """ % locals(), file=sio) - - def c_code_reduce_010(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - makecall_inner = self._makecall(node, name, x, z, - fail, pattern="010_inner") - pattern = ''.join(str(i) for i in self.reduce_mask) - print(""" - { - - // if the alternative is less buggy, consider not using this branch - if (1) - { - // If there are a lot of summations to do, then we can use - // simple parallelization - use each thread to do one sum. - // we might as well launch blocks of 32 threads because that's - // the warp size. we could schedule more threads if we were - // maxing out the gridsize below, but the gridsize is way more - // than the physical hardware and I think 32 threads - // on a huge grid is enough to fully use the hardware. - dim3 n_threads(32,1,1); - - // We kindof reshape the input implicitly to something 4D: - // the shape A,B,C -> A, B, D, E - // where C <= D*E < C+32 - // where E==32 - - int A = CudaNdarray_HOST_DIMS(%(x)s)[0]; - int B = CudaNdarray_HOST_DIMS(%(x)s)[1]; - int C = CudaNdarray_HOST_DIMS(%(x)s)[2]; - int D = C/32; - if (32*D < C) D+= 1; - assert ((C <= 32*D) && (32*D < C+32)); - - // The gridsize would ideally be (A, D). But we do the - // following logic to make sure we don't ask for a grid that - // is too big. - dim3 n_blocks(A,D); - if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) - n_blocks.x = NUM_VECTOR_OP_BLOCKS; - if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) - n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x; - int n_shared = 0; - kernel_reduce_sum_010_AD_%(name)s<<>>( - A,B,C,D, - CudaNdarray_DEV_DATA(%(x)s), - CudaNdarray_HOST_STRIDES(%(x)s)[0], - CudaNdarray_HOST_STRIDES(%(x)s)[1], - CudaNdarray_HOST_STRIDES(%(x)s)[2], - CudaNdarray_DEV_DATA(%(z)s), - CudaNdarray_HOST_STRIDES(%(z)s)[0], - CudaNdarray_HOST_STRIDES(%(z)s)[1] - ); - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. 
(grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_010_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - else - { - int verbose = 2; - - dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2])); - while((n_threads.x*(n_threads.y+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - && (n_threads.y1) - printf("n_block.x.1=%%d, n_block.x.2=%%d," - " n_block.y.1=%%d, n_block.y.2=%%d,\\n", - CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS, - ceil_intdiv(CudaNdarray_HOST_DIMS(%(x)s)[2], - (int)n_threads.x), - (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x)); - assert(n_threads.x<=32); - %(makecall_inner)s - }else{ - n_threads.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - (int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - n_blocks.x = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - (int)NUM_VECTOR_OP_BLOCKS); - n_blocks.y = std::min( - CudaNdarray_HOST_DIMS(%(x)s)[2], - (int)(NUM_VECTOR_OP_BLOCKS / n_blocks.x) - ); - %(makecall)s - } - CNDA_THREAD_SYNC; - cudaError_t sts = cudaGetLastError(); - if (cudaSuccess != sts) - { - PyErr_Format(PyExc_RuntimeError, -"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", - "kernel_reduce_sum_%(pattern)s_%(name)s", - cudaGetErrorString(sts), - n_blocks.x, - n_blocks.y, - n_threads.x, - n_threads.y, - n_threads.z); - %(fail)s; - } - } - } - """ % locals(), file=sio) - - def c_code_reduce_0101(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.x * n_threads.y <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) break; - n_threads.y += 1; - } - n_threads.y -= 1; - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], - CudaNdarray_HOST_DIMS(%(x)s)[2]); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_100(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - # use threadIdx.x for i0 - # use blockIdx.x for i1 - # use blockIdx.y for i2 - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); - while (n_blocks.x * (n_blocks.y+1) <= NUM_VECTOR_OP_BLOCKS - && n_blocks.y <= CudaNdarray_HOST_DIMS(%(x)s)[2]) - { - n_blocks.y += 1; - } - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_110(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.x*n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[0]) - break; - n_threads.y += 1; - } - n_threads.y -= 1; - - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[2]); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_001(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - dim3 n_blocks( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS)); - while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS) - { - if (n_blocks.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) - break; - n_blocks.y += 1; - } - 
n_blocks.y -= 1; - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_111(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - - //get as many y threads as we can fit - while (n_threads.x * n_threads.y <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) - break; - n_threads.y += 1; - } - n_threads.y -= 1; - - //get as many z threads as we can fit - while (n_threads.x * n_threads.y * n_threads.z <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) - break; - n_threads.z += 1; - } - n_threads.z -= 1; - - dim3 n_blocks(1,1,1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_0011(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - - dim3 n_blocks( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], - NUM_VECTOR_OP_BLOCKS)); - - while (n_blocks.x * n_blocks.y <= NUM_VECTOR_OP_BLOCKS && - n_blocks.y < CudaNdarray_HOST_DIMS(%(x)s)[1]) - { - n_blocks.y += 1; - } - - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - while (n_threads.x * n_threads.y <= NUM_VECTOR_OP_THREADS_PER_BLOCK - && n_threads.y < CudaNdarray_HOST_DIMS(%(x)s)[2] - && n_threads.x * n_threads.y * sizeof(%(dtype)s) <= - (15 * 1024 - 200)) - { - n_threads.y += 1; - } - - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_1111(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[2], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - - //get as many y threads as we can fit - while (n_threads.x * n_threads.y <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[1]) - break; - n_threads.y += 1; - } - n_threads.y -= 1; - - //get as many z threads as we can fit - while (n_threads.x * n_threads.y * n_threads.z <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - { - if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) - break; - n_threads.z += 1; - } - n_threads.z -= 1; - - dim3 n_blocks(1,1,1); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_reduce_1011(self, sio, node, name, x, z, fail): - makecall = self._makecall(node, name, x, z, fail) - print(""" - { - int verbose = 0; - dim3 n_threads( - std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], - NUM_VECTOR_OP_THREADS_PER_BLOCK)); - - while (n_threads.x * (n_threads.y+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - ++n_threads.y; - if (n_threads.y > CudaNdarray_HOST_DIMS(%(x)s)[2]) - n_threads.y = CudaNdarray_HOST_DIMS(%(x)s)[2]; - - while (n_threads.x * n_threads.y * (n_threads.z+1) <= - NUM_VECTOR_OP_THREADS_PER_BLOCK) - ++n_threads.z; - if (n_threads.z > 64) - n_threads.z = 64; - if (n_threads.z > CudaNdarray_HOST_DIMS(%(x)s)[0]) - n_threads.z = CudaNdarray_HOST_DIMS(%(x)s)[0]; - - dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); - %(makecall)s - } - """ % locals(), file=sio) - - def c_code_cache_version(self): - return (21,) - - def c_support_code_apply(self, nodename, contig=False): - sio = StringIO.StringIO() - nd_in = len(self.reduce_mask) - dtype = self.dtype - if contig: # all(i == 1 for i in self.reduce_mask): - #this kernel is ok for up to a few thousand elements, but - # it only runs on ONE multiprocessor - reducebuf = 
self._k_reduce_buf('Z[0]') - print(""" - __global__ void kernel_reduce_sum_ccontig_%(nodename)s( - const int d0, - const %(dtype)s *A, - %(dtype)s * Z) - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - extern __shared__ %(dtype)s buf[]; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) - { - mysum += A[i0]; - } - %(reducebuf)s - } - """ % locals(), file=sio) - if self.reduce_mask == (1,): - #this kernel is ok for up to a few thousand elements, but - # it only runs on ONE multiprocessor - reducebuf = self._k_reduce_buf('Z[0]') - decl = self._k_decl(nodename) - print(""" - %(decl)s - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - extern __shared__ %(dtype)s buf[]; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x) - { - %(dtype)s Ai = A[i0 * sA0]; - mysum += Ai; - } - %(reducebuf)s - } - """ % locals(), file=sio) - if self.reduce_mask == (1, 1): - #this kernel is ok for up to a few thousand elements, but - # it only runs on ONE multiprocessor - reducebuf = self._k_reduce_buf('Z[0]') - decl = self._k_decl(nodename) - init = self._k_init(nodename) - print(decl, file=sio) - print(" { ", file=sio) - print(init, file=sio) - print(""" - for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y) - { - for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) - { - %(dtype)s Ai = A[i0 * sA0 + i1 * sA1]; - mysum += Ai; - } - } - """ % locals(), file=sio) - print(reducebuf, file=sio) - print(" } ", file=sio) - - #01, 011, 0111 - if (0 == self.reduce_mask[0] and - all(self.reduce_mask[1:]) and nd_in in[2, 3, 4]): - # this kernel uses one block for each row. - # threads per block for each element per row. - - N_pattern = ''.join(['1'] * (nd_in - 1)) - if nd_in == 2: - for_i1 = "for(int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)" - for_i2 = "int i2=0, sA2=0;" - for_i3 = "int i3=0, sA3=0;" - if nd_in == 3: - for_i1 = "for(int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)" - for_i2 = "for(int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)" - for_i3 = "int i3=0, sA3=0;" - if nd_in == 4: - for_i1 = "for(int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)" - for_i2 = "for(int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)" - for_i3 = "for(int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" - - reducebuf = self._k_reduce_buf('Z[i0 * sZ0]') - param_dim = ",".join(["const int d%(i)s" % locals() - for i in range(nd_in)]) - param_strides = ",".join(["const int sA%(i)s" % locals() - for i in range(nd_in)]) - decl = self._k_decl(nodename) - init = self._k_init(nodename) - print(""" - %(decl)s{ - %(init)s - for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){ - mysum = 0; - %(for_i1)s{ - %(for_i2)s{ - %(for_i3)s{ - %(dtype)s Ai = A[i3 * sA3 + i2 * sA2 + - i1 * sA1 + i0 * sA0]; - mysum += Ai; - } - } - } - %(reducebuf)s - } - } - """ % locals(), file=sio) - if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0): - # this kernel uses one block for each column, - # threads per block for each element per column. - - #TODO: This kernel is pretty inefficient in terms of - # reading, because if A is c_contiguous (typical - # case) then each warp is accessing non-contigous - # memory (a segment of a column). 
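The kernels emitted above all share one accumulation shape: each thread walks the reduced dimension with a stride of blockDim, keeps a private mysum, and the per-thread partials are then folded through the shared buf buffer. A small pure-Python emulation of the strided-accumulation step (illustrative only; strided_partial_sums is not part of the deleted file) shows why the partials recover the full sum:

    import numpy as np

    def strided_partial_sums(a, block_dim):
        # Emulates: for (i = threadIdx.x; i < d0; i += blockDim.x) mysum += A[i];
        # "Thread" t accumulates every block_dim-th element starting at t.
        return [a[t::block_dim].sum() for t in range(block_dim)]

    a = np.arange(1000, dtype=np.float64)
    partials = strided_partial_sums(a, block_dim=32)
    # Folding the per-thread partials (what the shared-memory reduction in
    # _k_reduce_buf does on the GPU) recovers the full reduction.
    assert np.isclose(sum(partials), a.sum())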
- reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]') - print(""" - __global__ void kernel_reduce_sum_010_%(nodename)s( - const int d0, - const int d1, - const int d2, - const %(dtype)s *A, const int sA0, - const int sA1, const int sA2, - %(dtype)s * Z, const int sZ0, const int sZ1) - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - extern __shared__ %(dtype)s buf[]; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - - for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x) - { - for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) - { - %(dtype)s mysum = 0.0f; - for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) - { - mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; - } - %(reducebuf)s - } - } - - } - """ % locals(), file=sio) - if self.reduce_mask == (0, 1, 0): - print(""" - __global__ void kernel_reduce_sum_010_AD_%(nodename)s( - const int A, - const int B, - const int C, - const int D, - //const int E, // THIS is 32 - const %(dtype)s *X, const int sX0, - const int sX1, const int sX2, - %(dtype)s * Z, const int sZ0, const int sZ1) - { - const int threadCount = blockDim.x; - const int threadNum = threadIdx.x; - %(dtype)s mysum = 0.0f; - - if (warpSize != 32) - { - return; //TODO: set error code - } - - for (int a = blockIdx.x; a < A; a += gridDim.x) - { - for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y) - { - int c = i2_D * 32 + threadIdx.x; - if (c < C) - { - mysum = 0; - for (int b = 0; b < B; ++b) - { - mysum += X[a * sX0 + b * sX1 + c * sX2]; - } - Z[a * sZ0 + c * sZ1] = mysum; - } - } - } - - } - """ % locals(), file=sio) - if self.reduce_mask == (0, 1, 0): - # - # This kernel is optimized when the inner most dimensions - # have the smallest stride. - - # this kernel uses one block for multiple column(up to 32TODO), - # threads per block for each element per column. - -#thread.x = dim 2 contiguous -#thread.y = dim 1 -#block.x = dim 0 -#block.y = dim 1 rest - init = self._k_init(nodename) - decl = self._k_decl(nodename, pattern="010_inner") - reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', - 'blockDim.x') - reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', - 'blockDim.x') - print(""" - %(decl)s - { - if(warpSize -//#include - -#include "pygpu_ndarray_object.h" - -///////////////////////// -// Alloc and Free -///////////////////////// -//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. -#define COMPUTE_GPU_MEM_USED 0 -#define VERBOSE_ALLOC_FREE 0 -//If true, we fill with NAN allocated device memory. -#define ALLOC_MEMSET 0 - -static int _outstanding_mallocs[] = {0,0}; - -#ifdef DEBUG -#define DPRINTF(args...) fprintf(stderr, args) -#else -#define DPRINTF(...) -#endif - -#if COMPUTE_GPU_MEM_USED -int _allocated_size = 0; -const int TABLE_SIZE = 10000; -struct table_struct{ - void* ptr; - int size; -}; -table_struct _alloc_size_table[TABLE_SIZE]; -#endif - -/** - * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage. - * - * device_malloc will set the Python error message before returning None. 
- * device_free will return nonzero on failure (after setting the python error message) - */ -void * device_malloc(size_t size); -int device_free(void * ptr); -static PyObject * -outstanding_mallocs(PyObject* self, PyObject * args) -{ - return PyInt_FromLong(_outstanding_mallocs[0]); -} - -int PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast = false); - -/** - * PyGpuNdArray_alloc_contiguous - * - * Allocate storage space for a tensor of rank 'nd' and given dimensions. - * - * Note: PyGpuNdArray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions - */ -template -int PyGpuNdArray_alloc_contiguous(PyGpuNdArrayObject *self, const int nd, const inttype * dim, NPY_ORDER order=NPY_CORDER) -{ - DPRINTF("PyGpuNdArray_alloc_contiguous: start nd=%i descr=%p\n", nd, self); - - if (!PyGpuNdArray_DESCR(self)){ - PyErr_SetString(PyExc_ValueError, - "PyGpuNdArray_alloc_contiguous: The array don't have a type! We can't allocate it!\n"); - return -1; - } - - // allocate an empty ndarray with c_contiguous access - // return 0 on success - int size = 1; //set up the strides for contiguous tensor - assert (nd >= 0); - if (PyGpuNdArray_set_nd(self, nd)) - { - return -1; - } - //TODO: check if by any chance our current dims are correct, - // and strides already contiguous - // in that case we can return right here. - DPRINTF("PyGpuNdArray_alloc_contiguous: before itemsize descr=%p elsize=%i\n", self->descr, self->descr->elsize); - int elsize = PyGpuNdArray_ITEMSIZE((PyObject*)self); - DPRINTF("PyGpuNdArray_alloc_contiguous: set_nd %d! elsize=%i\n", nd, elsize); - if(order != NPY_FORTRANORDER){ - DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_CORDER\n"); - for (int i = nd-1; i >= 0; --i){ - if (size == 0) - PyGpuNdArray_STRIDE(self, i) = elsize; - else - PyGpuNdArray_STRIDE(self,i) = size * elsize; - PyGpuNdArray_DIM(self,i) = dim[i]; - size = size * dim[i]; - } - }else if (nd>0){ - DPRINTF("PyGpuNdArray_alloc_contiguous: NPY_FORTRANORDER\n"); - size = dim[0]; - PyGpuNdArray_STRIDE(self, 0) = elsize; - PyGpuNdArray_DIM(self, nd-1) = dim[nd-1]; - for (int i = 1; i < nd; ++i){ - if (size == 0) - PyGpuNdArray_STRIDE(self, i) = elsize; - else - PyGpuNdArray_STRIDE(self, i) = PyGpuNdArray_STRIDE(self, i-1) * dim[i-1]; - PyGpuNdArray_DIM(self, nd-i-1) = dim[nd-i-1]; - size = size * dim[i]; - } - } - - if (self->data_allocated != size) - { - // If self is a view, do not try to free its memory - if (self->data_allocated && device_free(PyGpuNdArray_DATA(self))) { - // Does this ever happen?? Do we need to set data_allocated or devdata to 0? - PyGpuNdArray_DATA(self) = NULL; - self->data_allocated = 0; - return -1; - } - - assert(size>0); - DPRINTF("PyGpuNdArray_alloc_contiguous: will allocate for size=%d elements\n", size); - - PyGpuNdArray_DATA(self) = (char*)device_malloc(size * PyGpuNdArray_ITEMSIZE((PyObject *)self)); - if (!PyGpuNdArray_DATA(self)) - { - PyGpuNdArray_set_nd(self,-1); - self->data_allocated = 0; - PyGpuNdArray_DATA(self) = 0; - return -1; - } - - // The structure of self will be reused with newly allocated memory. - // If self was a view, we should remove the reference to its base. - // (If base was already NULL, the following has no effect.) 
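The C-order branch of PyGpuNdArray_alloc_contiguous above builds the strides by walking the dimensions from last to first and accumulating the element count. A compact Python sketch of that stride computation (c_contiguous_strides is illustrative, not part of the deleted sources) matches what NumPy reports for a freshly allocated C-contiguous array:

    import numpy as np

    def c_contiguous_strides(dims, elsize):
        # stride[i] = elsize * prod(dims[i+1:]), with the size-0 case
        # falling back to elsize, exactly as in the C loop above.
        strides = [0] * len(dims)
        size = 1
        for i in range(len(dims) - 1, -1, -1):
            strides[i] = elsize if size == 0 else size * elsize
            size *= dims[i]
        return tuple(strides)

    dims = (2, 3, 5)
    print(c_contiguous_strides(dims, 4))              # (60, 20, 4)
    print(np.empty(dims, dtype=np.float32).strides)   # (60, 20, 4)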
- Py_XDECREF(self->base); - self->base = NULL; - - self->data_allocated = size; - self->gpu_ndarray.flags = NPY_DEFAULT; - PyGpuNdArray_FLAGS(self) |= NPY_WRITEABLE; - PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; - if (nd == 0) { - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - if (order != NPY_FORTRANORDER) { - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - } - - }else if(nd == 1){//set c and f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else if(order != NPY_FORTRANORDER){//set c contiguous - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else{//set f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; - } - PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; - }else if(size == 0){ - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_OWNDATA; - if (nd == 0) { - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - if (order != NPY_FORTRANORDER) { - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - } - - }else if(nd == 1){//set c and f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else if(order != NPY_FORTRANORDER){//set c contiguous - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - }else{//set f contiguous - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; - } - PyGpuNdArray_FLAGS(self) &= ~NPY_UPDATEIFCOPY; - return 0; - }else{ - // How to check for the flags? Need to check if already contiguous. - PyErr_Format(PyExc_RuntimeError, - "PyGpuNdArray_alloc_contiguous: self->data_allocated=%d, size=%d, cmp=%d", - self->data_allocated, size, self->data_allocated != size - ); - return -1; - } - - if (order != NPY_FORTRANORDER) { - assert(PyGpuNdArray_is_c_contiguous(self)); - } else { - assert(PyGpuNdArray_is_f_contiguous(self)); - } - DPRINTF("PyGpuNdArray_alloc_contiguous: end\n"); - return 0; -} - -enum PyGpuTransfert { PyGpuHostToDevice, PyGpuDeviceToHost }; -int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, PyGpuTransfert direction); - -int PyGpuMemset(void * dst, int data, size_t bytes); -#endif diff --git a/ndarray/pygpu_language_cuda.cu b/ndarray/pygpu_language_cuda.cu deleted file mode 100644 index 1270086..0000000 --- a/ndarray/pygpu_language_cuda.cu +++ /dev/null @@ -1,622 +0,0 @@ -#include -#include - -#include - -#ifdef __DEVICE_EMULATION__ -#define NUM_VECTOR_OP_BLOCKS 4096 -#define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up -#else -#define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10) -#define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10) -#endif - -#if 0 -// Do not wait after every kernel & transfer. 
-#define CNDA_THREAD_SYNC -#else -// This is useful for using normal profiling tools -#define CNDA_THREAD_SYNC cudaThreadSynchronize(); -#endif - -#ifndef SHARED_SIZE -#define SHARED_SIZE (16*1024) -#endif - - -char * -cublasGetErrorString(cublasStatus err) -{ - if (err == CUBLAS_STATUS_NOT_INITIALIZED) { - return "CUBLAS_STATUS_NOT_INITIALIZED"; - } else if (err == CUBLAS_STATUS_ALLOC_FAILED){ - return "CUBLAS_STATUS_ALLOC_FAILED"; - } else if (err == CUBLAS_STATUS_INVALID_VALUE){ - return "CUBLAS_STATUS_INVALID_VALUE"; - } else if (err == CUBLAS_STATUS_MAPPING_ERROR){ - return "CUBLAS_STATUS_MAPPING_ERROR"; - } else if (err == CUBLAS_STATUS_EXECUTION_FAILED){ - return "CUBLAS_STATUS_EXECUTION_FAILED"; - } else if (err == CUBLAS_STATUS_INTERNAL_ERROR){ - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } else { - return "UNKNOW ERROR"; - } - -} - -///////////////////////// -// Alloc and Free -///////////////////////// -void * device_malloc(size_t size) -{ - void * rval=NULL; - cudaError_t err = cudaMalloc(&rval, size); - if (cudaSuccess != err){ -#if COMPUTE_GPU_MEM_USED - fprintf(stderr, "Error allocating %li bytes of device memory (%s). %d already allocated\n", - (long)size, cudaGetErrorString(err),_allocated_size); -#else - fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", - (long)size, cudaGetErrorString(err)); -#endif - PyErr_Format(PyExc_MemoryError, "Error allocating %li bytes of device memory (%s).", - (long)size, cudaGetErrorString(err)); - return NULL; - } - _outstanding_mallocs[0] += (rval != NULL); -#if COMPUTE_GPU_MEM_USED - for(int i=0;i __device__ T unary_copy(T a) { return a; } -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_float, unary_copy, float) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_double, unary_copy, double) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint8, unary_copy, uint8_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int8, unary_copy, int8_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint16, unary_copy, uint16_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int16, unary_copy, int16_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint32, unary_copy, uint32_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int32, unary_copy, int32_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_uint64, unary_copy, uint64_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_int64, unary_copy, int64_t) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex64, unary_copy, npy_complex64) -decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy_complex128, unary_copy, npy_complex128) - -//template __device__ T unary_exp(T a) { return exp(a); } -//decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp) - -template -static __global__ void k_copy_1d(const int N, const T * x, const ssize_t sx, T * y, const ssize_t sy) -{ - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x) { - y[i*sy] = x[i*sx]; - } -} - -//copy from other into self -//don't allocated memory -int -PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, PyGpuNdArrayObject * other, bool unbroadcast) -{ - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray start nd=%d\n", PyGpuNdArray_NDIM(self)); - assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); - assert(PyGpuNdArray_ISWRITEABLE(self)); - //standard elemwise size checks - if (PyGpuNdArray_NDIM(self) 
== -1) { - PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuNdArrayObject"); - return -1; - } - if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); - return -1; - } - //standard elemwise dim checks (also compute total size) - unsigned int size = 1; - unsigned int size_source = 1; - for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { - if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) - && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { - PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", - i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); - return -1; - } - size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; - size_source *= (unsigned int) PyGpuNdArray_DIMS(other)[i]; - } - if (0 == size) { - return 0; //nothing to copy, we're done. - } - - //cublas don't support negative stride - bool pos_stride = true; - for (int i = 0; i < PyGpuNdArray_NDIM(other); ++i) - if (PyGpuNdArray_STRIDE(other,i)<0) - pos_stride = false; - - void * other_data = PyGpuNdArray_DATA(other) + PyGpuNdArray_OFFSET(other); - void * self_data = PyGpuNdArray_DATA(self) + PyGpuNdArray_OFFSET(self); - - //Try to transfer with cublas(we suppose it is faster) - if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && - size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT32 && - pos_stride - ) { - cublasScopy(size, (float*) other_data, 1, (float*) self_data, 1); - CNDA_THREAD_SYNC; - if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { - PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); - return -1; - } - - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: cublasScopy end\n"); - return 0; - } - if (PyGpuNdArray_ISCONTIGUOUS(self) && PyGpuNdArray_ISCONTIGUOUS(other) && - size == size_source && PyGpuNdArray_TYPE(self) == NPY_FLOAT64 && - pos_stride) { - cublasDcopy(size, (double*) other_data, 1, (double*) self_data, 1); - CNDA_THREAD_SYNC; - if (CUBLAS_STATUS_SUCCESS != cublasGetError()) { - PyErr_SetString(PyExc_RuntimeError, "Error copying memory"); - return -1; - } - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray cublasDcopy end\n"); - return 0; - } - - //TODO: rewrite these copy operations to be more efficient - // See, for example the transpose example in the cuda_sdk. 
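For orientation on the generic copy branch below: the kernel is handed the logical dims plus the source and destination strides (in elements), and each thread turns a flat row-major index into a strided offset on both sides. The macro body of the copy kernels is not shown in this hunk, so the following Python sketch of the index arithmetic is a hedged illustration of the general technique, not the actual kernel:

    def rowmajor_offset(flat_index, dims, strides):
        # Decompose a flat C-order index into a multi-index, then dot it
        # with the (element-unit) strides to get the memory offset.
        offset = 0
        for d, s in zip(reversed(dims), reversed(strides)):
            offset += (flat_index % d) * s
            flat_index //= d
        return offset

    dims = (2, 3)
    # A C-contiguous layout has element strides (3, 1) ...
    assert [rowmajor_offset(i, dims, (3, 1)) for i in range(6)] == [0, 1, 2, 3, 4, 5]
    # ... while a transposed view of the same buffer has strides (1, 2).
    assert [rowmajor_offset(i, dims, (1, 2)) for i in range(6)] == [0, 2, 4, 1, 3, 5]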
- switch (PyGpuNdArray_NDIM(self)) { - case 0: // scalar - { - // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS - assert(0); - }; break; - case 1: // vector - { - assert(PyGpuNdArray_ISALIGNED(self)); - assert(PyGpuNdArray_ISALIGNED(other)); - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying non-contiguous vector\n"); - unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); - unsigned int n_threads = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - - if (PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { - const int elsize = sizeof(float); - k_copy_1d<<>>(size, - (float*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (float*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { - const int elsize = sizeof(double); - k_copy_1d<<>>(size, - (double*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (double*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT8) { - const int elsize = sizeof(int8_t); - k_copy_1d<<>>(size, - (int8_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int8_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT16) { - const int elsize = sizeof(int16_t); - k_copy_1d<<>>(size, - (int16_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int16_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT32) { - const int elsize = sizeof(int32_t); - k_copy_1d<<>>(size, - (int32_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int32_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_INT64) { - const int elsize = sizeof(int64_t); - k_copy_1d<<>>(size, - (int64_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (int64_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT8) { - const int elsize = sizeof(uint8_t); - k_copy_1d<<>>(size, - (uint8_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint8_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT16) { - const int elsize = sizeof(uint16_t); - k_copy_1d<<>>(size, - (uint16_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint16_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT32) { - const int elsize = sizeof(uint32_t); - k_copy_1d<<>>(size, - (uint32_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint32_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_UINT64) { - const int elsize = sizeof(uint64_t); - k_copy_1d<<>>(size, - (uint64_t*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (uint64_t*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { - const int elsize = sizeof(npy_complex64); - k_copy_1d<<>>(size, - (npy_complex64*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (npy_complex64*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else if (PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { - const int elsize = sizeof(npy_complex128); - k_copy_1d<<>>(size, - (npy_complex128*)other_data, - PyGpuNdArray_STRIDES(other)[0]/elsize, - (npy_complex128*)self_data, - PyGpuNdArray_STRIDES(self)[0]/elsize); - } else { - PyErr_Format(PyExc_NotImplementedError, 
"PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); - return -1; - } - - CNDA_THREAD_SYNC; - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_copy_1d", cudaGetErrorString(err), n_blocks, n_threads); - return -1; - } - }; break; - default: - { - assert (cudaSuccess == cudaGetLastError()); - assert(PyGpuNdArray_ISALIGNED(self)); - assert(PyGpuNdArray_ISALIGNED(other)); - - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray: Copying with default version unbroadcast=%d\n", unbroadcast); - // Identigy the dim of the output memory. - PyGpuNdArrayObject * cuda_dims = other; - if(unbroadcast) - cuda_dims = self; - - // Move the dim and strides information on the gpu memory - int ndim = PyGpuNdArray_NDIM(other); - void * strides_dev = device_malloc(sizeof(ssize_t)*ndim*3); - ssize_t * strides_dev_p = (ssize_t *) strides_dev; - cudaError_t err = cudaMemcpy(strides_dev, PyGpuNdArray_DIMS(cuda_dims), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory1: %s", cudaGetErrorString(err)); - return -1; - } - err = cudaMemcpy((void*)(strides_dev_p+ndim), PyGpuNdArray_STRIDES(other), ndim*sizeof(ssize_t),cudaMemcpyHostToDevice); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory2: %s", cudaGetErrorString(err)); - return -1; - } - err = cudaMemcpy((void*)(strides_dev_p+(ndim*2)), PyGpuNdArray_STRIDES(self), ndim*sizeof(ssize_t), cudaMemcpyHostToDevice); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory3: %s", cudaGetErrorString(err)); - return -1; - } - void * strides_host = malloc(sizeof(ssize_t)*ndim*3); - err = cudaMemcpy(strides_host, strides_dev, ndim*3*sizeof(ssize_t),cudaMemcpyDeviceToHost); - if (err != cudaSuccess){ - PyErr_Format(PyExc_RuntimeError, "Cuda error when copying memory4: %s", cudaGetErrorString(err)); - return -1; - } -#ifdef DEBUG - for(int i=0;i<3*ndim;i++) - DPRINTF(" %ld", ((ssize_t *)strides_host)[i]); - DPRINTF("\n"); -#endif - CNDA_THREAD_SYNC; - if(cudaSuccess != cudaGetLastError()){ - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: error before copy\n"); - return -1; - } - - // call worker routine - unsigned int n_blocks = min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); - unsigned int threads_per_block = min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); - - if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT32) { - k_elemwise_unary_rowmajor_copy_float<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const float*)other_data, - strides_dev_p+ndim, - (float*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_FLOAT64) { - k_elemwise_unary_rowmajor_copy_double<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const double*)other_data, - strides_dev_p+ndim, - (double*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_INT8) { - k_elemwise_unary_rowmajor_copy_int8<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int8_t*)other_data, - strides_dev_p+ndim, - (int8_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_INT16) { - k_elemwise_unary_rowmajor_copy_int16<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int16_t*)other_data, - strides_dev_p+ndim, - (int16_t*) self_data, - strides_dev_p+(ndim*2)); - 
} else if ( PyGpuNdArray_TYPE(self) == NPY_INT32) { - k_elemwise_unary_rowmajor_copy_int32<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int32_t*)other_data, - strides_dev_p+ndim, - (int32_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_INT64) { - k_elemwise_unary_rowmajor_copy_int64<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const int64_t*)other_data, - strides_dev_p+ndim, - (int64_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT8) { - k_elemwise_unary_rowmajor_copy_uint8<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint8_t*)other_data, - strides_dev_p+ndim, - (uint8_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT16) { - k_elemwise_unary_rowmajor_copy_uint16<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint16_t*)other_data, - strides_dev_p+ndim, - (uint16_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT32) { - k_elemwise_unary_rowmajor_copy_uint32<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint32_t*)other_data, - strides_dev_p+ndim, - (uint32_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_UINT64) { - k_elemwise_unary_rowmajor_copy_uint64<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const uint64_t*)other_data, - strides_dev_p+ndim, - (uint64_t*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX64) { - k_elemwise_unary_rowmajor_copy_complex64<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const npy_complex64*)other_data, - strides_dev_p+ndim, - (npy_complex64*) self_data, - strides_dev_p+(ndim*2)); - } else if ( PyGpuNdArray_TYPE(self) == NPY_COMPLEX128) { - k_elemwise_unary_rowmajor_copy_complex128<<>>( - size, - (unsigned int)ndim, - strides_dev_p, - (const npy_complex128*)other_data, - strides_dev_p+ndim, - (npy_complex128*) self_data, - strides_dev_p+(ndim*2)); - } else { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: Don't implement copy for this dtype\n"); - return -1; - } - CNDA_THREAD_SYNC; - err = cudaGetLastError(); - if( cudaSuccess != err) { - PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s. (n_blocks=%i, n_threads_per_block=%i)\n", "k_elemwise_unary_rowmajor_copy", cudaGetErrorString(err), n_blocks, threads_per_block); - return -1; - } - device_free(strides_dev); - free(strides_host); - } - }; - // Set flags - if (false && PyGpuNdArray_NDIM(self) == 0) { - //Numpy 1.4.1 is not consistent here - //When we create a new numpy ndarray of 0 dim, it is not f contiguous - //But when we take a subtensor that is of 0 dim, it is f contiguous! - //We make as them for now... 
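The flag bookkeeping that follows mirrors NumPy's own contiguity rules (0-d and 1-d arrays count as both C- and F-contiguous; a transposed 2-d array is F-contiguous only). As a quick orientation check with present-day NumPy, which no longer shows the 1.4.1 inconsistency mentioned in the comment above:

    import numpy as np

    x = np.zeros((3, 4), dtype=np.float32)
    print(x.flags['C_CONTIGUOUS'], x.flags['F_CONTIGUOUS'])      # True False
    print(x.T.flags['C_CONTIGUOUS'], x.T.flags['F_CONTIGUOUS'])  # False True

    v = np.zeros(5)                  # 1-d: both flags set
    print(v.flags['C_CONTIGUOUS'], v.flags['F_CONTIGUOUS'])      # True True

    z = np.zeros(())                 # 0-d: both flags set in current NumPy
    print(z.flags['C_CONTIGUOUS'], z.flags['F_CONTIGUOUS'])      # True True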
- PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - } else { - if (PyGpuNdArray_is_c_contiguous(self)) { - PyGpuNdArray_FLAGS(self) |= NPY_C_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) &= ~NPY_C_CONTIGUOUS; - } - if (PyGpuNdArray_is_f_contiguous(self)) { - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(self) &= ~NPY_F_CONTIGUOUS; - } - } - - DPRINTF("PyGpuNdArray_CopyFromPyGpuNdArray end\n"); - return 0; -} - -int PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, - PyGpuTransfert direction){ - DPRINTF("PyGpuMemcpy: start\n"); - cudaMemcpyKind dir; - const char * ssrc; - const char * ddst; - if (direction == PyGpuDeviceToHost){ - dir = cudaMemcpyDeviceToHost; - ssrc = (char*)src+dev_offset; - ddst = (char*)dst; - } else if (direction == PyGpuHostToDevice) { - dir = cudaMemcpyHostToDevice; - ssrc = (char*)src; - ddst = (char*)dst + dev_offset; - } else { - PyErr_Format(PyExc_ValueError, - "PyGpuMemcpy: Received wrong direction %d!\n", - direction); - return -1; - } - cudaError_t err = cudaMemcpy((void*)ddst, (void*)ssrc, bytes, dir); - CNDA_THREAD_SYNC; - if (cudaSuccess != err) { - PyErr_Format(PyExc_RuntimeError, "PyGpuMemcpy: cudaMemcpy: error copying data to host (%s)", - cudaGetErrorString(err)); - return -1; - } - DPRINTF("PyGpuMemcpy: end\n"); - return 0; -} - -int PyGpuMemset(void * dst, int data, size_t bytes){ - DPRINTF("PyGpuMemset: start\n"); - cudaError_t err = cudaMemset(dst, data, bytes); - CNDA_THREAD_SYNC; - if (cudaSuccess != err) { - PyErr_Format(PyExc_MemoryError, "PyGpuMemset: Error memsetting %ld bytes of device memory(%s). %p", - bytes, cudaGetErrorString(err), PyGpuNdArray_DATA(dst)); - DPRINTF("PyGpuMemset: end error\n"); - return -1; - } - DPRINTF("PyGpuMemset: end\n"); - return 0; -} - -/* - Local Variables: - mode:c++ - c-basic-offset:4 - c-file-style:"stroustrup" - c-file-offsets:((innamespace . 0)(inline-open . 
0)) - indent-tabs-mode:nil - fill-column:79 - End: -*/ -// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : diff --git a/ndarray/pygpu_language_opencl.cpp b/ndarray/pygpu_language_opencl.cpp deleted file mode 100644 index a237aff..0000000 --- a/ndarray/pygpu_language_opencl.cpp +++ /dev/null @@ -1,317 +0,0 @@ -#include -#include -#include - -#include -#include - -#ifdef __APPLE__ - -#include - -#else - -#include - -#endif - -cl_context ctx = NULL; -cl_device_id dev; -cl_command_queue q; - -void setup_context(cl_context c); - -static void -init_context(void) -{ - cl_int err; - cl_uint n; - cl_platform_id *plats; - cl_context_properties props[3]; - cl_context c; - - if (ctx != NULL) return; - - err = clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) return; - - plats = (cl_platform_id *)calloc(n, sizeof(cl_platform_id)); - if (plats == NULL) return; - - err = clGetPlatformIDs(n, plats, NULL); - if (err != CL_SUCCESS) goto fail_id; - - props[0] = CL_CONTEXT_PLATFORM; - props[1] = (cl_context_properties)plats[0]; - props[2] = 0; - - c = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); - if (err != CL_SUCCESS) { - fprintf(stderr, "Could not create context, will fail later (%d)!\n", err); - /* error - error - error */ - /* but we do nothing */ - goto fail_id; - } - - free(plats); - - setup_context(c); - clReleaseContext(c); - - return; - fail_id: - free(plats); -} - -void -setup_context(cl_context c) { - cl_int err; - cl_device_id *devs; - size_t sz; - - if (ctx != NULL) { - clReleaseContext(ctx); - clReleaseCommandQueue(q); - } - ctx = c; - clRetainContext(ctx); - - err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, &sz); - if (err != CL_SUCCESS) { - fprintf(stderr, "clGetContextInfo = %d\n", err); - goto fail; - } - - devs = (cl_device_id *)malloc(sz); - if (devs == NULL) goto fail; - - err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sz, devs, NULL); - if (err != CL_SUCCESS) goto fail_dev; - - dev = devs[0]; - free(devs); - - q = clCreateCommandQueue(ctx, dev, NULL, &err); - if (err != CL_SUCCESS) { - fprintf(stderr, "clCreateCommandQueue = %d", err); - goto fail; - } - - return; - fail_dev: - free(devs); - fail: - clReleaseContext(ctx); - ctx = NULL; -} - -void * -device_malloc(size_t size) -{ - cl_int err; - cl_mem res; - - init_context(); - - DPRINTF("malloc size = %zu\n", size); - - /* OpenCL devices do not always support byte-addressable storage - therefore make sure we have at least 4 bytes in buffers */ - if (size < 4) size = 4; - - res = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err); - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_MemoryError, "Could not allocate device memory (%d)", err); - return NULL; - } - - return res; -} - -int -device_free(void * ptr) -{ - cl_int err; - - if ((err = clReleaseMemObject((cl_mem)ptr)) != CL_SUCCESS) { - PyErr_Format(PyExc_MemoryError, "Could not free device memory (%d)", err); - return -1; - } - return 0; -} - -int -PyGpuNdArray_CopyFromPyGpuNdArray(PyGpuNdArrayObject * self, - PyGpuNdArrayObject * other, - bool unbroadcast) -{ - size_t size = 1; - cl_event ev; - cl_int err; - - assert(PyGpuNdArray_TYPE(self) == PyGpuNdArray_TYPE(other)); - assert(PyGpuNdArray_ISWRITEABLE(self)); - if (PyGpuNdArray_NDIM(self) == -1) { - PyErr_SetString(PyExc_TypeError, "can't copy into un-initialized PyGpuN\ -dArrayObject"); - return -1; - } - - if (!(PyGpuNdArray_ISONESEGMENT(self) && PyGpuNdArray_ISONESEGMENT(other))) { - PyErr_Format(PyExc_NotImplementedError, 
"PyGpuNdArray_CopyFromPyGpuNdArray: only contiguous arrays are supported"); - return -1; - } - - if ((PyGpuNdArray_ISCONTIGUOUS(self) != PyGpuNdArray_ISCONTIGUOUS(other)) || - (PyGpuNdArray_ISFORTRAN(self) != PyGpuNdArray_ISFORTRAN(other)) - ) { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: the input and output don't have the same c/f contiguous memory layout. This isnot supported now."); - return -1; - } - - if (PyGpuNdArray_NDIM(self) != PyGpuNdArray_NDIM(other)) { - PyErr_Format(PyExc_NotImplementedError, "PyGpuNdArray_CopyFromPyGpuNdArray: need same number of dims. destination nd=%d, source nd=%d. No broadcasting implemented.", PyGpuNdArray_NDIM(self), PyGpuNdArray_NDIM(other)); - return -1; - } - - for (int i = 0; i< PyGpuNdArray_NDIM(self); ++i) { - if ((PyGpuNdArray_DIMS(self)[i] != PyGpuNdArray_DIMS(other)[i]) - && (1!=PyGpuNdArray_DIMS(other)[i] || !unbroadcast) ) { - PyErr_Format(PyExc_ValueError, "need same dimensions for dim %d, destination=%ld, source=%ld", - i, PyGpuNdArray_DIMS(self)[i], PyGpuNdArray_DIMS(other)[i]); - return -1; - } - size *= (unsigned int) PyGpuNdArray_DIMS(self)[i]; - } - - if (0 == size) { - return 0; //nothing to copy, we're done. - } - size *= PyGpuNdArray_ITEMSIZE(self); - - if ((err = clEnqueueCopyBuffer(q, (cl_mem)PyGpuNdArray_DATA(other), - (cl_mem)PyGpuNdArray_DATA(self), - PyGpuNdArray_OFFSET(other), - PyGpuNdArray_OFFSET(self), - size, 0, NULL, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create copy command (%d)", err); - return -1; - } - if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not copy data (%d)", err); - clReleaseEvent(ev); - return -1; - } - clReleaseEvent(ev); - - return 0; -} - -int -PyGpuMemcpy(void * dst, const void * src, int dev_offset, size_t bytes, - PyGpuTransfert direction) -{ - cl_int err; - cl_event ev; - - switch (direction) - { - case PyGpuHostToDevice: - err = clEnqueueWriteBuffer(q, (cl_mem)dst, CL_FALSE, dev_offset, bytes, - src, 0, NULL, &ev); - break; - case PyGpuDeviceToHost: - err = clEnqueueReadBuffer(q, (cl_mem)src, CL_FALSE, dev_offset, bytes, - dst, 0, NULL, &ev); - break; - default: - PyErr_Format(PyExc_ValueError, "Unknown direction %d", direction); - return -1; - } - - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create memcpy command (%d)", err); - return -1; - } - - if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not memcpy data (%d)", err); - clReleaseEvent(ev); - return -1; - } - clReleaseEvent(ev); - - return 0; -} - -int -PyGpuMemset(void * dst, int data, size_t bytes) -{ - /* This should be at least one byte over the formatted string below */ - char local_kern[92]; - const char *rlk[1]; - size_t sz; - int r, res = -1; - - cl_int err; - cl_event ev; - cl_program p; - cl_kernel k; - - bytes = (bytes+3)/4; - - if (bytes == 0) - return 0; - - unsigned char val = (unsigned)data; - unsigned int pattern = (unsigned int)val & (unsigned int)val >> 8 & (unsigned int)val >> 16 & (unsigned int)val >> 24; - - r = snprintf(local_kern, sizeof(local_kern), - "__kernel void memset(__global unsigned int *mem) { mem[get_global_id(0)] = %u; }", pattern); - /* If this assert fires, increase the size of local_kern above. 
*/ - assert(r >= sizeof(local_kern)); - - - sz = strlen(local_kern); - rlk[0] = local_kern; - p = clCreateProgramWithSource(ctx, 1, rlk, &sz, &err); - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create program (%d)", err); - return -1; - } - - if ((err = clBuildProgram(p, 1, &dev, NULL, NULL, NULL)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not build program (%d)", err); - goto fail_prog; - } - - k = clCreateKernel(p, "memset", &err); - if (err != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not create kernel (%d)", err); - goto fail_prog; - } - - if ((err = clSetKernelArg(k, 0, sizeof(cl_mem), &dst)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not set kernel arg (%d)", err); - goto fail_kern; - } - - if ((err = clEnqueueNDRangeKernel(q, k, 1, NULL, &bytes, NULL, 0, NULL, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not enqueue kernel (%d)", err); - goto fail_kern; - } - - if ((err = clWaitForEvents(1, &ev)) != CL_SUCCESS) { - PyErr_Format(PyExc_RuntimeError, "Could not memset (%d)", err); - } - - /* success! */ - res = 0; - - clReleaseEvent(ev); - fail_kern: - clReleaseKernel(k); - fail_prog: - clReleaseProgram(p); - return res; -} diff --git a/ndarray/pygpu_ndarray.cpp b/ndarray/pygpu_ndarray.cpp deleted file mode 100644 index 31b60dc..0000000 --- a/ndarray/pygpu_ndarray.cpp +++ /dev/null @@ -1,1546 +0,0 @@ -#include -#include - -#include -#include - -#include "pygpu_ndarray.h" -#include "pygpu_language.h" - -///////////////////////// -// Static helper methods -///////////////////////// - -static void -PyGpuNdArray_null_init(PyGpuNdArrayObject *self) -{ - DPRINTF("PyGpuNdArrayObject_null_init\n"); - - PyGpuNdArray_DATA(self) = NULL; - PyGpuNdArray_OFFSET(self) = 0; - PyGpuNdArray_NDIM(self) = -1; - self->base = NULL; - PyGpuNdArray_DIMS(self) = NULL; - PyGpuNdArray_STRIDES(self) = NULL; - PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; - self->descr = NULL; - - self->data_allocated = 0; -} - - - -///////////////////////////// -// Satisfying reqs to be Type -///////////////////////////// - -//DON'T use directly(if their is other PyGpuNdArrayObject that point to it, it will cause problem)! use Py_DECREF() instead -static void -PyGpuNdArrayObject_dealloc(PyGpuNdArrayObject* self) -{ - DPRINTF("PyGpuNdArrayObject_dealloc\n"); - DPRINTF("PyGpuNdArrayObject dealloc %p %d %p\n", self, self->data_allocated, PyGpuNdArray_DATA(self)); - - if(self->ob_refcnt>1) - printf("WARNING:PyGpuNdArrayObject_dealloc called when their is still active reference to it.\n"); - - if (self->data_allocated){ - assert(PyGpuNdArray_DATA(self)); - if (PyGpuNdArray_DATA(self)){ - if (device_free(PyGpuNdArray_DATA(self))){ - fprintf(stderr, - "!!!! error freeing device memory %p (self=%p)\n", - PyGpuNdArray_DATA(self), self); - } - PyGpuNdArray_DATA(self) = NULL; - } - } - PyGpuNdArray_OFFSET(self) = 0; - PyGpuNdArray_NDIM(self) = -1; - Py_XDECREF(self->base); - self->base = NULL; - if (PyGpuNdArray_DIMS(self)){ - free(PyGpuNdArray_DIMS(self)); - PyGpuNdArray_DIMS(self) = NULL; - } - if (PyGpuNdArray_STRIDES(self)){ - free(PyGpuNdArray_STRIDES(self)); - PyGpuNdArray_STRIDES(self) = NULL; - } - PyGpuNdArray_FLAGS(self) = NPY_DEFAULT; - //Py_XDECREF(self->descr);//TODO: How to handle the refcont on this object? 
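The TODO above concerns ownership of self->descr. One conventional CPython pattern, offered here as an assumption rather than something the original code settles, is for the object to hold exactly one strong reference to the descriptor (PyArray_DescrFromType already returns a new reference at init time) and to drop it during deallocation:

    /* Sketch only: valid if init keeps exactly one owned reference to descr. */
    Py_CLEAR(self->descr);   /* decref and set to NULL */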
- self->descr = NULL; - self->data_allocated = 0; - - self->ob_type->tp_free((PyObject*)self); - --_outstanding_mallocs[1]; - DPRINTF("device_malloc_counts: (device) %i (obj) %i\n", - _outstanding_mallocs[0], - _outstanding_mallocs[1]); - DPRINTF("PyGpuNdArrayObject_dealloc end\n"); -} - -static PyObject * -PyGpuNdArray_new(PyTypeObject *type, PyObject *args, PyObject *kwds) -{ - DPRINTF("PyGpuNdArray_new\n"); - PyGpuNdArrayObject *self; - - self = (PyGpuNdArrayObject *)type->tp_alloc(type, 0); - if (self != NULL){ - PyGpuNdArray_null_init(self); - ++_outstanding_mallocs[1]; - } - DPRINTF("PyGpuNdArray_new end %p\n", self); - return (PyObject *)self; -} - -static int -PyGpuNdArray_init(PyGpuNdArrayObject *self, PyObject *args, PyObject *kwds) -{ - DPRINTF("PyGpuNdArray_init\n"); - PyObject *arr=NULL; - - if (! PyArg_ParseTuple(args, "O", &arr)) - return -1; - if (! PyArray_Check(arr)){ - PyErr_SetString(PyExc_TypeError, "PyGpuNdArrayObject_init: PyArray or PyGpuNdArrayObject arg required"); - return -1; - } - - // TODO: We must create a new copy of the PyArray_Descr(or this only increment the refcount?) or still the reference? - PyArray_Descr * type = PyArray_DescrFromType(PyArray_TYPE(arr)); - self->descr = type; - Py_XINCREF(self->descr);//TODO: How to handle the refcont on this object? - int rval = PyGpuNdArray_CopyFromArray(self, (PyArrayObject*)arr); - DPRINTF("PyGpuNdArray_init: end %p type=%p\n", self, self->descr); - return rval; -} - - -int -PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj) -{ - DPRINTF("PyGpuNdArray_CopyFromArray: start descr=%p\n", self->descr); - //modif done to the new array won't be updated! - assert(!PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - //Aligned are not tested, so don't allow it for now - assert(PyGpuNdArray_CHKFLAGS(self, NPY_ALIGNED)); - - int typenum = PyArray_TYPE(obj); - PyObject * py_src = NULL; - if (PyArray_ISONESEGMENT(obj)) { - Py_INCREF(obj); - py_src = (PyObject *) obj; - }else{ - py_src = PyArray_ContiguousFromAny((PyObject*)obj, typenum, - PyArray_NDIM(obj), - PyArray_NDIM(obj)); - } - DPRINTF("PyGpuNdArray_CopyFromArray: contiguous!\n"); - if (!py_src) { - return -1; - } - - int err; - if(PyArray_ISFORTRAN(obj) && ! PyArray_ISCONTIGUOUS(obj)){ - DPRINTF("PyGpuNdArray_CopyFromArray: fortran!\n"); - err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions, - NPY_FORTRANORDER); - }else{ - err = PyGpuNdArray_alloc_contiguous(self, obj->nd, obj->dimensions); - } - if (err) { - return err; - } - - //check that the flag are the same - if (PyArray_ISCONTIGUOUS(py_src) != PyGpuNdArray_ISCONTIGUOUS(self) && - PyArray_ISFORTRAN(obj) && 0) { - PyErr_Format(PyExc_RuntimeError, "ISCONTIGUOUS %d %d\n", PyArray_ISCONTIGUOUS(py_src), PyGpuNdArray_ISCONTIGUOUS(self)); - return -1; - } - assert(PyArray_ISCONTIGUOUS(py_src) == PyGpuNdArray_ISCONTIGUOUS(self) || - PyArray_ISFORTRAN(obj)); - assert(PyArray_ISFORTRAN(py_src) == PyGpuNdArray_ISFORTRAN(self)); - assert(PyArray_ISALIGNED(py_src) == PyGpuNdArray_ISALIGNED(self)); - - // New memory, so we should own it. 
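    // Note on the transfer a few lines below: PyGpuMemcpy takes a byte count
    // (element count times itemsize) and applies dev_offset to whichever
    // pointer lives on the device -- here the destination, since the
    // direction is PyGpuHostToDevice.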
- assert(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); - // New memory, so it should be writable - assert(PyGpuNdArray_ISWRITEABLE(self)); - - err = PyGpuMemcpy(PyGpuNdArray_DATA(self), - PyArray_DATA(py_src), - PyGpuNdArray_OFFSET(self), - PyArray_SIZE(py_src) * PyArray_ITEMSIZE(py_src), - PyGpuHostToDevice); - if (err) { - Py_DECREF(py_src); - return -1; - } - Py_DECREF(py_src); - DPRINTF("PyGpuNdArray_CopyFromArray: end\n"); - return 0; -} - -static PyObject * PyGpuNdArray_copy(PyObject * self, PyObject *args, - PyObject *kargs) -{ - DPRINTF("PyGpuNdArray_copy start\n"); - static const char *kwlist[] = {"order", NULL}; - NPY_ORDER order = PyArray_CORDER; - - if(!PyGpuNdArray_Check(self)){ - PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_copy: expected a PyGpuNdArrayObject."); - return NULL; - } - - DPRINTF("PyGpuNdArray_copy before parse inputs\n"); - if (!PyArg_ParseTupleAndKeywords(args, kargs, "|O&", - (char**)kwlist, - PyArray_OrderConverter, - &order)) { - DPRINTF("PyGpuNdArray_copy start1.2\n"); - return NULL; - } - DPRINTF("PyGpuNdArray_copy after parse inputs\n"); - - DPRINTF("PyGpuNdArray_copy before copy\n"); - PyObject *ret = PyGpuNdArray_Copy((PyGpuNdArrayObject*)self, order); - DPRINTF("PyGpuNdArray_copy end\n"); - return ret; -} - -static PyObject * PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order) -{ - DPRINTF("PyGpuNdArray_Copy start\n"); - PyObject * rval = PyGpuNdArray_New(); - //TODO find how to refcount descr. - PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); - if ((!rval) || (-1 == PyGpuNdArray_NDIM(self))) { - return rval; - } - if (PyGpuNdArray_alloc_contiguous((PyGpuNdArrayObject*)rval, - PyGpuNdArray_NDIM(self), - PyGpuNdArray_DIMS(self), - order)) { - Py_DECREF(rval); - return NULL; - } - - if (PyGpuNdArray_CopyFromPyGpuNdArray((PyGpuNdArrayObject*)rval, self)) { - Py_DECREF(rval); - return NULL; - } - if (order == NPY_F_CONTIGUOUS) - PyGpuNdArray_FLAGS(self) |= NPY_F_CONTIGUOUS; - -#ifdef DEBUG - PyGpuNdArray_fprint(stderr, self); - PyGpuNdArray_fprint(stderr, (PyGpuNdArrayObject *)rval); -#endif - DPRINTF("PyGpuNdArray_Copy end\n"); - return rval; -} - -PyObject * PyGpuNdArray_DeepCopy(PyGpuNdArrayObject * self, PyObject * memo) -{ - assert(PyDict_Check(memo)); - PyObject * selfkey = PyInt_FromLong((long)self); - assert(selfkey); - - if (PyDict_Contains(memo, selfkey)) { - PyObject * rval = PyDict_GetItem(memo, selfkey); - Py_DECREF(selfkey); - Py_XINCREF(rval); - return rval; - } else { - DPRINTF("PyGpuNdArray_DeepCopy: startd deepcopy\n"); - PyObject * rval = PyGpuNdArray_Copy(self); - if (NULL == rval) { - Py_DECREF(selfkey); - return NULL; - } - - DPRINTF("DeepCopy created %p\n", rval); - DPRINTF("DeepCopy created %p %p\n", PyGpuNdArray_DESCR(rval), PyGpuNdArray_DATA(rval)); - if (PyDict_SetItem(memo, selfkey, rval)) { - Py_DECREF(rval); - Py_DECREF(selfkey); - return NULL; - } - Py_DECREF(selfkey); - DPRINTF("PyGpuNdArray_DeepCopy: startd end\n"); - return rval; - } -} - -PyObject * PyGpuNdArray_View(PyGpuNdArrayObject * self) -{ - PyGpuNdArrayObject * rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); - if (!rval || PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), - (PyObject *)self, PyGpuNdArray_OFFSET(self))) { - Py_XDECREF(rval); - DPRINTF("PyGpuNdArray_View: no rval or PyGpuNdArray_set_data " - "failed: self=%p, rval=%p rval_base=%p\n", - self, rval, rval->base); - return NULL; - } else { - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) { - PyGpuNdArray_DIM(rval, i) = PyGpuNdArray_DIMS(self)[i]; - 
PyGpuNdArray_STRIDE(rval, i) = PyGpuNdArray_STRIDES(self)[i]; - } - } - DPRINTF("PyGpuNdArray_View: self=%p, self->base=%p" - " rval=%p rval->base=%p\n", - self, self->base, rval, rval->base); - //TODO: find how to refcount on the descr! - //Py_INCREF(PyGpuNdArray_DESCR(self)); - PyGpuNdArray_DESCR(rval) = PyGpuNdArray_DESCR(self); - PyGpuNdArray_FLAGS(rval) = PyGpuNdArray_FLAGS(self); - PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; - - - return (PyObject*)rval; -} - -//updated for offset -PyObject * PyGpuNdArray_CreateArrayObj(PyGpuNdArrayObject * self) -{ - DPRINTF("PyGpuNdArray_CreateArrayObj\n"); - - if(PyGpuNdArray_NDIM(self)>=0 && PyGpuNdArray_SIZE(self)==0){ - npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); - assert (npydims); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); - - // Numpy will do a decref on the description. - Py_INCREF(PyGpuNdArray_DESCR(self)); - - // We can't use PyArray_{Empty,EMPTY} as they segfault when size == 0 - PyObject * rval = PyArray_NewFromDescr(&PyArray_Type, - PyGpuNdArray_DESCR(self), - PyGpuNdArray_NDIM(self), - npydims, - NULL, - NULL, - 0, - NULL); - - free(npydims); - if (!rval){ - return NULL; - } - assert (PyArray_ITEMSIZE(rval) == PyGpuNdArray_ITEMSIZE(self)); - return rval; - } - if ((PyGpuNdArray_NDIM(self) < 0) || (PyGpuNdArray_DATA(self) == 0)) { - PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized PyGpuNdArray"); - return NULL; - } - PyGpuNdArrayObject * contiguous_self = NULL; - bool pos_stride = true; - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - if (PyGpuNdArray_STRIDE(self,i)<0) - pos_stride = false; - if (PyGpuNdArray_ISONESEGMENT(self) && pos_stride) { - contiguous_self = self; - Py_INCREF(contiguous_self); - DPRINTF("PyGpuNdArray_CreateArrayObj: gpu array already contiguous %p\n", contiguous_self); - //}else if(PyGpuNdArray_ISONESEGMENT(self)){ - //TODO implement special object handling to speed up transfer - // DPRINTF("CreateArrayObj one segment, with special handling %p\n", contiguous_self); - //PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_CreateArrayObj: Need PyGpuNdArray_Copy or some other nd array mandling to transfer contiguous bloc with negative stride."); - //return NULL; - } else { - contiguous_self = (PyGpuNdArrayObject*)PyGpuNdArray_Copy(self); - DPRINTF("CreateArrayObj created contiguous %p\n", contiguous_self); - } - if (!contiguous_self) { - return NULL; - } - - npy_intp * npydims = (npy_intp*)malloc(PyGpuNdArray_NDIM(self) * sizeof(npy_intp)); - assert (npydims); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) npydims[i] = (npy_intp)(PyGpuNdArray_DIMS(self)[i]); - Py_INCREF(PyGpuNdArray_DESCR(self)); - PyObject * rval = PyArray_Empty(PyGpuNdArray_NDIM(self), - npydims, - PyGpuNdArray_DESCR(self), - PyGpuNdArray_ISFORTRAN(self)); - free(npydims); - if (!rval) { - Py_DECREF(contiguous_self); - return NULL; - } - - int err = PyGpuMemcpy(PyArray_DATA(rval), - PyGpuNdArray_DATA(contiguous_self), - PyGpuNdArray_OFFSET(contiguous_self), - PyArray_SIZE(rval) * PyArray_ITEMSIZE(rval), - PyGpuDeviceToHost); - if (err) { - Py_DECREF(contiguous_self); - Py_DECREF(rval); - rval = NULL; - } - Py_DECREF(contiguous_self); - return rval; -} - -static PyObject * -PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) -{ - DPRINTF("PyGpuNdArray_Empty: start!\n"); - PyGpuNdArrayObject* rval = (PyGpuNdArrayObject*)PyGpuNdArray_New(); - PyGpuNdArray_DESCR(rval) = dtype; - if (!rval) { - 
DPRINTF("PyGpuNdArray_Empty: fail!\n"); - return NULL; - } - NPY_ORDER order = NPY_CORDER; - if (fortran!=0) - order = NPY_FORTRANORDER; - - if (PyGpuNdArray_alloc_contiguous(rval, nd, dims, order)) { - Py_DECREF(rval); - return NULL; - } - - DPRINTF("PyGpuNdArray_Empty: end!\n"); - return (PyObject*) rval; -} - -//DONE: dtype, offset not needed, flags -static PyObject * -PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran) -{ - DPRINTF("PyGpuNdArray_Zeros: start!\n"); - PyObject * rval = PyGpuNdArray_Empty(nd, dims, dtype, fortran); - if (!rval) { - return rval; - } - - int total_elements = 1; - for(int i=0;ielsize; - - // Fill with zeros - int err = PyGpuMemset(PyGpuNdArray_DATA(rval), 0, total_size); - if (err) { - Py_DECREF(rval); - return NULL; - } - - DPRINTF("PyGpuNdArray_Zeros: end!\n"); - return (PyObject*) rval; -} - -// declared as a static method (hence "dummy" is not used) -// numpy.zeros(shape, dtype=float, order='C') -static PyObject * -PyGpuNdArray_zeros(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? */ - PyArray_Descr *typecode = NULL; - PyObject * shape = NULL; - NPY_ORDER order = PyArray_CORDER; - bool fortran = false; - PyObject *ret = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", - (char**)kwlist, - &shape, - PyArray_DescrConverter, - &typecode, - PyArray_OrderConverter, - &order)) { - Py_XDECREF(typecode); - Py_XDECREF(shape); - return ret; - } - if (order == PyArray_FORTRANORDER) { - fortran = true; - } - else { - fortran = false; - } - - if(!PySequence_Check(shape)) - { - PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); - return NULL; - } - - if (!typecode) - typecode = PyArray_DescrFromType(NPY_FLOAT64); - - int shplen = PySequence_Length(shape); - - if (shplen == 0) - { - return PyGpuNdArray_Zeros(0, NULL, typecode, fortran); - } - - npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); - - if (!newdims) - { - PyErr_SetString(PyExc_MemoryError, - "PyGpuNdArray_Zeros: Failed to allocate temporary space"); - return NULL; - } - - // start from the end to compute strides - for (int i = shplen-1; i >= 0; --i) - { - PyObject* shp_el_obj = PySequence_GetItem(shape, i); - if(shp_el_obj == NULL) - { - // shouldn't happen since we checked length before... - PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_Zeros: Index out of bound in sequence"); - free(newdims); - return NULL; - } - - int shp_el = PyInt_AsLong(shp_el_obj); - Py_DECREF(shp_el_obj); - - newdims[i] = shp_el; - } - - PyObject* rval = PyGpuNdArray_Zeros(shplen, newdims, typecode, fortran); - - free(newdims); - - return (PyObject*)rval; -} - -// declared as a static method (hence "dummy" is not used) -// numpy.empty(shape, dtype=float, order='C') -static PyObject * -PyGpuNdArray_empty(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - static const char *kwlist[] = {"shape","dtype","order",NULL}; /* XXX ? 
*/ - PyArray_Descr *typecode = NULL; - PyObject * shape = NULL; - NPY_ORDER order = PyArray_CORDER; - bool fortran = false; - PyObject *ret = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&O&", - (char **)kwlist, - &shape, - PyArray_DescrConverter, - &typecode, - PyArray_OrderConverter, - &order)) { - Py_XDECREF(typecode); - Py_XDECREF(shape); - return ret; - } - if (order == PyArray_FORTRANORDER) { - fortran = true; - } - else { - fortran = false; - } - - if(!PySequence_Check(shape)) - { - PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence"); - return NULL; - } - - if (!typecode) - typecode = PyArray_DescrFromType(NPY_FLOAT64); - - int shplen = PySequence_Length(shape); - - if (shplen == 0) - { - return PyGpuNdArray_Empty(0, NULL, typecode, fortran); - } - - npy_intp* newdims = (npy_intp *)malloc(sizeof(npy_intp) * shplen); - - if (!newdims) - { - PyErr_SetString(PyExc_MemoryError, - "PyGpuNdArray_empty: Failed to allocate temporary space"); - return NULL; - } - - // start from the end to compute strides - for (int i = shplen-1; i >= 0; --i) - { - PyObject* shp_el_obj = PySequence_GetItem(shape, i); - if(shp_el_obj == NULL) - { - // shouldn't happen since we checked length before... - PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_empty: Index out of bound in sequence"); - free(newdims); - return NULL; - } - - int shp_el = PyInt_AsLong(shp_el_obj); - Py_DECREF(shp_el_obj); - - newdims[i] = shp_el; - } - - PyObject* rval = PyGpuNdArray_Empty(shplen, newdims, typecode, fortran); - - free(newdims); - - return (PyObject*)rval; -} - -static PyMethodDef PyGpuNdArray_methods[] = -{ - {"__array__", - (PyCFunction)PyGpuNdArray_CreateArrayObj, METH_NOARGS, - "Copy from the device to a numpy ndarray"}, - {"copy", - (PyCFunction)PyGpuNdArray_copy, METH_VARARGS|METH_KEYWORDS, - "Create a deep copy of this object."}, - {"view", - (PyCFunction)PyGpuNdArray_View, METH_NOARGS, - "Create a view of this object."}, - {"__copy__", - (PyCFunction)PyGpuNdArray_Copy, METH_NOARGS, - "Create a copy of this object as numpy does. 
Why numpy do a copy of the data when the object is a view?"}, - {"__deepcopy__", - (PyCFunction)PyGpuNdArray_DeepCopy, METH_O, - "Create a copy of this object"}, -/* - {"reduce_sum", - (PyCFunction)PyGpuNdArray_ReduceSum, METH_O, - "Reduce over the given dimensions by summation"}, - {"exp", - (PyCFunction)PyGpuNdArray_Exp, METH_NOARGS, - "Return the exponential of all elements"}, - {"reshape", - (PyCFunction)PyGpuNdArray_Reshape, METH_O, - "Return a reshaped view (or copy) of this ndarray\n\ - The required argument is a tuple of integers specifying the shape of the new ndarray."}, - {"_set_stride", - (PyCFunction)PyGpuNdArray_SetStride, METH_VARARGS, - "For integer arguments (i, s), set the 'i'th stride to 's'"}, - {"_set_shape_i", - (PyCFunction)PyGpuNdArray_SetShapeI, METH_VARARGS, - "For integer arguments (i, s), set the 'i'th shape to 's'"}, - */ - {NULL, NULL, NULL, NULL} /* Sentinel */ -}; - -//PyArray_CopyInto(PyArrayObject* dest, PyArrayObject* src)¶ -//PyObject* PyArray_NewCopy(PyArrayObject* old, NPY_ORDER order)¶ - - -static PyObject * -PyGpuNdArray_get_shape(PyGpuNdArrayObject *self, void *closure) -{ - DPRINTF("PyGpuNdArray_get_shape\n"); - - if (PyGpuNdArray_NDIM(self) < 0) - { - PyErr_SetString(PyExc_ValueError, "PyGpuNdArray not initialized"); - return NULL; - } - PyObject * rval = PyTuple_New(PyGpuNdArray_NDIM(self)); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - { - if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_DIMS(self)[i]))) - { - Py_XDECREF(rval); - return NULL; - } - - } - return rval; -} - -static int -PyGpuNdArray_set_shape(PyGpuNdArrayObject *self, PyObject *value, void *closure) -{ - PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape"); - return -1; -} - -static PyObject * -PyGpuNdArray_get_strides(PyGpuNdArrayObject *self, void *closure) -{ - if ( PyGpuNdArray_NDIM(self) < 0){ - PyErr_SetString(PyExc_ValueError, "PyGpuNdArrayObject not initialized"); - return NULL; - } - PyObject * rval = PyTuple_New( PyGpuNdArray_NDIM(self)); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i){ - if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(PyGpuNdArray_STRIDES(self)[i]))){ - Py_XDECREF(rval); - return NULL; - } - } - return rval; -} - -static PyObject * -PyGpuNdArray_get_data(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_DATA(self)); -} - -static PyObject * -PyGpuNdArray_get_flags(PyGpuNdArrayObject *self, void *closure) -{ - PyObject * dict = PyDict_New(); - - PyObject * str= PyString_FromString("C_CONTIGUOUS"); - PyObject * i = PyBool_FromLong(PyGpuNdArray_ISCONTIGUOUS(self)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("F_CONTIGUOUS"); - i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("WRITEABLE"); - i = PyBool_FromLong(PyGpuNdArray_ISWRITEABLE(self)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("ALIGNED"); - i = PyBool_FromLong(PyGpuNdArray_ISALIGNED(self)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("UPDATEIFCOPY"); - i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - Py_DECREF(i); - - str= PyString_FromString("OWNDATA"); - i = PyBool_FromLong(PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); - PyDict_SetItem(dict, str, i); - Py_DECREF(str); - 
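This getter builds every flag entry with the same four-step create/set/decref pattern. A small helper would shrink it considerably; the sketch below is an editor's suggestion (the name set_flag is not part of the original API) and, like the original, does not report PyDict_SetItem failures.

    static void
    set_flag(PyObject * dict, const char * name, long value)
    {
        PyObject * key = PyString_FromString(name);
        PyObject * val = PyBool_FromLong(value);
        if (key && val)
            PyDict_SetItem(dict, key, val);  /* does not steal references */
        Py_XDECREF(key);
        Py_XDECREF(val);
    }

With it each entry becomes a single call, e.g. set_flag(dict, "C_CONTIGUOUS", PyGpuNdArray_ISCONTIGUOUS(self)).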
Py_DECREF(i); - - return dict; -} -static PyObject * -PyGpuNdArray_get_ndim(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_NDIM(self)); -} -static PyObject * -PyGpuNdArray_get_offset(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_OFFSET(self)); -} -static PyObject * -PyGpuNdArray_get_data_allocated(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) self->data_allocated); -} -static PyObject * -PyGpuNdArray_get_size(PyGpuNdArrayObject *self, void *closure) -{ - return PyInt_FromLong((long int) PyGpuNdArray_SIZE(self)); -} - -static PyObject * -PyGpuNdArray_get_base(PyGpuNdArrayObject *self, void *closure) -{ - if (!PyGpuNdArray_BASE(self)){ - Py_INCREF(Py_None); - return Py_None; - } - PyObject * ret = PyGpuNdArray_BASE(self); - Py_INCREF(ret); - return ret; -} - -static PyObject * -PyGpuNdArray_get_dtype(PyArrayObject *self) -{ - Py_INCREF(PyGpuNdArray_DESCR(self)); - PyObject * ret = (PyObject *)PyGpuNdArray_DESCR(self); - return ret; -} - -static PyObject * -PyGpuNdArray_get_itemsize(PyArrayObject *self) -{ - return (PyObject *)PyInt_FromLong(PyGpuNdArray_ITEMSIZE(self)); -} - -static PyGetSetDef PyGpuNdArray_getset[] = { - {(char*)"base", - (getter)PyGpuNdArray_get_base, - NULL, - (char*)"Return the object stored in the base attribute", - NULL}, - {(char*)"bytes", - (getter)PyGpuNdArray_get_data, - NULL, - (char*)"device data pointer", - NULL}, - {(char*)"shape", - (getter)PyGpuNdArray_get_shape, - (setter)PyGpuNdArray_set_shape, - (char*)"shape of this ndarray (tuple)", - NULL}, - {(char*)"strides", - (getter)PyGpuNdArray_get_strides, - NULL,//(setter)PyGpuNdArray_set_strides, - (char*)"data pointer strides (in elements)", - NULL}, - {(char*)"ndim", - (getter)PyGpuNdArray_get_ndim, - NULL, - (char*)"The number of dimensions in this object", - NULL}, - {(char*)"offset", - (getter)PyGpuNdArray_get_offset, - NULL, - (char*)"Return the offset value", - NULL}, - {(char*)"size", - (getter)PyGpuNdArray_get_size, - NULL, - (char*)"The number of elements in this object.", - NULL}, - {(char*)"data_allocated", - (getter)PyGpuNdArray_get_data_allocated, - NULL, - (char*)"The size of the allocated memory on the device.", - NULL}, - {(char*)"itemsize", - (getter)PyGpuNdArray_get_itemsize, - NULL, - (char*)"The size of the base element.", - NULL}, - {(char*)"dtype", - (getter)PyGpuNdArray_get_dtype, - NULL, - (char*)"The dtype of the element", - NULL}, - {(char*)"flags", - (getter)PyGpuNdArray_get_flags, - NULL, - (char*)"Return the flags as a dictionary", - NULL}, - {NULL, NULL, NULL, NULL} /* Sentinel */ -}; - -// Will by called by __len__ in Python -static Py_ssize_t -PyGpuNdArray_len(PyObject * py_self) -{ - PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; - if (PyGpuNdArray_NDIM(self) <= 0) - { - return (Py_ssize_t) 0; - } - else - { - return (Py_ssize_t) PyGpuNdArray_DIMS(self)[0]; - } -} - -static int -PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset) -{ - DPRINTF("PyGpuNdArray_add_offset: %p %d\n", self, offset); - -#if OFFSET - PyGpuNdArray_OFFSET(self) += offset; -#else - PyGpuNdArray_DATA(self) += offset; -#endif - return 0; -} - - -static int -PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset) -{ - DPRINTF("PyGpuNdArray_set_data: %p %p %p %d\n", self, data, base, offset); - if (self->data_allocated) - { - assert(PyGpuNdArray_DATA(self)); - if (device_free(PyGpuNdArray_DATA(self))) - { - PyGpuNdArray_DATA(self) 
= NULL; - self->data_allocated = 0; - DPRINTF("PyGpuNdArray_set_data: device_free failed!\n"); - PyErr_SetString(PyExc_ValueError, "PyGpuNdArray_set_data: device_free failed"); - return -1; - } - } - - // Get the original base object (base.base.base...) - // TODO: check that base is indeed a CudaNdarray? - PyObject * orig_base = base; - // base is not always a PyGpuNdArrayObject. It can be a GpuArray from pycuda, ... - while (orig_base && PyGpuNdArray_Check(orig_base) && ((PyGpuNdArrayObject*) orig_base)->base) - { - // base_base is itself a view - orig_base = ((PyGpuNdArrayObject*) orig_base)->base; - } - - //N.B. XDECREF and XINCREF are no-ops for NULL pointers - if (PyGpuNdArray_BASE(self) != orig_base) - { - Py_XDECREF(PyGpuNdArray_BASE(self)); - PyGpuNdArray_BASE(self) = orig_base; - Py_XINCREF(PyGpuNdArray_BASE(self)); - } - self->data_allocated = 0; -#if OFFSET - PyGpuNdArray_DATA(self) = data; - PyGpuNdArray_OFFSET(self) = offset; -#else - PyGpuNdArray_DATA(self) = data + offset; -#endif - - return 0; -} - -// Will by called by __getitem__ in Python -static PyObject * -PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key) -{ - DPRINTF("Subscript start\n"); - PyGpuNdArrayObject * self = (PyGpuNdArrayObject*) py_self; - PyObject * py_rval = NULL; - PyGpuNdArrayObject * rval = NULL; - PyObject * intobj = NULL; - - //PyObject_Print(key, stderr, 0); - - if (key == Py_Ellipsis) - { - DPRINTF("Subscript with ellipse \n"); - Py_INCREF(py_self); - DPRINTF("Subscript with ellipse end\n"); - return py_self; - } - if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER - { -#ifdef DEBUG - PyGpuNdArray_fprint(stderr, self); -#endif - DPRINTF("Subscript with int \n"); - - int d_idx = PyInt_AsLong(intobj); - Py_DECREF(intobj); intobj=NULL; - - DPRINTF("Subscript with int 1\n"); - if (PyGpuNdArray_NDIM(self) == 0) { - PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed"); - return NULL; - }else if (PyGpuNdArray_NDIM(self)< 0){ - PyErr_SetString(PyExc_IndexError, "nd arrays must have a number of dim > 0!"); - return NULL; - } - int d_dim = PyGpuNdArray_DIMS(self)[0]; - int offset = 0; - DPRINTF("Subscript with int 2\n"); - - if ((d_idx >= 0) && (d_idx < d_dim)) { - //normal indexing - offset += d_idx * PyGpuNdArray_STRIDES(self)[0]; - } - else if ((d_idx < 0) && (d_idx >= -d_dim)) { - //end-based indexing - // d_idx is negative - offset += (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[0]; - } else { - PyErr_SetString(PyExc_IndexError, "index out of bounds"); - return NULL; - } - DPRINTF("Subscript with int 3\n"); - - //Add the original offset - offset += PyGpuNdArray_OFFSET(self); - - //allocate our subtensor view - py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self) - 1); - rval = (PyGpuNdArrayObject*) py_rval; - if (!rval) return NULL; - - //TODO: find how to refcount on the descr! - PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); - - DPRINTF("Subscript with int 4\n"); - //initialize the view's data pointer to our own. 
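For reference, the offset handed to PyGpuNdArray_set_data just below collapses the normal and end-based cases computed above into one expression; this single line is illustrative, not part of the original file.

    // offset = base offset + (wrapped index) * stride of dimension 0
    int view_offset = PyGpuNdArray_OFFSET(self)
                    + (d_idx < 0 ? d_idx + d_dim : d_idx) * PyGpuNdArray_STRIDES(self)[0];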
- assert (0 == rval->data_allocated); - if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), (PyObject *) self, offset)){ - Py_DECREF(rval); - return NULL; - } - DPRINTF("Subscript with int 5\n"); - - for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { - PyGpuNdArray_STRIDE(rval, d-1) = PyGpuNdArray_STRIDES(self)[d]; - PyGpuNdArray_DIM(rval, d-1) = PyGpuNdArray_DIMS(self)[d]; - } - } - else { - PyErr_Clear(); - } - if (PySlice_Check(key)) //INDEXING BY SLICE - { - DPRINTF("Subscript with slice \n"); - if (PyGpuNdArray_NDIM(self) == 0) - { - PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array"); - return NULL; - } - - int d_dim = PyGpuNdArray_DIMS(self)[0]; - Py_ssize_t start, stop, step, slen; - if (PySlice_GetIndicesEx((PySliceObject*)key, d_dim, &start, &stop, &step, &slen)) { - return NULL; - } - - DPRINTF("start %zd\nstop %zd\n step %zd\n slen %zd\n", - start, stop, step, slen); - - //allocate our subtensor view - py_rval = PyGpuNdArray_New(PyGpuNdArray_NDIM(self)); - rval = (PyGpuNdArrayObject*) py_rval; - if (!rval) return NULL; - - //TODO: find how to refcount on the descr! - PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); - assert (0 == rval->data_allocated); - if (PyGpuNdArray_set_data(rval, - PyGpuNdArray_DATA(self), - py_self, - start * PyGpuNdArray_STRIDE(self, 0) - + PyGpuNdArray_OFFSET(self))) { - Py_DECREF(rval); - return NULL; - } - - //initialize dimension 0 of rval - PyGpuNdArray_STRIDE(rval, 0) = step * PyGpuNdArray_STRIDES(self)[0]; - PyGpuNdArray_DIM(rval, 0) = slen; - DPRINTF("rval stride %zd\n", PyGpuNdArray_STRIDES(rval)[0]); - // initialize dimensions > 0 of rval - for (int d = 1; d < PyGpuNdArray_NDIM(self); ++d) { - PyGpuNdArray_STRIDE(rval, d) = PyGpuNdArray_STRIDES(self)[d]; - PyGpuNdArray_DIM(rval, d) = PyGpuNdArray_DIMS(self)[d]; - } - } - if (PyTuple_Check(key)) //INDEXING BY TUPLE - { - DPRINTF("Subscript with tuple \n"); - //elements of the tuple can be either integers or slices - //the dimensionality of the view we will return is diminished for each slice in the tuple - int tuple_start_index = 0; - if (PyTuple_Size(key) > PyGpuNdArray_NDIM(self)) - { - if (PyTuple_GetItem(key, 0) == Py_Ellipsis && - PyTuple_Size(key) == PyGpuNdArray_NDIM(self) + 1) - { - tuple_start_index = 1; - DPRINTF("Subscript with tuple staring with an extra ellipse" - " at the start.\n"); - } - else{ - PyErr_SetString(PyExc_IndexError, - "index error, specified more dimensions then" - " the number of existing dimensions"); - return NULL; - } - } - - //calculate the number of dimensions in the return value - int rval_nd = PyGpuNdArray_NDIM(self); - for (int tuple_d = tuple_start_index; tuple_d < PyTuple_Size(key); - ++tuple_d) - { - //On some paltform PyInt_Check() return true, other it return false. - //So we use PyArray_IsAnyScalar that should covert everything. - rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, tuple_d)); - } - - //allocate our subtensor view - py_rval = PyGpuNdArray_New(rval_nd); - rval = (PyGpuNdArrayObject*) py_rval; - if (!rval) return NULL; - assert (0 == rval->data_allocated); - - //TODO: find how to refcount on the descr! - PyGpuNdArray_DESCR(py_rval) = PyGpuNdArray_DESCR(self); - - //initialize the view's data pointer to our own. - if (PyGpuNdArray_set_data(rval, PyGpuNdArray_DATA(self), - py_self, PyGpuNdArray_OFFSET(self))) - { - Py_DECREF(rval); - return NULL; - } - - // rval_d will refer to the current dimension in the rval. 
- // It will not be incremented for integer keys, but will be incremented for slice - // keys - int rval_d = 0; - - for (int self_d = 0, tuple_d = tuple_start_index; - self_d < PyGpuNdArray_NDIM(self); ++self_d, ++tuple_d) - { - // keys can be shorter than PyGpuNdArray_NDIM(self). - // when that happens, it means that the remaining dimensions are "full slices" - if (tuple_d >= PyTuple_Size(key)) - { - PyGpuNdArray_STRIDE(rval, rval_d) = - PyGpuNdArray_STRIDES(self)[tuple_d]; - PyGpuNdArray_DIM(rval, rval_d) = - PyGpuNdArray_DIMS(self)[tuple_d]; - ++rval_d; - DPRINTF("Subscript extra dims to append %zd %zd\n", - PyGpuNdArray_STRIDE(rval, rval_d), - PyGpuNdArray_DIM(rval, rval_d)); - } - else - { - PyObject * key_d = PyTuple_GetItem(key, tuple_d); - - if (PySlice_Check(key_d)) - { - Py_ssize_t start, stop, step, slen; - if (PySlice_GetIndicesEx((PySliceObject*)key_d, - PyGpuNdArray_DIMS(self)[self_d], - &start, &stop, &step, &slen)) - { - Py_DECREF(rval); - return NULL; - } - PyGpuNdArray_add_offset(rval, start * PyGpuNdArray_STRIDES(self)[self_d]); - PyGpuNdArray_STRIDE(rval, rval_d) = step * PyGpuNdArray_STRIDES(self)[self_d]; - PyGpuNdArray_DIM(rval, rval_d) = slen; - - DPRINTF("rval_d %d self_d %d\n start %zd\nstop %zd\n step %zd\n slen %zd\n", - rval_d, self_d, start, stop, step, slen); - ++rval_d; - } - else if ((intobj=PyNumber_Int(key_d))) - { - assert(PyArray_IsAnyScalar(key_d)); - int d_idx = PyInt_AsLong(intobj); - Py_DECREF(intobj); - intobj = NULL; - int d_dim = PyGpuNdArray_DIMS(self)[self_d]; - - if ((d_idx >= 0) && (d_idx < d_dim)) - { - //normal indexing - PyGpuNdArray_add_offset(rval, d_idx * PyGpuNdArray_STRIDES(self)[self_d]); - } - else if ((d_idx < 0) && (d_idx >= -d_dim)) - { - //end-based indexing - PyGpuNdArray_add_offset(rval, (d_dim + d_idx) * PyGpuNdArray_STRIDES(self)[self_d]); - } - else - { - PyErr_SetString(PyExc_IndexError, "index out of bounds"); - Py_DECREF(rval); - return NULL; - } - } - else if (key_d == Py_Ellipsis) - { - if (self_d != 0){ - PyErr_Format(PyExc_IndexError, - "Ellipsis supported only at the start of" - " the tuple"); - Py_DECREF(rval); - return NULL; - } - DPRINTF("Substript with tuple with the first element an ellipse\n"); - for( ; self_d < (rval_nd - PyTuple_Size(key) + 1); self_d++) - { - PyGpuNdArray_STRIDE(rval, rval_d) = - PyGpuNdArray_STRIDES(self)[self_d]; - PyGpuNdArray_DIM(rval, rval_d) = - PyGpuNdArray_DIMS(self)[self_d]; - DPRINTF("Ellipse append dimensions self_%d with %zd %zd\n", - self_d, - PyGpuNdArray_STRIDE(rval, rval_d), - PyGpuNdArray_DIM(rval, rval_d)); - ++rval_d; - } - tuple_start_index = 1; - self_d--; - } - else - { - PyErr_Clear(); // clear the error set by PyNumber_Int - PyErr_Format(PyExc_IndexError, - "index must be either int or slice. 
Got %s", - PyString_AsString(PyObject_Str(key_d))); - Py_DECREF(rval); - return NULL; - } - } - } - } - if (py_rval) - { -#ifdef DEBUG - PyGpuNdArray_fprint(stderr, self); - PyGpuNdArray_fprint(stderr, rval); -#endif - } - else - { - PyErr_SetString(PyExc_NotImplementedError, "Unknown key type"); - return NULL; - } - - // Set flags - if (PyGpuNdArray_ISWRITEABLE(self)) { - PyGpuNdArray_FLAGS(rval) |= NPY_WRITEABLE; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_WRITEABLE; - } - PyGpuNdArray_FLAGS(rval) &= ~NPY_OWNDATA; - if (PyGpuNdArray_ISALIGNED(self)) { - PyGpuNdArray_FLAGS(rval) |= NPY_ALIGNED; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_ALIGNED; - } - PyGpuNdArray_FLAGS(rval) &= ~NPY_UPDATEIFCOPY; - - if (false && PyGpuNdArray_NDIM(rval) == 0) { - //Numpy is not consistent here - //When we create a new numpy ndarray of 0 dim, it is not f contiguous - //But when we take a subtensor that is of 0 dim, it is f contiguous! - //We make as them for now... - PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; - PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; - } else { - if (PyGpuNdArray_is_c_contiguous(rval)) { - PyGpuNdArray_FLAGS(rval) |= NPY_C_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_C_CONTIGUOUS; - } - if (PyGpuNdArray_is_f_contiguous(rval)) { - PyGpuNdArray_FLAGS(rval) |= NPY_F_CONTIGUOUS; - } else { - PyGpuNdArray_FLAGS(rval) &= ~NPY_F_CONTIGUOUS; - } - } - - DPRINTF("Subscript end\n"); - return py_rval; -} - -PyMappingMethods PyGpuNdArrayMappingMethods = { - PyGpuNdArray_len, //lenfunc mp_length; __len__ - PyGpuNdArray_Subscript, //binaryfunc mp_subscript; __getitem__ - 0 //PyGpuNdArray_setitem //objobjargproc mp_ass_subscript; __setitem__ -}; - -static PyTypeObject PyGpuNdArrayType = -{ - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "GpuNdArray", /*tp_name*/ - sizeof(PyGpuNdArrayObject), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)PyGpuNdArrayObject_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, //&PyGpuNdArrayObjectNumberMethods, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - &PyGpuNdArrayMappingMethods,/*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/ - "PyGpuNdArrayObject objects", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - PyGpuNdArray_methods, /* tp_methods */ - 0, //PyGpuNdArray_members, /* tp_members */ //TODO - PyGpuNdArray_getset, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)PyGpuNdArray_init,/* tp_init */ - 0, /* tp_alloc */ - PyGpuNdArray_new, /* tp_new */ -}; - -////////////////////////////////////// -// -// C API FOR PyGpuNdArrayObject -// -////////////////////////////////////// -PyObject * -PyGpuNdArray_New(int nd) -{ - DPRINTF("PyGpuNdArray_New start\n"); - PyGpuNdArrayObject *self = (PyGpuNdArrayObject *)PyGpuNdArrayType.tp_alloc(&PyGpuNdArrayType, 0); - if (self == NULL) { - PyErr_SetString(PyExc_RuntimeError, "PyGpuNdArray_New failed to allocate self"); - return NULL; - } - PyGpuNdArray_null_init(self); - - if (nd == 0) { - PyGpuNdArray_NDIM(self) = 0; - } - else if (nd > 0) { - if (PyGpuNdArray_set_nd(self, nd)) { - Py_DECREF(self); - return NULL; - } - } - ++_outstanding_mallocs[1]; - DPRINTF("PyGpuNdArray_New 
end\n"); - return (PyObject *)self; -} - -int -PyGpuNdArray_Check(const PyObject * ob) -{ - DPRINTF("PyGpuNdArray_Check\n"); - //TODO: doesn't work with inheritance - return PyGpuNdArray_CheckExact(ob); -} -int -PyGpuNdArray_CheckExact(const PyObject * ob) -{ - DPRINTF("PyGpuNdArray_CheckExact\n"); - return ((ob->ob_type == &PyGpuNdArrayType) ? 1 : 0); -} - -static PyObject * -PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - DPRINTF("PyGpuNdArray_as_c_contiguous:start\n"); - static const char *kwlist[] = {"a", "dtype", NULL}; - PyArray_Descr *typecode = NULL; - PyObject *self_ = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", - (char **)kwlist, - &self_, - PyArray_DescrConverter, - &typecode)) { - Py_XDECREF(typecode); - Py_XDECREF(self_); - return NULL; - } - assert(typecode == NULL); - if (!PyGpuNdArray_Check(self_)){ - PyErr_SetString(PyExc_TypeError, - "PyGpuNdArray_as_c_contiguous:" - " PyGpuNdArrayObject required"); - return NULL; - } - - PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; - if (PyGpuNdArray_is_c_contiguous(self)){ - Py_INCREF(self); - if (PyGpuNdArray_NDIM(self) == 0){ - //numpy.ascontiguous() always return object with 1d. - DPRINTF("PyGpuNdArray_as_c_contiguous: upcast to 1d tensor end\n"); - PyObject * rval = PyGpuNdArray_View(self); - if (!rval) - return NULL; - PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); - PyGpuNdArray_DIM(rval, 0) = 1; - PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); - return rval; - } - DPRINTF("PyGpuNdArray_as_c_contiguous: no copy end\n"); - return (PyObject*)self; - } - - PyObject * ret = PyGpuNdArray_Copy(self); - DPRINTF("PyGpuNdArray_as_c_contiguous: copy end\n"); - return ret; -} -static PyObject * -PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs) -{ - DPRINTF("PyGpuNdArray_as_f_contiguous:start\n"); - static const char *kwlist[] = {"a", "dtype", NULL}; - PyArray_Descr *typecode = NULL; - PyObject *self_ = NULL; - - if (!PyArg_ParseTupleAndKeywords(args, kargs, "O|O&", - (char **)kwlist, - &self_, - PyArray_DescrConverter, - &typecode)) { - Py_XDECREF(typecode); - Py_XDECREF(self_); - return NULL; - } - assert(typecode == NULL); - if (!PyGpuNdArray_Check(self_)){ - PyErr_SetString(PyExc_TypeError, - "PyGpuNdArray_as_f_contiguous:" - " PyGpuNdArrayObject required"); - return NULL; - } - - PyGpuNdArrayObject *self = (PyGpuNdArrayObject*)self_; - if (PyGpuNdArray_is_f_contiguous(self)){ - Py_INCREF(self); - if (PyGpuNdArray_NDIM(self) == 0){ - //numpy.ascontiguous() always return object with 1d. 
- PyObject * rval = PyGpuNdArray_View(self); - if (!rval) - return NULL; - PyGpuNdArray_set_nd((PyGpuNdArrayObject*)rval, 1); - PyGpuNdArray_DIM(rval, 0) = 1; - PyGpuNdArray_STRIDE(rval, 0) = PyGpuNdArray_ITEMSIZE(rval); - DPRINTF("PyGpuNdArray_as_f_contiguous: upcast to 1d tensor end\n"); - return rval; - } - DPRINTF("PyGpuNdArray_as_f_contiguous: no copy end\n"); - return (PyObject*)self; - } - - PyObject * ret = PyGpuNdArray_Copy(self, NPY_FORTRANORDER); - DPRINTF("PyGpuNdArray_as_f_contiguous: copy end\n"); - return ret; -} - -#ifdef WITH_OPENCL -#ifdef __APPLE__ -#include -#else -#include -#endif -extern void setup_context(cl_context c); - -PyObject * -PyGpuNdArray_set_opencl_context(PyObject *mod, PyObject *ctx) { - Py_ssize_t v; - - v = PyInt_AsSsize_t(ctx); - if (v == -1 && PyErr_Occurred()) - return NULL; - - setup_context((cl_context)v); - - Py_INCREF(Py_None); - return Py_None; -} -#endif - -static PyMethodDef module_methods[] = { - //{"dimshuffle", PyGpuNdArray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a PyGpuNdArray."}, - {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, - {"zeros", - (PyCFunction)PyGpuNdArray_zeros, METH_VARARGS|METH_KEYWORDS, - "Create a new PyGpuNdArray with specified shape, filled with zeros."}, - {"empty", - (PyCFunction)PyGpuNdArray_empty, METH_VARARGS|METH_KEYWORDS, - "Create a new PyGpuNdArray with specified shape, filled with zeros."}, - {"ascontiguousarray", - (PyCFunction)PyGpuNdArray_as_c_contiguous, METH_VARARGS|METH_KEYWORDS, - "If the array is not c contiguous, copy it to a new c contiguous region."}, - {"asfortranarray", - (PyCFunction)PyGpuNdArray_as_f_contiguous, METH_VARARGS|METH_KEYWORDS, - "If the array is not f contiguous, copy it to a new c contiguous region."}, -#ifdef WITH_OPENCL - {"set_opencl_context", - PyGpuNdArray_set_opencl_context, METH_O, - "Set the OpenCL context to use for allocations and work."}, -#endif - {NULL, NULL, NULL, NULL} /* Sentinel */ -}; - -#ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ -#define PyMODINIT_FUNC void -#endif -PyMODINIT_FUNC -initpygpu_ndarray(void) -{ - import_array(); - - PyObject* m; - - if (PyType_Ready(&PyGpuNdArrayType) < 0) - return; - - m = Py_InitModule3("pygpu_ndarray", module_methods, - "Example module that creates an extension type."); - - if (m == NULL) - return; - - Py_INCREF(&PyGpuNdArrayType); - PyModule_AddObject(m, "GpuNdArrayObject", (PyObject *)&PyGpuNdArrayType); -#if COMPUTE_GPU_MEM_USED - for(int i=0;i -//#include -#include -#include - -#include "pygpu_ndarray_object.h" -#include "gpu_ndarray.h" -#include "pygpu_language.h" - -/* - * Return a PyGpuNdArray whose 'nd' dimensions are all 0. - * if nd==-1, it is not initialized. - */ -PyObject * PyGpuNdArray_New(int nd=-1); - -/** - * Return 1 for a PyGpuNdArrayObject otw 0 - */ -int -PyGpuNdArray_Check(const PyObject * ob); - -/** - * Return 1 for a PyGpuNdArrayObject otw 0 - */ -int -PyGpuNdArray_CheckExact(const PyObject * ob); - -/** - * Transfer the contents of numpy array `obj` to `self`. - * - * self is reallocated to have the correct dimensions if necessary. 
- */ -int PyGpuNdArray_CopyFromArray(PyGpuNdArrayObject * self, PyArrayObject*obj); - -static int -PyGpuNdArray_add_offset(PyGpuNdArrayObject * self, int offset); - -static int -PyGpuNdArray_set_data(PyGpuNdArrayObject * self, char * data, PyObject * base, int offset=0); - -static PyObject * -PyGpuNdArray_Subscript(PyObject * py_self, PyObject * key); - -static PyObject * -PyGpuNdArray_Copy(PyGpuNdArrayObject * self, NPY_ORDER order=NPY_CORDER); - -static PyObject * -PyGpuNdArray_Zeros(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); - -static PyObject * -PyGpuNdArray_Empty(int nd, npy_intp* dims, PyArray_Descr* dtype, int fortran); - -#endif - -/* - Local Variables: - mode:c++ - c-basic-offset:4 - c-file-style:"stroustrup" - c-file-offsets:((innamespace . 0)(inline-open . 0)) - indent-tabs-mode:nil - fill-column:79 - End: -*/ -// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : diff --git a/ndarray/pygpu_ndarray_object.h b/ndarray/pygpu_ndarray_object.h deleted file mode 100644 index aa0f71a..0000000 --- a/ndarray/pygpu_ndarray_object.h +++ /dev/null @@ -1,232 +0,0 @@ -/** - * struct PyGPUArrayObject - * - * This is a Python type. - * - */ -#ifndef _PYGPU_NDARRAY_OBJECT_H -#define _PYGPU_NDARRAY_OBJECT_H - -#include -#include -#include "gpu_ndarray.h" - -typedef struct PyGpuNdArrayObject{ - PyObject_HEAD - - GpuNdArray gpu_ndarray; //no pointer, just inlined. - PyObject * base; - PyArray_Descr * descr; // for numpy-like desc - int data_allocated; //the number of bytes allocated for devdata -} PyGpuNdArrayObject; - -#define PyGpuNdArray_NDIM(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.nd) -#define PyGpuNdArray_DATA(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) -#define PyGpuNdArray_BYTES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.data) -#define PyGpuNdArray_OFFSET(obj) (((PyGpuNdArrayObject *)(obj))->gpu_ndarray.offset) -#define PyGpuNdArray_DIMS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.dimensions) -#define PyGpuNdArray_STRIDES(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.strides) -#define PyGpuNdArray_DIM(obj,n) (PyGpuNdArray_DIMS(obj)[n]) -#define PyGpuNdArray_STRIDE(obj,n) (PyGpuNdArray_STRIDES(obj)[n]) -#define PyGpuNdArray_BASE(obj) (((PyGpuNdArrayObject *)obj)->base) -#define PyGpuNdArray_DESCR(obj) (((PyGpuNdArrayObject *)obj)->descr) -#define PyGpuNdArray_FLAGS(obj) (((PyGpuNdArrayObject *)obj)->gpu_ndarray.flags) -#define PyGpuNdArray_ITEMSIZE(obj) (((PyGpuNdArrayObject *)obj)->descr->elsize) -#define PyGpuNdArray_TYPE(obj) (((PyGpuNdArrayObject *)(obj))->descr->type_num) - -#define PyGpuNdArray_SIZE(obj) PyArray_MultiplyList(PyGpuNdArray_DIMS(obj),PyGpuNdArray_NDIM(obj)) -//npy_intp PyGpuNdArray_Size(PyObject* obj); -//npy_intp PyGpuNdArray_NBYTES(PyObject* arr); - -/* - Flags accessor - */ -#define PyGpuNdArray_CHKFLAGS(m, FLAGS) \ - ((((PyGpuNdArrayObject *)(m))->gpu_ndarray.flags & (FLAGS)) == (FLAGS)) - -#define PyGpuNdArray_ISCONTIGUOUS(m) PyGpuNdArray_CHKFLAGS(m, NPY_CONTIGUOUS) -#define PyGpuNdArray_ISFORTRAN(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS) && \ - PyGpuNdArray_NDIM(m) > 1) -#define PyGpuNdArray_FORTRAN_IF(m) (PyGpuNdArray_CHKFLAGS(m, NPY_F_CONTIGUOUS)? 
\ - NPY_F_CONTIGUOUS : 0) -#define PyGpuNdArray_ISONESEGMENT(m) (PyGpuNdArray_NDIM(m) == 0 || \ - PyGpuNdArray_ISCONTIGUOUS(m) || \ - PyGpuNdArray_ISFORTRAN(m)) -#define PyGpuNdArray_ISWRITEABLE(m) PyGpuNdArray_CHKFLAGS(m, NPY_WRITEABLE) -#define PyGpuNdArray_ISALIGNED(m) PyGpuNdArray_CHKFLAGS(m, NPY_ALIGNED) - -#define PyGpuNdArray_ISNBO(arg) ((arg) != NPY_OPPBYTE) -// THE NEXT ONE SEEM BAD... -#define PyGpuNdArray_IsNativeByteOrder PyArray_ISNBO -#define PyGpuNdArray_ISNOTSWAPPED(m) PyArray_ISNBO(PyArray_DESCR(m)->byteorder) -#define PyGpuNdArray_FLAGSWAP(m, flags) (PyGpuNdArray_CHKFLAGS(m, flags) && PyGpuNdArray_ISNOTSWAPPED(m)) - -#define PyGpuNdArray_ISCARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY) -#define PyGpuNdArray_ISCARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_CARRAY_RO) -#define PyGpuNdArray_ISFARRAY(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY) -#define PyGpuNdArray_ISFARRAY_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_FARRAY_RO) -#define PyGpuNdArray_ISBEHAVED(m) PyGpuNdArray_FLAGSWAP(m, NPY_BEHAVED) -#define PyGpuNdArray_ISBEHAVED_RO(m) PyGpuNdArray_FLAGSWAP(m, NPY_ALIGNED) - -static -void PyGpuNdArray_fprint(FILE * fd, const PyGpuNdArrayObject *self) -{ - fprintf(fd, "PyGpuNdArrayObject <%p, %p> nd=%i data_allocated=%d\n", - self, PyGpuNdArray_DATA(self), PyGpuNdArray_NDIM(self), self->data_allocated); - fprintf(fd, "\tITEMSIZE: %d\n", PyGpuNdArray_ITEMSIZE(self)); - fprintf(fd, "\tTYPENUM: %d\n", PyGpuNdArray_TYPE(self)); - fprintf(fd, "\tRefcount: %ld\n", (long int)self->ob_refcnt); - fprintf(fd, "\tBASE: %p\n", PyGpuNdArray_BASE(self)); - fprintf(fd, "\tHOST_DIMS: "); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyGpuNdArray_DIMS(self)[i]); - } - fprintf(fd, "\n\tHOST_STRIDES: "); - for (int i = 0; i < PyGpuNdArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyGpuNdArray_STRIDES(self)[i]); - } - fprintf(fd, "\n\tFLAGS: "); - fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyGpuNdArray_ISCONTIGUOUS(self)); - fprintf(fd, "\n\t\tPyGpuNdArray_ISFORTRAN: %d PyGpuNdArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", - PyGpuNdArray_ISFORTRAN(self), PyGpuNdArray_FORTRAN_IF(self), PyGpuNdArray_CHKFLAGS(self, NPY_FORTRAN)); - fprintf(fd, "\n\t\tOWNDATA: %d", PyGpuNdArray_CHKFLAGS(self, NPY_OWNDATA)); - fprintf(fd, "\n\t\tWRITEABLE: %d", PyGpuNdArray_ISWRITEABLE(self)); - fprintf(fd, "\n\t\tALIGNED: %d", PyGpuNdArray_ISALIGNED(self)); - fprintf(fd, "\n\t\tUPDATEIFCOPY: %d", PyGpuNdArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - fprintf(fd, "\n"); - -} -static -void PyArray_fprint(FILE * fd, const PyArrayObject *self) -{ - fprintf(fd, "PyArrayObject <%p, %p> nd=%i\n", - self, PyArray_DATA(self), PyArray_NDIM(self)); - fprintf(fd, "\tITEMSIZE: %d\n", PyArray_ITEMSIZE(self)); - fprintf(fd, "\tTYPENUM: %d\n", PyArray_TYPE(self)); - fprintf(fd, "\tHOST_DIMS: "); - for (int i = 0; i < PyArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyArray_DIMS(self)[i]); - } - fprintf(fd, "\n\tHOST_STRIDES: "); - for (int i = 0; i < PyArray_NDIM(self); ++i) - { - fprintf(fd, "%ld\t", PyArray_STRIDES(self)[i]); - } - fprintf(fd, "\n\tFLAGS: "); - fprintf(fd, "\n\t\tC_CONTIGUOUS: %d", PyArray_ISCONTIGUOUS(self)); - fprintf(fd, "\n\t\tPyArray_ISFORTRAN: %d PyArray_FORTRAN_IF:%d F_CONTIGUOUS: %d", - PyArray_ISFORTRAN(self), PyArray_FORTRAN_IF(self), PyArray_CHKFLAGS(self, NPY_FORTRAN)); - fprintf(fd, "\n\t\tOWNDATA: %d", PyArray_CHKFLAGS(self, NPY_OWNDATA)); - fprintf(fd, "\n\t\tWRITEABLE: %d", PyArray_ISWRITEABLE(self)); - fprintf(fd, "\n\t\tALIGNED: %d", PyArray_ISALIGNED(self)); - fprintf(fd, "\n\t\tUPDATEIFCOPY: 
%d", PyArray_CHKFLAGS(self, NPY_UPDATEIFCOPY)); - fprintf(fd, "\n"); - -} - -template -static T ceil_intdiv(T a, T b) -{ - return (a/b) + ((a % b) ? 1: 0); -} - - -//Compute if the resulting array is c contiguous -static bool -PyGpuNdArray_is_c_contiguous(const PyGpuNdArrayObject * self) -{ - bool c_contiguous = true; - int size = PyGpuNdArray_ITEMSIZE(self); - for (int i = PyGpuNdArray_NDIM(self)-1; (i >= 0) && c_contiguous; --i) { - if (PyGpuNdArray_STRIDE(self, i) != size) { - c_contiguous = false; - } - size = size * PyGpuNdArray_DIM(self, i); - } - return c_contiguous; -} - -//Compute if the resulting array is f contiguous -static bool -PyGpuNdArray_is_f_contiguous(const PyGpuNdArrayObject * self) -{ - bool f_contiguous = true; - int size = PyGpuNdArray_ITEMSIZE(self); - for (int i = 0; i < PyGpuNdArray_NDIM(self) && f_contiguous; ++i) { - if (PyGpuNdArray_STRIDE(self, i) != size) { - f_contiguous = false; - } - size = size * PyGpuNdArray_DIM(self, i); - } - return f_contiguous; -} - -static PyObject * -PyGpuNdArray_as_c_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); -static PyObject * -PyGpuNdArray_as_f_contiguous(PyObject* dummy, PyObject* args, PyObject *kargs); - -/** - * [Re]allocate a PyGpuNdArrayObject with access to 'nd' dimensions. - * - * Note: This does not allocate storage for data. - */ -static -int PyGpuNdArray_set_nd(PyGpuNdArrayObject * self, const int nd) -{ - if (nd != PyGpuNdArray_NDIM(self)) - { - if(0) fprintf(stderr, "PyGpuNdArray_set_nd: modif nd=%i to nd=%i\n", PyGpuNdArray_NDIM(self), nd); - - if (PyGpuNdArray_DIMS(self)){ - free(PyGpuNdArray_DIMS(self)); - PyGpuNdArray_DIMS(self) = NULL; - PyGpuNdArray_NDIM(self) = -1; - } - if (PyGpuNdArray_STRIDES(self)){ - free(PyGpuNdArray_STRIDES(self)); - PyGpuNdArray_STRIDES(self) = NULL; - PyGpuNdArray_NDIM(self) = -1; - } - if (nd == -1) return 0; - - PyGpuNdArray_DIMS(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); - if (NULL == PyGpuNdArray_DIMS(self)) - { - PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate dimensions"); - return -1; - } - PyGpuNdArray_STRIDES(self) = (npy_intp*)malloc(nd*sizeof(npy_intp)); - if (NULL == PyGpuNdArray_STRIDES(self)) - { - PyErr_SetString(PyExc_MemoryError, "PyGpuNdArray_set_nd: Failed to allocate str"); - return -1; - } - //initialize all dimensions and strides to 0 - for (int i = 0; i < nd; ++i) - { - PyGpuNdArray_DIM(self, i) = 0; - PyGpuNdArray_STRIDES(self)[i] = 0; - } - - PyGpuNdArray_NDIM(self) = nd; - if(0) fprintf(stderr, "PyGpuNdArray_set_nd: end\n"); - } - return 0; -} - -#endif -/* - Local Variables: - mode:c++ - c-basic-offset:4 - c-file-style:"stroustrup" - c-file-offsets:((innamespace . 0)(inline-open . 
0)) - indent-tabs-mode:nil - fill-column:79 - End: -*/ -// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 : diff --git a/ndarray/setup_opencl.py b/ndarray/setup_opencl.py deleted file mode 100644 index 0087cf2..0000000 --- a/ndarray/setup_opencl.py +++ /dev/null @@ -1,102 +0,0 @@ -import os -from distutils.command.build_ext import build_ext -from distutils.core import Extension, setup -from distutils.dep_util import newer - -import numpy as np - - -class build_ext_nvcc(build_ext): - user_options = build_ext.user_options - user_options.extend([ - ('cuda-root=', None, "The cuda root directory")]) - - def initialize_options(self): - build_ext.initialize_options(self) - self.cuda_root = None - - def finalize_options(self): - build_ext.finalize_options(self) - if self.cuda_root is None: - self.cuda_root = os.getenv('CUDA_ROOT', None) - if self.cuda_root is not None: - self._nvcc_bin = os.path.join(self.cuda_root, 'bin', 'nvcc') - else: - self._nvcc_bin = 'nvcc' - - def cuda_process(self, source, include_args): - target = source + '.cpp' - if newer(source, target): - self.spawn([self._nvcc_bin, '--cuda', source, '-o', target] + \ - include_args) - return target - - def cuda_extension(self, ext): - includes = self.distribution.include_dirs + ext.include_dirs - include_args = ['-I' + i for i in includes] - new_sources = [] - anycuda = False - for src in ext.sources: - if src.endswith('.cu'): - new_sources.append(self.cuda_process(src, include_args)) - anycuda = True - else: - new_sources.append(src) - if anycuda: - ext.sources = new_sources - if self.cuda_root is not None: - lib = os.path.join(self.cuda_root, 'lib') - lib64 = os.path.join(self.cuda_root, 'lib64') - if os.path.isdir(lib): - ext.library_dirs.append(lib) - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append('-rpath') - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append(lib) - if os.path.isdir(lib64): - ext.library_dirs.append(lib64) -# ext.extra_link_args.append('-rpath') -# ext.extra_link_args.append(lib64) - if 'cudart' not in ext.libraries: - ext.libraries.append('cudart') - - if self.cuda_root: - include = os.path.join(self.cuda_root, 'include') - if os.path.isdir(include): - ext.extra_compile_args.append('-I' + include) - if os.path.isfile('/usr/lib/nvidia-current/libOpenCL.so'): - ext.extra_link_args.append('-L/usr/lib/nvidia-current') - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append('-rpath') - ext.extra_link_args.append('-Xlinker') - ext.extra_link_args.append('/usr/lib/nvidia-current') - - def build_extensions(self): - self.check_extensions_list(self.extensions) - - for ext in self.extensions: - self.cuda_extension(ext) - # uncomment this + inherit from the cython version of build_ext - # work with cuda and cython sources - #ext.sources = self.cython_sources(ext.sources, ext) - self.build_extension(ext) - -import sys - - -if sys.platform == 'darwin': - libcl_args = {'extra_link_args': ['-framework', 'OpenCL']} -else: - libcl_args = {'libraries': ['OpenCL']} - - -setup(name='compyte', - cmdclass={'build_ext': build_ext_nvcc}, - include_dirs=[np.get_include(), '.'], - ext_modules=[Extension('pygpu_ndarray', - define_macros=[('OFFSET', '1'), ('WITH_OPENCL', '')], - sources=['pygpu_language_opencl.cpp', - 'pygpu_ndarray.cpp'], - **libcl_args) - ] -) diff --git a/ndarray/test_gpu_elemwise.py b/ndarray/test_gpu_elemwise.py deleted file mode 100644 index 6b42dbe..0000000 --- a/ndarray/test_gpu_elemwise.py +++ /dev/null @@ -1,410 +0,0 @@ -# TODO: 
test other dtype -from functools import reduce - -import numpy -import pygpu_ndarray as gpu_ndarray -import theano - -from .gen_elemwise import MyGpuNdArray, elemwise_collapses -from .test_gpu_ndarray import dtypes_all, enable_double, gen_gpu_nd_array, product - - -def rand(shape, dtype): - r = numpy.random.randn(*shape) * 10 - if dtype.startswith("u"): - r = numpy.absolute(r) - return r.astype(dtype) - - -# numpy.allclose seam to have problem with int8... -def all_close(x, y): - return (numpy.allclose(x, y) or - numpy.absolute(x - y).max() == 0) - - -def test_elemwise_collapse(): - """ Test collapsing under many broadcast and strided pattern """ - - for dtype1 in ["int16", "float32", "int8"]: - for dtype2 in ["int16", "float32", "int8"]: - - for shape1_, shape2_, expected in [ - # 1d to test this special case - ((40,), (40,), 0), - ((40,), (1,), 1), - # No broadcastable dimensions - ((4, 5, 6, 9), (4, 5, 6, 9), 0), - # All inputs have one(and the same) broadcastable dimension - ((1, 4, 5, 9), (1, 4, 5, 9), 0), - ((4, 1, 5, 9), (4, 1, 5, 9), 0), - ((4, 5, 1, 9), (4, 5, 1, 9), 0), - ((4, 5, 9, 1), (4, 5, 9, 1), 0), - # One inputs have one broadcastable dimension - ((1, 5, 6, 9), (4, 5, 6, 9), 2), - ((4, 1, 6, 9), (4, 5, 6, 9), 3), - ((4, 5, 1, 9), (4, 5, 6, 9), 3), - ((4, 5, 6, 1), (4, 5, 6, 9), 2), - # One inputs have two broadcastable dimension - ((1, 1, 6, 9), (4, 5, 6, 9), 2), - ((1, 5, 1, 9), (4, 5, 6, 9), 4), - ((1, 5, 6, 1), (4, 5, 6, 9), 3), - ((4, 1, 1, 9), (4, 5, 6, 9), 3), - ((4, 1, 6, 1), (4, 5, 6, 9), 4), - ((4, 5, 1, 1), (4, 5, 6, 9), 2), - # One inputs have tree broadcastable dimension - ((1, 1, 1, 9), (4, 5, 6, 9), 2), - ((1, 1, 6, 1), (4, 5, 6, 9), 3), - ((1, 5, 1, 1), (4, 5, 6, 9), 3), - ((4, 1, 1, 1), (4, 5, 6, 9), 2), - # One scalar - ((1, 1, 1, 1), (4, 5, 6, 9), 1), - # One scalar, the other 1 broadcast dims - ((1, 1, 1, 1), (4, 5, 6, 1), 1), - ]: - scalar_cpu = rand((1,) * len(shape1_), dtype=dtype1) - scalar_gpu = gpu_ndarray.GpuNdArrayObject(scalar_cpu) - scalar_gpu1 = MyGpuNdArray(scalar_gpu) - for shape1, shape2 in [(shape1_, shape2_), (shape2_, shape1_)]: - a_cpu = rand(shape1, dtype=dtype1) - a = gpu_ndarray.GpuNdArrayObject(a_cpu) - a1 = MyGpuNdArray(a) - - b_cpu = rand(shape2, dtype=dtype2) - b = gpu_ndarray.GpuNdArrayObject(b_cpu) - b1 = MyGpuNdArray(b) - - assert len(shape1) == len(shape2) - o_shape = [] - for i in range(len(shape1)): - o_shape.append(max(shape1[i], shape2[i])) - o = gpu_ndarray.empty(o_shape, dtype=(a_cpu + b_cpu).dtype) - - # 1.1 Check direct collapse - nd_collaps, info = elemwise_collapses([a, b], [o]) - assert nd_collaps == expected, (shape1, shape2, - nd_collaps, expected, info) - - # 1.2 Check computation are still valid - f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], - len(shape1)) - out = f([a1, b1]) - out2 = f([a1, b1], out=out) - assert out is out2 - assert numpy.allclose(numpy.asarray(f([a1, b1])), - a_cpu + b_cpu) - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(a1, b1)), a_cpu + b_cpu) - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.add(a1, b1)), a_cpu + b_cpu) - assert MyGpuNdArray.add(a1, b1, out=out2) is out2 - - # 1.3 Check work without collaping - f = MyGpuNdArray.gen_fct(theano.tensor.add, [a1, b1], - len(shape1), collapse=False) - out = f([a1, b1]) - out2 = f([a1, b1], out=out) - assert out is out2 - assert numpy.allclose(numpy.asarray(f([a1, b1])), - a_cpu + b_cpu) - assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( - a1, b1)), a_cpu + b_cpu) - assert 
numpy.allclose(numpy.asarray(MyGpuNdArray.add( - a1, b1)), a_cpu + b_cpu) - assert MyGpuNdArray.add(a1, b1, out=out2) is out2 - - # 2.1 What if we add a scalar? - nd_collaps, info = elemwise_collapses( - [a, b, scalar_gpu], [o]) - if expected == 0: - expected2 = 1 - else: - expected2 = expected - assert nd_collaps == expected2, (shape1, shape2, - nd_collaps, expected, - info) - # 2.2 Check computation - assert numpy.allclose(numpy.asarray(MyGpuNdArray.adds( - a1, b1, scalar_gpu1)), - a_cpu + b_cpu + scalar_cpu) - - # 3.1 What if one of the dimensions is strided? - broadcast = any([True for i in a.shape + b.shape - if i == 1]) - if expected == 0: - expected2 = 2 - else: - expected2 = expected - - if len(shape1_) != 4: - continue - - if a.shape[0] != 1: - shape = list(shape1) - shape[0] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == expected2, err - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - if a.shape[1] != 1: - shape = list(shape1) - shape[1] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == expected2, err - pass - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - if a.shape[2] != 1: - shape = list(shape1) - shape[2] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, ::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == expected2, err - pass - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - if a.shape[3] != 1: - shape = list(shape1) - shape[3] *= 2 - c_cpu = rand(shape, dtype='float32') - c = gpu_ndarray.GpuNdArrayObject(c_cpu)[::, ::, - ::, ::2] - c1 = MyGpuNdArray(c) - - err = ("strided", c.shape, shape2, - nd_collaps, expected, info) - nd_collaps, info = elemwise_collapses([c, b], [o]) - if broadcast: - assert nd_collaps >= expected, err - else: - assert nd_collaps == 1, err - pass - assert numpy.allclose(numpy.asarray( - MyGpuNdArray.adds(c1, b1)), - numpy.asarray(c) + b_cpu) - - -def test_elemwise_mixed_dtype(): - to_cpu = numpy.asarray - - for dtype1 in ["int16", "float32", "int8"]: - for dtype2 in ["int16", "float32", "int8"]: - dtypeo = str((numpy.zeros(1, dtype=dtype1) + - numpy.zeros(1, dtype=dtype2)).dtype) - #print "dtypes", dtype1, dtype2, "o dtype", dtypeo - - #print " Test inside a wrapping python object 2 inputs" - for shape in [(500,), (50, 5), (5, 6, 7)]: - input_vals = [rand(shape, dtype) for dtype in [dtype1, dtype2]] - del dtype - gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) - for i in input_vals] - assert all([numpy.allclose(to_cpu(ig), i) - for ig, i in zip(gpu_vals, input_vals)]) - - gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] - out = gpu_vals[0] + gpu_vals[1] - assert numpy.allclose(to_cpu(out), - input_vals[0] + input_vals[1]) - out = gpu_vals[0] - 
gpu_vals[1] - assert numpy.allclose(to_cpu(out), - input_vals[0] - input_vals[1]) - out = gpu_vals[0] * gpu_vals[1] - assert all_close(to_cpu(out), - input_vals[0] * input_vals[1]) - if dtypeo.startswith("float"): - # TODO: execute for all dtype - out = gpu_vals[0] / gpu_vals[1] - assert numpy.allclose(to_cpu(out), - input_vals[0] / input_vals[1]) - - nb_in = 4 - #print " Test inside a wrapping python object %d inputs"%nb_in - for shape in [(500,), (50, 5), (5, 6, 7)]: - input_vals = [rand(shape, dtype) - for dtype in [dtype1, dtype2, dtype1, dtype2]] - gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) - for i in input_vals] - assert all([numpy.allclose(to_cpu(ig), i) - for ig, i in zip(gpu_vals, input_vals)]) - - gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] - out = MyGpuNdArray.adds(*gpu_vals) - assert numpy.allclose(to_cpu(out), - reduce(numpy.add, input_vals)) - - out = MyGpuNdArray.multiplys(*gpu_vals) - assert all_close(to_cpu(out), - reduce(numpy.multiply, input_vals)) - - #print " Test broadcasting" - for shapes in [((1, 5), (4, 5)), ((33, 10), (33, 1)), - ((33, 1, 5), (33, 10, 1)), - ((33, 1, 5), (33, 10, 1), ((1, 10, 5))), - ]: - input_vals = [rand(shape, dtype) for shape, dtype - in zip(shapes, [dtype1, dtype2])] - gpu_vals = [gpu_ndarray.GpuNdArrayObject(i) - for i in input_vals] - assert all([numpy.allclose(to_cpu(ig), i) - for ig, i in zip(gpu_vals, input_vals)]) - - gpu_vals = [MyGpuNdArray(x) for x in gpu_vals] - out = MyGpuNdArray.adds(*gpu_vals) - assert numpy.allclose(to_cpu(out), - reduce(numpy.add, input_vals)) - - out = MyGpuNdArray.multiplys(*gpu_vals) - assert all_close(to_cpu(out), - reduce(numpy.multiply, input_vals)) - - -def test_sum(): - to_cpu = numpy.asarray - dtypes = list(dtypes_all) - # I remove *int8 as currently the output have the same dtype - # And this cause overflow - dtypes.remove("int8") - dtypes.remove("uint8") - # I need to find how pycuda handle complexe in c. - # I probably just need to add an header. - dtypes.remove("complex64") - if enable_double: - dtypes.remove("complex128") - for shape in [ - # need something bigger then 32, 1024 or 4096. - # Those are corner case. 
- - # 1d, take only a few seconds on a GTX470 - (0,), (5,), (31,), (32,), (33,), - (1023,), (1024,), (1025,), - (4095,), (4096,), (4097,), - (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,), - - # 2d, take 2 minutes on a GTX 470 - (0, 0), (1, 0), (0, 1,), (5, 4), - (31, 31), (31, 32), (31, 33), - (32, 31), (32, 32), (32, 33), - (33, 31), (33, 32), (33, 33), - (1024, 32), (1025, 32), - (1024, 33), (1025, 33), - (4096, 32), (32, 4096), (4096, 33), (33, 4096), - (4097, 32), (32, 4097), (4097, 33), (33, 4097), - - # 3d, take 2 minutes on a GTX 470 - (0, 0, 0), (0, 1, 0), (0, 0, 1), - (5, 4, 3), (5, 4, 3), (5, 4, 3), - (4096, 2, 33), (2, 4096, 33), (33, 2, 4096), - (4097, 2, 33), (2, 4097, 33), (33, 2, 4097), - (4096, 33, 2), (33, 4096, 2), (2, 33, 4096), - (4097, 33, 2), (33, 4097, 2), (2, 33, 4097), - - # 4d, take 1 minutes on a GTX 470 - (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0), - (0, 0, 1, 0), (0, 0, 0, 1), - (5, 4, 3, 2), - (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32), - (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32), - (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33), - (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33), - (4100, 4, 3, 2), (4, 4100, 3, 2), - (4, 3, 4100, 2), (4, 3, 2, 4100), - - # 5d, work only if c contiguous - (5, 4, 3, 10, 11), - ]: - - for dtype, off_o, off_i, sliced, order in product( - *([dtypes] + - [[False, True]] + - [[False, True]] + - [[-1, 2, -2, 1]] + - [['f', 'c']])): - - cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o, - off_i, sliced, order) - - if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or - gpu_val.flags["F_CONTIGUOUS"]): - continue - gpu_val = MyGpuNdArray(gpu_val) - cpu_sum = cpu_val.sum() -# print dtype, shape, off_o, off_i, sliced, order -# print (cpu_val.strides, -# cpu_val.flags["C_CONTIGUOUS"], -# cpu_val.flags["F_CONTIGUOUS"]) -# print (gpu_val.strides, -# gpu_val.flags["C_CONTIGUOUS"], -# gpu_val.flags["F_CONTIGUOUS"]) - gpu_sum = to_cpu(gpu_val.sum()) - - def get_rtol(orig, after_reduction): - if after_reduction.size == 0: - return 0 - if orig.size // after_reduction.size > 500000: - rtols = {"float32": 4.3e-5} - elif orig.size // after_reduction.size > 100000: - rtols = {"float32": 3e-5} - elif orig.size // after_reduction.size > 50000: - rtols = {"float32": 2e-5} - else: - rtols = {"float32": 1e-5} - if dtype in rtols: - rtol = rtols[dtype] - else: - rtol = 1e-8 - return rtol - rtol = get_rtol(gpu_val, gpu_sum) - cpu_sum = cpu_sum.astype(dtype) - if not (dtype.endswith("int16") and numpy.prod(shape) > 20000): - assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or - cpu_sum == gpu_sum), ( - dtype, shape, cpu_sum, gpu_sum, - (cpu_sum - gpu_sum) / cpu_sum) - - # Test pattern 10 and 01 - # Test pattern 100, 010 and 001 - if len(shape) in [2, 3]: - for axis in range(len(shape)): - gpu_sum = to_cpu(gpu_val.sum(axis=[axis])) - cpu_sum = cpu_val.sum(axis=axis) - rtol = get_rtol(gpu_val, gpu_sum) - if cpu_sum.size > 0: - argmax = numpy.absolute(cpu_sum - gpu_sum).argmax() - cpu_max = cpu_sum.flatten()[argmax] - gpu_max = gpu_sum.flatten()[argmax] - assert numpy.allclose(cpu_sum, gpu_sum), ( - "axis=%d" % axis, dtype, shape, cpu_sum.shape, - cpu_sum, gpu_sum, - cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max) diff --git a/ndarray/test_gpu_ndarray.py b/ndarray/test_gpu_ndarray.py deleted file mode 100644 index 7e88165..0000000 --- a/ndarray/test_gpu_ndarray.py +++ /dev/null @@ -1,488 +0,0 @@ -import copy - -import numpy -import pygpu_ndarray as gpu_ndarray - - -enable_double = True -enable_double = False - 
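The get_rtol helper in test_sum above loosens the comparison tolerance as more float32 elements are folded into each output value, since accumulation error grows with the reduction ratio. A minimal, detached sketch of that idea in plain numpy follows; the reduction_rtol name and the 1024x33 shape are illustrative (not from the deleted tests), while the thresholds mirror get_rtol:

```python
import numpy

def reduction_rtol(orig_size, reduced_size, dtype):
    """Pick an rtol that grows with the reduction ratio (thresholds as in get_rtol)."""
    if reduced_size == 0:
        return 0.0
    ratio = orig_size // reduced_size
    if dtype == "float32":
        if ratio > 500000:
            return 4.3e-5
        if ratio > 100000:
            return 3e-5
        if ratio > 50000:
            return 2e-5
        return 1e-5
    return 1e-8

a = numpy.random.rand(1024, 33).astype("float32")
ref = a.astype("float64").sum(axis=0)   # high-precision reference
got = a.sum(axis=0)                     # float32 result under test
rtol = reduction_rtol(a.size, got.size, "float32")
assert numpy.allclose(ref, got, rtol=rtol)
```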
-dtypes_all = ["float32", - "int8", "int16", "int32", "int64", - "uint8", "uint16", "uint32", "uint64", - "complex64", - ] - -dtypes_no_complex = ["float32", - "int8", "int16", "int32", "int64", - "uint8", "uint16", "uint32", "uint64", - ] -if enable_double: - dtypes_all += ["float64", "complex128"] - dtypes_no_complex += ["float64"] - - -def check_flags(x, y): - assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] - assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] - assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] - assert x.flags["OWNDATA"] == y.flags["OWNDATA"] - assert x.flags["ALIGNED"] == y.flags["ALIGNED"] - assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] - - -def check_meta(x, y): - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - check_flags(x, y) - - -def check_all(x, y): - check_meta(x, y) - assert numpy.allclose(numpy.asarray(x), numpy.asarray(y)) - - -def gen_gpu_nd_array(shape_orig, dtype='float32', offseted_outer=False, - offseted_inner=False, sliced=1, order='c'): - if sliced is True: - sliced = 2 - elif sliced is False: - sliced = 1 - shape = numpy.asarray(shape_orig).copy() - if sliced != 1 and len(shape) > 0: - shape[0] *= numpy.absolute(sliced) - if offseted_outer and len(shape) > 0: - shape[0] += 1 - if offseted_inner and len(shape) > 0: - shape[-1] += 1 - - a = numpy.random.rand(*shape) * 10 - if dtype.startswith("u"): - a = numpy.absolute(a) - a = numpy.asarray(a, dtype=dtype) - assert order in ['c', 'f'] - if order == 'f' and len(shape) > 0: - a = numpy.asfortranarray(a) - b = gpu_ndarray.GpuNdArrayObject(a) - if order == 'f' and len(shape) > 0 and b.size > 1: - assert b.flags['F_CONTIGUOUS'] - - if offseted_outer and len(shape) > 0: - b = b[1:] - a = a[1:] - assert b.offset != 0 - if offseted_inner and len(shape) > 0: - # The b[..., 1:] act as the test for this subtensor case. 
- b = b[..., 1:] - a = a[..., 1:] - assert b.offset != 0 - if sliced != 1 and len(shape) > 0: - a = a[::sliced] - b = b[::sliced] - - if False and shape_orig == (): - assert a.shape == (1,) - assert b.shape == (1,) - else: - assert a.shape == shape_orig, (a.shape, shape_orig) - assert b.shape == shape_orig, (b.shape, shape_orig) - - assert numpy.allclose(a, numpy.asarray(b)) - - return a, b - - -def product(*args, **kwds): - # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy - # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 - pools = list(map(tuple, args)) * kwds.get('repeat', 1) - result = [[]] - for pool in pools: - result = [x + [y] for x in result for y in pool] - for prod in result: - yield tuple(prod) - - -def test_transfer(): - for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [True, False]: - a, b = gen_gpu_nd_array(shp, dtype, offseted) - c = numpy.asarray(b) - - assert numpy.allclose(c, a) - assert a.shape == b.shape == c.shape - assert a.strides == b.strides == c.strides - assert a.dtype == b.dtype == c.dtype == dtype - assert c.flags.c_contiguous - - -def test_transfer_not_contiguous(): - """ - Test transfer when the input on the CPU is not contiguous - TODO: test when the input on the gpu is not contiguous - """ - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - a = numpy.random.rand(*shp) * 10 - a = a[::-1] - b = gpu_ndarray.GpuNdArrayObject(a) - c = numpy.asarray(b) - - assert numpy.allclose(c, a) - assert a.shape == b.shape == c.shape - # We copy a to a c contiguous array before the transfer - assert (-a.strides[0],) + a.strides[1:] == b.strides == c.strides - assert a.dtype == b.dtype == c.dtype - assert c.flags.c_contiguous - - -def test_transfer_fortran(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - a = numpy.random.rand(*shp) * 10 - a_ = numpy.asfortranarray(a) - if len(shp) > 1: - assert a_.strides != a.strides - a = a_ - b = gpu_ndarray.GpuNdArrayObject(a) - c = numpy.asarray(b) - - assert a.shape == b.shape == c.shape - assert a.dtype == b.dtype == c.dtype - assert a.flags.f_contiguous - assert c.flags.f_contiguous - assert a.strides == b.strides == c.strides - assert numpy.allclose(c, a) - - -def test_ascontiguousarray(): - for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted_o in [True, False]: - for offseted_i in [True, True]: - for sliced in [1, 2, -1, -2]: - for order in ['f', 'c']: - #print shp, dtype, offseted_o, offseted_i, - #print sliced, order - cpu, gpu = gen_gpu_nd_array(shp, dtype, offseted_o, - offseted_i, - sliced, order) - - a = numpy.ascontiguousarray(cpu) - b = gpu_ndarray.ascontiguousarray(gpu) - - # numpy upcast with a view to 1d scalar. 
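The product helper defined above is a pure-Python re-implementation of itertools.product (its docstring examples are the ones from the itertools documentation). A quick equivalence check, assuming nothing beyond the standard library:

```python
import itertools

def product(*args, **kwds):
    # Same expansion as the deleted helper: grow the result pool by pool.
    pools = list(map(tuple, args)) * kwds.get('repeat', 1)
    result = [[]]
    for pool in pools:
        result = [x + [y] for x in result for y in pool]
    for prod in result:
        yield tuple(prod)

assert list(product('ABCD', 'xy')) == list(itertools.product('ABCD', 'xy'))
assert list(product(range(2), repeat=3)) == list(itertools.product(range(2), repeat=3))
```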
- if (sliced != 1 or shp == () or - (offseted_i and len(shp) > 1)): - assert b is not gpu - if sliced == 1 and not offseted_i: - assert ((a.data is cpu.data) == - (b.bytes is gpu.bytes)) - else: - assert b is gpu - - assert a.shape == b.shape - assert a.dtype == b.dtype - assert a.flags.c_contiguous - assert b.flags['C_CONTIGUOUS'] - assert a.strides == b.strides - assert numpy.allclose(cpu, a) - assert numpy.allclose(cpu, b) - - -def test_asfortranarray(): - for shp in [(), (5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted_outer in [True, False]: - for offseted_inner in [True, False]: - for sliced in [1, 2, -1, -2]: - for order in ['f', 'c']: -#print shp, dtype, offseted_outer, offseted_inner, sliced, order - cpu, gpu = gen_gpu_nd_array(shp, dtype, - offseted_outer, - offseted_inner, - sliced, - order) - - a = numpy.asfortranarray(cpu) - b = gpu_ndarray.asfortranarray(gpu) - - # numpy upcast with a view to 1d scalar. - if (sliced != 1 or shp == () or - (offseted_outer and len(shp) > 1) or - (order != 'f' and len(shp) > 1)): - assert b is not gpu - if (sliced == 1 and not offseted_outer and - order != 'c'): - assert ((a.data is cpu.data) == - (b.bytes is gpu.bytes)) - else: - assert b is gpu - pass - - assert a.shape == b.shape - assert a.dtype == b.dtype - assert a.flags.f_contiguous - if shp != (): - assert b.flags['F_CONTIGUOUS'] - assert a.strides == b.strides - assert numpy.allclose(cpu, a) - assert numpy.allclose(cpu, b) - - -def test_zeros(): - for shp in [(), (0,), (5,), - (0, 0), (1, 0), (0, 1), (6, 7), - (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), - (4, 8, 9), (1, 8, 9)]: - for order in ["C", "F"]: - for dtype in dtypes_all: - x = numpy.zeros(shp, dtype, order) - y = gpu_ndarray.zeros(shp, dtype, order) - check_all(x, y) - x = gpu_ndarray.zeros(()) # no dtype and order param - y = numpy.zeros(()) - check_meta(x, y) - - try: - gpu_ndarray.zeros() - assert False - except TypeError: - pass - - -def test_empty(): - for shp in [(), (0,), (5,), - (0, 0), (1, 0), (0, 1), (6, 7), - (0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1), - (4, 8, 9), (1, 8, 9)]: - for order in ["C", "F"]: - for dtype in dtypes_all: - x = numpy.empty(shp, dtype, order) - y = gpu_ndarray.empty(shp, dtype, order) - check_meta(x, y) - x = gpu_ndarray.empty(()) # no dtype and order param - y = numpy.empty(()) - check_meta(x, y) - try: - gpu_ndarray.empty() - assert False - except TypeError: - pass - - -def test_mapping_getitem_ellipsis(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [True, False]: - a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) - - b = a_gpu[...] - assert b.bytes == a_gpu.bytes - assert b.strides == a.strides - assert b.shape == a.shape - b_cpu = numpy.asarray(b) - assert numpy.allclose(a, b_cpu) - - -def test_copy_view(): - from ..array import may_share_memory - - def check_memory_region(a, a_op, b, b_op): - assert numpy.may_share_memory(a, a_op) == may_share_memory(b, b_op) - - if a_op.base is None: - assert b_op.base is None - else: - assert a_op.base is a - if b.base: - # We avoid having a series of object connected by base. - # This is to don't bloc the garbage collection. 
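test_ascontiguousarray and test_asfortranarray check when the GPU calls hand back the same object versus a fresh contiguous copy, mirroring numpy's behaviour (the GPU side compares b.bytes where numpy compares the .data buffer). A small plain-numpy sketch of that copy-versus-no-copy rule, with shapes chosen only for illustration:

```python
import numpy

a = numpy.arange(12, dtype="float32").reshape(3, 4)
assert numpy.ascontiguousarray(a) is a        # already C-contiguous: no copy made

view = a[:, ::2]                              # strided view, not C-contiguous
fixed = numpy.ascontiguousarray(view)
assert fixed is not view                      # a fresh, packed copy
assert fixed.flags["C_CONTIGUOUS"]
assert numpy.allclose(fixed, view)

f = numpy.asfortranarray(a)                   # column-major copy of a C-ordered array
assert f is not a and f.flags["F_CONTIGUOUS"]
assert numpy.asfortranarray(f) is f           # already F-contiguous: returned as-is
```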
- assert b_op.base is b.base - else: - assert b_op.base is b - - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [False, True]: - # order1 is the order of the original data - for order1 in ['c', 'f']: - # order2 is the order wanted after copy - for order2 in ['c', 'f']: - print(shp, dtype, offseted, order1, order2) - #TODO test copy unbroadcast! - a, b = gen_gpu_nd_array(shp, dtype, offseted, - order=order1) - - assert numpy.allclose(a, numpy.asarray(b)) - check_flags(a, b) - - c = b.copy(order2) - assert numpy.allclose(a, numpy.asarray(c)) - check_flags(c, a.copy(order2)) - check_memory_region(a, a.copy(order2), b, c) - - d = copy.copy(b) - assert numpy.allclose(a, numpy.asarray(d)) - check_flags(d, copy.copy(a)) - check_memory_region(a, copy.copy(a), b, d) - - e = b.view() - assert numpy.allclose(a, numpy.asarray(e)) - check_flags(e, a.view()) - check_memory_region(a, a.view(), b, e) - - f = copy.deepcopy(b) - assert numpy.allclose(a, numpy.asarray(f)) - check_flags(f, copy.deepcopy(a)) - check_memory_region(a, copy.deepcopy(a), b, f) - - g = copy.copy(b.view()) - assert numpy.allclose(a, numpy.asarray(g)) - check_memory_region(a, copy.copy(a.view()), b, g) - check_flags(g, copy.copy(a.view())) - - -def test_len(): - for shp in [(5,), (6, 7), (4, 8, 9), (1, 8, 9)]: - for dtype in dtypes_all: - for offseted in [True, False]: - a, a_gpu = gen_gpu_nd_array(shp, dtype, offseted) - assert len(a_gpu) == shp[0] - - -def test_mapping_getitem_w_int(): - def _cmp(x, y): - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] - assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] - if x.flags["WRITEABLE"] != y.flags["WRITEABLE"]: - assert x.ndim == 0 - assert not x.flags["OWNDATA"] - assert y.flags["OWNDATA"] - else: - assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] - assert x.flags["OWNDATA"] == y.flags["OWNDATA"] - assert x.flags["ALIGNED"] == y.flags["ALIGNED"] - assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] - x = numpy.asarray(x) - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - if not numpy.all(x == y): - print(x) - print(y) - assert numpy.all(x == y), (x, y) - - def _cmpNs(x, y): - """ - Don't compare the stride after the transfer - There is a copy that have been made on the gpu before the transfer - """ - assert x.shape == y.shape - assert x.dtype == y.dtype - assert x.strides == y.strides - assert x.flags["C_CONTIGUOUS"] == y.flags["C_CONTIGUOUS"] - assert x.flags["F_CONTIGUOUS"] == y.flags["F_CONTIGUOUS"] - assert x.flags["WRITEABLE"] == y.flags["WRITEABLE"] - assert x.flags["ALIGNED"] == y.flags["ALIGNED"] - assert x.flags["OWNDATA"] == y.flags["OWNDATA"] - assert x.flags["UPDATEIFCOPY"] == y.flags["UPDATEIFCOPY"] - x_ = numpy.asarray(x) - assert x_.shape == y.shape - assert x_.dtype == y.dtype - if not numpy.all(x_ == y): - print(x_) - print(y) - assert numpy.all(x_ == y), (x_, y) - pass - - def _cmpf(x, *y): - try: - x.__getitem__(y) - except IndexError: - pass - else: - raise Exception("Did not generate out or bound error") - - def _cmpfV(x, *y): - try: - if len(y) == 1: - x.__getitem__(*y) - else: - x.__getitem__(y) - except ValueError: - pass - else: - raise Exception("Did not generate value error") - - for dtype in dtypes_all: - for offseted in [True, False]: - # test vector - dim = (2,) - a, _a = gen_gpu_nd_array(dim, dtype, offseted) - - import sys - init_ref_count = 
sys.getrefcount(_a) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - _cmp(_a[...], a[...]) - - _cmp(_a[-1], a[-1]) - _cmp(_a[1], a[1]) - _cmp(_a[0], a[0]) - _cmp(_a[::1], a[::1]) - _cmpNs(_a[::-1], a[::-1]) - _cmp(_a[...], a[...]) - _cmpf(_a, 2) - - # test scalar - dim = () - a, _a = gen_gpu_nd_array(dim, dtype, offseted) - _cmp(_a[...], a[...]) - _cmpf(_a, 0) - _cmpfV(_a, slice(1)) - - # test 4d-tensor - dim = (5, 4, 3, 2) - a, _a = gen_gpu_nd_array(dim, dtype, offseted) - _cmpf(_a, slice(-1), slice(-1), 10, -10) - _cmpf(_a, slice(-1), slice(-1), -10, slice(-1)) - _cmpf(_a, 0, slice(0, -1, -20), -10) - _cmpf(_a, 10) - _cmpf(_a, (10, 0, 0, 0)) - _cmpf(_a, -10) - - #test with integer - _cmp(_a[1], a[1]) - _cmp(_a[-1], a[-1]) - _cmp(_a[numpy.int64(1)], a[numpy.int64(1)]) - _cmp(_a[numpy.int64(-1)], a[numpy.int64(-1)]) - - #test with slice - _cmp(_a[1:], a[1:]) - _cmp(_a[1:2], a[1:2]) - _cmp(_a[-1:1], a[-1:1]) - - #test with tuple (mix slice, integer, numpy.int64) - _cmpNs(_a[0, 0, ::numpy.int64(-1), ::-1], a[0, 0, ::-1, ::-1]) - _cmpNs(_a[:, :, ::numpy.int64(-1), ::-1], a[:, :, ::-1, ::-1]) - _cmpNs(_a[:, :, numpy.int64(1), -1], a[:, :, 1, -1]) - _cmpNs(_a[:, :, ::-1, ::-1], a[:, :, ::-1, ::-1]) - _cmpNs(_a[:, :, ::-10, ::-10], a[:, :, ::-10, ::-10]) - _cmpNs(_a[:, :, 1, -1], a[:, :, 1, -1]) - _cmpNs(_a[:, :, -1, :], a[:, :, -1, :]) - _cmpNs(_a[:, ::-2, -1, :], a[:, ::-2, -1, :]) - _cmpNs(_a[:, ::-20, -1, :], a[:, ::-20, -1, :]) - _cmpNs(_a[:, ::-2, -1], a[:, ::-2, -1]) - _cmpNs(_a[0, ::-2, -1], a[0, ::-2, -1]) - _cmp(_a[-1, -1, -1, -2], a[-1, -1, -1, -2]) - - #test ellipse - _cmp(_a[...], a[...])
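test_mapping_getitem_w_int records sys.getrefcount(_a) before exercising __getitem__ repeatedly; the visible snippet only stores that baseline, but the natural use of such a counter is to catch reference leaks from repeated indexing. A hedged plain-Python sketch of the technique, with numpy standing in for the GPU array and the loop and final assert being illustrative rather than taken from the deleted test:

```python
import sys
import numpy

a = numpy.arange(10)
baseline = sys.getrefcount(a)

for _ in range(100):
    view = a[::-1]      # each view keeps `a` alive through its .base reference
    del view            # dropping the view must release that reference

assert sys.getrefcount(a) == baseline   # no references leaked by __getitem__
```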