diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 1dc1c81cdc..0166ece8f7 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -520,7 +520,7 @@ void vector_div_constant_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const b
     // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_div_constant_kernel<FPTYPE> << <block, thread >> > (dim, result, vector, constant);
+    vector_div_constant_kernel<FPTYPE> <<<block, thread>>> (dim, result, vector, constant);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -539,7 +539,7 @@ inline void vector_div_constant_complex_wrapper(const base_device::DEVICE_GPU* d
 
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_div_constant_kernel<thrust::complex<FPTYPE>> << <block, thread >> > (dim, result_tmp, vector_tmp, constant);
+    vector_div_constant_kernel<thrust::complex<FPTYPE>> <<<block, thread>>> (dim, result_tmp, vector_tmp, constant);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -573,7 +573,7 @@ void vector_mul_vector_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const bas
 {
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<FPTYPE> << <block, thread >> > (dim, result, vector1, vector2);
+    vector_mul_vector_kernel<FPTYPE> <<<block, thread>>> (dim, result, vector1, vector2);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -590,7 +590,7 @@ inline void vector_mul_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<thrust::complex<FPTYPE>> << <block, thread >> > (dim, result_tmp, vector1_tmp, vector2);
+    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread>>> (dim, result_tmp, vector1_tmp, vector2);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -625,7 +625,7 @@ void vector_div_vector_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const bas
 {
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_div_vector_kernel<FPTYPE> << <block, thread >> > (dim, result, vector1, vector2);
+    vector_div_vector_kernel<FPTYPE> <<<block, thread>>> (dim, result, vector1, vector2);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -642,7 +642,7 @@ inline void vector_div_vector_complex_wrapper(const base_device::DEVICE_GPU* d,
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_div_vector_kernel<thrust::complex<FPTYPE>> << <block, thread >> > (dim, result_tmp, vector1_tmp, vector2);
+    vector_div_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread>>> (dim, result_tmp, vector1_tmp, vector2);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -989,7 +989,7 @@ void matrixTranspose_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_
 {
     int thread = 1024;
     int block = (row + col + thread - 1) / thread;
-    matrix_transpose_kernel<FPTYPE> << <block, thread >> > (row, col, input_matrix, device_temp);
+    matrix_transpose_kernel<FPTYPE> <<<block, thread>>> (row, col, input_matrix, device_temp);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -1031,7 +1031,7 @@ void matrixTranspose_op<std::complex<float>, base_device::DEVICE_GPU>::operator(
 {
     int thread = 1024;
     int block = (row + col + thread - 1) / thread;
-    matrix_transpose_kernel<thrust::complex<float>> << <block, thread >> > (row, col, (thrust::complex<float>*)input_matrix, (thrust::complex<float>*)device_temp);
+    matrix_transpose_kernel<thrust::complex<float>> <<<block, thread>>> (row, col, (thrust::complex<float>*)input_matrix, (thrust::complex<float>*)device_temp);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -1075,7 +1075,7 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 {
     int thread = 1024;
     int block = (row + col + thread - 1) / thread;
-    matrix_transpose_kernel<thrust::complex<double>> << <block, thread >> > (row, col, (thrust::complex<double>*)input_matrix, (thrust::complex<double>*)device_temp);
+    matrix_transpose_kernel<thrust::complex<double>> <<<block, thread>>> (row, col, (thrust::complex<double>*)input_matrix, (thrust::complex<double>*)device_temp);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -1098,7 +1098,7 @@ void matrixSetToAnother<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_
 {
     int thread = 1024;
     int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<FPTYPE> << <block, thread >> > (n, LDA, LDB, A, B);
+    matrix_setTo_another_kernel<FPTYPE> <<<block, thread>>> (n, LDA, LDB, A, B);
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -1113,7 +1113,7 @@ void matrixSetToAnother<std::complex<float>, base_device::DEVICE_GPU>::operator(
 {
     int thread = 1024;
     int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<float>> << <block, thread >> > (n, LDA, LDB, reinterpret_cast<thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
+    matrix_setTo_another_kernel<thrust::complex<float>> <<<block, thread>>> (n, LDA, LDB, reinterpret_cast<thrust::complex<float>*>(A), reinterpret_cast<thrust::complex<float>*>(B));
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
@@ -1128,7 +1128,7 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
 {
     int thread = 1024;
     int block = (LDA + thread - 1) / thread;
-    matrix_setTo_another_kernel<thrust::complex<double>> << <block, thread >> > (n, LDA, LDB, reinterpret_cast<thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
+    matrix_setTo_another_kernel<thrust::complex<double>> <<<block, thread>>> (n, LDA, LDB, reinterpret_cast<thrust::complex<double>*>(A), reinterpret_cast<thrust::complex<double>*>(B));
 
     cudaErrcheck(cudaGetLastError());
     cudaErrcheck(cudaDeviceSynchronize());
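
For reference, below is a minimal, self-contained sketch of the launch idiom the patch settles on: the standard CUDA triple-chevron `kernel<<<block, thread>>>(args)` syntax followed by the `cudaGetLastError()` / `cudaDeviceSynchronize()` checks used throughout the file. The `cudaErrcheck` macro body, the example kernel body, the block size of 256, and `main()` are illustrative assumptions rather than code from the patch; only the launch and error-check pattern mirrors the '+' lines above.

// Sketch only: this cudaErrcheck is a stand-in; ABACUS defines its own error-check macro.
#include <cstdio>
#include <cuda_runtime.h>

#define cudaErrcheck(res)                                                                       \
    {                                                                                           \
        cudaError_t err__ = (res);                                                              \
        if (err__ != cudaSuccess)                                                               \
        {                                                                                       \
            printf("CUDA error: %s at %s:%d\n", cudaGetErrorString(err__), __FILE__, __LINE__); \
        }                                                                                       \
    }

// Illustrative kernel with the same shape as vector_div_constant_kernel in the patch.
template <typename FPTYPE>
__global__ void vector_div_constant_kernel(const int size,
                                           FPTYPE* result,
                                           const FPTYPE* vector,
                                           const FPTYPE constant)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        result[idx] = vector[idx] / constant;
    }
}

int main()
{
    const int dim = 1 << 20;
    double* d_in = nullptr;
    double* d_out = nullptr;
    cudaErrcheck(cudaMalloc(&d_in, dim * sizeof(double)));
    cudaErrcheck(cudaMalloc(&d_out, dim * sizeof(double)));
    cudaErrcheck(cudaMemset(d_in, 0, dim * sizeof(double)));

    const int thread = 256; // plays the role of thread_per_block in the real file (assumed value)
    const int block = (dim + thread - 1) / thread;

    // The corrected launch syntax from the '+' lines: an unbroken <<< ... >>> pair.
    vector_div_constant_kernel<double> <<<block, thread>>> (dim, d_out, d_in, 2.0);

    cudaErrcheck(cudaGetLastError());       // catches invalid launch configurations
    cudaErrcheck(cudaDeviceSynchronize());  // catches errors raised during kernel execution

    cudaErrcheck(cudaFree(d_in));
    cudaErrcheck(cudaFree(d_out));
    return 0;
}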