From f125ec4769702b2e0e58db58ca6a62617721af3e Mon Sep 17 00:00:00 2001 From: d3v-null Date: Fri, 21 Jun 2024 06:13:11 +0000 Subject: [PATCH] 0.9.1 fixes #11 go back to stack allocation --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/fee/gpu/fee.h | 98 +++++++++++++++++++++++++---------------------- 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4580410..341b4c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -869,7 +869,7 @@ dependencies = [ [[package]] name = "mwa_hyperbeam" -version = "0.9.0" +version = "0.9.1" dependencies = [ "approx", "cbindgen", diff --git a/Cargo.toml b/Cargo.toml index 4a94427..b956c00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mwa_hyperbeam" -version = "0.9.0" +version = "0.9.1" authors = [ "Christopher H. Jordan ", "Jack L. B. Line ", diff --git a/src/fee/gpu/fee.h b/src/fee/gpu/fee.h index 1878871..5718c2b 100644 --- a/src/fee/gpu/fee.h +++ b/src/fee/gpu/fee.h @@ -215,50 +215,49 @@ extern "C" } } - inline __device__ int lidx_device(const int i_direction, const int l, const int m) + inline __device__ int lidx_device(const int l, const int m) { // lengendre table is now ((NMAX + 1)*(NMAX + 2)) >> 1) by n_dirs which is why i_direction is now needed. // summation series over l + m => (l*(l+1))/2 + m - return i_direction * (((NMAX + 1) * (NMAX + 2)) >> 1) + ((l * (l + 1)) >> 1) + m; + return ((l * (l + 1)) >> 1) + m; } - inline __device__ int Pidx_device(const int i_direction, const int i) + inline __device__ int Pidx_device(const int i) { // all P arrays are now NMAX*(NMAX + 2) by i_direction, so that's why we need this. // Used for P1sin_arr and P1_arr - return i_direction * NMAX * (NMAX + 2) + i; + return i; } - inline __device__ void legendre_polynomials_device(FLOAT *legendre, const FLOAT *thetas, int i_direction) + inline __device__ void legendre_polynomials_device(FLOAT *legendre, const FLOAT u) { int l, m; - const FLOAT x = COS(thetas[i_direction]); - const FLOAT factor = -SQRT(1.0 - (x * x)); + const FLOAT factor = -SQRT(1.0 - (u * u)); // Init legendre - legendre[lidx_device(i_direction, 0, 0)] = 1.0; // P_0,0(x) = 1 + legendre[lidx_device(0, 0)] = 1.0; // P_0,0(u) = 1 // Easy values - legendre[lidx_device(i_direction, 1, 0)] = x; // P_1,0(x) = x - legendre[lidx_device(i_direction, 1, 1)] = factor; // P_1,1(x) = 342210222sqrt(1 342210222 x^2) + legendre[lidx_device(1, 0)] = u; // P_1,0(u) = u + legendre[lidx_device(1, 1)] = factor; // P_1,1(u) = -sqrt(1 - u^2) for (l = 2; l <= NMAX; ++l) { for (m = 0; m < l - 1; ++m) { - // P_l,m = (2l-1)*x*P_l-1,m - (l+m-1)*x*P_l-2,m / (l-k) - legendre[lidx_device(i_direction, l, m)] = ((FLOAT)(2 * l - 1) * x * legendre[lidx_device(i_direction, l - 1, m)] - - (FLOAT)(l + m - 1) * legendre[lidx_device(i_direction, l - 2, m)]) / - (FLOAT)(l - m); + // P_l,m = (2l-1)*u*P_l-1,m - (l+m-1)*u*P_l-2,m / (l-k) + legendre[lidx_device(l, m)] = ((FLOAT)(2 * l - 1) * u * legendre[lidx_device(l - 1, m)] - + (FLOAT)(l + m - 1) * legendre[lidx_device(l - 2, m)]) / + (FLOAT)(l - m); } - // P_l,l-1 = (2l-1)*x*P_l-1,l-1 - legendre[lidx_device(i_direction, l, l - 1)] = (FLOAT)(2 * l - 1) * x * legendre[lidx_device(i_direction, l - 1, l - 1)]; + // P_l,l-1 = (2l-1)*u*P_l-1,l-1 + legendre[lidx_device(l, l - 1)] = (FLOAT)(2 * l - 1) * u * legendre[lidx_device(l - 1, l - 1)]; // P_l,l = (2l-1)*factor*P_l-1,l-1 - legendre[lidx_device(i_direction, l, l)] = (FLOAT)(2 * l - 1) * factor * legendre[lidx_device(i_direction, l - 1, l - 1)]; + legendre[lidx_device(l, l)] = (FLOAT)(2 * l - 1) * factor * legendre[lidx_device(l - 1, l - 1)]; } } - inline __device__ int jones_p1sin_device(const FLOAT *thetas, int i_direction, FLOAT *p1sin_out, FLOAT *p1_out, FLOAT *legendret) + inline __device__ int jones_p1sin_device(const FLOAT theta, FLOAT *p1sin_out, FLOAT *p1_out, FLOAT *legendret) { int n, m; int ind_start, ind_stop; @@ -268,7 +267,6 @@ extern "C" FLOAT P[NMAX + 1], Pm1[NMAX + 1], Pm_sin[NMAX + 1], Pu_mdelu[NMAX + 1], Pm_sin_merged[NMAX * 2 + 1], Pm1_merged[NMAX * 2 + 1]; - const FLOAT theta = thetas[i_direction]; SINCOS(theta, &sin_th, &u); for (n = 1; n <= NMAX; n++) @@ -276,7 +274,7 @@ extern "C" int i; for (m = 0; m != n + 1; ++m) { - P[m] = legendret[lidx_device(i_direction, n, m)]; + P[m] = legendret[lidx_device(n, m)]; } memcpy(Pm1, &(P[1]), n * sizeof(FLOAT)); Pm1[n] = 0; @@ -313,7 +311,7 @@ extern "C" modified = 0; for (i = ind_start; i < ind_stop; i++) { - p1sin_out[Pidx_device(i_direction, i)] = Pm_sin_merged[modified]; + p1sin_out[Pidx_device(i)] = Pm_sin_merged[modified]; modified++; } @@ -324,7 +322,7 @@ extern "C" modified = 0; for (i = ind_start; i < ind_stop; i++) { - p1_out[Pidx_device(i_direction, i)] = Pm1_merged[modified]; + p1_out[Pidx_device(i)] = Pm1_merged[modified]; modified++; } } @@ -332,13 +330,12 @@ extern "C" return NMAX; } - inline __device__ void jones_calc_sigmas_device(const FLOAT phi, const FLOAT *thetas, const int i_direction, const COMPLEX *q1_accum, + inline __device__ void jones_calc_sigmas_device(const FLOAT phi, const FLOAT u, const COMPLEX *q1_accum, const COMPLEX *q2_accum, const int8_t *m_accum, const int8_t *n_accum, const int8_t *m_signs, const int8_t *m_abs_m, const int coeff_length, const FLOAT *P1sin_arr, const FLOAT *P1_arr, const char pol, JONES *jm) { - const FLOAT u = COS(thetas[i_direction]); COMPLEX sigma_P = MAKE_COMPLEX(0.0, 0.0); COMPLEX sigma_T = MAKE_COMPLEX(0.0, 0.0); COMPLEX ejm_phi; @@ -361,15 +358,15 @@ extern "C" const COMPLEX j_power_n = J_POWERS[n % 4]; const COMPLEX q1 = q1_accum[i]; const COMPLEX q2 = q2_accum[i]; - const COMPLEX s1 = q2 * (P1sin_arr[Pidx_device(i_direction, i)] * FABS(M) * u); - const COMPLEX s2 = q1 * (P1sin_arr[Pidx_device(i_direction, i)] * M); - const COMPLEX s3 = q2 * P1_arr[Pidx_device(i_direction, i)]; + const COMPLEX s1 = q2 * (P1sin_arr[Pidx_device(i)] * FABS(M) * u); + const COMPLEX s2 = q1 * (P1sin_arr[Pidx_device(i)] * M); + const COMPLEX s3 = q2 * P1_arr[Pidx_device(i)]; const COMPLEX s4 = CSUB(s1, s2); const COMPLEX E_theta_mn = CMUL(j_power_n, CADD(s4, s3)); const COMPLEX j_power_np1 = J_POWERS[(n + 1) % 4]; - const COMPLEX o1 = q2 * (P1sin_arr[Pidx_device(i_direction, i)] * M); - const COMPLEX o2 = q1 * (P1sin_arr[Pidx_device(i_direction, i)] * FABS(M) * u); - const COMPLEX o3 = q1 * P1_arr[Pidx_device(i_direction, i)]; + const COMPLEX o1 = q2 * (P1sin_arr[Pidx_device(i)] * M); + const COMPLEX o2 = q1 * (P1sin_arr[Pidx_device(i)] * FABS(M) * u); + const COMPLEX o3 = q1 * P1_arr[Pidx_device(i)]; const COMPLEX o4 = CSUB(o1, o2); const COMPLEX E_phi_mn = CMUL(j_power_np1, CSUB(o4, o3)); sigma_P += CMUL(phi_comp, E_phi_mn); @@ -394,8 +391,9 @@ extern "C" * blockIdx.x * blockDim.x + threadIdx.x corresponds to direction. */ __global__ void fee_kernel(const FEECoeffs coeffs, const FLOAT *azs, const FLOAT *zas, const int num_directions, - const JONES *norm_jones, const FLOAT *latitude_rad, const int iau_order, JONES *fee_jones, - FLOAT *legendret, FLOAT *P1sin_arr, FLOAT *P1_arr) + const JONES *norm_jones, const FLOAT *latitude_rad, const int iau_order, JONES *fee_jones + // , FLOAT *legendret, FLOAT *P1sin_arr, FLOAT *P1_arr + ) { int i_direction = blockIdx.x * blockDim.x + threadIdx.x; @@ -403,25 +401,32 @@ extern "C" return; const FLOAT az = azs[i_direction]; - const FLOAT za = zas[i_direction]; + const FLOAT za = zas[i_direction]; // theta const FLOAT phi = M_PI_2 - az; + const FLOAT u = COS(za); + + const uint legendret_size = (((NMAX + 1) * (NMAX + 2)) >> 1); + const uint P_size = NMAX * (NMAX + 2); + FLOAT legendret[legendret_size]; + FLOAT P1sin_arr[P_size]; + FLOAT P1_arr[P_size]; // Create a look-up table for the legendre polynomials // Such that legendre_table[ m * nmax + (n-1) ] = legendre(n, m, u) - legendre_polynomials_device(legendret, zas, i_direction); + legendre_polynomials_device(legendret, u); // Set up our "P1sin" arrays. This is pretty expensive, but only depends // on the zenith angle and "n_max". - jones_p1sin_device(zas, i_direction, P1sin_arr, P1_arr, legendret); + jones_p1sin_device(za, P1sin_arr, P1_arr, legendret); const int x_offset = coeffs.x_offsets[blockIdx.y]; const int y_offset = coeffs.y_offsets[blockIdx.y]; JONES jm; - jones_calc_sigmas_device(phi, zas, i_direction, (const COMPLEX *)coeffs.x_q1_accum + x_offset, + jones_calc_sigmas_device(phi, u, (const COMPLEX *)coeffs.x_q1_accum + x_offset, (const COMPLEX *)coeffs.x_q2_accum + x_offset, coeffs.x_m_accum + x_offset, coeffs.x_n_accum + x_offset, coeffs.x_m_signs + x_offset, coeffs.x_m_abs_m + x_offset, coeffs.x_lengths[blockIdx.y], P1sin_arr, P1_arr, 'x', &jm); - jones_calc_sigmas_device(phi, zas, i_direction, (const COMPLEX *)coeffs.y_q1_accum + y_offset, + jones_calc_sigmas_device(phi, u, (const COMPLEX *)coeffs.y_q1_accum + y_offset, (const COMPLEX *)coeffs.y_q2_accum + y_offset, coeffs.y_m_accum + y_offset, coeffs.y_n_accum + y_offset, coeffs.y_m_signs + y_offset, coeffs.y_m_abs_m + y_offset, coeffs.y_lengths[blockIdx.y], P1sin_arr, P1_arr, 'y', &jm); @@ -451,19 +456,20 @@ extern "C" const FLOAT *d_latitude_rad, const int iau_order, void *d_results) { // Allocate device memory for legendre polynomials - FLOAT *d_legendret; - FLOAT *d_P1sin_arr, *d_P1_arr; // TODO: replace NMAX with d_coeffs->n_max - GPUCHECK(gpuMalloc(&d_legendret, num_directions * (((NMAX + 1) * (NMAX + 2)) >> 1) * sizeof(FLOAT))); - GPUCHECK(gpuMalloc(&d_P1sin_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT))); - GPUCHECK(gpuMalloc(&d_P1_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT))); + // FLOAT *d_legendret, *d_P1sin_arr, *d_P1_arr; + // GPUCHECK(gpuMalloc(&d_legendret, num_directions * (((NMAX + 1) * (NMAX + 2)) >> 1) * sizeof(FLOAT))); + // GPUCHECK(gpuMalloc(&d_P1sin_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT))); + // GPUCHECK(gpuMalloc(&d_P1_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT))); dim3 gridDim, blockDim; blockDim.x = num_directions < warpSize ? num_directions : warpSize; gridDim.x = num_directions < warpSize ? 1 : (num_directions - 1) / blockDim.x + 1; gridDim.y = num_coeffs; fee_kernel<<>>(*d_coeffs, d_azs, d_zas, num_directions, (JONES *)d_norm_jones, d_latitude_rad, - iau_order, (JONES *)d_results, d_legendret, d_P1sin_arr, d_P1_arr); + iau_order, (JONES *)d_results + // , d_legendret, d_P1sin_arr, d_P1_arr + ); #ifdef DEBUG GPUCHECK(gpuDeviceSynchronize()); @@ -471,9 +477,9 @@ extern "C" GPUCHECK(gpuGetLastError()); // Free device memory - GPUCHECK(gpuFree(d_legendret)); - GPUCHECK(gpuFree(d_P1sin_arr)); - GPUCHECK(gpuFree(d_P1_arr)); + // GPUCHECK(gpuFree(d_legendret)); + // GPUCHECK(gpuFree(d_P1sin_arr)); + // GPUCHECK(gpuFree(d_P1_arr)); return NULL; }