Skip to content

Commit

Permalink
0.9.1 fixes #11 go back to stack allocation
Browse files Browse the repository at this point in the history
  • Loading branch information
d3v-null committed Jun 21, 2024
1 parent 283f33e commit f125ec4
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "mwa_hyperbeam"
version = "0.9.0"
version = "0.9.1"
authors = [
"Christopher H. Jordan <[email protected]>",
"Jack L. B. Line <[email protected]>",
Expand Down
98 changes: 52 additions & 46 deletions src/fee/gpu/fee.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,50 +215,49 @@ extern "C"
}
}

inline __device__ int lidx_device(const int i_direction, const int l, const int m)
inline __device__ int lidx_device(const int l, const int m)
{
// lengendre table is now ((NMAX + 1)*(NMAX + 2)) >> 1) by n_dirs which is why i_direction is now needed.
// summation series over l + m => (l*(l+1))/2 + m
return i_direction * (((NMAX + 1) * (NMAX + 2)) >> 1) + ((l * (l + 1)) >> 1) + m;
return ((l * (l + 1)) >> 1) + m;
}

inline __device__ int Pidx_device(const int i_direction, const int i)
inline __device__ int Pidx_device(const int i)
{
// all P arrays are now NMAX*(NMAX + 2) by i_direction, so that's why we need this.
// Used for P1sin_arr and P1_arr
return i_direction * NMAX * (NMAX + 2) + i;
return i;
}

inline __device__ void legendre_polynomials_device(FLOAT *legendre, const FLOAT *thetas, int i_direction)
inline __device__ void legendre_polynomials_device(FLOAT *legendre, const FLOAT u)
{
int l, m;
const FLOAT x = COS(thetas[i_direction]);
const FLOAT factor = -SQRT(1.0 - (x * x));
const FLOAT factor = -SQRT(1.0 - (u * u));

// Init legendre
legendre[lidx_device(i_direction, 0, 0)] = 1.0; // P_0,0(x) = 1
legendre[lidx_device(0, 0)] = 1.0; // P_0,0(u) = 1

// Easy values
legendre[lidx_device(i_direction, 1, 0)] = x; // P_1,0(x) = x
legendre[lidx_device(i_direction, 1, 1)] = factor; // P_1,1(x) = 342210222sqrt(1 342210222 x^2)
legendre[lidx_device(1, 0)] = u; // P_1,0(u) = u
legendre[lidx_device(1, 1)] = factor; // P_1,1(u) = -sqrt(1 - u^2)

for (l = 2; l <= NMAX; ++l)
{
for (m = 0; m < l - 1; ++m)
{
// P_l,m = (2l-1)*x*P_l-1,m - (l+m-1)*x*P_l-2,m / (l-k)
legendre[lidx_device(i_direction, l, m)] = ((FLOAT)(2 * l - 1) * x * legendre[lidx_device(i_direction, l - 1, m)] -
(FLOAT)(l + m - 1) * legendre[lidx_device(i_direction, l - 2, m)]) /
(FLOAT)(l - m);
// P_l,m = (2l-1)*u*P_l-1,m - (l+m-1)*u*P_l-2,m / (l-k)
legendre[lidx_device(l, m)] = ((FLOAT)(2 * l - 1) * u * legendre[lidx_device(l - 1, m)] -
(FLOAT)(l + m - 1) * legendre[lidx_device(l - 2, m)]) /
(FLOAT)(l - m);
}
// P_l,l-1 = (2l-1)*x*P_l-1,l-1
legendre[lidx_device(i_direction, l, l - 1)] = (FLOAT)(2 * l - 1) * x * legendre[lidx_device(i_direction, l - 1, l - 1)];
// P_l,l-1 = (2l-1)*u*P_l-1,l-1
legendre[lidx_device(l, l - 1)] = (FLOAT)(2 * l - 1) * u * legendre[lidx_device(l - 1, l - 1)];
// P_l,l = (2l-1)*factor*P_l-1,l-1
legendre[lidx_device(i_direction, l, l)] = (FLOAT)(2 * l - 1) * factor * legendre[lidx_device(i_direction, l - 1, l - 1)];
legendre[lidx_device(l, l)] = (FLOAT)(2 * l - 1) * factor * legendre[lidx_device(l - 1, l - 1)];
}
}

inline __device__ int jones_p1sin_device(const FLOAT *thetas, int i_direction, FLOAT *p1sin_out, FLOAT *p1_out, FLOAT *legendret)
inline __device__ int jones_p1sin_device(const FLOAT theta, FLOAT *p1sin_out, FLOAT *p1_out, FLOAT *legendret)
{
int n, m;
int ind_start, ind_stop;
Expand All @@ -268,15 +267,14 @@ extern "C"
FLOAT P[NMAX + 1], Pm1[NMAX + 1], Pm_sin[NMAX + 1], Pu_mdelu[NMAX + 1], Pm_sin_merged[NMAX * 2 + 1],
Pm1_merged[NMAX * 2 + 1];

const FLOAT theta = thetas[i_direction];
SINCOS(theta, &sin_th, &u);

for (n = 1; n <= NMAX; n++)
{
int i;
for (m = 0; m != n + 1; ++m)
{
P[m] = legendret[lidx_device(i_direction, n, m)];
P[m] = legendret[lidx_device(n, m)];
}
memcpy(Pm1, &(P[1]), n * sizeof(FLOAT));
Pm1[n] = 0;
Expand Down Expand Up @@ -313,7 +311,7 @@ extern "C"
modified = 0;
for (i = ind_start; i < ind_stop; i++)
{
p1sin_out[Pidx_device(i_direction, i)] = Pm_sin_merged[modified];
p1sin_out[Pidx_device(i)] = Pm_sin_merged[modified];
modified++;
}

Expand All @@ -324,21 +322,20 @@ extern "C"
modified = 0;
for (i = ind_start; i < ind_stop; i++)
{
p1_out[Pidx_device(i_direction, i)] = Pm1_merged[modified];
p1_out[Pidx_device(i)] = Pm1_merged[modified];
modified++;
}
}

return NMAX;
}

inline __device__ void jones_calc_sigmas_device(const FLOAT phi, const FLOAT *thetas, const int i_direction, const COMPLEX *q1_accum,
inline __device__ void jones_calc_sigmas_device(const FLOAT phi, const FLOAT u, const COMPLEX *q1_accum,
const COMPLEX *q2_accum, const int8_t *m_accum, const int8_t *n_accum,
const int8_t *m_signs, const int8_t *m_abs_m, const int coeff_length,
const FLOAT *P1sin_arr, const FLOAT *P1_arr, const char pol,
JONES *jm)
{
const FLOAT u = COS(thetas[i_direction]);
COMPLEX sigma_P = MAKE_COMPLEX(0.0, 0.0);
COMPLEX sigma_T = MAKE_COMPLEX(0.0, 0.0);
COMPLEX ejm_phi;
Expand All @@ -361,15 +358,15 @@ extern "C"
const COMPLEX j_power_n = J_POWERS[n % 4];
const COMPLEX q1 = q1_accum[i];
const COMPLEX q2 = q2_accum[i];
const COMPLEX s1 = q2 * (P1sin_arr[Pidx_device(i_direction, i)] * FABS(M) * u);
const COMPLEX s2 = q1 * (P1sin_arr[Pidx_device(i_direction, i)] * M);
const COMPLEX s3 = q2 * P1_arr[Pidx_device(i_direction, i)];
const COMPLEX s1 = q2 * (P1sin_arr[Pidx_device(i)] * FABS(M) * u);
const COMPLEX s2 = q1 * (P1sin_arr[Pidx_device(i)] * M);
const COMPLEX s3 = q2 * P1_arr[Pidx_device(i)];
const COMPLEX s4 = CSUB(s1, s2);
const COMPLEX E_theta_mn = CMUL(j_power_n, CADD(s4, s3));
const COMPLEX j_power_np1 = J_POWERS[(n + 1) % 4];
const COMPLEX o1 = q2 * (P1sin_arr[Pidx_device(i_direction, i)] * M);
const COMPLEX o2 = q1 * (P1sin_arr[Pidx_device(i_direction, i)] * FABS(M) * u);
const COMPLEX o3 = q1 * P1_arr[Pidx_device(i_direction, i)];
const COMPLEX o1 = q2 * (P1sin_arr[Pidx_device(i)] * M);
const COMPLEX o2 = q1 * (P1sin_arr[Pidx_device(i)] * FABS(M) * u);
const COMPLEX o3 = q1 * P1_arr[Pidx_device(i)];
const COMPLEX o4 = CSUB(o1, o2);
const COMPLEX E_phi_mn = CMUL(j_power_np1, CSUB(o4, o3));
sigma_P += CMUL(phi_comp, E_phi_mn);
Expand All @@ -394,34 +391,42 @@ extern "C"
* blockIdx.x * blockDim.x + threadIdx.x corresponds to direction.
*/
__global__ void fee_kernel(const FEECoeffs coeffs, const FLOAT *azs, const FLOAT *zas, const int num_directions,
const JONES *norm_jones, const FLOAT *latitude_rad, const int iau_order, JONES *fee_jones,
FLOAT *legendret, FLOAT *P1sin_arr, FLOAT *P1_arr)
const JONES *norm_jones, const FLOAT *latitude_rad, const int iau_order, JONES *fee_jones
// , FLOAT *legendret, FLOAT *P1sin_arr, FLOAT *P1_arr
)
{
int i_direction = blockIdx.x * blockDim.x + threadIdx.x;

if (i_direction >= num_directions)
return;

const FLOAT az = azs[i_direction];
const FLOAT za = zas[i_direction];
const FLOAT za = zas[i_direction]; // theta
const FLOAT phi = M_PI_2 - az;
const FLOAT u = COS(za);

const uint legendret_size = (((NMAX + 1) * (NMAX + 2)) >> 1);
const uint P_size = NMAX * (NMAX + 2);
FLOAT legendret[legendret_size];
FLOAT P1sin_arr[P_size];
FLOAT P1_arr[P_size];

// Create a look-up table for the legendre polynomials
// Such that legendre_table[ m * nmax + (n-1) ] = legendre(n, m, u)
legendre_polynomials_device(legendret, zas, i_direction);
legendre_polynomials_device(legendret, u);

// Set up our "P1sin" arrays. This is pretty expensive, but only depends
// on the zenith angle and "n_max".
jones_p1sin_device(zas, i_direction, P1sin_arr, P1_arr, legendret);
jones_p1sin_device(za, P1sin_arr, P1_arr, legendret);

const int x_offset = coeffs.x_offsets[blockIdx.y];
const int y_offset = coeffs.y_offsets[blockIdx.y];
JONES jm;
jones_calc_sigmas_device(phi, zas, i_direction, (const COMPLEX *)coeffs.x_q1_accum + x_offset,
jones_calc_sigmas_device(phi, u, (const COMPLEX *)coeffs.x_q1_accum + x_offset,
(const COMPLEX *)coeffs.x_q2_accum + x_offset, coeffs.x_m_accum + x_offset,
coeffs.x_n_accum + x_offset, coeffs.x_m_signs + x_offset, coeffs.x_m_abs_m + x_offset,
coeffs.x_lengths[blockIdx.y], P1sin_arr, P1_arr, 'x', &jm);
jones_calc_sigmas_device(phi, zas, i_direction, (const COMPLEX *)coeffs.y_q1_accum + y_offset,
jones_calc_sigmas_device(phi, u, (const COMPLEX *)coeffs.y_q1_accum + y_offset,
(const COMPLEX *)coeffs.y_q2_accum + y_offset, coeffs.y_m_accum + y_offset,
coeffs.y_n_accum + y_offset, coeffs.y_m_signs + y_offset, coeffs.y_m_abs_m + y_offset,
coeffs.y_lengths[blockIdx.y], P1sin_arr, P1_arr, 'y', &jm);
Expand Down Expand Up @@ -451,29 +456,30 @@ extern "C"
const FLOAT *d_latitude_rad, const int iau_order, void *d_results)
{
// Allocate device memory for legendre polynomials
FLOAT *d_legendret;
FLOAT *d_P1sin_arr, *d_P1_arr;
// TODO: replace NMAX with d_coeffs->n_max
GPUCHECK(gpuMalloc(&d_legendret, num_directions * (((NMAX + 1) * (NMAX + 2)) >> 1) * sizeof(FLOAT)));
GPUCHECK(gpuMalloc(&d_P1sin_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
GPUCHECK(gpuMalloc(&d_P1_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
// FLOAT *d_legendret, *d_P1sin_arr, *d_P1_arr;
// GPUCHECK(gpuMalloc(&d_legendret, num_directions * (((NMAX + 1) * (NMAX + 2)) >> 1) * sizeof(FLOAT)));
// GPUCHECK(gpuMalloc(&d_P1sin_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
// GPUCHECK(gpuMalloc(&d_P1_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));

dim3 gridDim, blockDim;
blockDim.x = num_directions < warpSize ? num_directions : warpSize;
gridDim.x = num_directions < warpSize ? 1 : (num_directions - 1) / blockDim.x + 1;
gridDim.y = num_coeffs;
fee_kernel<<<gridDim, blockDim>>>(*d_coeffs, d_azs, d_zas, num_directions, (JONES *)d_norm_jones, d_latitude_rad,
iau_order, (JONES *)d_results, d_legendret, d_P1sin_arr, d_P1_arr);
iau_order, (JONES *)d_results
// , d_legendret, d_P1sin_arr, d_P1_arr
);

#ifdef DEBUG
GPUCHECK(gpuDeviceSynchronize());
#endif
GPUCHECK(gpuGetLastError());

// Free device memory
GPUCHECK(gpuFree(d_legendret));
GPUCHECK(gpuFree(d_P1sin_arr));
GPUCHECK(gpuFree(d_P1_arr));
// GPUCHECK(gpuFree(d_legendret));
// GPUCHECK(gpuFree(d_P1sin_arr));
// GPUCHECK(gpuFree(d_P1_arr));

return NULL;
}
Expand Down

0 comments on commit f125ec4

Please sign in to comment.