From f125ec4769702b2e0e58db58ca6a62617721af3e Mon Sep 17 00:00:00 2001
From: d3v-null <dev.null@curtin.edu.au>
Date: Fri, 21 Jun 2024 06:13:11 +0000
Subject: [PATCH] 0.9.1 fixes #11 go back to stack allocation

---
 Cargo.lock        |  2 +-
 Cargo.toml        |  2 +-
 src/fee/gpu/fee.h | 98 +++++++++++++++++++++++++----------------------
 3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4580410..341b4c2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -869,7 +869,7 @@ dependencies = [
 
 [[package]]
 name = "mwa_hyperbeam"
-version = "0.9.0"
+version = "0.9.1"
 dependencies = [
  "approx",
  "cbindgen",
diff --git a/Cargo.toml b/Cargo.toml
index 4a94427..b956c00 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mwa_hyperbeam"
-version = "0.9.0"
+version = "0.9.1"
 authors = [
     "Christopher H. Jordan <christopherjordan87@gmail.com>",
     "Jack L. B. Line <jack.line@curtin.edu.au>",
diff --git a/src/fee/gpu/fee.h b/src/fee/gpu/fee.h
index 1878871..5718c2b 100644
--- a/src/fee/gpu/fee.h
+++ b/src/fee/gpu/fee.h
@@ -215,50 +215,49 @@ extern "C"
         }
     }
 
-    inline __device__ int lidx_device(const int i_direction, const int l, const int m)
+    inline __device__ int lidx_device(const int l, const int m)
     {
         // lengendre table is now ((NMAX + 1)*(NMAX + 2)) >> 1) by n_dirs which is why i_direction is now needed.
         // summation series over l + m => (l*(l+1))/2 + m
-        return i_direction * (((NMAX + 1) * (NMAX + 2)) >> 1) + ((l * (l + 1)) >> 1) + m;
+        return ((l * (l + 1)) >> 1) + m;
     }
 
-    inline __device__ int Pidx_device(const int i_direction, const int i)
+    inline __device__ int Pidx_device(const int i)
     {
         // all P arrays are now NMAX*(NMAX + 2) by i_direction, so that's why we need this.
         // Used for P1sin_arr and P1_arr
-        return i_direction * NMAX * (NMAX + 2) + i;
+        return i;
     }
 
-    inline __device__ void legendre_polynomials_device(FLOAT *legendre, const FLOAT *thetas, int i_direction)
+    inline __device__ void legendre_polynomials_device(FLOAT *legendre, const FLOAT u)
     {
         int l, m;
-        const FLOAT x = COS(thetas[i_direction]);
-        const FLOAT factor = -SQRT(1.0 - (x * x));
+        const FLOAT factor = -SQRT(1.0 - (u * u));
 
         // Init legendre
-        legendre[lidx_device(i_direction, 0, 0)] = 1.0; // P_0,0(x) = 1
+        legendre[lidx_device(0, 0)] = 1.0; // P_0,0(u) = 1
 
         // Easy values
-        legendre[lidx_device(i_direction, 1, 0)] = x;      // P_1,0(x) = x
-        legendre[lidx_device(i_direction, 1, 1)] = factor; // P_1,1(x) = 342210222sqrt(1 342210222 x^2)
+        legendre[lidx_device(1, 0)] = u;      // P_1,0(u) = u
+        legendre[lidx_device(1, 1)] = factor; // P_1,1(u) = -sqrt(1 - u^2)
 
         for (l = 2; l <= NMAX; ++l)
         {
             for (m = 0; m < l - 1; ++m)
             {
-                // P_l,m = (2l-1)*x*P_l-1,m - (l+m-1)*x*P_l-2,m / (l-k)
-                legendre[lidx_device(i_direction, l, m)] = ((FLOAT)(2 * l - 1) * x * legendre[lidx_device(i_direction, l - 1, m)] -
-                                                            (FLOAT)(l + m - 1) * legendre[lidx_device(i_direction, l - 2, m)]) /
-                                                           (FLOAT)(l - m);
+                // P_l,m = (2l-1)*u*P_l-1,m - (l+m-1)*u*P_l-2,m / (l-k)
+                legendre[lidx_device(l, m)] = ((FLOAT)(2 * l - 1) * u * legendre[lidx_device(l - 1, m)] -
+                                               (FLOAT)(l + m - 1) * legendre[lidx_device(l - 2, m)]) /
+                                              (FLOAT)(l - m);
             }
-            // P_l,l-1 = (2l-1)*x*P_l-1,l-1
-            legendre[lidx_device(i_direction, l, l - 1)] = (FLOAT)(2 * l - 1) * x * legendre[lidx_device(i_direction, l - 1, l - 1)];
+            // P_l,l-1 = (2l-1)*u*P_l-1,l-1
+            legendre[lidx_device(l, l - 1)] = (FLOAT)(2 * l - 1) * u * legendre[lidx_device(l - 1, l - 1)];
             // P_l,l = (2l-1)*factor*P_l-1,l-1
-            legendre[lidx_device(i_direction, l, l)] = (FLOAT)(2 * l - 1) * factor * legendre[lidx_device(i_direction, l - 1, l - 1)];
+            legendre[lidx_device(l, l)] = (FLOAT)(2 * l - 1) * factor * legendre[lidx_device(l - 1, l - 1)];
         }
     }
 
-    inline __device__ int jones_p1sin_device(const FLOAT *thetas, int i_direction, FLOAT *p1sin_out, FLOAT *p1_out, FLOAT *legendret)
+    inline __device__ int jones_p1sin_device(const FLOAT theta, FLOAT *p1sin_out, FLOAT *p1_out, FLOAT *legendret)
     {
         int n, m;
         int ind_start, ind_stop;
@@ -268,7 +267,6 @@ extern "C"
         FLOAT P[NMAX + 1], Pm1[NMAX + 1], Pm_sin[NMAX + 1], Pu_mdelu[NMAX + 1], Pm_sin_merged[NMAX * 2 + 1],
             Pm1_merged[NMAX * 2 + 1];
 
-        const FLOAT theta = thetas[i_direction];
         SINCOS(theta, &sin_th, &u);
 
         for (n = 1; n <= NMAX; n++)
@@ -276,7 +274,7 @@ extern "C"
             int i;
             for (m = 0; m != n + 1; ++m)
             {
-                P[m] = legendret[lidx_device(i_direction, n, m)];
+                P[m] = legendret[lidx_device(n, m)];
             }
             memcpy(Pm1, &(P[1]), n * sizeof(FLOAT));
             Pm1[n] = 0;
@@ -313,7 +311,7 @@ extern "C"
             modified = 0;
             for (i = ind_start; i < ind_stop; i++)
             {
-                p1sin_out[Pidx_device(i_direction, i)] = Pm_sin_merged[modified];
+                p1sin_out[Pidx_device(i)] = Pm_sin_merged[modified];
                 modified++;
             }
 
@@ -324,7 +322,7 @@ extern "C"
             modified = 0;
             for (i = ind_start; i < ind_stop; i++)
             {
-                p1_out[Pidx_device(i_direction, i)] = Pm1_merged[modified];
+                p1_out[Pidx_device(i)] = Pm1_merged[modified];
                 modified++;
             }
         }
@@ -332,13 +330,12 @@ extern "C"
         return NMAX;
     }
 
-    inline __device__ void jones_calc_sigmas_device(const FLOAT phi, const FLOAT *thetas, const int i_direction, const COMPLEX *q1_accum,
+    inline __device__ void jones_calc_sigmas_device(const FLOAT phi, const FLOAT u, const COMPLEX *q1_accum,
                                                     const COMPLEX *q2_accum, const int8_t *m_accum, const int8_t *n_accum,
                                                     const int8_t *m_signs, const int8_t *m_abs_m, const int coeff_length,
                                                     const FLOAT *P1sin_arr, const FLOAT *P1_arr, const char pol,
                                                     JONES *jm)
     {
-        const FLOAT u = COS(thetas[i_direction]);
         COMPLEX sigma_P = MAKE_COMPLEX(0.0, 0.0);
         COMPLEX sigma_T = MAKE_COMPLEX(0.0, 0.0);
         COMPLEX ejm_phi;
@@ -361,15 +358,15 @@ extern "C"
             const COMPLEX j_power_n = J_POWERS[n % 4];
             const COMPLEX q1 = q1_accum[i];
             const COMPLEX q2 = q2_accum[i];
-            const COMPLEX s1 = q2 * (P1sin_arr[Pidx_device(i_direction, i)] * FABS(M) * u);
-            const COMPLEX s2 = q1 * (P1sin_arr[Pidx_device(i_direction, i)] * M);
-            const COMPLEX s3 = q2 * P1_arr[Pidx_device(i_direction, i)];
+            const COMPLEX s1 = q2 * (P1sin_arr[Pidx_device(i)] * FABS(M) * u);
+            const COMPLEX s2 = q1 * (P1sin_arr[Pidx_device(i)] * M);
+            const COMPLEX s3 = q2 * P1_arr[Pidx_device(i)];
             const COMPLEX s4 = CSUB(s1, s2);
             const COMPLEX E_theta_mn = CMUL(j_power_n, CADD(s4, s3));
             const COMPLEX j_power_np1 = J_POWERS[(n + 1) % 4];
-            const COMPLEX o1 = q2 * (P1sin_arr[Pidx_device(i_direction, i)] * M);
-            const COMPLEX o2 = q1 * (P1sin_arr[Pidx_device(i_direction, i)] * FABS(M) * u);
-            const COMPLEX o3 = q1 * P1_arr[Pidx_device(i_direction, i)];
+            const COMPLEX o1 = q2 * (P1sin_arr[Pidx_device(i)] * M);
+            const COMPLEX o2 = q1 * (P1sin_arr[Pidx_device(i)] * FABS(M) * u);
+            const COMPLEX o3 = q1 * P1_arr[Pidx_device(i)];
             const COMPLEX o4 = CSUB(o1, o2);
             const COMPLEX E_phi_mn = CMUL(j_power_np1, CSUB(o4, o3));
             sigma_P += CMUL(phi_comp, E_phi_mn);
@@ -394,8 +391,9 @@ extern "C"
      * blockIdx.x * blockDim.x + threadIdx.x corresponds to direction.
      */
     __global__ void fee_kernel(const FEECoeffs coeffs, const FLOAT *azs, const FLOAT *zas, const int num_directions,
-                               const JONES *norm_jones, const FLOAT *latitude_rad, const int iau_order, JONES *fee_jones,
-                               FLOAT *legendret, FLOAT *P1sin_arr, FLOAT *P1_arr)
+                               const JONES *norm_jones, const FLOAT *latitude_rad, const int iau_order, JONES *fee_jones
+                               //    , FLOAT *legendret, FLOAT *P1sin_arr, FLOAT *P1_arr
+    )
     {
         int i_direction = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -403,25 +401,32 @@ extern "C"
             return;
 
         const FLOAT az = azs[i_direction];
-        const FLOAT za = zas[i_direction];
+        const FLOAT za = zas[i_direction]; // theta
         const FLOAT phi = M_PI_2 - az;
+        const FLOAT u = COS(za);
+
+        const uint legendret_size = (((NMAX + 1) * (NMAX + 2)) >> 1);
+        const uint P_size = NMAX * (NMAX + 2);
+        FLOAT legendret[legendret_size];
+        FLOAT P1sin_arr[P_size];
+        FLOAT P1_arr[P_size];
 
         // Create a look-up table for the legendre polynomials
         // Such that legendre_table[ m * nmax + (n-1) ] = legendre(n, m, u)
-        legendre_polynomials_device(legendret, zas, i_direction);
+        legendre_polynomials_device(legendret, u);
 
         // Set up our "P1sin" arrays. This is pretty expensive, but only depends
         // on the zenith angle and "n_max".
-        jones_p1sin_device(zas, i_direction, P1sin_arr, P1_arr, legendret);
+        jones_p1sin_device(za, P1sin_arr, P1_arr, legendret);
 
         const int x_offset = coeffs.x_offsets[blockIdx.y];
         const int y_offset = coeffs.y_offsets[blockIdx.y];
         JONES jm;
-        jones_calc_sigmas_device(phi, zas, i_direction, (const COMPLEX *)coeffs.x_q1_accum + x_offset,
+        jones_calc_sigmas_device(phi, u, (const COMPLEX *)coeffs.x_q1_accum + x_offset,
                                  (const COMPLEX *)coeffs.x_q2_accum + x_offset, coeffs.x_m_accum + x_offset,
                                  coeffs.x_n_accum + x_offset, coeffs.x_m_signs + x_offset, coeffs.x_m_abs_m + x_offset,
                                  coeffs.x_lengths[blockIdx.y], P1sin_arr, P1_arr, 'x', &jm);
-        jones_calc_sigmas_device(phi, zas, i_direction, (const COMPLEX *)coeffs.y_q1_accum + y_offset,
+        jones_calc_sigmas_device(phi, u, (const COMPLEX *)coeffs.y_q1_accum + y_offset,
                                  (const COMPLEX *)coeffs.y_q2_accum + y_offset, coeffs.y_m_accum + y_offset,
                                  coeffs.y_n_accum + y_offset, coeffs.y_m_signs + y_offset, coeffs.y_m_abs_m + y_offset,
                                  coeffs.y_lengths[blockIdx.y], P1sin_arr, P1_arr, 'y', &jm);
@@ -451,19 +456,20 @@ extern "C"
                                               const FLOAT *d_latitude_rad, const int iau_order, void *d_results)
     {
         // Allocate device memory for legendre polynomials
-        FLOAT *d_legendret;
-        FLOAT *d_P1sin_arr, *d_P1_arr;
         // TODO: replace NMAX with d_coeffs->n_max
-        GPUCHECK(gpuMalloc(&d_legendret, num_directions * (((NMAX + 1) * (NMAX + 2)) >> 1) * sizeof(FLOAT)));
-        GPUCHECK(gpuMalloc(&d_P1sin_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
-        GPUCHECK(gpuMalloc(&d_P1_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
+        // FLOAT *d_legendret, *d_P1sin_arr, *d_P1_arr;
+        // GPUCHECK(gpuMalloc(&d_legendret, num_directions * (((NMAX + 1) * (NMAX + 2)) >> 1) * sizeof(FLOAT)));
+        // GPUCHECK(gpuMalloc(&d_P1sin_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
+        // GPUCHECK(gpuMalloc(&d_P1_arr, num_directions * (NMAX * (NMAX + 2)) * sizeof(FLOAT)));
 
         dim3 gridDim, blockDim;
         blockDim.x = num_directions < warpSize ? num_directions : warpSize;
         gridDim.x = num_directions < warpSize ? 1 : (num_directions - 1) / blockDim.x + 1;
         gridDim.y = num_coeffs;
         fee_kernel<<<gridDim, blockDim>>>(*d_coeffs, d_azs, d_zas, num_directions, (JONES *)d_norm_jones, d_latitude_rad,
-                                          iau_order, (JONES *)d_results, d_legendret, d_P1sin_arr, d_P1_arr);
+                                          iau_order, (JONES *)d_results
+                                          //   , d_legendret, d_P1sin_arr, d_P1_arr
+        );
 
 #ifdef DEBUG
         GPUCHECK(gpuDeviceSynchronize());
@@ -471,9 +477,9 @@ extern "C"
         GPUCHECK(gpuGetLastError());
 
         // Free device memory
-        GPUCHECK(gpuFree(d_legendret));
-        GPUCHECK(gpuFree(d_P1sin_arr));
-        GPUCHECK(gpuFree(d_P1_arr));
+        // GPUCHECK(gpuFree(d_legendret));
+        // GPUCHECK(gpuFree(d_P1sin_arr));
+        // GPUCHECK(gpuFree(d_P1_arr));
 
         return NULL;
     }