/* state.h */

/*
 *  Copyright (c) 2011-2019, Triad National Security, LLC.
 *  All rights Reserved.
 *
 *  CLAMR -- LA-CC-11-094
 *
 *  Copyright 2011-2019. Triad National Security, LLC. This software was produced 
 *  under U.S. Government contract 89233218CNA000001 for Los Alamos National 
 *  Laboratory (LANL), which is operated by Triad National Security, LLC 
 *  for the U.S. Department of Energy. The U.S. Government has rights to use, 
 *  reproduce, and distribute this software.  NEITHER THE GOVERNMENT NOR
 *  TRIAD NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR 
 *  ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE.  If software is modified
 *  to produce derivative works, such modified software should be clearly marked,
 *  so as not to confuse it with the version available from LANL.
 *
 *  Additionally, redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the Triad National Security, LLC, Los Alamos 
 *       National Laboratory, LANL, the U.S. Government, nor the names of its 
 *       contributors may be used to endorse or promote products derived from 
 *       this software without specific prior written permission.
 *  
 *  THIS SOFTWARE IS PROVIDED BY THE TRIAD NATIONAL SECURITY, LLC AND 
 *  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT 
 *  NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL TRIAD NATIONAL
 *  SECURITY, LLC OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *  
 *  CLAMR -- LA-CC-11-094
 *  This research code is being developed as part of the 
 *  2011 X Division Summer Workshop for the express purpose
 *  of a collaborative code for development of ideas in
 *  the implementation of AMR codes for Exascale platforms
 *  
 *  AMR implementation of the Wave code previously developed
 *  as a demonstration code for regular grids on Exascale platforms
 *  as part of the Supercomputing Challenge and Los Alamos 
 *  National Laboratory
 *  
 *  Authors: Bob Robey       XCP-2   brobey@lanl.gov
 *           Neal Davis              davis68@lanl.gov, davis68@illinois.edu
 *           David Nicholaeff        dnic@lanl.gov, mtrxknight@aol.com
 *           Dennis Trujillo         dptrujillo@lanl.gov, dptru10@gmail.com
 * 
 */
#ifndef STATE_H_
#define STATE_H_

#include <list>
#include "MallocPlus/MallocPlus.h"
#include "mesh/mesh.h"
#include "crux/crux.h"
#ifdef HAVE_OPENCL
#include "ezcl/ezcl.h"
#endif
#include "l7/l7.h"
#include <atomic>

/*
 * Integer status codes.
 * NOTE(review): presumably these are the values carried by the error_status
 * argument of State::print_rollback_log -- confirm against the callers.
 */
/* presumably: no error detected */
#define STATUS_OK        0
/* presumably: a NaN was found in the state -- confirm */
#define STATUS_NAN       1
/* presumably: the mass conservation check failed -- confirm */
#define STATUS_MASS_LOSS 2

/*
 * Precision selection.
 * When the build defines none of FULL_PRECISION, MIXED_PRECISION,
 * MINIMUM_PRECISION or HALF_PRECISION, FULL_PRECISION becomes the default.
 */
#if !defined(FULL_PRECISION) && !defined(MIXED_PRECISION) && !defined(MINIMUM_PRECISION) && !defined(HALF_PRECISION)
#define FULL_PRECISION
#endif
/*
 * NO_CL_DOUBLE overrides whatever was chosen above: the other three precision
 * macros are undefined and MINIMUM_PRECISION is defined in their place.
 * NOTE(review): presumably set when the OpenCL device lacks double-precision
 * support -- confirm with the build system.
 */
#ifdef NO_CL_DOUBLE
#undef  FULL_PRECISION
#undef  MIXED_PRECISION
#define MINIMUM_PRECISION
#undef  HALF_PRECISION
#endif

/*
 * Precision-dependent type definitions.
 * Exactly one branch of the #if/#elif chain is compiled; the branches are
 * tested in the order HALF, MINIMUM, MIXED, FULL. Each branch provides
 *    state_t           -- element type of the physics state arrays
 *    real_t, real2_t   -- scalar and two-component types for intermediate values
 *    CONSERVATION_EPS  -- 15.0 in the half/minimum branches, .02 in the
 *                         mixed/full branches (presumably the tolerance of the
 *                         mass conservation check; units not visible here)
 *    cl_state_t, cl_state4_t, cl_real_t, cl_real2_t, cl_real4_t
 *                      -- matching OpenCL host types (only with HAVE_OPENCL)
 *    MPI_STATE_T, MPI_REAL_T, L7_STATE_T, L7_REAL_T
 *                      -- matching MPI / L7 datatype tags (only with HAVE_MPI)
 */
#if defined(HALF_PRECISION)
   /* half state storage, float intermediates */
    #include "half.hpp"
   using half_float::half;
   using namespace half_float::literal;
   typedef half state_t; // this is for physics state variables ncell in size
   typedef float real_t; // this is used for intermediate calculations
   typedef struct
   {
      float s0;
      float s1;
   }  real2_t;
#define CONSERVATION_EPS    15.0
#ifdef HAVE_OPENCL
   typedef cl_half  cl_state_t; // for gpu physics state variables
   typedef cl_half4 cl_state4_t; // for gpu physics state variables
   typedef cl_float  cl_real_t; // for intermediate gpu physics state variables
   typedef cl_float2 cl_real2_t; // for intermediate gpu physics state variables
   typedef cl_float4 cl_real4_t; // for intermediate gpu physics state variables
#endif
#ifdef HAVE_MPI
   // NOTE(review): state_t is half here but the tags are MPI_SHORT / L7_SHORT;
   // presumably chosen as a same-width stand-in, which would only be valid for
   // raw copies and not for arithmetic reductions -- confirm.
   #define MPI_STATE_T MPI_SHORT // for MPI communication for physics state variables
   #define MPI_REAL_T MPI_FLOAT // for MPI communication of intermediate (real_t) values
   #define L7_STATE_T L7_SHORT
   #define L7_REAL_T L7_FLOAT
#endif

#elif defined(MINIMUM_PRECISION)
   /* float state storage, float intermediates */
   typedef float state_t; // this is for physics state variables ncell in size
   typedef float real_t; // this is used for intermediate calculations
   typedef struct
   {
      float s0;
      float s1;
   }  real2_t;
#define CONSERVATION_EPS    15.0
#ifdef HAVE_OPENCL
   typedef cl_float  cl_state_t; // for gpu physics state variables
   typedef cl_float4 cl_state4_t; // for gpu physics state variables
   typedef cl_float  cl_real_t; // for intermediate gpu physics state variables
   typedef cl_float2 cl_real2_t; // for intermediate gpu physics state variables
   typedef cl_float4 cl_real4_t; // for intermediate gpu physics state variables
#endif
#ifdef HAVE_MPI
   #define MPI_STATE_T MPI_FLOAT // for MPI communication for physics state variables
   #define MPI_REAL_T MPI_FLOAT // for MPI communication of intermediate (real_t) values
   #define L7_STATE_T L7_FLOAT
   #define L7_REAL_T L7_FLOAT
#endif

#elif defined(MIXED_PRECISION) // intermediate values calculated high precision and stored as floats
   /* float state storage, double intermediates */
   typedef float state_t;
   typedef double real_t;
   typedef struct
   {
      double s0;
      double s1;
   }  real2_t;
#define CONSERVATION_EPS    .02
#ifdef HAVE_OPENCL
   typedef cl_float   cl_state_t;
   typedef cl_float4  cl_state4_t;
   typedef cl_double  cl_real_t; // for intermediate gpu physics state variables
   typedef cl_double2 cl_real2_t; // for intermediate gpu physics state variables
   typedef cl_double4 cl_real4_t; // for intermediate gpu physics state variables
#endif
#ifdef HAVE_MPI
   #define MPI_STATE_T MPI_FLOAT
   #define MPI_REAL_T MPI_DOUBLE
   #define L7_STATE_T L7_FLOAT
   #define L7_REAL_T L7_DOUBLE
#endif

#elif defined(FULL_PRECISION)
   /* double state storage, double intermediates */
   typedef double state_t;
   typedef double real_t;
   typedef struct
   {
      double s0;
      double s1;
   }  real2_t;
#define CONSERVATION_EPS    .02
#ifdef HAVE_OPENCL
   typedef cl_double  cl_state_t;
   typedef cl_double4 cl_state4_t;
   typedef cl_double  cl_real_t; // for intermediate gpu physics state variables
   typedef cl_double2 cl_real2_t; // for intermediate gpu physics state variables
   typedef cl_double4 cl_real4_t; // for intermediate gpu physics state variables
#endif
#ifdef HAVE_MPI
   #define MPI_STATE_T MPI_DOUBLE
   #define MPI_REAL_T MPI_DOUBLE
   #define L7_STATE_T L7_DOUBLE
   #define L7_REAL_T L7_DOUBLE
#endif
#endif

/*
 * do_calc is declared with C linkage, so its symbol name is not C++-mangled.
 * It takes no arguments, returns nothing, and is not defined in this header.
 * NOTE(review): presumably the per-cycle driver invoked through a C callback
 * -- confirm against the definition.
 */
extern "C" void do_calc(void);

/*
 * CUT_TYPE -- selector with five modes, numbered consecutively from 0.
 * NOTE(review): presumably picks the axis/diagonal along which a cut is
 * taken; the consumers are not visible in this header -- confirm.
 */
enum CUT_TYPE
{
   CUT_NONE  = 0,   /* no cut */
   CUT_XAXIS = 1,
   CUT_YAXIS = 2,
   CUT_45DEG = 3,
   CUT_ALL   = 4
};

/*
 * SUM_TYPE -- summation method selector.
 * NOTE(review): presumably the values accepted by the enhanced_precision_sum
 * argument of State::mass_sum, with SUM_KAHAN requesting compensated (Kahan)
 * summation -- confirm against the implementation.
 */
enum SUM_TYPE
{
   SUM_REGULAR = 0,
   SUM_KAHAN   = 1
};


/*
 * SIGN_RULE -- rule selector passed as the sign_rule argument of
 * State::symmetry_check.
 * NOTE(review): presumably states how signs relate across the diagonal,
 * x or y symmetry line -- confirm against the implementation.
 */
enum SIGN_RULE
{
   DIAG_RULE = 0,
   X_RULE    = 1,
   Y_RULE    = 2
};

/*
 * state_timers / state_timer_category -- timer categories.
 * The enumerators are used as indices into State::cpu_timers and
 * State::gpu_timers, both of which are sized by STATE_TIMER_SIZE.
 * STATE_TIMER_SIZE must stay last so that it equals the number of categories;
 * add new categories immediately before it.
 */
typedef enum state_timers {
   STATE_TIMER_APPLY_BCS,
   STATE_TIMER_SET_TIMESTEP,

   /* finite difference: overall category followed by its six parts */
   STATE_TIMER_FINITE_DIFFERENCE,
   STATE_TIMER_FINITE_DIFFERENCE_PART1,
   STATE_TIMER_FINITE_DIFFERENCE_PART2,
   STATE_TIMER_FINITE_DIFFERENCE_PART3,
   STATE_TIMER_FINITE_DIFFERENCE_PART4,
   STATE_TIMER_FINITE_DIFFERENCE_PART5,
   STATE_TIMER_FINITE_DIFFERENCE_PART6,

   STATE_TIMER_REFINE_POTENTIAL,
   STATE_TIMER_CALC_MPOT,
   STATE_TIMER_REZONE_ALL,
   STATE_TIMER_MASS_SUM,
   STATE_TIMER_READ,
   STATE_TIMER_WRITE,

   STATE_TIMER_SIZE   /* number of categories, not a category itself */
} state_timer_category;

// NOTE(review): a using-directive at header scope pulls all of std into every
// file that includes this header. The class declarations that follow spell
// vector<> without the std:: qualifier, so this line cannot be removed without
// qualifying those names first.
using namespace std;

/*
 * State -- the physics state arrays H, U and V that live on a Mesh, their
 * OpenCL device buffers (HAVE_OPENCL builds only), per-category timers, and
 * the routines that operate on them.
 *
 * Apart from the two inline timer accessors, every member function is only
 * declared here; the definitions live outside this header.
 *
 * Every pointer/handle member and both timer arrays carry a default member
 * initializer, so a freshly constructed object never holds indeterminate
 * values regardless of what the constructor body assigns.
 *
 * Objects of this class cannot be copy-constructed.
 */
class State {

public:
   /* NOTE(review): presumably the MallocPlus registries that own the host and
    * device state arrays -- confirm against the implementation. */
   MallocPlus state_memory;
   MallocPlus gpu_state_memory;
   Mesh *mesh = NULL;
   /* Physics state arrays (see state_t) */
   state_t *H = NULL;
   state_t *U = NULL;
   state_t *V = NULL;
#ifdef PRECISION_CHECK
   state_t *PCHECK = NULL;
#endif

#ifdef HAVE_OPENCL
   /* OpenCL device buffers; NULL until created */
   cl_mem dev_H = NULL;
   cl_mem dev_U = NULL;
   cl_mem dev_V = NULL;
   cl_mem dev_HxFlux = NULL;
   cl_mem dev_Hxfluxplus = NULL;
   cl_mem dev_Hxfluxminus = NULL;
   cl_mem dev_UxFlux = NULL;
   cl_mem dev_Uxfluxplus = NULL;
   cl_mem dev_Uxfluxminus = NULL;
   cl_mem dev_VxFlux = NULL;
   cl_mem dev_Vxfluxplus = NULL;
   cl_mem dev_Vxfluxminus = NULL;
   cl_mem dev_HyFlux = NULL;
   cl_mem dev_Hyfluxplus = NULL;
   cl_mem dev_Hyfluxminus = NULL;
   cl_mem dev_UyFlux = NULL;
   cl_mem dev_Uyfluxplus = NULL;
   cl_mem dev_Uyfluxminus = NULL;
   cl_mem dev_VyFlux = NULL;
   cl_mem dev_Vyfluxplus = NULL;
   cl_mem dev_Vyfluxminus = NULL;
   cl_mem dev_Wx_H = NULL;
   cl_mem dev_Wplusx_H = NULL;
   cl_mem dev_Wminusx_H = NULL;
   cl_mem dev_Wx_U = NULL;
   cl_mem dev_Wplusx_U = NULL;
   cl_mem dev_Wminusx_U = NULL;
   cl_mem dev_Wy_H = NULL;
   cl_mem dev_Wplusy_H = NULL;
   cl_mem dev_Wminusy_H = NULL;
   cl_mem dev_Wy_V = NULL;
   cl_mem dev_Wplusy_V = NULL;
   cl_mem dev_Wminusy_V = NULL;
   cl_mem dev_H_reg_lev = NULL;
   cl_mem dev_U_reg_lev = NULL;
   cl_mem dev_V_reg_lev = NULL;
   cl_mem dev_H_state_new = NULL;
   cl_mem dev_U_state_new = NULL;
   cl_mem dev_V_state_new = NULL;
   cl_mem dev_reg_start = NULL;
   cl_mem dev_lev_jregmin = NULL;
   cl_mem dev_lev_iregmin = NULL;
   cl_mem dev_lev_jregsize = NULL;
   cl_mem dev_lev_iregsize = NULL;

   cl_mem dev_mass_sum = NULL;
   cl_mem dev_deltaT = NULL;

   cl_event apply_BCs_event = NULL;

   cl_mem dev_mpot = NULL;
   //cl_mem dev_ioffset;
   cl_mem dev_result = NULL;
#endif


   /* Per-category timers, indexed by state_timer_category; start at zero */
   double    cpu_timers[STATE_TIMER_SIZE] = {0.0};
   long long gpu_timers[STATE_TIMER_SIZE] = {0};

   // constructor -- allocates state arrays to size ncells
   State(Mesh *mesh_in);

   void init(int do_gpu_calc);
   void terminate(void);

   /* Memory routines for linked list of state arrays */
   void allocate(size_t ncells);
   void allocate_from_backup_file(FILE *fp);
   void allocate_for_rollback(State *state_to_copy);
   void resize(size_t ncells);
   void memory_reset_ptrs(void);
#ifdef HAVE_OPENCL
   void gpu_memory_reset_ptrs(void);
   void allocate_device_memory(size_t ncells);
#endif
   /* NOTE(review): declared even without HAVE_OPENCL, unlike the other device
    * memory routines -- confirm whether that is intentional. */
   void resize_old_device_memory(size_t ncells);

   /* Accessor routines */
   /* Returns the stored CPU timer value for the category, unscaled */
   double get_cpu_timer(state_timer_category category) const { return cpu_timers[category]; }
   /* Convert nanoseconds to seconds (the scale factor is 1.0e-9) */
   double get_gpu_timer(state_timer_category category) const { return (double)gpu_timers[category] * 1.0e-9; }

   /* Boundary routines -- not currently used */
   void add_boundary_cells(void);
   void apply_boundary_conditions(void);
   void remove_boundary_cells(void);

   /*******************************************************************
   * set_timestep
   *  Input
   *    H, U, V -- from state object
   *    celltype, level, lev_delta
   *  Output
   *    mindeltaT returned
   *******************************************************************/
   double set_timestep(double g, double sigma);
#ifdef HAVE_OPENCL
   double gpu_set_timestep(double sigma);
#endif

   /*******************************************************************
   * calc finite difference
   *      will add ghost region to H, U, V and fill at start of routine
   *   Input
   *      H, U, V -- from state object
   *      nlft, nrht, nbot, ntop, level, celltype -- from mesh object
   *   Output
   *      H, U, V
   *******************************************************************/
   void calc_finite_difference(double deltaT);
   void calc_finite_difference_cell_in_place(double deltaT);
   void calc_finite_difference_via_faces(double deltaT);
   void calc_finite_difference_face_in_place(double deltaT);
   void calc_finite_difference_regular_cells(double deltaT);
   void calc_finite_difference_regular_cells_by_faces(double deltaT);
#ifdef HAVE_OPENCL
   //void gpu_faces_realloc(size_t mem_requestx, size_t mem_requesty);
   void gpu_faces_setup(size_t mem_requestx, size_t mem_requesty);
   void gpu_faces_setup_phantom(size_t mem_request);
   void gpu_faces_delete(void);
   void gpu_faces_delete_phantom(void);
   void gpu_calc_finite_difference(double deltaT);
   void gpu_calc_finite_difference_via_faces(double deltaT);
   void gpu_calc_finite_difference_in_place(double deltaT);
   void gpu_calc_finite_difference_via_face_in_place(double deltaT);
   void gpu_reggrid_setup(size_t mem_request);
   void gpu_reggrid_delete(void);
   void gpu_calc_finite_difference_regular_cells(double deltaT);
   void gpu_calc_finite_difference_regular_cells_by_faces(double deltaT);
#endif

   /*******************************************************************
   * calc refine potential -- state has responsibility to calc initial
   *      refinement potential array that is then passed to mesh for
   *      smoothing and enforcing refinement rules
   *  Input
   *    H, U, V -- from state object
   *  Output
   *    mpot
   *    ioffset
   *    count
   *******************************************************************/
   size_t calc_refine_potential(vector<char_t> &mpot, int &icount, int &jcount);
#ifdef HAVE_OPENCL
   size_t gpu_calc_refine_potential(int &icount, int &jcount);
#endif

   /*******************************************************************
   * rezone all -- most of call is done in mesh
   *  Input
   *    Mesh and state variables
   *  Output
   *    New mesh and state variables on refined mesh
   *******************************************************************/
   /* NOTE(review): mpot is taken by value, so the vector is copied on every
    * call; the signature is kept because the definition lives elsewhere. */
   void rezone_all(int icount, int jcount, vector<char_t> mpot);
#ifdef HAVE_OPENCL
   void gpu_rezone_all(int icount, int jcount, bool localStencil);
#endif

   /*******************************************************************
   * load balance -- most of call is done in mesh, but pointers are
   *    reset to newly allocated state arrays
   *  Input
   *    Mesh and state variables
   *  Output
   *    New mesh and state variables on refined mesh
   *******************************************************************/
#ifdef HAVE_MPI
   void do_load_balance_local(size_t &numcells);
#ifdef HAVE_OPENCL
   void gpu_do_load_balance_local(size_t &numcells);
#endif
#endif

   /*******************************************************************
   * mass sum -- Conservation of mass check
   *  Input
   *    H from state object
   *    Precision type for sum
   *  Output
   *    total mass is returned
   *******************************************************************/
   double mass_sum(int enhanced_precision_sum);
#ifdef HAVE_OPENCL
   double gpu_mass_sum(int enhanced_precision_sum);
#endif

   void fill_circle(double circ_radius, double fill_value, double background);
   void state_reorder(vector<int> iorder);

   void symmetry_check(const char *string, vector<int> sym_index, double eps,
                       SIGN_RULE sign_rule, int &flag);

   void output_timing_info(int do_cpu_calc, int do_gpu_calc, double total_elapsed_time);

   /* state comparison routines */
#ifdef HAVE_OPENCL
   void compare_state_gpu_global_to_cpu_global(const char* string, int cycle, uint ncells);
#endif
   void compare_state_cpu_local_to_cpu_global(State *state_global, const char* string, int cycle, uint ncells, uint ncells_global, int *nsizes, int *ndispl);
#ifdef HAVE_OPENCL
   void compare_state_all_to_gpu_local(State *state_global, uint ncells, uint ncells_global, int mype, int ncycle, int *nsizes, int *ndispl);
#endif

   void output_timer_block(mesh_device_types device_type, double elapsed_time,
      double mesh_time, double compute_time, double total_elapsed_time, double speedup_ratio);

   void timer_output(state_timer_category category, mesh_device_types device_type, int timer_level);

   void print(void);
   void print_data_dump(int ncycle);

   /* Checkpoint / restart support through Crux */
   size_t get_checkpoint_size(void);
   void store_checkpoint(Crux *crux);
   void restore_checkpoint(Crux *crux);
   // Added for a second print on every iteration: Brian Atkinson (5-29-14)
   void print(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage);
   void print_local(int ncycle);
   void print_failure_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, bool got_nan);
   void print_rollback_log(int iteration, double simTime, double initial_mass, double iteration_mass, double mass_diff_percentage, int backup_attempt, int num_of_attempts, int error_status);

private:
   // Copy construction is disabled so copies are not made inadvertently.
   // Deleting it turns any attempt (including from members) into a
   // compile-time error instead of a link-time one.
   State(const State&) = delete;

   void print_object_info(void);
};

/*
 * Mesh_CLAMR -- publicly derives from Mesh and declares a nine-argument
 * constructor plus five interpolation routines. Only declarations appear here;
 * the definitions live outside this header.
 *
 * The parameters are unnamed in these declarations, so their meaning cannot
 * be read from this header -- see the definitions.
 * NOTE(review): "course" in the last two names presumably means "coarse"; the
 * identifiers are part of the interface and are left unchanged.
 */
class Mesh_CLAMR : public Mesh{
   public:
   // Constructor: seven int and two double arguments, in the order shown
   Mesh_CLAMR(int, int, int, int, double, double, int, int, int);
   // Interpolation routines share one signature: four ints, a double and a
   // MallocPlus passed by non-const reference (so the callee may modify it)
   void interpolate(int, int, int, int, double, MallocPlus&);
   void interpolate_fine_x(int, int, int, int, double, MallocPlus&);
   void interpolate_fine_y(int, int, int, int, double, MallocPlus&);
   void interpolate_course_x(int, int, int, int, double, MallocPlus&);
   void interpolate_course_y(int, int, int, int, double, MallocPlus&);
};

#endif // ifndef STATE_H_