srivatsankrishnan · rushichavda · Aug 3, 2023 · Aug 3, 2023 · Aug 3, 2023 · Aug 3, 2023
diff --git a/aco/DeepSwarm/deepswarm/backends.py b/aco/DeepSwarm/deepswarm/backends.py
@@ -364,7 +364,7 @@ def __init__(self, path, exp_name, traject_dir, log_dir, reward_formulation, use
         # SET UP ACTION DICT
         self.action_dict = {"network": {}, "workload": {}}
         self.action_dict["network"]['path'] = "3d_fc_ring_switch.json"
-        self.action_dict["workload"]['path'] = "gnmt_fp16_fused.txt"
+        self.action_dict["workload"]['path'] = "all_reduce/allreduce_0.65.txt"
 
         # PARSE SYSTEM FILE
         self.parse_system(self.system_file, self.action_dict)

diff --git a/arch_gym/envs/AstraSimEnv.py b/arch_gym/envs/AstraSimEnv.py
@@ -18,7 +18,7 @@
 class AstraSimEnv(gym.Env):
     def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1):
         # action space = set of all possible actions. Space.sample() returns a random action
-        self.action_space = gym.spaces.Discrete(2)
+        self.action_space = gym.spaces.Discrete(16)
         # observation space =  set of all possible observations
         self.observation_space = gym.spaces.Discrete(1)
 
@@ -107,9 +107,9 @@ def close(self):
     def calculate_reward(self, observations):
         print("Calculating reward...")
         print(observations)
-        sum = 0
+        sum = 1.0
         for obs in observations:
-            sum += ((float(obs[0]) - 1) ** 2)
+            sum += ((float(obs) - 1) ** 2)
             print(sum)
         return 1 / (sum ** 0.5)
 
@@ -119,6 +119,7 @@ def step(self, action_dict):
         # write the three config files
         # with open(self.network_config, "w") as outfile:
         #     outfile.write(json.dumps(action_dict['network'], indent=4))
+        print(action_dict)
         if "path" in action_dict["network"]:
             self.network_config = action_dict["network"]["path"]
 
@@ -178,17 +179,17 @@ def step(self, action_dict):
              len(detailed) == 0 or len(end_to_end) == 0 or
              len(sample_all_reduce_dimension_utilization) == 0)):
             # set reward to be extremely negative
-            reward = -100000
+            reward = float("-inf")
             print("reward: ", reward)
             return [[], reward, self.done, {"useful_counter": self.useful_counter}, self.state]
         else:
             # only recording the first line because apparently they are all the same? TODO
             self.observations = [
                 backend_end_to_end["CommsTime"][0],
-                end_to_end["fwd compute"][0],
-                end_to_end["wg compute"][0],
-                end_to_end["ig compute"][0],
-                end_to_end["total exposed comm"][0]
+                # end_to_end["fwd compute"][0],
+                # end_to_end["wg compute"][0],
+                # end_to_end["ig compute"][0],
+                # end_to_end["total exposed comm"][0]
             ]
             reward = self.calculate_reward(self.observations)
             print("reward: ", reward)

diff --git a/bo/AstraSimEstimator.py b/bo/AstraSimEstimator.py
@@ -0,0 +1,225 @@
+from sklearn.base import BaseEstimator, ClassifierMixin
+import os
+os.sys.path.insert(0, os.path.abspath('/../arch_gym/envs/'))
+os.sys.path.insert(0, os.path.abspath('/../'))
+from configs import arch_gym_configs
+import json
+from arch_gym.envs.envHelpers import helpers
+from arch_gym.envs.AstraSimEnv import AstraSimEnv
+from arch_gym.envs.AstraSimWrapper import make_astraSim_env
+import configparser
+import envlogger
+import sys
+import numpy as np
+import pandas as pd
+import time
+
+from absl import logging
+from absl import flags
+
+
+class AstraSimEstimator(BaseEstimator):
+
+    def __init__(self, scheduling_policy="FIFO", collective_optimization="baseline",
+                 intra_dimension_scheduling="FIFO", inter_dimension_scheduling="baseline",
+                 exp_name="test", traject_dir="traj"):
+        '''Set up the AstraSim environment handle and the default knob values.
+
+        Constructing the estimator instantiates AstraSimEnv and prints the
+        experiment name and trajectory path.
+
+        Args:
+            scheduling_policy: value later written to the system config key
+                "scheduling-policy".
+            collective_optimization: value for "collective-optimization".
+            intra_dimension_scheduling: value for "intra-dimension-scheduling".
+            inter_dimension_scheduling: value for "inter-dimension-scheduling".
+            exp_name: experiment label, printed at construction time.
+            traject_dir: trajectory log path, printed at construction time.
+        '''
+        # To do: Implement some default parameters
+        self.env = AstraSimEnv()
+        self.helper = helpers()
+
+        # Resolve the simulator inputs relative to this file, not the CWD.
+        settings_dir_path = os.path.dirname(os.path.realpath(__file__))
+        proj_root_path = os.path.join(settings_dir_path, '..')
+        astrasim_archgym = os.path.join(proj_root_path, "sims/AstraSim/astrasim-archgym")
+        systems_folder = os.path.join(astrasim_archgym, "themis/inputs/system")
+
+        # Network and workload are passed on as relative names; only the system
+        # file is opened by this class, so it alone is stored as a full path.
+        self.network_file = "4d_ring_fc_ring_switch.json"
+        self.system_file = os.path.join(systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
+        self.workload_file = "all_reduce/allreduce_0.20.txt"
+
+        # The knobs are kept under underscored names here; fit() translates
+        # them to the hyphenated keys of the system config.
+        self.action_dict = {
+            "scheduling_policy": scheduling_policy,
+            "collective_optimization": collective_optimization,
+            "intra_dimension_scheduling": intra_dimension_scheduling,
+            "inter_dimension_scheduling": inter_dimension_scheduling,
+        }
+
+        self.exp_name = exp_name
+        self.traject_dir = traject_dir
+        self.fitness_hist = []
+        self.exp_log_dir = os.path.join(os.getcwd(), "bo_logs")
+        self.reward_formulation = 'power'
+        self.bo_steps = 0  # number of fit() calls made on this instance
+
+        print("[Experiment]: ", self.exp_name)
+        print("[Trajectory Log path]: ", self.traject_dir)
+
+
+    def parse_system(self, system_file, action_dict):
+        """Load "key: value" lines of system_file into action_dict['system'] (reset first)."""
+        system = action_dict['system'] = {}
+        with open(system_file, 'r') as file:
+            for line in file:
+                entry = line.strip()
+                if entry:  # blank lines (e.g. a trailing newline) used to raise ValueError
+                    key, value = entry.split(': ', 1)  # values may themselves contain ': '
+                    system[key] = value
+
+
+    def wrap_in_envlogger(self, env, envlogger_dir, use_envlogger):
+        """Return env wrapped in an EnvLogger when use_envlogger is the string 'True'."""
+        # the flag is compared as text; any other value leaves env untouched
+        if use_envlogger != 'True':
+            print("Not using envlogger")
+            return env
+
+        logging.info('Wrapping environment with EnvironmentLogger...')
+        wrapped = envlogger.EnvLogger(
+            env,
+            data_directory=envlogger_dir,
+            max_episodes_per_file=1000,
+            metadata={'agent_type': 'Bayesian Optimization',
+                      'env_type': type(env).__name__})
+        logging.info('Done wrapping environment with EnvironmentLogger.')
+        return wrapped
+
+
+    def fit(self, X, y=None):
+        '''Run one AstraSim step with the current knobs and return its reward.
+
+        The experiment settings are read from "exp_config.ini" in the working
+        directory; the system file is re-parsed on every call and the four
+        tunable knobs are overlaid on it before the environment is stepped.
+
+        The log directory is created if missing, and on the first call made
+        on this instance the result is appended to CSV files inside it.
+
+        Args:
+            X: the trace files (i.e. workload); not read by this method.
+            y: unused.
+
+        Returns:
+            The reward reported by the environment step.
+        '''
+        self.bo_steps += 1
+        self.fitness_hist = {}
+
+        # read all the parameters from exp_config.ini
+        config = configparser.ConfigParser()
+        config.read("exp_config.ini")
+        section = "experiment_configuration"
+        traj_dir = config.get(section, "trajectory_dir")
+        config.get(section, "exp_name")  # value unused; lookup kept so a missing option still fails
+        log_dir = config.get(section, "log_dir")
+        reward_formulation = config.get(section, "reward_formulation")
+        use_envlogger = config.get(section, "use_envlogger")
+
+        env_wrapper = make_astraSim_env(reward_formulation=reward_formulation,
+                                        rl_form='bo')
+
+        # the trajectory directory is only created when the envlogger is on
+        if use_envlogger == 'True':
+            os.makedirs(traj_dir, exist_ok=True)
+        os.makedirs(log_dir, exist_ok=True)
+
+        env = self.wrap_in_envlogger(env_wrapper, self.exp_log_dir, use_envlogger)
+        env.reset()
+        print("Action dict: ", self.action_dict)
+
+        # build the full action: fixed network/workload, system file contents,
+        # then the tunable knobs under their hyphenated config names
+        actual_action = {
+            'network': {"path": self.network_file},
+            'workload': {"path": self.workload_file},
+        }
+        self.parse_system(self.system_file, actual_action)
+        system = actual_action["system"]
+        system["scheduling-policy"] = self.action_dict["scheduling_policy"]
+        system["collective-optimization"] = self.action_dict["collective_optimization"]
+        system["intra-dimension-scheduling"] = self.action_dict["intra_dimension_scheduling"]
+        system["inter-dimension-scheduling"] = self.action_dict["inter_dimension_scheduling"]
+
+        _, reward, _, info = env.step(actual_action)
+
+        self.fitness_hist['reward'] = reward
+        self.fitness_hist['action'] = self.action_dict
+        self.fitness_hist['obs'] = info
+
+        # logging twice due to the cv. So we will track the bo_steps and log only once
+        if self.bo_steps == 1:
+            self.log_fitness_to_csv(log_dir)
+
+        # clear the self.fitness_hist
+        self.fitness_hist = []
+
+        return reward
+
+    def predict(self, X, y):
+        raise NotImplementedError  # prediction is unsupported; raise instead of returning the exception class
+
+    def score(self, X, y=None):
+        raise NotImplementedError  # scoring is unsupported; raise instead of returning the exception class
+
+    def get_params(self, deep=False):
+        """Return the four tunable system knobs currently held in action_dict."""
+        knobs = ("scheduling_policy", "collective_optimization",
+                 "intra_dimension_scheduling", "inter_dimension_scheduling")
+        # `deep` is accepted but never consulted by this implementation;
+        # a fresh dict is built on every call, in the fixed knob order above
+        return {name: self.action_dict[name] for name in knobs}
+
+    def set_params(self, **params):
+        """Update whichever of the four tunable knobs appear in params; return self.
+        Sample system file (hyphenated keys there, underscored keywords here):
+        scheduling-policy: LIFO
+        endpoint-delay: 1
+        active-chunks-per-dimension: 1
+        preferred-dataset-splits: 64
+        boost-mode: 1
+        all-reduce-implementation: direct_ring_halvingDoubling
+        all-gather-implementation: direct_ring_halvingDoubling
+        reduce-scatter-implementation: direct_ring_halvingDoubling
+        all-to-all-implementation: direct_direct_direct
+        collective-optimization: localBWAware
+        intra-dimension-scheduling: FIFO
+        inter-dimension-scheduling: baseline
+        """
+        knobs = ("scheduling_policy", "collective_optimization",
+                 "intra_dimension_scheduling", "inter_dimension_scheduling")
+        for name in knobs:
+            if name in params:  # omitted knobs keep their current value (was a KeyError)
+                self.action_dict[name] = params[name]
+        return self
+
+
+    def calculate_reward(self, energy, latency):
+        distance = ((float(latency) - 1) ** 2) ** 0.5  # |latency - 1|; `energy` is not used
+        return 1 / distance if distance else float("inf")  # latency == 1 used to raise ZeroDivisionError
+
+
+    def log_fitness_to_csv(self, filename):
+        # `filename` is used as a directory: both CSVs are appended inside it
+        outputs = (("fitness.csv", self.fitness_hist['reward']),
+                   ("actions.csv", self.fitness_hist))
+        for csv_name, record in outputs:
+            # one header-less row per call, appended to any existing file
+            frame = pd.DataFrame([record])
+            frame.to_csv(os.path.join(filename, csv_name), index=False, header=False, mode='a')
diff --git a/docs/installation_images/file_preview.png b/docs/installation_images/file_preview.png
diff --git a/installation.md b/installation.md
@@ -0,0 +1,35 @@
+# Installation for Ubuntu VM
+## Step-by-Step Guide
+
+This guide assumes you already have Ubuntu running in a virtual machine (VM) on your system. If not, follow a recent tutorial (for example, on YouTube) and make sure Ubuntu runs successfully in the VM before continuing.
+
+1. Open Your Virtual Machine
+2. Open Terminal (ctrl + T)
+3. Clone this repo https://github.com/google/CFU-Playground by using ```git clone https://github.com/google/CFU-Playground```
+4. Go to this directory "CFU-Playground/third_party/python/vizier/" using ```cd CFU-Playground/third_party/python/vizier/```
+5. Now you will see that there is a CFU-Playground folder. Now go to "CFU-Playground/third_party/python/vizier/" and see if vizier folder is empty or not! ![Alt text](./docs/installation_images/file_preview.png?raw=true "Title")
+10. Go to python folder using terminal (location: CFU-Playground/third_party/python/) using command like ```cd CFU-Playground/third_party/python/```
+11. run ```rm -rf vizier```
+13. Clone vizier repo, run  ```git clone https://github.com/ShvetankPrakash/vizier```
+15. ```cd CFU-Playground/``` -> ```cd scripts/``` ->  Run setup_vizier.sh file using command ```./setup_vizier.sh```
+19. This might give some errors; if so, activate a conda environment or create one. If you have existing environments, you can list them with ```conda env list```. If you don't find one, create it as follows: open an Anaconda terminal and run ```conda create -n myenv```, replacing myenv with the environment name. Now activate that environment using ```conda activate myenv```
+      1. Install anaconda
+      2. 
+22. Now run ```./setup_vizier.sh```again
+23. Some Errors might occur due to version of python. 
+24. run ```sudo apt install build-essential```
+25. run ```pip install cvxopt```
+26. run ```export CVXOPT_BUILD_FFTW=1```
+27. ```pip install cvxopt --no-binary cvxopt```
+28. ```conda install -c conda-forge cvxopt```
+29. ```./setup_vizier.sh```
+30. Might give errors related to ale-py
+31. Go to file requirements-benchmarks.txt:
+32. third party -> python -> vizier -> vizier -> requirements-benchmarks.txt
+33. Comment out all the lines
+34. Now running setup_vizier.sh file would not give ale-py error
+35. Now, try some other example - go to CFU-Playground -> proj -> dse_template -> vizier_dse.py
+36. Go to line 40, comment it out and add this line
+37. cycles, cells = 1, 1
+38. Run this file. It should run successfully without any errors.
+
diff --git a/new_VM_setup_archgym.md b/new_VM_setup_archgym.md
@@ -0,0 +1,66 @@
+# Setting Up a New Virtual Machine and Arch Gym Env Installation !
+
+Follow the instructions below to set up remote access to a virtual machine and perform the necessary installation for creating the arch-gym environment.
+
+
+## Initializing and Starting VM
+1. Generate an SSH public/private key pair using: `ssh-keygen -t rsa -b 2048 -C [USERNAME]`
+2. Get SSH access from the administrator
+3. Open a terminal and run: `ssh -i <PATH_TO_PRIVATE_KEY> <USERNAME@IP_ADDRESS>`. Example: `ssh -i C:\Users\yashc\.ssh/id_rsa yashc@<IP_ADDRESS>`
+4.  Open VS Code and install the Remote-SSH extension by Microsoft
+5.  Press F1, select "Remote-SSH: Connect to Host..." and use the same `USERNAME@IP_ADDRESS` as in step 3
+6. A new VS Code window will open. If VS Code cannot automatically detect the type of server you are connecting to, you will be asked to select the type manually.
+
+## Installing Conda
+In terminal run the following commands to install conda for your remote virtual machine
+1. `curl -O https://repo.anaconda.com/archive/Anaconda3-2023.07-1-Linux-x86_64.sh`
+2. `sha256sum Anaconda3-2023.07-1-Linux-x86_64.sh`
+3. `bash Anaconda3-2023.07-1-Linux-x86_64.sh` (answer yes and press Enter at every prompt)
+4. `source ~/.bashrc`
+
+
+## Creating Arch-Gym Environment
+Follow the below steps for setting up vizier and arch-gym. In terminal 
+1. `git clone https://github.com/srivatsankrishnan/oss-arch-gym.git`
+2. `cd oss-arch-gym/`
+3. `conda env create -f environment.yml`
+4. `conda activate arch-gym`
+5. `cd ..`
+6. `git clone https://github.com/ShvetankPrakash/vizier.git`
+7. cd into vizier directory
+8. `sudo apt-get install -y libprotobuf-dev`
+9. `pip install -r requirements.txt --use-deprecated=legacy-resolver` ( you may see some package compatibility issues, ignore them )
+10. `pip install -e .` ( you may see some package compatibility issues, ignore them )
+11. `./build_protos.sh`
+12. `pip install -r requirements-algorithms.txt` (you may probably end up with gcc compiler issue, ignore as of now)
+13. `pip install -r requirements-benchmarks.txt` ( you may see some package compatibility issues, ignore them )
+14. Open VS code and make a copy of this script:  [https://github.com/google/CFU-Playground/blob/main/proj/dse_template/vizier_dse.py](https://github.com/google/CFU-Playground/blob/main/proj/dse_template/vizier_dse.py)
+15. Remove line 10 of your local copy
+16. Go to line 40, comment it out and add this line `cycles, cells = 1, 1`
+17.  In the terminal, run `python vizier_dse.py` to test that it works. Note: all of this must be done with the arch-gym environment activated
+18. If you get ModuleNotFoundError: No module named 'emukit'. Run `pip install emukit`
+19. If `pip install emukit` throws an error related to the gcc compiler, install the build tools using `sudo apt update && sudo apt install -y build-essential`
+20. Run `pip install emukit` again 
+21.  Run `python vizier_dse.py` to test its working
+The output should look like 
+`Suggested Parameters (bypass, cfu, dCacheSize, hardwareDiv, iCacheSize, mulDiv, prediction, safe, singleCycleShift, singleCycleMulDiv): True False 8192.0 True 4096.0 True static False True False.............`
+
+
+## Testing Overall Installation
+
+Come out of vizier directory in terminal using `cd ..`
+
+1. `cd oss-arch-gym/acme`
+2. `pip install .[jax,tf,testing,envs]`
+3. `which python`
+	Output eg : `/home/yashc/anaconda3/envs/arch-gym/bin/python`
+	Replace `bin/python` with `lib` and copy it : `/home/yashc/anaconda3/envs/arch-gym/lib`
+####  In VS Code
+1. Go to the .bashrc file inside your user's home folder (for me it's yashc)
+2. Paste this at the end: 	`export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/<USERNAME>/anaconda3/envs/arch-gym/lib/"`
+
+#### In terminal
+Remember that your arch-gym env should be activated at all times
+1. Run  `sudo apt-get install libgmp-dev`
+2. `cd oss-arch-gym/sims/customenv`
+3. Run all the Python files (all should run without error)
diff --git a/sims/AstraSim/run_general.sh b/sims/AstraSim/run_general.sh
@@ -8,7 +8,7 @@ SCRIPT_DIR=$(dirname "$(realpath $0)")
 BINARY="${SCRIPT_DIR:?}"/astrasim-archgym/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra
 SYSTEM="${SCRIPT_DIR:?}"/general_system.txt
 NETWORK="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/network/analytical/$1
-WORKLOAD="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/workload/realworld_workloads/$3
+WORKLOAD="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/workload/$3
 
 echo "SH NETWORK: ${NETWORK}"
 echo "SH SYSTEM: ${SYSTEM}"