From c1b985a712c2166d5da86e2940f467cebfa21c39 Mon Sep 17 00:00:00 2001
From: Feiyi Wang
Date: Sun, 27 Aug 2023 08:37:19 -0400
Subject: [PATCH] re-org

---
 Andes.md => andes/Andes.md    |   0
 dl-notebooks/Llama.ipynb      | 261 ++++++++++++++++++
 .../Frontier-cmake.md         |   0
 .../crusher-deepspeed.md      |   0
 .../crusher-pytorch.md        |   0
 frontier/frontier_pytorch.md  | 115 ++++++++
 pytorch-sbs/README.md         |  11 +
 .../JupyterOnSummit.md        |   0
 .../Summit-deepspeed.md       |   0
 Summit.md => summit/Summit.md |   0
 .../olcf-jupyterhub.md        |   0
 11 files changed, 387 insertions(+)
 rename Andes.md => andes/Andes.md (100%)
 create mode 100644 dl-notebooks/Llama.ipynb
 rename Frontier-cmake.md => frontier/Frontier-cmake.md (100%)
 rename Crusher-deepspeed.md => frontier/crusher-deepspeed.md (100%)
 rename Crusher-pytorch.md => frontier/crusher-pytorch.md (100%)
 create mode 100644 frontier/frontier_pytorch.md
 create mode 100644 pytorch-sbs/README.md
 rename JupyterOnSummit.md => summit/JupyterOnSummit.md (100%)
 rename Summit-deepspeed.md => summit/Summit-deepspeed.md (100%)
 rename Summit.md => summit/Summit.md (100%)
 rename olcf-jupyterhub.md => summit/olcf-jupyterhub.md (100%)

diff --git a/Andes.md b/andes/Andes.md
similarity index 100%
rename from Andes.md
rename to andes/Andes.md
diff --git a/dl-notebooks/Llama.ipynb b/dl-notebooks/Llama.ipynb
new file mode 100644
index 0000000..0d9d1d5
--- /dev/null
+++ b/dl-notebooks/Llama.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bc091be0-2260-4eea-98ba-46daf4b10899",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##\n",
+    "## conda update --all\n",
+    "## conda install -c conda-forge ipywidgets\n",
+    "##\n",
+    "## source: https://github.com/bkitano/llama-from-scratch\n",
+    "##\n",
+    "\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "from torch.nn import functional as F\n",
+    "import numpy as np\n",
+    "from matplotlib import pyplot as plt\n",
+    "import time\n",
+    "import pandas as pd\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1f51d6f7-17ff-4765-b30e-cbedb3dc24fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read in text\n",
+    "lines = open('./input.txt', 'r').read()\n",
+    "vocab = sorted(list(set(lines)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "87acf3dc-7610-42f5-9e6a-f262ba058e7a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['\\n', ' ', '!', '$', '&', \"'\", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(vocab)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "3ea85c0a-6732-4a6e-899b-4aedf93bd714",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "itos = {i:ch for i, ch in enumerate(vocab)}\n",
+    "stoi = {ch:i for i, ch in enumerate(vocab)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f559fdcd-3049-4406-bfc3-0ebc23a1d204",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{0: '\\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: \"'\", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(itos)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "fefead73-d6d4-4371-b6f1-921308c06391",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'\\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, \"'\": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(stoi)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b6d823b5-7b01-43af-8a16-703ecc73bdb1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[46, 43, 50, 50, 53]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# simple tokenization by characters\n",
+    "def encode(s):\n",
+    "    return [stoi[ch] for ch in s]\n",
+    "\n",
+    "def decode(l):\n",
+    "    return ''.join([itos[i] for i in l])\n",
+    "\n",
+    "print(encode(\"hello\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "083fb2e5-fe3c-4149-a541-f0bfb53eb4dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hello\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(decode(encode(\"hello\")))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "55e5d77f-aed6-4db0-b99b-2b2eb8e56c91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MASTER_CONFIG = {\n",
+    "    \"vocab_size\": len(vocab),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "fad0ede8-0152-4939-83f8-9f7807c6826a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1115394])"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = torch.tensor(encode(lines), dtype=torch.int8)\n",
+    "dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "fb3e6512-78cc-4601-b40d-213f0d24138f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):\n",
+    "    train = data[:int(.8 * len(data))]\n",
+    "    val = data[int(.8 * len(data)): int(.9 * len(data))]\n",
+    "    test = data[int(.9 * len(data)):]\n",
+    "\n",
+    "    batch_data = train\n",
+    "    if split == 'val':\n",
+    "        batch_data = val\n",
+    "\n",
+    "    if split == 'test':\n",
+    "        batch_data = test\n",
+    "\n",
+    "    # pick random starting points\n",
+    "    ix = torch.randint(0, batch_data.size(0) - context_window - 1, (batch_size,))\n",
+    "    x = torch.stack([batch_data[i:i+context_window] for i in ix]).long()\n",
+    "    y = torch.stack([batch_data[i+1:i+context_window+1] for i in ix]).long()\n",
+    "    return x, y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "2b5a6586-2fa0-4019-a255-3fd0fd179d47",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'vocab_size': 65, 'batch_size': 32, 'context_window': 16}"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "MASTER_CONFIG.update({\n",
+    "    'batch_size': 32,\n",
+    "    'context_window': 16\n",
+    "})\n",
+    "MASTER_CONFIG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a419246-6cc7-4cec-ab41-f42ab021f3cc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Frontier-cmake.md b/frontier/Frontier-cmake.md
similarity index 100%
rename from Frontier-cmake.md
rename to frontier/Frontier-cmake.md
diff --git a/Crusher-deepspeed.md b/frontier/crusher-deepspeed.md
similarity index 100%
rename from Crusher-deepspeed.md
rename to frontier/crusher-deepspeed.md
diff --git a/Crusher-pytorch.md b/frontier/crusher-pytorch.md
similarity index 100%
rename from Crusher-pytorch.md
rename to frontier/crusher-pytorch.md
diff --git a/frontier/frontier_pytorch.md b/frontier/frontier_pytorch.md
new file mode 100644
index 0000000..7592967
--- /dev/null
+++ b/frontier/frontier_pytorch.md
@@ -0,0 +1,115 @@
+
+# pytorch on Frontier
+
+- [pytorch on Frontier](#pytorch-on-frontier)
+  - [prep Frontier modules](#prep-frontier-modules)
+  - [module output on Frontier](#module-output-on-frontier)
+  - [setup miniconda3](#setup-miniconda3)
+  - [build pytorch](#build-pytorch)
+    - [Build options: see `setup.py`](#build-options-see-setuppy)
+    - [regenerate CMAKE build files](#regenerate-cmake-build-files)
+    - [Kineto and roctracer.h problem](#kineto-and-roctracerh-problem)
+  - [DeepSpeed](#deepspeed)
+  - [GPTNeoX](#gptneox)
+  - [Verification](#verification)
+
+
+## prep Frontier modules
+
+```
+module load PrgEnv-gnu
+module load gcc/10.3.0
+module load rocm/5.1.0
+module load craype-x86-trento
+export HCC_AMDGPU_TARGET=gfx90a
+export PYTORCH_ROCM_ARCH=gfx90a
+export ROCM_SOURCE_DIR=/opt/rocm-5.1.0
+export CRAY_CPU_TARGET=x86_64 # just to remove warning noise
+```
+## module output on Frontier
+
+```
+Currently Loaded Modules:
+  1) libfabric/1.15.2.0    4) cray-dsmml/0.2.2       7) gcc/10.3.0             10) DefApps/default     13) craype-accel-amd-gfx90a
+  2) craype-network-ofi    5) cray-libsci/22.12.1.1  8) darshan-runtime/3.4.0  11) cray-mpich/8.1.23   14) craype-x86-trento
+  3) craype/2.7.19         6) PrgEnv-gnu/8.3.3       9) hsi/default            12) rocm/5.1.0
+```
+
+Note: One of the two modules, `craype-x86-trento` or `craype-accel-amd-gfx90a`, fixed a linking problem. My guess is the former.
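+
+Before building, a quick sanity check that the toolchain and the ROCm stack resolve as expected can save a failed compile later. This is a sketch; the expected outputs in the comments are assumptions based on the module list above, not captured output:
+
+```
+cc --version              # the Cray compiler wrapper should report gcc 10.3.0
+which hipcc               # should resolve under /opt/rocm-5.1.0
+echo $PYTORCH_ROCM_ARCH   # expect gfx90a
+```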
+
+## setup miniconda3
+
+```
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash ./Miniconda3-latest-Linux-x86_64.sh -b -p miniconda
+source ./miniconda/bin/activate   # put conda on PATH; the -b install does not touch shell startup files
+conda create -n pytorch python=3.8
+conda activate pytorch
+pip install -r requirements.txt   # requirements.txt from the pytorch checkout (next section)
+```
+
+## build pytorch
+
+```
+git clone --recursive -b IFU-master-2022-11-22 https://github.com/ROCmSoftwarePlatform/pytorch
+cd pytorch
+python tools/amd_build/build_amd.py
+USE_KINETO=0 USE_ROCM=1 MAX_JOBS=4 python setup.py bdist_wheel 2>&1 | tee output
+```
+
+### Build options: see `setup.py`
+
+```
+USE_KINETO=0 # disable the profiler (Kineto), which otherwise asks for roctracer.h
+```
+### regenerate CMAKE build files
+
+Removing the cache will trigger a rebuild with the changed configuration:
+
+```
+cd pytorch/build
+rm CMakeCache.txt
+```
+To remove the previous build as well:
+
+```
+python setup.py clean
+```
+### Kineto and roctracer.h problem
+
+Kineto requires roctracer, and locating it fails with rocm 5.1.0. The relevant CMake logic is:
+
+```
+if (NOT ROCM_SOURCE_DIR)
+  set(ROCM_SOURCE_DIR "$ENV{ROCM_SOURCE_DIR}")
+  message(INFO " ROCM_SOURCE_DIR = ${ROCM_SOURCE_DIR}")
+endif()
+```
+
+For reasons unknown at this point, `ROCM_SOURCE_DIR` still ends up as `/opt/rocm` instead of `/opt/rocm-5.1.0`, even though the environment variable is set.
+
+So the easy workaround is to hard-code it:
+
+```
+set(ROCM_SOURCE_DIR /opt/rocm-5.1.0)
+```
+
+## DeepSpeed
+
+```
+git clone https://github.com/microsoft/DeepSpeed
+cd DeepSpeed
+DS_BUILD_FUSED_LAMB=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 python setup.py bdist_wheel
+python setup.py install
+```
+
+## GPTNeoX
+
+```
+pip install shortuuid # missing from the requirements files below
+git clone https://github.com/EleutherAI/gpt-neox.git
+cd gpt-neox
+pip install -r requirements/requirements.txt
+pip install -r requirements/requirements-wandb.txt
+pip install -r requirements/requirements-tensorboard.txt
+```
+## Verification
+
+A minimal smoke test is sketched in the note at the end of this patch.
+
diff --git a/pytorch-sbs/README.md b/pytorch-sbs/README.md
new file mode 100644
index 0000000..f64dbd0
--- /dev/null
+++ b/pytorch-sbs/README.md
@@ -0,0 +1,11 @@
+# Study Notes
+
+
+## Standardization
+
+* Based on the loss function, each feature has its own loss curve
+* Each feature, based on its loss curve, has a **very different** optimal learning rate
+* However, the learning rate is global (it applies to all features)
+* Therefore, it makes sense to _try_ to make all features' loss curves similar - but this is not always possible.
+* Standardization (`StandardScaler` in sklearn) is one such technique that could make the loss curves more uniform, and thus help training converge faster and closer to the optimum.
+
diff --git a/JupyterOnSummit.md b/summit/JupyterOnSummit.md
similarity index 100%
rename from JupyterOnSummit.md
rename to summit/JupyterOnSummit.md
diff --git a/Summit-deepspeed.md b/summit/Summit-deepspeed.md
similarity index 100%
rename from Summit-deepspeed.md
rename to summit/Summit-deepspeed.md
diff --git a/Summit.md b/summit/Summit.md
similarity index 100%
rename from Summit.md
rename to summit/Summit.md
diff --git a/olcf-jupyterhub.md b/summit/olcf-jupyterhub.md
similarity index 100%
rename from olcf-jupyterhub.md
rename to summit/olcf-jupyterhub.md
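
A minimal smoke test for the `## Verification` section of `frontier/frontier_pytorch.md` above. This is a sketch, assuming the wheel built into `dist/` has been installed in the `pytorch` conda env and the shell is on a compute node with at least one visible GPU:

```
python -c "import torch; print(torch.__version__, torch.version.hip)"
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"
python -c "import torch; x = torch.randn(1024, 1024, device='cuda'); print((x @ x).sum().item())"
```

On a ROCm build, `torch.version.hip` is populated and the GPUs are driven through the regular `torch.cuda` API, so `is_available()` should return `True` and the matmul should execute on the GPU.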