From c1b985a712c2166d5da86e2940f467cebfa21c39 Mon Sep 17 00:00:00 2001
From: Feiyi Wang
Date: Sun, 27 Aug 2023 08:37:19 -0400
Subject: [PATCH] re-org

---
 Andes.md => andes/Andes.md    |   0
 dl-notebooks/Llama.ipynb      | 261 ++++++++++++++++++
 .../Frontier-cmake.md         |   0
 .../crusher-deepspeed.md      |   0
 .../crusher-pytorch.md        |   0
 frontier/frontier_pytorch.md  | 115 ++++++++
 pytorch-sbs/README.md         |  11 +
 .../JupyterOnSummit.md        |   0
 .../Summit-deepspeed.md       |   0
 Summit.md => summit/Summit.md |   0
 .../olcf-jupyterhub.md        |   0
 11 files changed, 387 insertions(+)
 rename Andes.md => andes/Andes.md (100%)
 create mode 100644 dl-notebooks/Llama.ipynb
 rename Frontier-cmake.md => frontier/Frontier-cmake.md (100%)
 rename Crusher-deepspeed.md => frontier/crusher-deepspeed.md (100%)
 rename Crusher-pytorch.md => frontier/crusher-pytorch.md (100%)
 create mode 100644 frontier/frontier_pytorch.md
 create mode 100644 pytorch-sbs/README.md
 rename JupyterOnSummit.md => summit/JupyterOnSummit.md (100%)
 rename Summit-deepspeed.md => summit/Summit-deepspeed.md (100%)
 rename Summit.md => summit/Summit.md (100%)
 rename olcf-jupyterhub.md => summit/olcf-jupyterhub.md (100%)

diff --git a/Andes.md b/andes/Andes.md
similarity index 100%
rename from Andes.md
rename to andes/Andes.md
diff --git a/dl-notebooks/Llama.ipynb b/dl-notebooks/Llama.ipynb
new file mode 100644
index 0000000..0d9d1d5
--- /dev/null
+++ b/dl-notebooks/Llama.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bc091be0-2260-4eea-98ba-46daf4b10899",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##\n",
+    "## conda update --all\n",
+    "## conda install -c conda-forge ipywidgets\n",
+    "##\n",
+    "## source: https://github.com/bkitano/llama-from-scratch\n",
+    "##\n",
+    "\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "from torch.nn import functional as F\n",
+    "import numpy as np\n",
+    "from matplotlib import pyplot as plt\n",
+    "import time\n",
+    "import pandas as pd\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1f51d6f7-17ff-4765-b30e-cbedb3dc24fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read in text\n",
+    "lines = open('./input.txt', 'r').read()\n",
+    "vocab = sorted(list(set(lines)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "87acf3dc-7610-42f5-9e6a-f262ba058e7a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['\\n', ' ', '!', '$', '&', \"'\", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(vocab)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "3ea85c0a-6732-4a6e-899b-4aedf93bd714",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "itos = {i:ch for i, ch in enumerate(vocab)}\n",
+    "stoi = {ch:i for i, ch in enumerate(vocab)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f559fdcd-3049-4406-bfc3-0ebc23a1d204",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{0: '\\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: \"'\", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(itos)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "fefead73-d6d4-4371-b6f1-921308c06391",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'\\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, \"'\": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(stoi)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b6d823b5-7b01-43af-8a16-703ecc73bdb1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[46, 43, 50, 50, 53]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# simple tokenization by characters\n",
+    "def encode(s):\n",
+    "    return [stoi[ch] for ch in s]\n",
+    "\n",
+    "def decode(l):\n",
+    "    return ''.join([itos[i] for i in l])\n",
+    "\n",
+    "print(encode(\"hello\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "083fb2e5-fe3c-4149-a541-f0bfb53eb4dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hello\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(decode(encode(\"hello\")))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "55e5d77f-aed6-4db0-b99b-2b2eb8e56c91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MASTER_CONFIG = {\n",
+    "    \"vocab_size\": len(vocab),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "fad0ede8-0152-4939-83f8-9f7807c6826a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([1115394])"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dataset = torch.tensor(encode(lines), dtype=torch.int8)\n",
+    "dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "fb3e6512-78cc-4601-b40d-213f0d24138f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):\n",
+    "    train = data[:int(.8 * len(data))]\n",
+    "    val = data[int(.8 * len(data)): int(.9 * len(data))]\n",
+    "    test = data[int(.9 * len(data)):]\n",
+    "\n",
+    "    batch_data = train\n",
+    "    if split == 'val':\n",
+    "        batch_data = val\n",
+    "\n",
+    "    if split == 'test':\n",
+    "        batch_data = test\n",
+    "\n",
+    "    # pick random starting points\n",
+    "    ix = torch.randint(0, batch_data.size(0) - context_window - 1, (batch_size,))\n",
+    "    x = torch.stack([batch_data[i:i+context_window] for i in ix]).long()\n",
+    "    y = torch.stack([batch_data[i+1:i+context_window+1] for i in ix]).long()\n",
+    "    return x, y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "2b5a6586-2fa0-4019-a255-3fd0fd179d47",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'vocab_size': 65, 'batch_size': 32, 'context_window': 16}"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "MASTER_CONFIG.update({\n",
+    "    'batch_size': 32,\n",
+    "    'context_window': 16\n",
+    "})\n",
+    "MASTER_CONFIG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a419246-6cc7-4cec-ab41-f42ab021f3cc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Frontier-cmake.md b/frontier/Frontier-cmake.md
similarity index 100%
rename from Frontier-cmake.md
rename to frontier/Frontier-cmake.md
diff --git a/Crusher-deepspeed.md b/frontier/crusher-deepspeed.md
similarity index 100%
rename from Crusher-deepspeed.md
rename to frontier/crusher-deepspeed.md
diff --git a/Crusher-pytorch.md b/frontier/crusher-pytorch.md
similarity index 100%
rename from Crusher-pytorch.md
rename to frontier/crusher-pytorch.md
diff --git a/frontier/frontier_pytorch.md b/frontier/frontier_pytorch.md
new file mode 100644
index 0000000..7592967
--- /dev/null
+++ b/frontier/frontier_pytorch.md
@@ -0,0 +1,115 @@
+
+# pytorch on Frontier
+
+- [pytorch on Frontier](#pytorch-on-frontier)
+  - [prep Frontier modules](#prep-frontier-modules)
+  - [module output on Frontier](#module-output-on-frontier)
+  - [setup miniconda3](#setup-miniconda3)
+  - [build pytorch](#build-pytorch)
+    - [Build options: see `setup.py`](#build-options-see-setuppy)
+    - [regenerate CMAKE build files](#regenerate-cmake-build-files)
+    - [Kineto and roctracer.h problem](#kineto-and-roctracerh-problem)
+  - [DeepSpeed](#deepspeed)
+  - [GPTNeoX](#gptneox)
+  - [Verification](#verification)
+
+
+## prep Frontier modules
+
+```
+module load PrgEnv-gnu
+module load gcc/10.3.0
+module load rocm/5.1.0
+module load craype-x86-trento
+export HCC_AMDGPU_TARGET=gfx90a
+export PYTORCH_ROCM_ARCH=gfx90a
+export ROCM_SOURCE_DIR=/opt/rocm-5.1.0
+export CRAY_CPU_TARGET=x86_64 # just to remove warning noise
+```
+## module output on Frontier
+
+```
+Currently Loaded Modules:
+  1) libfabric/1.15.2.0    4) cray-dsmml/0.2.2       7) gcc/10.3.0             10) DefApps/default     13) craype-accel-amd-gfx90a
+  2) craype-network-ofi    5) cray-libsci/22.12.1.1  8) darshan-runtime/3.4.0  11) cray-mpich/8.1.23   14) craype-x86-trento
+  3) craype/2.7.19         6) PrgEnv-gnu/8.3.3       9) hsi/default            12) rocm/5.1.0
+```
+
+Note: One of the two modules, `craype-x86-trento` or `craype-accel-amd-gfx90a`, fixed a linking problem. My guess is the former.
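+
+Before building, a quick sanity check that the toolchain and the ROCm stack resolve as expected can save a failed compile later. This is a sketch; the expected outputs in the comments are assumptions based on the module list above, not captured output:
+
+```
+cc --version              # the Cray compiler wrapper should report gcc 10.3.0
+which hipcc               # should resolve under /opt/rocm-5.1.0
+echo $PYTORCH_ROCM_ARCH   # expect gfx90a
+```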
+
+## setup miniconda3
+
+```
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash ./Miniconda3-latest-Linux-x86_64.sh -b -p miniconda
+source ./miniconda/bin/activate   # put conda on PATH; the -b install does not touch shell startup files
+conda create -n pytorch python=3.8
+conda activate pytorch
+pip install -r requirements.txt   # requirements.txt from the pytorch checkout (next section)
+```
+
+## build pytorch
+
+```
+git clone --recursive -b IFU-master-2022-11-22 https://github.com/ROCmSoftwarePlatform/pytorch
+cd pytorch
+python tools/amd_build/build_amd.py
+USE_KINETO=0 USE_ROCM=1 MAX_JOBS=4 python setup.py bdist_wheel 2>&1 | tee output
+```
+
+### Build options: see `setup.py`
+
+```
+USE_KINETO=0 # disable the profiler (Kineto), which otherwise asks for roctracer.h
+```
+### regenerate CMAKE build files
+
+Removing the cache will trigger a rebuild with the changed configuration:
+
+```
+cd pytorch/build
+rm CMakeCache.txt
+```
+To remove the previous build as well:
+
+```
+python setup.py clean
+```
+### Kineto and roctracer.h problem
+
+Kineto requires roctracer, and locating it fails with rocm 5.1.0. The relevant CMake logic is:
+
+```
+if (NOT ROCM_SOURCE_DIR)
+  set(ROCM_SOURCE_DIR "$ENV{ROCM_SOURCE_DIR}")
+  message(INFO " ROCM_SOURCE_DIR = ${ROCM_SOURCE_DIR}")
+endif()
+```
+
+For reasons unknown at this point, `ROCM_SOURCE_DIR` still ends up as `/opt/rocm` instead of `/opt/rocm-5.1.0`, even though the environment variable is set.
+
+So the easy workaround is to hard-code it:
+
+```
+set(ROCM_SOURCE_DIR /opt/rocm-5.1.0)
+```
+
+## DeepSpeed
+
+```
+git clone https://github.com/microsoft/DeepSpeed
+cd DeepSpeed
+DS_BUILD_FUSED_LAMB=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 python setup.py bdist_wheel
+python setup.py install
+```
+
+## GPTNeoX
+
+```
+pip install shortuuid # missing from the requirements files below
+git clone https://github.com/EleutherAI/gpt-neox.git
+cd gpt-neox
+pip install -r requirements/requirements.txt
+pip install -r requirements/requirements-wandb.txt
+pip install -r requirements/requirements-tensorboard.txt
+```
+## Verification
+
+A minimal smoke test is sketched in the note at the end of this patch.
+
diff --git a/pytorch-sbs/README.md b/pytorch-sbs/README.md
new file mode 100644
index 0000000..f64dbd0
--- /dev/null
+++ b/pytorch-sbs/README.md
@@ -0,0 +1,11 @@
+# Study Notes
+
+
+## Standardization
+
+* Based on the loss function, each feature has its own loss curve
+* Each feature, based on its loss curve, has a **very different** optimal learning rate
+* However, the learning rate is global (it applies to all features)
+* Therefore, it makes sense to _try_ to make all features' loss curves similar - but this is not always possible.
+* Standardization (`StandardScaler` in sklearn) is one such technique that could make the loss curves more uniform, and thus help training converge faster and closer to the optimum.
+
diff --git a/JupyterOnSummit.md b/summit/JupyterOnSummit.md
similarity index 100%
rename from JupyterOnSummit.md
rename to summit/JupyterOnSummit.md
diff --git a/Summit-deepspeed.md b/summit/Summit-deepspeed.md
similarity index 100%
rename from Summit-deepspeed.md
rename to summit/Summit-deepspeed.md
diff --git a/Summit.md b/summit/Summit.md
similarity index 100%
rename from Summit.md
rename to summit/Summit.md
diff --git a/olcf-jupyterhub.md b/summit/olcf-jupyterhub.md
similarity index 100%
rename from olcf-jupyterhub.md
rename to summit/olcf-jupyterhub.md
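
A minimal smoke test for the `## Verification` section of `frontier/frontier_pytorch.md` above. This is a sketch, assuming the wheel built into `dist/` has been installed in the `pytorch` conda env and the shell is on a compute node with at least one visible GPU:

```
python -c "import torch; print(torch.__version__, torch.version.hip)"
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"
python -c "import torch; x = torch.randn(1024, 1024, device='cuda'); print((x @ x).sum().item())"
```

On a ROCm build, `torch.version.hip` is populated and the GPUs are driven through the regular `torch.cuda` API, so `is_available()` should return `True` and the matmul should execute on the GPU.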