-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
387 additions
and
0 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,261 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "bc091be0-2260-4eea-98ba-46daf4b10899", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"##\n", | ||
"## conda update --all\n", | ||
"## conda install -c conda-forge ipywidgets\n", | ||
"##\n", | ||
"## source: https://github.com/bkitano/llama-from-scratch\n", | ||
"##\n", | ||
"\n", | ||
"import torch\n", | ||
"from torch import nn\n", | ||
"from torch.nn import functional as F\n", | ||
"import numpy as np\n", | ||
"from matplotlib import pyplot as plt\n", | ||
"import time\n", | ||
"import pandas as pd\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "1f51d6f7-17ff-4765-b30e-cbedb3dc24fc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# read in text\n", | ||
"lines = open('./input.txt', 'r').read()\n", | ||
"vocab = sorted(list(set(lines)))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"id": "87acf3dc-7610-42f5-9e6a-f262ba058e7a", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['\\n', ' ', '!', '$', '&', \"'\", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(vocab)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"id": "3ea85c0a-6732-4a6e-899b-4aedf93bd714", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"itos = {i:ch for i, ch in enumerate(vocab)}\n", | ||
"stoi = {ch:i for i, ch in enumerate(vocab)}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"id": "f559fdcd-3049-4406-bfc3-0ebc23a1d204", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"{0: '\\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: \"'\", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(itos)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"id": "fefead73-d6d4-4371-b6f1-921308c06391", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"{'\\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, \"'\": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(stoi)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"id": "b6d823b5-7b01-43af-8a16-703ecc73bdb1", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[46, 43, 50, 50, 53]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# simple tokenization by characters\n", | ||
"def encode(s):\n", | ||
" return [stoi[ch] for ch in s]\n", | ||
"\n", | ||
"def decode(l):\n", | ||
" return ''.join([itos[i] for i in l])\n", | ||
"\n", | ||
"print(encode(\"hello\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 23, | ||
"id": "083fb2e5-fe3c-4149-a541-f0bfb53eb4dc", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"hello\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(decode(encode(\"hello\")))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 24, | ||
"id": "55e5d77f-aed6-4db0-b99b-2b2eb8e56c91", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"MASTER_CONFIG = {\n", | ||
" \"vocab_size\": len(vocab),\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 26, | ||
"id": "fad0ede8-0152-4939-83f8-9f7807c6826a", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"torch.Size([1115394])" | ||
] | ||
}, | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"dataset = torch.tensor(encode(lines), dtype=torch.int8)\n", | ||
"dataset.shape" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 27, | ||
"id": "fb3e6512-78cc-4601-b40d-213f0d24138f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):\n", | ||
" train = data[:int(.8 * len(data))]\n", | ||
" val = data[int(.8 * len(data)): int(.9 * len(data))]\n", | ||
" test = data[int(.9 * len(data)):]\n", | ||
"\n", | ||
" batch_data = train\n", | ||
" if split == 'val':\n", | ||
" batch_data = val\n", | ||
"\n", | ||
" if split == 'test':\n", | ||
" batch_data = test\n", | ||
"\n", | ||
" # pick random starting points\n", | ||
" ix = torch.randint(0, batch_data.size(0) - context_window - 1, (batch_size,))\n", | ||
" x = torch.stack([batch_data[i:i+context_window] for i in ix]).long()\n", | ||
" y = torch.stack([batch_data[i+1:i+context_window+1] for i in ix]).long()\n", | ||
" return x, y" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 30, | ||
"id": "2b5a6586-2fa0-4019-a255-3fd0fd179d47", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"{'vocab_size': 65, 'batch_size': 32, 'context_window': 16}" | ||
] | ||
}, | ||
"execution_count": 30, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"MASTER_CONFIG.update({\n", | ||
" 'batch_size': 32,\n", | ||
" 'context_window': 16\n", | ||
"})\n", | ||
"MASTER_CONFIG" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "7a419246-6cc7-4cec-ab41-f42ab021f3cc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.15" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
|
||
# pytorch on Frontier | ||
|
||
- [pytorch on Frontier](#pytorch-on-frontier) | ||
- [prep Frontier modules](#prep-frontier-modules) | ||
- [module output on Frontier](#module-output-on-frontier) | ||
- [setup miniconda3](#setup-miniconda3) | ||
- [build pytorch](#build-pytorch) | ||
- [Build options: see `setup.py`](#build-options-see-setuppy) | ||
- [regenerate CMAKE build files](#regenerate-cmake-build-files) | ||
- [Kineto and roctracer.h problem](#kineto-and-roctracerh-problem) | ||
- [DeepSpeed](#deepspeed) | ||
- [GPTNeoX](#gptneox) | ||
- [Verification](#verification) | ||
|
||
|
||
|
||
## prep Frontier modules | ||
|
||
``` | ||
module load PrgEnv-gnu | ||
module load gcc/10.3.0 | ||
module load rocm/5.1.0 | ||
module load craype-x86-trento | ||
export HCC_AMDGPU_TARGET=gfx90a | ||
export PYTORCH_ROCM_ARCH=gfx90a | ||
export ROCM_SOURCE_DIR=/opt/rocm-5.1.0 | ||
export CRAY_CPU_TARGET=x86_64 # just to remove warning noise | ||
``` | ||
## module output on Frontier | ||
|
||
``` | ||
Currently Loaded Modules: | ||
1) libfabric/1.15.2.0 4) cray-dsmml/0.2.2 7) gcc/10.3.0 10) DefApps/default 13) craype-accel-amd-gfx90a | ||
2) craype-network-ofi 5) cray-libsci/22.12.1.1 8) darshan-runtime/3.4.0 11) cray-mpich/8.1.23 14) craype-x86-trento | ||
3) craype/2.7.19 6) PrgEnv-gnu/8.3.3 9) hsi/default 12) rocm/5.1.0 | ||
``` | ||
|
||
Note: One of the two modules, `craype-x86-trento` or `craype-accel-amd-gfx90a`, fixed a linking problem. My guess is the former. | ||
|
||
## setup miniconda3 | ||
|
||
``` | ||
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh | ||
bash ./Miniconda3-latest-Linux-x86_64.sh -b -p miniconda | ||
conda create -n pytorch python=3.8 | ||
conda activate pytorch | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## build pytorch | ||
|
||
``` | ||
git clone --recursive -b IFU-master-2022-11-22 https://github.com/ROCmSoftwarePlatform/pytorch | ||
python tools/amd_build/build_amd.py | ||
USE_KINETO=0 USE_ROCM=1 MAX_JOBS=4 python setup.py bdist_wheel 2>&1 | tee output | ||
``` | ||
|
||
### Build options: see `setup.py` | ||
|
||
``` | ||
USE_KINETO=0 # disable the Kineto profiler, which otherwise requires roctracer.h | ||
``` | ||
### regenerate CMAKE build files | ||
|
||
This will trigger a rebuild for the changed configuration. | ||
|
||
``` | ||
cd pytorch/build | ||
rm CMakeCache.txt | ||
``` | ||
To remove the previous build as well: | ||
|
||
``` | ||
python setup.py clean | ||
``` | ||
### Kineto and roctracer.h problem | ||
|
||
Kineto requires roctracer, which fails in rocm 5.1.0 | ||
|
||
``` | ||
if (NOT ROCM_SOURCE_DIR) | ||
set(ROCM_SOURCE_DIR "$ENV{ROCM_SOURCE_DIR}") | ||
message(INFO " ROCM_SOURCE_DIR = ${ROCM_SOURCE_DIR}") | ||
endif() | ||
``` | ||
|
||
For reasons unknown at this point, `ROCM_SOURCE_DIR` is still set to `/opt/rocm` instead of `/opt/rocm-5.1.0`, even though the environment variable is set. | ||
|
||
So the easy workaround is: | ||
|
||
``` | ||
set(ROCM_SOURCE_DIR /opt/rocm-5.1.0) | ||
``` | ||
|
||
## DeepSpeed | ||
|
||
``` | ||
git clone https://github.com/microsoft/DeepSpeed | ||
DS_BUILD_FUSED_LAMB=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 python setup.py bdist_wheel | ||
python setup.py install | ||
``` | ||
|
||
## GPTNeoX | ||
|
||
``` | ||
pip install shortuuid # missed from | ||
git clone https://github.com/EleutherAI/gpt-neox.git | ||
pip install -r requirements/requirements.txt | ||
pip install -r requirements/requirements-wandb.txt | ||
pip install -r requirements/requirements-tensorboard.txt | ||
``` | ||
## Verification | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Study Notes | ||
|
||
|
||
## Standardization | ||
|
||
* Based on loss function, each feature has a loss curve | ||
* Each feature, based on the loss curve, has **very different** optimal learning rates | ||
* However, the learning rate is global (it applies to all features) | ||
* Therefore, it makes sense to _try_ to make all features' loss curves similar - but this is not always possible. | ||
* Standardization (`StandardScaler` in sklearn) is one such technique that could make the loss curve(s) more uniform (?), and thus converge better (reaching the optimal point) and faster. | ||
|
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.