add holly

fwang2 · Dec 7, 2023 · db92fe4 · db92fe4
1 parent 41b4626
commit db92fe4
Showing 1 changed file with 116 additions and 0 deletions.
diff --git a/README-holly.ipynb b/README-holly.ipynb
@@ -0,0 +1,116 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4cb5e3f4-8bb3-4bcf-8e43-ac5e6a5b3765",
+   "metadata": {},
+   "source": [
+    "## Jupyter setup\n",
+    "\n",
+    "run `ssh-add -s /Library/OpenSC/lib/opensc-pkcs11.so` on local (Mac) to make sure the ssh-agent working.\n",
+    "`config.common` and `.zshrc` is set up in a way that we can:\n",
+    "* run `holly-tunnel` for reverse ssh tunning\n",
+    "* run `ssh wombat[1,2]` for other work\n",
+    "\n",
+    "Once tunnel into holly, we can:\n",
+    "\n",
+    "```\n",
+    "source activate torch\n",
+    "cd hpc-ml; jupyter-lab \n",
+    "```\n",
+    "After Jupyter server starts, it will spit out a message such as:\n",
+    "\n",
+    "```\n",
+    "  To access the server, open this file in a browser:\n",
+    "        file:///autofs/nccs-svm1_envoy_od/f7b/.local/share/jupyter/runtime/jpserver-2302606-open.html\n",
+    "    Or copy and paste one of these URLs:\n",
+    "        http://localhost:8888/lab?token=8fd8fb9ee265905d2f01f8f168e97aecebcf90f0ae54ffcb\n",
+    "     or http://127.0.0.1:8888/lab?token=8fd8fb9ee265905d2f01f8f168e97aecebcf90f0ae54ffcb\n",
+    "```\n",
+    "\n",
+    "Copy this and paste to the **local** browser, and you should have the connection to the server.\n",
+    "\n",
+    "There are two errors from jupyter server console, `OSError: [Errno 97] Address family not supported by protocol`. I think this can be ignored.\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "692803d2-3ee1-4762-bd6f-d56a89d5c498",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Holly System Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a220287d-da92-4094-8791-60ad860c8209",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch_version: 2.1.0\n",
+      "device_count: 8\n",
+      "Using device: cuda\n",
+      "NVIDIA H100 80GB HBM3\n",
+      "Total GPU memory: 79.11 GB\n",
+      "Memory Usage:\n",
+      "Allocated: 0.0 GB\n",
+      "Cached:    0.0 GB\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "print('torch_version:', torch.__version__)\n",
+    "print('device_count:', torch.cuda.device_count())\n",
+    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+    "print('Using device:', device)\n",
+    "if device.type == 'cuda':\n",
+    "    print(torch.cuda.get_device_name(0))\n",
+    "    total_memory = torch.cuda.get_device_properties(0).total_memory\n",
+    "    print(f\"Total GPU memory: {total_memory / 1024**3:.2f} GB\")\n",
+    "    print('Memory Usage:')\n",
+    "    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')\n",
+    "    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79bf9a58-3d29-4a40-adb2-f9f0152e09aa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}