From c388e86eb490b4ec3c0cd5f9e4b44ffcd5741cd0 Mon Sep 17 00:00:00 2001 From: Feiyi Wang Date: Wed, 13 Dec 2023 07:18:25 -0500 Subject: [PATCH 1/2] update embeddings --- RAG/LC-FORGE-frontier2.ipynb | 4 +- RAG/embeddings.ipynb | 1450 ++++++++++++++++++++++ README-holly.ipynb | 21 +- 3 files changed, 1458 insertions(+), 17 deletions(-) create mode 100644 RAG/embeddings.ipynb
diff --git a/RAG/LC-FORGE-frontier2.ipynb b/RAG/LC-FORGE-frontier2.ipynb index 08f00aa..f40206d 100644 --- a/RAG/LC-FORGE-frontier2.ipynb +++ b/RAG/LC-FORGE-frontier2.ipynb @@ -74,9 +74,7 @@ "output_type": "execute_result" } ], - "source": [ - "word_embedding_model.get_word_embedding_dimension()" - ] + "source": [] }, { "cell_type": "code",
diff --git a/RAG/embeddings.ipynb b/RAG/embeddings.ipynb new file mode 100644 index 0000000..1af5a9b --- /dev/null +++ b/RAG/embeddings.ipynb @@ -0,0 +1,1450 @@ +{ + "cells": [
+ { + "cell_type": "code", + "execution_count": 1, + "id": "794c94f6-f382-466f-a0d4-e18c436fe65f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%html\n", + "" + ] + },
+ { + "cell_type": "markdown", + "id": "38137e22-f1df-489d-be90-602a3799bfb2", + "metadata": {}, + "source": [ + "# Embedding\n", + "\n", + "\n", + "## Overview\n", + "\n", + "What is it?\n", + "A **numerical representation** of a:\n", + "* word\n", + "* sentence\n", + "* image\n", + "* audio\n", + "\n", + "**BEFORE word embeddings: one-hot encoding**\n", + "\n", + "* sparse\n", + "* dimension equal to the vocab size\n", + "* a pure positional index, losing all other kinds of information" + ] + },
+ { + "cell_type": "code", + "execution_count": 1, + "id": "f7f7e3a7-3cd2-45e8-8d45-aac475d4aceb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[1, 0, 0, 0],\n", + " [0, 1, 0, 0],\n", + " [0, 0, 1, 0],\n", + " [0, 0, 0, 1]])\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "# Tensor of indices for our words\n", + "# king -> 0, queen -> 1, man -> 2, woman -> 3\n", + "indices = torch.tensor([0, 1, 2, 3])\n", + "\n", + "# Number of classes (unique words)\n", + "num_classes = 4\n", + "\n", + "# One-hot encoding\n", + "one_hot_encoded = F.one_hot(indices, num_classes=num_classes)\n", + "\n", + "print(one_hot_encoded)" + ] + },
+ { + "cell_type": "markdown", + "id": "704f0556-6f93-4eb0-a1ad-9b543e53119a", + "metadata": {}, + "source": [ + "| king | queen | man | woman |\n", + "|------|-------|-----|-------|\n", + "| 1 | 0 | 0 | 0 |\n", + "| 0 | 1 | 0 | 0 |\n", + "| 0 | 0 | 1 | 0 |\n", + "| 0 | 0 | 0 | 1 |\n" + ] + },
+ { + "cell_type": "markdown", + "id": "add51b4a-df9c-491c-a447-829125bbf909", + "metadata": {}, + "source": [ + "Why not just use an integer?\n", + "\n", + "* integer encoding (label encoding) implies an ordinal relationship\n", + "* for categorical values, no such relationship exists (see the sketch below)\n" + ] + },
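+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A quick sketch of the problem (added for illustration; the word-to-index assignment is arbitrary): integer labels impose a fake ordering and fake distances, while one-hot vectors make every pair of distinct words equidistant.\n" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "# king -> 0, queen -> 1, man -> 2, woman -> 3, as integer labels\n", + "labels = torch.tensor([0., 1., 2., 3.])\n", + "# the same four words, one-hot encoded\n", + "one_hot = F.one_hot(torch.arange(4), num_classes=4).float()\n", + "\n", + "# integer encoding: |king - woman| = 3 but |king - queen| = 1, a meaningless ordering\n", + "print(abs(labels[0] - labels[3]), abs(labels[0] - labels[1]))\n", + "\n", + "# one-hot encoding: every pair of distinct words is equally far apart (sqrt(2))\n", + "print(torch.cdist(one_hot, one_hot))" + ] + },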
\n", + "\n", + "we want to capture:\n", + "\n", + "* context of the paragraph\n", + "* semantic property\n", + "* syntatic property (grammar)\n", + "\n", + "$$f(w_i) = \\theta_i $$\n", + "f(cat) = (0.3, 0, 0.4 ...)\n", + "\n", + "\n", + "### Why embeddings is important?\n", + "\n", + "* they are the compact form of compressed data\n", + "* they preserve relationship within the data\n", + "* they are the output of DL layer - a **linear view** into complex **non-linear relationship** learned by the model\n", + "\n", + "\n", + "\n", + "before word2vec, there is Bengio's \"A Neural Probablistic Language Model\" (2003) https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf\n", + "\n", + "(benjio_nnlm.py)\n" + ] + }, + { + "cell_type": "markdown", + "id": "500bd34a-55bd-44ff-9f26-b38fc0d663a6", + "metadata": {}, + "source": [ + "## word2vec (2013)\n", + "\n", + "* Mikolov paper \"Efficient Estimation of Word Representation in Vector Space\". \n", + "* Continous bag-of-words model (CBoW)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "460b96e5-8c26-4181-b35d-acfe75b6bf6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OrderedDict([('weight',\n", + " tensor([[ 1.9269, 1.4873, 0.9007],\n", + " [-2.1055, 0.6784, -1.2345],\n", + " [-0.0431, -1.6047, -0.7521],\n", + " [ 1.6487, -0.3925, -1.4036],\n", + " [-0.7279, -0.5594, -0.7688],\n", + " [ 0.7624, 1.6423, -0.1596],\n", + " [-0.4974, 0.4396, -0.7581],\n", + " [ 1.0783, 0.8008, 1.6806],\n", + " [ 1.2791, 1.2964, 1.5736],\n", + " [-0.8455, 1.3123, 0.6872],\n", + " [-1.0892, -0.3553, -0.9138],\n", + " [-0.6581, 0.0499, 2.2667],\n", + " [ 1.1790, -0.4345, -1.3864],\n", + " [-1.2862, -1.4032, 0.0360]]))])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "tiny_vocab=\"We must forever conduct our struggle on the high plane of dignity and discipline\".split()\n", + "\n", + "class CBOW(nn.Module):\n", + " def __init__(self, vocab_size, embedding_dim):\n", + " super().__init__()\n", + " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", + " self.linear = nn.Linear(embedding_dim, vocab_size)\n", + " \n", + " def forward(self, X):\n", + " embeddings = self.embedding(X)\n", + " bow = embeddings.mean(dim=1)\n", + " logits = self.linear(bow)\n", + " return logits\n", + "\n", + "torch.manual_seed(42)\n", + "dummy_cbow = CBOW(vocab_size=len(tiny_vocab), embedding_dim=3)\n", + "dummy_cbow.embedding.state_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "e1fc58ff-c3cc-4529-a683-cfcb37ac5006", + "metadata": {}, + "source": [ + "| Word | Value 1 | Value 2 | Value 3 |\n", + "|-----------|---------|---------|---------|\n", + "| We | 1.9269 | 1.4873 | 0.9007 |\n", + "| must | -2.1055 | 0.6784 | -1.2345 |\n", + "| forever | -0.0431 | -1.6047 | -0.7521 |\n", + "| conduct | 1.6487 | -0.3925 | -1.4036 |\n", + "| our | -0.7279 | -0.5594 | -0.7688 |\n", + "| struggle | 0.7624 | 1.6423 | -0.1596 |\n", + "| on | -0.4974 | 0.4396 | -0.7581 |\n", + "| the | 1.0783 | 0.8008 | 1.6806 |\n", + "| high | 1.2791 | 1.2964 | 1.5736 |\n", + "| plane | -0.8455 | 1.3123 | 0.6872 |\n", + "| of | -1.0892 | -0.3553 | -0.9138 |\n", + "| dignity | -0.6581 | 0.0499 | 2.2667 |\n", + "| and | 1.1790 | -0.4345 | -1.3864 |\n", + "| discipline| -1.2862 | -1.4032 | 0.0360 |\n" + ] + }, + { + "cell_type": "markdown", + "id": "c7304159-c104-48fc-9ed0-223d30c444d0", + "metadata": {}, + "source": [ + "### notes on 
+ { + "cell_type": "markdown", + "id": "500bd34a-55bd-44ff-9f26-b38fc0d663a6", + "metadata": {}, + "source": [ + "## word2vec (2013)\n", + "\n", + "* Mikolov et al., \"Efficient Estimation of Word Representations in Vector Space\". \n", + "* Continuous bag-of-words model (CBOW)\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 15, + "id": "460b96e5-8c26-4181-b35d-acfe75b6bf6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OrderedDict([('weight',\n", + " tensor([[ 1.9269, 1.4873, 0.9007],\n", + " [-2.1055, 0.6784, -1.2345],\n", + " [-0.0431, -1.6047, -0.7521],\n", + " [ 1.6487, -0.3925, -1.4036],\n", + " [-0.7279, -0.5594, -0.7688],\n", + " [ 0.7624, 1.6423, -0.1596],\n", + " [-0.4974, 0.4396, -0.7581],\n", + " [ 1.0783, 0.8008, 1.6806],\n", + " [ 1.2791, 1.2964, 1.5736],\n", + " [-0.8455, 1.3123, 0.6872],\n", + " [-1.0892, -0.3553, -0.9138],\n", + " [-0.6581, 0.0499, 2.2667],\n", + " [ 1.1790, -0.4345, -1.3864],\n", + " [-1.2862, -1.4032, 0.0360]]))])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "tiny_vocab=\"We must forever conduct our struggle on the high plane of dignity and discipline\".split()\n", + "\n", + "class CBOW(nn.Module):\n", + " def __init__(self, vocab_size, embedding_dim):\n", + " super().__init__()\n", + " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", + " self.linear = nn.Linear(embedding_dim, vocab_size)\n", + " \n", + " def forward(self, X):\n", + " embeddings = self.embedding(X)\n", + " bow = embeddings.mean(dim=1)\n", + " logits = self.linear(bow)\n", + " return logits\n", + "\n", + "torch.manual_seed(42)\n", + "dummy_cbow = CBOW(vocab_size=len(tiny_vocab), embedding_dim=3)\n", + "dummy_cbow.embedding.state_dict()" + ] + },
+ { + "cell_type": "markdown", + "id": "e1fc58ff-c3cc-4529-a683-cfcb37ac5006", + "metadata": {}, + "source": [ + "| Word | Value 1 | Value 2 | Value 3 |\n", + "|-----------|---------|---------|---------|\n", + "| We | 1.9269 | 1.4873 | 0.9007 |\n", + "| must | -2.1055 | 0.6784 | -1.2345 |\n", + "| forever | -0.0431 | -1.6047 | -0.7521 |\n", + "| conduct | 1.6487 | -0.3925 | -1.4036 |\n", + "| our | -0.7279 | -0.5594 | -0.7688 |\n", + "| struggle | 0.7624 | 1.6423 | -0.1596 |\n", + "| on | -0.4974 | 0.4396 | -0.7581 |\n", + "| the | 1.0783 | 0.8008 | 1.6806 |\n", + "| high | 1.2791 | 1.2964 | 1.5736 |\n", + "| plane | -0.8455 | 1.3123 | 0.6872 |\n", + "| of | -1.0892 | -0.3553 | -0.9138 |\n", + "| dignity | -0.6581 | 0.0499 | 2.2667 |\n", + "| and | 1.1790 | -0.4345 | -1.3864 |\n", + "| discipline| -1.2862 | -1.4032 | 0.0360 |\n" + ] + },
+ { + "cell_type": "markdown", + "id": "c7304159-c104-48fc-9ed0-223d30c444d0", + "metadata": {}, + "source": [ + "### notes on the embedding layer\n", + "* `nn.Embedding` is a lookup table: randomly initialized, with shape (vocab size, embedding dimension)\n", + "* it is dense, not sparse anymore\n", + "* you retrieve an embedding via its **token ID**, which is one of the functions of a tokenizer \n", + "* e.g. 50,000 vocab size, 300 - 1500 dimensions\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 21, + "id": "47337247-c037-4792-8ae1-d751aa4b9f3a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[-0.0431, -1.6047, -0.7521],\n", + " [ 1.6487, -0.3925, -1.4036],\n", + " [-0.7279, -0.5594, -2.3169],\n", + " [-0.2168, -1.3847, -0.8712]]], grad_fn=<EmbeddingBackward0>)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## pytorch embedding layer\n", + "import torch\n", + "from torch import nn\n", + "torch.manual_seed(42)\n", + "\n", + "test_embedding = nn.Embedding(10, 3) # 10 embeddings, each of dimension 3\n", + "\n", + "# pass in a 2-d tensor of token IDs\n", + "idx = torch.as_tensor([[2,3,4,5]]).long()\n", + "\n", + "test_embedding(idx)\n", + "\n", + "# uncomment for an out-of-range error case (valid IDs are 0..9)\n", + "# idx=torch.tensor([10])\n", + "# test_embedding(idx)\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 24, + "id": "504a76b1-fe16-47b1-9f2e-fb0215c416eb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0.1652, -0.9853, -1.3360]], grad_fn=<MeanBackward1>)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# mean over the token dimension (dim=1): one 3-dim vector for the batch\n", + "test_embedding(idx).mean(dim=1)" + ] + },
+ { + "cell_type": "markdown", + "id": "ef793b20-9756-4cc2-aac5-14ad52e3f34d", + "metadata": { + "tags": [] + }, + "source": [ + "### target and context\n", + "\n", + "* the target is the word we want to predict; the context is the surrounding words we use to predict it.\n", + "* the following code manually defines one target word from the vocabulary - index 5 - and uses the rest of the words as context.\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 10, + "id": "25b2f5b2-dc6a-4979-bb54-f447bd885b96", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13] [5]\n" + ] + } + ], + "source": [ + "def find_indices(keys, targets):\n", + " # Create a dictionary mapping each key to its index\n", + " key_to_index = {key: index for index, key in enumerate(keys)}\n", + "\n", + " # Find the index for each target string\n", + " indices = [key_to_index.get(target, -1) for target in targets]\n", + "\n", + " return indices\n", + "\n", + "context_words = tiny_vocab.copy()\n", + "target_words = 'struggle'\n", + "context_words.remove(target_words)\n", + "context_idx = find_indices(tiny_vocab, context_words)\n", + "target_idx = find_indices(tiny_vocab, [target_words])\n", + "print(context_idx, target_idx)" + ] + },
+ { + "cell_type": "markdown", + "id": "9d777f56-d195-4033-97d3-3ad39192c1d7", + "metadata": { + "tags": [] + }, + "source": [ + "### get embeddings from context words\n", + "\n", + "the following code illustrates:\n", + "\n", + "* we use a batch of context words as input to the embedding layer\n", + "* the embedding layer looks up the embeddings of ALL context words and computes their mean\n", + "* since each word embedding is 3-dim, the output is also 3-dim\n", + "* this 3-dim vector is also known as the \"features\"\n", + "* we will use these features to compute **logits**\n" + ] + },
+ {
"cell_type": "code", + "execution_count": 19, + "id": "e0e1af82-d715-4c1a-8196-9acfad967ed3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 1.9269, 1.4873, 0.9007],\n", + " [-2.1055, 0.6784, -1.2345],\n", + " [-0.0431, -1.6047, -0.7521],\n", + " [ 1.6487, -0.3925, -1.4036],\n", + " [-0.7279, -0.5594, -0.7688],\n", + " [-0.4974, 0.4396, -0.7581],\n", + " [ 1.0783, 0.8008, 1.6806],\n", + " [ 1.2791, 1.2964, 1.5736],\n", + " [-0.8455, 1.3123, 0.6872],\n", + " [-1.0892, -0.3553, -0.9138],\n", + " [-0.6581, 0.0499, 2.2667],\n", + " [ 1.1790, -0.4345, -1.3864],\n", + " [-1.2862, -1.4032, 0.0360]]], grad_fn=)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_context = torch.as_tensor([context_idx]).long()\n", + "batch_target = torch.as_tensor([target_idx]).long()\n", + "dummy_cbow.embedding(batch_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0bba2fe9-db9a-4cca-ad76-1dc4905944b5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-0.0108, 0.1012, -0.0056]], grad_fn=)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cbow_features = dummy_cbow.embedding(batch_context).mean(dim=1)\n", + "cbow_features" + ] + }, + { + "cell_type": "markdown", + "id": "7c1f323f-e936-4d49-8251-4beaf6efddfe", + "metadata": {}, + "source": [ + "### compute logits using embedding features\n", + "\n", + "the embedding features (3-dim) is passed to linear layer to compute logits: a probalistic output, where the large value indicts *the most likely outcome*.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "a1266ba6-95c4-4e55-b8aa-c968a75488f6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Linear(in_features=3, out_features=14, bias=True)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dummy_cbow.linear" + ] + }, + { + "cell_type": "markdown", + "id": "9fb34180-e63d-4cb9-adb2-ef379beab749", + "metadata": {}, + "source": [ + "Logits: In the context of classification tasks, the term \"logits\" typically refers to the raw, unnormalized scores (output of the last linear layer) that a classification model outputs, which are then passed through a softmax function to obtain probabilities. If nn.Linear() is the last layer in a classification model, and you haven't applied an activation function like softmax to its output, then yes, the output of nn.Linear() can be considered as logits." 
+ ] + },
+ { + "cell_type": "code", + "execution_count": 28, + "id": "02377315-73db-4d26-aeed-d175684f3afe", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-0.3772, 0.1676, -0.0930, -0.4483, 0.0243, -0.4446, -0.4631, -0.3511,\n", + " -0.5342, -0.3302, 0.5974, 0.1433, 0.1483, -0.5540]],\n", + " grad_fn=<AddmmBackward0>)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logits = dummy_cbow.linear(cbow_features)\n", + "logits" + ] + },
+ { + "cell_type": "code", + "execution_count": 48, + "id": "24f93be3-8d6a-4be1-b373-0e91fe0ca904", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor(10)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.argmax(logits)" + ] + },
+ { + "cell_type": "code", + "execution_count": 50, + "id": "8e189a30-6c87-4306-88af-beb96bcac315", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'of'" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tiny_vocab[10]" + ] + },
+ { + "cell_type": "code", + "execution_count": 30, + "id": "47758dfe-3606-4098-bd75-233786cf1b00", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[0.0552, 0.0951, 0.0733, 0.0514, 0.0824, 0.0516, 0.0506, 0.0566, 0.0472,\n", + " 0.0578, 0.1462, 0.0929, 0.0933, 0.0462]], grad_fn=<SoftmaxBackward0>)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "# Apply softmax along the last dimension (dim=-1) to convert logits into probabilities\n", + "# The dimension parameter specifies the axis along which softmax is computed\n", + "# here logits has shape (1, 14), so dim=-1 normalizes over the 14 vocabulary scores\n", + "probabilities = F.softmax(logits, dim=-1)\n", + "probabilities" + ] + },
+ { + "cell_type": "markdown", + "id": "23c08016-e71f-4afe-ae07-7b381a5add43", + "metadata": {}, + "source": [ + "* \"of\" is the predicted word, whereas the target word is \"struggle\"\n", + "* but ... this is a randomly initialized model, and we need to LEARN\n", + "* The point is, given a large enough dataset of context words and targets, we could TRAIN the CBOW model using `nn.CrossEntropyLoss()` to learn the actual word embeddings (a sketch follows below).\n", + "\n", + "\n", + "**How are embeddings being used?**\n", + "\n", + "* similarity search\n", + "* clustering\n", + "* and many more" + ] + },
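+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal training-loop sketch (added for illustration; this 14-word \"corpus\" with a single context/target pair is far too small to learn meaningful embeddings):\n" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "# train the CBOW model defined above on the single (context, target) pair\n", + "optimizer = torch.optim.Adam(dummy_cbow.parameters(), lr=0.01)\n", + "loss_fn = nn.CrossEntropyLoss()  # expects raw logits, applies log-softmax internally\n", + "\n", + "for epoch in range(100):\n", + "    optimizer.zero_grad()\n", + "    logits = dummy_cbow(batch_context)  # (1, vocab_size)\n", + "    loss = loss_fn(logits, batch_target.view(-1))  # target IDs, shape (1,)\n", + "    loss.backward()\n", + "    optimizer.step()\n", + "\n", + "# after training, this should recover 'struggle'\n", + "tiny_vocab[torch.argmax(dummy_cbow(batch_context)).item()]" + ] + },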
+ { + "cell_type": "markdown", + "id": "32564584-fcd2-4b54-bd44-53b7ec8dc900", + "metadata": {}, + "source": [ + "## Global Vectors (GloVe)\n", + "\n", + "Pennington et al., \"GloVe: Global Vectors for Word Representation\" (2014)\n", + "\n", + "* the notes below are more about how to use it than about explaining it.\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 61, + "id": "764719c9-d1bd-4b9d-8112-a7c031139b7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gensim.models.keyedvectors:loading projection weights from /ccsopen/home/f7b/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 50) matrix of type float32 from /ccsopen/home/f7b/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-12-09T11:19:37.374334', 'gensim': '4.3.2', 'python': '3.11.5 (main, Sep 11 2023, 13:54:46) [GCC 11.2.0]', 'platform': 'Linux-4.18.0-372.32.1.el8_6.x86_64-x86_64-with-glibc2.28', 'event': 'load_word2vec_format'}\n" + ] + } + ], + "source": [ + "# download pre-trained word embeddings\n", + "from gensim import downloader\n", + "glove = downloader.load('glove-wiki-gigaword-50')" + ] + },
+ { + "cell_type": "code", + "execution_count": 75, + "id": "a286def9-17a6-4859-ae6a-839e549263f6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "king = [ 0.50451 0.68607 -0.59517 -0.022801 0.60046 -0.13498 -0.08813\n", + " 0.47377 -0.61798 -0.31012 -0.076666 1.493 -0.034189 -0.98173\n", + " 0.68229 0.81722 -0.51874 -0.31503 -0.55809 0.66421 0.1961\n", + " -0.13495 -0.11476 -0.30344 0.41177 -2.223 -1.0756 -1.0783\n", + " -0.34354 0.33505 1.9927 -0.04234 -0.64319 0.71125 0.49159\n", + " 0.16754 0.34344 -0.25663 -0.8523 0.1661 0.40102 1.1685\n", + " -1.0137 -0.21585 -0.15155 0.78321 -0.91241 -1.6106 -0.64426\n", + " -0.51042 ]\n", + " queen = [ 0.37854 1.8233 -1.2648 -0.1043 0.35829 0.60029\n", + " -0.17538 0.83767 -0.056798 -0.75795 0.22681 0.98587\n", + " 0.60587 -0.31419 0.28877 0.56013 -0.77456 0.071421\n", + " -0.5741 0.21342 0.57674 0.3868 -0.12574 0.28012\n", + " 0.28135 -1.8053 -1.0421 -0.19255 -0.55375 -0.054526\n", + " 1.5574 0.39296 -0.2475 0.34251 0.45365 0.16237\n", + " 0.52464 -0.070272 -0.83744 -1.0326 0.45946 0.25302\n", + " -0.17837 -0.73398 -0.20025 0.2347 -0.56095 -2.2839\n", + " 0.0092753 -0.60284 ]\n", + " man = [-0.094386 0.43007 -0.17224 -0.45529 1.6447 0.40335 -0.37263\n", + " 0.25071 -0.10588 0.10778 -0.10848 0.15181 -0.65396 0.55054\n", + " 0.59591 -0.46278 0.11847 0.64448 -0.70948 0.23947 -0.82905\n", + " 1.272 0.033021 0.2935 0.3911 -2.8094 -0.70745 0.4106\n", + " 0.3894 -0.2913 2.6124 -0.34576 -0.16832 0.25154 0.31216\n", + " 0.31639 0.12539 -0.012646 0.22297 -0.56585 -0.086264 0.62549\n", + " -0.0576 0.29375 0.66005 -0.53115 -0.48233 -0.97925 0.53135\n", + " -0.11725 ]\n", + " woman = [-1.8153e-01 6.4827e-01 -5.8210e-01 -4.9451e-01 1.5415e+00 1.3450e+00\n", + " -4.3305e-01 5.8059e-01 3.5556e-01 -2.5184e-01 2.0254e-01
-7.1643e-01\n", + " 3.0610e-01 5.6127e-01 8.3928e-01 -3.8085e-01 -9.0875e-01 4.3326e-01\n", + " -1.4436e-02 2.3725e-01 -5.3799e-01 1.7773e+00 -6.6433e-02 6.9795e-01\n", + " 6.9291e-01 -2.6739e+00 -7.6805e-01 3.3929e-01 1.9695e-01 -3.5245e-01\n", + " 2.2920e+00 -2.7411e-01 -3.0169e-01 8.5286e-04 1.6923e-01 9.1433e-02\n", + " -2.3610e-02 3.6236e-02 3.4488e-01 -8.3947e-01 -2.5174e-01 4.2123e-01\n", + " 4.8616e-01 2.2325e-02 5.5760e-01 -8.5223e-01 -2.3073e-01 -1.3138e+00\n", + " 4.8764e-01 -1.0467e-01]\n" + ] + } + ], + "source": [ + "king = glove['king']\n", + "queen = glove['queen']\n", + "man = glove['man']\n", + "woman = glove['woman']\n", + "print(f\"king = {king}\\n queen = {queen}\\n man = {man}\\n woman = {woman}\")" + ] + },
+ { + "cell_type": "code", + "execution_count": 76, + "id": "ce90bca4-bfbe-4e35-822d-f88b358faad2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor(0.8610)" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch.nn.functional as F\n", + "synthetic_queen = king - man + woman\n", + "F.cosine_similarity(torch.from_numpy(synthetic_queen), torch.from_numpy(queen), dim=0)" + ] + },
+ { + "cell_type": "code", + "execution_count": 77, + "id": "bb539efe-c991-4ade-ab1e-ff49446170db", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[0.8610]])" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sentence_transformers import SentenceTransformer, models,util\n", + "util.cos_sim(synthetic_queen, queen)" + ] + },
+ { + "cell_type": "code", + "execution_count": 78, + "id": "200a7fc5-7814-4394-b94b-bc24a29759df", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[1.]])" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "util.cos_sim(queen, queen)" + ] + },
+ { + "cell_type": "code", + "execution_count": 79, + "id": "8fdb0c42-716d-4582-bc12-8032470431ee", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('queen', 1.0000001192092896),\n", + " ('princess', 0.8515165448188782),\n", + " ('lady', 0.805060863494873),\n", + " ('elizabeth', 0.7873042225837708),\n", + " ('king', 0.7839043736457825)]" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove.similar_by_vector(queen, topn=5)" + ] + },
+ { + "cell_type": "markdown", + "id": "59ddde57-c1c5-4b15-82c8-da4a78c252d8", + "metadata": { + "tags": [] + }, + "source": [ + "## Embeddings: Part 2\n", + "\n", + "\n", + "\n", + "### Transformer vs. sentence transformer\n", + "\n", + "* *regular* transformers work at the *word/token* level, not the *sentence* level.\n", + "* a regular transformer CAN produce sentence embeddings by performing a *pooling* operation, such as an element-wise arithmetic mean, over its token-level embeddings.\n", + "* A good pooling choice for BERT is CLS pooling - BERT has a special `[CLS]` token that is supposed to capture the whole sequence's information. It gets tuned on next-sentence prediction (NSP) during pre-training.\n", + "\n", + "\n", + "\n", + "\n", + "### The Process of Generating Embeddings\n", + "\n", + "Given a sentence, how do we get its embeddings?\n", + "\n", + "1 **Initial tokenization**: converting raw text into a sequence of token IDs that the model can understand.
\n", + "\n", + "2 **Embedding layer** (BERT/GPT starts with such layer): tokenized inputs passed through this layer, convert token ID into initial vector representations (embeddings). In transformer models, positional enbeddings are also added at this stage.\n", + "\n", + "3 **Passing through model layers** The vector representation from embedding layer pass through the rest of the model layers such as self-attention in transformer and Feed-forward networks. Each layer process the input, and **refine** the embeddings, adding contextual information.\n", + "\n", + "4 **Contextualized embeddings** By the time the input reaches the final layer of the model, embeddings are deeply contextualized. The final-layer embeddings can be considered as sentence embeddings.\n", + "\n", + "\n", + "5 **Pooling** (optional)\n", + "\n", + "* aggregate word/token embeddings into a single sentence embeddings\n", + "* transform a **variable-length inputs** into **fixed-length output**\n", + " * mean pooling (avg across dimensions across all tokens)\n", + " * max pooling (max across dimensions across all tokens)\n", + " * CLS token pooling (in models such as BERT, first token is a special classification token, CLS)\n", + " * others ... such as adaptive pooling\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7bb5fcb5-a467-4f11-84a7-419a7ef248d7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of embeddings: torch.Size([1, 17, 768])\n" + ] + } + ], + "source": [ + "from transformers import BertModel, BertTokenizer\n", + "import torch\n", + "\n", + "# Load pre-trained model and tokenizer\n", + "model_name = \"bert-base-uncased\"\n", + "model = BertModel.from_pretrained(model_name)\n", + "tokenizer = BertTokenizer.from_pretrained(model_name)\n", + "\n", + "# Sample text\n", + "text = \"we must forever conduct our struggle on the high plane of dignity and discipline.\"\n", + "\n", + "# Encode text\n", + "inputs = tokenizer(text, return_tensors=\"pt\")\n", + "\n", + "# Extract embeddings\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + "\n", + "# The output is a tuple, where the first item contains the hidden states\n", + "# The hidden states are the embeddings; for BERT, you typically use the last hidden state\n", + "embeddings = outputs.last_hidden_state\n", + "\n", + "print(\"Shape of embeddings:\", embeddings.shape)\n", + "# The shape of the embeddings is (batch_size, sequence_length, hidden_size)" + ] + }, + { + "cell_type": "markdown", + "id": "41e53232-0b4d-45ca-a70a-919452a727dc", + "metadata": {}, + "source": [ + "### My (mis)percetion\n", + "\n", + "1. Not all models (embeddings) are created equal\n", + " * Deep neural networks such as transformer-based LLM models is \"supposely\" good at capture the contextual relationship but ... \n", + " * It should not be considered a \"default\"\n", + " * However, just because LLaMA is powerful model, it doesn't mean it has a good embedding model\n", + "\n", + "2. For different downstream tasks, you may need different embeddings models\n", + "\n", + "3. We extract and last hidden state, but the process of generating embeddings is a inference process that pass through the whole network.\n", + "\n", + "4. 
+ { + "cell_type": "markdown", + "id": "41e53232-0b4d-45ca-a70a-919452a727dc", + "metadata": {}, + "source": [ + "### My (mis)perceptions\n", + "\n", + "1. Not all models (embeddings) are created equal\n", + " * deep neural networks such as transformer-based LLMs are \"supposedly\" good at capturing contextual relationships, but ... \n", + " * that should not be taken as a \"default\"\n", + " * just because LLaMA is a powerful model doesn't mean it makes a good embedding model\n", + "\n", + "2. For different downstream tasks, you may need different embedding models\n", + "\n", + "3. We extract the last hidden state, but generating embeddings is an inference pass through the whole network.\n", + "\n", + "4. Embeddings also need FINE-TUNING" + ] + },
+ { + "cell_type": "markdown", + "id": "e95c9b89-ab28-4451-8890-5cbf46623701", + "metadata": { + "tags": [] + }, + "source": [ + "### Using FORGE-S as embeddings\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 3, + "id": "f3764942-fd4f-4e1f-90d2-f72c1fe3bf8c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2064" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sentence_transformers import SentenceTransformer, models,util\n", + "model_path = \"/proj/f7b/forge-s-instruct-base1\"\n", + "word_embedding_model = models.Transformer(model_path, max_seq_length=512)\n", + "word_embedding_model.tokenizer.pad_token=word_embedding_model.tokenizer.eos_token\n", + "pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode=\"mean\" )\n", + "model = SentenceTransformer(modules=[word_embedding_model, pooling_model]).cuda()\n", + "\n", + "word_embedding_model.get_word_embedding_dimension()\n" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "6c90ee83-ab8c-4eee-ba79-58b90da4fd21", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "e1=model.encode(\"I am a happy person\")\n", + "e2=model.encode(\"the sky is falling\")\n", + "e3=model.encode(\"I am a sad person\")\n", + "e4=model.encode(\"I am a happy person\")\n", + "e5=model.encode(\"I am happy person\")\n", + "print(util.cos_sim(e1, e2))\n", + "print(util.cos_sim(e1, e3))\n", + "print(util.cos_sim(e1, e4))\n", + "print(util.cos_sim(e1, e5))" + ] + },
+ { + "cell_type": "markdown", + "id": "2632b7a1-eeb9-4e9c-9669-ab993951ac04", + "metadata": {}, + "source": [ + "### Using UAE-Large-V1" + ] + },
+ { + "cell_type": "code", + "execution_count": 55, + "id": "15be845a-1477-4b7d-bdfe-2d173e90da26", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[0.3978]])\n", + "tensor([[0.6961]])\n", + "tensor([[1.0000]])\n", + "tensor([[0.9913]])\n" + ] + } + ], + "source": [ + "from angle_emb import AnglE\n", + "\n", + "angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()\n", + "\n", + "e1=angle.encode(\"I am a happy person\")\n", + "e2=angle.encode(\"the sky is falling\")\n", + "e3=angle.encode(\"I am a sad person\")\n", + "e4=angle.encode(\"I am a happy person\")\n", + "e5=angle.encode(\"I am happy person\")\n", + "print(util.cos_sim(e1, e2))\n", + "print(util.cos_sim(e1, e3))\n", + "print(util.cos_sim(e1, e4))\n", + "print(util.cos_sim(e1, e5))" + ] + },
+ { + "cell_type": "markdown", + "id": "d785848b-f58e-4a3c-a4d6-744b53a8b259", + "metadata": {}, + "source": [ + "### Cross-encoder BERT\n", + "\n", + "* this is a particular setup/application of BERT\n", + "* the goal is to **compare** inputs such as sentence pairs\n", + "* this is typically done by concatenating the two pieces of text with the special token **[SEP]** between them. For example, the input to the model: **[CLS] Sentence A [SEP] Sentence B [SEP]** (a usage sketch follows below)\n" + ] + },
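+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A minimal cross-encoder usage sketch (added for illustration; `cross-encoder/stsb-roberta-base` is one example checkpoint, not the only choice): the model scores each sentence PAIR jointly instead of embedding the sentences independently.\n" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import CrossEncoder\n", + "\n", + "# each input is a (sentence A, sentence B) pair, fed to BERT as one sequence\n", + "cross_encoder = CrossEncoder(\"cross-encoder/stsb-roberta-base\")\n", + "scores = cross_encoder.predict([\n", + "    (\"I am a happy person\", \"I am a sad person\"),\n", + "    (\"I am a happy person\", \"the sky is falling\"),\n", + "])\n", + "print(scores)  # one similarity score per pair" + ] + },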
+ { + "cell_type": "markdown", + "id": "850e672e-bceb-47a7-b565-baef3cf8469a", + "metadata": {}, + "source": [ + "## Evaluating Embedding Models\n", + "\n", + "Massive Text Embedding Benchmark (MTEB) Leaderboard:\n", + "https://huggingface.co/spaces/mteb/leaderboard\n" + ] + },
+ { + "cell_type": "code", + "execution_count": 57, + "id": "f150d782-b094-423a-8ed8-a7ec995cb6ab", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda\n", + "INFO:mteb.evaluation.MTEB:\n", + "\n", + "## Evaluating 2 tasks:\n" + ] + }, + { + "data": { + "text/html": [ + "
───────────────────────────────────────────────── Selected tasks  ─────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;5;235m───────────────────────────────────────────────── \u001b[0m\u001b[1mSelected tasks \u001b[0m\u001b[38;5;235m ─────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
STS\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mSTS\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
    - STS22, p2p, crosslingual 1 / 18 pairs\n",
+       "
\n" + ], + "text/plain": [ + " - STS22, \u001b[3;38;5;241mp2p\u001b[0m, \u001b[3;36mcrosslingual \u001b[0m\u001b[1;3;36m1\u001b[0m\u001b[3;36m \u001b[0m\u001b[3;36m/\u001b[0m\u001b[3;36m \u001b[0m\u001b[1;3;36m18\u001b[0m\u001b[3;36m pairs\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
    - STSBenchmark, s2s\n",
+       "
\n" + ], + "text/plain": [ + " - STSBenchmark, \u001b[3;38;5;241ms2s\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n",
+       "\n",
+       "
\n" + ], + "text/plain": [ + "\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:mteb.evaluation.MTEB:\n", + "\n", + "********************** Evaluating STS22 **********************\n", + "INFO:mteb.evaluation.MTEB:Loading dataset for STS22\n", + "INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...\n", + "INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 199 sentences1...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2f3a2b2abbd4c0493359ebda7378fc7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/4 [00:00 Date: Thu, 8 Feb 2024 05:49:45 -0500 Subject: [PATCH 2/2] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9e334a6..f8c90cf 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,11 @@ Now, when you start a new launcher, you should see the new kernel "mistral" list * [Word embedding](shorturl.at/BIL25) * [Running Jupyter on Summit](JupyterOnSummit.md) + +## Transformer + +* [Transformer from scratch](https://e2eml.school/transformers.html) + ## Tools * [numpy](tools/numpy.ipynb), [matplotlib](tools/matplotlib.ipynb)