diff --git a/0.1_Hugging_Face_basics.py b/0.1_Hugging_Face_basics.py
index fbb7165..a70f5b1 100644
--- a/0.1_Hugging_Face_basics.py
+++ b/0.1_Hugging_Face_basics.py
@@ -7,12 +7,12 @@
 # MAGIC
 # MAGIC In these exercises we will focus on the _transformers_ library but _datasets_, _evaluate_ and _accelerate_ are commonly used in training models.
 # MAGIC
-# MAGIC All code here is tested on MLR 13.2 on a g5 AWS instance (A10G GPU).
+# MAGIC All code here is tested on MLR 13.3 LTS on a g5 AWS instance (A10G GPU) and also on m5.4xlarge for the CPU version.
 # MAGIC We suggest a ```g5.4xlarge``` single node cluster to start
 # MAGIC The Azure equivalent is ```NC6s_v3``` series. However, for this lab we will be using ```g5.4xlarge``` instances.
 # MAGIC ----
 # MAGIC **Notes**
-# MAGIC - Falcon requires Torch 2.0 coming soon....
+# MAGIC - Falcon requires Torch 2.0, which is available in MLR 14.x
 # MAGIC - The LLM Space is fast moving. Many models are provided by independent companies as well so model revision and pinning library versions is important.
 # MAGIC - If using an MLR prior to 13.2, you will need to run ```%pip install einops```
 # MAGIC - It may also be necessary to manually install extra Nvidia libraries via [init_scripts](https://docs.databricks.com/clusters/init-scripts.html)
diff --git a/0.3_Vector_DBs.py b/0.3_Vector_DBs.py
index df34179..13c0270 100644
--- a/0.3_Vector_DBs.py
+++ b/0.3_Vector_DBs.py
@@ -1,6 +1,10 @@
 # Databricks notebook source
 # MAGIC %md
 # MAGIC # Exploring Vector DBs
+# MAGIC In this notebook we will explore the process of converting text to numbers and what that means for our sentences.
+# MAGIC We will use the FAISS library, which provides a large variety of algorithms that you can try out.
+# MAGIC The difference between FAISS and a full Vector Database solution lies in things like governance,
+# MAGIC convenience features like updates and production-grade features like failover and backups.
 # COMMAND ----------
@@ -24,7 +28,8 @@
 # COMMAND ----------
 # MAGIC %md
-# MAGIC # Get some sample data
+# MAGIC # Load some sample data
+# MAGIC We will use Wikipedia for our initial sample data
 # COMMAND ----------
 # Load Sample Data
@@ -42,6 +47,14 @@
 # MAGIC %md
 # MAGIC # Load Embedding Model
+# MAGIC In this example, we will use the tokeniser from MPT-7B to start
+# MAGIC
+# MAGIC *NOTE* When we build out our full architecture there will be two functions that turn text into tokens.
+# MAGIC - Model Tokenizer - the component we are experimenting with here
+# MAGIC - Embedding Tokenizer - this will be explored later and is used to populate the VectorDB
+# MAGIC
+# MAGIC Whilst the _Model Tokenizer_ is fixed (you have to use the one intended for your model), the _Embedding Tokenizer_ is something
+# MAGIC that we can select to suit our use case
 # COMMAND ----------
 from transformers import AutoTokenizer
@@ -55,6 +68,7 @@
 # MAGIC %md
 # MAGIC # Explore tokenization
+# MAGIC Let's explore the way that words are encoded for our LLM
 # COMMAND ----------
@@ -80,9 +94,11 @@
 # COMMAND ----------
 # MAGIC %md
-# MAGIC # Sentence Encoders for FAISS
-# MAGIC Tokenizers from an LLM and for VectorStores are a bit different
-# MAGIC SentenceTransformers from Huggingface is focused on the latter.
+# MAGIC # Sentence Transformers for Embedding tokenization
+# MAGIC The Sentence Transformers library provides a series of embedding algorithms that can be used to populate our VectorDB.
+# MAGIC Unlike the _Model Tokenizer_, which produces a variable-length output depending on the input,
+# MAGIC an embedding algorithm produces a fixed-length vector so that we can run approximate nearest neighbour algorithms.
+
 # COMMAND ----------
 from sentence_transformers import SentenceTransformer
@@ -90,6 +106,7 @@ model = SentenceTransformer('bert-base-nli-mean-tokens')
 # COMMAND ----------
+# Split the document into paragraphs
 paragraph_form = page.content.split('\n\n')
 len(paragraph_form)
@@ -97,8 +114,15 @@
 # COMMAND ----------
 # MAGIC %md
-# MAGIC Tokenizations work best when it receives chunks of the same size
-# MAGIC
+# MAGIC Encode the paragraphs into dense vectors.
+# MAGIC Different models will produce vectors of different lengths.
+# MAGIC In theory, a model that produces a longer vector can represent the input data better,
+# MAGIC but in practice it depends on the type of data it was trained on.
+# MAGIC
+# MAGIC e.g. a Sentence Transformer that produces 512-length vectors but is trained on medical data
+# MAGIC will provide a better representation of medical documents than a Sentence Transformer that produces 1024-length vectors
+# MAGIC but is trained only on social media.
+
 # COMMAND ----------
 sentence_encode = model.encode(paragraph_form)
@@ -107,6 +131,8 @@
 # MAGIC %md
 # MAGIC # Lets build out a FAISS index
+# MAGIC FAISS lets us experiment with a wide variety of different search algorithms.
+# MAGIC Most VectorDBs will offer just one option.
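+# MAGIC
+# MAGIC The cell below is a small illustrative sketch (it is not part of the original lab) of what that variety looks like:
+# MAGIC it builds an exact index and an approximate HNSW index over the same `sentence_encode` vectors from above and queries both.
+
+# COMMAND ----------
+
+# Illustrative sketch: compare two FAISS index types over the embeddings computed above.
+# Assumes `sentence_encode` (the encoded paragraphs) and `model` (the SentenceTransformer)
+# from the earlier cells; the index types chosen here are only examples.
+import faiss
+import numpy as np
+
+vectors = np.asarray(sentence_encode, dtype="float32")
+dimension = vectors.shape[1]  # fixed embedding length, e.g. 768 for bert-base-nli-mean-tokens
+
+# Exact (brute force) L2 search
+flat_index = faiss.IndexFlatL2(dimension)
+flat_index.add(vectors)
+
+# Approximate nearest neighbour search over an HNSW graph
+hnsw_index = faiss.IndexHNSWFlat(dimension, 32)
+hnsw_index.add(vectors)
+
+query = model.encode(["What is this article about?"]).astype("float32")
+distances, neighbours = flat_index.search(query, 3)
+distances_ann, neighbours_ann = hnsw_index.search(query, 3)
+print(neighbours, neighbours_ann)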
 # COMMAND ----------
diff --git a/1.1_Evaluations.py b/1.1_Evaluations.py
index d0d5d10..7ee99fa 100644
--- a/1.1_Evaluations.py
+++ b/1.1_Evaluations.py
@@ -3,7 +3,7 @@
 # MAGIC # Evaluations
 # MAGIC Running Evaluations on RAGs is still more art than science \
 # MAGIC We will use llama_index to assist in generating evaluation questions \
-# MAGIC And the ragas library for generating metrics to assess your RAG \
+# MAGIC And use the inbuilt assessment prompt in llama_index \
 # MAGIC
 # MAGIC We use an older llama_index to align with MLR 13.3 LTS Langchain version \
 # MAGIC as llama_index relies a lot on Langchain
diff --git a/1.3_Evaluating_Embeddings.py b/1.3_Evaluating_Embeddings.py
index 979f19e..8bcd24a 100644
--- a/1.3_Evaluating_Embeddings.py
+++ b/1.3_Evaluating_Embeddings.py
@@ -2,8 +2,13 @@
 # MAGIC %md
 # MAGIC # Understanding Embeddings
 # MAGIC Embeddings are just vectors and we can visualise and analyse them as such \
-# MAGIC In this case we will use the Arize Phoenix tool \
-# MAGIC for more info see: https://docs.arize.com/phoenix/
+# MAGIC A common way to look at and explore embeddings is to use TSNE visualisations. \
+# MAGIC This can be applied to our VectorDB data too.
+# MAGIC
+# MAGIC See: https://www.kaggle.com/code/colinmorris/visualizing-embeddings-with-t-sne
+# MAGIC
+# MAGIC Another open source tool that you might want to investigate for this is Arize Phoenix \
+# MAGIC See: https://docs.arize.com/phoenix/
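+# MAGIC
+# MAGIC The cell below is a small illustrative sketch (it is not part of the original notebook) of a t-SNE projection:
+# MAGIC it embeds a handful of demo sentences and plots them in 2-D; swap in the vectors from your own VectorDB.
+
+# COMMAND ----------
+
+# Illustrative sketch: visualise sentence embeddings with t-SNE.
+# Assumes the sentence-transformers library is installed (e.g. %pip install sentence-transformers);
+# the demo sentences below are placeholders for your own documents.
+from sentence_transformers import SentenceTransformer
+from sklearn.manifold import TSNE
+import matplotlib.pyplot as plt
+
+demo_sentences = [
+    "The cat sat on the mat",
+    "A dog chased the ball in the park",
+    "Stock markets fell sharply today",
+    "The central bank raised interest rates",
+    "I love pizza with extra cheese",
+    "Pasta is my favourite dinner",
+]
+embeddings = SentenceTransformer("bert-base-nli-mean-tokens").encode(demo_sentences)
+
+# perplexity must be smaller than the number of points being projected
+projected = TSNE(n_components=2, perplexity=2, random_state=0).fit_transform(embeddings)
+
+plt.scatter(projected[:, 0], projected[:, 1])
+for i, sentence in enumerate(demo_sentences):
+    plt.annotate(sentence[:18], (projected[i, 0], projected[i, 1]))
+plt.title("t-SNE projection of sentence embeddings")
+plt.show()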
 # COMMAND ----------
 # MAGIC # "arize-phoenix[experimental]" pandas==1.5.3
diff --git a/1.4_Advanced_RAG.py b/1.4_Advanced_RAG.py
index 95bc4ae..4e899fb 100644
--- a/1.4_Advanced_RAG.py
+++ b/1.4_Advanced_RAG.py
@@ -6,7 +6,7 @@
 # COMMAND ----------
 # DBTITLE 1,Extra Libs to install
-%pip install pypdf ctransformers unstructured["local-inference"] sqlalchemy 'git+https://github.com/facebookresearch/detectron2.git' poppler-utils scrapy llama_index==0.8.9 langchain==0.0.284 opencv-python chromadb==0.4.9
+%pip install pypdf ctransformers==0.2.26 unstructured["local-inference"] sqlalchemy 'git+https://github.com/facebookresearch/detectron2.git' poppler-utils scrapy llama_index==0.8.9 langchain==0.0.284 opencv-python chromadb==0.4.9
 # COMMAND ----------