Fixed 1.4 issue (it was ctransformers versioning); added some more notes in the notebooks
Data-drone committed Sep 25, 2023
1 parent b48c6f1 commit 8cef7c4
Showing 5 changed files with 43 additions and 12 deletions.
4 changes: 2 additions & 2 deletions 0.1_Hugging_Face_basics.py
@@ -7,12 +7,12 @@
# MAGIC
# MAGIC In these exercises we will focus on the _transformers_ library but _datasets_, _evaluate_ and _accelerate_ are commonly used in training models.
# MAGIC
- # MAGIC All code here is tested on MLR 13.2 on a g5 AWS instance (A10G GPU).
+ # MAGIC All code here is tested on MLR 13.3 LTS on a g5 AWS instance (A10G GPU) and also on an m5.4xlarge for the CPU version.
# MAGIC We suggest a ```g5.4xlarge``` single-node cluster to start.
# MAGIC The Azure equivalent is the ```NC6s_v3``` series. However, for this lab we will be using ```g5.4xlarge``` instances.
# MAGIC ----
# MAGIC **Notes**
- # MAGIC - Falcon requires Torch 2.0 coming soon....
+ # MAGIC - Falcon requires Torch 2.0, which is available in MLR 14.x
# MAGIC - The LLM space is fast moving, and many models are provided by independent companies as well, so pinning model revisions and library versions is important (see the sketch after these notes)
# MAGIC - If using an MLR prior to 13.2, you will need to run ```%pip install einops```
# MAGIC - It may also be necessary to manually install extra Nvidia libraries via [init_scripts](https://docs.databricks.com/clusters/init-scripts.html)
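A hedged sketch of the pinning note above; the model id and revision tag are illustrative placeholders, not values from this repo:

```python
# Hypothetical sketch: pinning a model revision and custom code for reproducibility.
# The model id and revision below are illustrative assumptions, not from the notebooks.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "tiiuae/falcon-7b"  # placeholder model id

tokenizer = AutoTokenizer.from_pretrained(model_name, revision="main")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    revision="main",         # pin to a specific tag or commit hash rather than floating latest
    trust_remote_code=True,  # Falcon-style repos ship custom modelling code
)
```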
38 changes: 32 additions & 6 deletions 0.3_Vector_DBs.py
@@ -1,6 +1,10 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Exploring Vector DBs
# MAGIC In this notebook we will explore the process of converting text to numbers and what that means for our sentences
# MAGIC We will use the faiss library which provides a large variety of different algorithms that you can try out.
# MAGIC The difference between FAISS and a full Vector Database solution is around things like governance,
# MAGIC convenience features like updates and production grade featuers like failover and backups.

# COMMAND ----------

@@ -24,7 +28,8 @@
# COMMAND ----------

# MAGIC %md
- # MAGIC # Get some sample data
+ # MAGIC # Load some sample data
+ # MAGIC We will use wikipedia for our initial sample data
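The data-loading cell itself is collapsed in this diff; a minimal sketch of the idea, assuming the `wikipedia` package (the page title is an illustrative assumption):

```python
# Minimal sketch, assuming the `wikipedia` package; the notebook's actual cell is collapsed.
import wikipedia

page = wikipedia.page("Large language model")  # illustrative page title
print(page.content[:500])  # `page.content` is what later cells split into paragraphs
```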
# COMMAND ----------

# Load Sample Data
@@ -42,6 +47,14 @@

# MAGIC %md
# MAGIC # Load Embedding Model
+ # MAGIC In this example, we will use the tokeniser from MPT-7B to start
+ # MAGIC
+ # MAGIC *NOTE* When we build out our full architecture there will be two functions that turn text into tokens:
+ # MAGIC - Model Tokenizer - this is the component we are experimenting with here
+ # MAGIC - Embedding Tokenizer - this will be explored later and is used to populate the VectorDB
+ # MAGIC
+ # MAGIC Whilst the _Model Tokenizer_ is fixed (you have to use the one intended for your model), the _Embedding Tokenizer_ is something
+ # MAGIC that we can select to suit our use case
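A short sketch contrasting the two stages; both model ids are assumptions for illustration, not pinned by the notebook:

```python
# Sketch contrasting the two tokenisation stages; both model ids are assumptions.
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

text = "Vector databases store embeddings."

# Model Tokenizer: fixed by the LLM you serve; output length varies with the input
model_tok = AutoTokenizer.from_pretrained("mosaicml/mpt-7b")
print(model_tok(text)["input_ids"])  # variable-length list of token ids

# Embedding model: our choice; output is a fixed-length vector
embedder = SentenceTransformer("bert-base-nli-mean-tokens")
print(embedder.encode(text).shape)  # fixed length, e.g. (768,)
```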

# COMMAND ----------
from transformers import AutoTokenizer
@@ -55,6 +68,7 @@

# MAGIC %md
# MAGIC # Explore tokenization
+ # MAGIC Let's explore the way that words are encoded for our LLM
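The exploration cells are collapsed in this diff; a small sketch of the kind of probing this section does, assuming the tokenizer loaded earlier is bound to `tokenizer`:

```python
# Sketch: inspecting how a sentence breaks into subword tokens.
tokens = tokenizer.tokenize("Tokenisation splits unbelievably long words")
print(tokens)                                   # subword pieces
print(tokenizer.convert_tokens_to_ids(tokens))  # their integer ids
```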

# COMMAND ----------

@@ -80,25 +94,35 @@
# COMMAND ----------

# MAGIC %md
- # MAGIC # Sentence Encoders for FAISS
- # MAGIC Tokenizers from an LLM and for VectorStores are a bit different
- # MAGIC SentenceTransformers from Huggingface is focused on the latter.
+ # MAGIC # Sentence Transformers for Embedding tokenization
+ # MAGIC The Sentence Transformers library provides a series of embedding algorithms that can be used to populate our VectorDB.
+ # MAGIC Unlike the _Model Tokenizer_, which produces a variable-length output depending on the input,
+ # MAGIC an embedding algorithm produces a fixed-length vector so that we can run approximate nearest neighbour algorithms.

# COMMAND ----------

from sentence_transformers import SentenceTransformer
# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# COMMAND ----------

# Split the document into paragraphs
paragraph_form = page.content.split('\n\n')

len(paragraph_form)

# COMMAND ----------

# MAGIC %md
- # MAGIC Tokenizations work best when it receives chunks of the same size
# MAGIC
- # MAGIC Encode the paragraphs into dence vectors
+ # MAGIC Different models will produce vectors of different lengths.
+ # MAGIC In theory, a model that produces a longer vector can represent the input data better,
+ # MAGIC but in practice it depends on the type of data the model was trained on.
+ # MAGIC
+ # MAGIC e.g. a Sentence Transformer that produces 512-length vectors but is trained on medical data
+ # MAGIC will provide a better representation of medical documents than a Sentence Transformer that produces 1024-length vectors
+ # MAGIC but is trained only on social media.
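One hedged way to see this dimensionality difference; both model names are assumptions, not models used by the notebook:

```python
# Sketch: comparing output vector lengths of two embedding models; names are assumptions.
from sentence_transformers import SentenceTransformer

for name in ["all-MiniLM-L6-v2", "all-mpnet-base-v2"]:
    m = SentenceTransformer(name)
    print(name, m.get_sentence_embedding_dimension())  # e.g. 384 vs 768
```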

# COMMAND ----------

sentence_encode = model.encode(paragraph_form)
@@ -107,6 +131,8 @@

# MAGIC %md
# MAGIC # Lets build out a FAISS index
+ # MAGIC FAISS lets us experiment with a wide variety of different search algorithms.
+ # MAGIC Most VectorDBs will offer just one option.
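The notebook's own index-building cell is collapsed below; a minimal sketch of one FAISS option (exact L2 search), reusing `model` and `sentence_encode` from the cells above:

```python
# Minimal sketch of a flat L2 index; the notebook's actual cell is collapsed in this diff.
import faiss
import numpy as np

vectors = np.asarray(sentence_encode, dtype="float32")
index = faiss.IndexFlatL2(vectors.shape[1])  # dimensionality of our embeddings
index.add(vectors)

query = np.asarray(model.encode(["What is this article about?"]), dtype="float32")
distances, ids = index.search(query, 3)      # three nearest paragraphs
print(ids, distances)
```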

# COMMAND ----------

2 changes: 1 addition & 1 deletion 1.1_Evaluations.py
@@ -3,7 +3,7 @@
# MAGIC # Evaluations
# MAGIC Running Evaluations on RAGs is still more art than science \
# MAGIC We will use llama_index to assist in generating evaluation questions \
- # MAGIC And the ragas library for generating metrics to assess your RAG \
+ # MAGIC And use the inbuilt assessment prompt in llama_index \
# MAGIC
# MAGIC We use an older llama_index to align with the MLR 13.3 LTS Langchain version \
# MAGIC as llama_index relies heavily on Langchain
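A rough sketch of the question-generation step against the llama_index 0.8.x era API; the directory path is a placeholder, and the class and method names should be checked against the pinned version:

```python
# Rough sketch, llama_index 0.8.x era API; treat class/method names as assumptions.
from llama_index import SimpleDirectoryReader
from llama_index.evaluation import DatasetGenerator

documents = SimpleDirectoryReader("./docs").load_data()  # placeholder path
generator = DatasetGenerator.from_documents(documents)
questions = generator.generate_questions_from_nodes()
print(questions[:5])
```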
9 changes: 7 additions & 2 deletions 1.3_Evaluating_Embeddings.py
@@ -2,8 +2,13 @@
# MAGIC %md
# MAGIC # Understanding Embeddings
# MAGIC Embeddings are just vectors and we can visualise and analyse them as such \
- # MAGIC In this case we will use the Arize Phoenix tool \
- # MAGIC for more info see: https://docs.arize.com/phoenix/
+ # MAGIC A common way to look at and explore embeddings is to use t-SNE visualisations. \
+ # MAGIC This can be applied to our VectorDB data too.
+ # MAGIC
+ # MAGIC See: https://www.kaggle.com/code/colinmorris/visualizing-embeddings-with-t-sne
+ # MAGIC
+ # MAGIC An open-source tool that you might want to investigate for this as well is Arize Phoenix \
+ # MAGIC See: https://docs.arize.com/phoenix/
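A minimal t-SNE sketch, assuming the stored embeddings are available as an (n, d) numpy array called `vectors`:

```python
# Sketch: 2-D t-SNE projection of VectorDB embeddings; `vectors` is an assumed (n, d) array.
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

projected = TSNE(n_components=2, random_state=0).fit_transform(vectors)
plt.scatter(projected[:, 0], projected[:, 1], s=5)
plt.title("t-SNE projection of document embeddings")
plt.show()
```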

# COMMAND ----------
# MAGIC # "arize-phoenix[experimental]" pandas==1.5.3
2 changes: 1 addition & 1 deletion 1.4_Advanced_RAG.py
@@ -6,7 +6,7 @@
# COMMAND ----------

# DBTITLE 1,Extra Libs to install
- %pip install pypdf ctransformers unstructured["local-inference"] sqlalchemy 'git+https://github.com/facebookresearch/detectron2.git' poppler-utils scrapy llama_index==0.8.9 langchain==0.0.284 opencv-python chromadb==0.4.9
+ %pip install pypdf ctransformers==0.2.26 unstructured["local-inference"] sqlalchemy 'git+https://github.com/facebookresearch/detectron2.git' poppler-utils scrapy llama_index==0.8.9 langchain==0.0.284 opencv-python chromadb==0.4.9

# COMMAND ----------

