From 247225abe1a961139f2116c73c138ff119d1658c Mon Sep 17 00:00:00 2001 From: trishamaturi <63947635+trishamaturi@users.noreply.github.com> Date: Mon, 10 Jul 2023 10:52:12 -0700 Subject: [PATCH 1/4] adding hybrid search notebook --- notebooks/hybrid-search/notebook.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 notebooks/hybrid-search/notebook.ipynb diff --git a/notebooks/hybrid-search/notebook.ipynb b/notebooks/hybrid-search/notebook.ipynb new file mode 100644 index 0000000..d7ad799 --- /dev/null +++ b/notebooks/hybrid-search/notebook.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","id":"d9f9e629-6eb9-4ca5-bcf2-1b8672b86725","metadata":{"execution":{"iopub.execute_input":"2023-06-06T03:34:15.712942Z","iopub.status.busy":"2023-06-06T03:34:15.712613Z","iopub.status.idle":"2023-06-06T03:34:15.715753Z","shell.execute_reply":"2023-06-06T03:34:15.715128Z","shell.execute_reply.started":"2023-06-06T03:34:15.712919Z"},"tags":[]},"source":"# Hybrid Search\nHybrid search integrates both keyword-based search and semantic search in order to combine the strengths of both and provide users with a more comprehensive and efficient search experience. 
This notebook is an example on how to perform hybrid search with SingleStore's database and notebooks."},{"cell_type":"markdown","id":"532e8d3f-007d-48a4-8d36-44b561dd1109","metadata":{},"source":"## Setup\nLet's first download the libraries necessary."},{"cell_type":"code","execution_count":null,"id":"07990b64-9447-46a8-abbc-51be1972dfda","metadata":{"execution":{"iopub.status.busy":"2023-06-06T21:08:59.945009Z","iopub.status.idle":"2023-06-06T21:08:59.945409Z","shell.execute_reply":"2023-06-06T21:08:59.945223Z","shell.execute_reply.started":"2023-06-06T21:08:59.945200Z"},"tags":[],"trusted":true},"outputs":[],"source":"!pip install matplotlib --quiet\n!pip install plotly.express --quiet\n!pip install scikit-learn --quiet\n!pip install tabulate --quiet\n!pip install tiktoken --quiet\n!pip install wget --quiet\n!pip install openai --quiet"},{"cell_type":"code","execution_count":3,"id":"a592dd5e-4114-4abf-923d-74038f5244eb","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:02:56.564548Z","iopub.status.busy":"2023-06-06T21:02:56.563787Z","iopub.status.idle":"2023-06-06T21:02:56.569852Z","shell.execute_reply":"2023-06-06T21:02:56.569020Z","shell.execute_reply.started":"2023-06-06T21:02:56.564514Z"},"tags":[],"trusted":true},"outputs":[],"source":"import pandas as pd\nimport os\nimport wget\nimport ast\nimport json"},{"cell_type":"code","execution_count":4,"id":"c2bffc74-4b6a-4c0f-acef-f72bb255ec79","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:02:56.571594Z","iopub.status.busy":"2023-06-06T21:02:56.571082Z","iopub.status.idle":"2023-06-06T21:03:03.267025Z","shell.execute_reply":"2023-06-06T21:03:03.266304Z","shell.execute_reply.started":"2023-06-06T21:02:56.571565Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Import the library for vectorizing the data (Up to 2 minutes)\n!pip install sentence-transformers --quiet\nfrom sentence_transformers import SentenceTransformer\nmodel = 
SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')"},{"cell_type":"markdown","id":"0aa95a80-5683-4dc3-9e52-c3e890ab87af","metadata":{},"source":"## Import data from CSV File\nThis csv file holds the title, summary, and category of approximately 2000 news articles."},{"cell_type":"code","execution_count":5,"id":"b1b2971e-d0f6-4cfa-a9a7-954602bda460","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.268584Z","iopub.status.busy":"2023-06-06T21:03:03.268085Z","iopub.status.idle":"2023-06-06T21:03:03.277120Z","shell.execute_reply":"2023-06-06T21:03:03.276481Z","shell.execute_reply.started":"2023-06-06T21:03:03.268551Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"File already exists in the local file system.\n"}],"source":"# download reviews csv file\ncvs_file_path = \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/data/AG_news_samples.csv\"\nfile_path = \"AG_news_samples.csv\"\n\nif not os.path.exists(file_path):\n wget.download(cvs_file_path, file_path)\n print(\"File downloaded successfully.\")\nelse:\n print(\"File already exists in the local file system.\")"},{"cell_type":"code","execution_count":6,"id":"6c821edd-ce7b-46d9-aa79-0ab1766266a0","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.278520Z","iopub.status.busy":"2023-06-06T21:03:03.278228Z","iopub.status.idle":"2023-06-06T21:03:03.293099Z","shell.execute_reply":"2023-06-06T21:03:03.292469Z","shell.execute_reply.started":"2023-06-06T21:03:03.278492Z"},"tags":[],"trusted":true},"outputs":[],"source":"df = 
pd.read_csv('AG_news_samples.csv')"},{"cell_type":"code","execution_count":7,"id":"cf7caa13-848a-46f1-9730-7f441339e65c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.296401Z","iopub.status.busy":"2023-06-06T21:03:03.296190Z","iopub.status.idle":"2023-06-06T21:03:03.308541Z","shell.execute_reply":"2023-06-06T21:03:03.307827Z","shell.execute_reply.started":"2023-06-06T21:03:03.296383Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003elabel_int\u003c/th\u003e\n \u003cth\u003elabel\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eWorld Briefings\u003c/td\u003e\n \u003ctd\u003eBRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M...\u003c/td\u003e\n \u003ctd\u003e1\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eNvidia Puts a Firewall on a Motherboard (PC Wo...\u003c/td\u003e\n \u003ctd\u003ePC World - Upcoming chip set will include buil...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eOlympic joy in Greek, Chinese press\u003c/td\u003e\n \u003ctd\u003eNewspapers in Greece reflect a mixture of exhi...\u003c/td\u003e\n \u003ctd\u003e2\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n 
\u003ctd\u003eU2 Can iPod with Pictures\u003c/td\u003e\n \u003ctd\u003eSAN JOSE, Calif. -- Apple Computer (Quote, Cha...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eThe Dream Factory\u003c/td\u003e\n \u003ctd\u003eAny product, any shape, any size -- manufactur...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e...\u003c/th\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1995\u003c/th\u003e\n \u003ctd\u003eYou Control: iTunes puts control in OS X menu ...\u003c/td\u003e\n \u003ctd\u003eMacCentral - You Software Inc. announced on Tu...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1996\u003c/th\u003e\n \u003ctd\u003eArgentina beat Italy for place in football final\u003c/td\u003e\n \u003ctd\u003eFavourites Argentina beat Italy 3-0 this morni...\u003c/td\u003e\n \u003ctd\u003e2\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1997\u003c/th\u003e\n \u003ctd\u003eNCAA case no worry for Spurrier\u003c/td\u003e\n \u003ctd\u003eShortly after Steve Spurrier arrived at Florid...\u003c/td\u003e\n \u003ctd\u003e2\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1998\u003c/th\u003e\n \u003ctd\u003eSecret Service Busts Cyber Gangs\u003c/td\u003e\n \u003ctd\u003eThe US Secret Service Thursday announced arres...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1999\u003c/th\u003e\n \u003ctd\u003eStocks Flat; Higher Oil Limits 
Gains\u003c/td\u003e\n \u003ctd\u003eUS stocks were little changed on Thursday as a...\u003c/td\u003e\n \u003ctd\u003e3\u003c/td\u003e\n \u003ctd\u003eBusiness\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003cp\u003e2000 rows × 4 columns\u003c/p\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 World Briefings \n1 Nvidia Puts a Firewall on a Motherboard (PC Wo... \n2 Olympic joy in Greek, Chinese press \n3 U2 Can iPod with Pictures \n4 The Dream Factory \n... ... \n1995 You Control: iTunes puts control in OS X menu ... \n1996 Argentina beat Italy for place in football final \n1997 NCAA case no worry for Spurrier \n1998 Secret Service Busts Cyber Gangs \n1999 Stocks Flat; Higher Oil Limits Gains \n\n description label_int label \n0 BRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M... 1 World \n1 PC World - Upcoming chip set will include buil... 4 Sci/Tech \n2 Newspapers in Greece reflect a mixture of exhi... 2 Sports \n3 SAN JOSE, Calif. -- Apple Computer (Quote, Cha... 4 Sci/Tech \n4 Any product, any shape, any size -- manufactur... 4 Sci/Tech \n... ... ... ... \n1995 MacCentral - You Software Inc. announced on Tu... 4 Sci/Tech \n1996 Favourites Argentina beat Italy 3-0 this morni... 2 Sports \n1997 Shortly after Steve Spurrier arrived at Florid... 2 Sports \n1998 The US Secret Service Thursday announced arres... 4 Sci/Tech \n1999 US stocks were little changed on Thursday as a... 
3 Business \n\n[2000 rows x 4 columns]"},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":"df"},{"cell_type":"code","execution_count":8,"id":"e30c69d3-a807-4437-84e9-6972e3bc3d85","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.310062Z","iopub.status.busy":"2023-06-06T21:03:03.309759Z","iopub.status.idle":"2023-06-06T21:03:03.315541Z","shell.execute_reply":"2023-06-06T21:03:03.314814Z","shell.execute_reply.started":"2023-06-06T21:03:03.310033Z"},"tags":[],"trusted":true},"outputs":[],"source":"data = df.values.tolist()"},{"cell_type":"markdown","id":"0b6c6560-bc60-43ba-93a4-1b4aee933d5b","metadata":{},"source":"## Set up SingleStore Database"},{"cell_type":"markdown","id":"d6a1952b-7313-4007-9ec5-4c506425190f","metadata":{},"source":"Connect to your SingleStoreDB Cloud workspaces using SQLAlchemy."},{"cell_type":"code","execution_count":26,"id":"1e8b918f-d849-4bad-b5e9-1cf8be138026","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:17.465072Z","iopub.status.busy":"2023-06-06T21:09:17.464722Z","iopub.status.idle":"2023-06-06T21:09:17.469360Z","shell.execute_reply":"2023-06-06T21:09:17.468395Z","shell.execute_reply.started":"2023-06-06T21:09:17.465048Z"},"tags":[],"trusted":true},"outputs":[],"source":"from sqlalchemy import *\n\ndb_connection = create_engine(connection_url)"},{"cell_type":"markdown","id":"e1dd6296-54b0-4f8d-886a-13cacfc28163","metadata":{},"source":"Set up your SingleStore Database which will hold your 
data."},{"cell_type":"code","execution_count":27,"id":"e1874b6f-706a-4638-ad2a-ca387953acaa","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:18.544191Z","iopub.status.busy":"2023-06-06T21:09:18.543841Z","iopub.status.idle":"2023-06-06T21:09:25.532992Z","shell.execute_reply":"2023-06-06T21:09:25.532154Z","shell.execute_reply.started":"2023-06-06T21:09:18.544164Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":"[]"},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\n-- Create the database\nDROP DATABASE IF EXISTS news;\nCREATE DATABASE IF NOT EXISTS news;"},{"cell_type":"code","execution_count":28,"id":"3f1e2c3d-6fbd-46bb-9bd3-235eb51941cf","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:25.535009Z","iopub.status.busy":"2023-06-06T21:09:25.534700Z","iopub.status.idle":"2023-06-06T21:09:25.891270Z","shell.execute_reply":"2023-06-06T21:09:25.890595Z","shell.execute_reply.started":"2023-06-06T21:09:25.534981Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":"[]"},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nUSE news;\n-- Create the table\nDROP TABLE IF EXISTS news_articles;\nCREATE TABLE IF NOT EXISTS news_articles (\n id INT PRIMARY KEY,\n title TEXT,\n description TEXT,\n genre TEXT,\n embedding BLOB,\n FULLTEXT (title, description)\n);"},{"cell_type":"markdown","id":"8bd97023-3d02-44d4-8bd3-59875cb22b6c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T06:31:11.967693Z","iopub.status.busy":"2023-06-06T06:31:11.967312Z","iopub.status.idle":"2023-06-06T06:31:11.971035Z","shell.execute_reply":"2023-06-06T06:31:11.970370Z","shell.execute_reply.started":"2023-06-06T06:31:11.967669Z"},"tags":[]},"source":"### Get embeddings for every row based on the description 
column."},{"cell_type":"code","execution_count":null,"id":"496f84d0-51b6-4b66-bf5b-b1b260e4c2de","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:11.982453Z","iopub.status.busy":"2023-06-06T21:03:11.981961Z"},"tags":[],"trusted":true},"outputs":[],"source":"descriptions = [row[1] if row[1] is not None else row[1] for row in data]\nall_embeddings = model.encode(descriptions)\nall_embeddings.shape"},{"cell_type":"markdown","id":"46b1628c-0ffc-4a84-ba8b-43e8df081b01","metadata":{},"source":"### Populate the database"},{"cell_type":"code","execution_count":29,"id":"ca761550-f6f9-45f2-a3bf-1c25cd2aee38","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:28.486691Z","iopub.status.busy":"2023-06-06T21:09:28.486185Z","iopub.status.idle":"2023-06-06T21:09:42.080303Z","shell.execute_reply":"2023-06-06T21:09:42.079627Z","shell.execute_reply.started":"2023-06-06T21:09:28.486662Z"},"tags":[],"trusted":true},"outputs":[],"source":"%sql TRUNCATE TABLE news_articles;\nstatement = f\"\"\"\n INSERT INTO news.news_articles (\n id,\n title,\n description,\n genre,\n embedding\n )\n VALUES (\n %s,\n %s,\n %s,\n %s,\n JSON_ARRAY_PACK_F64(%s)\n )\n \"\"\"\nfor i in range(0, len(data)):\n try:\n ndarray_to_list = all_embeddings[i].tolist()\n json_vector = json.dumps(ndarray_to_list)\n db_connection.execute(statement, (i, data[i][0], data[i][1], data[i][3], str(json_vector)))\n except Exception as e:\n print(\"Error inserting row {}: {}\".format(i, e))\n continue"},{"cell_type":"markdown","id":"a2f3d567-eaf4-487a-a1f9-2eb7e1071991","metadata":{"tags":[]},"source":"## Semantic Search"},{"cell_type":"markdown","id":"7ad3b8f6-d3a8-4954-a737-f11c785ce9ce","metadata":{},"source":"### Connect to 
OpenAI"},{"cell_type":"code","execution_count":30,"id":"598d7077-d04c-46b3-b7c4-7b4362dd4507","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:45.778023Z","iopub.status.busy":"2023-06-06T21:09:45.777510Z","iopub.status.idle":"2023-06-06T21:09:45.849344Z","shell.execute_reply":"2023-06-06T21:09:45.848671Z","shell.execute_reply.started":"2023-06-06T21:09:45.777998Z"},"tags":[],"trusted":true},"outputs":[],"source":"import openai\n\n# models\nEMBEDDING_MODEL = \"text-embedding-ada-002\"\nGPT_MODEL = \"gpt-3.5-turbo\""},{"cell_type":"code","execution_count":31,"id":"9eea2f67-3c2e-4d1a-87c2-052c2acf4026","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:45.962790Z","iopub.status.busy":"2023-06-06T21:09:45.962139Z","iopub.status.idle":"2023-06-06T21:09:45.966722Z","shell.execute_reply":"2023-06-06T21:09:45.965769Z","shell.execute_reply.started":"2023-06-06T21:09:45.962761Z"},"tags":[],"trusted":true},"outputs":[],"source":"openai.api_key = 'YOUR_API_KEY'"},{"cell_type":"markdown","id":"6504f561-1ab1-4dbf-a523-0aef23b66e4b","metadata":{},"source":"### Run Semantic Search and get scores"},{"cell_type":"code","execution_count":40,"id":"a62a4c06-d77a-49b1-beaf-4c54b04d001f","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:11:26.965676Z","iopub.status.busy":"2023-06-06T21:11:26.965313Z","iopub.status.idle":"2023-06-06T21:11:27.367072Z","shell.execute_reply":"2023-06-06T21:11:27.366404Z","shell.execute_reply.started":"2023-06-06T21:11:26.965649Z"},"tags":[],"trusted":true},"outputs":[],"source":"from openai.embeddings_utils import get_embedding\nsearch_query = \"Articles about Merck stock\"\nsearch_embedding = json.dumps(model.encode(search_query).tolist())\n\n# Get the embedding of the query.\nquery_embedding_response = get_embedding(search_query, EMBEDDING_MODEL)\n\n# Create the SQL statement.\nquery_statement = f\"\"\"\n SELECT\n title,\n description,\n genre,\n DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS score\n FROM 
news.news_articles\n ORDER BY score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nresults = db_connection.execute(query_statement, (search_embedding,)).fetchall()"},{"cell_type":"markdown","id":"b9128422-3375-4801-b525-ea9381c50719","metadata":{},"source":"Printing the results:"},{"cell_type":"code","execution_count":41,"id":"0430b21f-9ace-4907-99ad-246abcf3b862","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:11:27.685235Z","iopub.status.busy":"2023-06-06T21:11:27.684880Z","iopub.status.idle":"2023-06-06T21:11:27.689124Z","shell.execute_reply":"2023-06-06T21:11:27.688337Z","shell.execute_reply.started":"2023-06-06T21:11:27.685209Z"},"tags":[],"trusted":true},"outputs":[],"source":"output_list = []\n\nfor res in results:\n output_list.append([\n res[0], res[2], res[3]\n ])"},{"cell_type":"code","execution_count":42,"id":"76769481-a3fa-48f8-851d-7cfb35043916","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:11:27.691335Z","iopub.status.busy":"2023-06-06T21:11:27.690636Z","iopub.status.idle":"2023-06-06T21:11:27.697204Z","shell.execute_reply":"2023-06-06T21:11:27.696451Z","shell.execute_reply.started":"2023-06-06T21:11:27.691305Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"title genre score\n----------------------------------------------------------------- -------- --------\nMerck plunges on report it hid Vioxx risks Business 0.612733\nDrug Firm Shares in Slump Business 0.526167\nMerck Pulls Arthritis Drug Off Market Business 0.518859\nMerck takes down the Dow Business 0.492008\n#39;Best #39; ratings a response to rising costs, safety concerns Business 0.468971\nWashington Post Profit Up on Education,TV Business 0.447177\nDRUG LIABILITY: Attorneys want to query Merck CEO Business 0.445199\nDow Jones to Buy MarketWatch for \\$463 Mln Business 0.417015\nWill This Takeover Plan Turn Into Gold? 
(The Motley Fool) Sci/Tech 0.376652\nNorthrop Third-Quarter Profit Rises Business 0.369725\n"}],"source":"from tabulate import tabulate\nheaders = [\"title\", \"genre\", \"score\"]\nprint(tabulate(output_list, headers))"},{"cell_type":"markdown","id":"2c8ff862-ea5b-4960-be5b-bcd530d6e918","metadata":{},"source":"## Hybrid Search"},{"cell_type":"markdown","id":"d0b2cff3-76f8-4a35-a596-4f001a9b4c8c","metadata":{},"source":"This search finds the average of the score gotten from the semantic search and the score gotten from the key-word search and sorts the news articles by this combined score to perform an effective hybrid search."},{"cell_type":"code","execution_count":123,"id":"9df7073f-6a89-4528-968c-7d5c21876a83","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:29:56.917413Z","iopub.status.busy":"2023-06-06T21:29:56.916997Z","iopub.status.idle":"2023-06-06T21:29:58.107311Z","shell.execute_reply":"2023-06-06T21:29:58.106593Z","shell.execute_reply.started":"2023-06-06T21:29:56.917385Z"},"tags":[],"trusted":true},"outputs":[],"source":"hyb_query = \"Articles about Merck stock\"\nhyb_embedding = json.dumps(model.encode(hyb_query).tolist())\n\n# Get the embedding of the query.\nhyb_embedding_response = get_embedding(hyb_query, EMBEDDING_MODEL)\n\n# Create the SQL statement.\nhyb_statement = f\"\"\"\n SELECT\n title,\n description,\n genre,\n DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS semantic_score,\n MATCH(title, description) AGAINST (%s) AS keyword_score,\n (semantic_score + keyword_score) / 2 AS combined_score\n FROM news.news_articles\n ORDER BY combined_score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nhyb_results = db_connection.execute(hyb_statement, (hyb_embedding, 
hyb_query)).fetchall()"},{"cell_type":"code","execution_count":124,"id":"acef6808-706d-4002-9e2d-1967ab5974c7","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:29:58.109179Z","iopub.status.busy":"2023-06-06T21:29:58.108868Z","iopub.status.idle":"2023-06-06T21:29:58.113525Z","shell.execute_reply":"2023-06-06T21:29:58.112736Z","shell.execute_reply.started":"2023-06-06T21:29:58.109148Z"},"tags":[],"trusted":true},"outputs":[],"source":"hybrid_output_list = []\n\nfor res in hyb_results:\n hybrid_output_list.append([\n res[0], res[2], res[5]\n ])"},{"cell_type":"code","execution_count":125,"id":"c2b8bae4-a16d-4aa4-944d-74e7ffa396e3","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:29:58.115238Z","iopub.status.busy":"2023-06-06T21:29:58.114862Z","iopub.status.idle":"2023-06-06T21:29:58.120714Z","shell.execute_reply":"2023-06-06T21:29:58.119903Z","shell.execute_reply.started":"2023-06-06T21:29:58.115209Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"title genre score\n------------------------------------------------------------------- -------- --------\nDRUG LIABILITY: Attorneys want to query Merck CEO Business 0.492007\nMerck plunges on report it hid Vioxx risks Business 0.485847\nMerck takes down the Dow Business 0.426202\nMerck Pulls Arthritis Drug Off Market Business 0.402937\nDrug Firm Shares in Slump Business 0.331145\nUS Stock-Index Futures Decline; Citigroup, GE Slip in Europe Business 0.237872\n#39;Best #39; ratings a response to rising costs, safety concerns Business 0.234486\nOwner of Big Electronic Stock Trading System Is Said to Be for Sale Business 0.224903\nWashington Post Profit Up on Education,TV Business 0.223588\nDow Jones to Buy MarketWatch for \\$463 Mln Business 0.217986\n"}],"source":"from tabulate import tabulate\nheaders = [\"title\", \"genre\", \"score\"]\nprint(tabulate(hybrid_output_list, 
headers))"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.9"},"singlestore_connection":{"connectionID":"1efa4dba-bf60-42f3-8d19-19dc6b6ffb35","defaultDatabase":""},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} \ No newline at end of file From b44b5c7c1f7bfca9a28697aefdc7047d72f90398 Mon Sep 17 00:00:00 2001 From: trishamaturi <63947635+trishamaturi@users.noreply.github.com> Date: Mon, 10 Jul 2023 11:05:27 -0700 Subject: [PATCH 2/4] adding meta.toml file --- notebooks/hybrid-search/meta.toml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 notebooks/hybrid-search/meta.toml diff --git a/notebooks/hybrid-search/meta.toml b/notebooks/hybrid-search/meta.toml new file mode 100644 index 0000000..a4da391 --- /dev/null +++ b/notebooks/hybrid-search/meta.toml @@ -0,0 +1,4 @@ +[meta] +title="Hybrid Search" +description="Hybrid search combines keyword search with semantic search, aiming to provide more accurate results." 
+tags=[] From 54eedf6662345ccffb0c8280e5a7b60704866033 Mon Sep 17 00:00:00 2001 From: trishamaturi <63947635+trishamaturi@users.noreply.github.com> Date: Thu, 3 Aug 2023 13:20:44 -0700 Subject: [PATCH 3/4] updating notebook after review --- notebooks/hybrid-search/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/hybrid-search/notebook.ipynb b/notebooks/hybrid-search/notebook.ipynb index d7ad799..91fff25 100644 --- a/notebooks/hybrid-search/notebook.ipynb +++ b/notebooks/hybrid-search/notebook.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","id":"d9f9e629-6eb9-4ca5-bcf2-1b8672b86725","metadata":{"execution":{"iopub.execute_input":"2023-06-06T03:34:15.712942Z","iopub.status.busy":"2023-06-06T03:34:15.712613Z","iopub.status.idle":"2023-06-06T03:34:15.715753Z","shell.execute_reply":"2023-06-06T03:34:15.715128Z","shell.execute_reply.started":"2023-06-06T03:34:15.712919Z"},"tags":[]},"source":"# Hybrid Search\nHybrid search integrates both keyword-based search and semantic search in order to combine the strengths of both and provide users with a more comprehensive and efficient search experience. 
This notebook is an example on how to perform hybrid search with SingleStore's database and notebooks."},{"cell_type":"markdown","id":"532e8d3f-007d-48a4-8d36-44b561dd1109","metadata":{},"source":"## Setup\nLet's first download the libraries necessary."},{"cell_type":"code","execution_count":null,"id":"07990b64-9447-46a8-abbc-51be1972dfda","metadata":{"execution":{"iopub.status.busy":"2023-06-06T21:08:59.945009Z","iopub.status.idle":"2023-06-06T21:08:59.945409Z","shell.execute_reply":"2023-06-06T21:08:59.945223Z","shell.execute_reply.started":"2023-06-06T21:08:59.945200Z"},"tags":[],"trusted":true},"outputs":[],"source":"!pip install matplotlib --quiet\n!pip install plotly.express --quiet\n!pip install scikit-learn --quiet\n!pip install tabulate --quiet\n!pip install tiktoken --quiet\n!pip install wget --quiet\n!pip install openai --quiet"},{"cell_type":"code","execution_count":3,"id":"a592dd5e-4114-4abf-923d-74038f5244eb","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:02:56.564548Z","iopub.status.busy":"2023-06-06T21:02:56.563787Z","iopub.status.idle":"2023-06-06T21:02:56.569852Z","shell.execute_reply":"2023-06-06T21:02:56.569020Z","shell.execute_reply.started":"2023-06-06T21:02:56.564514Z"},"tags":[],"trusted":true},"outputs":[],"source":"import pandas as pd\nimport os\nimport wget\nimport ast\nimport json"},{"cell_type":"code","execution_count":4,"id":"c2bffc74-4b6a-4c0f-acef-f72bb255ec79","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:02:56.571594Z","iopub.status.busy":"2023-06-06T21:02:56.571082Z","iopub.status.idle":"2023-06-06T21:03:03.267025Z","shell.execute_reply":"2023-06-06T21:03:03.266304Z","shell.execute_reply.started":"2023-06-06T21:02:56.571565Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Import the library for vectorizing the data (Up to 2 minutes)\n!pip install sentence-transformers --quiet\nfrom sentence_transformers import SentenceTransformer\nmodel = 
SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')"},{"cell_type":"markdown","id":"0aa95a80-5683-4dc3-9e52-c3e890ab87af","metadata":{},"source":"## Import data from CSV File\nThis csv file holds the title, summary, and category of approximately 2000 news articles."},{"cell_type":"code","execution_count":5,"id":"b1b2971e-d0f6-4cfa-a9a7-954602bda460","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.268584Z","iopub.status.busy":"2023-06-06T21:03:03.268085Z","iopub.status.idle":"2023-06-06T21:03:03.277120Z","shell.execute_reply":"2023-06-06T21:03:03.276481Z","shell.execute_reply.started":"2023-06-06T21:03:03.268551Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"File already exists in the local file system.\n"}],"source":"# download reviews csv file\ncvs_file_path = \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/data/AG_news_samples.csv\"\nfile_path = \"AG_news_samples.csv\"\n\nif not os.path.exists(file_path):\n wget.download(cvs_file_path, file_path)\n print(\"File downloaded successfully.\")\nelse:\n print(\"File already exists in the local file system.\")"},{"cell_type":"code","execution_count":6,"id":"6c821edd-ce7b-46d9-aa79-0ab1766266a0","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.278520Z","iopub.status.busy":"2023-06-06T21:03:03.278228Z","iopub.status.idle":"2023-06-06T21:03:03.293099Z","shell.execute_reply":"2023-06-06T21:03:03.292469Z","shell.execute_reply.started":"2023-06-06T21:03:03.278492Z"},"tags":[],"trusted":true},"outputs":[],"source":"df = 
pd.read_csv('AG_news_samples.csv')"},{"cell_type":"code","execution_count":7,"id":"cf7caa13-848a-46f1-9730-7f441339e65c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.296401Z","iopub.status.busy":"2023-06-06T21:03:03.296190Z","iopub.status.idle":"2023-06-06T21:03:03.308541Z","shell.execute_reply":"2023-06-06T21:03:03.307827Z","shell.execute_reply.started":"2023-06-06T21:03:03.296383Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003elabel_int\u003c/th\u003e\n \u003cth\u003elabel\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eWorld Briefings\u003c/td\u003e\n \u003ctd\u003eBRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M...\u003c/td\u003e\n \u003ctd\u003e1\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eNvidia Puts a Firewall on a Motherboard (PC Wo...\u003c/td\u003e\n \u003ctd\u003ePC World - Upcoming chip set will include buil...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eOlympic joy in Greek, Chinese press\u003c/td\u003e\n \u003ctd\u003eNewspapers in Greece reflect a mixture of exhi...\u003c/td\u003e\n \u003ctd\u003e2\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n 
\u003ctd\u003eU2 Can iPod with Pictures\u003c/td\u003e\n \u003ctd\u003eSAN JOSE, Calif. -- Apple Computer (Quote, Cha...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eThe Dream Factory\u003c/td\u003e\n \u003ctd\u003eAny product, any shape, any size -- manufactur...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e...\u003c/th\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1995\u003c/th\u003e\n \u003ctd\u003eYou Control: iTunes puts control in OS X menu ...\u003c/td\u003e\n \u003ctd\u003eMacCentral - You Software Inc. announced on Tu...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1996\u003c/th\u003e\n \u003ctd\u003eArgentina beat Italy for place in football final\u003c/td\u003e\n \u003ctd\u003eFavourites Argentina beat Italy 3-0 this morni...\u003c/td\u003e\n \u003ctd\u003e2\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1997\u003c/th\u003e\n \u003ctd\u003eNCAA case no worry for Spurrier\u003c/td\u003e\n \u003ctd\u003eShortly after Steve Spurrier arrived at Florid...\u003c/td\u003e\n \u003ctd\u003e2\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1998\u003c/th\u003e\n \u003ctd\u003eSecret Service Busts Cyber Gangs\u003c/td\u003e\n \u003ctd\u003eThe US Secret Service Thursday announced arres...\u003c/td\u003e\n \u003ctd\u003e4\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1999\u003c/th\u003e\n \u003ctd\u003eStocks Flat; Higher Oil Limits 
Gains\u003c/td\u003e\n \u003ctd\u003eUS stocks were little changed on Thursday as a...\u003c/td\u003e\n \u003ctd\u003e3\u003c/td\u003e\n \u003ctd\u003eBusiness\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003cp\u003e2000 rows × 4 columns\u003c/p\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 World Briefings \n1 Nvidia Puts a Firewall on a Motherboard (PC Wo... \n2 Olympic joy in Greek, Chinese press \n3 U2 Can iPod with Pictures \n4 The Dream Factory \n... ... \n1995 You Control: iTunes puts control in OS X menu ... \n1996 Argentina beat Italy for place in football final \n1997 NCAA case no worry for Spurrier \n1998 Secret Service Busts Cyber Gangs \n1999 Stocks Flat; Higher Oil Limits Gains \n\n description label_int label \n0 BRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M... 1 World \n1 PC World - Upcoming chip set will include buil... 4 Sci/Tech \n2 Newspapers in Greece reflect a mixture of exhi... 2 Sports \n3 SAN JOSE, Calif. -- Apple Computer (Quote, Cha... 4 Sci/Tech \n4 Any product, any shape, any size -- manufactur... 4 Sci/Tech \n... ... ... ... \n1995 MacCentral - You Software Inc. announced on Tu... 4 Sci/Tech \n1996 Favourites Argentina beat Italy 3-0 this morni... 2 Sports \n1997 Shortly after Steve Spurrier arrived at Florid... 2 Sports \n1998 The US Secret Service Thursday announced arres... 4 Sci/Tech \n1999 US stocks were little changed on Thursday as a... 
3 Business \n\n[2000 rows x 4 columns]"},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":"df"},{"cell_type":"code","execution_count":8,"id":"e30c69d3-a807-4437-84e9-6972e3bc3d85","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:03.310062Z","iopub.status.busy":"2023-06-06T21:03:03.309759Z","iopub.status.idle":"2023-06-06T21:03:03.315541Z","shell.execute_reply":"2023-06-06T21:03:03.314814Z","shell.execute_reply.started":"2023-06-06T21:03:03.310033Z"},"tags":[],"trusted":true},"outputs":[],"source":"data = df.values.tolist()"},{"cell_type":"markdown","id":"0b6c6560-bc60-43ba-93a4-1b4aee933d5b","metadata":{},"source":"## Set up SingleStore Database"},{"cell_type":"markdown","id":"d6a1952b-7313-4007-9ec5-4c506425190f","metadata":{},"source":"Connect to your SingleStoreDB Cloud workspaces using SQLAlchemy."},{"cell_type":"code","execution_count":26,"id":"1e8b918f-d849-4bad-b5e9-1cf8be138026","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:17.465072Z","iopub.status.busy":"2023-06-06T21:09:17.464722Z","iopub.status.idle":"2023-06-06T21:09:17.469360Z","shell.execute_reply":"2023-06-06T21:09:17.468395Z","shell.execute_reply.started":"2023-06-06T21:09:17.465048Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Import only the name this notebook uses instead of a wildcard import.\nfrom sqlalchemy import create_engine\n\n# NOTE(review): connection_url is presumably provided by the SingleStore notebook environment - confirm.\ndb_connection = create_engine(connection_url)"},{"cell_type":"markdown","id":"e1dd6296-54b0-4f8d-886a-13cacfc28163","metadata":{},"source":"Set up your SingleStore Database which will hold your 
data."},{"cell_type":"code","execution_count":27,"id":"e1874b6f-706a-4638-ad2a-ca387953acaa","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:18.544191Z","iopub.status.busy":"2023-06-06T21:09:18.543841Z","iopub.status.idle":"2023-06-06T21:09:25.532992Z","shell.execute_reply":"2023-06-06T21:09:25.532154Z","shell.execute_reply.started":"2023-06-06T21:09:18.544164Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":"[]"},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\n-- Create the database\nDROP DATABASE IF EXISTS news;\nCREATE DATABASE IF NOT EXISTS news;"},{"cell_type":"code","execution_count":28,"id":"3f1e2c3d-6fbd-46bb-9bd3-235eb51941cf","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:25.535009Z","iopub.status.busy":"2023-06-06T21:09:25.534700Z","iopub.status.idle":"2023-06-06T21:09:25.891270Z","shell.execute_reply":"2023-06-06T21:09:25.890595Z","shell.execute_reply.started":"2023-06-06T21:09:25.534981Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":"[]"},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nUSE news;\n-- Create the table\nDROP TABLE IF EXISTS news_articles;\nCREATE TABLE IF NOT EXISTS news_articles (\n id INT PRIMARY KEY,\n title TEXT,\n description TEXT,\n genre TEXT,\n embedding BLOB,\n FULLTEXT (title, description)\n);"},{"cell_type":"markdown","id":"8bd97023-3d02-44d4-8bd3-59875cb22b6c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T06:31:11.967693Z","iopub.status.busy":"2023-06-06T06:31:11.967312Z","iopub.status.idle":"2023-06-06T06:31:11.971035Z","shell.execute_reply":"2023-06-06T06:31:11.970370Z","shell.execute_reply.started":"2023-06-06T06:31:11.967669Z"},"tags":[]},"source":"### Get embeddings for every row based on the description 
column."},{"cell_type":"code","execution_count":null,"id":"496f84d0-51b6-4b66-bf5b-b1b260e4c2de","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:03:11.982453Z","iopub.status.busy":"2023-06-06T21:03:11.981961Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Descriptions are the second column of each row; replace missing values with '' so encode() only receives strings.\ndescriptions = ['' if pd.isna(row[1]) else row[1] for row in data]\nall_embeddings = model.encode(descriptions)\nall_embeddings.shape"},{"cell_type":"markdown","id":"46b1628c-0ffc-4a84-ba8b-43e8df081b01","metadata":{},"source":"### Populate the database"},{"cell_type":"code","execution_count":29,"id":"ca761550-f6f9-45f2-a3bf-1c25cd2aee38","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:28.486691Z","iopub.status.busy":"2023-06-06T21:09:28.486185Z","iopub.status.idle":"2023-06-06T21:09:42.080303Z","shell.execute_reply":"2023-06-06T21:09:42.079627Z","shell.execute_reply.started":"2023-06-06T21:09:28.486662Z"},"tags":[],"trusted":true},"outputs":[],"source":"%sql TRUNCATE TABLE news_articles;\n# Plain (non-f) string: each %s is a driver placeholder bound at execute time, not Python formatting.\nstatement = \"\"\"\n    INSERT INTO news.news_articles (\n        id,\n        title,\n        description,\n        genre,\n        embedding\n    )\n    VALUES (\n        %s,\n        %s,\n        %s,\n        %s,\n        JSON_ARRAY_PACK_F64(%s)\n    )\n\"\"\"\n\nfor i, (row, embedding) in enumerate(zip(data, all_embeddings)):\n    try:\n        # Rows are [title, description, label_int, label]; the text label (index 3) is stored as genre.\n        json_vector = json.dumps(embedding.tolist())\n        db_connection.execute(statement, (i, row[0], row[1], row[3], json_vector))\n    except Exception as e:\n        # Best effort: report the failing row and keep loading the rest.\n        print(\"Error inserting row {}: {}\".format(i, e))"},{"cell_type":"markdown","id":"a2f3d567-eaf4-487a-a1f9-2eb7e1071991","metadata":{"tags":[]},"source":"## Semantic Search"},{"cell_type":"markdown","id":"7ad3b8f6-d3a8-4954-a737-f11c785ce9ce","metadata":{},"source":"### Connect to 
OpenAI"},{"cell_type":"code","execution_count":30,"id":"598d7077-d04c-46b3-b7c4-7b4362dd4507","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:45.778023Z","iopub.status.busy":"2023-06-06T21:09:45.777510Z","iopub.status.idle":"2023-06-06T21:09:45.849344Z","shell.execute_reply":"2023-06-06T21:09:45.848671Z","shell.execute_reply.started":"2023-06-06T21:09:45.777998Z"},"tags":[],"trusted":true},"outputs":[],"source":"import openai\n\n# models\nEMBEDDING_MODEL = \"text-embedding-ada-002\"\nGPT_MODEL = \"gpt-3.5-turbo\""},{"cell_type":"code","execution_count":31,"id":"9eea2f67-3c2e-4d1a-87c2-052c2acf4026","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:09:45.962790Z","iopub.status.busy":"2023-06-06T21:09:45.962139Z","iopub.status.idle":"2023-06-06T21:09:45.966722Z","shell.execute_reply":"2023-06-06T21:09:45.965769Z","shell.execute_reply.started":"2023-06-06T21:09:45.962761Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Read the key from the OPENAI_API_KEY environment variable rather than pasting a secret into the notebook.\nopenai.api_key = os.environ.get(\"OPENAI_API_KEY\")"},{"cell_type":"markdown","id":"6504f561-1ab1-4dbf-a523-0aef23b66e4b","metadata":{},"source":"### Run Semantic Search and get scores"},{"cell_type":"code","execution_count":40,"id":"a62a4c06-d77a-49b1-beaf-4c54b04d001f","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:11:26.965676Z","iopub.status.busy":"2023-06-06T21:11:26.965313Z","iopub.status.idle":"2023-06-06T21:11:27.367072Z","shell.execute_reply":"2023-06-06T21:11:27.366404Z","shell.execute_reply.started":"2023-06-06T21:11:26.965649Z"},"tags":[],"trusted":true},"outputs":[],"source":"from openai.embeddings_utils import get_embedding\nsearch_query = \"Articles about Merck stock\"\n\n# Embed the query with the same SentenceTransformer model that produced the stored article embeddings.\nsearch_embedding = json.dumps(model.encode(search_query).tolist())\n\n# Create the SQL statement (plain string: %s is a driver placeholder, not Python formatting).\nquery_statement = \"\"\"\n    SELECT\n        title,\n        description,\n        genre,\n        DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS score\n    FROM 
news.news_articles\n ORDER BY score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nresults = db_connection.execute(query_statement, (search_embedding,)).fetchall()"},{"cell_type":"markdown","id":"b9128422-3375-4801-b525-ea9381c50719","metadata":{},"source":"Printing the results:"},{"cell_type":"code","execution_count":41,"id":"0430b21f-9ace-4907-99ad-246abcf3b862","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:11:27.685235Z","iopub.status.busy":"2023-06-06T21:11:27.684880Z","iopub.status.idle":"2023-06-06T21:11:27.689124Z","shell.execute_reply":"2023-06-06T21:11:27.688337Z","shell.execute_reply.started":"2023-06-06T21:11:27.685209Z"},"tags":[],"trusted":true},"outputs":[],"source":"output_list = []\n\nfor res in results:\n output_list.append([\n res[0], res[2], res[3]\n ])"},{"cell_type":"code","execution_count":42,"id":"76769481-a3fa-48f8-851d-7cfb35043916","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:11:27.691335Z","iopub.status.busy":"2023-06-06T21:11:27.690636Z","iopub.status.idle":"2023-06-06T21:11:27.697204Z","shell.execute_reply":"2023-06-06T21:11:27.696451Z","shell.execute_reply.started":"2023-06-06T21:11:27.691305Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"title genre score\n----------------------------------------------------------------- -------- --------\nMerck plunges on report it hid Vioxx risks Business 0.612733\nDrug Firm Shares in Slump Business 0.526167\nMerck Pulls Arthritis Drug Off Market Business 0.518859\nMerck takes down the Dow Business 0.492008\n#39;Best #39; ratings a response to rising costs, safety concerns Business 0.468971\nWashington Post Profit Up on Education,TV Business 0.447177\nDRUG LIABILITY: Attorneys want to query Merck CEO Business 0.445199\nDow Jones to Buy MarketWatch for \\$463 Mln Business 0.417015\nWill This Takeover Plan Turn Into Gold? 
(The Motley Fool) Sci/Tech 0.376652\nNorthrop Third-Quarter Profit Rises Business 0.369725\n"}],"source":"from tabulate import tabulate\nheaders = [\"title\", \"genre\", \"score\"]\nprint(tabulate(output_list, headers))"},{"cell_type":"markdown","id":"2c8ff862-ea5b-4960-be5b-bcd530d6e918","metadata":{},"source":"## Hybrid Search"},{"cell_type":"markdown","id":"d0b2cff3-76f8-4a35-a596-4f001a9b4c8c","metadata":{},"source":"This search takes the average of the semantic search score and the keyword search score and sorts the news articles by this combined score to perform an effective hybrid search."},{"cell_type":"code","execution_count":123,"id":"9df7073f-6a89-4528-968c-7d5c21876a83","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:29:56.917413Z","iopub.status.busy":"2023-06-06T21:29:56.916997Z","iopub.status.idle":"2023-06-06T21:29:58.107311Z","shell.execute_reply":"2023-06-06T21:29:58.106593Z","shell.execute_reply.started":"2023-06-06T21:29:56.917385Z"},"tags":[],"trusted":true},"outputs":[],"source":"hyb_query = \"Articles about Merck stock\"\n\n# Embed the query with the same SentenceTransformer model used for the stored article embeddings.\nhyb_embedding = json.dumps(model.encode(hyb_query).tolist())\n\n# Create the SQL statement (plain string). The two %s placeholders are bound in order:\n# the query embedding, then the raw query text for the full-text match.\nhyb_statement = \"\"\"\n    SELECT\n        title,\n        description,\n        genre,\n        DOT_PRODUCT_F64(JSON_ARRAY_PACK_F64(%s), embedding) AS semantic_score,\n        MATCH(title, description) AGAINST (%s) AS keyword_score,\n        (semantic_score + keyword_score) / 2 AS combined_score\n    FROM news.news_articles\n    ORDER BY combined_score DESC\n    LIMIT 10\n\"\"\"\n\n# Execute the SQL statement.\nhyb_results = db_connection.execute(hyb_statement, (hyb_embedding, 
hyb_query)).fetchall()"},{"cell_type":"code","execution_count":124,"id":"acef6808-706d-4002-9e2d-1967ab5974c7","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:29:58.109179Z","iopub.status.busy":"2023-06-06T21:29:58.108868Z","iopub.status.idle":"2023-06-06T21:29:58.113525Z","shell.execute_reply":"2023-06-06T21:29:58.112736Z","shell.execute_reply.started":"2023-06-06T21:29:58.109148Z"},"tags":[],"trusted":true},"outputs":[],"source":"hybrid_output_list = []\n\nfor res in hyb_results:\n hybrid_output_list.append([\n res[0], res[2], res[5]\n ])"},{"cell_type":"code","execution_count":125,"id":"c2b8bae4-a16d-4aa4-944d-74e7ffa396e3","metadata":{"execution":{"iopub.execute_input":"2023-06-06T21:29:58.115238Z","iopub.status.busy":"2023-06-06T21:29:58.114862Z","iopub.status.idle":"2023-06-06T21:29:58.120714Z","shell.execute_reply":"2023-06-06T21:29:58.119903Z","shell.execute_reply.started":"2023-06-06T21:29:58.115209Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"title genre score\n------------------------------------------------------------------- -------- --------\nDRUG LIABILITY: Attorneys want to query Merck CEO Business 0.492007\nMerck plunges on report it hid Vioxx risks Business 0.485847\nMerck takes down the Dow Business 0.426202\nMerck Pulls Arthritis Drug Off Market Business 0.402937\nDrug Firm Shares in Slump Business 0.331145\nUS Stock-Index Futures Decline; Citigroup, GE Slip in Europe Business 0.237872\n#39;Best #39; ratings a response to rising costs, safety concerns Business 0.234486\nOwner of Big Electronic Stock Trading System Is Said to Be for Sale Business 0.224903\nWashington Post Profit Up on Education,TV Business 0.223588\nDow Jones to Buy MarketWatch for \\$463 Mln Business 0.217986\n"}],"source":"from tabulate import tabulate\nheaders = [\"title\", \"genre\", \"score\"]\nprint(tabulate(hybrid_output_list, 
headers))"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.9"},"singlestore_connection":{"connectionID":"1efa4dba-bf60-42f3-8d19-19dc6b6ffb35","defaultDatabase":""},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} \ No newline at end of file +{"cells":[{"cell_type":"markdown","id":"505a207d-82ee-406d-bb92-e6a6900d6d18","metadata":{},"source":"\u003cdiv id=\"singlestore-header\" style=\"display: flex; background-color: rgba(209, 153, 255, 0.25); padding: 5px;\"\u003e\n \u003cdiv id=\"icon-image\" style=\"width: 90px; height: 90px;\"\u003e\n \u003cimg width=\"100%\" height=\"100%\" src=\"https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/vector-circle.png\" /\u003e\n \u003c/div\u003e\n \u003cdiv id=\"text\" style=\"padding: 5px; margin-left: 10px;\"\u003e\n \u003cdiv id=\"badge\" style=\"display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%\"\u003eSingleStore Notebooks\u003c/div\u003e\n \u003ch1 style=\"font-weight: 500; margin: 8px 0 0 4px;\"\u003eHybrid Search\u003c/h1\u003e\n \u003c/div\u003e\n\u003c/div\u003e"},{"cell_type":"markdown","id":"d9f9e629-6eb9-4ca5-bcf2-1b8672b86725","metadata":{"execution":{"iopub.execute_input":"2023-06-06T03:34:15.712942Z","iopub.status.busy":"2023-06-06T03:34:15.712613Z","iopub.status.idle":"2023-06-06T03:34:15.715753Z","shell.execute_reply":"2023-06-06T03:34:15.715128Z","shell.execute_reply.started":"2023-06-06T03:34:15.712919Z"},"tags":[]},"source":"*Source*: [OpenAI 
Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/data/AG_news_samples.csv)\n\nHybrid search combines keyword-based search and semantic search to draw on the strengths of both and provide users with a more comprehensive and efficient search experience. This notebook is an example of how to perform hybrid search with SingleStore's database and notebooks."},{"cell_type":"markdown","id":"532e8d3f-007d-48a4-8d36-44b561dd1109","metadata":{},"source":"## Setup\nLet's first install the necessary libraries."},{"cell_type":"code","execution_count":4,"id":"07990b64-9447-46a8-abbc-51be1972dfda","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:26.985627Z","iopub.status.busy":"2023-08-03T20:10:26.985087Z","iopub.status.idle":"2023-08-03T20:10:40.655368Z","shell.execute_reply":"2023-08-03T20:10:40.654602Z","shell.execute_reply.started":"2023-08-03T20:10:26.985608Z"},"tags":[],"trusted":true},"outputs":[],"source":"!pip install matplotlib --quiet\n!pip install plotly.express --quiet\n!pip install scikit-learn --quiet\n!pip install tabulate --quiet\n!pip install tiktoken --quiet\n!pip install wget --quiet\n!pip install openai --quiet"},{"cell_type":"code","execution_count":5,"id":"a592dd5e-4114-4abf-923d-74038f5244eb","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:40.657067Z","iopub.status.busy":"2023-08-03T20:10:40.656816Z","iopub.status.idle":"2023-08-03T20:10:40.663127Z","shell.execute_reply":"2023-08-03T20:10:40.662413Z","shell.execute_reply.started":"2023-08-03T20:10:40.657044Z"},"tags":[],"trusted":true},"outputs":[],"source":"import pandas as pd\nimport os\nimport wget\nimport ast\nimport 
json"},{"cell_type":"code","execution_count":6,"id":"c2bffc74-4b6a-4c0f-acef-f72bb255ec79","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:40.664287Z","iopub.status.busy":"2023-08-03T20:10:40.664046Z","iopub.status.idle":"2023-08-03T20:10:45.214897Z","shell.execute_reply":"2023-08-03T20:10:45.214240Z","shell.execute_reply.started":"2023-08-03T20:10:40.664266Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Import the library for vectorizing the data (Up to 2 minutes)\n!pip install sentence-transformers --quiet\nfrom sentence_transformers import SentenceTransformer\nmodel = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')"},{"cell_type":"markdown","id":"0aa95a80-5683-4dc3-9e52-c3e890ab87af","metadata":{},"source":"## Import data from CSV File\nThis csv file holds the title, summary, and category of approximately 2000 news articles."},{"cell_type":"code","execution_count":7,"id":"b1b2971e-d0f6-4cfa-a9a7-954602bda460","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.216911Z","iopub.status.busy":"2023-08-03T20:10:45.216522Z","iopub.status.idle":"2023-08-03T20:10:45.221195Z","shell.execute_reply":"2023-08-03T20:10:45.220623Z","shell.execute_reply.started":"2023-08-03T20:10:45.216889Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"File already exists in the local file system.\n"}],"source":"# download reviews csv file\ncvs_file_path = \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/data/AG_news_samples.csv\"\nfile_path = \"AG_news_samples.csv\"\n\nif not os.path.exists(file_path):\n wget.download(cvs_file_path, file_path)\n print(\"File downloaded successfully.\")\nelse:\n print(\"File already exists in the local file 
system.\")"},{"cell_type":"code","execution_count":8,"id":"6c821edd-ce7b-46d9-aa79-0ab1766266a0","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.222435Z","iopub.status.busy":"2023-08-03T20:10:45.222055Z","iopub.status.idle":"2023-08-03T20:10:45.241632Z","shell.execute_reply":"2023-08-03T20:10:45.240965Z","shell.execute_reply.started":"2023-08-03T20:10:45.222413Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003elabel\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eWorld Briefings\u003c/td\u003e\n \u003ctd\u003eBRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eNvidia Puts a Firewall on a Motherboard (PC Wo...\u003c/td\u003e\n \u003ctd\u003ePC World - Upcoming chip set will include buil...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eOlympic joy in Greek, Chinese press\u003c/td\u003e\n \u003ctd\u003eNewspapers in Greece reflect a mixture of exhi...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n \u003ctd\u003eU2 Can iPod with Pictures\u003c/td\u003e\n \u003ctd\u003eSAN JOSE, Calif. 
-- Apple Computer (Quote, Cha...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eThe Dream Factory\u003c/td\u003e\n \u003ctd\u003eAny product, any shape, any size -- manufactur...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e...\u003c/th\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1995\u003c/th\u003e\n \u003ctd\u003eYou Control: iTunes puts control in OS X menu ...\u003c/td\u003e\n \u003ctd\u003eMacCentral - You Software Inc. announced on Tu...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1996\u003c/th\u003e\n \u003ctd\u003eArgentina beat Italy for place in football final\u003c/td\u003e\n \u003ctd\u003eFavourites Argentina beat Italy 3-0 this morni...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1997\u003c/th\u003e\n \u003ctd\u003eNCAA case no worry for Spurrier\u003c/td\u003e\n \u003ctd\u003eShortly after Steve Spurrier arrived at Florid...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1998\u003c/th\u003e\n \u003ctd\u003eSecret Service Busts Cyber Gangs\u003c/td\u003e\n \u003ctd\u003eThe US Secret Service Thursday announced arres...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1999\u003c/th\u003e\n \u003ctd\u003eStocks Flat; Higher Oil Limits Gains\u003c/td\u003e\n \u003ctd\u003eUS stocks were little changed on Thursday as a...\u003c/td\u003e\n \u003ctd\u003eBusiness\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003cp\u003e2000 rows × 3 columns\u003c/p\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 World Briefings \n1 Nvidia Puts 
a Firewall on a Motherboard (PC Wo... \n2 Olympic joy in Greek, Chinese press \n3 U2 Can iPod with Pictures \n4 The Dream Factory \n... ... \n1995 You Control: iTunes puts control in OS X menu ... \n1996 Argentina beat Italy for place in football final \n1997 NCAA case no worry for Spurrier \n1998 Secret Service Busts Cyber Gangs \n1999 Stocks Flat; Higher Oil Limits Gains \n\n description label \n0 BRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M... World \n1 PC World - Upcoming chip set will include buil... Sci/Tech \n2 Newspapers in Greece reflect a mixture of exhi... Sports \n3 SAN JOSE, Calif. -- Apple Computer (Quote, Cha... Sci/Tech \n4 Any product, any shape, any size -- manufactur... Sci/Tech \n... ... ... \n1995 MacCentral - You Software Inc. announced on Tu... Sci/Tech \n1996 Favourites Argentina beat Italy 3-0 this morni... Sports \n1997 Shortly after Steve Spurrier arrived at Florid... Sports \n1998 The US Secret Service Thursday announced arres... Sci/Tech \n1999 US stocks were little changed on Thursday as a... 
Business \n\n[2000 rows x 3 columns]"},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":"# Drop the numeric label column; only the text label is kept.\ndf = pd.read_csv('AG_news_samples.csv').drop(columns=['label_int'])\ndf"},{"cell_type":"code","execution_count":9,"id":"e30c69d3-a807-4437-84e9-6972e3bc3d85","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.242886Z","iopub.status.busy":"2023-08-03T20:10:45.242651Z","iopub.status.idle":"2023-08-03T20:10:45.246343Z","shell.execute_reply":"2023-08-03T20:10:45.245714Z","shell.execute_reply.started":"2023-08-03T20:10:45.242864Z"},"tags":[],"trusted":true},"outputs":[],"source":"data = df.values.tolist()"},{"cell_type":"markdown","id":"0b6c6560-bc60-43ba-93a4-1b4aee933d5b","metadata":{},"source":"## Set up SingleStore Database"},{"cell_type":"markdown","id":"d6a1952b-7313-4007-9ec5-4c506425190f","metadata":{},"source":"Connect to your SingleStoreDB Cloud workspaces using SQLAlchemy."},{"cell_type":"code","execution_count":10,"id":"1e8b918f-d849-4bad-b5e9-1cf8be138026","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.247824Z","iopub.status.busy":"2023-08-03T20:10:45.247190Z","iopub.status.idle":"2023-08-03T20:10:45.293473Z","shell.execute_reply":"2023-08-03T20:10:45.292864Z","shell.execute_reply.started":"2023-08-03T20:10:45.247804Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Import only the name used here instead of a wildcard import.\nfrom sqlalchemy import create_engine\n\n# NOTE(review): connection_url is presumably provided by the SingleStore notebook environment - confirm.\ndb_connection = create_engine(connection_url).connect()"},{"cell_type":"markdown","id":"e1dd6296-54b0-4f8d-886a-13cacfc28163","metadata":{},"source":"Set up the SingleStore Database which will hold your 
data."},{"cell_type":"code","execution_count":11,"id":"e1874b6f-706a-4638-ad2a-ca387953acaa","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.294726Z","iopub.status.busy":"2023-08-03T20:10:45.294501Z","iopub.status.idle":"2023-08-03T20:11:33.916475Z","shell.execute_reply":"2023-08-03T20:11:33.915742Z","shell.execute_reply.started":"2023-08-03T20:10:45.294706Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":""},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\n-- Create the database\nDROP DATABASE IF EXISTS news;\nCREATE DATABASE IF NOT EXISTS news;"},{"cell_type":"code","execution_count":12,"id":"3f1e2c3d-6fbd-46bb-9bd3-235eb51941cf","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:11:33.917770Z","iopub.status.busy":"2023-08-03T20:11:33.917541Z","iopub.status.idle":"2023-08-03T20:11:34.179324Z","shell.execute_reply":"2023-08-03T20:11:34.178632Z","shell.execute_reply.started":"2023-08-03T20:11:33.917754Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":""},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nUSE news;\n-- Create the table\nDROP TABLE IF EXISTS news_articles;\nCREATE TABLE IF NOT EXISTS news_articles (\n title TEXT,\n description TEXT,\n genre TEXT,\n embedding BLOB,\n FULLTEXT (title, description)\n);"},{"cell_type":"markdown","id":"8bd97023-3d02-44d4-8bd3-59875cb22b6c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T06:31:11.967693Z","iopub.status.busy":"2023-06-06T06:31:11.967312Z","iopub.status.idle":"2023-06-06T06:31:11.971035Z","shell.execute_reply":"2023-06-06T06:31:11.970370Z","shell.execute_reply.started":"2023-06-06T06:31:11.967669Z"},"tags":[]},"source":"### Get embeddings for every row based on the description 
column."},{"cell_type":"code","execution_count":13,"id":"496f84d0-51b6-4b66-bf5b-b1b260e4c2de","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:11:34.181980Z","iopub.status.busy":"2023-08-03T20:11:34.181788Z","iopub.status.idle":"2023-08-03T20:14:59.572758Z","shell.execute_reply":"2023-08-03T20:14:59.571998Z","shell.execute_reply.started":"2023-08-03T20:11:34.181964Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":"(2000, 768)"},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":"# Takes around 3.5 minutes to compute embeddings for all 2000 rows\n\ndescriptions = [row[1] for row in data]\nall_embeddings = model.encode(descriptions)\nall_embeddings.shape"},{"cell_type":"code","execution_count":14,"id":"05b2f3fe-c35c-4252-b416-9f7b7aec60a6","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:14:59.573852Z","iopub.status.busy":"2023-08-03T20:14:59.573612Z","iopub.status.idle":"2023-08-03T20:14:59.578230Z","shell.execute_reply":"2023-08-03T20:14:59.577595Z","shell.execute_reply.started":"2023-08-03T20:14:59.573821Z"},"tags":[],"trusted":true},"outputs":[],"source":"combined_data = [tuple(row) + (embedding,) for embedding, row in zip(all_embeddings, data)]"},{"cell_type":"markdown","id":"46b1628c-0ffc-4a84-ba8b-43e8df081b01","metadata":{},"source":"### Populate the database"},{"cell_type":"code","execution_count":15,"id":"cd3e5f9b-d9e5-45fe-ba20-4fb021d7a425","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:14:59.579579Z","iopub.status.busy":"2023-08-03T20:14:59.579094Z","iopub.status.idle":"2023-08-03T20:15:12.791488Z","shell.execute_reply":"2023-08-03T20:15:12.790862Z","shell.execute_reply.started":"2023-08-03T20:14:59.579558Z"},"tags":[],"trusted":true},"outputs":[],"source":"%sql TRUNCATE TABLE news_articles;\nstatement = '''\n INSERT INTO news.news_articles (\n title,\n description,\n genre,\n embedding\n )\n VALUES (\n %s,\n %s,\n %s,\n %s\n )\n '''\n\nfor i, row in 
enumerate(combined_data):\n    try:\n        db_connection.execute(statement, row)\n    except Exception as e:\n        # Best effort: report the failing row and keep loading the rest.\n        print(\"Error inserting row {}: {}\".format(i, e))"},{"cell_type":"markdown","id":"a2f3d567-eaf4-487a-a1f9-2eb7e1071991","metadata":{"tags":[]},"source":"## Semantic Search"},{"cell_type":"markdown","id":"7ad3b8f6-d3a8-4954-a737-f11c785ce9ce","metadata":{},"source":"### Connect to OpenAI"},{"cell_type":"code","execution_count":16,"id":"598d7077-d04c-46b3-b7c4-7b4362dd4507","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.792954Z","iopub.status.busy":"2023-08-03T20:15:12.792546Z","iopub.status.idle":"2023-08-03T20:15:12.845759Z","shell.execute_reply":"2023-08-03T20:15:12.845167Z","shell.execute_reply.started":"2023-08-03T20:15:12.792931Z"},"tags":[],"trusted":true},"outputs":[],"source":"import openai\n\n# models\nEMBEDDING_MODEL = \"text-embedding-ada-002\"\nGPT_MODEL = \"gpt-3.5-turbo\""},{"cell_type":"code","execution_count":17,"id":"9eea2f67-3c2e-4d1a-87c2-052c2acf4026","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.847096Z","iopub.status.busy":"2023-08-03T20:15:12.846702Z","iopub.status.idle":"2023-08-03T20:15:12.850061Z","shell.execute_reply":"2023-08-03T20:15:12.849401Z","shell.execute_reply.started":"2023-08-03T20:15:12.847074Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Read the key from the OPENAI_API_KEY environment variable rather than pasting a secret into the notebook.\nopenai.api_key = os.environ.get(\"OPENAI_API_KEY\")"},{"cell_type":"markdown","id":"6504f561-1ab1-4dbf-a523-0aef23b66e4b","metadata":{},"source":"### Run Semantic Search and get scores"},{"cell_type":"code","execution_count":18,"id":"a62a4c06-d77a-49b1-beaf-4c54b04d001f","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.851400Z","iopub.status.busy":"2023-08-03T20:15:12.851132Z","iopub.status.idle":"2023-08-03T20:15:13.128352Z","shell.execute_reply":"2023-08-03T20:15:13.127794Z","shell.execute_reply.started":"2023-08-03T20:15:12.851379Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle 
scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003egenre\u003c/th\u003e\n \u003cth\u003escore\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eAll Australians accounted for in Iraq: Downer ...\u003c/td\u003e\n \u003ctd\u003eAFP - Australia has accounted for all its nati...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.445395\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eCricket: Aussies dominate India\u003c/td\u003e\n \u003ctd\u003eAustralia tighten their grip on the third Test...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.368577\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eMan tried for UK student's murder\u003c/td\u003e\n \u003ctd\u003eThe trial of a man accused of murdering York b...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.350485\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n \u003ctd\u003ePonting doesn #39;t think much of Kiwis or win...\u003c/td\u003e\n \u003ctd\u003eRICKY PONTING believes the game #39;s watchers...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003ctd\u003e0.345483\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eHassan Body Found in Fallujah: Australian PM\u003c/td\u003e\n \u003ctd\u003eAustralia #39;s prime minister says a body fou...\u003c/td\u003e\n 
\u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.341777\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e5\u003c/th\u003e\n \u003ctd\u003eAussie alive after capture in Iraq\u003c/td\u003e\n \u003ctd\u003eAUSTRALIAN journalist John Martinkus is lucky ...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.334077\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e6\u003c/th\u003e\n \u003ctd\u003eA trio of television technologies\u003c/td\u003e\n \u003ctd\u003eAUSTRALIANS went into a television-buying fren...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003ctd\u003e0.332006\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e7\u003c/th\u003e\n \u003ctd\u003eAustralia PM Gets Down to Work on Fourth Term ...\u003c/td\u003e\n \u003ctd\u003eReuters - Australia's conservative Prime Minis...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.324335\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e8\u003c/th\u003e\n \u003ctd\u003ePolice pull body of lost autistic man, 46, fro...\u003c/td\u003e\n \u003ctd\u003eCanadian Press - OAKVILLE, Ont. (CP) - The bod...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.322738\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e9\u003c/th\u003e\n \u003ctd\u003eAustralia targeted for first time in Iraq car ...\u003c/td\u003e\n \u003ctd\u003eAustralian troops in Baghdad came under attack...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.321895\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 All Australians accounted for in Iraq: Downer ... \n1 Cricket: Aussies dominate India \n2 Man tried for UK student's murder \n3 Ponting doesn #39;t think much of Kiwis or win... 
\n4 Hassan Body Found in Fallujah: Australian PM \n5 Aussie alive after capture in Iraq \n6 A trio of television technologies \n7 Australia PM Gets Down to Work on Fourth Term ... \n8 Police pull body of lost autistic man, 46, fro... \n9 Australia targeted for first time in Iraq car ... \n\n description genre score \n0 AFP - Australia has accounted for all its nati... World 0.445395 \n1 Australia tighten their grip on the third Test... World 0.368577 \n2 The trial of a man accused of murdering York b... World 0.350485 \n3 RICKY PONTING believes the game #39;s watchers... Sports 0.345483 \n4 Australia #39;s prime minister says a body fou... World 0.341777 \n5 AUSTRALIAN journalist John Martinkus is lucky ... World 0.334077 \n6 AUSTRALIANS went into a television-buying fren... Sci/Tech 0.332006 \n7 Reuters - Australia's conservative Prime Minis... World 0.324335 \n8 Canadian Press - OAKVILLE, Ont. (CP) - The bod... World 0.322738 \n9 Australian troops in Baghdad came under attack... World 0.321895 "},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":"from openai.embeddings_utils import get_embedding\nsearch_query = \"Articles about Aussie captures\"\nsearch_embedding = model.encode(search_query)\n\n# Create the SQL statement.\nquery_statement = \"\"\"\n SELECT\n title,\n description,\n genre,\n DOT_PRODUCT(embedding, %(embedding)s) AS score\n FROM news.news_articles\n ORDER BY score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nresults = pd.DataFrame(db_connection.execute(query_statement, dict(embedding=search_embedding)))\nresults"},{"cell_type":"markdown","id":"2c8ff862-ea5b-4960-be5b-bcd530d6e918","metadata":{},"source":"## Hybrid Search"},{"cell_type":"markdown","id":"d0b2cff3-76f8-4a35-a596-4f001a9b4c8c","metadata":{},"source":"This search finds the average of the score gotten from the semantic search and the score gotten from the key-word search and sorts the news articles by this combined score to perform an effective 
hybrid search."},{"cell_type":"code","execution_count":19,"id":"9df7073f-6a89-4528-968c-7d5c21876a83","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:13.129535Z","iopub.status.busy":"2023-08-03T20:15:13.129108Z","iopub.status.idle":"2023-08-03T20:15:13.334690Z","shell.execute_reply":"2023-08-03T20:15:13.334045Z","shell.execute_reply.started":"2023-08-03T20:15:13.129512Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003egenre\u003c/th\u003e\n \u003cth\u003esemantic_score\u003c/th\u003e\n \u003cth\u003ekeyword_score\u003c/th\u003e\n \u003cth\u003ecombined_score\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eAll Australians accounted for in Iraq: Downer ...\u003c/td\u003e\n \u003ctd\u003eAFP - Australia has accounted for all its nati...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.445395\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.222698\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eCricket: Aussies dominate India\u003c/td\u003e\n \u003ctd\u003eAustralia tighten their grip on the third Test...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.368577\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.184289\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eMan tried for UK student's 
murder\u003c/td\u003e\n \u003ctd\u003eThe trial of a man accused of murdering York b...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.350485\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.175242\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n \u003ctd\u003ePonting doesn #39;t think much of Kiwis or win...\u003c/td\u003e\n \u003ctd\u003eRICKY PONTING believes the game #39;s watchers...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003ctd\u003e0.345483\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.172742\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eHassan Body Found in Fallujah: Australian PM\u003c/td\u003e\n \u003ctd\u003eAustralia #39;s prime minister says a body fou...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.341777\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.170889\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e5\u003c/th\u003e\n \u003ctd\u003eAussie alive after capture in Iraq\u003c/td\u003e\n \u003ctd\u003eAUSTRALIAN journalist John Martinkus is lucky ...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.334077\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.167039\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e6\u003c/th\u003e\n \u003ctd\u003eA trio of television technologies\u003c/td\u003e\n \u003ctd\u003eAUSTRALIANS went into a television-buying fren...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003ctd\u003e0.332006\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.166003\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e7\u003c/th\u003e\n \u003ctd\u003eAustralia PM Gets Down to Work on Fourth Term ...\u003c/td\u003e\n \u003ctd\u003eReuters - Australia's conservative Prime Minis...\u003c/td\u003e\n 
\u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.324335\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.162168\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e8\u003c/th\u003e\n \u003ctd\u003ePolice pull body of lost autistic man, 46, fro...\u003c/td\u003e\n \u003ctd\u003eCanadian Press - OAKVILLE, Ont. (CP) - The bod...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.322738\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.161369\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e9\u003c/th\u003e\n \u003ctd\u003eAustralia targeted for first time in Iraq car ...\u003c/td\u003e\n \u003ctd\u003eAustralian troops in Baghdad came under attack...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.321895\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.160948\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 All Australians accounted for in Iraq: Downer ... \n1 Cricket: Aussies dominate India \n2 Man tried for UK student's murder \n3 Ponting doesn #39;t think much of Kiwis or win... \n4 Hassan Body Found in Fallujah: Australian PM \n5 Aussie alive after capture in Iraq \n6 A trio of television technologies \n7 Australia PM Gets Down to Work on Fourth Term ... \n8 Police pull body of lost autistic man, 46, fro... \n9 Australia targeted for first time in Iraq car ... \n\n description genre \\\n0 AFP - Australia has accounted for all its nati... World \n1 Australia tighten their grip on the third Test... World \n2 The trial of a man accused of murdering York b... World \n3 RICKY PONTING believes the game #39;s watchers... Sports \n4 Australia #39;s prime minister says a body fou... World \n5 AUSTRALIAN journalist John Martinkus is lucky ... World \n6 AUSTRALIANS went into a television-buying fren... Sci/Tech \n7 Reuters - Australia's conservative Prime Minis... 
World \n8 Canadian Press - OAKVILLE, Ont. (CP) - The bod... World \n9 Australian troops in Baghdad came under attack... World \n\n semantic_score keyword_score combined_score \n0 0.445395 0.0 0.222698 \n1 0.368577 0.0 0.184289 \n2 0.350485 0.0 0.175242 \n3 0.345483 0.0 0.172742 \n4 0.341777 0.0 0.170889 \n5 0.334077 0.0 0.167039 \n6 0.332006 0.0 0.166003 \n7 0.324335 0.0 0.162168 \n8 0.322738 0.0 0.161369 \n9 0.321895 0.0 0.160948 "},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":"hyb_query = \"Articles about Aussie captures\"\nhyb_embedding = model.encode(hyb_query)\n\n# Create the SQL statement.\nhyb_statement = \"\"\"\n SELECT\n title,\n description,\n genre,\n DOT_PRODUCT(embedding, %(embedding)s) AS semantic_score,\n MATCH(title, description) AGAINST (%(query)s) AS keyword_score,\n (semantic_score + keyword_score) / 2 AS combined_score\n FROM news.news_articles\n ORDER BY combined_score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nhyb_results = pd.DataFrame(db_connection.execute(hyb_statement, dict(embedding=hyb_embedding, query=hyb_query)))\nhyb_results"},{"cell_type":"markdown","id":"f9f6e53b-fb02-4d1a-908f-b96d1c2cdfd0","metadata":{},"source":"\u003cdiv id=\"singlestore-footer\" style=\"background-color: rgba(194, 193, 199, 0.25); height:2px; margin-bottom:10px\"\u003e\u003c/div\u003e\n\u003cdiv\u003e\u003cimg src=\"https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/singlestore-logo-grey.png\" style=\"padding: 0px; margin: 0px; height: 24px\"/\u003e\u003c/div\u003e\n\u003c/div\u003e"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 
(ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.9"},"singlestore_connection":{"connectionID":"1efa4dba-bf60-42f3-8d19-19dc6b6ffb35","defaultDatabase":"news"},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} From 8a5ea93e8fef8879a875f3ef61fc5603927c5d3f Mon Sep 17 00:00:00 2001 From: trishamaturi <63947635+trishamaturi@users.noreply.github.com> Date: Fri, 4 Aug 2023 10:58:11 -0700 Subject: [PATCH 4/4] Update notebook.ipynb changed after a few more revisions --- notebooks/hybrid-search/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/hybrid-search/notebook.ipynb b/notebooks/hybrid-search/notebook.ipynb index 91fff25..1ebd161 100644 --- a/notebooks/hybrid-search/notebook.ipynb +++ b/notebooks/hybrid-search/notebook.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","id":"505a207d-82ee-406d-bb92-e6a6900d6d18","metadata":{},"source":"\u003cdiv id=\"singlestore-header\" style=\"display: flex; background-color: rgba(209, 153, 255, 0.25); padding: 5px;\"\u003e\n \u003cdiv id=\"icon-image\" style=\"width: 90px; height: 90px;\"\u003e\n \u003cimg width=\"100%\" height=\"100%\" src=\"https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/header-icons/vector-circle.png\" /\u003e\n \u003c/div\u003e\n \u003cdiv id=\"text\" style=\"padding: 5px; margin-left: 10px;\"\u003e\n \u003cdiv id=\"badge\" style=\"display: inline-block; background-color: rgba(0, 0, 0, 0.15); border-radius: 4px; padding: 4px 8px; align-items: center; margin-top: 6px; margin-bottom: -2px; font-size: 80%\"\u003eSingleStore Notebooks\u003c/div\u003e\n \u003ch1 style=\"font-weight: 500; margin: 8px 0 0 4px;\"\u003eHybrid Search\u003c/h1\u003e\n 
\u003c/div\u003e\n\u003c/div\u003e"},{"cell_type":"markdown","id":"d9f9e629-6eb9-4ca5-bcf2-1b8672b86725","metadata":{"execution":{"iopub.execute_input":"2023-06-06T03:34:15.712942Z","iopub.status.busy":"2023-06-06T03:34:15.712613Z","iopub.status.idle":"2023-06-06T03:34:15.715753Z","shell.execute_reply":"2023-06-06T03:34:15.715128Z","shell.execute_reply.started":"2023-06-06T03:34:15.712919Z"},"tags":[]},"source":"*Source*: [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/data/AG_news_samples.csv)\n\nHybrid search integrates both keyword-based search and semantic search in order to combine the strengths of both and provide users with a more comprehensive and efficient search experience. This notebook is an example on how to perform hybrid search with SingleStore's database and notebooks."},{"cell_type":"markdown","id":"532e8d3f-007d-48a4-8d36-44b561dd1109","metadata":{},"source":"## Setup\nLet's first download the libraries necessary."},{"cell_type":"code","execution_count":4,"id":"07990b64-9447-46a8-abbc-51be1972dfda","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:26.985627Z","iopub.status.busy":"2023-08-03T20:10:26.985087Z","iopub.status.idle":"2023-08-03T20:10:40.655368Z","shell.execute_reply":"2023-08-03T20:10:40.654602Z","shell.execute_reply.started":"2023-08-03T20:10:26.985608Z"},"tags":[],"trusted":true},"outputs":[],"source":"!pip install matplotlib --quiet\n!pip install plotly.express --quiet\n!pip install scikit-learn --quiet\n!pip install tabulate --quiet\n!pip install tiktoken --quiet\n!pip install wget --quiet\n!pip install openai 
--quiet"},{"cell_type":"code","execution_count":5,"id":"a592dd5e-4114-4abf-923d-74038f5244eb","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:40.657067Z","iopub.status.busy":"2023-08-03T20:10:40.656816Z","iopub.status.idle":"2023-08-03T20:10:40.663127Z","shell.execute_reply":"2023-08-03T20:10:40.662413Z","shell.execute_reply.started":"2023-08-03T20:10:40.657044Z"},"tags":[],"trusted":true},"outputs":[],"source":"import pandas as pd\nimport os\nimport wget\nimport ast\nimport json"},{"cell_type":"code","execution_count":6,"id":"c2bffc74-4b6a-4c0f-acef-f72bb255ec79","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:40.664287Z","iopub.status.busy":"2023-08-03T20:10:40.664046Z","iopub.status.idle":"2023-08-03T20:10:45.214897Z","shell.execute_reply":"2023-08-03T20:10:45.214240Z","shell.execute_reply.started":"2023-08-03T20:10:40.664266Z"},"tags":[],"trusted":true},"outputs":[],"source":"# Import the library for vectorizing the data (Up to 2 minutes)\n!pip install sentence-transformers --quiet\nfrom sentence_transformers import SentenceTransformer\nmodel = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base')"},{"cell_type":"markdown","id":"0aa95a80-5683-4dc3-9e52-c3e890ab87af","metadata":{},"source":"## Import data from CSV File\nThis csv file holds the title, summary, and category of approximately 2000 news articles."},{"cell_type":"code","execution_count":7,"id":"b1b2971e-d0f6-4cfa-a9a7-954602bda460","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.216911Z","iopub.status.busy":"2023-08-03T20:10:45.216522Z","iopub.status.idle":"2023-08-03T20:10:45.221195Z","shell.execute_reply":"2023-08-03T20:10:45.220623Z","shell.execute_reply.started":"2023-08-03T20:10:45.216889Z"},"tags":[],"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"File already exists in the local file system.\n"}],"source":"# download reviews csv file\ncvs_file_path = 
\"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/data/AG_news_samples.csv\"\nfile_path = \"AG_news_samples.csv\"\n\nif not os.path.exists(file_path):\n wget.download(cvs_file_path, file_path)\n print(\"File downloaded successfully.\")\nelse:\n print(\"File already exists in the local file system.\")"},{"cell_type":"code","execution_count":8,"id":"6c821edd-ce7b-46d9-aa79-0ab1766266a0","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.222435Z","iopub.status.busy":"2023-08-03T20:10:45.222055Z","iopub.status.idle":"2023-08-03T20:10:45.241632Z","shell.execute_reply":"2023-08-03T20:10:45.240965Z","shell.execute_reply.started":"2023-08-03T20:10:45.222413Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003elabel\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eWorld Briefings\u003c/td\u003e\n \u003ctd\u003eBRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eNvidia Puts a Firewall on a Motherboard (PC Wo...\u003c/td\u003e\n \u003ctd\u003ePC World - Upcoming chip set will include buil...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eOlympic joy in Greek, Chinese press\u003c/td\u003e\n \u003ctd\u003eNewspapers in Greece reflect a mixture 
of exhi...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n \u003ctd\u003eU2 Can iPod with Pictures\u003c/td\u003e\n \u003ctd\u003eSAN JOSE, Calif. -- Apple Computer (Quote, Cha...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eThe Dream Factory\u003c/td\u003e\n \u003ctd\u003eAny product, any shape, any size -- manufactur...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e...\u003c/th\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003ctd\u003e...\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1995\u003c/th\u003e\n \u003ctd\u003eYou Control: iTunes puts control in OS X menu ...\u003c/td\u003e\n \u003ctd\u003eMacCentral - You Software Inc. announced on Tu...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1996\u003c/th\u003e\n \u003ctd\u003eArgentina beat Italy for place in football final\u003c/td\u003e\n \u003ctd\u003eFavourites Argentina beat Italy 3-0 this morni...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1997\u003c/th\u003e\n \u003ctd\u003eNCAA case no worry for Spurrier\u003c/td\u003e\n \u003ctd\u003eShortly after Steve Spurrier arrived at Florid...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1998\u003c/th\u003e\n \u003ctd\u003eSecret Service Busts Cyber Gangs\u003c/td\u003e\n \u003ctd\u003eThe US Secret Service Thursday announced arres...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1999\u003c/th\u003e\n \u003ctd\u003eStocks Flat; Higher Oil Limits Gains\u003c/td\u003e\n \u003ctd\u003eUS stocks were little changed on Thursday as a...\u003c/td\u003e\n 
\u003ctd\u003eBusiness\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003cp\u003e2000 rows × 3 columns\u003c/p\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 World Briefings \n1 Nvidia Puts a Firewall on a Motherboard (PC Wo... \n2 Olympic joy in Greek, Chinese press \n3 U2 Can iPod with Pictures \n4 The Dream Factory \n... ... \n1995 You Control: iTunes puts control in OS X menu ... \n1996 Argentina beat Italy for place in football final \n1997 NCAA case no worry for Spurrier \n1998 Secret Service Busts Cyber Gangs \n1999 Stocks Flat; Higher Oil Limits Gains \n\n description label \n0 BRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M... World \n1 PC World - Upcoming chip set will include buil... Sci/Tech \n2 Newspapers in Greece reflect a mixture of exhi... Sports \n3 SAN JOSE, Calif. -- Apple Computer (Quote, Cha... Sci/Tech \n4 Any product, any shape, any size -- manufactur... Sci/Tech \n... ... ... \n1995 MacCentral - You Software Inc. announced on Tu... Sci/Tech \n1996 Favourites Argentina beat Italy 3-0 this morni... Sports \n1997 Shortly after Steve Spurrier arrived at Florid... Sports \n1998 The US Secret Service Thursday announced arres... Sci/Tech \n1999 US stocks were little changed on Thursday as a... 
Business \n\n[2000 rows x 3 columns]"},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":"df = pd.read_csv('AG_news_samples.csv')\ndf.pop('label_int')\ndf"},{"cell_type":"code","execution_count":9,"id":"e30c69d3-a807-4437-84e9-6972e3bc3d85","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.242886Z","iopub.status.busy":"2023-08-03T20:10:45.242651Z","iopub.status.idle":"2023-08-03T20:10:45.246343Z","shell.execute_reply":"2023-08-03T20:10:45.245714Z","shell.execute_reply.started":"2023-08-03T20:10:45.242864Z"},"tags":[],"trusted":true},"outputs":[],"source":"data = df.values.tolist()"},{"cell_type":"markdown","id":"0b6c6560-bc60-43ba-93a4-1b4aee933d5b","metadata":{},"source":"## Set up SingleStore Database"},{"cell_type":"markdown","id":"d6a1952b-7313-4007-9ec5-4c506425190f","metadata":{},"source":"Connect to your SingleStoreDB Cloud workspaces using SQLAlchemy."},{"cell_type":"code","execution_count":10,"id":"1e8b918f-d849-4bad-b5e9-1cf8be138026","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.247824Z","iopub.status.busy":"2023-08-03T20:10:45.247190Z","iopub.status.idle":"2023-08-03T20:10:45.293473Z","shell.execute_reply":"2023-08-03T20:10:45.292864Z","shell.execute_reply.started":"2023-08-03T20:10:45.247804Z"},"tags":[],"trusted":true},"outputs":[],"source":"from sqlalchemy import *\n\ndb_connection = create_engine(connection_url).connect()"},{"cell_type":"markdown","id":"e1dd6296-54b0-4f8d-886a-13cacfc28163","metadata":{},"source":"Set up the SingleStore Database which will hold your 
data."},{"cell_type":"code","execution_count":11,"id":"e1874b6f-706a-4638-ad2a-ca387953acaa","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.294726Z","iopub.status.busy":"2023-08-03T20:10:45.294501Z","iopub.status.idle":"2023-08-03T20:11:33.916475Z","shell.execute_reply":"2023-08-03T20:11:33.915742Z","shell.execute_reply.started":"2023-08-03T20:10:45.294706Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":""},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\n-- Create the database\nDROP DATABASE IF EXISTS news;\nCREATE DATABASE IF NOT EXISTS news;"},{"cell_type":"code","execution_count":12,"id":"3f1e2c3d-6fbd-46bb-9bd3-235eb51941cf","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:11:33.917770Z","iopub.status.busy":"2023-08-03T20:11:33.917541Z","iopub.status.idle":"2023-08-03T20:11:34.179324Z","shell.execute_reply":"2023-08-03T20:11:34.178632Z","shell.execute_reply.started":"2023-08-03T20:11:33.917754Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":""},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nUSE news;\n-- Create the table\nDROP TABLE IF EXISTS news_articles;\nCREATE TABLE IF NOT EXISTS news_articles (\n title TEXT,\n description TEXT,\n genre TEXT,\n embedding BLOB,\n FULLTEXT (title, description)\n);"},{"cell_type":"markdown","id":"8bd97023-3d02-44d4-8bd3-59875cb22b6c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T06:31:11.967693Z","iopub.status.busy":"2023-06-06T06:31:11.967312Z","iopub.status.idle":"2023-06-06T06:31:11.971035Z","shell.execute_reply":"2023-06-06T06:31:11.970370Z","shell.execute_reply.started":"2023-06-06T06:31:11.967669Z"},"tags":[]},"source":"### Get embeddings for every row based on the description 
column."},{"cell_type":"code","execution_count":13,"id":"496f84d0-51b6-4b66-bf5b-b1b260e4c2de","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:11:34.181980Z","iopub.status.busy":"2023-08-03T20:11:34.181788Z","iopub.status.idle":"2023-08-03T20:14:59.572758Z","shell.execute_reply":"2023-08-03T20:14:59.571998Z","shell.execute_reply.started":"2023-08-03T20:11:34.181964Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":"(2000, 768)"},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":"# Will take around 3.5 minutes to get embeddings for all 2000 columns\n\ndescriptions = [row[1] for row in data]\nall_embeddings = model.encode(descriptions)\nall_embeddings.shape"},{"cell_type":"code","execution_count":14,"id":"05b2f3fe-c35c-4252-b416-9f7b7aec60a6","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:14:59.573852Z","iopub.status.busy":"2023-08-03T20:14:59.573612Z","iopub.status.idle":"2023-08-03T20:14:59.578230Z","shell.execute_reply":"2023-08-03T20:14:59.577595Z","shell.execute_reply.started":"2023-08-03T20:14:59.573821Z"},"tags":[],"trusted":true},"outputs":[],"source":"combined_data = [tuple(row) + (embedding,) for embedding, row in zip(all_embeddings, data)]"},{"cell_type":"markdown","id":"46b1628c-0ffc-4a84-ba8b-43e8df081b01","metadata":{},"source":"### Populate the database"},{"cell_type":"code","execution_count":15,"id":"cd3e5f9b-d9e5-45fe-ba20-4fb021d7a425","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:14:59.579579Z","iopub.status.busy":"2023-08-03T20:14:59.579094Z","iopub.status.idle":"2023-08-03T20:15:12.791488Z","shell.execute_reply":"2023-08-03T20:15:12.790862Z","shell.execute_reply.started":"2023-08-03T20:14:59.579558Z"},"tags":[],"trusted":true},"outputs":[],"source":"%sql TRUNCATE TABLE news_articles;\nstatement = '''\n INSERT INTO news.news_articles (\n title,\n description,\n genre,\n embedding\n )\n VALUES (\n %s,\n %s,\n %s,\n %s\n )\n '''\n\nfor i, row in 
enumerate(combined_data):\n try:\n db_connection.execute(statement, row)\n except Exception as e:\n print(\"Error inserting row {}: {}\".format(i, e))\n continue"},{"cell_type":"markdown","id":"a2f3d567-eaf4-487a-a1f9-2eb7e1071991","metadata":{"tags":[]},"source":"## Semantic Search"},{"cell_type":"markdown","id":"7ad3b8f6-d3a8-4954-a737-f11c785ce9ce","metadata":{},"source":"### Connect to OpenAI"},{"cell_type":"code","execution_count":16,"id":"598d7077-d04c-46b3-b7c4-7b4362dd4507","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.792954Z","iopub.status.busy":"2023-08-03T20:15:12.792546Z","iopub.status.idle":"2023-08-03T20:15:12.845759Z","shell.execute_reply":"2023-08-03T20:15:12.845167Z","shell.execute_reply.started":"2023-08-03T20:15:12.792931Z"},"tags":[],"trusted":true},"outputs":[],"source":"import openai\n\n# models\nEMBEDDING_MODEL = \"text-embedding-ada-002\"\nGPT_MODEL = \"gpt-3.5-turbo\""},{"cell_type":"code","execution_count":17,"id":"9eea2f67-3c2e-4d1a-87c2-052c2acf4026","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.847096Z","iopub.status.busy":"2023-08-03T20:15:12.846702Z","iopub.status.idle":"2023-08-03T20:15:12.850061Z","shell.execute_reply":"2023-08-03T20:15:12.849401Z","shell.execute_reply.started":"2023-08-03T20:15:12.847074Z"},"tags":[],"trusted":true},"outputs":[],"source":"openai.api_key = 'YOUR_API_KEY_HERE'"},{"cell_type":"markdown","id":"6504f561-1ab1-4dbf-a523-0aef23b66e4b","metadata":{},"source":"### Run Semantic Search and get scores"},{"cell_type":"code","execution_count":18,"id":"a62a4c06-d77a-49b1-beaf-4c54b04d001f","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.851400Z","iopub.status.busy":"2023-08-03T20:15:12.851132Z","iopub.status.idle":"2023-08-03T20:15:13.128352Z","shell.execute_reply":"2023-08-03T20:15:13.127794Z","shell.execute_reply.started":"2023-08-03T20:15:12.851379Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle 
scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003egenre\u003c/th\u003e\n \u003cth\u003escore\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eAll Australians accounted for in Iraq: Downer ...\u003c/td\u003e\n \u003ctd\u003eAFP - Australia has accounted for all its nati...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.445395\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eCricket: Aussies dominate India\u003c/td\u003e\n \u003ctd\u003eAustralia tighten their grip on the third Test...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.368577\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eMan tried for UK student's murder\u003c/td\u003e\n \u003ctd\u003eThe trial of a man accused of murdering York b...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.350485\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n \u003ctd\u003ePonting doesn #39;t think much of Kiwis or win...\u003c/td\u003e\n \u003ctd\u003eRICKY PONTING believes the game #39;s watchers...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003ctd\u003e0.345483\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eHassan Body Found in Fallujah: Australian PM\u003c/td\u003e\n \u003ctd\u003eAustralia #39;s prime minister says a body fou...\u003c/td\u003e\n 
\u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.341777\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e5\u003c/th\u003e\n \u003ctd\u003eAussie alive after capture in Iraq\u003c/td\u003e\n \u003ctd\u003eAUSTRALIAN journalist John Martinkus is lucky ...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.334077\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e6\u003c/th\u003e\n \u003ctd\u003eA trio of television technologies\u003c/td\u003e\n \u003ctd\u003eAUSTRALIANS went into a television-buying fren...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003ctd\u003e0.332006\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e7\u003c/th\u003e\n \u003ctd\u003eAustralia PM Gets Down to Work on Fourth Term ...\u003c/td\u003e\n \u003ctd\u003eReuters - Australia's conservative Prime Minis...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.324335\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e8\u003c/th\u003e\n \u003ctd\u003ePolice pull body of lost autistic man, 46, fro...\u003c/td\u003e\n \u003ctd\u003eCanadian Press - OAKVILLE, Ont. (CP) - The bod...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.322738\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e9\u003c/th\u003e\n \u003ctd\u003eAustralia targeted for first time in Iraq car ...\u003c/td\u003e\n \u003ctd\u003eAustralian troops in Baghdad came under attack...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.321895\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 All Australians accounted for in Iraq: Downer ... \n1 Cricket: Aussies dominate India \n2 Man tried for UK student's murder \n3 Ponting doesn #39;t think much of Kiwis or win... 
\n4 Hassan Body Found in Fallujah: Australian PM \n5 Aussie alive after capture in Iraq \n6 A trio of television technologies \n7 Australia PM Gets Down to Work on Fourth Term ... \n8 Police pull body of lost autistic man, 46, fro... \n9 Australia targeted for first time in Iraq car ... \n\n description genre score \n0 AFP - Australia has accounted for all its nati... World 0.445395 \n1 Australia tighten their grip on the third Test... World 0.368577 \n2 The trial of a man accused of murdering York b... World 0.350485 \n3 RICKY PONTING believes the game #39;s watchers... Sports 0.345483 \n4 Australia #39;s prime minister says a body fou... World 0.341777 \n5 AUSTRALIAN journalist John Martinkus is lucky ... World 0.334077 \n6 AUSTRALIANS went into a television-buying fren... Sci/Tech 0.332006 \n7 Reuters - Australia's conservative Prime Minis... World 0.324335 \n8 Canadian Press - OAKVILLE, Ont. (CP) - The bod... World 0.322738 \n9 Australian troops in Baghdad came under attack... World 0.321895 "},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":"from openai.embeddings_utils import get_embedding\nsearch_query = \"Articles about Aussie captures\"\nsearch_embedding = model.encode(search_query)\n\n# Create the SQL statement.\nquery_statement = \"\"\"\n SELECT\n title,\n description,\n genre,\n DOT_PRODUCT(embedding, %(embedding)s) AS score\n FROM news.news_articles\n ORDER BY score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nresults = pd.DataFrame(db_connection.execute(query_statement, dict(embedding=search_embedding)))\nresults"},{"cell_type":"markdown","id":"2c8ff862-ea5b-4960-be5b-bcd530d6e918","metadata":{},"source":"## Hybrid Search"},{"cell_type":"markdown","id":"d0b2cff3-76f8-4a35-a596-4f001a9b4c8c","metadata":{},"source":"This search finds the average of the score gotten from the semantic search and the score gotten from the key-word search and sorts the news articles by this combined score to perform an effective 
hybrid search."},{"cell_type":"code","execution_count":19,"id":"9df7073f-6a89-4528-968c-7d5c21876a83","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:13.129535Z","iopub.status.busy":"2023-08-03T20:15:13.129108Z","iopub.status.idle":"2023-08-03T20:15:13.334690Z","shell.execute_reply":"2023-08-03T20:15:13.334045Z","shell.execute_reply.started":"2023-08-03T20:15:13.129512Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":"\u003cdiv\u003e\n\u003cstyle scoped\u003e\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n\u003c/style\u003e\n\u003ctable border=\"1\" class=\"dataframe\"\u003e\n \u003cthead\u003e\n \u003ctr style=\"text-align: right;\"\u003e\n \u003cth\u003e\u003c/th\u003e\n \u003cth\u003etitle\u003c/th\u003e\n \u003cth\u003edescription\u003c/th\u003e\n \u003cth\u003egenre\u003c/th\u003e\n \u003cth\u003esemantic_score\u003c/th\u003e\n \u003cth\u003ekeyword_score\u003c/th\u003e\n \u003cth\u003ecombined_score\u003c/th\u003e\n \u003c/tr\u003e\n \u003c/thead\u003e\n \u003ctbody\u003e\n \u003ctr\u003e\n \u003cth\u003e0\u003c/th\u003e\n \u003ctd\u003eAll Australians accounted for in Iraq: Downer ...\u003c/td\u003e\n \u003ctd\u003eAFP - Australia has accounted for all its nati...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.445395\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.222698\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e1\u003c/th\u003e\n \u003ctd\u003eCricket: Aussies dominate India\u003c/td\u003e\n \u003ctd\u003eAustralia tighten their grip on the third Test...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.368577\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.184289\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e2\u003c/th\u003e\n \u003ctd\u003eMan tried for UK student's 
murder\u003c/td\u003e\n \u003ctd\u003eThe trial of a man accused of murdering York b...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.350485\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.175242\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e3\u003c/th\u003e\n \u003ctd\u003ePonting doesn #39;t think much of Kiwis or win...\u003c/td\u003e\n \u003ctd\u003eRICKY PONTING believes the game #39;s watchers...\u003c/td\u003e\n \u003ctd\u003eSports\u003c/td\u003e\n \u003ctd\u003e0.345483\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.172742\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e4\u003c/th\u003e\n \u003ctd\u003eHassan Body Found in Fallujah: Australian PM\u003c/td\u003e\n \u003ctd\u003eAustralia #39;s prime minister says a body fou...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.341777\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.170889\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e5\u003c/th\u003e\n \u003ctd\u003eAussie alive after capture in Iraq\u003c/td\u003e\n \u003ctd\u003eAUSTRALIAN journalist John Martinkus is lucky ...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.334077\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.167039\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e6\u003c/th\u003e\n \u003ctd\u003eA trio of television technologies\u003c/td\u003e\n \u003ctd\u003eAUSTRALIANS went into a television-buying fren...\u003c/td\u003e\n \u003ctd\u003eSci/Tech\u003c/td\u003e\n \u003ctd\u003e0.332006\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.166003\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e7\u003c/th\u003e\n \u003ctd\u003eAustralia PM Gets Down to Work on Fourth Term ...\u003c/td\u003e\n \u003ctd\u003eReuters - Australia's conservative Prime Minis...\u003c/td\u003e\n 
\u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.324335\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.162168\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e8\u003c/th\u003e\n \u003ctd\u003ePolice pull body of lost autistic man, 46, fro...\u003c/td\u003e\n \u003ctd\u003eCanadian Press - OAKVILLE, Ont. (CP) - The bod...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.322738\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.161369\u003c/td\u003e\n \u003c/tr\u003e\n \u003ctr\u003e\n \u003cth\u003e9\u003c/th\u003e\n \u003ctd\u003eAustralia targeted for first time in Iraq car ...\u003c/td\u003e\n \u003ctd\u003eAustralian troops in Baghdad came under attack...\u003c/td\u003e\n \u003ctd\u003eWorld\u003c/td\u003e\n \u003ctd\u003e0.321895\u003c/td\u003e\n \u003ctd\u003e0.0\u003c/td\u003e\n \u003ctd\u003e0.160948\u003c/td\u003e\n \u003c/tr\u003e\n \u003c/tbody\u003e\n\u003c/table\u003e\n\u003c/div\u003e","text/plain":" title \\\n0 All Australians accounted for in Iraq: Downer ... \n1 Cricket: Aussies dominate India \n2 Man tried for UK student's murder \n3 Ponting doesn #39;t think much of Kiwis or win... \n4 Hassan Body Found in Fallujah: Australian PM \n5 Aussie alive after capture in Iraq \n6 A trio of television technologies \n7 Australia PM Gets Down to Work on Fourth Term ... \n8 Police pull body of lost autistic man, 46, fro... \n9 Australia targeted for first time in Iraq car ... \n\n description genre \\\n0 AFP - Australia has accounted for all its nati... World \n1 Australia tighten their grip on the third Test... World \n2 The trial of a man accused of murdering York b... World \n3 RICKY PONTING believes the game #39;s watchers... Sports \n4 Australia #39;s prime minister says a body fou... World \n5 AUSTRALIAN journalist John Martinkus is lucky ... World \n6 AUSTRALIANS went into a television-buying fren... Sci/Tech \n7 Reuters - Australia's conservative Prime Minis... 
World \n8 Canadian Press - OAKVILLE, Ont. (CP) - The bod... World \n9 Australian troops in Baghdad came under attack... World \n\n semantic_score keyword_score combined_score \n0 0.445395 0.0 0.222698 \n1 0.368577 0.0 0.184289 \n2 0.350485 0.0 0.175242 \n3 0.345483 0.0 0.172742 \n4 0.341777 0.0 0.170889 \n5 0.334077 0.0 0.167039 \n6 0.332006 0.0 0.166003 \n7 0.324335 0.0 0.162168 \n8 0.322738 0.0 0.161369 \n9 0.321895 0.0 0.160948 "},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":"hyb_query = \"Articles about Aussie captures\"\nhyb_embedding = model.encode(hyb_query)\n\n# Create the SQL statement.\nhyb_statement = \"\"\"\n SELECT\n title,\n description,\n genre,\n DOT_PRODUCT(embedding, %(embedding)s) AS semantic_score,\n MATCH(title, description) AGAINST (%(query)s) AS keyword_score,\n (semantic_score + keyword_score) / 2 AS combined_score\n FROM news.news_articles\n ORDER BY combined_score DESC\n LIMIT 10\n \"\"\"\n\n# Execute the SQL statement.\nhyb_results = pd.DataFrame(db_connection.execute(hyb_statement, dict(embedding=hyb_embedding, query=hyb_query)))\nhyb_results"},{"cell_type":"markdown","id":"f9f6e53b-fb02-4d1a-908f-b96d1c2cdfd0","metadata":{},"source":"\u003cdiv id=\"singlestore-footer\" style=\"background-color: rgba(194, 193, 199, 0.25); height:2px; margin-bottom:10px\"\u003e\u003c/div\u003e\n\u003cdiv\u003e\u003cimg src=\"https://raw.githubusercontent.com/singlestore-labs/spaces-notebooks/master/common/images/singlestore-logo-grey.png\" style=\"padding: 0px; margin: 0px; height: 24px\"/\u003e\u003c/div\u003e\n\u003c/div\u003e"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 
(ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.9"},"singlestore_connection":{"connectionID":"1efa4dba-bf60-42f3-8d19-19dc6b6ffb35","defaultDatabase":"news"},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5} +{"cells":[{"cell_type":"markdown","id":"505a207d-82ee-406d-bb92-e6a6900d6d18","metadata":{},"source":["
\n","
\n"," \n","
\n","
\n","
SingleStore Notebooks
\n","

Hybrid Search

\n","
\n","
"]},{"cell_type":"markdown","id":"f3a978dd","metadata":{},"source":[]},{"cell_type":"markdown","id":"d9f9e629-6eb9-4ca5-bcf2-1b8672b86725","metadata":{"execution":{"iopub.execute_input":"2023-06-06T03:34:15.712942Z","iopub.status.busy":"2023-06-06T03:34:15.712613Z","iopub.status.idle":"2023-06-06T03:34:15.715753Z","shell.execute_reply":"2023-06-06T03:34:15.715128Z","shell.execute_reply.started":"2023-06-06T03:34:15.712919Z"},"tags":[]},"source":["*Source*: [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/data/AG_news_samples.csv)\n","\n","Hybrid search integrates both keyword-based search and semantic search in order to combine the strengths of both and provide users with a more comprehensive and efficient search experience. This notebook is an example of how to perform hybrid search with SingleStore's database and notebooks."]},{"cell_type":"markdown","id":"532e8d3f-007d-48a4-8d36-44b561dd1109","metadata":{},"source":["## Setup\n","Let's first install the necessary libraries."]},{"cell_type":"code","execution_count":5,"id":"07990b64-9447-46a8-abbc-51be1972dfda","metadata":{"execution":{"iopub.execute_input":"2023-08-04T17:43:48.417768Z","iopub.status.busy":"2023-08-04T17:43:48.417505Z","iopub.status.idle":"2023-08-04T17:44:11.220790Z","shell.execute_reply":"2023-08-04T17:44:11.219877Z","shell.execute_reply.started":"2023-08-04T17:43:48.417751Z"},"tags":[],"trusted":true},"outputs":[],"source":["!pip install matplotlib --quiet\n","!pip install plotly.express --quiet\n","!pip install scikit-learn --quiet\n","!pip install tabulate --quiet\n","!pip install tiktoken --quiet\n","!pip install wget --quiet\n","!pip install openai 
--quiet"]},{"cell_type":"code","execution_count":6,"id":"a592dd5e-4114-4abf-923d-74038f5244eb","metadata":{"execution":{"iopub.execute_input":"2023-08-04T17:44:11.226906Z","iopub.status.busy":"2023-08-04T17:44:11.224865Z","iopub.status.idle":"2023-08-04T17:44:11.232457Z","shell.execute_reply":"2023-08-04T17:44:11.231887Z","shell.execute_reply.started":"2023-08-04T17:44:11.226882Z"},"tags":[],"trusted":true},"outputs":[],"source":["import pandas as pd\n","import os\n","import wget\n","import json"]},{"cell_type":"code","execution_count":7,"id":"c2bffc74-4b6a-4c0f-acef-f72bb255ec79","metadata":{"execution":{"iopub.execute_input":"2023-08-04T17:44:11.235361Z","iopub.status.busy":"2023-08-04T17:44:11.235113Z","iopub.status.idle":"2023-08-04T17:45:38.257796Z","shell.execute_reply":"2023-08-04T17:45:38.257117Z","shell.execute_reply.started":"2023-08-04T17:44:11.235346Z"},"tags":[],"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7746987e293444e382f2b9efa253c86d","version_major":2,"version_minor":0},"text/plain":["Downloading (…)e933c/.gitattributes: 0%| | 0.00/737 [00:00\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
titledescriptionlabel
0World BriefingsBRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M...World
1Nvidia Puts a Firewall on a Motherboard (PC Wo...PC World - Upcoming chip set will include buil...Sci/Tech
2Olympic joy in Greek, Chinese pressNewspapers in Greece reflect a mixture of exhi...Sports
3U2 Can iPod with PicturesSAN JOSE, Calif. -- Apple Computer (Quote, Cha...Sci/Tech
4The Dream FactoryAny product, any shape, any size -- manufactur...Sci/Tech
............
1995You Control: iTunes puts control in OS X menu ...MacCentral - You Software Inc. announced on Tu...Sci/Tech
1996Argentina beat Italy for place in football finalFavourites Argentina beat Italy 3-0 this morni...Sports
1997NCAA case no worry for SpurrierShortly after Steve Spurrier arrived at Florid...Sports
1998Secret Service Busts Cyber GangsThe US Secret Service Thursday announced arres...Sci/Tech
1999Stocks Flat; Higher Oil Limits GainsUS stocks were little changed on Thursday as a...Business
\n","

2000 rows × 3 columns

\n",""],"text/plain":[" title \\\n","0 World Briefings \n","1 Nvidia Puts a Firewall on a Motherboard (PC Wo... \n","2 Olympic joy in Greek, Chinese press \n","3 U2 Can iPod with Pictures \n","4 The Dream Factory \n","... ... \n","1995 You Control: iTunes puts control in OS X menu ... \n","1996 Argentina beat Italy for place in football final \n","1997 NCAA case no worry for Spurrier \n","1998 Secret Service Busts Cyber Gangs \n","1999 Stocks Flat; Higher Oil Limits Gains \n","\n"," description label \n","0 BRITAIN: BLAIR WARNS OF CLIMATE THREAT Prime M... World \n","1 PC World - Upcoming chip set will include buil... Sci/Tech \n","2 Newspapers in Greece reflect a mixture of exhi... Sports \n","3 SAN JOSE, Calif. -- Apple Computer (Quote, Cha... Sci/Tech \n","4 Any product, any shape, any size -- manufactur... Sci/Tech \n","... ... ... \n","1995 MacCentral - You Software Inc. announced on Tu... Sci/Tech \n","1996 Favourites Argentina beat Italy 3-0 this morni... Sports \n","1997 Shortly after Steve Spurrier arrived at Florid... Sports \n","1998 The US Secret Service Thursday announced arres... Sci/Tech \n","1999 US stocks were little changed on Thursday as a... 
Business \n","\n","[2000 rows x 3 columns]"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["df = pd.read_csv('AG_news_samples.csv')\n","df.pop('label_int')\n","df"]},{"cell_type":"code","execution_count":16,"id":"e30c69d3-a807-4437-84e9-6972e3bc3d85","metadata":{"execution":{"iopub.execute_input":"2023-08-04T17:46:28.813086Z","iopub.status.busy":"2023-08-04T17:46:28.812535Z","iopub.status.idle":"2023-08-04T17:46:28.817391Z","shell.execute_reply":"2023-08-04T17:46:28.816738Z","shell.execute_reply.started":"2023-08-04T17:46:28.813067Z"},"tags":[],"trusted":true},"outputs":[],"source":["data = df.values.tolist()"]},{"cell_type":"markdown","id":"0b6c6560-bc60-43ba-93a4-1b4aee933d5b","metadata":{},"source":["## Set up SingleStore Database"]},{"cell_type":"markdown","id":"d6a1952b-7313-4007-9ec5-4c506425190f","metadata":{},"source":["Connect to your SingleStoreDB Cloud workspaces using SQLAlchemy."]},{"cell_type":"code","execution_count":17,"id":"1e8b918f-d849-4bad-b5e9-1cf8be138026","metadata":{"execution":{"iopub.execute_input":"2023-08-04T17:46:30.588682Z","iopub.status.busy":"2023-08-04T17:46:30.588364Z","iopub.status.idle":"2023-08-04T17:46:30.647333Z","shell.execute_reply":"2023-08-04T17:46:30.646647Z","shell.execute_reply.started":"2023-08-04T17:46:30.588665Z"},"tags":[],"trusted":true},"outputs":[],"source":["from singlestoredb import create_engine\n","\n","db_connection = create_engine().connect()"]},{"cell_type":"markdown","id":"e1dd6296-54b0-4f8d-886a-13cacfc28163","metadata":{},"source":["Set up the SingleStore Database which will hold your 
data."]},{"cell_type":"code","execution_count":11,"id":"e1874b6f-706a-4638-ad2a-ca387953acaa","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:10:45.294726Z","iopub.status.busy":"2023-08-03T20:10:45.294501Z","iopub.status.idle":"2023-08-03T20:11:33.916475Z","shell.execute_reply":"2023-08-03T20:11:33.915742Z","shell.execute_reply.started":"2023-08-03T20:10:45.294706Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":[]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["%%sql\n","-- Create the database\n","DROP DATABASE IF EXISTS news;\n","CREATE DATABASE IF NOT EXISTS news;"]},{"cell_type":"markdown","id":"553f42af-0b29-4e11-a54b-9879447b2a27","metadata":{},"source":["
\n"," \n","
\n","

Action Required

\n","

Make sure to select the news database from the drop-down menu at the top of this notebook. Selecting it updates the connection_url, which the %%sql magic command and SQLAlchemy use to connect to the selected database.\n","

\n","
\n","
"]},{"cell_type":"code","execution_count":12,"id":"3f1e2c3d-6fbd-46bb-9bd3-235eb51941cf","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:11:33.917770Z","iopub.status.busy":"2023-08-03T20:11:33.917541Z","iopub.status.idle":"2023-08-03T20:11:34.179324Z","shell.execute_reply":"2023-08-03T20:11:34.178632Z","shell.execute_reply.started":"2023-08-03T20:11:33.917754Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":[]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["%%sql\n","-- Create the table\n","DROP TABLE IF EXISTS news_articles;\n","CREATE TABLE IF NOT EXISTS news_articles (\n"," title TEXT,\n"," description TEXT,\n"," genre TEXT,\n"," embedding BLOB,\n"," FULLTEXT (title, description)\n",");"]},{"cell_type":"markdown","id":"8bd97023-3d02-44d4-8bd3-59875cb22b6c","metadata":{"execution":{"iopub.execute_input":"2023-06-06T06:31:11.967693Z","iopub.status.busy":"2023-06-06T06:31:11.967312Z","iopub.status.idle":"2023-06-06T06:31:11.971035Z","shell.execute_reply":"2023-06-06T06:31:11.970370Z","shell.execute_reply.started":"2023-06-06T06:31:11.967669Z"},"tags":[]},"source":["### Get embeddings for every row based on the description column."]},{"cell_type":"code","execution_count":13,"id":"496f84d0-51b6-4b66-bf5b-b1b260e4c2de","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:11:34.181980Z","iopub.status.busy":"2023-08-03T20:11:34.181788Z","iopub.status.idle":"2023-08-03T20:14:59.572758Z","shell.execute_reply":"2023-08-03T20:14:59.571998Z","shell.execute_reply.started":"2023-08-03T20:11:34.181964Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/plain":["(2000, 768)"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["# Will take around 3.5 minutes to get embeddings for all 2000 rows\n","\n","descriptions = [row[1] for row in data]\n","all_embeddings = 
model.encode(descriptions)\n","all_embeddings.shape"]},{"cell_type":"code","execution_count":14,"id":"05b2f3fe-c35c-4252-b416-9f7b7aec60a6","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:14:59.573852Z","iopub.status.busy":"2023-08-03T20:14:59.573612Z","iopub.status.idle":"2023-08-03T20:14:59.578230Z","shell.execute_reply":"2023-08-03T20:14:59.577595Z","shell.execute_reply.started":"2023-08-03T20:14:59.573821Z"},"tags":[],"trusted":true},"outputs":[],"source":["combined_data = [tuple(row) + (embedding,) for embedding, row in zip(all_embeddings, data)]"]},{"cell_type":"markdown","id":"46b1628c-0ffc-4a84-ba8b-43e8df081b01","metadata":{},"source":["### Populate the database"]},{"cell_type":"code","execution_count":15,"id":"cd3e5f9b-d9e5-45fe-ba20-4fb021d7a425","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:14:59.579579Z","iopub.status.busy":"2023-08-03T20:14:59.579094Z","iopub.status.idle":"2023-08-03T20:15:12.791488Z","shell.execute_reply":"2023-08-03T20:15:12.790862Z","shell.execute_reply.started":"2023-08-03T20:14:59.579558Z"},"tags":[],"trusted":true},"outputs":[],"source":["%sql TRUNCATE TABLE news_articles;\n","statement = '''\n"," INSERT INTO news.news_articles (\n"," title,\n"," description,\n"," genre,\n"," embedding\n"," )\n"," VALUES (\n"," %s,\n"," %s,\n"," %s,\n"," %s\n"," )\n"," '''\n","\n","for i, row in enumerate(combined_data):\n"," try:\n"," db_connection.execute(statement, row)\n"," except Exception as e:\n"," print(\"Error inserting row {}: {}\".format(i, e))\n"," continue"]},{"cell_type":"markdown","id":"a2f3d567-eaf4-487a-a1f9-2eb7e1071991","metadata":{"tags":[]},"source":["## Semantic Search"]},{"cell_type":"markdown","id":"7ad3b8f6-d3a8-4954-a737-f11c785ce9ce","metadata":{},"source":["### Connect to 
OpenAI"]},{"cell_type":"code","execution_count":16,"id":"598d7077-d04c-46b3-b7c4-7b4362dd4507","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.792954Z","iopub.status.busy":"2023-08-03T20:15:12.792546Z","iopub.status.idle":"2023-08-03T20:15:12.845759Z","shell.execute_reply":"2023-08-03T20:15:12.845167Z","shell.execute_reply.started":"2023-08-03T20:15:12.792931Z"},"tags":[],"trusted":true},"outputs":[],"source":["import openai\n","\n","# models\n","EMBEDDING_MODEL = \"text-embedding-ada-002\"\n","GPT_MODEL = \"gpt-3.5-turbo\""]},{"cell_type":"code","execution_count":17,"id":"9eea2f67-3c2e-4d1a-87c2-052c2acf4026","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.847096Z","iopub.status.busy":"2023-08-03T20:15:12.846702Z","iopub.status.idle":"2023-08-03T20:15:12.850061Z","shell.execute_reply":"2023-08-03T20:15:12.849401Z","shell.execute_reply.started":"2023-08-03T20:15:12.847074Z"},"tags":[],"trusted":true},"outputs":[],"source":["openai.api_key = 'YOUR_API_KEY_HERE'"]},{"cell_type":"markdown","id":"6504f561-1ab1-4dbf-a523-0aef23b66e4b","metadata":{},"source":["### Run Semantic Search and get scores"]},{"cell_type":"code","execution_count":18,"id":"a62a4c06-d77a-49b1-beaf-4c54b04d001f","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:12.851400Z","iopub.status.busy":"2023-08-03T20:15:12.851132Z","iopub.status.idle":"2023-08-03T20:15:13.128352Z","shell.execute_reply":"2023-08-03T20:15:13.127794Z","shell.execute_reply.started":"2023-08-03T20:15:12.851379Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
titledescriptiongenrescore
0All Australians accounted for in Iraq: Downer ...AFP - Australia has accounted for all its nati...World0.445395
1Cricket: Aussies dominate IndiaAustralia tighten their grip on the third Test...World0.368577
2Man tried for UK student's murderThe trial of a man accused of murdering York b...World0.350485
3Ponting doesn #39;t think much of Kiwis or win...RICKY PONTING believes the game #39;s watchers...Sports0.345483
4Hassan Body Found in Fallujah: Australian PMAustralia #39;s prime minister says a body fou...World0.341777
5Aussie alive after capture in IraqAUSTRALIAN journalist John Martinkus is lucky ...World0.334077
6A trio of television technologiesAUSTRALIANS went into a television-buying fren...Sci/Tech0.332006
7Australia PM Gets Down to Work on Fourth Term ...Reuters - Australia's conservative Prime Minis...World0.324335
8Police pull body of lost autistic man, 46, fro...Canadian Press - OAKVILLE, Ont. (CP) - The bod...World0.322738
9Australia targeted for first time in Iraq car ...Australian troops in Baghdad came under attack...World0.321895
\n","
"],"text/plain":[" title \\\n","0 All Australians accounted for in Iraq: Downer ... \n","1 Cricket: Aussies dominate India \n","2 Man tried for UK student's murder \n","3 Ponting doesn #39;t think much of Kiwis or win... \n","4 Hassan Body Found in Fallujah: Australian PM \n","5 Aussie alive after capture in Iraq \n","6 A trio of television technologies \n","7 Australia PM Gets Down to Work on Fourth Term ... \n","8 Police pull body of lost autistic man, 46, fro... \n","9 Australia targeted for first time in Iraq car ... \n","\n"," description genre score \n","0 AFP - Australia has accounted for all its nati... World 0.445395 \n","1 Australia tighten their grip on the third Test... World 0.368577 \n","2 The trial of a man accused of murdering York b... World 0.350485 \n","3 RICKY PONTING believes the game #39;s watchers... Sports 0.345483 \n","4 Australia #39;s prime minister says a body fou... World 0.341777 \n","5 AUSTRALIAN journalist John Martinkus is lucky ... World 0.334077 \n","6 AUSTRALIANS went into a television-buying fren... Sci/Tech 0.332006 \n","7 Reuters - Australia's conservative Prime Minis... World 0.324335 \n","8 Canadian Press - OAKVILLE, Ont. (CP) - The bod... World 0.322738 \n","9 Australian troops in Baghdad came under attack... 
World 0.321895 "]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["from openai.embeddings_utils import get_embedding\n","search_query = \"Articles about Aussie captures\"\n","search_embedding = model.encode(search_query)\n","\n","# Create the SQL statement.\n","query_statement = \"\"\"\n"," SELECT\n"," title,\n"," description,\n"," genre,\n"," DOT_PRODUCT(embedding, %(embedding)s) AS score\n"," FROM news.news_articles\n"," ORDER BY score DESC\n"," LIMIT 10\n"," \"\"\"\n","\n","# Execute the SQL statement.\n","results = pd.DataFrame(db_connection.execute(query_statement, dict(embedding=search_embedding)))\n","results"]},{"cell_type":"markdown","id":"2c8ff862-ea5b-4960-be5b-bcd530d6e918","metadata":{},"source":["## Hybrid Search"]},{"cell_type":"markdown","id":"d0b2cff3-76f8-4a35-a596-4f001a9b4c8c","metadata":{},"source":["This search averages the semantic-search score and the keyword-search score for each news article, then sorts the articles by this combined score to perform an effective hybrid search."]},{"cell_type":"code","execution_count":19,"id":"9df7073f-6a89-4528-968c-7d5c21876a83","metadata":{"execution":{"iopub.execute_input":"2023-08-03T20:15:13.129535Z","iopub.status.busy":"2023-08-03T20:15:13.129108Z","iopub.status.idle":"2023-08-03T20:15:13.334690Z","shell.execute_reply":"2023-08-03T20:15:13.334045Z","shell.execute_reply.started":"2023-08-03T20:15:13.129512Z"},"tags":[],"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
titledescriptiongenresemantic_scorekeyword_scorecombined_score
0All Australians accounted for in Iraq: Downer ...AFP - Australia has accounted for all its nati...World0.4453950.00.222698
1Cricket: Aussies dominate IndiaAustralia tighten their grip on the third Test...World0.3685770.00.184289
2Man tried for UK student's murderThe trial of a man accused of murdering York b...World0.3504850.00.175242
3Ponting doesn #39;t think much of Kiwis or win...RICKY PONTING believes the game #39;s watchers...Sports0.3454830.00.172742
4Hassan Body Found in Fallujah: Australian PMAustralia #39;s prime minister says a body fou...World0.3417770.00.170889
5Aussie alive after capture in IraqAUSTRALIAN journalist John Martinkus is lucky ...World0.3340770.00.167039
6A trio of television technologiesAUSTRALIANS went into a television-buying fren...Sci/Tech0.3320060.00.166003
7Australia PM Gets Down to Work on Fourth Term ...Reuters - Australia's conservative Prime Minis...World0.3243350.00.162168
8Police pull body of lost autistic man, 46, fro...Canadian Press - OAKVILLE, Ont. (CP) - The bod...World0.3227380.00.161369
9Australia targeted for first time in Iraq car ...Australian troops in Baghdad came under attack...World0.3218950.00.160948
\n","
"],"text/plain":[" title \\\n","0 All Australians accounted for in Iraq: Downer ... \n","1 Cricket: Aussies dominate India \n","2 Man tried for UK student's murder \n","3 Ponting doesn #39;t think much of Kiwis or win... \n","4 Hassan Body Found in Fallujah: Australian PM \n","5 Aussie alive after capture in Iraq \n","6 A trio of television technologies \n","7 Australia PM Gets Down to Work on Fourth Term ... \n","8 Police pull body of lost autistic man, 46, fro... \n","9 Australia targeted for first time in Iraq car ... \n","\n"," description genre \\\n","0 AFP - Australia has accounted for all its nati... World \n","1 Australia tighten their grip on the third Test... World \n","2 The trial of a man accused of murdering York b... World \n","3 RICKY PONTING believes the game #39;s watchers... Sports \n","4 Australia #39;s prime minister says a body fou... World \n","5 AUSTRALIAN journalist John Martinkus is lucky ... World \n","6 AUSTRALIANS went into a television-buying fren... Sci/Tech \n","7 Reuters - Australia's conservative Prime Minis... World \n","8 Canadian Press - OAKVILLE, Ont. (CP) - The bod... World \n","9 Australian troops in Baghdad came under attack... 
World \n","\n"," semantic_score keyword_score combined_score \n","0 0.445395 0.0 0.222698 \n","1 0.368577 0.0 0.184289 \n","2 0.350485 0.0 0.175242 \n","3 0.345483 0.0 0.172742 \n","4 0.341777 0.0 0.170889 \n","5 0.334077 0.0 0.167039 \n","6 0.332006 0.0 0.166003 \n","7 0.324335 0.0 0.162168 \n","8 0.322738 0.0 0.161369 \n","9 0.321895 0.0 0.160948 "]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["hyb_query = \"Articles about Aussie captures\"\n","hyb_embedding = model.encode(hyb_query)\n","\n","# Create the SQL statement.\n","hyb_statement = \"\"\"\n"," SELECT\n"," title,\n"," description,\n"," genre,\n"," DOT_PRODUCT(embedding, %(embedding)s) AS semantic_score,\n"," MATCH(title, description) AGAINST (%(query)s) AS keyword_score,\n"," (semantic_score + keyword_score) / 2 AS combined_score\n"," FROM news.news_articles\n"," ORDER BY combined_score DESC\n"," LIMIT 10\n"," \"\"\"\n","\n","# Execute the SQL statement.\n","hyb_results = pd.DataFrame(db_connection.execute(hyb_statement, dict(embedding=hyb_embedding, query=hyb_query)))\n","hyb_results"]},{"cell_type":"markdown","id":"f9f6e53b-fb02-4d1a-908f-b96d1c2cdfd0","metadata":{},"source":["
\n","
\n",""]}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.9"},"singlestore_connection":{"connectionID":"1efa4dba-bf60-42f3-8d19-19dc6b6ffb35","defaultDatabase":""},"singlestore_row_limit":300},"nbformat":4,"nbformat_minor":5}