From cab4780866cc5cd92e013884bb7b29630ed4854d Mon Sep 17 00:00:00 2001 From: eliotu Date: Tue, 20 Aug 2024 11:53:10 +0200 Subject: [PATCH] pipeline test --- .gitignore | 2 +- react_notebooks/final_file.viz | 12487 ++++++++++++++++--------------- src/VizContent.tsx | 6 +- src/colorScheme.ts | 1 + 4 files changed, 6275 insertions(+), 6221 deletions(-) diff --git a/.gitignore b/.gitignore index 187fd12..8d60b68 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ backend/api_key.txt jupyterlab_apod/labextension # Version file is handled by hatchling jupyterlab_apod/_version.py - +/secrets # Integration tests ui-tests/test-results/ ui-tests/playwright-report/ diff --git a/react_notebooks/final_file.viz b/react_notebooks/final_file.viz index ec60b5b..54925a4 100644 --- a/react_notebooks/final_file.viz +++ b/react_notebooks/final_file.viz @@ -1,2926 +1,2938 @@ { "notebooks": [{ "cells": [{ - "cell_id": 44, - "code": "submission = pd.read_csv(\"../input/nlp-getting-started/sample_submission.csv\")\ntest_pred = model_glove.predict(X_test_seq)\ntest_pred_int = test_pred.round().astype('int')\nsubmission['target'] = test_pred_int\nsubmission.to_csv('submission.csv', index=False)", + "cell_id": 22, + "code": "sub = pd.read_csv(dir_path + \"sample_submission.csv\")\nprediction = (F.softmax(test_preds[0], dim=1)[:, 1]>min_threshold).int()\nsub = pd.read_csv(dir_path + \"sample_submission.csv\")\nsub[\"target\"] = prediction\nsub.to_csv(\"submission.csv\", index=False)", "class": "Data Export", - "desc": "This code snippet loads a sample submission file, generates predictions on the test data using the GloVe-embeddings-based model, assigns the predictions to the 'target' column of the submission DataFrame, and saves it to a CSV file.", + "desc": "The code snippet loads a sample submission file into a DataFrame, generates binary predictions using the previously determined optimal threshold, updates the \"target\" column of the submission DataFrame with these predictions, and exports the final submission DataFrame to a CSV file named \"submission.csv\" without including the index.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.99928826 + "predicted_subclass_probability": 0.974422 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 1, - "code": "train_data = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntrain_data.head(5)", - "class": "Data Extraction", - "desc": "This code snippet reads a CSV file containing training data into a pandas DataFrame and displays the first five rows.", - "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9996433 - }, - "cluster": 1 - }, { - "cell_id": 2, - "code": "test_data = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\ntest_data.head(5)", + "code": "dir_path = \"/kaggle/input/nlp-getting-started/\"\ntrain_df = pd.read_csv(dir_path + \"train.csv\")\ntest_df = pd.read_csv(dir_path + \"test.csv\")", "class": "Data Extraction", - "desc": "This code snippet reads a CSV file containing test data into a pandas DataFrame and displays the first five rows.", + "desc": "The code snippet reads CSV files from the specified directory path and loads them into Pandas DataFrames named `train_df` and `test_df`.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.9996711 - }, - "cluster": 1 - }, { - "cell_id": 34, - "code": "# Loading 
the embedding dictionary from file\n\nembedding_dict={}\nwith open('../input/glovetwitter27b100dtxt/glove.twitter.27B.100d.txt','r') as f:\n for line in f:\n values=line.split()\n word = values[0]\n vectors=np.asarray(values[1:],'float32')\n embedding_dict[word]=vectors\nf.close()", - "class": "Data Extraction", - "desc": "This code snippet loads pre-trained GloVe word embeddings from a file and stores them in a dictionary mapping words to their corresponding vector representations.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.23091643 + "predicted_subclass_probability": 0.99974483 }, "cluster": 0 }, { - "cell_id": 7, - "code": "from bs4 import BeautifulSoup # Text Cleaning\nimport re, string # Regular Expressions, String\nfrom nltk.corpus import stopwords # stopwords\nfrom nltk.stem.porter import PorterStemmer # for word stemming\nfrom nltk.stem import WordNetLemmatizer # for word lemmatization\nimport unicodedata\nimport html\n\n# set of stopwords to be removed from text\nstop = set(stopwords.words('english'))\n\n# update stopwords to have punctuation too\nstop.update(list(string.punctuation))\n\ndef clean_tweets(text):\n \n # Remove unwanted html characters\n re1 = re.compile(r' +')\n x1 = text.lower().replace('#39;', \"'\").replace('amp;', '&').replace('#146;', \"'\").replace(\n 'nbsp;', ' ').replace('#36;', '$').replace('\\\\n', \"\\n\").replace('quot;', \"'\").replace(\n '
', \"\\n\").replace('\\\\\"', '\"').replace('', 'u_n').replace(' @.@ ', '.').replace(\n ' @-@ ', '-').replace('\\\\', ' \\\\ ')\n text = re1.sub(' ', html.unescape(x1))\n \n # remove non-ascii characters\n text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n \n # strip html\n soup = BeautifulSoup(text, 'html.parser')\n text = soup.get_text()\n \n # remove between square brackets\n text = re.sub('\\[[^]]*\\]', '', text)\n \n # remove URLs\n text = re.sub(r'http\\S+', '', text)\n \n # remove twitter tags\n text = text.replace(\"@\", \"\")\n \n # remove hashtags\n text = text.replace(\"#\", \"\")\n \n # remove all non-alphabetic characters\n text = re.sub(r'[^a-zA-Z ]', '', text)\n \n # remove stopwords from text\n final_text = []\n for word in text.split():\n if word.strip().lower() not in stop:\n final_text.append(word.strip().lower())\n \n text = \" \".join(final_text)\n \n # lemmatize words\n lemmatizer = WordNetLemmatizer() \n text = \" \".join([lemmatizer.lemmatize(word) for word in text.split()])\n text = \" \".join([lemmatizer.lemmatize(word, pos = 'v') for word in text.split()])\n \n # replace all numbers with \"num\"\n text = re.sub(\"\\d\", \"num\", text)\n \n return text.lower()\n\ntrain_data['prep_text'] = train_data['text'].apply(clean_tweets)\ntrain_data['prep_text'].head(5)", + "cell_id": 3, + "code": "train_df = train_df.drop(columns=[\"id\", \"keyword\", \"location\"])", "class": "Data Transform", - "desc": "This code defines a `clean_tweets` function to preprocess textual data by removing unwanted characters, text normalization, stopwords removal, and lemmatization, and then applies this function to the training data's 'text' column to create a new 'prep_text' column.", + "desc": "The code snippet removes the columns \"id\", \"keyword\", and \"location\" from the `train_df` DataFrame using the Pandas library.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.27900088 + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.99919885 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 8, - "code": "test_data['text'] = test_data['text'].apply(clean_tweets)\ntest_data['text'].head(5)", + "cell_id": 5, + "code": "def remove_URL(text):\n url = re.compile(r'https?://\\S+|www\\.\\S+')\n return url.sub(r'',text)\n\ntrain_df[\"text\"] = train_df[\"text\"].apply(remove_URL)\ntest_df[\"text\"] = test_df[\"text\"].apply(remove_URL)", "class": "Data Transform", - "desc": "This code snippet applies the previously defined `clean_tweets` function to the 'text' column of the test data to preprocess the text.", + "desc": "The code snippet defines a function `remove_URL` to remove URLs from text using the `re` module and then applies this function to the \"text\" column of both `train_df` and `test_df` DataFrames using the `apply` method in Pandas.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, - "predicted_subclass_probability": 0.9986481 - }, - "cluster": 1 - }, { - "cell_id": 9, - "code": "from keras.preprocessing.text import Tokenizer # Text tokenization\n\n# Setting up the tokenizer\nvocab_size = 1000\ntokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')\ntokenizer.fit_on_texts(list(train_data['prep_text']) + list(test_data['text']))", - "class": "Data Transform", - "desc": "This code sets up a Keras Tokenizer with a specified vocabulary size, and fits it on the 
preprocessed text from both the training and test datasets for tokenization.", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9994017 - }, - "cluster": 1 - }, { - "cell_id": 10, - "code": "# Representing texts as one hot encoded sequence\n\nX_train_ohe = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'binary')\nX_test_ohe = tokenizer.texts_to_matrix(test_data['text'], mode = 'binary')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_ohe.shape}\")\nprint(f\"X_test shape: {X_test_ohe.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")", - "class": "Data Transform", - "desc": "This code snippet converts the preprocessed text data into one-hot encoded sequences using the tokenizer, and then it prints the shapes of the transformed training and test data as well as the target variable.", - "testing": { - "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 16, - "predicted_subclass_probability": 0.5156552 - }, - "cluster": 1 - }, { - "cell_id": 11, - "code": "from sklearn.model_selection import train_test_split\nX_train_ohe, X_val_ohe, y_train, y_val = train_test_split(X_train_ohe, y_train, random_state = 42, test_size = 0.2)\n\nprint(f\"X_train shape: {X_train_ohe.shape}\")\nprint(f\"X_val shape: {X_val_ohe.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", - "class": "Data Transform", - "desc": "This code snippet splits the one-hot encoded training data and target variable into training and validation sets, and then prints the shapes of these newly created datasets.", - "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.961794 + "predicted_subclass_probability": 0.9983814 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 16, - "code": "X_train_wc = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'count')\nX_test_wc = tokenizer.texts_to_matrix(test_data['text'], mode = 'count')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_test shape: {X_test_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\n", + "cell_id": 6, + "code": "def remove_html(text):\n html=re.compile(r'<.*?>')\n return html.sub(r'',text)\n\ntrain_df[\"text\"] = train_df[\"text\"].apply(remove_html)\ntest_df[\"text\"] = test_df[\"text\"].apply(remove_html)", "class": "Data Transform", - "desc": "This code snippet converts the preprocessed text data into count-based sequences using the tokenizer, and then prints the shapes of the transformed training and test datasets as well as the target variable.", + "desc": "The code snippet defines a function `remove_html` to remove HTML tags from text using the `re` module and then applies this function to the \"text\" column of both `train_df` and `test_df` DataFrames using the `apply` method in Pandas.", "testing": { "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 16, - "predicted_subclass_probability": 0.61748403 + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.995577 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 17, - "code": "X_train_wc, X_val_wc, y_train, y_val = train_test_split(X_train_wc, y_train, random_state = 42, test_size = 0.2)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_val shape: 
{X_val_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "cell_id": 7, + "code": "def remove_emoji(text):\n emoji_pattern = re.compile(\"[\"\n u\"\\U0001F600-\\U0001F64F\" # emoticons\n u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\n u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\n u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\n u\"\\U00002702-\\U000027B0\"\n u\"\\U000024C2-\\U0001F251\"\n \"]+\", flags=re.UNICODE)\n return emoji_pattern.sub(r'', text)\n\ntrain_df[\"text\"] = train_df[\"text\"].apply(remove_emoji)\ntest_df[\"text\"] = test_df[\"text\"].apply(remove_emoji)", "class": "Data Transform", - "desc": "This code snippet splits the count-based encoded training data and target variable into training and validation sets, and then prints the shapes of these newly created datasets.", + "desc": "The code snippet defines a function `remove_emoji` to remove emojis from text using a `re` module pattern and then applies this function to the \"text\" column of both `train_df` and `test_df` DataFrames using the `apply` method in Pandas.", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.80229384 + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.62529504 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 22, - "code": "X_train_freq = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'freq')\nX_test_freq = tokenizer.texts_to_matrix(test_data['text'], mode = 'freq')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_freq.shape}\")\nprint(f\"X_test shape: {X_test_freq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")", + "cell_id": 12, + "code": "train_tensor = tokenizer(list(train_df[\"text\"]), padding=\"max_length\",\n truncation=True, max_length=30,\n return_tensors=\"pt\")[\"input_ids\"]", "class": "Data Transform", - "desc": "This code snippet converts the preprocessed text data into frequency-based sequences using the tokenizer, and then prints the shapes of the transformed training and test datasets as well as the target variable.", + "desc": "The code snippet tokenizes the text data from the \"text\" column of `train_df` into tensors with padding to a maximum length, truncates to a maximum length of 30 tokens, and returns the tokenized output in PyTorch tensor format using the `tokenizer` from the `transformers` library.", "testing": { "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 16, - "predicted_subclass_probability": 0.74439836 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.96896535 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 23, - "code": "X_train_freq, X_val_freq, y_train, y_val = train_test_split(X_train_freq, y_train, test_size = 0.2, random_state = 42)\nprint(f\"X_train shape: {X_train_freq.shape}\")\nprint(f\"X_val shape: {X_val_freq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "cell_id": 13, + "code": "class TweetDataset:\n def __init__(self, tensors, targ, ids):\n self.text = tensors[ids, :]\n self.targ = targ[ids].reset_index(drop=True)\n \n def __len__(self):\n return len(self.text)\n \n def __getitem__(self, idx):\n \n t = self.text[idx]\n y = self.targ[idx]\n \n return t, tensor(y)", "class": "Data Transform", - "desc": "This code snippet splits the frequency-based encoded training data and target variable into 
training and validation sets, and then prints the shapes of these newly created datasets.", + "desc": "The code snippet defines a custom dataset class `TweetDataset` that initializes with tokenized text data, target labels, and indices, and implements methods to get the length of the dataset and retrieve items by index, converting target labels to PyTorch tensors.", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.84670115 - }, - "cluster": 1 - }, { - "cell_id": 28, - "code": "from sklearn.feature_extraction.text import TfidfVectorizer # Term Frequency - Inverse Document Frequency\n\nvectorizer = TfidfVectorizer(max_features = vocab_size)\nvectorizer.fit(list(train_data['prep_text']) + list(test_data['text']))\n\n# Fitting on training and testing data\nX_train_tfidf = vectorizer.transform(list(train_data['prep_text'])).toarray() \nX_test_tfidf = vectorizer.transform(list(test_data['text'])).toarray()\n\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape {X_train_tfidf.shape}\")\nprint(f\"X_test shape {X_test_tfidf.shape}\")\nprint(f\"y_train shape {y_train.shape}\")", - "class": "Data Transform", - "desc": "This code snippet initializes a `TfidfVectorizer`, fits it on the combined preprocessed text from both training and test datasets, transforms the training and test text data into TF-IDF encoded arrays, and then prints the shapes of these arrays and the target variable.", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.57227826 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.69012034 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 29, - "code": "X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size = 0.2, random_state = 42)\nprint(f\"X_train shape: {X_train_tfidf.shape}\")\nprint(f\"X_val shape: {X_val_tfidf.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "cell_id": 14, + "code": "train_ids, valid_ids = RandomSplitter()(train_df)\n\n\ntarget = train_df[\"target\"]\n\ntrain_ds = TweetDataset(train_tensor, target, train_ids)\nvalid_ds = TweetDataset(train_tensor, target, valid_ids)\n\ntrain_dl = DataLoader(train_ds, bs=64)\nvalid_dl = DataLoader(valid_ds, bs=512)\ndls = DataLoaders(train_dl, valid_dl).to(\"cuda\")", "class": "Data Transform", - "desc": "This code snippet splits the TF-IDF encoded training data and target variable into training and validation sets, and then prints the shapes of these newly created datasets.", + "desc": "The code snippet splits the dataset into training and validation sets using `RandomSplitter`, creates instances of the `TweetDataset` class for both sets, and then wraps them in `DataLoader` instances with specified batch sizes, finally combining them into a `DataLoaders` object and moving it to the GPU using PyTorch's and Fastai's data utility classes.", "testing": { "class": "Data_Transform", "subclass": "split", "subclass_id": 13, - "predicted_subclass_probability": 0.70971876 - }, - "cluster": 1 - }, { - "cell_id": 35, - "code": "# Sequences creation, truncation and padding\n\nfrom keras.preprocessing.sequence import pad_sequences\n\n# Setting up the tokenizer\nvocab_size = 10000\ntokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')\ntokenizer.fit_on_texts(list(train_data['prep_text']) + list(test_data['text']))\n\nmax_len = 
15\nX_train_seq = tokenizer.texts_to_sequences(train_data['prep_text'])\nX_test_seq = tokenizer.texts_to_sequences(test_data['text'])\n\nX_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')\nX_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_seq.shape}\")\nprint(f\"X_test shape: {X_test_seq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")", - "class": "Data Transform", - "desc": "This code snippet initializes a tokenizer, converts the preprocessed training and test text data into sequences, and then applies truncation and padding to ensure that all sequences have a uniform length of 15, finally printing the shapes of the resulting arrays.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.7396719 + "predicted_subclass_probability": 0.50315964 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 36, - "code": "X_train_seq, X_val_seq, y_train, y_val = train_test_split(X_train_seq, y_train, test_size = 0.2, random_state = 42)\nprint(f\"X_train shape: {X_train_seq.shape}\")\nprint(f\"X_val shape: {X_val_seq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "cell_id": 19, + "code": "test_tensor = tokenizer(list(test_df[\"text\"]),\n padding=\"max_length\",\n truncation=True,\n max_length=30,\n return_tensors=\"pt\")[\"input_ids\"]", "class": "Data Transform", - "desc": "This code snippet splits the padded sequence training data and target variable into training and validation sets, and then prints the shapes of these newly created datasets.", + "desc": "The code snippet tokenizes the text data from the `test_df` DataFrame into tensors with padding to a maximum length, truncates to a maximum length of 30 tokens, and returns the tokenized output in PyTorch tensor format using the `tokenizer` from the `transformers` library.", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.9086002 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9433598 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 38, - "code": "# Applying GloVE representations on our corpus\n\nembedding_matrix=np.zeros((num_words,100))\n\nfor word,i in tokenizer.word_index.items():\n if i < num_words:\n emb_vec = embedding_dict.get(word)\n if emb_vec is not None:\n embedding_matrix[i] = emb_vec ", + "cell_id": 20, + "code": "class TestDS:\n def __init__(self, tensors):\n self.tensors = tensors\n \n def __len__(self):\n return len(self.tensors)\n \n def __getitem__(self, idx):\n t = self.tensors[idx]\n return t, tensor(0)\n\ntest_dl = DataLoader(TestDS(test_tensor), bs=128)", "class": "Data Transform", - "desc": "This code snippet initializes an embedding matrix with zeros and populates it with GloVe word vector representations corresponding to words in the tokenizer's word index.", + "desc": "The code snippet defines a `TestDS` class to wrap the tokenized test data, implementing methods to get the length of the dataset and retrieve items by index, and then creates a `DataLoader` instance for this test dataset with a batch size of 128.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.91961074 + "class": "Model_Train", + "subclass": 
"load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9415805 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 3, - "code": "train_data.info()", + "cell_id": 2, + "code": "train_df", "class": "Exploratory Data Analysis", - "desc": "This code snippet provides a summary of the DataFrame, including the data types and non-null counts for each column.", + "desc": "The code snippet outputs and displays the contents of the `train_df` DataFrame. ", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.99936634 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99972683 }, - "cluster": 11 + "cluster": 0 }, { "cell_id": 4, - "code": "test_data.info()", + "code": "train_df[\"target\"].value_counts()", "class": "Exploratory Data Analysis", - "desc": "This code snippet provides a summary of the test DataFrame, including the data types and non-null counts for each column.", + "desc": "The code snippet counts and displays the number of occurrences of each unique value in the \"target\" column of the `train_df` DataFrame using the `value_counts` method in Pandas.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9993579 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.999521 }, - "cluster": 11 + "cluster": 4 }, { - "cell_id": 27, - "code": "train_data.head()", + "cell_id": 8, + "code": "train_df", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first five rows of the `train_data` DataFrame to examine its structure and content.", + "desc": "The code snippet outputs and displays the contents of the `train_df` DataFrame.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997532 - }, - "cluster": 12 - }, { - "cell_id": 37, - "code": "num_words = len(tokenizer.word_index)\nprint(f\"Number of unique words: {num_words}\")", - "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates and prints the number of unique words in the tokenizer's word index.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_unique_values", - "subclass_id": 54, - "predicted_subclass_probability": 0.9865096 + "predicted_subclass_probability": 0.99972683 }, - "cluster": -1 + "cluster": 0 }, { "cell_id": 0, - "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport seaborn as sns # data visualization\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "code": "import numpy as np\nimport pandas as pd\nfrom fastai.text.all import *\nimport re", "class": "Imports and Environment", - "desc": "This code imports essential libraries for data manipulation and visualization, and lists all files in the input directory.", + "desc": "The code snippet imports essential libraries such as NumPy, Pandas, Fastai's text processing module, and the regular expression module 're'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "list_files", - "subclass_id": 88, - "predicted_subclass_probability": 0.99922085 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99934 }, "cluster": 0 }, { - "cell_id": 6, - "code": "!pip install BeautifulSoup4", + "cell_id": 10, + "code": "from transformers import AutoTokenizer, AutoModelForSequenceClassification", "class": "Imports and Environment", - "desc": "This code snippet installs the BeautifulSoup4 library, which is used for web scraping.", + "desc": "The code snippet imports `AutoTokenizer` and `AutoModelForSequenceClassification` from the `transformers` library.", "testing": { "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.9954203 + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99929786 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 14, - "code": "_, accuracy = model.evaluate(X_val_ohe, y_val)", - "class": "Model Evaluation", - "desc": "This code snippet evaluates the trained model on the validation set and retrieves the accuracy.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.99220455 + "cell_id": 11, + "code": "tokenizer = AutoTokenizer.from_pretrained(\"roberta-large\")", + "class": "Imports and Environment", + "desc": "The code snippet initializes a tokenizer by loading the pre-trained \"roberta-large\" model using the `AutoTokenizer` class from the `transformers` library.", + "testing": { + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9938804 }, "cluster": 0 }, { - "cell_id": 20, - "code": "_, accuracy = model.evaluate(X_val_wc, y_val)", + "cell_id": 18, + "code": "from sklearn.metrics import f1_score\n\npreds, targs = learn.get_preds()\n\nmin_threshold = None\nmax_f1 = -float(\"inf\")\nthresholds = np.linspace(0.3, 0.7, 50)\nfor threshold in thresholds:\n f1 = f1_score(targs, F.softmax(preds, dim=1)[:, 1]>threshold)\n if f1 > max_f1:\n min_threshold = threshold\n min_f1 = f1\n print(f\"threshold:{threshold:.4f} - f1:{f1:.4f}\")", "class": "Model Evaluation", - "desc": "This code snippet evaluates the newly trained model on the count-based validation set and retrieves the 
accuracy.", + "desc": "The code snippet uses Fastai's `get_preds` method to obtain predictions and targets from the model, then iterates through a range of thresholds to calculate the F1 score using `sklearn.metrics.f1_score`, printing each threshold and corresponding F1 score, and recording the threshold that yields the highest F1 score.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9896338 + "class": "Model_Train", + "subclass": "find_best_params", + "subclass_id": 2, + "predicted_subclass_probability": 0.6100683 }, "cluster": 0 }, { - "cell_id": 12, - "code": "from keras.models import Sequential\nfrom keras import layers, metrics, optimizers, losses\n\ndef setup_model():\n \n model = Sequential()\n# model.add(layers.Dense(16, activation='relu', input_shape=(vocab_size,)))\n# model.add(layers.Dense(16, activation='relu'))\n model.add(layers.Dense(1, activation='sigmoid', input_shape=(vocab_size,)))\n \n model.compile(optimizer=optimizers.RMSprop(lr=0.001),\n loss=losses.binary_crossentropy,\n metrics=[metrics.binary_accuracy])\n \n return model\n\nmodel = setup_model()\nmodel.summary()", + "cell_id": 21, + "code": "test_preds = learn.get_preds(dl=test_dl)", + "class": "Model Evaluation", + "desc": "The code snippet uses the `get_preds` method from Fastai's `Learner` class to obtain predictions for the test dataset loaded in the `test_dl` DataLoader.", + "testing": { + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.9943605 + }, + "cluster": -1 + }, { + "cell_id": 15, + "code": "bert = AutoModelForSequenceClassification.from_pretrained(\"roberta-large\", num_labels=2).train().to(\"cuda\")\n\nclass BertClassifier(Module):\n def __init__(self, bert):\n self.bert = bert\n def forward(self, x):\n return self.bert(x).logits\n\nmodel = BertClassifier(bert)", "class": "Model Training", - "desc": "This code snippet defines a function to setup a Sequential neural network model with a single dense layer, compiles the model using the RMSprop optimizer and binary cross-entropy loss, and then displays the model summary.", + "desc": "The code snippet initializes a pre-trained \"roberta-large\" model for sequence classification with 2 labels using the `AutoModelForSequenceClassification` from the `transformers` library, sets it to training mode and moves it to the GPU, and defines a custom `BertClassifier` class that wraps this model and overrides the `forward` method, followed by instantiating the classifier.", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, - "predicted_subclass_probability": 0.990493 + "predicted_subclass_probability": 0.86304575 }, "cluster": 0 }, { - "cell_id": 13, - "code": "history = model.fit(X_train_ohe, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_ohe, y_val))", + "cell_id": 16, + "code": "learn = Learner(dls, model, metrics=[accuracy, F1Score()]).to_fp16()\nlearn.lr_find()", "class": "Model Training", - "desc": "This code snippet trains the sequential neural network model on the one-hot encoded training data for 20 epochs with a batch size of 512 and validates the performance using the validation data.", + "desc": "The code snippet creates a `Learner` object using Fastai with the provided data loaders, model, and evaluation metrics (accuracy and F1Score), converts the model to mixed precision with `to_fp16()`, and then runs a learning rate finder to 
suggest a suitable learning rate.", "testing": { "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9996803 + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9948531 }, "cluster": 0 }, { - "cell_id": 18, - "code": "model = setup_model()\nmodel.summary()", - "class": "Model Training", - "desc": "This code snippet initializes a new neural network model using the previously defined `setup_model` function and displays the model summary.", - "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9821353 - }, - "cluster": 1 - }, { - "cell_id": 19, - "code": "history = model.fit(X_train_wc, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_wc, y_val))", + "cell_id": 17, + "code": "learn.fit_one_cycle(3, lr_max=1e-5)", "class": "Model Training", - "desc": "This code snippet trains the neural network model on the count-based encoded training data for 20 epochs with a batch size of 512 and validates the performance using the validation data.", + "desc": "The code snippet trains the model for 3 epochs using the One Cycle learning rate policy and a maximum learning rate of 1e-5 with the `fit_one_cycle` method in Fastai's `Learner` class.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.99967945 - }, - "cluster": 0 - }, { - "cell_id": 24, - "code": "model = setup_model()\nmodel.summary()", - "class": "Model Training", - "desc": "This code snippet initializes a new neural network model using the previously defined `setup_model` function and displays the model summary.", - "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9821353 + "predicted_subclass_probability": 0.9996973 }, "cluster": 1 }, { - "cell_id": 25, - "code": "history = model.fit(X_train_freq, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_freq, y_val))", - "class": "Model Training", - "desc": "This code snippet trains the neural network model on the frequency-based encoded training data for 20 epochs with a batch size of 512 and validates the performance using the validation data.", + "cell_id": 9, + "code": "train_df[\"text\"].apply(lambda x:len(x.split())).plot(kind=\"hist\");", + "class": "Visualization", + "desc": "The code snippet creates a histogram plot of the length of the text in the \"text\" column of `train_df` by splitting the text into words and counting the number of words in each entry using Pandas' `plot` method with the \"hist\" kind argument.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.99967873 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.6361653 }, - "cluster": 0 - }, { - "cell_id": 30, - "code": "model = setup_model()\nmodel.summary()", - "class": "Model Training", - "desc": "This code snippet initializes a new neural network model using the previously defined `setup_model` function and displays the model summary.", + "cluster": -1 + }], + "notebook_id": 0, + "notebook_name": "roberta-with-pytorch-and-fastai.ipynb" + }, { + "cells": [{ + "cell_id": 14, + "code": "pd.DataFrame({\n 'id':test.id,\n 'target':pred\n}).to_csv('submission.csv',index=False)", + "class": "Data Export", + "desc": "This 
code creates a DataFrame with the `id` from the `test` DataFrame and the model `pred` predictions, and exports it to a CSV file named `submission.csv` without the index.", "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9821353 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9992506 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 31, - "code": "history = model.fit(X_train_tfidf, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_tfidf, y_val))", - "class": "Model Training", - "desc": "This code snippet trains the neural network model on the TF-IDF encoded training data for 20 epochs with a batch size of 512 and validates the performance using the validation data.", + "cell_id": 1, + "code": "train=pd.read_csv('../input/nlp-getting-started/train.csv')\ntest=pd.read_csv('../input/nlp-getting-started/test.csv')", + "class": "Data Extraction", + "desc": "This code reads the training and testing datasets from CSV files located at the specified paths into pandas DataFrames named `train` and `test`.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9996803 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99974996 }, "cluster": 0 }, { - "cell_id": 39, - "code": "# Setting up the model\n\nn_latent_factors = 100\nmodel_glove = Sequential()\nmodel_glove.add(layers.Embedding(num_words, n_latent_factors, weights = [embedding_matrix], \n input_length = max_len, trainable=True))\nmodel_glove.add(layers.Flatten())\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dropout(0.5))\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dense(1, activation='sigmoid'))\nmodel_glove.summary()", - "class": "Model Training", - "desc": "This code snippet sets up a neural network model with an embedding layer initialized with GloVe embeddings, followed by a Flatten layer, a Dropout layer, and a final Dense layer with sigmoid activation, and then displays the model summary.", + "cell_id": 8, + "code": "stop_words=nltk.corpus.stopwords.words('english')\ni=0\n#sc=SpellChecker()\n#data=pd.concat([train,test])\nwnl=WordNetLemmatizer()\nstemmer=SnowballStemmer('english')\nfor doc in train.text:\n doc=re.sub(r'https?://\\S+|www\\.\\S+','',doc)\n doc=re.sub(r'<.*?>','',doc)\n doc=re.sub(r'[^a-zA-Z\\s]','',doc,re.I|re.A)\n #doc=' '.join([stemmer.stem(i) for i in doc.lower().split()])\n doc=' '.join([wnl.lemmatize(i) for i in doc.lower().split()])\n #doc=' '.join([sc.correction(i) for i in doc.split()])\n doc=contractions.fix(doc)\n tokens=nltk.word_tokenize(doc)\n filtered=[token for token in tokens if token not in stop_words]\n doc=' '.join(filtered)\n train.text[i]=doc\n i+=1\ni=0\nfor doc in test.text:\n doc=re.sub(r'https?://\\S+|www\\.\\S+','',doc)\n doc=re.sub(r'<.*?>','',doc)\n doc=re.sub(r'[^a-zA-Z\\s]','',doc,re.I|re.A)\n #doc=' '.join([stemmer.stem(i) for i in doc.lower().split()])\n doc=' '.join([wnl.lemmatize(i) for i in doc.lower().split()])\n #doc=' '.join([sc.correction(i) for i in doc.split()])\n doc=contractions.fix(doc)\n tokens=nltk.word_tokenize(doc)\n filtered=[token for token in tokens if token not in stop_words]\n doc=' '.join(filtered)\n test.text[i]=doc\n i+=1", + "class": "Data Transform", + "desc": "This code preprocesses the text 
in the `train` and `test` DataFrames using techniques such as removing URLs, HTML tags, and non-alphabetic characters, lemmatizing words, expanding contractions, tokenizing, and filtering out stopwords.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9871293 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.54864067 }, "cluster": 0 }, { - "cell_id": 40, - "code": "model_glove.compile(optimizer = optimizers.RMSprop(lr=0.001),\n loss = losses.binary_crossentropy,\n metrics = [metrics.binary_accuracy])\n\nhistory = model_glove.fit(X_train_seq,\n y_train,\n epochs=20,\n batch_size=512,\n validation_data=(X_val_seq, y_val))", - "class": "Model Training", - "desc": "This code snippet compiles the GloVe-embeddings-based neural network model using the RMSprop optimizer and binary cross-entropy loss, and then trains it on the padded sequence training data for 20 epochs with a batch size of 512, validating the performance using the validation data.", + "cell_id": 11, + "code": "from sklearn.feature_extraction.text import CountVectorizer\ncv=CountVectorizer(ngram_range=(1,1)) \n\n# ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, \n# and (2, 2) means only bigrams.\n\ncv_matrix=cv.fit_transform(train.text).toarray()\ntrain_df=pd.DataFrame(cv_matrix,columns=cv.get_feature_names())\ntest_df=pd.DataFrame(cv.transform(test.text).toarray(),columns=cv.get_feature_names())\ntrain_df.head()", + "class": "Data Transform", + "desc": "This code transforms the text data in the `train` and `test` DataFrames into feature matrices using the CountVectorizer from sklearn with unigrams, and then converts these matrices into pandas DataFrames.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.99457717 + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.17395471 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 42, - "code": "max_len = 15\nX_train_seq = tokenizer.texts_to_sequences(train_data['prep_text'])\nX_test_seq = tokenizer.texts_to_sequences(test_data['text'])\n\nX_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')\nX_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_seq.shape}\")\nprint(f\"X_test shape: {X_test_seq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\\n\")\n\n# Setting up the model\n\nn_latent_factors = 100\nmodel_glove = Sequential()\nmodel_glove.add(layers.Embedding(num_words, n_latent_factors, weights = [embedding_matrix], \n input_length = max_len, trainable=True))\nmodel_glove.add(layers.Flatten())\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dropout(0.5))\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dense(1, activation='sigmoid'))\nprint(f\"{model_glove.summary()}\\n\")\n\n\nmodel_glove.compile(optimizer = optimizers.RMSprop(lr=0.001),\n loss = losses.binary_crossentropy,\n metrics = [metrics.binary_accuracy])\n\nhistory = model_glove.fit(X_train_seq,\n y_train,\n epochs=20,\n batch_size=512)", - "class": "Model Training", - "desc": "This code snippet reinitializes and preprocesses the text data as padded sequences, 
sets up, compiles, and trains a neural network model using GloVe word embeddings on the training data for 20 epochs with a batch size of 512, and prints the model summary.", + "cell_id": 12, + "code": "from sklearn.feature_extraction.text import TfidfVectorizer\ntfidf=TfidfVectorizer(ngram_range=(1,1),use_idf=True)\nmat=tfidf.fit_transform(train.text).toarray()\ntrain_df=pd.DataFrame(mat,columns=tfidf.get_feature_names())\ntest_df=pd.DataFrame(tfidf.transform(test.text).toarray(),columns=tfidf.get_feature_names())\ntrain_df.head()", + "class": "Data Transform", + "desc": "This code transforms the text data in the `train` and `test` DataFrames into TF-IDF feature matrices using TfidfVectorizer from sklearn with unigrams, and then converts these matrices into pandas DataFrames.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.80148226 + "predicted_subclass_probability": 0.9525827 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 43, - "code": "# Setting up the tokenizer\nvocab_size = 1000\ntokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')\ntokenizer.fit_on_texts(list(train_data['text']) + list(test_data['text']))\n\n# Word count representation\nX_train_wc = tokenizer.texts_to_matrix(train_data['text'], mode = 'count')\nX_test_wc = tokenizer.texts_to_matrix(test_data['text'], mode = 'count')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_test shape: {X_test_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\n\n# Train Validation Split\nX_train_wc, X_val_wc, y_train, y_val = train_test_split(X_train_wc, y_train, test_size = 0.2, random_state = 42)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_val shape: {X_val_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\\n\")\n\n# Setting up the model\nmodel = setup_model()\n\n# Fitting the model on un-preprocessed text\nhistory = model.fit(X_train_wc, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_wc, y_val))", - "class": "Model Training", - "desc": "This code snippet prepares word count representations of the unprocessed text data, splits it into training and validation sets, sets up, and trains a neural network model on this data for 20 epochs with a batch size of 512.", + "cell_id": 2, + "code": "train.head()", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first five rows of the `train` DataFrame to provide an initial look at the data structure and contents.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.59386915 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997507 }, "cluster": 0 }, { - "cell_id": 5, - "code": "sns.countplot(train_data['target'])", - "class": "Visualization", - "desc": "This code snippet creates a count plot to visualize the distribution of the target variable in the training data.", + "cell_id": 3, + "code": "test.head()", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first five rows of the `test` DataFrame to provide an initial look at the data structure and contents.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99602413 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + 
"subclass_id": 41, + "predicted_subclass_probability": 0.9997483 }, "cluster": 0 }, { - "cell_id": 15, - "code": "import matplotlib.pyplot as plt\n\ndef plot_history(history): \n\n history_dict = history.history\n history_dict.keys()\n\n\n acc = history.history['binary_accuracy']\n val_acc = history.history['val_binary_accuracy']\n loss = history.history['loss']\n val_loss = history.history['val_loss']\n\n epochs = range(1, len(acc) + 1)\n\n # \"bo\" is for \"blue dot\"\n plt.plot(epochs, loss, 'bo', label='Training loss')\n # b is for \"solid blue line\"\n plt.plot(epochs, val_loss, 'b', label='Validation loss')\n plt.title('Training and validation loss')\n plt.xlabel('Epochs')\n plt.ylabel('Loss')\n plt.legend()\n\n plt.show()\n \nplot_history(history)", - "class": "Visualization", - "desc": "This code snippet defines a function to plot the training and validation loss over epochs and then calls that function to display the plot.", + "cell_id": 4, + "code": "print(train.shape)\nprint(test.shape)", + "class": "Exploratory Data Analysis", + "desc": "This code prints the dimensions (number of rows and columns) of the `train` and `test` DataFrames to understand the size of the datasets.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.9967168 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99933213 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 21, - "code": "plot_history(history)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_history` function to visualize the training and validation loss over epochs for the newly trained model.", + "cell_id": 5, + "code": "print(train.info())\nprint(test.info())", + "class": "Exploratory Data Analysis", + "desc": "This code prints the summary information of the `train` and `test` DataFrames including data types and non-null counts to understand the structure and completeness of the datasets.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9654706 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9994165 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 26, - "code": "plot_history(history)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_history` function to visualize the training and validation loss over epochs for the newly trained model using frequency-based encoding.", + "cell_id": 6, + "code": "train.target.value_counts()", + "class": "Exploratory Data Analysis", + "desc": "This code counts and displays the occurrences of each unique value in the `target` column of the `train` DataFrame to understand the distribution of the target variable.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9654706 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.99950993 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 32, - "code": "plot_history(history)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_history` function to visualize the training and validation loss over epochs for the newly trained model using TF-IDF encoding.", + "cell_id": 9, + "code": "train.head()", + "class": 
"Exploratory Data Analysis", + "desc": "This code displays the first five rows of the `train` DataFrame to provide an initial look at the preprocessed data structure and contents.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9654706 - }, - "cluster": 1 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997507 + }, + "cluster": 0 }, { - "cell_id": 33, - "code": "plt.hist(list(train_data['prep_text'].str.split().map(lambda x: len(x))))", - "class": "Visualization", - "desc": "This code snippet creates a histogram to visualize the distribution of the lengths of the preprocessed text data in terms of the number of words.", + "cell_id": 10, + "code": "test.head()", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first five rows of the `test` DataFrame to provide an initial look at the preprocessed data structure and contents.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9977956 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997483 }, "cluster": 0 }, { - "cell_id": 41, - "code": "plot_history(history)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_history` function to visualize the training and validation loss over epochs for the GloVe-embeddings-based model.", + "cell_id": 0, + "code": "import pandas as pd\nimport numpy as np\nfrom sklearn.metrics import f1_score", + "class": "Imports and Environment", + "desc": "This code imports the pandas and numpy libraries for data manipulation and analysis, and the f1_score function from the sklearn.metrics module for model evaluation.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9654706 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99931085 }, - "cluster": 1 + "cluster": 0 + }, { + "cell_id": 7, + "code": "import nltk\nnltk.download('punkt')\nnltk.download('stopwords')\nimport re\n!pip install contractions\nimport contractions\nfrom nltk.stem import SnowballStemmer\nfrom nltk.stem import WordNetLemmatizer\nnltk.download('wordnet')\n!pip install pyspellchecker\nfrom spellchecker import SpellChecker", + "class": "Imports and Environment", + "desc": "This code installs and imports various Natural Language Processing (NLP) tools including nltk libraries for tokenization, stopwords, stemming, and lemmatization, contractions for expanding contractions, and pyspellchecker for spell checking, by downloading necessary resources and libraries using nltk and pip commands.", + "testing": { + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.799391 + }, + "cluster": 0 + }, { + "cell_id": 13, + "code": "from sklearn.linear_model import LogisticRegression\nmodel=LogisticRegression()\nmodel.fit(train_df,train.target)\nprint(f1_score(model.predict(train_df),train.target))\npred=model.predict(test_df)", + "class": "Model Training", + "desc": "This code initializes a Logistic Regression model, fits it to the transformed training data, computes the F1 score on the training set, and makes predictions on the test data using the trained model.", + "testing": { + 
"class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.5177371 + }, + "cluster": 0 }], - "notebook_id": 0, - "notebook_name": "baseline-nlp" + "notebook_id": 1, + "notebook_name": "bow-tf-idf-models-with-basic-lr-0-80-score.ipynb" }, { "cells": [{ - "cell_id": 13, - "code": "submission = pd.DataFrame({\n 'id': test_raw.id,\n 'target':y_hat\n})", + "cell_id": 19, + "code": "my_submission_preds = pipe.predict(test['text']+ ' ' + test['keyword'].astype(str) + ' ' + test['location'].astype(str))\n\nmy_submission = pd.DataFrame({\"id\":test['id'], 'target':my_submission_preds})", "class": "Data Export", - "desc": "This code snippet creates a DataFrame for the submission file containing the IDs from the test dataset and the predicted target labels.", + "desc": "This code snippet generates predictions for the test data using the trained pipeline and creates a DataFrame containing the 'id' from the test set and the corresponding predicted 'target' values.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.9941958 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.7191785 }, "cluster": -1 }, { - "cell_id": 14, - "code": "submission.to_csv(\"my_submission_linear.csv\", index=False)", + "cell_id": 22, + "code": "my_submission.to_csv('submission.csv', index=False)", "class": "Data Export", - "desc": "This code snippet exports the submission DataFrame to a CSV file named \"my_submission_linear.csv\" without including the index.", + "desc": "This code snippet saves the my_submission DataFrame to a CSV file named 'submission.csv' without including the DataFrame index, using the to_csv method from pandas.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.99924576 + "predicted_subclass_probability": 0.99912554 }, - "cluster": 1 + "cluster": -1 }, { "cell_id": 1, - "code": "train_raw = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_raw = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")\nsubmission_raw = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")", + "code": "train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\nsample_submission = pd.read_csv(\"../input/nlp-getting-started/sample_submission.csv\")", "class": "Data Extraction", - "desc": "This code snippet reads three CSV files (train, test, and sample_submission) into pandas DataFrames from the specified Kaggle input directory.", + "desc": "This code snippet reads the CSV files (train.csv, test.csv, and sample_submission.csv) from the specified directory into pandas DataFrames named train, test, and sample_submission, respectively.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.9997112 + "predicted_subclass_probability": 0.99971575 }, - "cluster": 2 + "cluster": 0 }, { - "cell_id": 4, - "code": "# remove stopwords,punct\n# remove duplicate tweet\ntexts = []\nlabels = []\ntexts_md5 = set()\nfor target, doc in zip(train_raw.target, nlp.pipe(train_raw.text)):\n tokens = [token.lemma_ for token in doc if token.is_stop is False and token.is_punct is False and token.is_space is False]\n temp_text = ' '.join(tokens)\n # remove duplicate\n md5 = hashlib.md5()\n 
md5.update(temp_text.encode('utf-8'))\n text_md5 = md5.hexdigest()\n if text_md5 not in texts_md5:\n texts.append(temp_text)\n labels.append(target)\n texts_md5.add(text_md5)", + "cell_id": 7, + "code": "from sklearn.model_selection import train_test_split\n\nX = train['text'] + ' ' + train['keyword'].astype(str) + ' ' + train['location'].astype(str) # the features we want to analyze\nylabels = train['target'] # the labels, or answers, we want to test against\n\nX_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)", "class": "Data Transform", - "desc": "This code snippet processes the training text data by removing stopwords and punctuation, lemmatizing tokens, and eliminating duplicate tweets using their MD5 hash values.", + "desc": "This code snippet combines the 'text', 'keyword', and 'location' columns from the train DataFrame into a single feature set, then splits this feature set and the target labels into training and testing subsets using scikit-learn's train_test_split function with a test size of 30%.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9552063 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9936372 }, - "cluster": 1 + "cluster": 8 }, { - "cell_id": 5, - "code": "tests = []\nfor doc in nlp.pipe(test_raw.text):\n tokens = [token.lemma_ for token in doc if token.is_stop is False and token.is_punct is False and token.is_space is False]\n tests.append(' '.join(tokens))", + "cell_id": 9, + "code": "\npunctuations = string.punctuation \nnlp = spacy.load('en_core_web_sm') #, exclude=[\"tok2vec\", \"parser\", \"ner\", \"attribute_ruler\"]\nstop_words = spacy.lang.en.stop_words.STOP_WORDS\nparser = English() # Load English tokenizer, tagger, parser, NER and word vectors\n\ndef spacy_tokenizer(sentence):\n mytokens = str(sentence)\n mytokens = nlp(mytokens)\n #mytokens = parser(sentence) \n mytokens = [ word.lemma_.lower().strip() if word.lemma_ != \"-PRON-\" else word.lower_ for word in mytokens ] \n mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ] \n return mytokens # return preprocessed list of tokens\n\nclass predictors(TransformerMixin):\n def transform(self, X, **transform_params):\n return [clean_text(text) for text in X]\n\n def fit(self, X, y=None, **fit_params):\n return self\n\n def get_params(self, deep=True):\n return {}\n\ndef clean_text(text):\n text = text.strip().lower()\n #text = re.sub(r'[^A-Za-z0-9 ]+', '', text)\n return text #.split()\n\nbow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1), stop_words = None)\ntfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer, stop_words = None) #token_pattern='(?u)\\b\\w\\w+\\b', stop_words = 'english'\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier\nclassifier = LogisticRegression()\n# classifier = RandomForestClassifier()\n\npipe = Pipeline([(\"cleaner\", predictors()),\n ('vectorizer', tfidf_vector),\n ('classifier', classifier)])\n\n#clean_text(X_train[1773])\n#spacy_tokenizer(X_train[1773])\n#mytokens = parser(X_train[1773])\n\n# mytokens = str(X_train[1773])\n# #mytokens = re.sub(r'[^A-Za-z0-9 ]+', '', mytokens)\n# #mytokens = parser(mytokens)\n# mytokens = nlp(mytokens)\n# mytokens = [ word.lemma_.lower().strip() if word.lemma_ != \"-PRON-\" else word.lower_ for word in mytokens ]\n# print(mytokens)", "class": "Data Transform", - "desc": 
"This code snippet processes the test text data by removing stopwords and punctuation, lemmatizing tokens, and appending the cleaned text to a list.", + "desc": "This code snippet defines a spaCy-based custom tokenizer, a predictors class for cleaning text, and initializes CountVectorizer and TfidfVectorizer with the custom tokenizer; then it sets up a logistic regression classifier and a pipeline combining text cleaning, vectorization, and classification.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, - "predicted_subclass_probability": 0.9769126 - }, - "cluster": 1 - }, { - "cell_id": 6, - "code": "tf_idf = TfidfVectorizer(max_features=10000).fit(texts)\ntrain = tf_idf.transform(texts)\ntest = tf_idf.transform(tests)", - "class": "Data Transform", - "desc": "This code snippet creates a TF-IDF vectorizer with a maximum of 10,000 features, fits it to the cleaned training text data, and transforms both the training and test text data into TF-IDF features.", - "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9555307 + "predicted_subclass_probability": 0.5153541 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 7, - "code": "X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.3)", - "class": "Data Transform", - "desc": "This code snippet splits the transformed training data and associated labels into training and testing sets, with 30% of the data allocated for testing.", + "cell_id": 2, + "code": "train.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the train DataFrame using the head method in pandas.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.99791616 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997507 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 0, - "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", - "class": "Imports and Environment", - "desc": "This code snippet imports essential libraries such as numpy and pandas, and lists all files under the input directory on Kaggle.", + "cell_id": 3, + "code": "test.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the test DataFrame using the head method in pandas.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "list_files", - "subclass_id": 88, - "predicted_subclass_probability": 0.99921954 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997483 }, "cluster": 0 }, { - "cell_id": 2, - "code": "import hashlib\n\nimport spacy\nimport sklearn\nfrom sklearn.svm import SVC\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nfrom sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import f1_score", - "class": "Imports and Environment", - "desc": "This code snippet imports additional libraries and modules required for natural language processing, machine learning model training, and evaluation, including spacy, sklearn's SVC, vectorizers, model selection tools, scalers, pipelines, and metrics.", + "cell_id": 4, + "code": "print(train.apply(lambda col: col.unique()))\nprint(train.apply(lambda col: col.nunique()))", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints the unique values in each column and the number of unique values in each column of the train DataFrame using the pandas apply method with lambda functions.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99932015 + "class": "Exploratory_Data_Analysis", + "subclass": "count_unique_values", + "subclass_id": 54, + "predicted_subclass_probability": 0.95746493 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 3, - "code": "nlp = spacy.load('en')", - "class": "Imports and Environment", - "desc": "This code snippet loads the English language model from the spaCy library for natural language processing tasks.", + "cell_id": 8, + "code": "X_train[100:500]\n#type(X_train[1])\n#y_train[:100]", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays a subset of the X_train data (from index 100 to 500) and includes commented-out lines initially intended to check the type of the first element in X_train and to display the first 100 elements of y_train.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9950819 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9915615 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 10, - 
"code": "best_params= {'C': 1, 'gamma': 0.001, 'kernel': 'linear'}\nsvc = SVC(**best_params)\nscores = cross_val_score(svc,X_train, y_train, cv=5, scoring='f1')\nprint(scores)\nprint(sum(scores)/len(scores))", - "class": "Model Evaluation", - "desc": "This code snippet initializes an SVC with the best hyperparameters, evaluates its performance using 5-fold cross-validation on the training data by calculating the F1 score, and prints the individual scores as well as their average.", + "cell_id": 14, + "code": "predicted_df.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the predicted values DataFrame using the head method in pandas.", "testing": { - "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.9838514 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997552 }, "cluster": 0 }, { - "cell_id": 11, - "code": "val_texts = [\"A happy day!\", 'An earthquake happened!']\nval_data = tf_idf.transform(val_texts)\nsvc.fit(X_train, y_train)\nprint(svc.predict(val_data))", - "class": "Model Evaluation", - "desc": "This code snippet fits the SVC model to the training data, transforms the validation texts using the previously fitted TF-IDF vectorizer, and predicts the labels for the validation texts, printing the results.", + "cell_id": 15, + "code": "test.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the test DataFrame using the head method in pandas.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.6037878 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997483 }, "cluster": 0 }, { - "cell_id": 12, - "code": "y_hat = svc.predict(test)", - "class": "Model Evaluation", - "desc": "This code snippet uses the trained SVC model to predict labels for the test data.", + "cell_id": 16, + "code": "sample_submission.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the sample_submission DataFrame using the head method in pandas.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99424076 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99975234 }, "cluster": 0 }, { - "cell_id": 8, - "code": "param_grid = {\n \"gamma\" : [0.001,0.01,1,10,100],\n \"C\":[0.001,0.01,1,10,100],\n 'kernel' : ['poly', \"linear\", 'sigmoid', 'rbf']\n}", - "class": "Model Training", - "desc": "This code snippet defines a parameter grid for hyperparameter tuning of an SVM classifier, specifying different values for gamma, C, and kernel types.", + "cell_id": 18, + "code": "test", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the entire contents of the test DataFrame.", "testing": { - "class": "Model_Train", - "subclass": "define_search_space", - "subclass_id": 5, - "predicted_subclass_probability": 0.99403256 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997199 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 9, - "code": "svc = SVC()\ngrid_searcher = GridSearchCV(svc, 
param_grid, cv=5, scoring='f1')\ngrid_searcher.fit(X_train, y_train)\ngrid_searcher.best_params_", - "class": "Model Training", - "desc": "This code snippet sets up a GridSearchCV with a Support Vector Classifier (SVC) to find the best combination of hyperparameters from the specified param_grid using 5-fold cross-validation and fits it to the training data.", + "cell_id": 20, + "code": "my_submission.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the my_submission DataFrame using the head method in pandas.", "testing": { - "class": "Model_Train", - "subclass": "train_on_grid", - "subclass_id": 6, - "predicted_subclass_probability": 0.99040365 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997489 }, "cluster": 0 - }], - "notebook_id": 1, - "notebook_name": "baseline-svc-79" - }, { - "cells": [{ - "cell_id": 37, - "code": "preds = np.squeeze(model.predict(test_multi_input_dataset.batch(32)))\npreds = (preds >= 0.5).astype(int)\npd.DataFrame({\"id\": test_df.id, \"target\": preds}).to_csv(\"submission.csv\", index=False)", - "class": "Data Export", - "desc": "This code generates predictions on the test dataset using the trained model, converts the predictions to binary labels, and then saves the results in a CSV file named \"submission.csv\".", + }, { + "cell_id": 21, + "code": "len(my_submission)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet returns the number of rows in the my_submission DataFrame using the len function.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9992853 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99885213 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 4, - "code": "train_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")", - "class": "Data Extraction", - "desc": "This code reads the training and testing datasets from CSV files into Pandas DataFrames.", + "cell_id": 0, + "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "class": "Imports and Environment", + "desc": "This code snippet imports essential libraries (NumPy and pandas) and prints the paths of files in the '/kaggle/input' directory using the os module.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99975425 + "class": "Exploratory_Data_Analysis", + "subclass": "list_files", + "subclass_id": 88, + "predicted_subclass_probability": 0.99921954 }, - "cluster": 2 + "cluster": 0 }, { - "cell_id": 16, - "code": "x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(\n train_df[[\"text\", \"keyword\"]], train_df[\"target\"], test_size=0.3, random_state=42, stratify=train_df[\"target\"]\n)", - "class": "Data Extraction", - "desc": "This code splits the training DataFrame into training and validation sets, using 30% of the data for validation, while stratifying by the 'target' column to maintain class balance.", + "cell_id": 5, + "code": "!pip install spacy -q\n!python -m spacy download en_core_web_sm -q", + "class": "Imports and Environment", + "desc": "This code snippet installs the spaCy library and downloads the English language model 'en_core_web_sm' using pip and the spaCy command-line interface.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.9980861 + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.993651 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 10, - "code": "# We'll use these weights later on to make up for the slightly imbalanced dataset\nclasses = np.unique(train_df[\"target\"])\nclass_weights = sklearn.utils.class_weight.compute_class_weight(\n \"balanced\", classes=classes, y=train_df[\"target\"]\n)\n\nclass_weights = {clazz : weight for clazz, weight in zip(classes, class_weights)}", - "class": "Data Transform", - "desc": "This code calculates and stores the class weights for the target variable in the training dataset to address class imbalance.", + "cell_id": 6, + "code": "import matplotlib.pyplot as plt\nfrom sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\nfrom sklearn.base import TransformerMixin\nfrom sklearn.pipeline import Pipeline\nimport string\nfrom spacy.lang.en.stop_words import STOP_WORDS\nfrom spacy.lang.en import English\nimport spacy\nimport en_core_web_sm\nimport re", + "class": "Imports and Environment", + "desc": "This code snippet imports various libraries and modules including matplotlib for plotting, scikit-learn for text feature extraction and pipeline creation, and spaCy for NLP tasks.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.3043228 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + 
"predicted_subclass_probability": 0.9992192 }, - "cluster": 1 + "cluster": 0 }, { "cell_id": 11, - "code": "# Commented out the graceful handling of duplicated because the Kaggle kernel version of statistics.mode()\n# won't handle multimodal results\n\n# Duplicates aren't consistently labeled, so we keep one example of the most frequently occuring label\n# train_df[\"duplicated\"] = train_df.duplicated(subset=\"text\")\n# duplicated_tweets = train_df.loc[lambda df: df[\"duplicated\"] == True, :]\n# aggregated_duplicates = duplicated_tweets.groupby(\"text\", as_index=False).aggregate(\n# statistics.mode\n# )\n\n# train_df.drop_duplicates(subset=\"text\", inplace=True, keep=False)\n# train_df = train_df.append(aggregated_duplicates, ignore_index=True)\n\ntrain_df.drop_duplicates(subset=\"text\", inplace=True, keep=False)\nprint(\"train rows:\", len(train_df.index))\nprint(\"test rows:\", len(test_df.index))", - "class": "Data Transform", - "desc": "This code removes duplicate rows in the training DataFrame based on the 'text' column and then prints the number of rows in the updated training and testing DataFrames.", + "code": "from sklearn import metrics\n# Predicting with a test dataset\npredicted = pipe.predict(X_test)\n\n# Model Accuracy\nprint(\"Accuracy:\",metrics.accuracy_score(y_test, predicted))\nprint(\"Precision:\",metrics.precision_score(y_test, predicted))\nprint(\"Recall:\",metrics.recall_score(y_test, predicted))", + "class": "Model Evaluation", + "desc": "This code snippet uses the trained pipeline to predict the test data, then evaluates and prints the accuracy, precision, and recall of the model using scikit-learn's metrics functions.", + "testing": { + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.758966 + }, + "cluster": 0 + }, { + "cell_id": 12, + "code": "predicted_df = pd.DataFrame(predicted)\npredicted_df.value_counts()", + "class": "Model Evaluation", + "desc": "This code snippet creates a DataFrame from the predicted values and then counts the occurrences of each unique prediction using pandas value_counts method.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_duplicates", - "subclass_id": 38, - "predicted_subclass_probability": 0.7884562 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.99671495 + }, + "cluster": 0 + }, { + "cell_id": 17, + "code": "predicted", + "class": "Model Evaluation", + "desc": "This code snippet outputs the array of predicted values generated by the trained pipeline on the test data.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99976486 + }, + "cluster": 2 + }, { + "cell_id": 10, + "code": "pipe.fit(X_train, y_train)", + "class": "Model Training", + "desc": "This code snippet trains the pipeline (consisting of text cleaning, feature extraction using TfidfVectorizer, and classification using LogisticRegression) on the training data (X_train and y_train) using the fit method.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.99970585 }, "cluster": 1 }, { - "cell_id": 12, - "code": "class TweetPreProcessor:\n \"\"\"\n This class does some cleaning and normalization prior to BPE tokenization\n \"\"\"\n\n def __init__(self):\n\n self.text_processor = TextPreProcessor(\n # terms that will be normalized\n normalize=[\n \"url\",\n 
\"email\",\n \"phone\",\n \"user\",\n \"time\",\n \"date\",\n ],\n # terms that will be annotated\n annotate={\"repeated\", \"elongated\"},\n # corpus from which the word statistics are going to be used\n # for word segmentation\n segmenter=\"twitter\",\n # corpus from which the word statistics are going to be used\n # for spell correction\n spell_correction=True,\n corrector=\"twitter\",\n unpack_hashtags=False, # perform word segmentation on hashtags\n unpack_contractions=False, # Unpack contractions (can't -> can not)\n spell_correct_elong=True, # spell correction for elongated words\n fix_bad_unicode=True,\n tokenizer=Tokenizer(lowercase=True).tokenize,\n # list of dictionaries, for replacing tokens extracted from the text,\n # with other expressions. You can pass more than one dictionaries.\n dicts=[emoticons, slangdict],\n )\n\n def preprocess_tweet(self, tweet):\n return \" \".join(self.text_processor.pre_process_doc(tweet))\n \n # this will return the tokenized text \n def __call__(self, tweet):\n return self.text_processor.pre_process_doc(tweet)\n \ntweet_preprocessor = TweetPreProcessor()", - "class": "Data Transform", - "desc": "This code defines a `TweetPreProcessor` class for cleaning and normalizing tweets using specified text preprocessing rules and instantiates an object of this class.", + "cell_id": 13, + "code": "predicted_df.plot.hist()", + "class": "Visualization", + "desc": "This code snippet generates a histogram plot of the predicted values DataFrame using the plot.hist method from pandas.", + "testing": { + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9975446 + }, + "cluster": -1 + }], + "notebook_id": 2, + "notebook_name": "nlp-starter-spacy-binary-text-classifier.ipynb" + }, { + "cells": [{ + "cell_id": 13, + "code": "submission = pd.DataFrame({\n 'id': test_raw.id,\n 'target':y_hat\n})", + "class": "Data Export", + "desc": "The snippet creates a DataFrame for the submission, which includes the test data IDs and their corresponding predicted target values.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.95362186 + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.9941958 }, - "cluster": 1 + "cluster": -1 }, { "cell_id": 14, - "code": "train_df[\"text\"] = train_df[\"text\"].apply(tweet_preprocessor.preprocess_tweet)\ntest_df[\"text\"] = test_df[\"text\"].apply(tweet_preprocessor.preprocess_tweet)", - "class": "Data Transform", - "desc": "This code applies the preprocessing function from the `TweetPreProcessor` to the 'text' column of both the training and testing DataFrames.", + "code": "submission.to_csv(\"my_submission_linear.csv\", index=False)", + "class": "Data Export", + "desc": "The snippet saves the submission DataFrame to a CSV file named \"my_submission_linear.csv\" without including the index.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.72112745 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.99924576 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 15, - "code": "# Fill NA\ntrain_df[\"keyword\"].fillna(\"\", inplace=True)\ntest_df[\"keyword\"].fillna(\"\", inplace=True)\n\n# remove %20 from keywords\ntrain_df[\"keyword\"] = train_df[\"keyword\"].apply(urllib.parse.unquote)\ntest_df[\"keyword\"] = 
test_df[\"keyword\"].apply(urllib.parse.unquote)", + "cell_id": 1, + "code": "train_raw = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_raw = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")\nsubmission_raw = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")", + "class": "Data Extraction", + "desc": "The snippet reads the training, testing, and sample submission CSV files into pandas DataFrames from the specified Kaggle input directory.", + "testing": { + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9997112 + }, + "cluster": 0 + }, { + "cell_id": 3, + "code": "nlp = spacy.load('en')", "class": "Data Transform", - "desc": "This code fills missing values in the 'keyword' column with empty strings and removes URL encoding (\"%20\") from the keywords in both the training and testing DataFrames.", + "desc": "The snippet loads the English language model 'en' in the SpaCy library for natural language processing tasks.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.98655176 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9950819 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 17, - "code": "def tokenize_encode(tweets, max_length=None):\n return pretrained_bert_tokenizer(\n tweets,\n add_special_tokens=True,\n truncation=True,\n padding=\"max_length\",\n max_length=max_length,\n return_tensors=\"tf\",\n )\n\n\n# need to be explicit about the lengths (instead of just specifying padding=True in the tokenizer)\n# otherwise train tweets end up being 71 and validation tweets end up as 70, which causes problems/warnings\nmax_length_tweet = 72\nmax_length_keyword = 8\n\ntrain_tweets_encoded = tokenize_encode(x_train[\"text\"].to_list(), max_length_tweet) \nvalidation_tweets_encoded = tokenize_encode(x_val[\"text\"].to_list(), max_length_tweet) \n\ntrain_keywords_encoded = tokenize_encode(x_train[\"keyword\"].to_list(), max_length_keyword) \nvalidation_keywords_encoded = tokenize_encode(x_val[\"keyword\"].to_list(), max_length_keyword) \n\ntrain_inputs_encoded = dict(train_tweets_encoded)\ntrain_inputs_encoded[\"keywords\"] = train_keywords_encoded[\"input_ids\"]\n\nvalidation_inputs_encoded = dict(validation_tweets_encoded)\nvalidation_inputs_encoded[\"keywords\"] = validation_keywords_encoded[\"input_ids\"]\n", + "cell_id": 4, + "code": "# remove stopwords,punct\n# remove duplicate tweet\ntexts = []\nlabels = []\ntexts_md5 = set()\nfor target, doc in zip(train_raw.target, nlp.pipe(train_raw.text)):\n tokens = [token.lemma_ for token in doc if token.is_stop is False and token.is_punct is False and token.is_space is False]\n temp_text = ' '.join(tokens)\n # remove duplicate\n md5 = hashlib.md5()\n md5.update(temp_text.encode('utf-8'))\n text_md5 = md5.hexdigest()\n if text_md5 not in texts_md5:\n texts.append(temp_text)\n labels.append(target)\n texts_md5.add(text_md5)", "class": "Data Transform", - "desc": "This code defines a function to tokenize and encode tweet texts using the pretrained BERT tokenizer, then applies it to the training and validation datasets for both tweets and keywords, and combines the encoded inputs into dictionaries.", + "desc": "The snippet processes the text by removing stopwords, punctuation, and duplicate tweets while lemmatizing the tokens using SpaCy, and then collects the cleaned text 
and labels into separate lists.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.8425683 + "predicted_subclass_probability": 0.9552063 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 18, - "code": "train_dataset = tf.data.Dataset.from_tensor_slices(\n (dict(train_tweets_encoded), y_train)\n)\n\nval_dataset = tf.data.Dataset.from_tensor_slices(\n (dict(validation_tweets_encoded), y_val)\n)\n\ntrain_multi_input_dataset = tf.data.Dataset.from_tensor_slices(\n (train_inputs_encoded, y_train)\n)\n\nval_multi_input_dataset = tf.data.Dataset.from_tensor_slices(\n (validation_inputs_encoded, y_val)\n)\n", + "cell_id": 5, + "code": "tests = []\nfor doc in nlp.pipe(test_raw.text):\n tokens = [token.lemma_ for token in doc if token.is_stop is False and token.is_punct is False and token.is_space is False]\n tests.append(' '.join(tokens))", "class": "Data Transform", - "desc": "This code creates TensorFlow datasets from the tokenized and encoded training and validation data for both single input (tweets only) and multi-input (tweets and keywords) scenarios.", + "desc": "The snippet processes the test set text by removing stopwords, punctuation, and spaces while lemmatizing the tokens using SpaCy, and collecting the cleaned text into a list.", "testing": { "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.5969537 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9769126 }, - "cluster": 1 + "cluster": 6 }, { - "cell_id": 19, - "code": "tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n tokenizer=tweet_preprocessor, min_df=1, ngram_range=(1, 1), norm=\"l2\"\n)\n\ntrain_vectors = tfidf_vectorizer.fit_transform(raw_documents=x_train[\"text\"]).toarray()\nvalidation_vectors = tfidf_vectorizer.transform(x_val[\"text\"]).toarray()", + "cell_id": 6, + "code": "tf_idf = TfidfVectorizer(max_features=10000).fit(texts)\ntrain = tf_idf.transform(texts)\ntest = tf_idf.transform(tests)", "class": "Data Transform", - "desc": "This code creates and fits a TF-IDF vectorizer to the training tweet texts and transforms both the training and validation tweet texts into TF-IDF vectors.", + "desc": "The snippet creates a TF-IDF Vectorizer with a maximum of 10,000 features, fits it on the cleaned training texts, and transforms both the training and test texts into TF-IDF feature representations.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, - "predicted_subclass_probability": 0.9926306 + "predicted_subclass_probability": 0.9555307 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 35, - "code": "test_tweets_encoded = tokenize_encode(test_df[\"text\"].to_list(), max_length_tweet)\ntest_inputs_encoded = dict(test_tweets_encoded)\ntest_dataset = tf.data.Dataset.from_tensor_slices(test_inputs_encoded)\n\ntest_keywords_encoded = tokenize_encode(test_df[\"keyword\"].to_list(), max_length_keyword)\ntest_inputs_encoded[\"keywords\"] = test_keywords_encoded[\"input_ids\"]\ntest_multi_input_dataset = tf.data.Dataset.from_tensor_slices(test_inputs_encoded)", + "cell_id": 7, + "code": "X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.3)", "class": "Data Transform", - "desc": "This code tokenizes and encodes the tweets and keywords from the test dataset using the pre-trained BERT tokenizer, and then creates TensorFlow datasets for both single input (tweets 
only) and multi-input scenarios.", + "desc": "The snippet splits the TF-IDF transformed training data and labels into training and validation sets with 70% training data and 30% test data using the `train_test_split` method from sklearn.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.78393567 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.99791616 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 5, - "code": "print(train_df.info())\n\nprint(\"\")\nprint(\"train rows:\", len(train_df.index))\nprint(\"test rows:\", len(test_df.index))", - "class": "Exploratory Data Analysis", - "desc": "This code prints out the information about the training DataFrame and the number of rows in both the training and testing DataFrames.", + "cell_id": 0, + "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "class": "Imports and Environment", + "desc": "The snippet imports essential libraries such as NumPy and pandas, and lists the available files in the specified Kaggle input directory using the os module.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9827071 + "subclass": "list_files", + "subclass_id": 88, + "predicted_subclass_probability": 0.99921954 }, - "cluster": 7 + "cluster": 0 }, { - "cell_id": 6, - "code": "print(\"label counts:\")\ntrain_df.target.value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code prints the counts of each unique value in the 'target' column of the training DataFrame.", + "cell_id": 2, + "code": "import hashlib\n\nimport spacy\nimport sklearn\nfrom sklearn.svm import SVC\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nfrom sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import f1_score", + "class": "Imports and Environment", + "desc": "The snippet imports various libraries and modules including hashlib, spacy, and multiple functionalities from sklearn for text processing, model building, and evaluation.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.99948514 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99932015 }, - "cluster": 9 + "cluster": 0 }, { - "cell_id": 7, - "code": "print(\"train precentage of 
nulls:\")\nprint(round(train_df.isnull().sum() / train_df.count() * 100, 2))", - "class": "Exploratory Data Analysis", - "desc": "This code calculates and prints the percentage of null values in each column of the training DataFrame.", + "cell_id": 12, + "code": "y_hat = svc.predict(test)", + "class": "Model Evaluation", + "desc": "The snippet generates predictions on the test data using the fitted Support Vector Classifier.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.9960549 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99424076 }, - "cluster": 7 + "cluster": 2 }, { "cell_id": 8, - "code": "print(\"test precentage of nulls:\")\nprint(round(test_df.isnull().sum() / test_df.count() * 100, 2))", - "class": "Exploratory Data Analysis", - "desc": "This code calculates and prints the percentage of null values in each column of the testing DataFrame.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.99783856 - }, - "cluster": 7 - }, { - "cell_id": 9, - "code": "# check that we don't have any keywords appearing in one set and not the other\ntrain_keywords = set(train_df[\"keyword\"].dropna())\ntest_keywords = set(test_df[\"keyword\"].dropna())\n\nall_keywords = train_keywords.union(test_keywords)\nunique_test_keywords = all_keywords - train_keywords\nunique_train_keywords = all_keywords - test_keywords\n\nprint(f\"unique_test_keywords: {unique_test_keywords}\")\nprint(f\"unique_train_keywords: {unique_train_keywords}\")", - "class": "Exploratory Data Analysis", - "desc": "This code identifies keywords that are unique to either the training or testing datasets and prints those unique keywords.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_unique_values", - "subclass_id": 57, - "predicted_subclass_probability": 0.966651 - }, - "cluster": -1 - }, { - "cell_id": 13, - "code": "# Have a look at how the TweetProcessor is doing\nfor tweet in train_df[100:120][\"text\"]:\n print(\"original: \", tweet)\n print(\"processed: \", tweet_preprocessor.preprocess_tweet(tweet))\n print(\"\")", - "class": "Exploratory Data Analysis", - "desc": "This code prints out original and processed versions of a sample of tweets from the training DataFrame to show the effect of the preprocessing.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.7224992 - }, - "cluster": -1 - }, { - "cell_id": 0, - "code": "!pip install -q transformers ekphrasis keras-tuner", - "class": "Imports and Environment", - "desc": "This line installs the required packages 'transformers', 'ekphrasis', and 'keras-tuner' quietly.", + "code": "param_grid = {\n \"gamma\" : [0.001,0.01,1,10,100],\n \"C\":[0.001,0.01,1,10,100],\n 'kernel' : ['poly', \"linear\", 'sigmoid', 'rbf']\n}", + "class": "Model Training", + "desc": "The snippet defines a parameter grid for a Support Vector Classifier with various values for gamma, C, and kernel to be used for hyperparameter tuning.", "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.99613416 + "class": "Model_Train", + "subclass": "define_search_space", + "subclass_id": 5, + "predicted_subclass_probability": 0.99403256 }, 
"cluster": -1 }, { - "cell_id": 1, - "code": "import numpy as np\nimport pandas as pd\nimport urllib\nimport statistics\nimport math\nimport pprint\nimport sklearn\nfrom sklearn.linear_model import LogisticRegression\nimport tensorflow as tf\nimport tensorflow.keras as keras\nfrom tensorflow.keras.layers import (\n Input,\n Dense,\n Embedding,\n Flatten,\n Dropout,\n GlobalMaxPooling1D,\n GRU,\n concatenate,\n)\nfrom tensorflow.keras.callbacks import EarlyStopping\nfrom transformers import (\n DistilBertTokenizerFast,\n TFDistilBertModel,\n DistilBertConfig,\n)\n\nfrom ekphrasis.classes.preprocessor import TextPreProcessor\nfrom ekphrasis.classes.tokenizer import Tokenizer\nfrom ekphrasis.dicts.emoticons import emoticons\nfrom ekphrasis.dicts.noslang.slangdict import slangdict\n\nimport kerastuner", - "class": "Imports and Environment", - "desc": "This code imports various libraries and modules needed for data manipulation, statistical operations, machine learning, text preprocessing, and neural network training.", + "cell_id": 9, + "code": "svc = SVC()\ngrid_searcher = GridSearchCV(svc, param_grid, cv=5, scoring='f1')\ngrid_searcher.fit(X_train, y_train)\ngrid_searcher.best_params_", + "class": "Model Training", + "desc": "The snippet initializes a Support Vector Classifier and performs a grid search with 5-fold cross-validation to find the best hyperparameters based on the F1 score, and then fits the model to the training data.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9993593 + "class": "Model_Train", + "subclass": "train_on_grid", + "subclass_id": 6, + "predicted_subclass_probability": 0.99040365 }, "cluster": 0 }, { - "cell_id": 2, - "code": "def print_metrics(model, x_train, y_train, x_val, y_val):\n train_acc = dict(model.evaluate(x_train, y_train, verbose=0, return_dict=True))[\n \"accuracy\"\n ]\n val_acc = dict(model.evaluate(x_val, y_val, verbose=0, return_dict=True))[\n \"accuracy\"\n ]\n\n val_preds = model.predict(x_val)\n val_preds_bool = val_preds >= 0.5\n\n print(\"\")\n print(f\"Training Accuracy: {train_acc:.2%}\")\n print(f\"Validation Accuracy: {val_acc:.2%}\")\n print(\"\")\n print(f\"Validation f1 score: {sklearn.metrics.f1_score(val_preds_bool, y_val):.2%}\")", - "class": "Model Evaluation", - "desc": "This function evaluates the provided model's training and validation accuracy, makes predictions on the validation data, and prints out the training accuracy, validation accuracy, and validation f1 score.", + "cell_id": 10, + "code": "best_params= {'C': 1, 'gamma': 0.001, 'kernel': 'linear'}\nsvc = SVC(**best_params)\nscores = cross_val_score(svc,X_train, y_train, cv=5, scoring='f1')\nprint(scores)\nprint(sum(scores)/len(scores))", + "class": "Model Training", + "desc": "The snippet initializes a Support Vector Classifier with the best parameters found from the grid search, performs 5-fold cross-validation on the training data to evaluate the F1 scores, and prints the individual and average scores.", "testing": { "class": "Model_Train", "subclass": "compute_train_metric", "subclass_id": 28, - "predicted_subclass_probability": 0.5876385 + "predicted_subclass_probability": 0.9838514 }, "cluster": 0 }, { - "cell_id": 29, - "code": "# tuner.results_summary()", - "class": "Model Evaluation", - "desc": "This commented-out line, if executed, would print a summary of the results from the Hyperband tuner, including the best-performing hyperparameters.", - "testing": { - 
"class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9971831 - }, - "cluster": 1 - }, { - "cell_id": 30, - "code": "best_model = tuner.get_best_models()[0]\n# best_model.summary()\nprint(\"\")\nbest_arch_hp = tuner.get_best_hyperparameters()[0]\npprint.pprint(best_arch_hp.values, indent=4)\nprint(\"\")\n\nprint_metrics(best_model, train_inputs, y_train, validation_inputs, y_val)", - "class": "Model Evaluation", - "desc": "This code retrieves and prints the best-performing model's hyperparameters obtained from the Hyperband tuning and evaluates the model by printing its training and validation performance metrics.", + "cell_id": 11, + "code": "val_texts = [\"A happy day!\", 'An earthquake happened!']\nval_data = tf_idf.transform(val_texts)\nsvc.fit(X_train, y_train)\nprint(svc.predict(val_data))", + "class": "Model Training", + "desc": "The snippet transforms sample validation texts using the TF-IDF Vectorizer, fits the Support Vector Classifier on the training data, and prints the model's predictions for the sample validation texts.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.43187767 + "predicted_subclass_probability": 0.6037878 }, "cluster": 0 - }, { - "cell_id": 3, - "code": "# Using DistilBERT:\nmodel_class, tokenizer_class, pretrained_weights = (TFDistilBertModel, DistilBertTokenizerFast, 'distilbert-base-uncased')\n\npretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)\n\ndef get_pretrained_bert_model(config=pretrained_weights):\n if not config:\n config = DistilBertConfig(num_labels=2)\n\n return model_class.from_pretrained(pretrained_weights, config=config)\n\n", - "class": "Model Training", - "desc": "This code sets up and retrieves a pre-trained DistilBERT model and tokenizer using the specified pre-trained weights, and defines a function to get this model with a provided configuration.", + }], + "notebook_id": 3, + "notebook_name": "baseline-svc-79.ipynb" + }, { + "cells": [{ + "cell_id": 39, + "code": "submission = pd.DataFrame({'id':test['id'].values.tolist(),'target':predictions})\nsubmission.to_csv('submission.csv',index=False)", + "class": "Data Export", + "desc": "This code snippet creates a DataFrame containing the test set IDs and their corresponding prediction labels, and then exports this DataFrame to a CSV file named \"submission.csv\" without including the index.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9911644 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9992361 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 20, - "code": "# I obtained the value of C by experimenting with LogisticRegressionCV but I'm leaving it out for brevity\nlogisticRegressionClf = LogisticRegression(n_jobs=-1, C=2.78)\nlogisticRegressionClf.fit(train_vectors, y_train)\n\ndef print_metrics_sk(clf, x_train, y_train, x_val, y_val):\n print(f\"Train Accuracy: {clf.score(x_train, y_train):.2%}\")\n print(f\"Validation Accuracy: {clf.score(x_val, y_val):.2%}\")\n print(\"\")\n print(f\"f1 score: {sklearn.metrics.f1_score(y_val, clf.predict(x_val)):.2%}\")\n\nprint_metrics_sk(logisticRegressionClf, train_vectors, y_train, validation_vectors, y_val)", - "class": "Model Training", - "desc": "This code fits a logistic regression model to the TF-IDF vectorized training 
data, and then evaluates and prints out its training and validation accuracy, as well as the validation f1 score using a helper function.", + "cell_id": 1, + "code": "train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest = pd.read_csv('../input/nlp-getting-started/test.csv')", + "class": "Data Extraction", + "desc": "This code snippet loads the training and test datasets from CSV files using pandas.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9330293 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9997564 }, "cluster": 0 }, { - "cell_id": 21, - "code": "feature_extractor = get_pretrained_bert_model()\n\n# Run a forward pass on the tokenized inputs\n# model_outputs = feature_extractor(\n# train_tweets_encoded[\"input_ids\"], train_tweets_encoded[\"attention_mask\"]\n# )\nmodel_outputs = feature_extractor.predict(\n train_dataset.batch(32)\n)\n# BERT's sentence representation can be retrieved from a hidden vector at index 0 in the sequence, \n# (where the special token CLS was prepended by the tokenizer)\ntrain_sentence_vectors = model_outputs.last_hidden_state[:, 0, :]\n\n# The rest of the sequence contains the embeddings \n# (modified by successive layers of self-attention) for each token\ntrain_word_vectors = model_outputs.last_hidden_state[:, 1:, :]\n\n# And the same again for the validation set\n# model_outputs = feature_extractor(\n# validation_tweets_encoded[\"input_ids\"], validation_tweets_encoded[\"attention_mask\"]\n# )\nmodel_outputs = feature_extractor.predict(\n val_dataset.batch(32)\n)\nvalidation_sentence_vectors = model_outputs.last_hidden_state[:, 0, :]\nvalidation_word_vectors = model_outputs.last_hidden_state[:, 1:, :]", - "class": "Model Training", - "desc": "This code retrieves sentence and word vectors by running a pre-trained BERT model's forward pass on the tokenized training and validation datasets, extracting the BERT's sentence representation and embeddings for each token.", + "cell_id": 5, + "code": "string.punctuation", + "class": "Data Transform", + "desc": "This code snippet refers to the `string.punctuation` attribute, which contains a string of all the punctuation characters, and is typically used for text cleaning or preprocessing tasks.", "testing": { - "class": "Model_Train", - "subclass": "find_best_model_class", - "subclass_id": 3, - "predicted_subclass_probability": 0.2968278 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.73409814 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 22, - "code": "logisticRegressionClf = LogisticRegression(n_jobs=-1, class_weight=class_weights)\nlogisticRegressionClf.fit(train_sentence_vectors, y_train)\n\nprint_metrics_sk(\n logisticRegressionClf,\n train_sentence_vectors,\n y_train,\n validation_sentence_vectors,\n y_val,\n)", - "class": "Model Training", - "desc": "This code retrains a logistic regression model using the sentence vectors obtained from the BERT model, applies class weights to account for class imbalance, and evaluates the model's performance by printing its training and validation accuracies and the validation f1 score.", + "cell_id": 6, + "code": "def remove_URL(text):\n url = re.compile(r\"https?://\\S+|www\\.\\S+\")\n return url.sub(r\"\", text)\n\ndef remove_punct(text):\n translator = str.maketrans(\"\", \"\", string.punctuation)\n return 
text.translate(translator)", + "class": "Data Transform", + "desc": "This code snippet defines two functions: `remove_URL`, which removes URLs from text using regex, and `remove_punct`, which removes punctuation from text using the `translate` method with a translation table.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.99934644 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.7244959 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 23, - "code": "def create_gru_model() -> keras.Model:\n\n model = keras.Sequential()\n model.add(keras.layers.InputLayer(input_shape=train_word_vectors.shape[1:]))\n model.add(GRU(32, return_sequences=True))\n model.add(GlobalMaxPooling1D())\n model.add(Dense(1, activation=\"sigmoid\"))\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\nmodel = create_gru_model()\n\nhistory = model.fit(\n train_word_vectors,\n y_train,\n validation_data=(validation_word_vectors, y_val),\n class_weight=class_weights,\n epochs=20,\n verbose=0,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(model, train_word_vectors, y_train, validation_word_vectors, y_val)", - "class": "Model Training", - "desc": "This code defines and compiles a GRU-based neural network model, trains it on the word vectors from the BERT model with class weights to handle imbalances, and then prints out the model's training and validation performance metrics.", + "cell_id": 7, + "code": "#regex pattern to remove links\npattern = re.compile(r\"https?://(\\S+|www)\\.\\S+\")\n#for train\nfor t in train.text:\n matches = pattern.findall(t)\n for match in matches:\n print(t)\n print('After Transformation:')\n print(pattern.sub(r\"\", t))\n if len(matches) > 0:\n break", + "class": "Data Transform", + "desc": "This code snippet creates a regex pattern to remove URLs from text and applies it to the 'text' column of the training data, then prints the text before and after transformation for samples containing URLs.", "testing": { - "class": "Model_Train", - "subclass": "train_on_grid", - "subclass_id": 6, - "predicted_subclass_probability": 0.509667 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9533664 }, - "cluster": 0 + "cluster": 8 }, { - "cell_id": 24, - "code": "def create_multi_input_model() -> keras.Model:\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=feature_extractor.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(1)(keyword_features)\n\n tweet_classification_vectors = keras.Input((train_sentence_vectors.shape[1],), name=\"tweets\")\n tweet_features = Dense(1, activation='relu')(tweet_classification_vectors) \n\n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, tweet_classification_vectors], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return 
model\n\n\nmodel = create_multi_input_model()\n\ntrain_inputs = {\"keywords\" : train_keywords_encoded[\"input_ids\"], \"tweets\" : train_sentence_vectors}\nvalidation_inputs = {\"keywords\" : validation_keywords_encoded[\"input_ids\"], \"tweets\" : validation_sentence_vectors}\n\nhistory = model.fit(\n train_inputs,\n y_train,\n validation_data=(validation_inputs, y_val),\n class_weight=class_weights,\n epochs=20,\n verbose=0,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\n\nprint_metrics(model, train_inputs, y_train, validation_inputs, y_val)", - "class": "Model Training", - "desc": "This code defines a multi-input neural network model that combines keyword embeddings and BERT's sentence embeddings, trains it with class weights to account for imbalances, and evaluates its training and validation performance metrics.", + "cell_id": 8, + "code": "#for test:\nfor t in test.text:\n matches = pattern.findall(t)\n for match in matches:\n print(t)\n print('After Transformation:')\n print(pattern.sub(r\"\", t))\n if len(matches) > 0:\n break", + "class": "Data Transform", + "desc": "This code snippet applies the previously defined regex pattern to the 'text' column of the test data to remove URLs and prints the text before and after transformation for samples containing URLs.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9820961 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9523689 }, - "cluster": 0 + "cluster": 8 }, { - "cell_id": 25, - "code": "def create_multi_input_rnn_model() -> keras.Model:\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=feature_extractor.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(1)(keyword_features)\n\n tweet_token_embeddings = Input(train_word_vectors.shape[1:], name=\"tweets\")\n tweet_features = GRU(32, return_sequences=True)(tweet_token_embeddings)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n tweet_features = Dense(1, activation='relu')(tweet_features) \n\n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, tweet_token_embeddings], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\n\nmodel = create_multi_input_rnn_model()\n\ntrain_inputs = {\"keywords\" : train_keywords_encoded[\"input_ids\"], \"tweets\" : train_word_vectors}\nvalidation_inputs = {\"keywords\" : validation_keywords_encoded[\"input_ids\"], \"tweets\" : validation_word_vectors}\n\nhistory = model.fit(\n train_inputs,\n y_train,\n validation_data=(validation_inputs, y_val),\n class_weight=class_weights,\n epochs=20,\n verbose=0,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(model, train_inputs, y_train, validation_inputs, y_val)", - "class": "Model Training", - "desc": "This code defines a multi-input RNN-based model that combines keyword embeddings and tweet token embeddings processed by a GRU layer, 
trains the model with class weights to address imbalance, and evaluates and prints its training and validation performance metrics.", + "cell_id": 9, + "code": "#preprocess data frames:\n#train\ntrain[\"text\"] = train.text.map(remove_URL) \ntrain[\"text\"] = train.text.map(remove_punct)\n#test\ntest[\"text\"] = test.text.map(remove_URL) \ntest[\"text\"] = test.text.map(remove_punct)", + "class": "Data Transform", + "desc": "This code snippet preprocesses the 'text' column in both the training and test datasets by mapping the `remove_URL` and `remove_punct` functions to remove URLs and punctuation from the text.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.8343507 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.90332294 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 26, - "code": "def create_candidate_model_with_fx(hp: kerastuner.HyperParameters) -> keras.Model:\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=feature_extractor.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(hp.Choice(\"keyword_units\", values=[1, 8, 16, 32], default=1))(keyword_features)\n\n tweet_token_embeddings = Input(train_word_vectors.shape[1:], name=\"tweets\")\n \n tweet_features = GRU(hp.Choice(\"GRU_units\", values=[8, 16, 32, 64, 128], default=32), return_sequences=True)(tweet_token_embeddings)\n tweet_features = Dropout(hp.Float(\"GRU_dropout\", min_value=0.0, max_value=0.5, step=0.1))(tweet_features)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n \n for i in range(hp.Int(\"num_layers\", min_value=0, max_value=3, step=1)):\n tweet_features = Dense(hp.Choice(\"layer_\" + str(i) + \"_units\", values=[2, 8, 16, 32, 64, 128, 256]), activation=\"relu\")(tweet_features)\n tweet_features = Dropout(hp.Float(\"layer_\" + str(i) + \"_dropout\", min_value=0.0, max_value=0.5, step=0.1))(tweet_features)\n \n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, tweet_token_embeddings], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\ntrain_inputs = {\"keywords\" : train_keywords_encoded[\"input_ids\"], \"tweets\" : train_word_vectors}\nvalidation_inputs = {\"keywords\" : validation_keywords_encoded[\"input_ids\"], \"tweets\" : validation_word_vectors}\n", - "class": "Model Training", - "desc": "This code defines a candidate multi-input model for hyperparameter tuning using Keras Tuner, which includes keyword embeddings and tweet token embeddings processed by a GRU layer followed by configurable dense layers and dropout, and compiles it for training.", + "cell_id": 10, + "code": "# remove stopwords\nnltk.download('stopwords')\n\nstop = set(stopwords.words(\"english\"))\n\ndef remove_stopwords(text):\n filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]\n return \" \".join(filtered_words)", + "class": "Data Transform", + "desc": "This code snippet downloads NLTK stopwords, defines a set of these English stopwords, and provides a function `remove_stopwords` to filter out these stopwords from 
text.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.37123346 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9945669 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 27, - "code": "# Hyperband Tuning\nMAX_EPOCHS = 10\nFACTOR = 3\nITERATIONS = 3\n\nprint(f\"Number of models in each bracket: {math.ceil(1 + math.log(MAX_EPOCHS, FACTOR))}\")\nprint(f\"Number of epochs over all trials: {round(ITERATIONS * (MAX_EPOCHS * (math.log(MAX_EPOCHS, FACTOR) ** 2)))}\")", - "class": "Model Training", - "desc": "This code calculates and prints the number of models in each bracket and the total number of epochs over all trials for Hyperband tuning based on the specified maximum epochs, factor, and iterations.", + "cell_id": 11, + "code": "stop", + "class": "Data Transform", + "desc": "This code snippet refers to the previously defined `stop` set which contains the downloaded English stopwords from NLTK, used for text preprocessing.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.5920417 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9975351 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 28, - "code": "tuner = kerastuner.Hyperband(\n create_candidate_model_with_fx,\n max_epochs=MAX_EPOCHS,\n hyperband_iterations=ITERATIONS, \n factor=FACTOR, \n objective=\"val_accuracy\",\n directory=\"hyperparam-search\",\n project_name=\"architecture-hyperband\",\n)\n\ntuner.search(\n train_inputs,\n y_train,\n validation_data=(validation_inputs, y_val),\n class_weight=class_weights,\n epochs=10,\n verbose=1,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=3,\n restore_best_weights=True,\n )\n ],\n)\n", - "class": "Model Training", - "desc": "This code initializes a Hyperband tuner for hyperparameter optimization of the multi-input model and launches a search over specified epochs and iterations, utilizing early stopping and class weights.", + "cell_id": 12, + "code": "#train\ntrain[\"text\"] = train.text.map(remove_stopwords)\n#test\ntest[\"text\"] = test.text.map(remove_stopwords)", + "class": "Data Transform", + "desc": "This code snippet applies the `remove_stopwords` function to the 'text' column in both the training and test datasets to remove stopwords from the text.", "testing": { - "class": "Model_Train", - "subclass": "train_on_grid", - "subclass_id": 6, - "predicted_subclass_probability": 0.7966217 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.76800144 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 31, - "code": "# To create a baseline for the simplest possible fine-tuned BERT\ndef create_bert_simple_for_ft():\n input_ids = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"input_ids\")\n attention_mask = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"attention_mask\")\n\n pretrained_bert_model = get_pretrained_bert_model()\n bert_outputs = pretrained_bert_model(input_ids, attention_mask)\n\n prediction = Dense(1, activation=\"sigmoid\")(bert_outputs.last_hidden_state[:, 0, :])\n return keras.Model(inputs=[input_ids, attention_mask], outputs=prediction)\n\nmodel = create_bert_simple_for_ft()\n\nmodel.compile(\n optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),\n 
loss=\"binary_crossentropy\",\n metrics=[\"accuracy\"],\n)\n\nmodel.fit(\n train_dataset.batch(32),\n validation_data=val_dataset.batch(32),\n class_weight=class_weights,\n epochs=20,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(\n model, dict(train_tweets_encoded), y_train, dict(validation_tweets_encoded), y_val\n)\n", - "class": "Model Training", - "desc": "This code defines, compiles, and trains a baseline fine-tuned BERT model on the tokenized training data, applying class weights and early stopping, and then evaluates the model by printing its training and validation performance metrics.", + "cell_id": 19, + "code": "# Split dataset into training and validation set\nX = train.text\ny = train.target\ntrain_sentences, val_sentences , train_labels, val_labels = train_test_split(X, y, test_size=0.2)", + "class": "Data Transform", + "desc": "This code snippet splits the training dataset into training and validation sets using the `train_test_split` method from Scikit-learn, with 20% of the data reserved for validation.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.98817086 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9967854 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 32, - "code": "def create_bert_rnn_for_ft():\n \n pretrained_bert_model = get_pretrained_bert_model()\n \n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=pretrained_bert_model.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(1)(keyword_features)\n\n input_ids = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"input_ids\")\n attention_mask = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"attention_mask\")\n bert_outputs = pretrained_bert_model(input_ids, attention_mask)\n\n bert_token_embeddings = bert_outputs.last_hidden_state[:, 1:, :]\n tweet_features = GRU(32, return_sequences=True)(bert_token_embeddings)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n\n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, input_ids, attention_mask], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(learning_rate=5e-5),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\nmodel = create_bert_rnn_for_ft()\n\nmodel.fit(\n train_multi_input_dataset.batch(32),\n validation_data=val_multi_input_dataset.batch(32),\n epochs=20,\n class_weight=class_weights,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=3,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(\n model, train_inputs_encoded, y_train, validation_inputs_encoded, y_val\n)", - "class": "Model Training", - "desc": "This code defines, compiles, and trains a multi-input BERT-RNN model, which combines keyword embeddings and BERT token embeddings processed by a GRU layer, applying class weights and early stopping, and evaluates the model by printing its training and validation performance metrics.", + "cell_id": 20, + "code": "#train/val\ntrain_sentences = 
train_sentences.to_numpy()\ntrain_labels = train_labels.to_numpy()\nval_sentences = val_sentences.to_numpy()\nval_labels = val_labels.to_numpy()", + "class": "Data Transform", + "desc": "This code snippet converts the training and validation sentences and labels from pandas Series to NumPy arrays using the `to_numpy` method.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.7985944 + "class": "Data_Transform", + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.9735998 }, - "cluster": 0 + "cluster": 8 }, { - "cell_id": 33, - "code": "def create_model_candidate() -> keras.Model:\n pretrained_bert_model = get_pretrained_bert_model()\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=pretrained_bert_model.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(best_arch_hp.get(\"keyword_units\"))(keyword_features)\n\n input_ids = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"input_ids\")\n attention_mask = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"attention_mask\")\n bert_outputs = pretrained_bert_model(input_ids, attention_mask)\n bert_token_embeddings = bert_outputs.last_hidden_state[:, 1:, :]\n tweet_features = GRU(best_arch_hp.get(\"GRU_units\"), return_sequences=True)(bert_token_embeddings)\n tweet_features = Dropout(best_arch_hp.get(\"GRU_dropout\"))(tweet_features)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n \n for i in range(best_arch_hp.get(\"num_layers\")):\n tweet_features = Dense(best_arch_hp.get(\"layer_\" + str(i) + \"_units\"), activation=\"relu\")(tweet_features)\n tweet_features = Dropout(best_arch_hp.get(\"layer_\" + str(i) + \"_dropout\"))(tweet_features)\n \n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, input_ids, attention_mask], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(learning_rate=5e-5),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n", - "class": "Model Training", - "desc": "This code defines a model candidate using the best hyperparameters obtained from the Hyperband tuner, which combines keyword embeddings and BERT token embeddings processed by a GRU layer, with additional dense layers and dropout as specified by the best hyperparameters, and compiles it for training.", + "cell_id": 21, + "code": "#test\ntest_sentences = test.text.to_numpy()", + "class": "Data Transform", + "desc": "This code snippet converts the 'text' column of the test dataset from a pandas Series to a NumPy array using the `to_numpy` method.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.90136576 + "class": "Data_Transform", + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.98378074 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 34, - "code": "model = create_model_candidate()\n\nhistory = model.fit(\n train_multi_input_dataset.batch(32),\n validation_data=val_multi_input_dataset.batch(32),\n epochs=6,\n class_weight=class_weights,\n callbacks=[\n keras.callbacks.EarlyStopping(\n 
monitor=\"val_accuracy\", restore_best_weights=True\n )\n ],\n)\n\nbest_epoch = len(history.history[\"val_accuracy\"]) - 1\n\nprint_metrics(\n model, train_inputs_encoded, y_train, validation_inputs_encoded, y_val\n)", - "class": "Model Training", - "desc": "This code trains the model candidate using the multi-input dataset, applying class weights and early stopping to restore the best weights, and then evaluates the model by printing its training and validation performance metrics while noting the best epoch reached during training.", + "cell_id": 23, + "code": "# Tokenize\n# vectorize a text corpus by turning each text into a sequence of integers\n\ntokenizer = Tokenizer(num_words=num_unique_words)\ntokenizer.fit_on_texts(train_sentences) # fit only to training", + "class": "Data Transform", + "desc": "This code snippet initializes a `Tokenizer` from TensorFlow's Keras with a specified number of unique words and fits it on the training sentences to convert the text corpus into sequences of integers.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.99578863 + "predicted_subclass_probability": 0.6493907 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 36, - "code": "full_train_dataset = train_multi_input_dataset.concatenate(val_multi_input_dataset)\nmodel = create_model_candidate()\n\nmodel.fit(\n full_train_dataset.batch(32),\n epochs=best_epoch,\n class_weight=class_weights,\n)", - "class": "Model Training", - "desc": "This code concatenates the training and validation datasets into a full training dataset, reinitializes the model candidate, and then trains it using the full training dataset for the number of epochs determined to be the best during previous training.", + "cell_id": 24, + "code": "# Now each word has unique index\nword_index = tokenizer.word_index\nword_index", + "class": "Data Transform", + "desc": "This code snippet retrieves the word index dictionary from the tokenizer, which maps each word to a unique integer index.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9959798 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.66235614 }, - "cluster": 0 - }], - "notebook_id": 2, - "notebook_name": "bert-feature-extraction-and-fine-tuning" - }, { - "cells": [{ - "cell_id": 26, - "code": "# SAVE SUBMISSION FILE\n\nsubmission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\nsubmission.target = flat_predictions\nsubmission.to_csv('submission.csv', index=False)", - "class": "Data Export", - "desc": "This code snippet saves the predictions to a submission file by reading a sample submission file, updating the `target` column with the predictions, and exporting the modified DataFrame to a CSV file.", + "cluster": 1 + }, { + "cell_id": 25, + "code": "#apply on train, validation, and test sentences\n\ntrain_sequences = tokenizer.texts_to_sequences(train_sentences)\nval_sequences = tokenizer.texts_to_sequences(val_sentences)\ntest_sequences = tokenizer.texts_to_sequences(test_sentences)", + "class": "Data Transform", + "desc": "This code snippet converts the training, validation, and test sentences into sequences of integers based on the fitted tokenizer's word index using the `texts_to_sequences` method.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 
0.99789846 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.5915242 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 0, - "code": "# LOADING THE TRAIN DATA\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename)) \ndata = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\ndata.sample(10)", - "class": "Data Extraction", - "desc": "This code snippet imports necessary libraries, lists files in the specified input directory, and reads a CSV file into a pandas DataFrame while displaying a random sample of 10 rows.", + "cell_id": 27, + "code": "# Pad the sequences to have the same length\nmax_length = 15 #arbitrary number\n\ntrain_padded = pad_sequences(train_sequences, maxlen=max_length, padding=\"post\", truncating=\"post\") #post-> 0\nval_padded = pad_sequences(val_sequences, maxlen=max_length, padding=\"post\", truncating=\"post\")\ntest_padded = pad_sequences(test_sequences, maxlen=max_length, padding=\"post\", truncating=\"post\")", + "class": "Data Transform", + "desc": "This code snippet pads the tokenized sequences of the training, validation, and test sets to a uniform length of 15 using the `pad_sequences` method from TensorFlow's Keras, adding zeros at the end if necessary.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9061924 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.717097 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 7, - "code": "# GET THE LISTS OF TWEETS AND THEIR LABELS\n\nsentences = data.text.values\nlabels =data.target.values", - "class": "Data Extraction", - "desc": "This code snippet extracts the text and target values from the data DataFrame into separate lists for tweets and their labels.", + "cell_id": 31, + "code": "# flip (key, value)\nreverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])", + "class": "Data Transform", + "desc": "This code snippet creates a reverse map from the word index dictionary by flipping keys and values, mapping indices back to words.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "define_variables", "subclass_id": 77, - "predicted_subclass_probability": 0.9971167 + "predicted_subclass_probability": 0.9888599 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 23, - "code": "# PREPARE TEST DATA\n\n# Load the dataset into a pandas dataframe.\ntest_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')\n\n# Report the number of sentences.\nprint('Number of test sentences: {:,}\\n'.format(test_data.shape[0]))\n\n# Create sentence and label lists\nsentences = test_data.text.values\n#labels = test_data.target.values\n\n# Tokenize all of the sentences and map the tokens to thier word IDs.\ninput_ids = []\nattention_masks = []\n\n# For every sentence...\nfor sent in sentences:\n # `encode_plus` will:\n # (1) Tokenize the sentence.\n # (2) Prepend the `[CLS]` token to the start.\n # (3) Append the `[SEP]` token to the end.\n # (4) Map tokens to their IDs.\n # (5) Pad or truncate the sentence to `max_length`\n # (6) Create attention masks for [PAD] tokens.\n encoded_dict = tokenizer.encode_plus(\n sent, # Sentence to encode.\n add_special_tokens = True, # Add 
'[CLS]' and '[SEP]'\n max_length = 64, # Pad & truncate all sentences.\n pad_to_max_length = True,\n return_attention_mask = True, # Construct attn. masks.\n return_tensors = 'pt', # Return pytorch tensors.\n )\n \n # Add the encoded sentence to the list. \n input_ids.append(encoded_dict['input_ids'])\n \n # And its attention mask (simply differentiates padding from non-padding).\n attention_masks.append(encoded_dict['attention_mask'])\n\n# Convert the lists into tensors.\ninput_ids = torch.cat(input_ids, dim=0)\nattention_masks = torch.cat(attention_masks, dim=0)\n#labels = torch.tensor(labels)\n\n# Set the batch size. \nbatch_size = 32 \n\n# Create the DataLoader.\nprediction_data = TensorDataset(input_ids, attention_masks, ) #labels\nprediction_sampler = SequentialSampler(prediction_data)\nprediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)", - "class": "Data Extraction", - "desc": "This code snippet prepares the test data by loading it into a pandas DataFrame, tokenizing the sentences, converting them into tensors, and creating a DataLoader for batching during prediction.", + "cell_id": 33, + "code": "#decoding\ndef decode(sequence):\n return \" \".join([reverse_word_index.get(idx, \"?\") for idx in sequence])", + "class": "Data Transform", + "desc": "This code snippet defines a function `decode` that converts a sequence of integers back into a readable text string using the `reverse_word_index` dictionary.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99884653 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.95032895 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 3, - "code": "# DROP DUPLICATE SAMPLES WITH CONFLICTING LABELS\n\nconflicting = conflicting_check.loc[(conflicting_check.target != 1) & (conflicting_check.target != 0)].index\ndata = data.drop(data[text.isin(conflicting)].index)\nprint ('Conflicting samples count:', conflicting.shape[0])", - "class": "Data Transform", - "desc": "This code snippet drops duplicate samples with conflicting target labels from the dataset and prints the count of such conflicting samples.", + "cell_id": 2, + "code": "train.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first few rows of the training dataset using the `head()` method of pandas DataFrame.", "testing": { - "class": "Data_Transform", - "subclass": "filter", - "subclass_id": 14, - "predicted_subclass_probability": 0.43520904 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997507 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 11, - "code": "# TOKENIZE ALL THE SENTENCES AND MAP THE TOKENS TO THEIR WORD IDs\n\ninput_ids = []\nattention_masks = []\n\n# For every sentence...\nfor sent in sentences:\n # `encode_plus` will:\n # (1) Tokenize the sentence.\n # (2) Prepend the `[CLS]` token to the start.\n # (3) Append the `[SEP]` token to the end.\n # (4) Map tokens to their IDs.\n # (5) Pad or truncate the sentence to `max_length`\n # (6) Create attention masks for [PAD] tokens.\n encoded_dict = tokenizer.encode_plus(\n sent, # Sentence to encode.\n add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n max_length = 64, # Pad & truncate all sentences.\n pad_to_max_length = True,\n return_attention_mask = True, # Construct attn. 
masks.\n return_tensors = 'pt', # Return pytorch tensors.\n )\n \n # Add the encoded sentence to the list. \n input_ids.append(encoded_dict['input_ids'])\n \n # And its attention mask (simply differentiates padding from non-padding).\n attention_masks.append(encoded_dict['attention_mask'])\n\n# Convert the lists into tensors.\ninput_ids = torch.cat(input_ids, dim=0)\nattention_masks = torch.cat(attention_masks, dim=0)\nlabels = torch.tensor(labels)\n\n# Print sentence 0, now as a list of IDs.\nprint('Original: ', sentences[0])\nprint('Token IDs:', input_ids[0])", - "class": "Data Transform", - "desc": "This code snippet tokenizes all sentences, maps the tokens to their word IDs, ensures they are of uniform length with padding, constructs attention masks, converts the lists of token IDs and attention masks into tensors, and prints the transformed data for the first sentence.", + "cell_id": 3, + "code": "train.shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the dimensions (number of rows and columns) of the training dataset using the `shape` attribute of the pandas DataFrame.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.99864405 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99950194 }, "cluster": 1 }, { - "cell_id": 12, - "code": "# SPLIT TRAIN DATA INTO TRAIN AND TEST SET\n# I used small test set (SPLIT=0,999) in order to train the model on the majority of the data, after all parameters were tuned\n# Use 0,9 or lower to train the model and look at the perfomance/ tune parameters\n\nSPLIT = 0.999\n\nfrom torch.utils.data import TensorDataset, random_split\n\n# Combine the training inputs into a TensorDataset.\ndataset = TensorDataset(input_ids, attention_masks, labels)\n\n# Create a 90-10 train-validation split.\n\n# Calculate the number of samples to include in each set.\ntrain_size = int(SPLIT * len(dataset))\nval_size = len(dataset) - train_size\n\n# Divide the dataset by randomly selecting samples.\ntrain_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n\nprint('{:>5,} training samples'.format(train_size))\nprint('{:>5,} validation samples'.format(val_size))", - "class": "Data Transform", - "desc": "This code snippet splits the dataset into training and validation sets based on a specified split ratio (`SPLIT=0.999`) and prints the number of samples in each set.", + "cell_id": 4, + "code": "print((train.target == 1).sum()) # Disaster\nprint((train.target == 0).sum()) # No Disaster", + "class": "Exploratory Data Analysis", + "desc": "This code snippet calculates and prints the number of disaster and non-disaster samples in the training dataset by summing the occurrences of '1' and '0' in the 'target' column, respectively.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.9402513 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.99325174 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 13, - "code": "# CREATE DATA ITERATOR TO SAVE MEMORY\n\nfrom torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n\n# The DataLoader needs to know our batch size for training, so we specify it \n# here. 
For fine-tuning BERT on a specific task, the authors recommend a batch \n# size of 16 or 32.\nbatch_size = 32\n\n# Create the DataLoaders for our training and validation sets.\n# We'll take training samples in random order. \ntrain_dataloader = DataLoader(\n train_dataset, # The training samples.\n sampler = RandomSampler(train_dataset), # Select batches randomly\n batch_size = batch_size # Trains with this batch size.\n )\n\n# For validation the order doesn't matter, so we'll just read them sequentially.\nvalidation_dataloader = DataLoader(\n val_dataset, # The validation samples.\n sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.\n batch_size = batch_size # Evaluate with this batch size.\n )", - "class": "Data Transform", - "desc": "This code snippet creates DataLoaders for both the training and validation datasets to manage data in batches, using random sampling for training and sequential sampling for validation, which helps in efficient memory usage.", + "code": "#Check\ntrain.text", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the 'text' column of the training dataset, likely to verify the preprocessing steps.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.98473305 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.53716487 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 25, - "code": "# PREPARE PREDICTIONS FOR SUBMISSION\n\n# Combine the results across all batches. \nflat_predictions = np.concatenate(predictions, axis=0)\n\n# For each sample, pick the label (0 or 1) with the higher score.\nflat_predictions = np.argmax(flat_predictions, axis=1).flatten()", - "class": "Data Transform", - "desc": "This code snippet combines the predictions across all batches and determines the final predicted label (0 or 1) for each sample by selecting the label with the highest score.", + "cell_id": 14, + "code": "# Count unique words\ndef counter_word(text_col):\n count = Counter()\n for text in text_col.values:\n for word in text.split():\n count[word] += 1\n return count\n\n\ncounter = counter_word(train.text)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet defines a function `counter_word` that counts the frequency of each unique word in a given text column using `Counter`, and then uses this function to count words in the 'text' column of the training dataset.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.5844843 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.94201726 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 1, - "code": "print ('Train data shape:', data.shape)", + "cell_id": 15, + "code": "len(counter)", "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the shape of the training data DataFrame to provide an overview of the dataset's dimensions.", + "desc": "This code snippet calculates and outputs the number of unique words in the 'text' column of the training dataset by finding the length of the `counter` object.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_shape", "subclass_id": 58, - "predicted_subclass_probability": 0.9942768 + "predicted_subclass_probability": 0.98219407 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 2, - "code": "# CHECK FOR 
DUPLICATE SAMPLES WITH CONFLICTING LABELS\n\ntext = data.text\nduplicates = data[text.isin(text[text.duplicated()])].sort_values(by='text')\n\n# If the mean target value is different from 0 or 1 - we have duplicate samples with conflicting value\nconflicting_check = pd.DataFrame(duplicates.groupby(['text']).target.mean())\nconflicting_check.sample(10)", + "cell_id": 16, + "code": "# counter", "class": "Exploratory Data Analysis", - "desc": "This code snippet identifies duplicate text samples in the data and checks for conflicting target labels by calculating the mean target value for each duplicated text.", + "desc": "This code snippet refers to the previously defined `counter` object which contains the frequency of each unique word in the 'text' column of the training dataset.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_duplicates", - "subclass_id": 38, - "predicted_subclass_probability": 0.8007328 + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9903482 }, "cluster": -1 }, { - "cell_id": 9, - "code": "# LOOK HOW THE TOKENIZER WORK\n\n# Print the original sentence.\nprint(' Original: ', sentences[0])\n\n# Print the sentence split into tokens.\nprint('Tokenized: ', tokenizer.tokenize(sentences[0]))\n\n# Print the sentence mapped to token ids.\nprint('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))", + "cell_id": 17, + "code": "counter.most_common(5)", "class": "Exploratory Data Analysis", - "desc": "This code snippet demonstrates how the BERT tokenizer processes text by showing the original sentence, its tokenized form, and the corresponding token IDs for the first sentence in the dataset.", + "desc": "This code snippet outputs the five most common words in the 'text' column of the training dataset using the `most_common` method of the `Counter` object.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.74620485 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9768316 }, - "cluster": 7 + "cluster": -1 }, { - "cell_id": 10, - "code": "# GET MAX LENGTH OF THE TWEETS\n\nmax_len = 0\n# For every sentence...\nfor sent in sentences:\n # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n input_ids = tokenizer.encode(sent, add_special_tokens=True)\n # Update the maximum sentence length.\n max_len = max(max_len, len(input_ids))\n\nprint('Max tweet length: ', max_len)", + "cell_id": 18, + "code": "num_unique_words = len(counter)\nnum_unique_words", "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates the maximum length of tokenized tweets in the dataset by tokenizing each sentence and updating the maximum length found.", + "desc": "This code snippet stores and outputs the number of unique words in the 'text' column of the training dataset by using the length of the `counter` object.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.858368 + "subclass": "count_unique_values", + "subclass_id": 54, + "predicted_subclass_probability": 0.9817012 }, - "cluster": 6 + "cluster": 3 }, { - "cell_id": 4, - "code": "# CONNECT KAGGLE GPU FOR SPEED UP\n\nimport tensorflow as tf\n# Get the GPU device name.\ndevice_name = tf.test.gpu_device_name()\nif device_name == '/device:GPU:0':\n print('Found GPU at: 
{}'.format(device_name))\nelse:\n raise SystemError('GPU device not found')", - "class": "Imports and Environment", - "desc": "This code snippet checks for the availability of a GPU device using TensorFlow and raises an error if a GPU is not found.", + "cell_id": 22, + "code": "train_sentences.shape, val_sentences.shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the dimensions of the training and validation sentences arrays to verify the data split.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9054986 + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9996413 }, - "cluster": 0 + "cluster": 6 }, { - "cell_id": 5, - "code": "# SPECIFY THE GPU AS THE TORCH DEVICE\n\nimport torch\nif torch.cuda.is_available(): \n # Tell PyTorch to use the GPU. \n device = torch.device(\"cuda\")\n print('There are %d GPU(s) available.' % torch.cuda.device_count())\n print('We will use the GPU:', torch.cuda.get_device_name(0))\nelse:\n print('No GPU available, using the CPU instead.')\n device = torch.device(\"cpu\")", - "class": "Imports and Environment", - "desc": "This code snippet checks for the availability of a GPU device using PyTorch and sets the device to either the GPU or CPU accordingly, providing information about the GPU if available.", + "cell_id": 26, + "code": "#Check\nprint(train_sentences[10:15])\nprint(train_sequences[10:15])", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints a subset of the original training sentences and their corresponding sequences of integers to verify the tokenization process.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.9812268 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9992624 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 6, - "code": "# INSTALL THE TRANSFORMERS PACKAGE TO GET A PYTORCH INTERFACE FOR BERT\n!pip install transformers", - "class": "Imports and Environment", - "desc": "This code snippet installs the Transformers package, which provides a PyTorch interface for BERT and other transformer models.", + "cell_id": 28, + "code": "#Check\ntrain_padded.shape, val_padded.shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the dimensions of the padded training and validation sequences to ensure the padding process resulted in sequences of the expected uniform length.", "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.989985 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9994609 }, - "cluster": 1 + "cluster": 6 }, { - "cell_id": 8, - "code": "# LOAD THE BERT TOKENIZER\n\nfrom transformers import BertTokenizer\ntokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)", - "class": "Imports and Environment", - "desc": "This code snippet imports the BertTokenizer from the Transformers library and loads a pre-trained BERT tokenizer with the 'bert-base-uncased' model for text tokenization.", + "cell_id": 29, + "code": "#Check\ntrain_padded[10]", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs a specific padded training sequence to 
verify the results of the padding process.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9928005 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9994702 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 18, - "code": "# HELPER FUNCTION TO CALCULATE ACCURACY\n\nimport numpy as np\n\n# Function to calculate the accuracy of our predictions vs labels\ndef flat_accuracy(preds, labels):\n pred_flat = np.argmax(preds, axis=1).flatten()\n labels_flat = labels.flatten()\n return np.sum(pred_flat == labels_flat) / len(labels_flat)", - "class": "Model Evaluation", - "desc": "This code snippet defines a helper function to calculate the accuracy of predictions by comparing the flattened predicted class indices with the actual labels.", + "cell_id": 30, + "code": "#Check\nprint(train_sentences[10])\nprint(train_sequences[10])\nprint(train_padded[10])", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints an original training sentence, its corresponding sequence of integers, and the padded sequence to verify the entire text preprocessing pipeline.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.984026 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99758375 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 21, - "code": "# THE SUMMARY OF THE TRAIN PROCESS\n\n# Display floats with two decimal places\npd.set_option('precision', 2)\n\n# Create a DataFrame from our training statistics\ndf_stats = pd.DataFrame(data=training_stats)\n\n# Use the 'epoch' as the row index\ndf_stats = df_stats.set_index('epoch')\n\n# A hack to force the column headers to wrap\n#df = df.style.set_table_styles([dict(selector=\"th\",props=[('max-width', '70px')])])\n\n# Display the table\ndf_stats", - "class": "Model Evaluation", - "desc": "This code snippet creates a DataFrame from the training statistics collected during the training process, formats it to display floats with two decimal places, sets the epoch column as the index, and then displays the DataFrame.", + "cell_id": 32, + "code": "#Check\nreverse_word_index", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the `reverse_word_index` dictionary to verify the mapping from indices back to words.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.46057627 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.98285496 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 24, - "code": "# GET PREDICTIONS\n\nprint('Predicting labels for {:,} test sentences...'.format(len(input_ids)))\n\n# Put model in evaluation mode\nmodel.eval()\n\n# Tracking variables \npredictions = []\n#true_labels = []\n\n# Predict \nfor batch in prediction_dataloader:\n # Add batch to GPU\n batch = tuple(t.to(device) for t in batch)\n \n # Unpack the inputs from our dataloader\n b_input_ids, b_input_mask = batch #b_labels\n \n # Telling the model not to compute or store gradients, saving memory and \n # speeding up prediction\n with torch.no_grad():\n # Forward pass, calculate logit predictions\n outputs = model(b_input_ids, token_type_ids=None, \n 
attention_mask=b_input_mask)\n\n logits = outputs[0]\n\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n label_ids = b_labels.to('cpu').numpy()\n \n # Store predictions and true labels\n predictions.append(logits)\n #true_labels.append(label_ids)\n\nprint(' DONE.')", - "class": "Model Evaluation", - "desc": "This code snippet evaluates the model on the test dataset by generating predictions for each batch, storing these predictions, and printing a completion message.", + "cell_id": 34, + "code": "decoded_text = decode(train_sequences[10])\n#Check\nprint(train_sequences[10])\nprint(decoded_text)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet decodes a specific training sequence back into text and prints both the sequence and the decoded text to verify the decoding process.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.75300366 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.42135346 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 14, - "code": "# GET BERT MODEL FOR CLASSIFICATION\n\nfrom transformers import BertForSequenceClassification, AdamW, BertConfig\n\n# Load BertForSequenceClassification, the pretrained BERT model with a single \n# linear classification layer on top. \nmodel = BertForSequenceClassification.from_pretrained(\n \"bert-base-uncased\", # Use the 12-layer BERT model, with an uncased vocab.\n num_labels = 2, # The number of output labels--2 for binary classification.\n # You can increase this for multi-class tasks. \n output_attentions = False, # Whether the model returns attentions weights.\n output_hidden_states = False, # Whether the model returns all hidden-states.\n)\n\n# Tell pytorch to run this model on the GPU.\nmodel.cuda()", - "class": "Model Training", - "desc": "This code snippet loads a pre-trained BERT model for sequence classification with a single linear classification layer from the Transformers library and sets it to run on the GPU.", + "cell_id": 0, + "code": "import numpy as np \nimport pandas as pd \nimport re\nimport string\nimport nltk\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom sklearn.model_selection import train_test_split\nfrom tensorflow.keras.preprocessing.text import Tokenizer\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\nfrom tensorflow import keras\nfrom tensorflow.keras import layers", + "class": "Imports and Environment", + "desc": "This code snippet imports various libraries and modules including NumPy, pandas, regex, string manipulation, NLTK for natural language processing, Scikit-learn for model splitting, and TensorFlow along with Keras for neural network model building.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.567976 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9993357 }, "cluster": 0 }, { - "cell_id": 15, - "code": "# PRINT NAMES AND DIMENSIONS FOR THE MODEL LAYERS\n\n# Get all of the model's parameters as a list of tuples.\nparams = list(model.named_parameters())\n\nprint('The BERT model has {:} different named parameters.\\n'.format(len(params)))\n\nprint('==== Embedding Layer ====\\n')\n\nfor p in params[0:5]:\n print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n\nprint('\\n==== 
First Transformer ====\\n')\n\nfor p in params[5:21]:\n print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n\nprint('\\n==== Output Layer ====\\n')\n\nfor p in params[-4:]:\n print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))", - "class": "Model Training", - "desc": "This code snippet prints the names and dimensions of the parameters for different layers of the BERT model, including the embedding layer, the first transformer layer, and the output layer, to help understand the model's architecture.", + "cell_id": 38, + "code": "predictions = model.predict(test_padded)\npredictions = [1 if p > 0.5 else 0 for p in predictions]", + "class": "Model Evaluation", + "desc": "This code snippet generates predictions from the test data using the trained model, then converts these predictions into binary labels (1 for values greater than 0.5, otherwise 0).", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.24847664 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99410737 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 16, - "code": "# SET UP THE OPTIMIZER\n\n# Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n# I believe the 'W' stands for 'Weight Decay fix\"\noptimizer = AdamW(model.parameters(),\n lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n )", + "cell_id": 35, + "code": "# Create LSTM model\n\n# Embedding: Turns positive integers (indexes) into dense vectors of fixed size.\n\nmodel = keras.models.Sequential()\nmodel.add(layers.Embedding(num_unique_words, 100, input_length=max_length))\n\nmodel.add(layers.LSTM(32, dropout=0.25))\nmodel.add(layers.Dense(1, activation=\"sigmoid\"))\n\nmodel.summary()", "class": "Model Training", - "desc": "This code snippet sets up the AdamW optimizer for training the BERT model, specifying the learning rate and epsilon parameters.", + "desc": "This code snippet defines a Sequential neural network model using Keras, with an Embedding layer for transforming word indices into dense vectors, an LSTM layer with dropout for sequence learning, and a Dense output layer with a sigmoid activation function, then summarizes the model's architecture.", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, - "predicted_subclass_probability": 0.9948212 + "predicted_subclass_probability": 0.984528 }, "cluster": 0 }, { - "cell_id": 17, - "code": "# SET UP MODEL HYPERPARAMETERS\n\nfrom transformers import get_linear_schedule_with_warmup\n\n# Number of training epochs. The BERT authors recommend between 2 and 4. \n# We chose to run for 4, but we'll see later that this may be over-fitting the\n# training data.\nepochs = 2\n\n# Total number of training steps is [number of batches] x [number of epochs]. 
\n# (Note that this is not the same as the number of training samples).\ntotal_steps = len(train_dataloader) * epochs\n\n# Create the learning rate scheduler.\nscheduler = get_linear_schedule_with_warmup(optimizer, \n num_warmup_steps = 0, # Default value in run_glue.py\n num_training_steps = total_steps)", + "cell_id": 36, + "code": "loss = keras.losses.BinaryCrossentropy(from_logits=False)\noptim = keras.optimizers.Adam(learning_rate=0.001)\nmetrics = [\"accuracy\"]\n\nmodel.compile(loss=loss, optimizer=optim, metrics=metrics)", "class": "Model Training", - "desc": "This code snippet sets up the hyperparameters for the BERT model training, including the number of epochs, total training steps, and a learning rate scheduler to adjust the learning rate during training.", + "desc": "This code snippet compiles the previously defined model using binary cross-entropy as the loss function, Adam optimizer with a learning rate of 0.001, and accuracy as a performance metric.", "testing": { "class": "Model_Train", - "subclass": "init_hyperparams", - "subclass_id": 59, - "predicted_subclass_probability": 0.6898819 + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9955751 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 19, - "code": "# HELPER FUNCTION FOR TIME FORMAT\n\nimport time\nimport datetime\n\ndef format_time(elapsed):\n '''\n Takes a time in seconds and returns a string hh:mm:ss\n '''\n # Round to the nearest second.\n elapsed_rounded = int(round((elapsed)))\n \n # Format as hh:mm:ss\n return str(datetime.timedelta(seconds=elapsed_rounded))", + "cell_id": 37, + "code": "model.fit(train_padded, train_labels, epochs=25, validation_data=(val_padded, val_labels), verbose=2)", "class": "Model Training", - "desc": "This code snippet defines a helper function that formats elapsed time in seconds into a human-readable string in the format hh:mm:ss.", + "desc": "This code snippet trains the compiled model on the padded training data and labels for 25 epochs, using the padded validation data and labels for validation, with verbose output set to 2.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.5951723 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9996841 }, "cluster": -1 - }, { - "cell_id": 20, - "code": "# TRAINING SCRIPT\n\nimport random\nimport numpy as np\n\n# This training code is based on the `run_glue.py` script here:\n# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n\n# Set the seed value all over the place to make this reproducible.\nseed_val = 42\n\nrandom.seed(seed_val)\nnp.random.seed(seed_val)\ntorch.manual_seed(seed_val)\ntorch.cuda.manual_seed_all(seed_val)\n\n# We'll store a number of quantities such as training and validation loss, \n# validation accuracy, and timings.\ntraining_stats = []\n\n# Measure the total training time for the whole run.\ntotal_t0 = time.time()\n\n# For each epoch...\nfor epoch_i in range(0, epochs):\n \n # ========================================\n # Training\n # ========================================\n \n # Perform one full pass over the training set.\n\n print(\"\")\n print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n print('Training...')\n\n # Measure how long the training epoch takes.\n t0 = time.time()\n\n # Reset the total loss for this epoch.\n 
total_train_loss = 0\n\n # Put the model into training mode. Don't be mislead--the call to \n # `train` just changes the *mode*, it doesn't *perform* the training.\n # `dropout` and `batchnorm` layers behave differently during training\n # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n model.train()\n\n # For each batch of training data...\n for step, batch in enumerate(train_dataloader):\n\n # Progress update every 40 batches.\n if step % 40 == 0 and not step == 0:\n # Calculate elapsed time in minutes.\n elapsed = format_time(time.time() - t0)\n \n # Report progress.\n print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n\n # Unpack this training batch from our dataloader. \n #\n # As we unpack the batch, we'll also copy each tensor to the GPU using the \n # `to` method.\n #\n # `batch` contains three pytorch tensors:\n # [0]: input ids \n # [1]: attention masks\n # [2]: labels \n b_input_ids = batch[0].to(device)\n b_input_mask = batch[1].to(device)\n b_labels = batch[2].to(device)\n\n # Always clear any previously calculated gradients before performing a\n # backward pass. PyTorch doesn't do this automatically because \n # accumulating the gradients is \"convenient while training RNNs\". \n # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n model.zero_grad() \n\n # Perform a forward pass (evaluate the model on this training batch).\n # The documentation for this `model` function is here: \n # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n # It returns different numbers of parameters depending on what arguments\n # arge given and what flags are set. For our useage here, it returns\n # the loss (because we provided labels) and the \"logits\"--the model\n # outputs prior to activation.\n outputs = model(b_input_ids, \n token_type_ids=None, \n attention_mask=b_input_mask, \n labels=b_labels)\n \n loss = outputs[0]\n logits = outputs[1]\n\n # Accumulate the training loss over all of the batches so that we can\n # calculate the average loss at the end. 
`loss` is a Tensor containing a\n # single value; the `.item()` function just returns the Python value \n # from the tensor.\n total_train_loss += loss.item()\n\n # Perform a backward pass to calculate the gradients.\n loss.backward()\n\n # Clip the norm of the gradients to 1.0.\n # This is to help prevent the \"exploding gradients\" problem.\n torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n\n # Update parameters and take a step using the computed gradient.\n # The optimizer dictates the \"update rule\"--how the parameters are\n # modified based on their gradients, the learning rate, etc.\n optimizer.step()\n\n # Update the learning rate.\n scheduler.step()\n\n # Calculate the average loss over all of the batches.\n avg_train_loss = total_train_loss / len(train_dataloader) \n \n # Measure how long this epoch took.\n training_time = format_time(time.time() - t0)\n\n print(\"\")\n print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n print(\" Training epcoh took: {:}\".format(training_time))\n \n # ========================================\n # Validation\n # ========================================\n # After the completion of each training epoch, measure our performance on\n # our validation set.\n\n print(\"\")\n print(\"Running Validation...\")\n\n t0 = time.time()\n\n # Put the model in evaluation mode--the dropout layers behave differently\n # during evaluation.\n model.eval()\n\n # Tracking variables \n total_eval_accuracy = 0\n total_eval_loss = 0\n nb_eval_steps = 0\n\n # Evaluate data for one epoch\n for batch in validation_dataloader:\n \n # Unpack this training batch from our dataloader. \n #\n # As we unpack the batch, we'll also copy each tensor to the GPU using \n # the `to` method.\n #\n # `batch` contains three pytorch tensors:\n # [0]: input ids \n # [1]: attention masks\n # [2]: labels \n b_input_ids = batch[0].to(device)\n b_input_mask = batch[1].to(device)\n b_labels = batch[2].to(device)\n \n # Tell pytorch not to bother with constructing the compute graph during\n # the forward pass, since this is only needed for backprop (training).\n with torch.no_grad(): \n\n # Forward pass, calculate logit predictions.\n # token_type_ids is the same as the \"segment ids\", which \n # differentiates sentence 1 and 2 in 2-sentence tasks.\n # The documentation for this `model` function is here: \n # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n # Get the \"logits\" output by the model. 
The \"logits\" are the output\n # values prior to applying an activation function like the softmax.\n output = model(b_input_ids, \n token_type_ids=None, \n attention_mask=b_input_mask,\n labels=b_labels)\n \n loss = output[0]\n logits = output[1]\n \n # Accumulate the validation loss.\n total_eval_loss += loss.item()\n\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n label_ids = b_labels.to('cpu').numpy()\n\n # Calculate the accuracy for this batch of test sentences, and\n # accumulate it over all batches.\n total_eval_accuracy += flat_accuracy(logits, label_ids)\n \n\n # Report the final accuracy for this validation run.\n avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n\n # Calculate the average loss over all of the batches.\n avg_val_loss = total_eval_loss / len(validation_dataloader)\n \n # Measure how long the validation run took.\n validation_time = format_time(time.time() - t0)\n \n print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n print(\" Validation took: {:}\".format(validation_time))\n\n # Record all statistics from this epoch.\n training_stats.append(\n {\n 'epoch': epoch_i + 1,\n 'Training Loss': avg_train_loss,\n 'Valid. Loss': avg_val_loss,\n 'Valid. Accur.': avg_val_accuracy,\n 'Training Time': training_time,\n 'Validation Time': validation_time\n }\n )\n\nprint(\"\")\nprint(\"Training complete!\")\n\nprint(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))", - "class": "Model Training", - "desc": "This code snippet provides the training script for the BERT model, including training and validation phases for each epoch, tracking statistics such as training and validation loss, accuracy, and elapsed time, and outputting progress updates and final results.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.7243838 - }, - "cluster": 0 - }, { - "cell_id": 22, - "code": "# PLOT THE VALIDATION LOSS\n\nimport matplotlib.pyplot as plt\n%matplotlib inline\n\nimport seaborn as sns\n\n# Use plot styling from seaborn.\nsns.set(style='darkgrid')\n\n# Increase the plot size and font size.\nsns.set(font_scale=1.5)\nplt.rcParams[\"figure.figsize\"] = (12,6)\n\n# Plot the learning curve.\nplt.plot(df_stats['Training Loss'], 'b-o', label=\"Training\")\nplt.plot(df_stats['Valid. 
Loss'], 'g-o', label=\"Validation\")\n\n# Label the plot.\nplt.title(\"Training & Validation Loss\")\nplt.xlabel(\"Epoch\")\nplt.ylabel(\"Loss\")\nplt.legend()\nplt.xticks([1, 2, 3, 4])\n\nplt.show()", - "class": "Visualization", - "desc": "This code snippet plots the training and validation loss over epochs using Matplotlib and Seaborn to visualize the learning curve.", - "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.9963574 - }, - "cluster": 0 }], - "notebook_id": 3, - "notebook_name": "bert-with-disaster-tweets" + "notebook_id": 4, + "notebook_name": "nlp-tokenization-embedding-lstm.ipynb" }, { "cells": [{ - "cell_id": 14, - "code": "pd.DataFrame({\n 'id':test.id,\n 'target':pred\n}).to_csv('submission.csv',index=False)", + "cell_id": 37, + "code": "preds = np.squeeze(model.predict(test_multi_input_dataset.batch(32)))\npreds = (preds >= 0.5).astype(int)\npd.DataFrame({\"id\": test_df.id, \"target\": preds}).to_csv(\"submission.csv\", index=False)", "class": "Data Export", - "desc": "This code snippet creates a DataFrame with `id` from the `test` DataFrame and predictions `pred`, then exports it as a CSV file named 'submission.csv' without the index.", + "desc": "This code uses the trained model to predict binary targets for the test dataset, converts the predictions to integer format (0 or 1), and exports the results as a CSV file with the columns 'id' and 'target' named \"submission.csv\" using pandas.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9992506 + "predicted_subclass_probability": 0.9992853 }, "cluster": -1 }, { - "cell_id": 1, - "code": "train=pd.read_csv('../input/nlp-getting-started/train.csv')\ntest=pd.read_csv('../input/nlp-getting-started/test.csv')", + "cell_id": 4, + "code": "train_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")", "class": "Data Extraction", - "desc": "This code snippet reads the training and test datasets from CSV files into Pandas DataFrames named `train` and `test`, respectively.", + "desc": "This code reads training and test datasets from specified CSV files into pandas DataFrames named `train_df` and `test_df`.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.99974996 + "predicted_subclass_probability": 0.99975425 }, - "cluster": 3 + "cluster": 0 }, { - "cell_id": 8, - "code": "stop_words=nltk.corpus.stopwords.words('english')\ni=0\n#sc=SpellChecker()\n#data=pd.concat([train,test])\nwnl=WordNetLemmatizer()\nstemmer=SnowballStemmer('english')\nfor doc in train.text:\n doc=re.sub(r'https?://\\S+|www\\.\\S+','',doc)\n doc=re.sub(r'<.*?>','',doc)\n doc=re.sub(r'[^a-zA-Z\\s]','',doc,re.I|re.A)\n #doc=' '.join([stemmer.stem(i) for i in doc.lower().split()])\n doc=' '.join([wnl.lemmatize(i) for i in doc.lower().split()])\n #doc=' '.join([sc.correction(i) for i in doc.split()])\n doc=contractions.fix(doc)\n tokens=nltk.word_tokenize(doc)\n filtered=[token for token in tokens if token not in stop_words]\n doc=' '.join(filtered)\n train.text[i]=doc\n i+=1\ni=0\nfor doc in test.text:\n doc=re.sub(r'https?://\\S+|www\\.\\S+','',doc)\n doc=re.sub(r'<.*?>','',doc)\n doc=re.sub(r'[^a-zA-Z\\s]','',doc,re.I|re.A)\n #doc=' '.join([stemmer.stem(i) for i in doc.lower().split()])\n doc=' '.join([wnl.lemmatize(i) for i in 
doc.lower().split()])\n #doc=' '.join([sc.correction(i) for i in doc.split()])\n doc=contractions.fix(doc)\n tokens=nltk.word_tokenize(doc)\n filtered=[token for token in tokens if token not in stop_words]\n doc=' '.join(filtered)\n test.text[i]=doc\n i+=1", - "class": "Data Transform", - "desc": "This code snippet preprocesses the text data in both the `train` and `test` DataFrames by cleaning the text, removing stop words, fixing contractions, and lemmatizing the words.", + "cell_id": 16, + "code": "x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(\n train_df[[\"text\", \"keyword\"]], train_df[\"target\"], test_size=0.3, random_state=42, stratify=train_df[\"target\"]\n)", + "class": "Data Extraction", + "desc": "This code splits the training DataFrame into training and validation sets for both features ('text' and 'keyword') and target using a 70-30 split, with stratification on the target variable, using `sklearn.model_selection.train_test_split`.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.54864067 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9980861 }, "cluster": 1 }, { - "cell_id": 11, - "code": "from sklearn.feature_extraction.text import CountVectorizer\ncv=CountVectorizer(ngram_range=(1,1)) \n\n# ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, \n# and (2, 2) means only bigrams.\n\ncv_matrix=cv.fit_transform(train.text).toarray()\ntrain_df=pd.DataFrame(cv_matrix,columns=cv.get_feature_names())\ntest_df=pd.DataFrame(cv.transform(test.text).toarray(),columns=cv.get_feature_names())\ntrain_df.head()", - "class": "Data Transform", - "desc": "This code snippet uses CountVectorizer to convert the preprocessed text data in the `train` and `test` DataFrames into a matrix of token counts, creating new DataFrames `train_df` and `test_df` for further analysis.", + "cell_id": 21, + "code": "feature_extractor = get_pretrained_bert_model()\n\n# Run a forward pass on the tokenized inputs\n# model_outputs = feature_extractor(\n# train_tweets_encoded[\"input_ids\"], train_tweets_encoded[\"attention_mask\"]\n# )\nmodel_outputs = feature_extractor.predict(\n train_dataset.batch(32)\n)\n# BERT's sentence representation can be retrieved from a hidden vector at index 0 in the sequence, \n# (where the special token CLS was prepended by the tokenizer)\ntrain_sentence_vectors = model_outputs.last_hidden_state[:, 0, :]\n\n# The rest of the sequence contains the embeddings \n# (modified by successive layers of self-attention) for each token\ntrain_word_vectors = model_outputs.last_hidden_state[:, 1:, :]\n\n# And the same again for the validation set\n# model_outputs = feature_extractor(\n# validation_tweets_encoded[\"input_ids\"], validation_tweets_encoded[\"attention_mask\"]\n# )\nmodel_outputs = feature_extractor.predict(\n val_dataset.batch(32)\n)\nvalidation_sentence_vectors = model_outputs.last_hidden_state[:, 0, :]\nvalidation_word_vectors = model_outputs.last_hidden_state[:, 1:, :]", + "class": "Data Extraction", + "desc": "This code uses a pretrained DistilBERT model to extract sentence and word embeddings from the tokenized training and validation tweet datasets, performing a forward pass through the model to obtain these representations using TensorFlow.", "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.17395471 + "class": 
"Model_Train", + "subclass": "find_best_model_class", + "subclass_id": 3, + "predicted_subclass_probability": 0.2968278 }, "cluster": 1 }, { - "cell_id": 12, - "code": "from sklearn.feature_extraction.text import TfidfVectorizer\ntfidf=TfidfVectorizer(ngram_range=(1,1),use_idf=True)\nmat=tfidf.fit_transform(train.text).toarray()\ntrain_df=pd.DataFrame(mat,columns=tfidf.get_feature_names())\ntest_df=pd.DataFrame(tfidf.transform(test.text).toarray(),columns=tfidf.get_feature_names())\ntrain_df.head()", + "cell_id": 10, + "code": "# We'll use these weights later on to make up for the slightly imbalanced dataset\nclasses = np.unique(train_df[\"target\"])\nclass_weights = sklearn.utils.class_weight.compute_class_weight(\n \"balanced\", classes=classes, y=train_df[\"target\"]\n)\n\nclass_weights = {clazz : weight for clazz, weight in zip(classes, class_weights)}", "class": "Data Transform", - "desc": "This code snippet uses TfidfVectorizer to convert the preprocessed text data in the `train` and `test` DataFrames into a TF-IDF matrix, creating new DataFrames `train_df` and `test_df` for further analysis.", + "desc": "This code computes and stores class weights to handle the slightly imbalanced dataset by using sklearn's `compute_class_weight` function and numpy to balance the weights for each unique class in the training DataFrame's target column.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.9525827 + "predicted_subclass_probability": 0.3043228 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 2, - "code": "train.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `train` DataFrame to give an initial overview of the dataset's structure and contents.", + "cell_id": 11, + "code": "# Commented out the graceful handling of duplicated because the Kaggle kernel version of statistics.mode()\n# won't handle multimodal results\n\n# Duplicates aren't consistently labeled, so we keep one example of the most frequently occuring label\n# train_df[\"duplicated\"] = train_df.duplicated(subset=\"text\")\n# duplicated_tweets = train_df.loc[lambda df: df[\"duplicated\"] == True, :]\n# aggregated_duplicates = duplicated_tweets.groupby(\"text\", as_index=False).aggregate(\n# statistics.mode\n# )\n\n# train_df.drop_duplicates(subset=\"text\", inplace=True, keep=False)\n# train_df = train_df.append(aggregated_duplicates, ignore_index=True)\n\ntrain_df.drop_duplicates(subset=\"text\", inplace=True, keep=False)\nprint(\"train rows:\", len(train_df.index))\nprint(\"test rows:\", len(test_df.index))", + "class": "Data Transform", + "desc": "This code removes duplicated rows in the training DataFrame based on the 'text' column and prints the number of rows in both the training and test DataFrames after deduplication using pandas.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997507 + "subclass": "count_duplicates", + "subclass_id": 38, + "predicted_subclass_probability": 0.7884562 }, - "cluster": 12 + "cluster": 2 }, { - "cell_id": 3, - "code": "test.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `test` DataFrame to provide an initial overview of the test dataset's structure and contents.", + "cell_id": 12, + "code": "class TweetPreProcessor:\n \"\"\"\n This class does some cleaning and normalization prior to BPE 
tokenization\n \"\"\"\n\n def __init__(self):\n\n self.text_processor = TextPreProcessor(\n # terms that will be normalized\n normalize=[\n \"url\",\n \"email\",\n \"phone\",\n \"user\",\n \"time\",\n \"date\",\n ],\n # terms that will be annotated\n annotate={\"repeated\", \"elongated\"},\n # corpus from which the word statistics are going to be used\n # for word segmentation\n segmenter=\"twitter\",\n # corpus from which the word statistics are going to be used\n # for spell correction\n spell_correction=True,\n corrector=\"twitter\",\n unpack_hashtags=False, # perform word segmentation on hashtags\n unpack_contractions=False, # Unpack contractions (can't -> can not)\n spell_correct_elong=True, # spell correction for elongated words\n fix_bad_unicode=True,\n tokenizer=Tokenizer(lowercase=True).tokenize,\n # list of dictionaries, for replacing tokens extracted from the text,\n # with other expressions. You can pass more than one dictionaries.\n dicts=[emoticons, slangdict],\n )\n\n def preprocess_tweet(self, tweet):\n return \" \".join(self.text_processor.pre_process_doc(tweet))\n \n # this will return the tokenized text \n def __call__(self, tweet):\n return self.text_processor.pre_process_doc(tweet)\n \ntweet_preprocessor = TweetPreProcessor()", + "class": "Data Transform", + "desc": "This code defines a `TweetPreProcessor` class that performs text cleaning and normalization using the ekphrasis library, and initializes an instance of this class named `tweet_preprocessor` for further text preprocessing.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997483 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.95362186 }, - "cluster": 12 + "cluster": 0 }, { - "cell_id": 4, - "code": "print(train.shape)\nprint(test.shape)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the dimensions (number of rows and columns) of both the `train` and `test` DataFrames to provide insight into the size of the datasets.", + "cell_id": 14, + "code": "train_df[\"text\"] = train_df[\"text\"].apply(tweet_preprocessor.preprocess_tweet)\ntest_df[\"text\"] = test_df[\"text\"].apply(tweet_preprocessor.preprocess_tweet)", + "class": "Data Transform", + "desc": "This code applies the `preprocess_tweet` method from the `TweetPreProcessor` class to preprocess the text in both the training and test DataFrames using pandas.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99933213 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.72112745 }, - "cluster": -1 + "cluster": 3 + }, { + "cell_id": 15, + "code": "# Fill NA\ntrain_df[\"keyword\"].fillna(\"\", inplace=True)\ntest_df[\"keyword\"].fillna(\"\", inplace=True)\n\n# remove %20 from keywords\ntrain_df[\"keyword\"] = train_df[\"keyword\"].apply(urllib.parse.unquote)\ntest_df[\"keyword\"] = test_df[\"keyword\"].apply(urllib.parse.unquote)", + "class": "Data Transform", + "desc": "This code fills missing values in the 'keyword' column with empty strings and decodes URL-encoded characters in the 'keyword' column for both the training and test DataFrames using the `urllib.parse.unquote` method and pandas.", + "testing": { + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + 
"predicted_subclass_probability": 0.98655176 + }, + "cluster": 3 + }, { + "cell_id": 17, + "code": "def tokenize_encode(tweets, max_length=None):\n return pretrained_bert_tokenizer(\n tweets,\n add_special_tokens=True,\n truncation=True,\n padding=\"max_length\",\n max_length=max_length,\n return_tensors=\"tf\",\n )\n\n\n# need to be explicit about the lengths (instead of just specifying padding=True in the tokenizer)\n# otherwise train tweets end up being 71 and validation tweets end up as 70, which causes problems/warnings\nmax_length_tweet = 72\nmax_length_keyword = 8\n\ntrain_tweets_encoded = tokenize_encode(x_train[\"text\"].to_list(), max_length_tweet) \nvalidation_tweets_encoded = tokenize_encode(x_val[\"text\"].to_list(), max_length_tweet) \n\ntrain_keywords_encoded = tokenize_encode(x_train[\"keyword\"].to_list(), max_length_keyword) \nvalidation_keywords_encoded = tokenize_encode(x_val[\"keyword\"].to_list(), max_length_keyword) \n\ntrain_inputs_encoded = dict(train_tweets_encoded)\ntrain_inputs_encoded[\"keywords\"] = train_keywords_encoded[\"input_ids\"]\n\nvalidation_inputs_encoded = dict(validation_tweets_encoded)\nvalidation_inputs_encoded[\"keywords\"] = validation_keywords_encoded[\"input_ids\"]\n", + "class": "Data Transform", + "desc": "This code tokenizes and encodes the 'text' and 'keyword' columns of the training and validation sets using the pretrained DistilBERT tokenizer with specified maximum lengths, and combines the encoded inputs into dictionaries for training and validation datasets.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.8425683 + }, + "cluster": 5 + }, { + "cell_id": 18, + "code": "train_dataset = tf.data.Dataset.from_tensor_slices(\n (dict(train_tweets_encoded), y_train)\n)\n\nval_dataset = tf.data.Dataset.from_tensor_slices(\n (dict(validation_tweets_encoded), y_val)\n)\n\ntrain_multi_input_dataset = tf.data.Dataset.from_tensor_slices(\n (train_inputs_encoded, y_train)\n)\n\nval_multi_input_dataset = tf.data.Dataset.from_tensor_slices(\n (validation_inputs_encoded, y_val)\n)\n", + "class": "Data Transform", + "desc": "This code creates TensorFlow datasets from the encoded training and validation sets, including both tweet-only and multi-input datasets, using TensorFlow's `tf.data.Dataset.from_tensor_slices` method.", + "testing": { + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.5969537 + }, + "cluster": 5 + }, { + "cell_id": 19, + "code": "tfidf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n tokenizer=tweet_preprocessor, min_df=1, ngram_range=(1, 1), norm=\"l2\"\n)\n\ntrain_vectors = tfidf_vectorizer.fit_transform(raw_documents=x_train[\"text\"]).toarray()\nvalidation_vectors = tfidf_vectorizer.transform(x_val[\"text\"]).toarray()", + "class": "Data Transform", + "desc": "This code vectorizes the preprocessed text from the training and validation datasets using the TF-IDF method with sklearn's `TfidfVectorizer`, utilizing the `TweetPreProcessor` as the tokenizer, and converts the result into arrays.", + "testing": { + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9926306 + }, + "cluster": 5 + }, { + "cell_id": 35, + "code": "test_tweets_encoded = tokenize_encode(test_df[\"text\"].to_list(), max_length_tweet)\ntest_inputs_encoded = dict(test_tweets_encoded)\ntest_dataset = 
tf.data.Dataset.from_tensor_slices(test_inputs_encoded)\n\ntest_keywords_encoded = tokenize_encode(test_df[\"keyword\"].to_list(), max_length_keyword)\ntest_inputs_encoded[\"keywords\"] = test_keywords_encoded[\"input_ids\"]\ntest_multi_input_dataset = tf.data.Dataset.from_tensor_slices(test_inputs_encoded)", + "class": "Data Transform", + "desc": "This code tokenizes and encodes the 'text' and 'keyword' columns of the test DataFrame using the pretrained DistilBERT tokenizer with specified maximum lengths, and creates TensorFlow datasets for both single and multi-input models using the encoded inputs.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.78393567 + }, + "cluster": 5 }, { "cell_id": 5, - "code": "print(train.info())\nprint(test.info())", + "code": "print(train_df.info())\n\nprint(\"\")\nprint(\"train rows:\", len(train_df.index))\nprint(\"test rows:\", len(test_df.index))", "class": "Exploratory Data Analysis", - "desc": "This code snippet prints a concise summary of the `train` and `test` DataFrames, including the number of non-null entries, datatype of each column, and memory usage, to help understand the structure and quality of the datasets.", + "desc": "This code prints out the summary information of the training DataFrame and the number of rows in both the training and testing DataFrames using pandas.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table_attributes", "subclass_id": 40, - "predicted_subclass_probability": 0.9994165 + "predicted_subclass_probability": 0.9827071 }, - "cluster": 7 + "cluster": 3 }, { "cell_id": 6, - "code": "train.target.value_counts()", + "code": "print(\"label counts:\")\ntrain_df.target.value_counts()", "class": "Exploratory Data Analysis", - "desc": "This code snippet counts the occurrences of each unique value in the `target` column of the `train` DataFrame to analyze the distribution of the target variable.", + "desc": "This code prints the counts of each unique value in the 'target' column of the training DataFrame using pandas.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "count_values", "subclass_id": 72, - "predicted_subclass_probability": 0.99950993 + "predicted_subclass_probability": 0.99948514 }, - "cluster": 9 + "cluster": 4 }, { - "cell_id": 9, - "code": "train.head()", + "cell_id": 7, + "code": "print(\"train precentage of nulls:\")\nprint(round(train_df.isnull().sum() / train_df.count() * 100, 2))", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `train` DataFrame to examine the preprocessed text data and verify the transformations.", + "desc": "This code calculates and prints the percentage of null values for each column in the training DataFrame using pandas.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997507 + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.9960549 }, - "cluster": 12 + "cluster": 3 }, { - "cell_id": 10, - "code": "test.head()", + "cell_id": 8, + "code": "print(\"test precentage of nulls:\")\nprint(round(test_df.isnull().sum() / test_df.count() * 100, 2))", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `test` DataFrame to examine the preprocessed text data and verify the transformations.", + "desc": "This code calculates and prints 
the percentage of null values for each column in the test DataFrame using pandas.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997483 + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.99783856 + }, + "cluster": 3 + }, { + "cell_id": 9, + "code": "# check that we don't have any keywords appearing in one set and not the other\ntrain_keywords = set(train_df[\"keyword\"].dropna())\ntest_keywords = set(test_df[\"keyword\"].dropna())\n\nall_keywords = train_keywords.union(test_keywords)\nunique_test_keywords = all_keywords - train_keywords\nunique_train_keywords = all_keywords - test_keywords\n\nprint(f\"unique_test_keywords: {unique_test_keywords}\")\nprint(f\"unique_train_keywords: {unique_train_keywords}\")", + "class": "Exploratory Data Analysis", + "desc": "This code identifies unique keywords appearing exclusively in either the training or test DataFrame by comparing keyword sets and prints the unique keywords for both sets using pandas.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_unique_values", + "subclass_id": 57, + "predicted_subclass_probability": 0.966651 + }, + "cluster": 5 + }, { + "cell_id": 13, + "code": "# Have a look at how the TweetProcessor is doing\nfor tweet in train_df[100:120][\"text\"]:\n print(\"original: \", tweet)\n print(\"processed: \", tweet_preprocessor.preprocess_tweet(tweet))\n print(\"\")", + "class": "Exploratory Data Analysis", + "desc": "This code prints out original and processed versions of tweets from a specific range (100 to 120) in the training DataFrame to demonstrate how the `TweetPreProcessor` class is handling text preprocessing.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.7224992 }, - "cluster": 12 + "cluster": 3 }, { "cell_id": 0, - "code": "import pandas as pd\nimport numpy as np\nfrom sklearn.metrics import f1_score", + "code": "!pip install -q transformers ekphrasis keras-tuner", "class": "Imports and Environment", - "desc": "This code snippet imports necessary libraries, specifically Pandas for data manipulation, NumPy for numerical operations, and f1_score from Scikit-learn for evaluating the model's performance.", + "desc": "This code installs the 'transformers', 'ekphrasis', and 'keras-tuner' libraries quietly using pip.", "testing": { "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99931085 + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.99613416 }, "cluster": 0 }, { - "cell_id": 7, - "code": "import nltk\nnltk.download('punkt')\nnltk.download('stopwords')\nimport re\n!pip install contractions\nimport contractions\nfrom nltk.stem import SnowballStemmer\nfrom nltk.stem import WordNetLemmatizer\nnltk.download('wordnet')\n!pip install pyspellchecker\nfrom spellchecker import SpellChecker", + "cell_id": 1, + "code": "import numpy as np\nimport pandas as pd\nimport urllib\nimport statistics\nimport math\nimport pprint\nimport sklearn\nfrom sklearn.linear_model import LogisticRegression\nimport tensorflow as tf\nimport tensorflow.keras as keras\nfrom tensorflow.keras.layers import (\n Input,\n Dense,\n Embedding,\n Flatten,\n Dropout,\n GlobalMaxPooling1D,\n GRU,\n concatenate,\n)\nfrom tensorflow.keras.callbacks import EarlyStopping\nfrom transformers import (\n 
DistilBertTokenizerFast,\n TFDistilBertModel,\n DistilBertConfig,\n)\n\nfrom ekphrasis.classes.preprocessor import TextPreProcessor\nfrom ekphrasis.classes.tokenizer import Tokenizer\nfrom ekphrasis.dicts.emoticons import emoticons\nfrom ekphrasis.dicts.noslang.slangdict import slangdict\n\nimport kerastuner", "class": "Imports and Environment", - "desc": "This code snippet installs and imports various NLP-related libraries and resources, including NLTK modules, a library for handling contractions, a stemming and lemmatization library, and a spell checker, preparing the environment for text preprocessing tasks.", + "desc": "This code imports various libraries and modules, including numpy, pandas, sklearn, tensorflow, transformers, and ekphrasis for data processing, machine learning, and model training.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.799391 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9993593 }, "cluster": 0 }, { - "cell_id": 13, - "code": "from sklearn.linear_model import LogisticRegression\nmodel=LogisticRegression()\nmodel.fit(train_df,train.target)\nprint(f1_score(model.predict(train_df),train.target))\npred=model.predict(test_df)", - "class": "Model Training", - "desc": "This code snippet initiates a Logistic Regression model, trains it on the `train_df` DataFrame and the `train` DataFrame's `target` variable, evaluates the model using the F1 score on the training data, and generates predictions for the `test_df` DataFrame.", + "cell_id": 2, + "code": "def print_metrics(model, x_train, y_train, x_val, y_val):\n train_acc = dict(model.evaluate(x_train, y_train, verbose=0, return_dict=True))[\n \"accuracy\"\n ]\n val_acc = dict(model.evaluate(x_val, y_val, verbose=0, return_dict=True))[\n \"accuracy\"\n ]\n\n val_preds = model.predict(x_val)\n val_preds_bool = val_preds >= 0.5\n\n print(\"\")\n print(f\"Training Accuracy: {train_acc:.2%}\")\n print(f\"Validation Accuracy: {val_acc:.2%}\")\n print(\"\")\n print(f\"Validation f1 score: {sklearn.metrics.f1_score(val_preds_bool, y_val):.2%}\")", + "class": "Model Evaluation", + "desc": "This function evaluates a given model's accuracy on training and validation datasets, prints these metrics, and also calculates and prints the F1 score of the validation predictions using sklearn.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.5177371 + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.5876385 }, - "cluster": 0 - }], - "notebook_id": 4, - "notebook_name": "bow-tf-idf-models-with-basic-lr-0-80-score" - }, { - "cells": [{ - "cell_id": 26, - "code": "# Copy the results to a pandas dataframe with an \"id\" column and a \"target\" column\nfinal_submission = pd.DataFrame( data={\"id\":test_data[\"id\"], \"target\":y_test_predictions})\n# Save the submission file\nfinal_submission.to_csv(\"submissionTweets.csv\", index=False)", - "class": "Data Export", - "desc": "This code snippet creates a DataFrame containing the test data IDs and their corresponding predicted target labels and then saves this DataFrame to a CSV file named \"submissionTweets.csv\".", + "cluster": 1 + }, { + "cell_id": 29, + "code": "# tuner.results_summary()", + "class": "Model Evaluation", + "desc": "This code, when uncommented, would 
display a summary of the tuning results from the Keras Tuner's hyperparameter search, providing details on the performance and configuration of the best models.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.99925834 + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9971831 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 2, - "code": "train_data = pd.read_csv('../input/nlp-getting-started/train.csv')\nprint(train_data.shape)\ntrain_data.head(3)", - "class": "Data Extraction", - "desc": "This code snippet reads the training dataset from a CSV file and displays the shape of the dataset along with the first three rows.", + "cell_id": 30, + "code": "best_model = tuner.get_best_models()[0]\n# best_model.summary()\nprint(\"\")\nbest_arch_hp = tuner.get_best_hyperparameters()[0]\npprint.pprint(best_arch_hp.values, indent=4)\nprint(\"\")\n\nprint_metrics(best_model, train_inputs, y_train, validation_inputs, y_val)", + "class": "Model Evaluation", + "desc": "This code retrieves the best model and its hyperparameters from the Keras Tuner search, optionally prints the model summary, and prints out the best hyperparameter values and evaluation metrics for the model on both the training and validation datasets.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9993906 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.43187767 }, - "cluster": 1 + "cluster": 0 }, { "cell_id": 3, - "code": "# load test dataset\ntest_data = pd.read_csv('../input/nlp-getting-started/test.csv')\nprint(test_data.shape)\ntest_data.head(3)", - "class": "Data Extraction", - "desc": "This code snippet reads the test dataset from a CSV file and displays the shape of the dataset along with the first three rows.", + "code": "# Using DistilBERT:\nmodel_class, tokenizer_class, pretrained_weights = (TFDistilBertModel, DistilBertTokenizerFast, 'distilbert-base-uncased')\n\npretrained_bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)\n\ndef get_pretrained_bert_model(config=pretrained_weights):\n if not config:\n config = DistilBertConfig(num_labels=2)\n\n return model_class.from_pretrained(pretrained_weights, config=config)\n\n", + "class": "Model Training", + "desc": "This code sets up the DistilBERT model and tokenizer with pretrained weights and defines a function to return a pretrained DistilBERT model instance using either default or specified configuration.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99944836 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9911644 }, "cluster": 1 }, { - "cell_id": 11, - "code": "X = train_data[\"text\"]\ny = train_data[\"target\"]", - "class": "Data Extraction", - "desc": "This code snippet extracts the text data and target labels from the training dataset into separate variables, `X` and `y` respectively.", + "cell_id": 20, + "code": "# I obtained the value of C by experimenting with LogisticRegressionCV but I'm leaving it out for brevity\nlogisticRegressionClf = LogisticRegression(n_jobs=-1, C=2.78)\nlogisticRegressionClf.fit(train_vectors, y_train)\n\ndef print_metrics_sk(clf, x_train, y_train, x_val, y_val):\n 
print(f\"Train Accuracy: {clf.score(x_train, y_train):.2%}\")\n print(f\"Validation Accuracy: {clf.score(x_val, y_val):.2%}\")\n print(\"\")\n print(f\"f1 score: {sklearn.metrics.f1_score(y_val, clf.predict(x_val)):.2%}\")\n\nprint_metrics_sk(logisticRegressionClf, train_vectors, y_train, validation_vectors, y_val)", + "class": "Model Training", + "desc": "This code trains a logistic regression classifier with a specified regularization parameter `C` on the TF-IDF vectors of the training dataset, and then evaluates its performance using the `print_metrics_sk` function to print training and validation accuracy, and F1 score using sklearn.", "testing": { - "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.99927586 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9330293 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 12, - "code": "# Split the training dataset for training and test\nX_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, \n random_state=1)", - "class": "Data Extraction", - "desc": "This code snippet splits the training dataset into training and validation sets, with 10% of the data reserved for validation.", + "cell_id": 22, + "code": "logisticRegressionClf = LogisticRegression(n_jobs=-1, class_weight=class_weights)\nlogisticRegressionClf.fit(train_sentence_vectors, y_train)\n\nprint_metrics_sk(\n logisticRegressionClf,\n train_sentence_vectors,\n y_train,\n validation_sentence_vectors,\n y_val,\n)", + "class": "Model Training", + "desc": "This code trains a logistic regression classifier with computed class weights on the BERT-derived sentence embeddings from the training dataset, and evaluates its performance by printing training and validation accuracy as well as the F1 score using the `print_metrics_sk` function and sklearn.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.99780315 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.99934644 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 14, - "code": "train_dataset = list(zip(X_train, y_train))\nval_dataset = list(zip(X_val, y_val))", - "class": "Data Extraction", - "desc": "This code snippet combines the text data and labels from the training and validation sets into tuples, storing them as lists called `train_dataset` and `val_dataset` respectively.", + "cell_id": 23, + "code": "def create_gru_model() -> keras.Model:\n\n model = keras.Sequential()\n model.add(keras.layers.InputLayer(input_shape=train_word_vectors.shape[1:]))\n model.add(GRU(32, return_sequences=True))\n model.add(GlobalMaxPooling1D())\n model.add(Dense(1, activation=\"sigmoid\"))\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\nmodel = create_gru_model()\n\nhistory = model.fit(\n train_word_vectors,\n y_train,\n validation_data=(validation_word_vectors, y_val),\n class_weight=class_weights,\n epochs=20,\n verbose=0,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(model, train_word_vectors, y_train, validation_word_vectors, y_val)", + "class": "Model Training", + "desc": "This code defines a sequential GRU-based neural network model with a 
global max-pooling and a dense output layer, compiles it with an Adam optimizer and binary cross-entropy loss, and trains it on BERT-derived word embeddings from the training dataset, applying class weights and early stopping based on validation accuracy using TensorFlow and Keras.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.7672821 - }, - "cluster": -1 - }, { - "cell_id": 8, - "code": "def prepare_sequence(text):\n \"\"\"\n Tokenize and prepare a sequence for the model. It tokenizes the text sequence\n adding special tokens ([CLS], [SEP]), padding to the max length and truncate \n reviews longer than the max length.\n Return the token IDs, the segment IDs and the mask IDs.\n \"\"\"\n\n prepared_sequence = tokenizer.encode_plus(\n text, \n add_special_tokens = True, \n max_length = MAX_LENGHT, \n padding = 'max_length',\n return_attention_mask = True\n )\n return prepared_sequence", - "class": "Data Transform", - "desc": "This code snippet defines a function to tokenize and prepare a text sequence for the BERT model by adding special tokens, padding or truncating to a maximum length, and returning token IDs, segment IDs, and mask IDs.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9985875 - }, - "cluster": 1 - }, { - "cell_id": 9, - "code": "# Prepare a test sentence\ntest_sentence = 'Is this jacksonville?'\ntest_sentence_encoded = prepare_sequence(test_sentence)\ntoken_ids = test_sentence_encoded[\"input_ids\"]\nprint(f'Test sentence: {test_sentence}')\nprint(f'Keys: {test_sentence_encoded.keys()}')\nprint(f'Tokens: {tokenizer.convert_ids_to_tokens(token_ids)[:12]}')\nprint(f'Token IDs: {token_ids[:12]}')\nprint(f'Segment IDs: {test_sentence_encoded[\"token_type_ids\"][:12]}')\nprint(f'Mask IDs {test_sentence_encoded[\"attention_mask\"][:12]}')\nprint(f'Input dimension: {len(token_ids)}')", - "class": "Data Transform", - "desc": "This code snippet prepares a test sentence for the BERT model by tokenizing it using the previously defined function, and then prints the encoded output including token IDs, segment IDs, mask IDs, and the input dimension.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.42718402 - }, - "cluster": 1 - }, { - "cell_id": 10, - "code": "def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):\n \"\"\"\n Map to the expected input to TFBertForSequenceClassification.\n \"\"\"\n mapped_example = {\n \"input_ids\": input_ids,\n \"token_type_ids\": token_type_ids,\n \"attention_mask\": attention_masks,\n }\n return mapped_example, label \n\ndef encode_examples(texts_and_labels):\n \"\"\"\n Prepare all sequences of text and build TF dataset.\n \"\"\"\n\n input_ids_list = []\n token_type_ids_list = []\n attention_mask_list = []\n label_list = []\n \n for text, label in texts_and_labels:\n\n bert_input = prepare_sequence(text)\n\n input_ids_list.append(bert_input['input_ids'])\n token_type_ids_list.append(bert_input['token_type_ids'])\n attention_mask_list.append(bert_input['attention_mask'])\n label_list.append([label])\n\n # Create TF dataset\n dataset = tf.data.Dataset.from_tensor_slices(\n (input_ids_list, attention_mask_list, token_type_ids_list,\n label_list)\n )\n # Map to the expected input to TFBertForSequenceClassification\n dataset_mapped = dataset.map(map_example_to_dict)\n 
return dataset_mapped", - "class": "Data Transform", - "desc": "This code snippet defines functions to map inputs to the expected format for `TFBertForSequenceClassification` and to encode multiple text sequences to build a TensorFlow dataset ready for model training.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9974004 - }, - "cluster": 1 - }, { - "cell_id": 15, - "code": "# Prepare sequences of text and build TF train dataset\nds_train_encoded = encode_examples(train_dataset).shuffle(10000).batch(BATCH_SIZE)\n\n# Prepare sequences of text and build TF validation dataset\nds_val_encoded = encode_examples(val_dataset).batch(BATCH_SIZE)", - "class": "Data Transform", - "desc": "This code snippet encodes the training and validation datasets into sequences suitable for the BERT model, shuffles and batches the training data, and batches the validation data, preparing them for model training.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.8710919 - }, - "cluster": 1 - }, { - "cell_id": 23, - "code": "def encode_test_examples(texts):\n \"\"\"\n Prepare all sequences of text and build TF dataset.\n \"\"\"\n\n input_ids_list = []\n token_type_ids_list = []\n attention_mask_list = []\n \n for text in texts:\n\n bert_input = prepare_sequence(text)\n\n input_ids_list.append(bert_input['input_ids'])\n token_type_ids_list.append(bert_input['token_type_ids'])\n attention_mask_list.append(bert_input['attention_mask'])\n\n # Create TF dataset\n dataset = tf.data.Dataset.from_tensor_slices(\n (input_ids_list, attention_mask_list, token_type_ids_list)\n )\n # Map to the expected input to TFBertForSequenceClassification\n dataset_mapped = dataset.map(map_test_example_to_dict)\n return dataset_mapped\n\ndef map_test_example_to_dict(input_ids, attention_masks, token_type_ids):\n \"\"\"\n Map to the expected input to TFBertForSequenceClassification.\n \"\"\"\n mapped_example = {\n \"input_ids\": input_ids,\n \"token_type_ids\": token_type_ids,\n \"attention_mask\": attention_masks,\n }\n return mapped_example", - "class": "Data Transform", - "desc": "This code snippet defines functions to encode text sequences in the test dataset, preparing them as a TensorFlow dataset and mapping them to the expected input format for `TFBertForSequenceClassification`.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9800521 + "class": "Model_Train", + "subclass": "train_on_grid", + "subclass_id": 6, + "predicted_subclass_probability": 0.509667 }, - "cluster": 1 + "cluster": 2 }, { "cell_id": 24, - "code": "X_test = test_data[\"text\"]\ntest_dataset = list(X_test)\nds_test_encoded = encode_test_examples(test_dataset).batch(BATCH_SIZE)", - "class": "Data Transform", - "desc": "This code snippet extracts the text data from the test dataset, encodes it using the previously defined function, and batches the encoded test dataset for model prediction.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9627081 - }, - "cluster": 1 - }, { - "cell_id": 4, - "code": "for tweet_index in range(1,30,5):\n print(f'Text of the tweet: {train_data[\"text\"][tweet_index]}')\n print(f'Target: {\"Real disaster\" if train_data[\"target\"][tweet_index]==1 else \"Not real disaster\"}\\n')", - "class": "Exploratory 
Data Analysis", - "desc": "This code snippet iterates through specific rows of the training data, printing out the text of the tweets along with their corresponding target label, which indicates whether the tweet is about a real disaster or not.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.29876474 - }, - "cluster": -1 - }, { - "cell_id": 13, - "code": "n_training_examples = X_train.shape[0]\nn_positive_training_examples = y_train.value_counts()[1]\nn_negative_training_examples = y_train.value_counts()[0]\nprint(f'Number examples in training dataset: {n_training_examples}')\nprint(f'Number of positive examples in training dataset: {n_positive_training_examples}')\nprint(f'Number of negative examples in training dataset: {n_negative_training_examples}')", - "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates and prints the number of total, positive, and negative examples in the training dataset after the split.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9978163 - }, - "cluster": -1 - }, { - "cell_id": 0, - "code": "import random\n\nimport pandas as pd\nimport numpy as np \nfrom scipy.special import softmax\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import (roc_auc_score, classification_report, \n confusion_matrix)\nimport tensorflow as tf\nfrom transformers import BertTokenizer\nfrom transformers import TFBertForSequenceClassification\nfrom transformers import AutoConfig\n", - "class": "Imports and Environment", - "desc": "This code imports necessary libraries and modules for data manipulation, visualization, machine learning model creation, and evaluation, as well as specific tools from TensorFlow and the Transformers library. 
", - "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99931514 - }, - "cluster": 0 - }, { - "cell_id": 20, - "code": "# Get predictions in the validation dataset\nval_predictions = model.predict(ds_val_encoded)\nval_probabilities = softmax(val_predictions[0], axis=1)\ny_val_predictions = np.argmax(val_probabilities, axis=1).flatten()", - "class": "Model Evaluation", - "desc": "This code snippet obtains predictions for the validation dataset using the trained model, applies the softmax function to obtain class probabilities, and then derives the final predicted class labels.", - "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.9947659 - }, - "cluster": 0 - }, { - "cell_id": 21, - "code": "# Compute metrics to evaluate the model\nclassification_metrics = classification_report(y_val, y_val_predictions)\n# Compute the area under the ROC curve\narea_under_the_curve = roc_auc_score(y_val, val_probabilities[:,1:2], multi_class=\"ovr\")\n# Compute the confusion matrix\nerror_matrix = confusion_matrix(y_val, y_val_predictions)\nprint(f'Area under the ROC curve: {area_under_the_curve}')\nprint(f'Classification metrics:\\n{classification_metrics}')\n# Plot the confusion matrix\nax = plt.axes()\nsns.heatmap(error_matrix, annot=True, fmt=\"d\")\nax.set_title('Confusion matrix Validation set')", - "class": "Model Evaluation", - "desc": "This code snippet calculates various evaluation metrics for the model on the validation dataset, including the classification report, area under the ROC curve, and confusion matrix, and also visualizes the confusion matrix using a heatmap.", + "code": "def create_multi_input_model() -> keras.Model:\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=feature_extractor.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(1)(keyword_features)\n\n tweet_classification_vectors = keras.Input((train_sentence_vectors.shape[1],), name=\"tweets\")\n tweet_features = Dense(1, activation='relu')(tweet_classification_vectors) \n\n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, tweet_classification_vectors], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\n\nmodel = create_multi_input_model()\n\ntrain_inputs = {\"keywords\" : train_keywords_encoded[\"input_ids\"], \"tweets\" : train_sentence_vectors}\nvalidation_inputs = {\"keywords\" : validation_keywords_encoded[\"input_ids\"], \"tweets\" : validation_sentence_vectors}\n\nhistory = model.fit(\n train_inputs,\n y_train,\n validation_data=(validation_inputs, y_val),\n class_weight=class_weights,\n epochs=20,\n verbose=0,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\n\nprint_metrics(model, train_inputs, y_train, validation_inputs, y_val)", + "class": "Model Training", + "desc": "This code defines and trains a multi-input neural network model that combines keyword embeddings and BERT-derived sentence embeddings, compiles it using an Adam 
optimizer and binary cross-entropy loss, trains it with class weights, and employs early stopping on the validation accuracy using TensorFlow and Keras.", "testing": { - "class": "Visualization", - "subclass": "heatmap", - "subclass_id": 80, - "predicted_subclass_probability": 0.68382823 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9820961 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 22, - "code": "# Show some predictions in the validation dataset\nfor i in random.sample(range(len(val_dataset)), k=N_PREDICTIONS_TO_SHOW):\n print(f'\\nText: {X_test.values[i]}')\n print(f'Ground truth: {\"Real disaster\" if y_val.values[i]==1 else \"Not real disaster\"}')\n print(f'Predicted: {\"Real disaster\" if y_val_predictions[i]==1 else \"Not real disaster\"}')", - "class": "Model Evaluation", - "desc": "This code snippet selects random samples from the validation dataset and prints the tweet text along with the ground truth and predicted labels, indicating whether the tweet is about a real disaster or not.", + "cell_id": 25, + "code": "def create_multi_input_rnn_model() -> keras.Model:\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=feature_extractor.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(1)(keyword_features)\n\n tweet_token_embeddings = Input(train_word_vectors.shape[1:], name=\"tweets\")\n tweet_features = GRU(32, return_sequences=True)(tweet_token_embeddings)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n tweet_features = Dense(1, activation='relu')(tweet_features) \n\n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, tweet_token_embeddings], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\n\nmodel = create_multi_input_rnn_model()\n\ntrain_inputs = {\"keywords\" : train_keywords_encoded[\"input_ids\"], \"tweets\" : train_word_vectors}\nvalidation_inputs = {\"keywords\" : validation_keywords_encoded[\"input_ids\"], \"tweets\" : validation_word_vectors}\n\nhistory = model.fit(\n train_inputs,\n y_train,\n validation_data=(validation_inputs, y_val),\n class_weight=class_weights,\n epochs=20,\n verbose=0,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(model, train_inputs, y_train, validation_inputs, y_val)", + "class": "Model Training", + "desc": "This code defines and trains a multi-input recurrent neural network (RNN) model that combines keyword embeddings and GRU-processed tweet token embeddings, compiles it using an Adam optimizer and binary cross-entropy loss, trains it with class weights, and employs early stopping based on validation accuracy using TensorFlow and Keras.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.49180317 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.8343507 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 25, - "code": "test_predictions = 
model.predict(ds_test_encoded)\ntest_probabilities = softmax(test_predictions[0], axis=1)\ny_test_predictions = np.argmax(test_probabilities, axis=1).flatten()", - "class": "Model Evaluation", - "desc": "This code snippet obtains predictions for the test dataset using the trained model, applies the softmax function to get class probabilities, and then determines the final predicted class labels.", + "cell_id": 26, + "code": "def create_candidate_model_with_fx(hp: kerastuner.HyperParameters) -> keras.Model:\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=feature_extractor.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(hp.Choice(\"keyword_units\", values=[1, 8, 16, 32], default=1))(keyword_features)\n\n tweet_token_embeddings = Input(train_word_vectors.shape[1:], name=\"tweets\")\n \n tweet_features = GRU(hp.Choice(\"GRU_units\", values=[8, 16, 32, 64, 128], default=32), return_sequences=True)(tweet_token_embeddings)\n tweet_features = Dropout(hp.Float(\"GRU_dropout\", min_value=0.0, max_value=0.5, step=0.1))(tweet_features)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n \n for i in range(hp.Int(\"num_layers\", min_value=0, max_value=3, step=1)):\n tweet_features = Dense(hp.Choice(\"layer_\" + str(i) + \"_units\", values=[2, 8, 16, 32, 64, 128, 256]), activation=\"relu\")(tweet_features)\n tweet_features = Dropout(hp.Float(\"layer_\" + str(i) + \"_dropout\", min_value=0.0, max_value=0.5, step=0.1))(tweet_features)\n \n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, tweet_token_embeddings], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\ntrain_inputs = {\"keywords\" : train_keywords_encoded[\"input_ids\"], \"tweets\" : train_word_vectors}\nvalidation_inputs = {\"keywords\" : validation_keywords_encoded[\"input_ids\"], \"tweets\" : validation_word_vectors}\n", + "class": "Model Training", + "desc": "This code defines a candidate RNN-based multi-input neural network model using Keras Tuner's `HyperParameters` to allow hyperparameter tuning, setting up inputs for keyword embeddings and GRU-processed tweet token embeddings, and compiling the model with an Adam optimizer and binary cross-entropy loss.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99444926 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.37123346 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 1, - "code": "# The name of the BERT model used\nPRETRAINED_MODEL_NAME = 'bert-base-uncased'\n# The number of labels of the target variable\nLABELS_NUMBER = 2\n\n# The max lenght of text can be up to 512 for BERT\nMAX_LENGHT = 512\n\nBATCH_SIZE = 6\nLEARNING_RATE = 2e-5\nEPOCHS_NUMBER = 1\n\nN_PREDICTIONS_TO_SHOW = 10", + "cell_id": 27, + "code": "# Hyperband Tuning\nMAX_EPOCHS = 10\nFACTOR = 3\nITERATIONS = 3\n\nprint(f\"Number of models in each bracket: {math.ceil(1 + math.log(MAX_EPOCHS, FACTOR))}\")\nprint(f\"Number of epochs over all trials: {round(ITERATIONS * (MAX_EPOCHS * (math.log(MAX_EPOCHS, FACTOR) ** 2)))}\")", 
"class": "Model Training", - "desc": "This code snippet defines several constants including the pre-trained BERT model name, the number of target variable labels, maximum text length, batch size, learning rate, number of epochs, and number of predictions to show.", + "desc": "This code sets up parameters and calculations to perform Hyperband tuning by specifying the maximum number of epochs, factor, and iterations, and prints the number of models in each bracket as well as the total number of epochs over all trials.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "define_variables", "subclass_id": 77, - "predicted_subclass_probability": 0.99904436 + "predicted_subclass_probability": 0.5920417 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 6, - "code": "# Get the Bert tokenizer\ntokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, \n do_lower_case=True)", + "cell_id": 28, + "code": "tuner = kerastuner.Hyperband(\n create_candidate_model_with_fx,\n max_epochs=MAX_EPOCHS,\n hyperband_iterations=ITERATIONS, \n factor=FACTOR, \n objective=\"val_accuracy\",\n directory=\"hyperparam-search\",\n project_name=\"architecture-hyperband\",\n)\n\ntuner.search(\n train_inputs,\n y_train,\n validation_data=(validation_inputs, y_val),\n class_weight=class_weights,\n epochs=10,\n verbose=1,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=3,\n restore_best_weights=True,\n )\n ],\n)\n", "class": "Model Training", - "desc": "This code snippet initializes the BERT tokenizer using the pre-trained BERT model name and sets it to perform lowercasing while tokenizing text data.", + "desc": "This code initiates a Hyperband tuner using Keras Tuner to optimize the hyperparameters of the multi-input RNN model defined by `create_candidate_model_with_fx`, and starts the search over specified epochs and iterations using the training dataset and early stopping callback based on validation accuracy.", "testing": { "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9954691 + "subclass": "train_on_grid", + "subclass_id": 6, + "predicted_subclass_probability": 0.7966217 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 7, - "code": "# Print some words of the vocabulary\nvocabulary = tokenizer.get_vocab()\nprint(f'Size of the vocabulary: {len(vocabulary)}')\nprint(f'Some tokens of the vocabulary: {list(vocabulary.keys())[5000:5010]}')", + "cell_id": 31, + "code": "# To create a baseline for the simplest possible fine-tuned BERT\ndef create_bert_simple_for_ft():\n input_ids = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"input_ids\")\n attention_mask = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"attention_mask\")\n\n pretrained_bert_model = get_pretrained_bert_model()\n bert_outputs = pretrained_bert_model(input_ids, attention_mask)\n\n prediction = Dense(1, activation=\"sigmoid\")(bert_outputs.last_hidden_state[:, 0, :])\n return keras.Model(inputs=[input_ids, attention_mask], outputs=prediction)\n\nmodel = create_bert_simple_for_ft()\n\nmodel.compile(\n optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),\n loss=\"binary_crossentropy\",\n metrics=[\"accuracy\"],\n)\n\nmodel.fit(\n train_dataset.batch(32),\n validation_data=val_dataset.batch(32),\n class_weight=class_weights,\n epochs=20,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=5,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(\n model, 
dict(train_tweets_encoded), y_train, dict(validation_tweets_encoded), y_val\n)\n", "class": "Model Training", - "desc": "This code snippet retrieves and prints the size of the BERT tokenizer vocabulary and displays a sample of tokens from the vocabulary.", + "desc": "This code defines, compiles, and trains a simple fine-tuned BERT model on tokenized tweet inputs, using an Adam optimizer with a defined learning rate, binary cross-entropy loss, class weights, and early stopping based on validation accuracy, and then evaluates the model's performance by printing training and validation metrics.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9397918 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.98817086 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 16, - "code": "def get_model():\n # Define the configuration of the model\n config = AutoConfig.from_pretrained(PRETRAINED_MODEL_NAME,\n hidden_dropout_prob=0.2,\n num_labels=LABELS_NUMBER)\n # Model initialization\n model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, \n config=config)\n return model", + "cell_id": 32, + "code": "def create_bert_rnn_for_ft():\n \n pretrained_bert_model = get_pretrained_bert_model()\n \n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=pretrained_bert_model.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(1)(keyword_features)\n\n input_ids = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"input_ids\")\n attention_mask = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"attention_mask\")\n bert_outputs = pretrained_bert_model(input_ids, attention_mask)\n\n bert_token_embeddings = bert_outputs.last_hidden_state[:, 1:, :]\n tweet_features = GRU(32, return_sequences=True)(bert_token_embeddings)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n\n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, input_ids, attention_mask], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(learning_rate=5e-5),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n\nmodel = create_bert_rnn_for_ft()\n\nmodel.fit(\n train_multi_input_dataset.batch(32),\n validation_data=val_multi_input_dataset.batch(32),\n epochs=20,\n class_weight=class_weights,\n callbacks=[\n EarlyStopping(\n monitor=\"val_accuracy\",\n min_delta=0.001,\n patience=3,\n restore_best_weights=True,\n )\n ],\n)\n\nprint_metrics(\n model, train_inputs_encoded, y_train, validation_inputs_encoded, y_val\n)", "class": "Model Training", - "desc": "This code snippet defines a function to obtain a BERT model for sequence classification by configuring dropout probability and the number of labels, and initializing the model with these settings.", + "desc": "This code defines, compiles, and trains a multi-input RNN-enhanced fine-tuned BERT model that combines keyword embeddings with GRU-processed BERT token embeddings from tweets, using an Adam optimizer with a specified learning rate, binary cross-entropy loss, class weights, and early stopping based on validation accuracy, then evaluates the 
model by printing training and validation metrics.", "testing": { "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.98904186 + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.7985944 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 17, - "code": "# Model initialization\nmodel = get_model()\n\n# Define the optimizer, the loss function and metrics\noptimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)\nloss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\nmetric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n# Compile the model\nmodel.compile(optimizer=optimizer, loss=loss, metrics=[metric])", + "cell_id": 33, + "code": "def create_model_candidate() -> keras.Model:\n pretrained_bert_model = get_pretrained_bert_model()\n\n keyword_ids = keras.Input((8,), name=\"keywords\")\n keyword_features = Embedding(input_dim=pretrained_bert_model.config.vocab_size, output_dim=16, input_length=8, mask_zero=True)(keyword_ids)\n keyword_features = Flatten()(keyword_features)\n keyword_features = Dense(best_arch_hp.get(\"keyword_units\"))(keyword_features)\n\n input_ids = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"input_ids\")\n attention_mask = Input(shape=(max_length_tweet,), dtype=\"int32\", name=\"attention_mask\")\n bert_outputs = pretrained_bert_model(input_ids, attention_mask)\n bert_token_embeddings = bert_outputs.last_hidden_state[:, 1:, :]\n tweet_features = GRU(best_arch_hp.get(\"GRU_units\"), return_sequences=True)(bert_token_embeddings)\n tweet_features = Dropout(best_arch_hp.get(\"GRU_dropout\"))(tweet_features)\n tweet_features = GlobalMaxPooling1D()(tweet_features)\n \n for i in range(best_arch_hp.get(\"num_layers\")):\n tweet_features = Dense(best_arch_hp.get(\"layer_\" + str(i) + \"_units\"), activation=\"relu\")(tweet_features)\n tweet_features = Dropout(best_arch_hp.get(\"layer_\" + str(i) + \"_dropout\"))(tweet_features)\n \n combined_features = concatenate([keyword_features, tweet_features])\n combined_prediction = Dense(1, activation=\"sigmoid\")(combined_features)\n\n model = keras.Model(inputs = [keyword_ids, input_ids, attention_mask], outputs=combined_prediction)\n\n model.compile(\n optimizer=keras.optimizers.Adam(learning_rate=5e-5),\n loss=\"binary_crossentropy\",\n metrics=keras.metrics.BinaryAccuracy(name=\"accuracy\"),\n )\n return model\n", "class": "Model Training", - "desc": "This code snippet initializes the BERT model for sequence classification, defines the optimizer, loss function, and accuracy metric, and compiles the model with these configurations to prepare it for training.", + "desc": "This code defines a candidate model architecture that incorporates the optimal hyperparameters identified through Keras Tuner, combining keyword embeddings and GRU-processed BERT token embeddings from tweets, and compiles the model using an Adam optimizer with a specified learning rate, binary cross-entropy loss, and binary accuracy metric using TensorFlow and Keras.", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, - "predicted_subclass_probability": 0.9943469 + "predicted_subclass_probability": 0.90136576 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 18, - "code": "# Scaling by total/2 helps keep the loss to a similar magnitude.\n# The sum of the weights of all examples stays the same.\nweight_for_0 = (1 / n_negative_training_examples)*(n_training_examples)/2.0 
\nweight_for_1 = (1 / n_positive_training_examples)*(n_training_examples)/2.0\n\nclass_weight = {0: weight_for_0, 1: weight_for_1}\n\nprint('Weight for class 0: {:.2f}'.format(weight_for_0))\nprint('Weight for class 1: {:.2f}'.format(weight_for_1))", + "cell_id": 34, + "code": "model = create_model_candidate()\n\nhistory = model.fit(\n train_multi_input_dataset.batch(32),\n validation_data=val_multi_input_dataset.batch(32),\n epochs=6,\n class_weight=class_weights,\n callbacks=[\n keras.callbacks.EarlyStopping(\n monitor=\"val_accuracy\", restore_best_weights=True\n )\n ],\n)\n\nbest_epoch = len(history.history[\"val_accuracy\"]) - 1\n\nprint_metrics(\n model, train_inputs_encoded, y_train, validation_inputs_encoded, y_val\n)", "class": "Model Training", - "desc": "This code snippet calculates the class weights for the training dataset to handle class imbalance by scaling the weights for each class such that their total remains balanced, and prints the calculated weights for each class.", + "desc": "This code trains the candidate model created with the optimal hyperparameters on the multi-input dataset using an Adam optimizer, class weights, and early stopping based on validation accuracy, then evaluates the model's performance by printing training and validation metrics.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.44395435 + "predicted_subclass_probability": 0.99578863 }, "cluster": 0 }, { - "cell_id": 19, - "code": "# Train the model\nmodel.fit(ds_train_encoded, epochs=EPOCHS_NUMBER, validation_data=ds_val_encoded,\n class_weight = class_weight)", + "cell_id": 36, + "code": "full_train_dataset = train_multi_input_dataset.concatenate(val_multi_input_dataset)\nmodel = create_model_candidate()\n\nmodel.fit(\n full_train_dataset.batch(32),\n epochs=best_epoch,\n class_weight=class_weights,\n)", "class": "Model Training", - "desc": "This code snippet trains the BERT model on the encoded training dataset for the specified number of epochs, using the validation dataset for evaluation and applying the calculated class weights to address class imbalance.", + "desc": "This code concatenates the training and validation datasets to form a full training dataset, creates a new instance of the candidate model, and trains it on the combined dataset for the number of epochs determined by the best validation accuracy epoch, using class weights.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.9996909 - }, - "cluster": 0 - }, { - "cell_id": 5, - "code": "sns.countplot(train_data[\"target\"])", - "class": "Visualization", - "desc": "This code snippet creates a count plot to visualize the distribution of the target variable in the training dataset using Seaborn.", - "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9974095 + "predicted_subclass_probability": 0.9959798 }, "cluster": 0 }], "notebook_id": 5, - "notebook_name": "detecting-disaster-tweets-fine-tuning-bert" + "notebook_name": "bert-feature-extraction-and-fine-tuning.ipynb" }, { "cells": [{ - "cell_id": 13, - "code": "def submission(submission_file_path,model,test_vectors):\n sample_submission = pd.read_csv(submission_file_path)\n sample_submission[\"target\"] = model.predict(test_vectors)\n sample_submission.to_csv(\"submission.csv\", index=False)", + "cell_id": 11, + "code": "def submission(model, test_df, fname = 
'submission'):\n y_hat = model.predict(test_df)\n submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\n submission['target'] = y_hat\n submission.to_csv('submission.csv', index=False)", "class": "Data Export", - "desc": "The code defines a function to create a submission file by predicting the target values using a given model and test feature vectors, and then saving the results to a CSV file named \"submission.csv\".", + "desc": "This code snippet defines a function `submission` that takes a trained model and test DataFrame, generates predictions, loads a sample submission file, updates the 'target' column with the predictions, and saves the final submission file as 'submission.csv' using Pandas.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9992994 + "predicted_subclass_probability": 0.9989374 }, "cluster": -1 }, { - "cell_id": 14, - "code": "submission_file_path = \"../input/nlp-getting-started/sample_submission.csv\"\ntest_vectors=test_tfidf\nclf = clf_xgb_TFIDF\nsubmission(submission_file_path,clf,test_vectors)", + "cell_id": 17, + "code": "# submission\ny_hat = lr.predict_proba(test)\ny_hat = y_hat[:, 1]\n\npreds = to_class_label(y_hat, opt_thres)\nsubmission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\nsubmission['target'] = preds\n\nsubmission.to_csv('submission.csv', index=False)", "class": "Data Export", - "desc": "The code calls the previously defined `submission` function with a specified submission file path, the XGBoost classifier, and the test feature vectors to generate and save the prediction results into a \"submission.csv\" file.", + "desc": "This code snippet generates predictions on the test dataset using the trained logistic regression model, converts the predicted probabilities to class labels using the optimal threshold, updates the sample submission file with these predictions, and saves the final submission as 'submission.csv'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.7684079 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9982597 }, "cluster": -1 }, { "cell_id": 1, - "code": "train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\nX_train = train.iloc[:, :4]\ny_train = train.iloc[:, 4]\nX_test = test\nprint(X_train.shape, y_train.shape, X_test.shape)", + "code": "# print files in input dir\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n ", "class": "Data Extraction", - "desc": "The code reads the training and test datasets from specified CSV files, extracts features and target variables, and then prints their dimensions to verify the data import process.", + "desc": "This code snippet traverses the directory '/kaggle/input' and prints the paths of all files present using the `os.walk` method.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99967587 + "class": "Exploratory_Data_Analysis", + "subclass": "list_files", + "subclass_id": 88, + "predicted_subclass_probability": 0.9993166 }, - "cluster": -1 + "cluster": 0 }, { "cell_id": 2, - "code": "def lowercase_text(text):\n return 
text.lower()\n\nX_train.text=X_train.text.apply(lambda x: lowercase_text(x))\nX_test.text=X_test.text.apply(lambda x: lowercase_text(x))\nX_train.head()", - "class": "Data Transform", - "desc": "This code defines a function to convert text to lowercase and then applies this function to the 'text' column of both the training and test feature sets to standardize the text data.", + "code": "train = pd.read_csv('../input/nlp-getting-started/train.csv')\ntest = pd.read_csv('../input/nlp-getting-started/test.csv')\n\ntrain.head()", + "class": "Data Extraction", + "desc": "This code snippet reads the training and testing datasets from CSV files into Pandas DataFrames and displays the first few rows of the training dataset using the `head` method.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9221044 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9996737 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 3, - "code": "import re\nimport string\ndef remove_noise(text):\n text = re.sub('\\[.*?\\]', '', text)\n text = re.sub('https?://\\S+|www\\.\\S+', '', text)\n text = re.sub('<.*?>+', '', text)\n text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n text = re.sub('\\n', '', text)\n text = re.sub('\\w*\\d\\w*', '', text)\n text = re.sub('\u0089\u00fb\u00f2', '', text)\n return text\nX_train.text=X_train.text.apply(lambda x: remove_noise(x))\nX_test.text=X_test.text.apply(lambda x: remove_noise(x))\nX_train.head()", - "class": "Data Transform", - "desc": "The code defines a function to clean text data by removing various types of noise such as brackets, URLs, HTML tags, punctuation, newline characters, digits, and special characters, and then applies this function to the 'text' column in both the training and test datasets to preprocess the text data.", + "cell_id": 9, + "code": "x_train, x_test, y_train, y_test = train_test_split(train.loc[:,train.columns != 'target'], train.target, test_size=0.2)\nprint(x_train.shape, y_train.shape, x_test.shape, y_test.shape)", + "class": "Data Extraction", + "desc": "This code snippet splits the training dataset into training and validation subsets for features and target variable using the `train_test_split` function from Scikit-learn, and prints the shapes of these subsets.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.7514645 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.99772364 }, "cluster": 1 }, { "cell_id": 4, - "code": "# Tokenizing the training and the test set\nimport nltk\nfrom nltk.corpus import stopwords\ntokenizer = nltk.tokenize.RegexpTokenizer(r'\\w+')\nX_train['text'] = X_train['text'].apply(lambda x: tokenizer.tokenize(x))\nX_test['text'] = X_test['text'].apply(lambda x: tokenizer.tokenize(x))\nX_train['text'].head()", + "code": "# plot prop of missing for each feature\nsns.set_theme(style='white')\nsns.barplot(x=train.columns, y=train.isnull().mean())\nplt.show()\n\n# drop location and keyword\ntrain.drop(columns=['id', 'keyword', 'location'], inplace=True)\ntest.drop(columns=['id', 'keyword', 'location'], inplace=True)\ntrain.drop_duplicates(inplace=True, ignore_index=True)", "class": "Data Transform", - "desc": "This code tokenizes the 'text' column in both the training and test datasets using a regular expression tokenizer to split the text into individual 
words, preparing the data for further processing.", + "desc": "This code snippet visualizes the proportion of missing values for each feature using a bar plot with Seaborn and Matplotlib, and then drops the 'id', 'keyword', and 'location' columns from the training and testing datasets, as well as removes duplicate rows from the training dataset using Pandas.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.92383534 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.8657076 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 5, - "code": "# Removing stopwords belonging to english language\ndef remove_stopwords(text):\n words = [w for w in text if w not in stopwords.words('english')]\n return words\n\nX_train['text'] = X_train['text'].apply(lambda x : remove_stopwords(x))\nX_test['text'] = X_test['text'].apply(lambda x : remove_stopwords(x))\nX_train.head()", + "cell_id": 6, + "code": "wordnet_lemmatizer = WordNetLemmatizer()\n\ndef quick_clean(text):\n \"\"\"\n adapted from: https://www.kaggle.com/sophiejermy/sj-eda1\n \"\"\"\n# text = text + ' '\n #remove links\n text = re.sub(r'(?:(?:https?|ftp):\\/\\/)?[\\w/\\-?=%.]+\\.[\\w/\\-&?=%.]+', '', text)\n #lower case\n text = text.lower() \n #remove special characters\n text = re.sub(r'[\\W]+', ' ', text)\n #remove double spaces\n text = re.sub(r'\\s+', ' ', text)\n #tokenize\n text = word_tokenize(text)\n #remove stop words\n text = [word for word in text if not word in stopwords.words('english')] \n #lemmatize\n text= [wordnet_lemmatizer.lemmatize(word, pos='v') for word in text]\n #rejoin text to string\n text = ' '.join(text)\n return text\n\ndef quick_clean_vectorized(col):\n return pd.DataFrame(data=col.apply(lambda x: quick_clean(x)).tolist())\n\nquiklean_transformer = FunctionTransformer(quick_clean_vectorized) # to use in pipeline\n ", "class": "Data Transform", - "desc": "This code defines a function to remove English stopwords from the tokenized text and applies this function to the 'text' column in both the training and test datasets to reduce noise and potentially improve model performance.", + "desc": "This code snippet defines a text cleaning function named `quick_clean` that removes links, converts text to lowercase, removes special characters and stop words, tokenizes and lemmatizes text using regular expressions and NLTK, and then creates a vectorized version of this function to be used in a machine learning pipeline using Scikit-learn's `FunctionTransformer`.", "testing": { "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.98372287 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.75697106 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 6, - "code": "# After preprocessing, the text format\ndef combine_text(list_of_text):\n '''Takes a list of text and combines them into one large chunk of text.'''\n combined_text = ' '.join(list_of_text)\n return combined_text\n\nX_train['text'] = X_train['text'].apply(lambda x : combine_text(x))\nX_test['text'] = X_test['text'].apply(lambda x : combine_text(x))\n# X_train['text']\nX_train.head()", + "cell_id": 10, + "code": "tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english', max_features = 300)\n\npreprocess = Pipeline(steps=[\n ('clean', ColumnTransformer([\n ('cl', quiklean_transformer, 
'text')\n ],\n remainder='drop')),\n ('TFIDF', ColumnTransformer([\n ('tfidf', tfidf_vectorizer, 0)\n ], \n remainder='passthrough')),\n ('dim_reduce', TruncatedSVD(n_components=250, random_state=42)),\n ('scale', MinMaxScaler())\n \n ])", "class": "Data Transform", - "desc": "This code defines a function to concatenate a list of words into a single string and applies this function to the 'text' column in both the training and test datasets, rejoining the tokenized and cleaned text into a continuous string format.", + "desc": "This code snippet defines a preprocessing pipeline named `preprocess` using Scikit-learn's `Pipeline` that includes text cleaning (`quiklean_transformer`), TF-IDF vectorization (`TfidfVectorizer`), dimensionality reduction (`TruncatedSVD`), and feature scaling (`MinMaxScaler`).", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, - "predicted_subclass_probability": 0.7641233 + "predicted_subclass_probability": 0.9810301 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 7, - "code": "# Stemming\nfrom nltk.stem.snowball import SnowballStemmer\nstemmer = SnowballStemmer(\"english\")\n\ndef stemming(text):\n text = [stemmer.stem(word) for word in text.split()]\n return ' '.join(text)\n\n#X_train['text'] = X_train['text'].apply(lambda x : stemming(x))\n#X_test['text'] = X_test['text'].apply(lambda x : stemming(x))\n#X_train", - "class": "Data Transform", - "desc": "The code initializes a SnowballStemmer for the English language and defines a function to stem words in the text, although the actual application of this function to the 'text' column in both the training and test datasets is currently commented out.", + "cell_id": 3, + "code": "print(f'Train dims {train.shape}', f'Test dims {test.shape}', sep = '\\n')", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints out the dimensions (number of rows and columns) of the training and testing datasets using the `shape` attribute of the Pandas DataFrames.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.6512052 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99101853 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 10, - "code": "from sklearn.feature_extraction.text import CountVectorizer\ncount_vectorizer=CountVectorizer() # analyzer='word', stop_words = \"english\"\ntrain_vec = count_vectorizer.fit_transform(X_train.text)\ntest_vec = count_vectorizer.transform(X_test.text)\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nTfidf_vectorizer = TfidfVectorizer() # min_df=2, max_df=0.5, ngram_range=(1, 2)analyzer='word', stop_words = \"english\"analyzer='word', stop_words='english'# , ngram_range=(1, 2), lowercase=True, max_features=150000\ntrain_tfidf = Tfidf_vectorizer.fit_transform(X_train.text)\ntest_tfidf = Tfidf_vectorizer.transform(X_test.text)\n\nprint(\"train_vec\" ,train_vec[7].todense())\nprint(\"test_vec\", test_vec[7].todense())\n\nprint(\"train_tfidf\" ,train_tfidf[7].todense())\nprint(\"test_tfidf\", test_vec[7].todense())", - "class": "Data Transform", - "desc": "The code uses CountVectorizer and TfidfVectorizer to convert the preprocessed text data into numerical feature vectors for both the training and test datasets, enabling them for machine learning model inputs, and prints out a sample vector for verification.", + "cell_id": 5, + "code": "# plot target 
distribution\nsns.countplot(x='target', data=train)\nplt.title('Target distribution')\nplt.show()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet visualizes the distribution of the 'target' variable in the training dataset using a count plot generated with Seaborn and Matplotlib.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.8695518 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99668735 }, - "cluster": 1 + "cluster": 4 }, { "cell_id": 0, - "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "code": "# setup\n\nfrom collections import Counter, defaultdict\n\nimport numpy as np \nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport os\nimport re\n\nfrom sklearn.preprocessing import FunctionTransformer, MinMaxScaler\nfrom sklearn.pipeline import Pipeline, make_pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.model_selection import train_test_split \nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import set_config\n\nimport optuna\nimport nltk\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\n\nplt.style.use('ggplot')\n%matplotlib inline\n\nset_config(display='diagram')", "class": "Imports and Environment", - "desc": "The code imports essential libraries (numpy for linear algebra and pandas for data processing) and lists all files in the input directory to verify the available dataset files. 
", + "desc": "This code snippet imports various libraries and modules for data manipulation (Pandas, NumPy), visualization (Matplotlib, Seaborn), natural language processing (NLTK), and machine learning (Scikit-learn, Optuna), and sets up the Matplotlib style and Scikit-learn configuration.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "list_files", - "subclass_id": 88, - "predicted_subclass_probability": 0.99921954 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.99920374 }, "cluster": 0 }, { - "cell_id": 11, - "code": "from sklearn.model_selection import KFold\nkF = KFold(shuffle=True, random_state=241) # \u0440\u0430\u0437\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043d\u0430 5 \u0432\u044b\u0431\u043e\u0440\u043e\u043a\n# MultinomialNB\nfrom sklearn import model_selection\nfrom sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB() \nscores = model_selection.cross_val_score(clf,train_vec,y_train,cv=kF,scoring='f1')\nprint(\"MultinomialNB score: \" ,scores.mean())\n\n# LogisticRegression\nfrom sklearn.linear_model import LogisticRegression\nclf_tfidf = LogisticRegression()\nscores_tfidf = model_selection.cross_val_score(clf_tfidf,train_tfidf,y_train,\n cv=kF,scoring='f1')\nprint(\"LogisticRegretion score: \" ,scores_tfidf.mean())\n\n# SVC\nfrom sklearn.svm import SVC # \u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f \u043c\u0435\u0442\u043e\u0434\u0430 \u043e\u043f\u043e\u0440\u043d\u044b\u0445 \u0432\u0435\u043a\u0442\u043e\u0440\u043e\u0432\nclf_svc = SVC()#kernel='linear', random_state=241\nscores_svc = model_selection.cross_val_score(clf_svc,train_tfidf,y_train,\n cv=kF,scoring='f1')\nprint(\"SVC score: \" ,scores_svc.mean())\n\n# XGBoost\nimport xgboost as xgb\nclf_xgb_TFIDF = xgb.XGBClassifier()#max_depth=7, n_estimators=150, colsample_bytree=0.8, \n #subsample=0.8, nthread=10, learning_rate=0.1\nscores_xgb = model_selection.cross_val_score(clf_xgb_TFIDF, train_tfidf, y_train, cv=kF, scoring=\"f1\")\nprint(\"XGBost score: \" ,scores_xgb.mean())\n\n", + "cell_id": 14, + "code": "def to_class_label(probs, threshold):\n \"\"\"convert predicted probabilities to class labels\"\"\"\n return (probs >= threshold).astype('int')\n\ndef get_optimal_threshold(fitted_model, x_test, y_test):\n \"\"\"Threshold tuning\"\"\"\n thresholds = np.arange(0, 1, 0.0005)\n y_hat = fitted_model.predict_proba(x_test)\n pos_clas_probs = y_hat[:, 1]\n acc_scores = [accuracy_score(y_test, to_class_label(pos_clas_probs, thres)) for thres in thresholds]\n idx = np.argmax(acc_scores)\n \n return thresholds[idx]\n ", + "class": "Model Evaluation", + "desc": "This code snippet defines two functions: `to_class_label`, which converts predicted probabilities to class labels based on a given threshold, and `get_optimal_threshold`, which tunes the threshold for converting probabilities to class labels by maximizing the accuracy score on the validation dataset.", + "testing": { + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.54912084 + }, + "cluster": 3 + }, { + "cell_id": 16, + "code": "# get optimal threshold\nopt_thres = get_optimal_threshold(lr, x_test, y_test)\nprint(f'Optimal threshold for trained LR {get_optimal_threshold(lr, x_test, y_test):.4f}')", "class": "Model Evaluation", - "desc": "The code sets up a K-Fold cross-validation and evaluates the performance of four different classifiers (Multinomial Naive Bayes, 
Logistic Regression, SVC, and XGBoost) using F1 scores, then prints the mean F1 scores for each model to compare their performance.", + "desc": "This code snippet calculates and prints the optimal threshold for converting predicted probabilities to class labels for the trained logistic regression model by maximizing the accuracy score on the validation dataset.", "testing": { "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.6108105 + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.6867679 }, "cluster": 0 }, { "cell_id": 12, - "code": "# MultinomialNB\nclf.fit(train_vec,y_train)\ny_pred = clf.predict(test_vec)\nscores = model_selection.cross_val_score(clf,test_vec,y_pred,cv=kF,scoring='f1')\nprint(\"MultinomialNB prediction score: \" ,scores.mean())\n\n# LogisticRegression\nclf_tfidf.fit(train_tfidf, y_train)\ny_pred_tfidf = clf_tfidf.predict(test_tfidf)\nscores_tfidf = model_selection.cross_val_score(clf_tfidf,test_tfidf,y_pred_tfidf,cv=kF,\n scoring='f1')\nprint(\"LogisticRegretion prediction score: \" ,scores_tfidf.mean())\n\n# SVC\nclf_svc.fit(train_tfidf, y_train)\ny_pred_svc = clf_svc.predict(test_tfidf)\nscores_svc = model_selection.cross_val_score(clf_svc,test_tfidf,y_pred_svc, cv=kF,\n scoring='f1') \nprint(\"SVC prediction score: \" ,scores_svc.mean())\n\n# XGBoost\nclf_xgb_TFIDF.fit(train_tfidf, y_train)\ny_pred_xgb = clf_xgb_TFIDF.predict(test_tfidf)\nscores_xgb = model_selection.cross_val_score(clf_xgb_TFIDF,test_tfidf,y_pred_xgb, cv=kF,\n scoring='f1') \nprint(\"XGBoosting prediction score: \" ,scores_xgb.mean())", + "code": "# Tune logistic regression\ndef objective(trial):\n x, y = x_train, y_train\n C = trial.suggest_float('C', 1e-6, 1e6, log=True)\n penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])\n l1_ratio = trial.suggest_float('l1_ratio', 0, 1)\n if penalty != 'elasticnet':\n l1_ratio = None\n \n clf = make_pipeline(preprocess, LogisticRegression(C=C,\n penalty=penalty,\n l1_ratio=l1_ratio,\n solver='saga',\n max_iter=800))\n clf.fit(x,y)\n \n acc = accuracy_score(y_test, clf.predict(x_test))\n \n return acc\n\nclass EarlyStopping:\n \"\"\"stop tuning after value remains unchanged after 10 successive trials\"\"\"\n def __init__(self, max_rounds = 10):\n self.max_rounds = max_rounds\n self.current_rounds = 0\n \n def __call__(self, study, trial, tol = 1e-6):\n if abs(trial.value - study.best_value) <= tol:\n self.current_rounds += 1\n elif trial.value == study.best_value:\n self.current_rounds = 0\n if self.current_rounds >= self.max_rounds:\n study.stop()", "class": "Model Training", - "desc": "The code trains four different classifiers (Multinomial Naive Bayes, Logistic Regression, SVC, and XGBoost) on the training dataset, makes predictions on the test dataset, evaluates their performance using K-Fold cross-validation and F1 scores, then prints the mean F1 scores for each predictive model on the test dataset.", + "desc": "This code snippet defines an `objective` function for tuning a logistic regression model using the Optuna framework by suggesting hyperparameters and evaluating accuracy, and an `EarlyStopping` class to halt the tuning process after a specified number of successive trials without improvement.", "testing": { "class": "Model_Train", "subclass": "find_best_model_class", "subclass_id": 3, - "predicted_subclass_probability": 0.7733168 + "predicted_subclass_probability": 0.3142369 + }, + "cluster": 3 + }, { + "cell_id": 13, 
+ "code": "# # create study and run trials\nes = EarlyStopping()\n\nstudy = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler()) # using Tree-structured Parzen Estimator to sample\nstudy.optimize(objective, n_trials=250, callbacks=[es])", + "class": "Model Training", + "desc": "This code snippet creates an Optuna study object with the direction set to maximize and the Tree-structured Parzen Estimator (TPE) sampler, and runs the hyperparameter optimization process for 250 trials using the defined `objective` function and the `EarlyStopping` callback.", + "testing": { + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.80090225 }, "cluster": 0 }, { - "cell_id": 8, - "code": "from wordcloud import WordCloud\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n%matplotlib inline\ndef wordsCloud (dF):\n fig , ax1 = plt.subplots(1,figsize=(12,12))\n stopword_list = stopwords.words(\"english\")\n wordcloud=WordCloud(stopwords = stopword_list, background_color='white',collocations = False , width=600,height=600).generate(\" \".join(dF))\n ax1.imshow(wordcloud)\n ax1.axis('off')\n ax1.set_title(\"Frequent Words\",fontsize=24) \n # print(stopword_list)\n return\nwordsCloud(X_train.text)", + "cell_id": 15, + "code": "# train LR on best parameters\nlr = LogisticRegression(**study.best_params, solver='saga', max_iter=800)\nlr = make_pipeline(preprocess, lr)\nlr.fit(x_train, y_train)", + "class": "Model Training", + "desc": "This code snippet trains a logistic regression model using the best hyperparameters obtained from the Optuna study, integrates the preprocessing pipeline, and fits the model on the training data.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.99761343 + }, + "cluster": 1 + }, { + "cell_id": 7, + "code": "def plot_top_n_words(target = 1, n=50):\n \n count_dict = defaultdict(int)\n\n for tweet in train.query(f'target=={target}')['text']:\n for word in word_tokenize(tweet):\n count_dict[word] += 1\n\n wc_df = pd.DataFrame(data=count_dict.items(), columns = ['word', 'count'])\n sns.barplot(x = 'count', y='word', data=wc_df.sort_values(by=['count'], ascending=False)[:n])", "class": "Visualization", - "desc": "The code defines a function to generate and display a word cloud from the text data, visualizing frequently occurring words in the training dataset, and then calls this function on the 'text' column of the training data.", + "desc": "This code snippet defines a function `plot_top_n_words` that counts the frequency of words in tweets based on the target value and plots the top N most frequent words using a bar plot generated with Seaborn and Pandas.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.55752915 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.8610096 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 9, - "code": "wordsCloud(X_test.text)", + "cell_id": 8, + "code": "plot_top_n_words()", "class": "Visualization", - "desc": "This code generates and displays a word cloud from the 'text' column of the test dataset to visualize frequently occurring words in the test data.", + "desc": "This code snippet calls the `plot_top_n_words` function to display a bar plot of the top N words (default 50) most frequently occurring in tweets labeled with 
target value 1 in the training dataset.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.8625756 + "class": "Visualization", + "subclass": "relationship", + "subclass_id": 81, + "predicted_subclass_probability": 0.58346504 }, - "cluster": 0 + "cluster": -1 }], - "notebook_id": 7, - "notebook_name": "distweetrhinosceros" + "notebook_id": 6, + "notebook_name": "logistic-regression-with-threshold-tuning.ipynb" }, { "cells": [{ "cell_id": 17, - "code": "df2.to_csv(\"submission.csv\", index=False)", + "code": "my_submission = pd.DataFrame({'Id': dataset_test_original.id, 'target': test_dataframe_prediction})\nmy_submission.to_csv('submission.csv', index=False)", "class": "Data Export", - "desc": "The code exports the final DataFrame to a CSV file named \"submission.csv\" without including the index.", + "desc": "The code creates a new dataframe with 'Id' and 'target' columns from the original test dataset and the predicted values, and saves it as a CSV file named 'submission.csv' without including the index.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9993235 + "predicted_subclass_probability": 0.9993311 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 4, - "code": "# read training data\ndf = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ndisplay(df.head())\ndisplay(df.shape)", + "cell_id": 1, + "code": "dataset_test_original = dataset_test", "class": "Data Extraction", - "desc": "The code reads the training data from a CSV file into a pandas DataFrame and displays the first few rows and the shape of the DataFrame.", + "desc": "The code creates a copy of the `dataset_test` dataframe and assigns it to `dataset_test_original`.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99592197 + "class": "Data_Transform", + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.40974942 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 12, - "code": "# apply model to test data\ndf1 = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\ndisplay(df1.head())\ndisplay(df1.shape)", + "cell_id": 2, + "code": "dataset_test_original", "class": "Data Extraction", - "desc": "The code reads the test data from a CSV file into a pandas DataFrame and displays the first few rows and the shape of the DataFrame.", + "desc": "The code outputs the `dataset_test_original` dataframe.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99765056 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99960965 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 5, - "code": "# clean training text\nl=len(df)\ndisplay(l)\ncleanlist=[]\ntextlength=[]\nfor i in range(l):\n ct=cleantext.clean(df.iloc[i,3], clean_all= True)\n cleanlist.append(ct)\n lct=len(ct)\n textlength.append(lct)\n ", - "class": "Data Transform", - "desc": "The code cleans the text data in the DataFrame using `cleantext.clean`, appends the cleaned text to a list, and records the length of each cleaned text.", + "cell_id": 4, + "code": "index_train = dataset_train.index\nindex_test = dataset_test.index\ntrain_len = index_train\ntest_len = index_test", + "class": "Data Extraction", + "desc": "The code 
extracts the index of `dataset_train` and `dataset_test` dataframes and assigns them to `index_train` and `index_test`, then assigns these indices to `train_len` and `test_len` respectively.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9911814 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.99675506 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 6, - "code": "# combine clean text with training data\ndf_clean=pd.DataFrame(cleanlist)\ndf_clean.columns=['cleantext']\nframes=[df,df_clean]\nnewdf=pd.concat(frames, axis=1)\ndisplay(newdf)", - "class": "Data Transform", - "desc": "The code creates a new DataFrame with the cleaned text and concatenates it with the original DataFrame to combine the cleaned text with the training data.", + "cell_id": 5, + "code": "dataset_train = dataset_train[['text','target']]\nprint(dataset_train)", + "class": "Data Extraction", + "desc": "The code selects the 'text' and 'target' columns from `dataset_train` and displays the resulting dataframe.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.9970824 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.3228975 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 14, - "code": "l=len(df1)\ndisplay(l)\npredlist=[]\n#l=1\nfor i in range(l):\n ct=cleantext.clean(df1.iloc[i,3], clean_all= True)\n new=predictor.predict(ct)\n predlist.append(new)", + "cell_id": 12, + "code": "dataset_test = dataset_test[['text']]", + "class": "Data Extraction", + "desc": "The code selects the 'text' column from the `dataset_test` dataframe and updates `dataset_test` to contain only this column.", + "testing": { + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.8538315 + }, + "cluster": -1 + }, { + "cell_id": 6, + "code": "corpus = []\nfor i in range(0, len(train_len)):\n review = re.sub('[^a-zA-Z]', ' ', dataset_train['text'][i])\n review = review.lower()\n review = review.split()\n ps = PorterStemmer()\n all_stopwords = stopwords.words('english')\n all_stopwords.remove('not')\n review = [ps.stem(word) for word in review if not word in set(all_stopwords)]\n review = ' '.join(review)\n corpus.append(review)", "class": "Data Transform", - "desc": "The code iterates through the test DataFrame, cleans each text entry using `cleantext.clean`, predicts the label using the trained predictor, and appends the predictions to a list.", + "desc": "The code preprocesses the 'text' data in the `dataset_train` dataframe by removing non-alphabetic characters, converting to lowercase, removing stopwords (except 'not'), and applying stemming, then builds a list of cleaned text entries called `corpus`.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99204236 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.91630965 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 15, - "code": "df_pred=pd.DataFrame(predlist)\ndf_pred.columns=['target']\nframes=[df1,df_pred]\ndf2=pd.concat(frames, axis=1)\ndisplay(df2.head())", + "cell_id": 7, + "code": "from sklearn.feature_extraction.text import 
CountVectorizer\ncv = CountVectorizer(max_features = 1500)\nX = cv.fit_transform(corpus).toarray()\ny = dataset_train.iloc[:, -1].values", "class": "Data Transform", - "desc": "The code creates a DataFrame from the prediction list, names the column 'target', and concatenates it with the test DataFrame to form a new DataFrame which it then displays.", + "desc": "The code uses `CountVectorizer` from scikit-learn to transform the cleaned text data (`corpus`) into a feature matrix `X` with up to 1500 features, and extracts the target values `y` from the `dataset_train` dataframe.", "testing": { - "class": "Data_Export", - "subclass": "prepare_output", - "subclass_id": 55, - "predicted_subclass_probability": 0.9379232 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.99771535 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 16, - "code": "df2.loc[df2['target']=='target','target']=1\ndf2.loc[df2['target']=='not_target','target']=0\ndisplay(df2['target'].mean())\ndf2=df2[['id','target']]\ndisplay(df2.shape)\ndisplay(df2.head())", + "cell_id": 8, + "code": "from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)", "class": "Data Transform", - "desc": "The code converts textual prediction labels in the 'target' column to numeric values (1 for 'target' and 0 for 'not_target'), calculates and displays the mean of the 'target' column, selects specific columns ('id' and 'target') for the final DataFrame, and displays the shape and first few rows of this DataFrame.", + "desc": "The code splits the feature matrix `X` and target values `y` into training and test sets using `train_test_split` from scikit-learn, with 20% of the data as the test set and a fixed random state for reproducibility.", "testing": { "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.8699204 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.99815947 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 7, - "code": "# distribution of clean text length\ndisplay(pd.Series(textlength).describe())", - "class": "Exploratory Data Analysis", - "desc": "The code calculates and displays the descriptive statistics for the lengths of the cleaned text.", + "cell_id": 14, + "code": "testcorpus = []\nfor i in range(0, len(test_len)):\n review = re.sub('[^a-zA-Z]', ' ', dataset_test['text'][i])\n review = review.lower()\n review = review.split()\n ps = PorterStemmer()\n all_stopwords = stopwords.words('english')\n all_stopwords.remove('not')\n review = [ps.stem(word) for word in review if not word in set(all_stopwords)]\n review = ' '.join(review)\n testcorpus.append(review)\n\nxtest = cv.transform(testcorpus).toarray()\ntest_dataframe_prediction = classifier.predict(xtest)", + "class": "Data Transform", + "desc": "The code preprocesses the 'text' data in the `dataset_test` dataframe by cleaning, splitting, removing stopwords, and stemming, compiles the cleaned text into `testcorpus`, transforms it into a feature matrix `xtest` using the previously fitted `CountVectorizer`, and predicts the target values using the trained Gaussian Naive Bayes classifier.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9994604 + "class": "Data_Transform", + "subclass": "string_transform", + 
"subclass_id": 78, + "predicted_subclass_probability": 0.39147574 }, - "cluster": -1 + "cluster": 5 }, { "cell_id": 13, - "code": "# example\ndf1.iloc[0,3]", + "code": "print(len(test_len))", "class": "Exploratory Data Analysis", - "desc": "The code retrieves and displays the text content from the fourth column of the first row in the test DataFrame.", + "desc": "The code prints the number of rows in the `dataset_test` dataframe by outputting the length of `test_len`.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9183029 + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99938715 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 0, - "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", - "class": "Imports and Environment", - "desc": "The code imports necessary libraries and checks the files available in the input directory.", + "cell_id": 16, + "code": "print(dataset_test.head())", + "class": "Exploratory Data Analysis", + "desc": "The code prints the first few rows of the `dataset_test` dataframe to provide a glimpse of its contents.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "list_files", - "subclass_id": 88, - "predicted_subclass_probability": 0.99921954 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99974805 }, "cluster": 0 }, { - "cell_id": 1, - "code": "pip install cleantext", - "class": "Imports and Environment", - "desc": "The code installs the `cleantext` library using pip for text cleaning tasks.", - "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.9904579 - }, - "cluster": 1 - }, { - "cell_id": 2, - "code": "pip install ktrain", + "cell_id": 0, + "code": "import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n if filename == 'train.csv':\n dataset_train = pd.read_csv(os.path.join(dirname, filename))\n elif filename == 'test.csv':\n dataset_test = pd.read_csv(os.path.join(dirname, filename))\n else:\n dataset_sample = pd.read_csv(os.path.join(dirname, filename))", "class": "Imports and Environment", - "desc": "The code installs the `ktrain` library using pip for training machine learning models.", + "desc": "The code imports necessary libraries like numpy and pandas for data processing and CSV file I/O, and it reads and loads CSV files from a directory into pandas dataframes.", "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.99148107 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.999468 }, - "cluster": 1 + "cluster": -1 }, { "cell_id": 3, - "code": "import pandas as pd\nimport ktrain\nfrom ktrain import text\nimport cleantext\nimport warnings\nwarnings.filterwarnings(\"ignore\")", + "code": "import re\nimport nltk\nnltk.download('stopwords')\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer", "class": "Imports and Environment", - "desc": "The code imports additional libraries, specifically `pandas` for data processing, `ktrain` and `ktrain.text` for model training, `cleantext` for text cleaning, and suppresses warnings.", + "desc": "The code imports the `re` and `nltk` libraries, downloads the NLTK stopwords, and imports specific components for text processing like `stopwords` and `PorterStemmer`.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.9993291 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9607691 }, "cluster": 0 }, { - "cell_id": 11, - "code": "# example\npredictor.predict(['calm','earthquake'])", + "cell_id": 10, + "code": "y_pred = classifier.predict(X_test)\nprint(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))", "class": "Model Evaluation", - "desc": "The code uses the trained predictor to make predictions on new example text inputs \"calm\" and \"earthquake.\"", + "desc": "The code uses the trained Gaussian Naive Bayes classifier to predict the target values for `X_test`, and prints the predicted values alongside the actual test values for comparison.", "testing": { "class": "Model_Evaluation", "subclass": "predict_on_test", "subclass_id": 48, - "predicted_subclass_probability": 0.9897764 + "predicted_subclass_probability": 0.99374145 }, "cluster": 0 }, { - "cell_id": 8, - "code": "#https://github.com/amaiya/ktrain\n# train model\n(x_train, y_train), (x_test, y_test), preproc=text.texts_from_df(newdf, 'cleantext',label_columns=['target'],\n maxlen=127,max_features=100000,\n preprocess_mode='bert', val_pct=.1)", - "class": "Model Training", - "desc": "The code prepares the text data from the DataFrame for training and validation by splitting it into training and test sets, and applies preprocessing suitable for BERT models using ktrain's `texts_from_df` function.", + "cell_id": 11, + "code": "from sklearn.metrics import confusion_matrix, accuracy_score\ncm = confusion_matrix(y_test, y_pred)\nprint(cm)\naccuracy_score(y_test, y_pred)", + "class": 
"Model Evaluation", + "desc": "The code calculates and prints the confusion matrix and the accuracy score using scikit-learn metrics to evaluate the performance of the Gaussian Naive Bayes classifier on the test data.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.38557464 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.996639 }, "cluster": 0 }, { - "cell_id": 9, - "code": "model=text.text_classifier('bert', (x_train, y_train), preproc=preproc)\nlearner=ktrain.get_learner(model, train_data=(x_train, y_train),\n val_data=(x_test, y_test),\n batch_size=32)", - "class": "Model Training", - "desc": "The code creates a BERT text classifier using the preprocessed training data and then initializes a ktrain learner object for training the model with specified training and validation data and a batch size of 32.", + "cell_id": 15, + "code": "print(test_dataframe_prediction)", + "class": "Model Evaluation", + "desc": "The code outputs the predicted values for the `dataset_test` dataframe using the previously trained Gaussian Naive Bayes classifier.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.87104684 + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.9981881 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 10, - "code": "learner.fit_onecycle(2e-5, 3)\npredictor=ktrain.get_predictor(learner.model, preproc)", + "cell_id": 9, + "code": "from sklearn.naive_bayes import GaussianNB\nclassifier = GaussianNB()\nclassifier.fit(X_train, y_train)", "class": "Model Training", - "desc": "The code trains the model using the one-cycle learning rate policy for 3 epochs with a learning rate of 2e-5 and then creates a predictor object from the trained model.", + "desc": "The code initializes a Gaussian Naive Bayes classifier from scikit-learn and trains it on the training data (`X_train` and `y_train`).", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.9968579 + "predicted_subclass_probability": 0.9989604 }, - "cluster": 0 + "cluster": 1 }], - "notebook_id": 8, - "notebook_name": "ktrain-disaster-tweet-model" + "notebook_id": 7, + "notebook_name": "nlp-prediction.ipynb" }, { "cells": [{ - "cell_id": 11, - "code": "def submission(model, test_df, fname = 'submission'):\n y_hat = model.predict(test_df)\n submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\n submission['target'] = y_hat\n submission.to_csv('submission.csv', index=False)", - "class": "Data Export", - "desc": "This code snippet defines a function to generate submission files by predicting labels for the test dataset using a trained model and saving the results into a CSV file.", - "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9989374 - }, - "cluster": 0 - }, { - "cell_id": 17, - "code": "# submission\ny_hat = lr.predict_proba(test)\ny_hat = y_hat[:, 1]\n\npreds = to_class_label(y_hat, opt_thres)\nsubmission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\nsubmission['target'] = preds\n\nsubmission.to_csv('submission.csv', index=False)", + "cell_id": 28, + "code": "submission['target'] = 
test_pred_BERT_int\nsubmission.to_csv(\"submission_BERT.csv\", index=False, header=True)", "class": "Data Export", - "desc": "This code snippet generates predictions for the test dataset using the trained logistic regression model, converts the probabilities to class labels using the optimal threshold, and saves the results into a CSV submission file.", + "desc": "This code snippet assigns the BERT model predictions to the 'target' column of the submission dataframe and saves it as a CSV file named \"submission_BERT.csv\" using `pandas.to_csv`.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9982597 - }, - "cluster": 0 - }, { - "cell_id": 1, - "code": "# print files in input dir\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n ", - "class": "Data Extraction", - "desc": "This code snippet prints the paths of all files in the specified input directory to identify available data files.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "list_files", - "subclass_id": 88, - "predicted_subclass_probability": 0.9993166 + "predicted_subclass_probability": 0.9993593 }, "cluster": -1 }, { - "cell_id": 2, - "code": "train = pd.read_csv('../input/nlp-getting-started/train.csv')\ntest = pd.read_csv('../input/nlp-getting-started/test.csv')\n\ntrain.head()", + "cell_id": 1, + "code": "train_df = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\nsubmission = pd.read_csv(\"../input/nlp-getting-started/sample_submission.csv\")\n\nprint(\"Training Shape rows = {}, columns = {}\".format(train_df.shape[0],train_df.shape[1]))\nprint(\"Testing Shape rows = {}, columns = {}\".format(test_df.shape[0],test_df.shape[1]))", "class": "Data Extraction", - "desc": "This code snippet reads the training and test datasets from CSV files into Pandas DataFrames and displays the first few rows of the training data.", + "desc": "This code snippet reads in the training, testing, and submission CSV files using `pandas` and prints the shapes of the train and test dataframes.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.9996737 + "predicted_subclass_probability": 0.99957687 }, - "cluster": 2 + "cluster": 0 }, { - "cell_id": 4, - "code": "# plot prop of missing for each feature\nsns.set_theme(style='white')\nsns.barplot(x=train.columns, y=train.isnull().mean())\nplt.show()\n\n# drop location and keyword\ntrain.drop(columns=['id', 'keyword', 'location'], inplace=True)\ntest.drop(columns=['id', 'keyword', 'location'], inplace=True)\ntrain.drop_duplicates(inplace=True, ignore_index=True)", - "class": "Data Transform", - "desc": "This code snippet visualizes the proportion of missing values for each feature in the training dataset and removes specific columns ('id', 'keyword', 'location') from both the training and test datasets, followed by dropping duplicate rows in the training dataset.", + "cell_id": 23, + "code": "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\ndo_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\ntokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)", + "class": "Data Extraction", + "desc": "This code snippet retrieves the vocabulary file and case sensitivity setting from the loaded BERT layer, and initializes a tokenizer using the 
`FullTokenizer` class from the `tokenization` module.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.8657076 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.56229645 }, "cluster": 1 }, { - "cell_id": 6, - "code": "wordnet_lemmatizer = WordNetLemmatizer()\n\ndef quick_clean(text):\n \"\"\"\n adapted from: https://www.kaggle.com/sophiejermy/sj-eda1\n \"\"\"\n# text = text + ' '\n #remove links\n text = re.sub(r'(?:(?:https?|ftp):\\/\\/)?[\\w/\\-?=%.]+\\.[\\w/\\-&?=%.]+', '', text)\n #lower case\n text = text.lower() \n #remove special characters\n text = re.sub(r'[\\W]+', ' ', text)\n #remove double spaces\n text = re.sub(r'\\s+', ' ', text)\n #tokenize\n text = word_tokenize(text)\n #remove stop words\n text = [word for word in text if not word in stopwords.words('english')] \n #lemmatize\n text= [wordnet_lemmatizer.lemmatize(word, pos='v') for word in text]\n #rejoin text to string\n text = ' '.join(text)\n return text\n\ndef quick_clean_vectorized(col):\n return pd.DataFrame(data=col.apply(lambda x: quick_clean(x)).tolist())\n\nquiklean_transformer = FunctionTransformer(quick_clean_vectorized) # to use in pipeline\n ", + "cell_id": 11, + "code": "keyword_dist = train_df.groupby(\"keyword\")['target'].value_counts().unstack(fill_value=0)\nkeyword_dist = keyword_dist.add_prefix(keyword_dist.columns.name).rename_axis(columns=None).reset_index()", "class": "Data Transform", - "desc": "This code snippet defines a function to clean and preprocess text data by removing links, converting to lowercase, removing special characters and stopwords, tokenizing, lemmatizing, and then creates a transformer for applying this function in a pipeline.", + "desc": "This code snippet groups the training dataframe by the 'keyword' column, counts the values for each target class, fills missing values with zero, adjusts the column names, and resets the index using `pandas` methods such as `groupby`, `value_counts`, `unstack`, `add_prefix`, `rename_axis`, and `reset_index`.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.75697106 + "subclass": "groupby", + "subclass_id": 60, + "predicted_subclass_probability": 0.8236565 }, - "cluster": 1 + "cluster": 8 }, { - "cell_id": 9, - "code": "x_train, x_test, y_train, y_test = train_test_split(train.loc[:,train.columns != 'target'], train.target, test_size=0.2)\nprint(x_train.shape, y_train.shape, x_test.shape, y_test.shape)", + "cell_id": 12, + "code": "keyword_dist.sort_values('target1',ascending = False).head(10)", "class": "Data Transform", - "desc": "This code snippet splits the training dataset into training and validation sets, then prints their dimensions to provide an overview of the resulting subsets.", + "desc": "This code snippet sorts the keyword distribution dataframe by the 'target1' column in descending order and displays the top 10 rows using the `sort_values` and `head` methods in `pandas`.", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.99772364 + "subclass": "sort_values", + "subclass_id": 9, + "predicted_subclass_probability": 0.8409343 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 10, - "code": "tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words='english', max_features = 300)\n\npreprocess = 
Pipeline(steps=[\n ('clean', ColumnTransformer([\n ('cl', quiklean_transformer, 'text')\n ],\n remainder='drop')),\n ('TFIDF', ColumnTransformer([\n ('tfidf', tfidf_vectorizer, 0)\n ], \n remainder='passthrough')),\n ('dim_reduce', TruncatedSVD(n_components=250, random_state=42)),\n ('scale', MinMaxScaler())\n \n ])", + "cell_id": 13, + "code": "keyword_dist.sort_values('target0',ascending = False).head(10)", "class": "Data Transform", - "desc": "This code snippet defines a preprocessing pipeline that includes cleaning the text data, applying TF-IDF vectorization, dimensionality reduction using Truncated SVD, and scaling features using MinMaxScaler.", + "desc": "This code snippet sorts the keyword distribution dataframe by the 'target0' column in descending order and displays the top 10 rows using the `sort_values` and `head` methods in `pandas`.", "testing": { "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9810301 + "subclass": "sort_values", + "subclass_id": 9, + "predicted_subclass_probability": 0.7933884 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 3, - "code": "print(f'Train dims {train.shape}', f'Test dims {test.shape}', sep = '\\n')", + "cell_id": 14, + "code": "#word count\ntrain_df['word_count'] = train_df['text'].apply(lambda x : len(str(x).split()))\ntest_df['word_count'] = test_df['text'].apply(lambda x : len(str(x).split()))\n#Unique word count\ntrain_df['unique_word_count'] = train_df['text'].apply(lambda x : len(set(str(x).split())))\ntest_df['unique_word_count'] = test_df['text'].apply(lambda x : len(set(str(x).split())))\n#Count of letters\ntrain_df['count_letters'] = train_df['text'].apply(lambda x : len(str(x)))\ntest_df['count_letters'] = test_df['text'].apply(lambda x : len(str(x)))\n#Count of punctuations\ntrain_df['count_punctuations'] = train_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))\ntest_df['count_punctuations'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))\n#count of stopwords\ntrain_df['stop_word_count'] = train_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))\ntest_df['stop_word_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))\n#Count of hashtag\ntrain_df['hashtag_count'] = train_df['text'].apply(lambda x : len([c for c in str(x) if c == '#']))\ntest_df['hashtag_count'] = test_df['text'].apply(lambda x : len([c for c in str(x) if c == '#']))\n#Count of mentions\ntrain_df['mention_count'] = train_df['text'].apply(lambda x : len([c for c in str(x) if c=='@']))\ntest_df['mention_count'] = test_df['text'].apply(lambda x : len([c for c in str(x) if c=='@']))", + "class": "Data Transform", + "desc": "This code snippet adds several new columns to both the training and testing dataframes, calculating word count, unique word count, letter count, punctuation count, stop word count, hashtag count, and mention count using `pandas` and `lambda` functions.", + "testing": { + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9991715 + }, + "cluster": 8 + }, { + "cell_id": 19, + "code": "# Refrenced from Gunes Evitan and Vitalii Mokin Notebook\ndef clean(tweet): \n \n # Special characters\n tweet = re.sub(r\"\\x89\u00db_\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00d2\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00d3\", \"\", tweet)\n tweet = 
re.sub(r\"\\x89\u00db\u00cfWhen\", \"When\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00cf\", \"\", tweet)\n tweet = re.sub(r\"China\\x89\u00db\u00aas\", \"China's\", tweet)\n tweet = re.sub(r\"let\\x89\u00db\u00aas\", \"let's\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00f7\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00aa\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\\x9d\", \"\", tweet)\n tweet = re.sub(r\"\u00e5_\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00a2\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00a2\u00e5\u00ca\", \"\", tweet)\n tweet = re.sub(r\"from\u00e5\u00cawounds\", \"from wounds\", tweet)\n tweet = re.sub(r\"\u00e5\u00ca\", \"\", tweet)\n tweet = re.sub(r\"\u00e5\u00c8\", \"\", tweet)\n tweet = re.sub(r\"Jap\u00cc_n\", \"Japan\", tweet) \n tweet = re.sub(r\"\u00cc\u00a9\", \"e\", tweet)\n tweet = re.sub(r\"\u00e5\u00a8\", \"\", tweet)\n tweet = re.sub(r\"Suru\u00cc\u00a4\", \"Suruc\", tweet)\n tweet = re.sub(r\"\u00e5\u00c7\", \"\", tweet)\n tweet = re.sub(r\"\u00e5\u00a33million\", \"3 million\", tweet)\n tweet = re.sub(r\"\u00e5\u00c0\", \"\", tweet)\n \n # Contractions\n tweet = re.sub(r\"he's\", \"he is\", tweet)\n tweet = re.sub(r\"there's\", \"there is\", tweet)\n tweet = re.sub(r\"We're\", \"We are\", tweet)\n tweet = re.sub(r\"That's\", \"That is\", tweet)\n tweet = re.sub(r\"won't\", \"will not\", tweet)\n tweet = re.sub(r\"they're\", \"they are\", tweet)\n tweet = re.sub(r\"Can't\", \"Cannot\", tweet)\n tweet = re.sub(r\"wasn't\", \"was not\", tweet)\n tweet = re.sub(r\"don\\x89\u00db\u00aat\", \"do not\", tweet)\n tweet = re.sub(r\"aren't\", \"are not\", tweet)\n tweet = re.sub(r\"isn't\", \"is not\", tweet)\n tweet = re.sub(r\"What's\", \"What is\", tweet)\n tweet = re.sub(r\"haven't\", \"have not\", tweet)\n tweet = re.sub(r\"hasn't\", \"has not\", tweet)\n tweet = re.sub(r\"There's\", \"There is\", tweet)\n tweet = re.sub(r\"He's\", \"He is\", tweet)\n tweet = re.sub(r\"It's\", \"It is\", tweet)\n tweet = re.sub(r\"You're\", \"You are\", tweet)\n tweet = re.sub(r\"I'M\", \"I am\", tweet)\n tweet = re.sub(r\"shouldn't\", \"should not\", tweet)\n tweet = re.sub(r\"wouldn't\", \"would not\", tweet)\n tweet = re.sub(r\"i'm\", \"I am\", tweet)\n tweet = re.sub(r\"I\\x89\u00db\u00aam\", \"I am\", tweet)\n tweet = re.sub(r\"I'm\", \"I am\", tweet)\n tweet = re.sub(r\"Isn't\", \"is not\", tweet)\n tweet = re.sub(r\"Here's\", \"Here is\", tweet)\n tweet = re.sub(r\"you've\", \"you have\", tweet)\n tweet = re.sub(r\"you\\x89\u00db\u00aave\", \"you have\", tweet)\n tweet = re.sub(r\"we're\", \"we are\", tweet)\n tweet = re.sub(r\"what's\", \"what is\", tweet)\n tweet = re.sub(r\"couldn't\", \"could not\", tweet)\n tweet = re.sub(r\"we've\", \"we have\", tweet)\n tweet = re.sub(r\"it\\x89\u00db\u00aas\", \"it is\", tweet)\n tweet = re.sub(r\"doesn\\x89\u00db\u00aat\", \"does not\", tweet)\n tweet = re.sub(r\"It\\x89\u00db\u00aas\", \"It is\", tweet)\n tweet = re.sub(r\"Here\\x89\u00db\u00aas\", \"Here is\", tweet)\n tweet = re.sub(r\"who's\", \"who is\", tweet)\n tweet = re.sub(r\"I\\x89\u00db\u00aave\", \"I have\", tweet)\n tweet = re.sub(r\"y'all\", \"you all\", tweet)\n tweet = re.sub(r\"can\\x89\u00db\u00aat\", \"cannot\", tweet)\n tweet = re.sub(r\"would've\", \"would have\", tweet)\n tweet = re.sub(r\"it'll\", \"it will\", tweet)\n tweet = re.sub(r\"we'll\", \"we will\", tweet)\n tweet = re.sub(r\"wouldn\\x89\u00db\u00aat\", \"would not\", tweet)\n tweet = re.sub(r\"We've\", \"We have\", tweet)\n tweet = re.sub(r\"he'll\", \"he will\", tweet)\n 
tweet = re.sub(r\"Y'all\", \"You all\", tweet)\n tweet = re.sub(r\"Weren't\", \"Were not\", tweet)\n tweet = re.sub(r\"Didn't\", \"Did not\", tweet)\n tweet = re.sub(r\"they'll\", \"they will\", tweet)\n tweet = re.sub(r\"they'd\", \"they would\", tweet)\n tweet = re.sub(r\"DON'T\", \"DO NOT\", tweet)\n tweet = re.sub(r\"That\\x89\u00db\u00aas\", \"That is\", tweet)\n tweet = re.sub(r\"they've\", \"they have\", tweet)\n tweet = re.sub(r\"i'd\", \"I would\", tweet)\n tweet = re.sub(r\"should've\", \"should have\", tweet)\n tweet = re.sub(r\"You\\x89\u00db\u00aare\", \"You are\", tweet)\n tweet = re.sub(r\"where's\", \"where is\", tweet)\n tweet = re.sub(r\"Don\\x89\u00db\u00aat\", \"Do not\", tweet)\n tweet = re.sub(r\"we'd\", \"we would\", tweet)\n tweet = re.sub(r\"i'll\", \"I will\", tweet)\n tweet = re.sub(r\"weren't\", \"were not\", tweet)\n tweet = re.sub(r\"They're\", \"They are\", tweet)\n tweet = re.sub(r\"Can\\x89\u00db\u00aat\", \"Cannot\", tweet)\n tweet = re.sub(r\"you\\x89\u00db\u00aall\", \"you will\", tweet)\n tweet = re.sub(r\"I\\x89\u00db\u00aad\", \"I would\", tweet)\n tweet = re.sub(r\"let's\", \"let us\", tweet)\n tweet = re.sub(r\"it's\", \"it is\", tweet)\n tweet = re.sub(r\"can't\", \"cannot\", tweet)\n tweet = re.sub(r\"don't\", \"do not\", tweet)\n tweet = re.sub(r\"you're\", \"you are\", tweet)\n tweet = re.sub(r\"i've\", \"I have\", tweet)\n tweet = re.sub(r\"that's\", \"that is\", tweet)\n tweet = re.sub(r\"i'll\", \"I will\", tweet)\n tweet = re.sub(r\"doesn't\", \"does not\", tweet)\n tweet = re.sub(r\"i'd\", \"I would\", tweet)\n tweet = re.sub(r\"didn't\", \"did not\", tweet)\n tweet = re.sub(r\"ain't\", \"am not\", tweet)\n tweet = re.sub(r\"you'll\", \"you will\", tweet)\n tweet = re.sub(r\"I've\", \"I have\", tweet)\n tweet = re.sub(r\"Don't\", \"do not\", tweet)\n tweet = re.sub(r\"I'll\", \"I will\", tweet)\n tweet = re.sub(r\"I'd\", \"I would\", tweet)\n tweet = re.sub(r\"Let's\", \"Let us\", tweet)\n tweet = re.sub(r\"you'd\", \"You would\", tweet)\n tweet = re.sub(r\"It's\", \"It is\", tweet)\n tweet = re.sub(r\"Ain't\", \"am not\", tweet)\n tweet = re.sub(r\"Haven't\", \"Have not\", tweet)\n tweet = re.sub(r\"Could've\", \"Could have\", tweet)\n tweet = re.sub(r\"youve\", \"you have\", tweet) \n tweet = re.sub(r\"don\u00e5\u00abt\", \"do not\", tweet) \n \n # Character entity references\n tweet = re.sub(r\">\", \">\", tweet)\n tweet = re.sub(r\"<\", \"<\", tweet)\n tweet = re.sub(r\"&\", \"&\", tweet)\n \n # Typos, slang and informal abbreviations\n tweet = re.sub(r\"w/e\", \"whatever\", tweet)\n tweet = re.sub(r\"w/\", \"with\", tweet)\n tweet = re.sub(r\"USAgov\", \"USA government\", tweet)\n tweet = re.sub(r\"recentlu\", \"recently\", tweet)\n tweet = re.sub(r\"Ph0tos\", \"Photos\", tweet)\n tweet = re.sub(r\"amirite\", \"am I right\", tweet)\n tweet = re.sub(r\"exp0sed\", \"exposed\", tweet)\n tweet = re.sub(r\"<3\", \"love\", tweet)\n tweet = re.sub(r\"amageddon\", \"armageddon\", tweet)\n tweet = re.sub(r\"Trfc\", \"Traffic\", tweet)\n tweet = re.sub(r\"8/5/2015\", \"2015-08-05\", tweet)\n tweet = re.sub(r\"WindStorm\", \"Wind Storm\", tweet)\n tweet = re.sub(r\"8/6/2015\", \"2015-08-06\", tweet)\n tweet = re.sub(r\"10:38PM\", \"10:38 PM\", tweet)\n tweet = re.sub(r\"10:30pm\", \"10:30 PM\", tweet)\n tweet = re.sub(r\"16yr\", \"16 year\", tweet)\n tweet = re.sub(r\"lmao\", \"laughing my ass off\", tweet) \n tweet = re.sub(r\"TRAUMATISED\", \"traumatized\", tweet)\n \n # Hashtags and usernames\n tweet = re.sub(r\"IranDeal\", \"Iran Deal\", 
tweet)\n tweet = re.sub(r\"ArianaGrande\", \"Ariana Grande\", tweet)\n tweet = re.sub(r\"camilacabello97\", \"camila cabello\", tweet) \n tweet = re.sub(r\"RondaRousey\", \"Ronda Rousey\", tweet) \n tweet = re.sub(r\"MTVHottest\", \"MTV Hottest\", tweet)\n tweet = re.sub(r\"TrapMusic\", \"Trap Music\", tweet)\n tweet = re.sub(r\"ProphetMuhammad\", \"Prophet Muhammad\", tweet)\n tweet = re.sub(r\"PantherAttack\", \"Panther Attack\", tweet)\n tweet = re.sub(r\"StrategicPatience\", \"Strategic Patience\", tweet)\n tweet = re.sub(r\"socialnews\", \"social news\", tweet)\n tweet = re.sub(r\"NASAHurricane\", \"NASA Hurricane\", tweet)\n tweet = re.sub(r\"onlinecommunities\", \"online communities\", tweet)\n tweet = re.sub(r\"humanconsumption\", \"human consumption\", tweet)\n tweet = re.sub(r\"Typhoon-Devastated\", \"Typhoon Devastated\", tweet)\n tweet = re.sub(r\"Meat-Loving\", \"Meat Loving\", tweet)\n tweet = re.sub(r\"facialabuse\", \"facial abuse\", tweet)\n tweet = re.sub(r\"LakeCounty\", \"Lake County\", tweet)\n tweet = re.sub(r\"BeingAuthor\", \"Being Author\", tweet)\n tweet = re.sub(r\"withheavenly\", \"with heavenly\", tweet)\n tweet = re.sub(r\"thankU\", \"thank you\", tweet)\n tweet = re.sub(r\"iTunesMusic\", \"iTunes Music\", tweet)\n tweet = re.sub(r\"OffensiveContent\", \"Offensive Content\", tweet)\n tweet = re.sub(r\"WorstSummerJob\", \"Worst Summer Job\", tweet)\n tweet = re.sub(r\"HarryBeCareful\", \"Harry Be Careful\", tweet)\n tweet = re.sub(r\"NASASolarSystem\", \"NASA Solar System\", tweet)\n tweet = re.sub(r\"animalrescue\", \"animal rescue\", tweet)\n tweet = re.sub(r\"KurtSchlichter\", \"Kurt Schlichter\", tweet)\n tweet = re.sub(r\"aRmageddon\", \"armageddon\", tweet)\n tweet = re.sub(r\"Throwingknifes\", \"Throwing knives\", tweet)\n tweet = re.sub(r\"GodsLove\", \"God's Love\", tweet)\n tweet = re.sub(r\"bookboost\", \"book boost\", tweet)\n tweet = re.sub(r\"ibooklove\", \"I book love\", tweet)\n tweet = re.sub(r\"NestleIndia\", \"Nestle India\", tweet)\n tweet = re.sub(r\"realDonaldTrump\", \"Donald Trump\", tweet)\n tweet = re.sub(r\"DavidVonderhaar\", \"David Vonderhaar\", tweet)\n tweet = re.sub(r\"CecilTheLion\", \"Cecil The Lion\", tweet)\n tweet = re.sub(r\"weathernetwork\", \"weather network\", tweet)\n tweet = re.sub(r\"withBioterrorism&use\", \"with Bioterrorism & use\", tweet)\n tweet = re.sub(r\"Hostage&2\", \"Hostage & 2\", tweet)\n tweet = re.sub(r\"GOPDebate\", \"GOP Debate\", tweet)\n tweet = re.sub(r\"RickPerry\", \"Rick Perry\", tweet)\n tweet = re.sub(r\"frontpage\", \"front page\", tweet)\n tweet = re.sub(r\"NewsInTweets\", \"News In Tweets\", tweet)\n tweet = re.sub(r\"ViralSpell\", \"Viral Spell\", tweet)\n tweet = re.sub(r\"til_now\", \"until now\", tweet)\n tweet = re.sub(r\"volcanoinRussia\", \"volcano in Russia\", tweet)\n tweet = re.sub(r\"ZippedNews\", \"Zipped News\", tweet)\n tweet = re.sub(r\"MicheleBachman\", \"Michele Bachman\", tweet)\n tweet = re.sub(r\"53inch\", \"53 inch\", tweet)\n tweet = re.sub(r\"KerrickTrial\", \"Kerrick Trial\", tweet)\n tweet = re.sub(r\"abstorm\", \"Alberta Storm\", tweet)\n tweet = re.sub(r\"Beyhive\", \"Beyonce hive\", tweet)\n tweet = re.sub(r\"IDFire\", \"Idaho Fire\", tweet)\n tweet = re.sub(r\"DETECTADO\", \"Detected\", tweet)\n tweet = re.sub(r\"RockyFire\", \"Rocky Fire\", tweet)\n tweet = re.sub(r\"Listen/Buy\", \"Listen / Buy\", tweet)\n tweet = re.sub(r\"NickCannon\", \"Nick Cannon\", tweet)\n tweet = re.sub(r\"FaroeIslands\", \"Faroe Islands\", tweet)\n tweet = re.sub(r\"yycstorm\", 
\"Calgary Storm\", tweet)\n tweet = re.sub(r\"IDPs:\", \"Internally Displaced People :\", tweet)\n tweet = re.sub(r\"ArtistsUnited\", \"Artists United\", tweet)\n tweet = re.sub(r\"ClaytonBryant\", \"Clayton Bryant\", tweet)\n tweet = re.sub(r\"jimmyfallon\", \"jimmy fallon\", tweet)\n tweet = re.sub(r\"justinbieber\", \"justin bieber\", tweet) \n tweet = re.sub(r\"UTC2015\", \"UTC 2015\", tweet)\n tweet = re.sub(r\"Time2015\", \"Time 2015\", tweet)\n tweet = re.sub(r\"djicemoon\", \"dj icemoon\", tweet)\n tweet = re.sub(r\"LivingSafely\", \"Living Safely\", tweet)\n tweet = re.sub(r\"FIFA16\", \"Fifa 2016\", tweet)\n tweet = re.sub(r\"thisiswhywecanthavenicethings\", \"this is why we cannot have nice things\", tweet)\n tweet = re.sub(r\"bbcnews\", \"bbc news\", tweet)\n tweet = re.sub(r\"UndergroundRailraod\", \"Underground Railraod\", tweet)\n tweet = re.sub(r\"c4news\", \"c4 news\", tweet)\n tweet = re.sub(r\"OBLITERATION\", \"obliteration\", tweet)\n tweet = re.sub(r\"MUDSLIDE\", \"mudslide\", tweet)\n tweet = re.sub(r\"NoSurrender\", \"No Surrender\", tweet)\n tweet = re.sub(r\"NotExplained\", \"Not Explained\", tweet)\n tweet = re.sub(r\"greatbritishbakeoff\", \"great british bake off\", tweet)\n tweet = re.sub(r\"LondonFire\", \"London Fire\", tweet)\n tweet = re.sub(r\"KOTAWeather\", \"KOTA Weather\", tweet)\n tweet = re.sub(r\"LuchaUnderground\", \"Lucha Underground\", tweet)\n tweet = re.sub(r\"KOIN6News\", \"KOIN 6 News\", tweet)\n tweet = re.sub(r\"LiveOnK2\", \"Live On K2\", tweet)\n tweet = re.sub(r\"9NewsGoldCoast\", \"9 News Gold Coast\", tweet)\n tweet = re.sub(r\"nikeplus\", \"nike plus\", tweet)\n tweet = re.sub(r\"david_cameron\", \"David Cameron\", tweet)\n tweet = re.sub(r\"peterjukes\", \"Peter Jukes\", tweet)\n tweet = re.sub(r\"JamesMelville\", \"James Melville\", tweet)\n tweet = re.sub(r\"megynkelly\", \"Megyn Kelly\", tweet)\n tweet = re.sub(r\"cnewslive\", \"C News Live\", tweet)\n tweet = re.sub(r\"JamaicaObserver\", \"Jamaica Observer\", tweet)\n tweet = re.sub(r\"TweetLikeItsSeptember11th2001\", \"Tweet like it is september 11th 2001\", tweet)\n tweet = re.sub(r\"cbplawyers\", \"cbp lawyers\", tweet)\n tweet = re.sub(r\"fewmoretweets\", \"few more tweets\", tweet)\n tweet = re.sub(r\"BlackLivesMatter\", \"Black Lives Matter\", tweet)\n tweet = re.sub(r\"cjoyner\", \"Chris Joyner\", tweet)\n tweet = re.sub(r\"ENGvAUS\", \"England vs Australia\", tweet)\n tweet = re.sub(r\"ScottWalker\", \"Scott Walker\", tweet)\n tweet = re.sub(r\"MikeParrActor\", \"Michael Parr\", tweet)\n tweet = re.sub(r\"4PlayThursdays\", \"Foreplay Thursdays\", tweet)\n tweet = re.sub(r\"TGF2015\", \"Tontitown Grape Festival\", tweet)\n tweet = re.sub(r\"realmandyrain\", \"Mandy Rain\", tweet)\n tweet = re.sub(r\"GraysonDolan\", \"Grayson Dolan\", tweet)\n tweet = re.sub(r\"ApolloBrown\", \"Apollo Brown\", tweet)\n tweet = re.sub(r\"saddlebrooke\", \"Saddlebrooke\", tweet)\n tweet = re.sub(r\"TontitownGrape\", \"Tontitown Grape\", tweet)\n tweet = re.sub(r\"AbbsWinston\", \"Abbs Winston\", tweet)\n tweet = re.sub(r\"ShaunKing\", \"Shaun King\", tweet)\n tweet = re.sub(r\"MeekMill\", \"Meek Mill\", tweet)\n tweet = re.sub(r\"TornadoGiveaway\", \"Tornado Giveaway\", tweet)\n tweet = re.sub(r\"GRupdates\", \"GR updates\", tweet)\n tweet = re.sub(r\"SouthDowns\", \"South Downs\", tweet)\n tweet = re.sub(r\"braininjury\", \"brain injury\", tweet)\n tweet = re.sub(r\"auspol\", \"Australian politics\", tweet)\n tweet = re.sub(r\"PlannedParenthood\", \"Planned Parenthood\", tweet)\n tweet = 
re.sub(r\"calgaryweather\", \"Calgary Weather\", tweet)\n tweet = re.sub(r\"weallheartonedirection\", \"we all heart one direction\", tweet)\n tweet = re.sub(r\"edsheeran\", \"Ed Sheeran\", tweet)\n tweet = re.sub(r\"TrueHeroes\", \"True Heroes\", tweet)\n tweet = re.sub(r\"S3XLEAK\", \"sex leak\", tweet)\n tweet = re.sub(r\"ComplexMag\", \"Complex Magazine\", tweet)\n tweet = re.sub(r\"TheAdvocateMag\", \"The Advocate Magazine\", tweet)\n tweet = re.sub(r\"CityofCalgary\", \"City of Calgary\", tweet)\n tweet = re.sub(r\"EbolaOutbreak\", \"Ebola Outbreak\", tweet)\n tweet = re.sub(r\"SummerFate\", \"Summer Fate\", tweet)\n tweet = re.sub(r\"RAmag\", \"Royal Academy Magazine\", tweet)\n tweet = re.sub(r\"offers2go\", \"offers to go\", tweet)\n tweet = re.sub(r\"foodscare\", \"food scare\", tweet)\n tweet = re.sub(r\"MNPDNashville\", \"Metropolitan Nashville Police Department\", tweet)\n tweet = re.sub(r\"TfLBusAlerts\", \"TfL Bus Alerts\", tweet)\n tweet = re.sub(r\"GamerGate\", \"Gamer Gate\", tweet)\n tweet = re.sub(r\"IHHen\", \"Humanitarian Relief\", tweet)\n tweet = re.sub(r\"spinningbot\", \"spinning bot\", tweet)\n tweet = re.sub(r\"ModiMinistry\", \"Modi Ministry\", tweet)\n tweet = re.sub(r\"TAXIWAYS\", \"taxi ways\", tweet)\n tweet = re.sub(r\"Calum5SOS\", \"Calum Hood\", tweet)\n tweet = re.sub(r\"po_st\", \"po.st\", tweet)\n tweet = re.sub(r\"scoopit\", \"scoop.it\", tweet)\n tweet = re.sub(r\"UltimaLucha\", \"Ultima Lucha\", tweet)\n tweet = re.sub(r\"JonathanFerrell\", \"Jonathan Ferrell\", tweet)\n tweet = re.sub(r\"aria_ahrary\", \"Aria Ahrary\", tweet)\n tweet = re.sub(r\"rapidcity\", \"Rapid City\", tweet)\n tweet = re.sub(r\"OutBid\", \"outbid\", tweet)\n tweet = re.sub(r\"lavenderpoetrycafe\", \"lavender poetry cafe\", tweet)\n tweet = re.sub(r\"EudryLantiqua\", \"Eudry Lantiqua\", tweet)\n tweet = re.sub(r\"15PM\", \"15 PM\", tweet)\n tweet = re.sub(r\"OriginalFunko\", \"Funko\", tweet)\n tweet = re.sub(r\"rightwaystan\", \"Richard Tan\", tweet)\n tweet = re.sub(r\"CindyNoonan\", \"Cindy Noonan\", tweet)\n tweet = re.sub(r\"RT_America\", \"RT America\", tweet)\n tweet = re.sub(r\"narendramodi\", \"Narendra Modi\", tweet)\n tweet = re.sub(r\"BakeOffFriends\", \"Bake Off Friends\", tweet)\n tweet = re.sub(r\"TeamHendrick\", \"Hendrick Motorsports\", tweet)\n tweet = re.sub(r\"alexbelloli\", \"Alex Belloli\", tweet)\n tweet = re.sub(r\"itsjustinstuart\", \"Justin Stuart\", tweet)\n tweet = re.sub(r\"gunsense\", \"gun sense\", tweet)\n tweet = re.sub(r\"DebateQuestionsWeWantToHear\", \"debate questions we want to hear\", tweet)\n tweet = re.sub(r\"RoyalCarribean\", \"Royal Carribean\", tweet)\n tweet = re.sub(r\"samanthaturne19\", \"Samantha Turner\", tweet)\n tweet = re.sub(r\"JonVoyage\", \"Jon Stewart\", tweet)\n tweet = re.sub(r\"renew911health\", \"renew 911 health\", tweet)\n tweet = re.sub(r\"SuryaRay\", \"Surya Ray\", tweet)\n tweet = re.sub(r\"pattonoswalt\", \"Patton Oswalt\", tweet)\n tweet = re.sub(r\"minhazmerchant\", \"Minhaz Merchant\", tweet)\n tweet = re.sub(r\"TLVFaces\", \"Israel Diaspora Coalition\", tweet)\n tweet = re.sub(r\"pmarca\", \"Marc Andreessen\", tweet)\n tweet = re.sub(r\"pdx911\", \"Portland Police\", tweet)\n tweet = re.sub(r\"jamaicaplain\", \"Jamaica Plain\", tweet)\n tweet = re.sub(r\"Japton\", \"Arkansas\", tweet)\n tweet = re.sub(r\"RouteComplex\", \"Route Complex\", tweet)\n tweet = re.sub(r\"INSubcontinent\", \"Indian Subcontinent\", tweet)\n tweet = re.sub(r\"NJTurnpike\", \"New Jersey Turnpike\", tweet)\n tweet = 
re.sub(r\"Politifiact\", \"PolitiFact\", tweet)\n tweet = re.sub(r\"Hiroshima70\", \"Hiroshima\", tweet)\n tweet = re.sub(r\"GMMBC\", \"Greater Mt Moriah Baptist Church\", tweet)\n tweet = re.sub(r\"versethe\", \"verse the\", tweet)\n tweet = re.sub(r\"TubeStrike\", \"Tube Strike\", tweet)\n tweet = re.sub(r\"MissionHills\", \"Mission Hills\", tweet)\n tweet = re.sub(r\"ProtectDenaliWolves\", \"Protect Denali Wolves\", tweet)\n tweet = re.sub(r\"NANKANA\", \"Nankana\", tweet)\n tweet = re.sub(r\"SAHIB\", \"Sahib\", tweet)\n tweet = re.sub(r\"PAKPATTAN\", \"Pakpattan\", tweet)\n tweet = re.sub(r\"Newz_Sacramento\", \"News Sacramento\", tweet)\n tweet = re.sub(r\"gofundme\", \"go fund me\", tweet)\n tweet = re.sub(r\"pmharper\", \"Stephen Harper\", tweet)\n tweet = re.sub(r\"IvanBerroa\", \"Ivan Berroa\", tweet)\n tweet = re.sub(r\"LosDelSonido\", \"Los Del Sonido\", tweet)\n tweet = re.sub(r\"bancodeseries\", \"banco de series\", tweet)\n tweet = re.sub(r\"timkaine\", \"Tim Kaine\", tweet)\n tweet = re.sub(r\"IdentityTheft\", \"Identity Theft\", tweet)\n tweet = re.sub(r\"AllLivesMatter\", \"All Lives Matter\", tweet)\n tweet = re.sub(r\"mishacollins\", \"Misha Collins\", tweet)\n tweet = re.sub(r\"BillNeelyNBC\", \"Bill Neely\", tweet)\n tweet = re.sub(r\"BeClearOnCancer\", \"be clear on cancer\", tweet)\n tweet = re.sub(r\"Kowing\", \"Knowing\", tweet)\n tweet = re.sub(r\"ScreamQueens\", \"Scream Queens\", tweet)\n tweet = re.sub(r\"AskCharley\", \"Ask Charley\", tweet)\n tweet = re.sub(r\"BlizzHeroes\", \"Heroes of the Storm\", tweet)\n tweet = re.sub(r\"BradleyBrad47\", \"Bradley Brad\", tweet)\n tweet = re.sub(r\"HannaPH\", \"Typhoon Hanna\", tweet)\n tweet = re.sub(r\"meinlcymbals\", \"MEINL Cymbals\", tweet)\n tweet = re.sub(r\"Ptbo\", \"Peterborough\", tweet)\n tweet = re.sub(r\"cnnbrk\", \"CNN Breaking News\", tweet)\n tweet = re.sub(r\"IndianNews\", \"Indian News\", tweet)\n tweet = re.sub(r\"savebees\", \"save bees\", tweet)\n tweet = re.sub(r\"GreenHarvard\", \"Green Harvard\", tweet)\n tweet = re.sub(r\"StandwithPP\", \"Stand with planned parenthood\", tweet)\n tweet = re.sub(r\"hermancranston\", \"Herman Cranston\", tweet)\n tweet = re.sub(r\"WMUR9\", \"WMUR-TV\", tweet)\n tweet = re.sub(r\"RockBottomRadFM\", \"Rock Bottom Radio\", tweet)\n tweet = re.sub(r\"ameenshaikh3\", \"Ameen Shaikh\", tweet)\n tweet = re.sub(r\"ProSyn\", \"Project Syndicate\", tweet)\n tweet = re.sub(r\"Daesh\", \"ISIS\", tweet)\n tweet = re.sub(r\"s2g\", \"swear to god\", tweet)\n tweet = re.sub(r\"listenlive\", \"listen live\", tweet)\n tweet = re.sub(r\"CDCgov\", \"Centers for Disease Control and Prevention\", tweet)\n tweet = re.sub(r\"FoxNew\", \"Fox News\", tweet)\n tweet = re.sub(r\"CBSBigBrother\", \"Big Brother\", tweet)\n tweet = re.sub(r\"JulieDiCaro\", \"Julie DiCaro\", tweet)\n tweet = re.sub(r\"theadvocatemag\", \"The Advocate Magazine\", tweet)\n tweet = re.sub(r\"RohnertParkDPS\", \"Rohnert Park Police Department\", tweet)\n tweet = re.sub(r\"THISIZBWRIGHT\", \"Bonnie Wright\", tweet)\n tweet = re.sub(r\"Popularmmos\", \"Popular MMOs\", tweet)\n tweet = re.sub(r\"WildHorses\", \"Wild Horses\", tweet)\n tweet = re.sub(r\"FantasticFour\", \"Fantastic Four\", tweet)\n tweet = re.sub(r\"HORNDALE\", \"Horndale\", tweet)\n tweet = re.sub(r\"PINER\", \"Piner\", tweet)\n tweet = re.sub(r\"BathAndNorthEastSomerset\", \"Bath and North East Somerset\", tweet)\n tweet = re.sub(r\"thatswhatfriendsarefor\", \"that is what friends are for\", tweet)\n tweet = re.sub(r\"residualincome\", \"residual 
income\", tweet)\n tweet = re.sub(r\"YahooNewsDigest\", \"Yahoo News Digest\", tweet)\n tweet = re.sub(r\"MalaysiaAirlines\", \"Malaysia Airlines\", tweet)\n tweet = re.sub(r\"AmazonDeals\", \"Amazon Deals\", tweet)\n tweet = re.sub(r\"MissCharleyWebb\", \"Charley Webb\", tweet)\n tweet = re.sub(r\"shoalstraffic\", \"shoals traffic\", tweet)\n tweet = re.sub(r\"GeorgeFoster72\", \"George Foster\", tweet)\n tweet = re.sub(r\"pop2015\", \"pop 2015\", tweet)\n tweet = re.sub(r\"_PokemonCards_\", \"Pokemon Cards\", tweet)\n tweet = re.sub(r\"DianneG\", \"Dianne Gallagher\", tweet)\n tweet = re.sub(r\"KashmirConflict\", \"Kashmir Conflict\", tweet)\n tweet = re.sub(r\"BritishBakeOff\", \"British Bake Off\", tweet)\n tweet = re.sub(r\"FreeKashmir\", \"Free Kashmir\", tweet)\n tweet = re.sub(r\"mattmosley\", \"Matt Mosley\", tweet)\n tweet = re.sub(r\"BishopFred\", \"Bishop Fred\", tweet)\n tweet = re.sub(r\"EndConflict\", \"End Conflict\", tweet)\n tweet = re.sub(r\"EndOccupation\", \"End Occupation\", tweet)\n tweet = re.sub(r\"UNHEALED\", \"unhealed\", tweet)\n tweet = re.sub(r\"CharlesDagnall\", \"Charles Dagnall\", tweet)\n tweet = re.sub(r\"Latestnews\", \"Latest news\", tweet)\n tweet = re.sub(r\"KindleCountdown\", \"Kindle Countdown\", tweet)\n tweet = re.sub(r\"NoMoreHandouts\", \"No More Handouts\", tweet)\n tweet = re.sub(r\"datingtips\", \"dating tips\", tweet)\n tweet = re.sub(r\"charlesadler\", \"Charles Adler\", tweet)\n tweet = re.sub(r\"twia\", \"Texas Windstorm Insurance Association\", tweet)\n tweet = re.sub(r\"txlege\", \"Texas Legislature\", tweet)\n tweet = re.sub(r\"WindstormInsurer\", \"Windstorm Insurer\", tweet)\n tweet = re.sub(r\"Newss\", \"News\", tweet)\n tweet = re.sub(r\"hempoil\", \"hemp oil\", tweet)\n tweet = re.sub(r\"CommoditiesAre\", \"Commodities are\", tweet)\n tweet = re.sub(r\"tubestrike\", \"tube strike\", tweet)\n tweet = re.sub(r\"JoeNBC\", \"Joe Scarborough\", tweet)\n tweet = re.sub(r\"LiteraryCakes\", \"Literary Cakes\", tweet)\n tweet = re.sub(r\"TI5\", \"The International 5\", tweet)\n tweet = re.sub(r\"thehill\", \"the hill\", tweet)\n tweet = re.sub(r\"3others\", \"3 others\", tweet)\n tweet = re.sub(r\"stighefootball\", \"Sam Tighe\", tweet)\n tweet = re.sub(r\"whatstheimportantvideo\", \"what is the important video\", tweet)\n tweet = re.sub(r\"ClaudioMeloni\", \"Claudio Meloni\", tweet)\n tweet = re.sub(r\"DukeSkywalker\", \"Duke Skywalker\", tweet)\n tweet = re.sub(r\"carsonmwr\", \"Fort Carson\", tweet)\n tweet = re.sub(r\"offdishduty\", \"off dish duty\", tweet)\n tweet = re.sub(r\"andword\", \"and word\", tweet)\n tweet = re.sub(r\"rhodeisland\", \"Rhode Island\", tweet)\n tweet = re.sub(r\"easternoregon\", \"Eastern Oregon\", tweet)\n tweet = re.sub(r\"WAwildfire\", \"Washington Wildfire\", tweet)\n tweet = re.sub(r\"fingerrockfire\", \"Finger Rock Fire\", tweet)\n tweet = re.sub(r\"57am\", \"57 am\", tweet)\n tweet = re.sub(r\"fingerrockfire\", \"Finger Rock Fire\", tweet)\n tweet = re.sub(r\"JacobHoggard\", \"Jacob Hoggard\", tweet)\n tweet = re.sub(r\"newnewnew\", \"new new new\", tweet)\n tweet = re.sub(r\"under50\", \"under 50\", tweet)\n tweet = re.sub(r\"getitbeforeitsgone\", \"get it before it is gone\", tweet)\n tweet = re.sub(r\"freshoutofthebox\", \"fresh out of the box\", tweet)\n tweet = re.sub(r\"amwriting\", \"am writing\", tweet)\n tweet = re.sub(r\"Bokoharm\", \"Boko Haram\", tweet)\n tweet = re.sub(r\"Nowlike\", \"Now like\", tweet)\n tweet = re.sub(r\"seasonfrom\", \"season from\", tweet)\n tweet = 
re.sub(r\"epicente\", \"epicenter\", tweet)\n tweet = re.sub(r\"epicenterr\", \"epicenter\", tweet)\n tweet = re.sub(r\"sicklife\", \"sick life\", tweet)\n tweet = re.sub(r\"yycweather\", \"Calgary Weather\", tweet)\n tweet = re.sub(r\"calgarysun\", \"Calgary Sun\", tweet)\n tweet = re.sub(r\"approachng\", \"approaching\", tweet)\n tweet = re.sub(r\"evng\", \"evening\", tweet)\n tweet = re.sub(r\"Sumthng\", \"something\", tweet)\n tweet = re.sub(r\"EllenPompeo\", \"Ellen Pompeo\", tweet)\n tweet = re.sub(r\"shondarhimes\", \"Shonda Rhimes\", tweet)\n tweet = re.sub(r\"ABCNetwork\", \"ABC Network\", tweet)\n tweet = re.sub(r\"SushmaSwaraj\", \"Sushma Swaraj\", tweet)\n tweet = re.sub(r\"pray4japan\", \"Pray for Japan\", tweet)\n tweet = re.sub(r\"hope4japan\", \"Hope for Japan\", tweet)\n tweet = re.sub(r\"Illusionimagess\", \"Illusion images\", tweet)\n tweet = re.sub(r\"SummerUnderTheStars\", \"Summer Under The Stars\", tweet)\n tweet = re.sub(r\"ShallWeDance\", \"Shall We Dance\", tweet)\n tweet = re.sub(r\"TCMParty\", \"TCM Party\", tweet)\n tweet = re.sub(r\"marijuananews\", \"marijuana news\", tweet)\n tweet = re.sub(r\"onbeingwithKristaTippett\", \"on being with Krista Tippett\", tweet)\n tweet = re.sub(r\"Beingtweets\", \"Being tweets\", tweet)\n tweet = re.sub(r\"newauthors\", \"new authors\", tweet)\n tweet = re.sub(r\"remedyyyy\", \"remedy\", tweet)\n tweet = re.sub(r\"44PM\", \"44 PM\", tweet)\n tweet = re.sub(r\"HeadlinesApp\", \"Headlines App\", tweet)\n tweet = re.sub(r\"40PM\", \"40 PM\", tweet)\n tweet = re.sub(r\"myswc\", \"Severe Weather Center\", tweet)\n tweet = re.sub(r\"ithats\", \"that is\", tweet)\n tweet = re.sub(r\"icouldsitinthismomentforever\", \"I could sit in this moment forever\", tweet)\n tweet = re.sub(r\"FatLoss\", \"Fat Loss\", tweet)\n tweet = re.sub(r\"02PM\", \"02 PM\", tweet)\n tweet = re.sub(r\"MetroFmTalk\", \"Metro Fm Talk\", tweet)\n tweet = re.sub(r\"Bstrd\", \"bastard\", tweet)\n tweet = re.sub(r\"bldy\", \"bloody\", tweet)\n tweet = re.sub(r\"MetrofmTalk\", \"Metro Fm Talk\", tweet)\n tweet = re.sub(r\"terrorismturn\", \"terrorism turn\", tweet)\n tweet = re.sub(r\"BBCNewsAsia\", \"BBC News Asia\", tweet)\n tweet = re.sub(r\"BehindTheScenes\", \"Behind The Scenes\", tweet)\n tweet = re.sub(r\"GeorgeTakei\", \"George Takei\", tweet)\n tweet = re.sub(r\"WomensWeeklyMag\", \"Womens Weekly Magazine\", tweet)\n tweet = re.sub(r\"SurvivorsGuidetoEarth\", \"Survivors Guide to Earth\", tweet)\n tweet = re.sub(r\"incubusband\", \"incubus band\", tweet)\n tweet = re.sub(r\"Babypicturethis\", \"Baby picture this\", tweet)\n tweet = re.sub(r\"BombEffects\", \"Bomb Effects\", tweet)\n tweet = re.sub(r\"win10\", \"Windows 10\", tweet)\n tweet = re.sub(r\"idkidk\", \"I do not know I do not know\", tweet)\n tweet = re.sub(r\"TheWalkingDead\", \"The Walking Dead\", tweet)\n tweet = re.sub(r\"amyschumer\", \"Amy Schumer\", tweet)\n tweet = re.sub(r\"crewlist\", \"crew list\", tweet)\n tweet = re.sub(r\"Erdogans\", \"Erdogan\", tweet)\n tweet = re.sub(r\"BBCLive\", \"BBC Live\", tweet)\n tweet = re.sub(r\"TonyAbbottMHR\", \"Tony Abbott\", tweet)\n tweet = re.sub(r\"paulmyerscough\", \"Paul Myerscough\", tweet)\n tweet = re.sub(r\"georgegallagher\", \"George Gallagher\", tweet)\n tweet = re.sub(r\"JimmieJohnson\", \"Jimmie Johnson\", tweet)\n tweet = re.sub(r\"pctool\", \"pc tool\", tweet)\n tweet = re.sub(r\"DoingHashtagsRight\", \"Doing Hashtags Right\", tweet)\n tweet = re.sub(r\"ThrowbackThursday\", \"Throwback Thursday\", tweet)\n tweet = 
re.sub(r\"SnowBackSunday\", \"Snowback Sunday\", tweet)\n tweet = re.sub(r\"LakeEffect\", \"Lake Effect\", tweet)\n tweet = re.sub(r\"RTphotographyUK\", \"Richard Thomas Photography UK\", tweet)\n tweet = re.sub(r\"BigBang_CBS\", \"Big Bang CBS\", tweet)\n tweet = re.sub(r\"writerslife\", \"writers life\", tweet)\n tweet = re.sub(r\"NaturalBirth\", \"Natural Birth\", tweet)\n tweet = re.sub(r\"UnusualWords\", \"Unusual Words\", tweet)\n tweet = re.sub(r\"wizkhalifa\", \"Wiz Khalifa\", tweet)\n tweet = re.sub(r\"acreativedc\", \"a creative DC\", tweet)\n tweet = re.sub(r\"vscodc\", \"vsco DC\", tweet)\n tweet = re.sub(r\"VSCOcam\", \"vsco camera\", tweet)\n tweet = re.sub(r\"TheBEACHDC\", \"The beach DC\", tweet)\n tweet = re.sub(r\"buildingmuseum\", \"building museum\", tweet)\n tweet = re.sub(r\"WorldOil\", \"World Oil\", tweet)\n tweet = re.sub(r\"redwedding\", \"red wedding\", tweet)\n tweet = re.sub(r\"AmazingRaceCanada\", \"Amazing Race Canada\", tweet)\n tweet = re.sub(r\"WakeUpAmerica\", \"Wake Up America\", tweet)\n tweet = re.sub(r\"\\\\Allahuakbar\\\\\", \"Allahu Akbar\", tweet)\n tweet = re.sub(r\"bleased\", \"blessed\", tweet)\n tweet = re.sub(r\"nigeriantribune\", \"Nigerian Tribune\", tweet)\n tweet = re.sub(r\"HIDEO_KOJIMA_EN\", \"Hideo Kojima\", tweet)\n tweet = re.sub(r\"FusionFestival\", \"Fusion Festival\", tweet)\n tweet = re.sub(r\"50Mixed\", \"50 Mixed\", tweet)\n tweet = re.sub(r\"NoAgenda\", \"No Agenda\", tweet)\n tweet = re.sub(r\"WhiteGenocide\", \"White Genocide\", tweet)\n tweet = re.sub(r\"dirtylying\", \"dirty lying\", tweet)\n tweet = re.sub(r\"SyrianRefugees\", \"Syrian Refugees\", tweet)\n tweet = re.sub(r\"changetheworld\", \"change the world\", tweet)\n tweet = re.sub(r\"Ebolacase\", \"Ebola case\", tweet)\n tweet = re.sub(r\"mcgtech\", \"mcg technologies\", tweet)\n tweet = re.sub(r\"withweapons\", \"with weapons\", tweet)\n tweet = re.sub(r\"advancedwarfare\", \"advanced warfare\", tweet)\n tweet = re.sub(r\"letsFootball\", \"let us Football\", tweet)\n tweet = re.sub(r\"LateNiteMix\", \"late night mix\", tweet)\n tweet = re.sub(r\"PhilCollinsFeed\", \"Phil Collins\", tweet)\n tweet = re.sub(r\"RudyHavenstein\", \"Rudy Havenstein\", tweet)\n tweet = re.sub(r\"22PM\", \"22 PM\", tweet)\n tweet = re.sub(r\"54am\", \"54 AM\", tweet)\n tweet = re.sub(r\"38am\", \"38 AM\", tweet)\n tweet = re.sub(r\"OldFolkExplainStuff\", \"Old Folk Explain Stuff\", tweet)\n tweet = re.sub(r\"BlacklivesMatter\", \"Black Lives Matter\", tweet)\n tweet = re.sub(r\"InsaneLimits\", \"Insane Limits\", tweet)\n tweet = re.sub(r\"youcantsitwithus\", \"you cannot sit with us\", tweet)\n tweet = re.sub(r\"2k15\", \"2015\", tweet)\n tweet = re.sub(r\"TheIran\", \"Iran\", tweet)\n tweet = re.sub(r\"JimmyFallon\", \"Jimmy Fallon\", tweet)\n tweet = re.sub(r\"AlbertBrooks\", \"Albert Brooks\", tweet)\n tweet = re.sub(r\"defense_news\", \"defense news\", tweet)\n tweet = re.sub(r\"nuclearrcSA\", \"Nuclear Risk Control Self Assessment\", tweet)\n tweet = re.sub(r\"Auspol\", \"Australia Politics\", tweet)\n tweet = re.sub(r\"NuclearPower\", \"Nuclear Power\", tweet)\n tweet = re.sub(r\"WhiteTerrorism\", \"White Terrorism\", tweet)\n tweet = re.sub(r\"truthfrequencyradio\", \"Truth Frequency Radio\", tweet)\n tweet = re.sub(r\"ErasureIsNotEquality\", \"Erasure is not equality\", tweet)\n tweet = re.sub(r\"ProBonoNews\", \"Pro Bono News\", tweet)\n tweet = re.sub(r\"JakartaPost\", \"Jakarta Post\", tweet)\n tweet = re.sub(r\"toopainful\", \"too painful\", tweet)\n tweet = 
re.sub(r\"melindahaunton\", \"Melinda Haunton\", tweet)\n tweet = re.sub(r\"NoNukes\", \"No Nukes\", tweet)\n tweet = re.sub(r\"curryspcworld\", \"Currys PC World\", tweet)\n tweet = re.sub(r\"ineedcake\", \"I need cake\", tweet)\n tweet = re.sub(r\"blackforestgateau\", \"black forest gateau\", tweet)\n tweet = re.sub(r\"BBCOne\", \"BBC One\", tweet)\n tweet = re.sub(r\"AlexxPage\", \"Alex Page\", tweet)\n tweet = re.sub(r\"jonathanserrie\", \"Jonathan Serrie\", tweet)\n tweet = re.sub(r\"SocialJerkBlog\", \"Social Jerk Blog\", tweet)\n tweet = re.sub(r\"ChelseaVPeretti\", \"Chelsea Peretti\", tweet)\n tweet = re.sub(r\"irongiant\", \"iron giant\", tweet)\n tweet = re.sub(r\"RonFunches\", \"Ron Funches\", tweet)\n tweet = re.sub(r\"TimCook\", \"Tim Cook\", tweet)\n tweet = re.sub(r\"sebastianstanisaliveandwell\", \"Sebastian Stan is alive and well\", tweet)\n tweet = re.sub(r\"Madsummer\", \"Mad summer\", tweet)\n tweet = re.sub(r\"NowYouKnow\", \"Now you know\", tweet)\n tweet = re.sub(r\"concertphotography\", \"concert photography\", tweet)\n tweet = re.sub(r\"TomLandry\", \"Tom Landry\", tweet)\n tweet = re.sub(r\"showgirldayoff\", \"show girl day off\", tweet)\n tweet = re.sub(r\"Yougslavia\", \"Yugoslavia\", tweet)\n tweet = re.sub(r\"QuantumDataInformatics\", \"Quantum Data Informatics\", tweet)\n tweet = re.sub(r\"FromTheDesk\", \"From The Desk\", tweet)\n tweet = re.sub(r\"TheaterTrial\", \"Theater Trial\", tweet)\n tweet = re.sub(r\"CatoInstitute\", \"Cato Institute\", tweet)\n tweet = re.sub(r\"EmekaGift\", \"Emeka Gift\", tweet)\n tweet = re.sub(r\"LetsBe_Rational\", \"Let us be rational\", tweet)\n tweet = re.sub(r\"Cynicalreality\", \"Cynical reality\", tweet)\n tweet = re.sub(r\"FredOlsenCruise\", \"Fred Olsen Cruise\", tweet)\n tweet = re.sub(r\"NotSorry\", \"not sorry\", tweet)\n tweet = re.sub(r\"UseYourWords\", \"use your words\", tweet)\n tweet = re.sub(r\"WordoftheDay\", \"word of the day\", tweet)\n tweet = re.sub(r\"Dictionarycom\", \"Dictionary.com\", tweet)\n tweet = re.sub(r\"TheBrooklynLife\", \"The Brooklyn Life\", tweet)\n tweet = re.sub(r\"jokethey\", \"joke they\", tweet)\n tweet = re.sub(r\"nflweek1picks\", \"NFL week 1 picks\", tweet)\n tweet = re.sub(r\"uiseful\", \"useful\", tweet)\n tweet = re.sub(r\"JusticeDotOrg\", \"The American Association for Justice\", tweet)\n tweet = re.sub(r\"autoaccidents\", \"auto accidents\", tweet)\n tweet = re.sub(r\"SteveGursten\", \"Steve Gursten\", tweet)\n tweet = re.sub(r\"MichiganAutoLaw\", \"Michigan Auto Law\", tweet)\n tweet = re.sub(r\"birdgang\", \"bird gang\", tweet)\n tweet = re.sub(r\"nflnetwork\", \"NFL Network\", tweet)\n tweet = re.sub(r\"NYDNSports\", \"NY Daily News Sports\", tweet)\n tweet = re.sub(r\"RVacchianoNYDN\", \"Ralph Vacchiano NY Daily News\", tweet)\n tweet = re.sub(r\"EdmontonEsks\", \"Edmonton Eskimos\", tweet)\n tweet = re.sub(r\"david_brelsford\", \"David Brelsford\", tweet)\n tweet = re.sub(r\"TOI_India\", \"The Times of India\", tweet)\n tweet = re.sub(r\"hegot\", \"he got\", tweet)\n tweet = re.sub(r\"SkinsOn9\", \"Skins on 9\", tweet)\n tweet = re.sub(r\"sothathappened\", \"so that happened\", tweet)\n tweet = re.sub(r\"LCOutOfDoors\", \"LC Out Of Doors\", tweet)\n tweet = re.sub(r\"NationFirst\", \"Nation First\", tweet)\n tweet = re.sub(r\"IndiaToday\", \"India Today\", tweet)\n tweet = re.sub(r\"HLPS\", \"helps\", tweet)\n tweet = re.sub(r\"HOSTAGESTHROSW\", \"hostages throw\", tweet)\n tweet = re.sub(r\"SNCTIONS\", \"sanctions\", tweet)\n tweet = re.sub(r\"BidTime\", \"Bid Time\", 
tweet)\n tweet = re.sub(r\"crunchysensible\", \"crunchy sensible\", tweet)\n tweet = re.sub(r\"RandomActsOfRomance\", \"Random acts of romance\", tweet)\n tweet = re.sub(r\"MomentsAtHill\", \"Moments at hill\", tweet)\n tweet = re.sub(r\"eatshit\", \"eat shit\", tweet)\n tweet = re.sub(r\"liveleakfun\", \"live leak fun\", tweet)\n tweet = re.sub(r\"SahelNews\", \"Sahel News\", tweet)\n tweet = re.sub(r\"abc7newsbayarea\", \"ABC 7 News Bay Area\", tweet)\n tweet = re.sub(r\"facilitiesmanagement\", \"facilities management\", tweet)\n tweet = re.sub(r\"facilitydude\", \"facility dude\", tweet)\n tweet = re.sub(r\"CampLogistics\", \"Camp logistics\", tweet)\n tweet = re.sub(r\"alaskapublic\", \"Alaska public\", tweet)\n tweet = re.sub(r\"MarketResearch\", \"Market Research\", tweet)\n tweet = re.sub(r\"AccuracyEsports\", \"Accuracy Esports\", tweet)\n tweet = re.sub(r\"TheBodyShopAust\", \"The Body Shop Australia\", tweet)\n tweet = re.sub(r\"yychail\", \"Calgary hail\", tweet)\n tweet = re.sub(r\"yyctraffic\", \"Calgary traffic\", tweet)\n tweet = re.sub(r\"eliotschool\", \"eliot school\", tweet)\n tweet = re.sub(r\"TheBrokenCity\", \"The Broken City\", tweet)\n tweet = re.sub(r\"OldsFireDept\", \"Olds Fire Department\", tweet)\n tweet = re.sub(r\"RiverComplex\", \"River Complex\", tweet)\n tweet = re.sub(r\"fieldworksmells\", \"field work smells\", tweet)\n tweet = re.sub(r\"IranElection\", \"Iran Election\", tweet)\n tweet = re.sub(r\"glowng\", \"glowing\", tweet)\n tweet = re.sub(r\"kindlng\", \"kindling\", tweet)\n tweet = re.sub(r\"riggd\", \"rigged\", tweet)\n tweet = re.sub(r\"slownewsday\", \"slow news day\", tweet)\n tweet = re.sub(r\"MyanmarFlood\", \"Myanmar Flood\", tweet)\n tweet = re.sub(r\"abc7chicago\", \"ABC 7 Chicago\", tweet)\n tweet = re.sub(r\"copolitics\", \"Colorado Politics\", tweet)\n tweet = re.sub(r\"AdilGhumro\", \"Adil Ghumro\", tweet)\n tweet = re.sub(r\"netbots\", \"net bots\", tweet)\n tweet = re.sub(r\"byebyeroad\", \"bye bye road\", tweet)\n tweet = re.sub(r\"massiveflooding\", \"massive flooding\", tweet)\n tweet = re.sub(r\"EndofUS\", \"End of United States\", tweet)\n tweet = re.sub(r\"35PM\", \"35 PM\", tweet)\n tweet = re.sub(r\"greektheatrela\", \"Greek Theatre Los Angeles\", tweet)\n tweet = re.sub(r\"76mins\", \"76 minutes\", tweet)\n tweet = re.sub(r\"publicsafetyfirst\", \"public safety first\", tweet)\n tweet = re.sub(r\"livesmatter\", \"lives matter\", tweet)\n tweet = re.sub(r\"myhometown\", \"my hometown\", tweet)\n tweet = re.sub(r\"tankerfire\", \"tanker fire\", tweet)\n tweet = re.sub(r\"MEMORIALDAY\", \"memorial day\", tweet)\n tweet = re.sub(r\"MEMORIAL_DAY\", \"memorial day\", tweet)\n tweet = re.sub(r\"instaxbooty\", \"instagram booty\", tweet)\n tweet = re.sub(r\"Jerusalem_Post\", \"Jerusalem Post\", tweet)\n tweet = re.sub(r\"WayneRooney_INA\", \"Wayne Rooney\", tweet)\n tweet = re.sub(r\"VirtualReality\", \"Virtual Reality\", tweet)\n tweet = re.sub(r\"OculusRift\", \"Oculus Rift\", tweet)\n tweet = re.sub(r\"OwenJones84\", \"Owen Jones\", tweet)\n tweet = re.sub(r\"jeremycorbyn\", \"Jeremy Corbyn\", tweet)\n tweet = re.sub(r\"paulrogers002\", \"Paul Rogers\", tweet)\n tweet = re.sub(r\"mortalkombatx\", \"Mortal Kombat X\", tweet)\n tweet = re.sub(r\"mortalkombat\", \"Mortal Kombat\", tweet)\n tweet = re.sub(r\"FilipeCoelho92\", \"Filipe Coelho\", tweet)\n tweet = re.sub(r\"OnlyQuakeNews\", \"Only Quake News\", tweet)\n tweet = re.sub(r\"kostumes\", \"costumes\", tweet)\n tweet = re.sub(r\"YEEESSSS\", \"yes\", tweet)\n tweet = 
re.sub(r\"ToshikazuKatayama\", \"Toshikazu Katayama\", tweet)\n tweet = re.sub(r\"IntlDevelopment\", \"Intl Development\", tweet)\n tweet = re.sub(r\"ExtremeWeather\", \"Extreme Weather\", tweet)\n tweet = re.sub(r\"WereNotGruberVoters\", \"We are not gruber voters\", tweet)\n tweet = re.sub(r\"NewsThousands\", \"News Thousands\", tweet)\n tweet = re.sub(r\"EdmundAdamus\", \"Edmund Adamus\", tweet)\n tweet = re.sub(r\"EyewitnessWV\", \"Eye witness WV\", tweet)\n tweet = re.sub(r\"PhiladelphiaMuseu\", \"Philadelphia Museum\", tweet)\n tweet = re.sub(r\"DublinComicCon\", \"Dublin Comic Con\", tweet)\n tweet = re.sub(r\"NicholasBrendon\", \"Nicholas Brendon\", tweet)\n tweet = re.sub(r\"Alltheway80s\", \"All the way 80s\", tweet)\n tweet = re.sub(r\"FromTheField\", \"From the field\", tweet)\n tweet = re.sub(r\"NorthIowa\", \"North Iowa\", tweet)\n tweet = re.sub(r\"WillowFire\", \"Willow Fire\", tweet)\n tweet = re.sub(r\"MadRiverComplex\", \"Mad River Complex\", tweet)\n tweet = re.sub(r\"feelingmanly\", \"feeling manly\", tweet)\n tweet = re.sub(r\"stillnotoverit\", \"still not over it\", tweet)\n tweet = re.sub(r\"FortitudeValley\", \"Fortitude Valley\", tweet)\n tweet = re.sub(r\"CoastpowerlineTramTr\", \"Coast powerline\", tweet)\n tweet = re.sub(r\"ServicesGold\", \"Services Gold\", tweet)\n tweet = re.sub(r\"NewsbrokenEmergency\", \"News broken emergency\", tweet)\n tweet = re.sub(r\"Evaucation\", \"evacuation\", tweet)\n tweet = re.sub(r\"leaveevacuateexitbe\", \"leave evacuate exit be\", tweet)\n tweet = re.sub(r\"P_EOPLE\", \"PEOPLE\", tweet)\n tweet = re.sub(r\"Tubestrike\", \"tube strike\", tweet)\n tweet = re.sub(r\"CLASS_SICK\", \"CLASS SICK\", tweet)\n tweet = re.sub(r\"localplumber\", \"local plumber\", tweet)\n tweet = re.sub(r\"awesomejobsiri\", \"awesome job siri\", tweet)\n tweet = re.sub(r\"PayForItHow\", \"Pay for it how\", tweet)\n tweet = re.sub(r\"ThisIsAfrica\", \"This is Africa\", tweet)\n tweet = re.sub(r\"crimeairnetwork\", \"crime air network\", tweet)\n tweet = re.sub(r\"KimAcheson\", \"Kim Acheson\", tweet)\n tweet = re.sub(r\"cityofcalgary\", \"City of Calgary\", tweet)\n tweet = re.sub(r\"prosyndicate\", \"pro syndicate\", tweet)\n tweet = re.sub(r\"660NEWS\", \"660 NEWS\", tweet)\n tweet = re.sub(r\"BusInsMagazine\", \"Business Insurance Magazine\", tweet)\n tweet = re.sub(r\"wfocus\", \"focus\", tweet)\n tweet = re.sub(r\"ShastaDam\", \"Shasta Dam\", tweet)\n tweet = re.sub(r\"go2MarkFranco\", \"Mark Franco\", tweet)\n tweet = re.sub(r\"StephGHinojosa\", \"Steph Hinojosa\", tweet)\n tweet = re.sub(r\"Nashgrier\", \"Nash Grier\", tweet)\n tweet = re.sub(r\"NashNewVideo\", \"Nash new video\", tweet)\n tweet = re.sub(r\"IWouldntGetElectedBecause\", \"I would not get elected because\", tweet)\n tweet = re.sub(r\"SHGames\", \"Sledgehammer Games\", tweet)\n tweet = re.sub(r\"bedhair\", \"bed hair\", tweet)\n tweet = re.sub(r\"JoelHeyman\", \"Joel Heyman\", tweet)\n tweet = re.sub(r\"viaYouTube\", \"via YouTube\", tweet)\n \n # Urls\n tweet = re.sub(r\"https?:\\/\\/t.co\\/[A-Za-z0-9]+\", \"\", tweet)\n \n # Words with punctuations and special characters\n punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + \"'`\"\n for p in punctuations:\n tweet = tweet.replace(p, f' {p} ')\n \n # ... and ..\n tweet = tweet.replace('...', ' ... ')\n if '...' not in tweet:\n tweet = tweet.replace('..', ' ... 
') \n \n # Acronyms\n tweet = re.sub(r\"MH370\", \"Malaysia Airlines Flight 370\", tweet)\n tweet = re.sub(r\"m\u00cc\u00bcsica\", \"music\", tweet)\n tweet = re.sub(r\"okwx\", \"Oklahoma City Weather\", tweet)\n tweet = re.sub(r\"arwx\", \"Arkansas Weather\", tweet) \n tweet = re.sub(r\"gawx\", \"Georgia Weather\", tweet) \n tweet = re.sub(r\"scwx\", \"South Carolina Weather\", tweet) \n tweet = re.sub(r\"cawx\", \"California Weather\", tweet)\n tweet = re.sub(r\"tnwx\", \"Tennessee Weather\", tweet)\n tweet = re.sub(r\"azwx\", \"Arizona Weather\", tweet) \n tweet = re.sub(r\"alwx\", \"Alabama Weather\", tweet)\n tweet = re.sub(r\"wordpressdotcom\", \"wordpress\", tweet) \n tweet = re.sub(r\"usNWSgov\", \"United States National Weather Service\", tweet)\n tweet = re.sub(r\"Suruc\", \"Sanliurfa\", tweet) \n \n # Grouping same words without embeddings\n tweet = re.sub(r\"Bestnaijamade\", \"bestnaijamade\", tweet)\n tweet = re.sub(r\"SOUDELOR\", \"Soudelor\", tweet)\n \n #Remove Emoji\n tweet = re.sub(u\"\\U0001F600-\\U0001F64F\",\"\", tweet) # emoticons\n tweet = re.sub(u\"\\U0001F300-\\U0001F5FF\",\"\", tweet) # symbols & pictographs\n tweet = re.sub(u\"\\U0001F680-\\U0001F6FF\",\"\", tweet) # transport & map symbols\n tweet = re.sub(u\"\\U0001F1E0-\\U0001F1FF\",\"\", tweet) # flags (iOS)\n tweet = re.sub(u\"\\U00002702-\\U000027B0\",\"\", tweet)\n tweet = re.sub(u\"\\U000024C2-\\U0001F251\",\"\", tweet)\n \n return tweet\n\ntrain_df['text_cleaned'] = train_df['text'].apply(lambda s : clean(s))\ntest_df['text_cleaned'] = test_df['text'].apply(lambda s : clean(s))", + "class": "Data Transform", + "desc": "This code snippet defines a `clean` function for preprocessing and normalizing the text data by handling special characters, contractions, acronyms, punctuations, URLs, and emojis, and then applies this function to add a new cleaned text column to both the training and testing dataframes using `pandas` and `lambda` functions.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9979898 + }, + "cluster": 0 + }, { + "cell_id": 20, + "code": "def encode(texts, tokenizer, max_len=512):\n all_tokens = []\n all_masks = []\n all_segments = []\n \n for text in texts:\n # Tokenise text\n text = tokenizer.tokenize(text)\n #Reduce 2 slots for start and end tag\n text = text[:max_len-2]\n #Add start and end tag\n input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n #Padding to be added\n pad_len = max_len - len(input_sequence)\n #Get token ids\n tokens = tokenizer.convert_tokens_to_ids(input_sequence)\n #add padding\n tokens += [0] * pad_len\n #Create padding mask with 1's of length of input and 0's with padding length\n pad_masks = [1] * len(input_sequence) + [0] * pad_len\n #Create segment ids with all 0's \n segment_ids = [0] * max_len\n \n all_tokens.append(tokens)\n all_masks.append(pad_masks)\n all_segments.append(segment_ids)\n \n return np.array(all_tokens), np.array(all_masks), np.array(all_segments)", + "class": "Data Transform", + "desc": "This code snippet defines an `encode` function to tokenize, pad, and encode a list of texts using a tokenizer, and returns token ids, attention masks, and segment ids in the form of NumPy arrays.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9931865 + }, + "cluster": 0 + }, { + "cell_id": 24, + "code": "train_input = encode(train_df.text_cleaned.values, tokenizer, max_len=160)\ntest_input = 
encode(test_df.text_cleaned.values, tokenizer, max_len=160)\ntrain_labels = train_df.target.values", + "class": "Data Transform", + "desc": "This code snippet encodes the cleaned text data from the training and testing dataframes into token ids, attention masks, and segment ids using the previously defined `encode` function and the initialized tokenizer, and extracts the target labels from the training data.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.997544 + }, + "cluster": 2 + }, { + "cell_id": 2, + "code": "print(\"Train columns = {}\".format(train_df.columns))\nprint(\"Test columns = {}\".format(test_df.columns))", "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the dimensions (number of rows and columns) of the training and test datasets to provide an initial overview of the data.", + "desc": "This code snippet prints the column names of the training and testing dataframes using `pandas`. ", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99101853 + "subclass": "show_columns", + "subclass_id": 71, + "predicted_subclass_probability": 0.99450326 }, - "cluster": 7 + "cluster": 3 + }, { + "cell_id": 4, + "code": "print(\"So there are {} occourance of disastrous twitts and {} occourances of non disastrous\".format(x[1],x[0]))", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints the number of occurrences of disastrous and non-disastrous tweets in the training dataframe using the `format` method on the value counts result.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.91736937 + }, + "cluster": 3 }, { "cell_id": 5, - "code": "# plot target distribution\nsns.countplot(x='target', data=train)\nplt.title('Target distribution')\nplt.show()", + "code": "train_df.head(10)", "class": "Exploratory Data Analysis", - "desc": "This code snippet creates a count plot to visualize the distribution of the target variable in the training dataset.", + "desc": "This code snippet displays the first 10 rows of the training dataframe using the `head` method in `pandas`.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99668735 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997634 }, - "cluster": -1 + "cluster": 2 + }, { + "cell_id": 6, + "code": "train_df.isnull().sum()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet checks for and sums up the total number of missing values in each column of the training dataframe using the `isnull` and `sum` methods in `pandas`.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.99896073 + }, + "cluster": 6 + }, { + "cell_id": 7, + "code": "test_df.isnull().sum()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet checks for and sums up the total number of missing values in each column of the testing dataframe using the `isnull` and `sum` methods in `pandas`.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.9990055 + }, + "cluster": 6 + 
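The encode routine described in the cells above lays out three parallel arrays per tweet: zero-padded token ids, an attention mask flagging the real tokens, and all-zero segment ids for single-sentence input. A minimal sketch of that layout, assuming a plain whitespace split and made-up vocabulary ids in place of a real subword tokenizer:

import numpy as np

def toy_encode(texts, max_len=8):
    # Stand-in vocabulary; real ids would come from a BERT vocab file.
    vocab = {"[PAD]": 0, "[CLS]": 101, "[SEP]": 102}
    all_tokens, all_masks, all_segments = [], [], []
    for text in texts:
        words = text.lower().split()[:max_len - 2]          # leave room for [CLS]/[SEP]
        sequence = ["[CLS]"] + words + ["[SEP]"]
        ids = [vocab.setdefault(w, len(vocab) + 100) for w in sequence]
        pad_len = max_len - len(ids)
        all_tokens.append(ids + [0] * pad_len)               # zero-padded token ids
        all_masks.append([1] * len(ids) + [0] * pad_len)     # 1 = real token, 0 = padding
        all_segments.append([0] * max_len)                   # single sentence, so all zeros
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

tokens, masks, segments = toy_encode(["forest fire near la ronge"])
print(tokens.shape, masks[0])  # (1, 8) [1 1 1 1 1 1 1 0]

The mask is what lets the model ignore the padded positions; the segment ids only matter for sentence-pair tasks, which is why they stay zero here.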
}, { + "cell_id": 8, + "code": "train_df[train_df.keyword.notnull()].head(10)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet filters the training dataframe to display the first 10 rows where the 'keyword' column is not null, using the `notnull` and `head` methods in `pandas`.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997584 + }, + "cluster": 4 + }, { + "cell_id": 9, + "code": "train_df[train_df.keyword.notnull()].tail(10)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet filters the training dataframe to display the last 10 rows where the 'keyword' column is not null, using the `notnull` and `tail` methods in `pandas`.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997335 + }, + "cluster": 4 + }, { + "cell_id": 10, + "code": "train_df[train_df.keyword.isnull()].head(10)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet filters the training dataframe to display the first 10 rows where the 'keyword' column is null, using the `isnull` and `head` methods in `pandas`.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9996885 + }, + "cluster": 4 }, { "cell_id": 0, - "code": "# setup\n\nfrom collections import Counter, defaultdict\n\nimport numpy as np \nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport os\nimport re\n\nfrom sklearn.preprocessing import FunctionTransformer, MinMaxScaler\nfrom sklearn.pipeline import Pipeline, make_pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.model_selection import train_test_split \nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nfrom sklearn import set_config\n\nimport optuna\nimport nltk\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\n\nplt.style.use('ggplot')\n%matplotlib inline\n\nset_config(display='diagram')", + "code": "# Import packeges\nimport os\nimport gc\nimport re\nimport time\nimport warnings\nimport string\nimport numpy as np\nimport pandas as pd\npd.set_option('display.max_rows', 500)\npd.set_option('display.max_columns', 500)\npd.set_option('display.width', 1000)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom nltk.corpus import stopwords\nfrom nltk.util import ngrams # function for making ngrams\nfrom collections import defaultdict\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Dense, Input\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nimport tensorflow_hub as hub\nimport tokenization\n\nwarnings.filterwarnings(\"ignore\")\neng_stopwords = set(stopwords.words(\"english\"))", "class": "Imports and Environment", - "desc": "This code snippet imports various libraries and modules essential for data manipulation, visualization, preprocessing, machine learning, and sets some display configurations for Matplotlib and Scikit-learn. 
", + "desc": "This code snippet imports various packages and libraries including `os`, `gc`, `re`, `time`, `warnings`, `string`, `numpy`, `pandas`, `matplotlib.pyplot`, `seaborn`, NLTK, TensorFlow, and TensorFlow Hub, and configures pandas display options.", "testing": { "class": "Imports_and_Environment", "subclass": "set_options", "subclass_id": 23, - "predicted_subclass_probability": 0.99920374 + "predicted_subclass_probability": 0.9975701 }, "cluster": 0 }, { - "cell_id": 14, - "code": "def to_class_label(probs, threshold):\n \"\"\"convert predicted probabilities to class labels\"\"\"\n return (probs >= threshold).astype('int')\n\ndef get_optimal_threshold(fitted_model, x_test, y_test):\n \"\"\"Threshold tuning\"\"\"\n thresholds = np.arange(0, 1, 0.0005)\n y_hat = fitted_model.predict_proba(x_test)\n pos_clas_probs = y_hat[:, 1]\n acc_scores = [accuracy_score(y_test, to_class_label(pos_clas_probs, thres)) for thres in thresholds]\n idx = np.argmax(acc_scores)\n \n return thresholds[idx]\n ", + "cell_id": 27, + "code": "# Thanks to https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub\n# Prediction by BERT model\nmodel.load_weights('model.h5')\ntest_pred_BERT = model.predict(test_input)\ntest_pred_BERT_int = test_pred_BERT.round().astype('int')", "class": "Model Evaluation", - "desc": "This code snippet defines functions to convert predicted probabilities to class labels based on a threshold and to find the optimal threshold that maximizes accuracy on the validation set.", + "desc": "This code snippet loads the best weights for the model from the checkpoint file 'model.h5', makes predictions on the test data, and rounds the predictions to obtain integer values (0 or 1), using `model.load_weights` and `model.predict` in TensorFlow and Keras.", "testing": { "class": "Model_Evaluation", "subclass": "predict_on_test", "subclass_id": 48, - "predicted_subclass_probability": 0.54912084 + "predicted_subclass_probability": 0.9870588 }, "cluster": 0 }, { - "cell_id": 16, - "code": "# get optimal threshold\nopt_thres = get_optimal_threshold(lr, x_test, y_test)\nprint(f'Optimal threshold for trained LR {get_optimal_threshold(lr, x_test, y_test):.4f}')", - "class": "Model Evaluation", - "desc": "This code snippet calculates and prints the optimal threshold for converting predicted probabilities to class labels, maximizing accuracy on the validation set for the trained logistic regression model.", + "cell_id": 21, + "code": "def build_model(bert_layer, max_len=512):\n input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n input_mask = Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n segment_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n\n _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n clf_output = sequence_output[:, 0, :]\n out = Dense(1, activation='sigmoid')(clf_output)\n \n model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])\n \n return model", + "class": "Model Training", + "desc": "This code snippet defines a `build_model` function that constructs and compiles a binary classification model using a BERT layer for encoding text and a dense layer with a sigmoid activation function for output, and compiles the model with the Adam optimizer and binary cross-entropy loss using `TensorFlow` and `Keras`.", "testing": { "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - 
"predicted_subclass_probability": 0.6867679 + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9912102 }, "cluster": 0 }, { - "cell_id": 12, - "code": "# Tune logistic regression\ndef objective(trial):\n x, y = x_train, y_train\n C = trial.suggest_float('C', 1e-6, 1e6, log=True)\n penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])\n l1_ratio = trial.suggest_float('l1_ratio', 0, 1)\n if penalty != 'elasticnet':\n l1_ratio = None\n \n clf = make_pipeline(preprocess, LogisticRegression(C=C,\n penalty=penalty,\n l1_ratio=l1_ratio,\n solver='saga',\n max_iter=800))\n clf.fit(x,y)\n \n acc = accuracy_score(y_test, clf.predict(x_test))\n \n return acc\n\nclass EarlyStopping:\n \"\"\"stop tuning after value remains unchanged after 10 successive trials\"\"\"\n def __init__(self, max_rounds = 10):\n self.max_rounds = max_rounds\n self.current_rounds = 0\n \n def __call__(self, study, trial, tol = 1e-6):\n if abs(trial.value - study.best_value) <= tol:\n self.current_rounds += 1\n elif trial.value == study.best_value:\n self.current_rounds = 0\n if self.current_rounds >= self.max_rounds:\n study.stop()", + "cell_id": 22, + "code": "%%time\n\nbert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)", "class": "Model Training", - "desc": "This code snippet defines a function for hyperparameter tuning of a logistic regression model using Optuna and an early stopping class to halt the study when improvement plateaus, and it calculates the accuracy score after fitting the model.", + "desc": "This code snippet loads a pre-trained BERT model from TensorFlow Hub as a Keras layer and sets it to be trainable using `tensorflow_hub.KerasLayer`.", "testing": { "class": "Model_Train", - "subclass": "find_best_model_class", - "subclass_id": 3, - "predicted_subclass_probability": 0.3142369 + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.8483843 }, "cluster": 0 }, { - "cell_id": 13, - "code": "# # create study and run trials\nes = EarlyStopping()\n\nstudy = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler()) # using Tree-structured Parzen Estimator to sample\nstudy.optimize(objective, n_trials=250, callbacks=[es])", + "cell_id": 25, + "code": "model = build_model(bert_layer, max_len=160)\nmodel.summary()", "class": "Model Training", - "desc": "This code snippet creates an Optuna study for hyperparameter optimization using the Tree-structured Parzen Estimator sampler and runs the optimization process for 250 trials with early stopping via the defined callback function.", + "desc": "This code snippet builds a binary classification model using the previously defined `build_model` function with a maximum sequence length of 160 and prints the summary of the model using TensorFlow and Keras.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.80090225 + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9372223 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 15, - "code": "# train LR on best parameters\nlr = LogisticRegression(**study.best_params, solver='saga', max_iter=800)\nlr = make_pipeline(preprocess, lr)\nlr.fit(x_train, y_train)", + "cell_id": 26, + "code": "checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)\n\ntrain_history = model.fit(\n 
train_input, train_labels,\n validation_split=0.2,\n epochs=3,\n callbacks=[checkpoint],\n batch_size=32\n)", "class": "Model Training", - "desc": "This code snippet trains a logistic regression model with the best hyperparameters identified by the Optuna study, within a pipeline that includes preprocessing steps.", + "desc": "This code snippet trains the BERT-based model on the encoded training data for 3 epochs, using a validation split of 0.2 and a checkpoint callback to save the best model based on validation loss, with a batch size of 32, using `ModelCheckpoint` and `model.fit` in TensorFlow and Keras.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.99761343 + "predicted_subclass_probability": 0.9943159 }, "cluster": 0 }, { - "cell_id": 7, - "code": "def plot_top_n_words(target = 1, n=50):\n \n count_dict = defaultdict(int)\n\n for tweet in train.query(f'target=={target}')['text']:\n for word in word_tokenize(tweet):\n count_dict[word] += 1\n\n wc_df = pd.DataFrame(data=count_dict.items(), columns = ['word', 'count'])\n sns.barplot(x = 'count', y='word', data=wc_df.sort_values(by=['count'], ascending=False)[:n])", + "cell_id": 3, + "code": "x=train_df.target.value_counts()\nsns.barplot(x.index,x)\nplt.gca().set_ylabel('# of occurrence')", "class": "Visualization", - "desc": "This code snippet defines a function to plot the top `n` most frequent words from the tweets in the training dataset that belong to a specified target class (either 1 or 0).", + "desc": "This code snippet visualizes the distribution of the target variable in the training dataframe using a bar plot created with `seaborn`.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.8610096 + "predicted_subclass_probability": 0.9978562 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 8, - "code": "plot_top_n_words()", + "cell_id": 15, + "code": "plt.figure(figsize=(12,6))\n## sentenses\nplt.subplot(121)\nplt.suptitle(\"Are longer comments more Disastrous\",fontsize=20)\nsns.violinplot(y='word_count',x='target', data=train_df,split=True)\nplt.xlabel('Target?', fontsize=12)\nplt.ylabel('# of words', fontsize=12)\nplt.title(\"Number of words in each comment\", fontsize=15)\n\n# words\nplt.subplot(122)\nsns.violinplot(y='count_letters',x='target', data=train_df,split=True,inner=\"quart\")\nplt.xlabel('Target?', fontsize=12)\nplt.ylabel('# of letters', fontsize=12)\nplt.title(\"Number of letters in each comment\", fontsize=15)\n\nplt.show()", "class": "Visualization", - "desc": "This code snippet calls the previously defined function to plot the top 50 most frequent words from the tweets labeled as target class 1 in the training dataset.", + "desc": "This code snippet creates a figure with two subplots displaying violin plots of word count and letter count distributions against the target variable, using `matplotlib` and `seaborn`.", "testing": { "class": "Visualization", - "subclass": "relationship", - "subclass_id": 81, - "predicted_subclass_probability": 0.58346504 + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9839205 }, - "cluster": 0 + "cluster": -1 + }, { + "cell_id": 16, + "code": "train_df['word_unique_percent']=train_df['unique_word_count']*100/train_df['word_count']\ntest_df['word_unique_percent']=test_df['unique_word_count']*100/test_df['word_count']\nplt.figure(figsize=(12,6))\nplt.subplot(121)\nplt.title(\"Percentage of unique 
words of total words in comment\")\n#sns.boxplot(x='clean', y='word_unique_percent', data=train_feats)\nax=sns.kdeplot(train_df[train_df.target == 0].word_unique_percent, label=\"Disastrous\",shade=True,color='r')\nax=sns.kdeplot(train_df[train_df.target == 1].word_unique_percent, label=\" Non Disastrous\")\nplt.legend()\nplt.ylabel('Number of occurances', fontsize=12)\nplt.xlabel('Percent unique words', fontsize=12)", + "class": "Visualization", + "desc": "This code snippet adds a new column to the training and testing dataframes that calculates the percentage of unique words out of the total word count, and then creates a KDE plot visualizing this percentage for both classes of the target variable using `pandas` and `seaborn`.", + "testing": { + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.97993535 + }, + "cluster": -1 + }, { + "cell_id": 17, + "code": "def generate_ngrams(text, n_gram=1):\n token = [token for token in text.lower().split(' ') if token != '' if token not in eng_stopwords]\n ngrams = zip(*[token[i:] for i in range(n_gram)])\n return [' '.join(ngram) for ngram in ngrams]\n\n# Bigrams\ndisaster_bigrams = defaultdict(int)\nnondisaster_bigrams = defaultdict(int)\n\nfor tweet in train_df[train_df['target']==1]['text']:\n for word in generate_ngrams(tweet, n_gram=2):\n disaster_bigrams[word] += 1\n \nfor tweet in train_df[train_df['target']==0]['text']:\n for word in generate_ngrams(tweet, n_gram=2):\n nondisaster_bigrams[word] += 1\n \ndf_disaster_bigrams = pd.DataFrame(sorted(disaster_bigrams.items(), key=lambda x: x[1])[::-1])\ndf_nondisaster_bigrams = pd.DataFrame(sorted(nondisaster_bigrams.items(), key=lambda x: x[1])[::-1])\n\nfig, axes = plt.subplots(ncols=2, figsize=(10, 10))\nplt.tight_layout()\nsns.barplot(y=df_disaster_bigrams[0].values[:10], x=df_disaster_bigrams[1].values[:10], ax=axes[0], color='cyan')\nsns.barplot(y=df_nondisaster_bigrams[0].values[:10], x=df_nondisaster_bigrams[1].values[:10], ax=axes[1], color='pink')\nfor i in range(2):\n axes[i].spines['right'].set_visible(False)\n axes[i].set_xlabel('')\n axes[i].set_ylabel('')\n axes[i].tick_params(axis='x', labelsize=10)\n axes[i].tick_params(axis='y', labelsize=10)\naxes[0].set_title('most common bigrams in Disaster Tweets', fontsize=15)\naxes[1].set_title('most common bigrams in Non-disaster Tweets', fontsize=15)\nplt.show()", + "class": "Visualization", + "desc": "This code snippet defines a function to generate n-grams, computes the frequency of bigrams in disaster and non-disaster tweets, creates dataframes of sorted bigram counts, and visualizes the top 10 bigrams for each class with bar plots using `pandas`, `collections.defaultdict`, and `seaborn`.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.95510435 + }, + "cluster": -1 + }, { + "cell_id": 18, + "code": "# Trigrams\ndisaster_trigrams = defaultdict(int)\nnondisaster_trigrams = defaultdict(int)\n\nfor tweet in train_df[train_df['target']==1]['text']:\n for word in generate_ngrams(tweet, n_gram=3):\n disaster_trigrams[word] += 1\n \nfor tweet in train_df[train_df['target']==0]['text']:\n for word in generate_ngrams(tweet, n_gram=3):\n nondisaster_trigrams[word] += 1\n \ndf_disaster_trigrams = pd.DataFrame(sorted(disaster_trigrams.items(), key=lambda x: x[1])[::-1])\ndf_nondisaster_trigrams = pd.DataFrame(sorted(nondisaster_trigrams.items(), key=lambda x: x[1])[::-1])\n\nfig, axes = 
plt.subplots(ncols=2, figsize=(10, 10))\nplt.tight_layout()\nsns.barplot(y=df_disaster_trigrams[0].values[:10], x=df_disaster_trigrams[1].values[:10], ax=axes[0], color='cyan')\nsns.barplot(y=df_nondisaster_trigrams[0].values[:10], x=df_nondisaster_trigrams[1].values[:10], ax=axes[1], color='pink')\nfor i in range(2):\n axes[i].spines['right'].set_visible(False)\n axes[i].set_xlabel('')\n axes[i].set_ylabel('')\n axes[i].tick_params(axis='x', labelsize=10)\n axes[i].tick_params(axis='y', labelsize=10)\naxes[0].set_title('most common trigrams in Disaster Tweets', fontsize=15)\naxes[1].set_title('most common trigrams in Non-disaster Tweets', fontsize=15)\nplt.show()", + "class": "Visualization", + "desc": "This code snippet creates trigrams from disaster and non-disaster tweets, counts their occurrences, sorts these counts into dataframes, and visualizes the top 10 most common trigrams for each class with bar plots using `pandas`, `collections.defaultdict`, `matplotlib`, and `seaborn`.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.94842196 + }, + "cluster": -1 }], - "notebook_id": 9, - "notebook_name": "logistic-regression-with-threshold-tuning" + "notebook_id": 8, + "notebook_name": "nlp-eda-cleaning-bert.ipynb" }, { "cells": [{ "cell_id": 89, "code": "output = pd.DataFrame({'id': full_test_df.id, 'target': predictions_rf})\noutput.to_csv('my_submission.csv', index=False)\nprint(\"Your submission was successfully saved!\")", "class": "Data Export", - "desc": "This code creates a DataFrame containing the 'id' and predicted 'target' values, then saves it as a CSV file named 'my_submission.csv', and prints a confirmation message.", + "desc": "This code creates a DataFrame named `output` containing the 'id' and predicted 'target' from the test data, and saves it as a CSV file named 'my_submission.csv' without the index.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, "predicted_subclass_probability": 0.9990031 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 2, "code": "# loading the dataset\ntrain_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")", "class": "Data Extraction", - "desc": "This code loads the training and test datasets from specified CSV files into pandas DataFrames named `train_df` and `test_df`.", + "desc": "This code loads the training and test datasets from CSV files into pandas DataFrames named `train_df` and `test_df`, respectively.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, "predicted_subclass_probability": 0.99973744 }, - "cluster": 3 + "cluster": 0 }, { "cell_id": 10, "code": "ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]\ntrain_df.at[train_df['id'].isin(ids_with_target_error),'target'] = 0\ntrain_df[train_df['id'].isin(ids_with_target_error)]", "class": "Data Transform", - "desc": "This code identifies specific rows in the `train_df` DataFrame by their 'id' values, sets their 'target' column to 0 to correct errors, and then displays these rows.", + "desc": "This code corrects the 'target' column for selected rows in the `train_df` DataFrame where the 'id' is in the specified list `ids_with_target_error`, setting their 'target' values to 0.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, 
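The relabelling step above selects rows by membership of their id in a fixed list and overwrites the target. A minimal sketch of that pattern on a toy DataFrame, using .loc with a boolean mask, the conventional indexer for bulk assignment (the ids below are invented for the example):

import pandas as pd

# Toy DataFrame and invented ids; the notebook itself works on the competition data.
df = pd.DataFrame({"id": [1, 2, 3, 4], "target": [1, 1, 0, 1]})
ids_with_target_error = [2, 4]

# Boolean mask of rows whose id is in the list, then bulk-assign the corrected label.
mask = df["id"].isin(ids_with_target_error)
df.loc[mask, "target"] = 0
print(df[mask])
#    id  target
# 1   2       0
# 3   4       0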
"predicted_subclass_probability": 0.9975581 }, - "cluster": 1 + "cluster": 5 }, { "cell_id": 13, "code": "set1 = set(train_df[train_df.duplicated(subset=['text'])]['id'].values)", "class": "Data Transform", - "desc": "This code creates a set of 'id' values from the `train_df` DataFrame which have duplicate 'text' entries.", + "desc": "This code creates a set named `set1` containing the 'id' values of the rows in the `train_df` DataFrame that have duplicate 'text' entries.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "count_duplicates", "subclass_id": 38, "predicted_subclass_probability": 0.8107515 }, - "cluster": 1 + "cluster": 7 }, { "cell_id": 14, "code": "set2 = (train_df[train_df.duplicated(subset=['text','target'])]['id'].values)", "class": "Data Transform", - "desc": "This code creates an array of 'id' values from the `train_df` DataFrame which have duplicate 'text' and 'target' entries.", + "desc": "This code creates an array named `set2` containing the 'id' values of the rows in the `train_df` DataFrame that have duplicate entries based on both 'text' and 'target' columns.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "count_duplicates", "subclass_id": 38, "predicted_subclass_probability": 0.8105881 }, - "cluster": 1 + "cluster": 7 }, { "cell_id": 15, "code": "tweet_ids_possible_wrong = set1.difference(set2)\n# those are the tweets which have duplicated text but different label\nprint(train_df[train_df['id'].isin(tweet_ids_possible_wrong)].text.values)\nprint(train_df[train_df['id'].isin(tweet_ids_possible_wrong)].target.values)", "class": "Data Transform", - "desc": "This code identifies 'id' values with duplicated text but different labels by calculating the difference between two sets, and then prints the 'text' and 'target' values for these tweets from the `train_df` DataFrame.", + "desc": "This code identifies tweets in `train_df` that have duplicated text but different labels by computing the difference between `set1` and `set2`, then prints the text and target values of these potentially mislabeled tweets.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.4053326 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 16, "code": "# setting the argument keep=False, drops all the duplicates\ntrain_df.drop_duplicates(subset=['text'], keep=False, inplace=True)", "class": "Data Transform", - "desc": "This code removes all rows with duplicated 'text' entries in the `train_df` DataFrame by dropping duplicates while not keeping any instance of the duplicate rows.", + "desc": "This code removes all rows with duplicate 'text' entries from the `train_df` DataFrame by using the `drop_duplicates` method with the `keep=False` argument.", "testing": { "class": "Data_Transform", "subclass": "remove_duplicates", "subclass_id": 19, "predicted_subclass_probability": 0.89899284 }, - "cluster": 1 + "cluster": 7 }, { "cell_id": 27, "code": "train_df['location'] = train_df['location'].str.lower()\ntrain_df['location'] = train_df['location'].str.strip()\ntest_df['location'] = train_df['location'].str.lower()\ntest_df['location'] = train_df['location'].str.strip()", "class": "Data Transform", - "desc": "This code normalizes the 'location' column in both `train_df` and `test_df` by converting all text to lowercase and stripping any leading or trailing whitespace.", + "desc": "This code standardizes the 'location' column in both `train_df` and `test_df` DataFrames by converting the text to 
lowercase and stripping leading/trailing whitespace using pandas string methods.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.7562164 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 28, "code": "loc_dict = {'united states':'usa',\n 'us':'usa',\n 'united kingdom':'uk',\n 'nyc':'new york',\n 'london, uk': 'london',\n 'london, england':'london',\n 'new york, ny':'new york',\n 'everywhere':'worldwide'}", "class": "Data Transform", - "desc": "This code creates a dictionary `loc_dict` that maps various location labels to standardized shorter forms, aiding in consistent location data transformation.", + "desc": "This code defines a dictionary named `loc_dict` that maps various location names and their synonyms to a standardized location name for easier data uniformity.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "define_variables", "subclass_id": 77, "predicted_subclass_probability": 0.99767166 }, - "cluster": 1 + "cluster": 0 }, { "cell_id": 29, "code": "train_df['location'].replace(loc_dict, inplace=True)\ntest_df['location'].replace(loc_dict, inplace=True)", "class": "Data Transform", - "desc": "This code replaces values in the 'location' column of both `train_df` and `test_df` DataFrames with standardized labels based on the `loc_dict` dictionary.", + "desc": "This code standardizes location names in the 'location' column of both `train_df` and `test_df` DataFrames by replacing them according to the `loc_dict` dictionary.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, "predicted_subclass_probability": 0.92937183 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 33, "code": "abbreviations = {\n \"$\" : \" dollar \",\n \"\u20ac\" : \" euro \",\n \"4ao\" : \"for adults only\",\n \"a.m\" : \"before midday\",\n \"a3\" : \"anytime anywhere anyplace\",\n \"aamof\" : \"as a matter of fact\",\n \"acct\" : \"account\",\n \"adih\" : \"another day in hell\",\n \"afaic\" : \"as far as i am concerned\",\n \"afaict\" : \"as far as i can tell\",\n \"afaik\" : \"as far as i know\",\n \"afair\" : \"as far as i remember\",\n \"afk\" : \"away from keyboard\",\n \"app\" : \"application\",\n \"approx\" : \"approximately\",\n \"apps\" : \"applications\",\n \"asap\" : \"as soon as possible\",\n \"asl\" : \"age, sex, location\",\n \"atk\" : \"at the keyboard\",\n \"ave.\" : \"avenue\",\n \"aymm\" : \"are you my mother\",\n \"ayor\" : \"at your own risk\", \n \"b&b\" : \"bed and breakfast\",\n \"b+b\" : \"bed and breakfast\",\n \"b.c\" : \"before christ\",\n \"b2b\" : \"business to business\",\n \"b2c\" : \"business to customer\",\n \"b4\" : \"before\",\n \"b4n\" : \"bye for now\",\n \"b@u\" : \"back at you\",\n \"bae\" : \"before anyone else\",\n \"bak\" : \"back at keyboard\",\n \"bbbg\" : \"bye bye be good\",\n \"bbc\" : \"british broadcasting corporation\",\n \"bbias\" : \"be back in a second\",\n \"bbl\" : \"be back later\",\n \"bbs\" : \"be back soon\",\n \"be4\" : \"before\",\n \"bfn\" : \"bye for now\",\n \"blvd\" : \"boulevard\",\n \"bout\" : \"about\",\n \"brb\" : \"be right back\",\n \"bros\" : \"brothers\",\n \"brt\" : \"be right there\",\n \"bsaaw\" : \"big smile and a wink\",\n \"btw\" : \"by the way\",\n \"bwl\" : \"bursting with laughter\",\n \"c/o\" : \"care of\",\n \"cet\" : \"central european time\",\n \"cf\" : \"compare\",\n \"cia\" : \"central intelligence agency\",\n \"csl\" : \"can not stop laughing\",\n \"cu\" : \"see you\",\n \"cul8r\" : \"see you 
later\",\n \"cv\" : \"curriculum vitae\",\n \"cwot\" : \"complete waste of time\",\n \"cya\" : \"see you\",\n \"cyt\" : \"see you tomorrow\",\n \"dae\" : \"does anyone else\",\n \"dbmib\" : \"do not bother me i am busy\",\n \"diy\" : \"do it yourself\",\n \"dm\" : \"direct message\",\n \"dwh\" : \"during work hours\",\n \"e123\" : \"easy as one two three\",\n \"eet\" : \"eastern european time\",\n \"eg\" : \"example\",\n \"embm\" : \"early morning business meeting\",\n \"encl\" : \"enclosed\",\n \"encl.\" : \"enclosed\",\n \"etc\" : \"and so on\",\n \"faq\" : \"frequently asked questions\",\n \"fawc\" : \"for anyone who cares\",\n \"fb\" : \"facebook\",\n \"fc\" : \"fingers crossed\",\n \"fig\" : \"figure\",\n \"fimh\" : \"forever in my heart\", \n \"ft.\" : \"feet\",\n \"ft\" : \"featuring\",\n \"ftl\" : \"for the loss\",\n \"ftw\" : \"for the win\",\n \"fwiw\" : \"for what it is worth\",\n \"fyi\" : \"for your information\",\n \"g9\" : \"genius\",\n \"gahoy\" : \"get a hold of yourself\",\n \"gal\" : \"get a life\",\n \"gcse\" : \"general certificate of secondary education\",\n \"gfn\" : \"gone for now\",\n \"gg\" : \"good game\",\n \"gl\" : \"good luck\",\n \"glhf\" : \"good luck have fun\",\n \"gmt\" : \"greenwich mean time\",\n \"gmta\" : \"great minds think alike\",\n \"gn\" : \"good night\",\n \"g.o.a.t\" : \"greatest of all time\",\n \"goat\" : \"greatest of all time\",\n \"goi\" : \"get over it\",\n \"gps\" : \"global positioning system\",\n \"gr8\" : \"great\",\n \"gratz\" : \"congratulations\",\n \"gyal\" : \"girl\",\n \"h&c\" : \"hot and cold\",\n \"hp\" : \"horsepower\",\n \"hr\" : \"hour\",\n \"hrh\" : \"his royal highness\",\n \"ht\" : \"height\",\n \"ibrb\" : \"i will be right back\",\n \"ic\" : \"i see\",\n \"icq\" : \"i seek you\",\n \"icymi\" : \"in case you missed it\",\n \"idc\" : \"i do not care\",\n \"idgadf\" : \"i do not give a damn fuck\",\n \"idgaf\" : \"i do not give a fuck\",\n \"idk\" : \"i do not know\",\n \"ie\" : \"that is\",\n \"i.e\" : \"that is\",\n \"ifyp\" : \"i feel your pain\",\n \"IG\" : \"instagram\",\n \"iirc\" : \"if i remember correctly\",\n \"ilu\" : \"i love you\",\n \"ily\" : \"i love you\",\n \"imho\" : \"in my humble opinion\",\n \"imo\" : \"in my opinion\",\n \"imu\" : \"i miss you\",\n \"iow\" : \"in other words\",\n \"irl\" : \"in real life\",\n \"j4f\" : \"just for fun\",\n \"jic\" : \"just in case\",\n \"jk\" : \"just kidding\",\n \"jsyk\" : \"just so you know\",\n \"l8r\" : \"later\",\n \"lb\" : \"pound\",\n \"lbs\" : \"pounds\",\n \"ldr\" : \"long distance relationship\",\n \"lmao\" : \"laugh my ass off\",\n \"lmfao\" : \"laugh my fucking ass off\",\n \"lol\" : \"laughing out loud\",\n \"ltd\" : \"limited\",\n \"ltns\" : \"long time no see\",\n \"m8\" : \"mate\",\n \"mf\" : \"motherfucker\",\n \"mfs\" : \"motherfuckers\",\n \"mfw\" : \"my face when\",\n \"mofo\" : \"motherfucker\",\n \"mph\" : \"miles per hour\",\n \"mr\" : \"mister\",\n \"mrw\" : \"my reaction when\",\n \"ms\" : \"miss\",\n \"mte\" : \"my thoughts exactly\",\n \"nagi\" : \"not a good idea\",\n \"nbc\" : \"national broadcasting company\",\n \"nbd\" : \"not big deal\",\n \"nfs\" : \"not for sale\",\n \"ngl\" : \"not going to lie\",\n \"nhs\" : \"national health service\",\n \"nrn\" : \"no reply necessary\",\n \"nsfl\" : \"not safe for life\",\n \"nsfw\" : \"not safe for work\",\n \"nth\" : \"nice to have\",\n \"nvr\" : \"never\",\n \"nyc\" : \"new york city\",\n \"oc\" : \"original content\",\n \"og\" : \"original\",\n \"ohp\" : \"overhead projector\",\n \"oic\" : 
\"oh i see\",\n \"omdb\" : \"over my dead body\",\n \"omg\" : \"oh my god\",\n \"omw\" : \"on my way\",\n \"p.a\" : \"per annum\",\n \"p.m\" : \"after midday\",\n \"pm\" : \"prime minister\",\n \"poc\" : \"people of color\",\n \"pov\" : \"point of view\",\n \"pp\" : \"pages\",\n \"ppl\" : \"people\",\n \"prw\" : \"parents are watching\",\n \"ps\" : \"postscript\",\n \"pt\" : \"point\",\n \"ptb\" : \"please text back\",\n \"pto\" : \"please turn over\",\n \"qpsa\" : \"what happens\", #\"que pasa\",\n \"ratchet\" : \"rude\",\n \"rbtl\" : \"read between the lines\",\n \"rlrt\" : \"real life retweet\", \n \"rofl\" : \"rolling on the floor laughing\",\n \"roflol\" : \"rolling on the floor laughing out loud\",\n \"rotflmao\" : \"rolling on the floor laughing my ass off\",\n \"rt\" : \"retweet\",\n \"ruok\" : \"are you ok\",\n \"sfw\" : \"safe for work\",\n \"sk8\" : \"skate\",\n \"smh\" : \"shake my head\",\n \"sq\" : \"square\",\n \"srsly\" : \"seriously\", \n \"ssdd\" : \"same stuff different day\",\n \"tbh\" : \"to be honest\",\n \"tbs\" : \"tablespooful\",\n \"tbsp\" : \"tablespooful\",\n \"tfw\" : \"that feeling when\",\n \"thks\" : \"thank you\",\n \"tho\" : \"though\",\n \"thx\" : \"thank you\",\n \"tia\" : \"thanks in advance\",\n \"til\" : \"today i learned\",\n \"tl;dr\" : \"too long i did not read\",\n \"tldr\" : \"too long i did not read\",\n \"tmb\" : \"tweet me back\",\n \"tntl\" : \"trying not to laugh\",\n \"ttyl\" : \"talk to you later\",\n \"u\" : \"you\",\n \"u2\" : \"you too\",\n \"u4e\" : \"yours for ever\",\n \"utc\" : \"coordinated universal time\",\n \"w/\" : \"with\",\n \"w/o\" : \"without\",\n \"w8\" : \"wait\",\n \"wassup\" : \"what is up\",\n \"wb\" : \"welcome back\",\n \"wtf\" : \"what the fuck\",\n \"wtg\" : \"way to go\",\n \"wtpa\" : \"where the party at\",\n \"wuf\" : \"where are you from\",\n \"wuzup\" : \"what is up\",\n \"wywh\" : \"wish you were here\",\n \"yd\" : \"yard\",\n \"ygtr\" : \"you got that right\",\n \"ynk\" : \"you never know\",\n \"zzz\" : \"sleeping bored and tired\"\n}", "class": "Data Transform", - "desc": "This code defines a dictionary `abbreviations` that maps common abbreviations and shortcuts to their expanded forms, aiding in text normalization and preprocessing.", + "desc": "This code defines a dictionary named `abbreviations` that maps various abbreviations and slangs to their expanded forms for text normalization purposes.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "define_variables", "subclass_id": 77, "predicted_subclass_probability": 0.9984091 }, - "cluster": 1 + "cluster": 0 }, { "cell_id": 34, "code": "def convert_abb(x):\n word_list = x.split()\n r_string = []\n for word in word_list:\n r_string.append(abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word)\n return ' '.join(r_string)\n\ntest = 'afk hello world!'\nconvert_abb(test)", "class": "Data Transform", - "desc": "This code defines a function `convert_abb` that converts abbreviations in a given text to their full forms by looking up the `abbreviations` dictionary and demonstrates its use on a test string.", + "desc": "This code defines a function `convert_abb` that replaces abbreviations in a given text with their expanded forms using the `abbreviations` dictionary, and demonstrates its usage with a test string.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, "predicted_subclass_probability": 0.8039966 }, - "cluster": 1 + "cluster": 2 }, { "cell_id": 35, "code": "train_df['text'] = 
train_df.text.apply(convert_abb)\ntest_df['text'] = test_df.text.apply(convert_abb)", "class": "Data Transform", - "desc": "This code applies the `convert_abb` function to the 'text' column of both `train_df` and `test_df` DataFrames to replace abbreviations with their full forms in all the text entries.", + "desc": "This code applies the `convert_abb` function to the 'text' column of both the `train_df` and `test_df` DataFrames to replace abbreviations with their expanded forms.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9974021 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 36, "code": "train_df['clean_text'] = train_df.text.apply(lambda x: re.sub('https?://\\S+|www\\.\\S+', '', x))\ntest_df['clean_text'] = test_df.text.apply(lambda x: re.sub('https?://\\S+|www\\.\\S+', '', x))", "class": "Data Transform", - "desc": "This code creates a new column 'clean_text' in both `train_df` and `test_df` where URLs are removed from the text entries using regular expressions.", + "desc": "This code adds a 'clean_text' column to both `train_df` and `test_df` DataFrames, where each tweet's text has had URLs removed using a regular expression with the `re.sub` function.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9974329 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 37, "code": "train_df['clean_text'] = train_df.clean_text.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))\ntest_df['clean_text'] = test_df.clean_text.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))", "class": "Data Transform", - "desc": "This code updates the 'clean_text' column in both `train_df` and `test_df` DataFrames by removing non-ASCII characters to ensure the text only contains standard ASCII characters.", + "desc": "This code updates the 'clean_text' column in both `train_df` and `test_df` DataFrames by removing non-ASCII characters from each text using encoding and decoding with ASCII.", "testing": { "class": "Data_Transform", "subclass": "data_type_conversions", "subclass_id": 16, "predicted_subclass_probability": 0.9229938 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 38, "code": "non_alpha = string.punctuation + '0123456789'", "class": "Data Transform", - "desc": "This code creates a string `non_alpha` that includes all punctuation characters and digits, which can be used later for filtering or removing these characters from the text.", + "desc": "This code defines a string named `non_alpha` that contains all punctuation characters and digits for use in subsequent text processing tasks.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "define_variables", "subclass_id": 77, "predicted_subclass_probability": 0.9975495 }, - "cluster": 1 + "cluster": 4 }, { "cell_id": 39, "code": "train_df['clean_text'] = train_df.clean_text.apply(lambda x: x.translate(str.maketrans('','',non_alpha)))\ntest_df['clean_text'] = test_df.clean_text.apply(lambda x: x.translate(str.maketrans('','',non_alpha)))", "class": "Data Transform", - "desc": "This code updates the 'clean_text' column in both `train_df` and `test_df` DataFrames by removing all punctuation and numeric characters using translation mapping.", + "desc": "This code updates the 'clean_text' column in both `train_df` and `test_df` DataFrames by removing all punctuation characters and digits from each text using the `translate` method with a translation table created 
from `non_alpha`.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, "predicted_subclass_probability": 0.5500296 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 40, "code": "train_df['token_text'] = train_df.clean_text.str.lower()\ntrain_df['token_text'] = train_df.token_text.apply(lambda x: nltk.word_tokenize(x))\ntest_df['token_text'] = test_df.clean_text.str.lower()\ntest_df['token_text'] = test_df.token_text.apply(lambda x: nltk.word_tokenize(x))", "class": "Data Transform", - "desc": "This code creates a new column 'token_text' in both `train_df` and `test_df` where the cleaned text is first converted to lowercase and then tokenized into individual words using the NLTK library.", + "desc": "This code converts the 'clean_text' column in both `train_df` and `test_df` DataFrames to lowercase and tokenizes the text into words using the `nltk.word_tokenize` method, storing the results in a new 'token_text' column.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.3968303 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 41, "code": "stopwords = nltk.corpus.stopwords.words(\"english\")\naddingStopWords = ['im','get','dont','got','amp']\nstopwords.extend(addingStopWords)\ntrain_df['token_text'] = train_df['token_text'].apply(lambda x: [word for word in x if word not in stopwords])\ntest_df['token_text'] = test_df['token_text'].apply(lambda x: [word for word in x if word not in stopwords])", "class": "Data Transform", - "desc": "This code extends the NLTK stopwords list with additional common words and then filters out these stopwords from the 'token_text' column in both `train_df` and `test_df` DataFrames.", + "desc": "This code defines a list of English stopwords using the `nltk.corpus` and extends it with additional custom stopwords, then removes these stopwords from the 'token_text' column in both `train_df` and `test_df` DataFrames using a list comprehension.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9972505 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 42, "code": "# since we have a dataset containing text from social media, there might be many spelling mistakes and words which cannot be found in the word lemmatizer corpus, so they would be remained untouched.\n# To this end we would use the Porter Stemming which just removes affixes of the words\nfrom nltk.stem.porter import PorterStemmer\nps = PorterStemmer()\ntrain_df['token_text'] = train_df['token_text'].apply(lambda x: [ps.stem(word) for word in x ])\ntest_df['token_text'] = test_df['token_text'].apply(lambda x: [ps.stem(word) for word in x ])", "class": "Data Transform", - "desc": "This code applies Porter Stemming to the tokenized text in both `train_df` and `test_df` DataFrames to reduce words to their root forms by removing affixes, considering the potential for spelling mistakes in social media text.", + "desc": "This code uses the Porter Stemming method from the `nltk.stem.porter` module to reduce words in the 'token_text' column of both `train_df` and `test_df` DataFrames to their root forms by removing affixes.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.988744 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 43, "code": "train_df.reset_index(inplace=True, drop=True)", "class": "Data Transform", - "desc": "This code resets the index 
of the `train_df` DataFrame, dropping the old index to ensure a continuous integer-based index after data transformations.", + "desc": "This code resets the index of the `train_df` DataFrame, dropping the old index and creating a new default integer index.", "testing": { "class": "Data_Transform", "subclass": "drop_column", "subclass_id": 10, "predicted_subclass_probability": 0.9991253 }, - "cluster": 1 + "cluster": 7 }, { "cell_id": 45, "code": "def dummy(doc):\n return doc\ntfidf_vect = TfidfVectorizer(analyzer=dummy)", "class": "Data Transform", - "desc": "This code defines a custom dummy tokenizer function and initializes a `TfidfVectorizer` with this function to vectorize the text data as-is without additional tokenization.", + "desc": "This code initializes a `TfidfVectorizer` with a custom analyzer function named `dummy`, which allows the vectorizer to accept pre-tokenized input (lists of words) directly.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, "predicted_subclass_probability": 0.9941847 }, - "cluster": 1 + "cluster": 4 }, { "cell_id": 46, "code": "tfidf_fit = tfidf_vect.fit(train_df['token_text'])", "class": "Data Transform", - "desc": "This code fits the `TfidfVectorizer` to the 'token_text' column of the `train_df` DataFrame, learning the vocabulary and IDF (Inverse Document Frequency) values from the training data.", + "desc": "This code fits the `TfidfVectorizer` to the 'token_text' column of the `train_df` DataFrame, learning the vocabulary and IDF (inverse document frequency) from the training data.", "testing": { "class": "Model_Train", "subclass": "train_model", @@ -2932,187 +2944,187 @@ "cell_id": 47, "code": "matrix_train = tfidf_fit.transform(train_df['token_text'])\nmatrix_test = tfidf_fit.transform(test_df['token_text'])", "class": "Data Transform", - "desc": "This code transforms the 'token_text' column of both `train_df` and `test_df` DataFrames into TF-IDF feature matrices using the fitted `TfidfVectorizer`.", + "desc": "This code transforms the 'token_text' column of both `train_df` and `test_df` DataFrames into TF-IDF matrices using the fitted `TfidfVectorizer`.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, "predicted_subclass_probability": 0.9759149 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 49, "code": "counts_df_train = pd.DataFrame(matrix_train.toarray())\ncounts_df_test = pd.DataFrame(matrix_test.toarray())", "class": "Data Transform", - "desc": "This code converts the sparse TF-IDF feature matrices for the training and test datasets into dense DataFrames (`counts_df_train` and `counts_df_test`) for easier manipulation and analysis.", + "desc": "This code converts the sparse TF-IDF matrices `matrix_train` and `matrix_test` into dense pandas DataFrames named `counts_df_train` and `counts_df_test`.", "testing": { "class": "Data_Transform", "subclass": "create_dataframe", "subclass_id": 12, "predicted_subclass_probability": 0.9986713 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 50, "code": "train_df['length'] = train_df.text.apply(lambda x: len(x) - x.count(' '))\ntest_df['length'] = test_df.text.apply(lambda x: len(x) - x.count(' '))", "class": "Data Transform", - "desc": "This code adds a new column 'length' to both `train_df` and `test_df` DataFrames, calculating the length of each text entry by counting the number of characters excluding spaces.", + "desc": "This code adds a 'length' column to both `train_df` and `test_df` DataFrames, which calculates the number 
of characters in the text excluding spaces.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9990262 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 51, "code": "train_df['punct_perc'] = train_df.text.apply(lambda x: sum([1 for char in x if char in non_alpha])/(len(x) - x.count(' '))*100)\ntest_df['punct_perc'] = test_df.text.apply(lambda x: sum([1 for char in x if char in non_alpha])/(len(x) - x.count(' '))*100)", "class": "Data Transform", - "desc": "This code adds a new column 'punct_perc' to both `train_df` and `test_df` DataFrames, calculating the percentage of punctuation and numeric characters in each text entry relative to the length of the text excluding spaces.", + "desc": "This code adds a 'punct_perc' column to both `train_df` and `test_df` DataFrames, which calculates the percentage of characters in the text that are punctuation or digits, relative to the total number of non-space characters.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9990037 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 52, "code": "train_df['word_count'] = train_df.token_text.apply(len)\ntest_df['word_count'] = train_df.token_text.apply(len)", "class": "Data Transform", - "desc": "This code adds a new column 'word_count' to both `train_df` and `test_df` DataFrames, calculating the number of words in the tokenized text for each entry.", + "desc": "This code adds a 'word_count' column to both `train_df` and `test_df` DataFrames, which calculates the number of tokens (words) in each text by taking the length of the 'token_text' lists.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9987459 }, - "cluster": 1 + "cluster": 2 }, { "cell_id": 54, "code": "train_df['length_int'] =pd.cut(train_df.length, 14, include_lowest=True)\n #bins=[0, 15, 30, 40, 50,60, 80, 100, 120, 140, 180]\ntest_df['length_int'] =pd.cut(test_df.length, 14, include_lowest=True)\n #bins=[0, 15, 30, 40, 50,60, 80, 100, 120, 140, 180]", "class": "Data Transform", - "desc": "This code creates a new column 'length_int' in both `train_df` and `test_df` DataFrames by binning the 'length' values into 14 intervals, facilitating categorical analysis of text lengths.", + "desc": "This code creates a new 'length_int' column in both `train_df` and `test_df` DataFrames, which categorizes the 'length' feature into 14 equal-width bins using the `pd.cut` function, and includes the lowest value in the first bin.", "testing": { "class": "Visualization", "subclass": "relationship", "subclass_id": 81, "predicted_subclass_probability": 0.30801016 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 59, "code": "train_df['length'] = train_df['length']**2.3\ntest_df['length'] = train_df['length']**2.3", "class": "Data Transform", - "desc": "This code applies a power transformation of 2.3 to the 'length' column in both `train_df` and `test_df` DataFrames, modifying the data to potentially improve model performance.", + "desc": "This code updates the 'length' column in both `train_df` and `test_df` DataFrames by raising the 'length' values to the 2.3rd power to transform the distribution.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.9994217 }, - "cluster": 1 + "cluster": 7 }, { "cell_id": 61, "code": "train_df['punct_perc'] = 
train_df['punct_perc']**(1/3)\ntest_df['punct_perc'] = train_df['punct_perc']**(1/3)", "class": "Data Transform", - "desc": "This code applies a cube root transformation to the 'punct_perc' column in both `train_df` and `test_df` DataFrames, modifying the data to potentially normalize the distribution.", + "desc": "This code updates the 'punct_perc' column in both `train_df` and `test_df` DataFrames by applying a cube root transformation to the 'punct_perc' values.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, "predicted_subclass_probability": 0.99934 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 64, "code": "# assign an emotion to each tweet\ntrain_df['emotion'] = train_df.text.apply(lambda x: te.get_emotion(x))\ntest_df['emotion'] = test_df.text.apply(lambda x: te.get_emotion(x))", "class": "Data Transform", - "desc": "This code adds a new column 'emotion' to both `train_df` and `test_df` DataFrames, assigning an emotion profile to each tweet using the `text2emotion` library.", + "desc": "This code adds an 'emotion' column to both `train_df` and `test_df` DataFrames, which assigns a dictionary of emotions to each tweet by using the `te.get_emotion` function from the `text2emotion` library.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, "predicted_subclass_probability": 0.8783199 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 65, "code": "# exploding the dictionary into 4 different columns, based on the dictionary keys\ntrain_df = pd.concat([train_df, pd.DataFrame(train_df['emotion'].tolist())], axis =1)\ntest_df = pd.concat([test_df, pd.DataFrame(test_df['emotion'].tolist())], axis =1)", "class": "Data Transform", - "desc": "This code expands the 'emotion' dictionary column into separate columns for each emotion (e.g., 'Sad', 'Happy'), adding these as new columns to both `train_df` and `test_df` DataFrames.", + "desc": "This code expands the 'emotion' dictionary into separate columns based on the dictionary keys for both `train_df` and `test_df` DataFrames by using the `pd.concat` method to concatenate the original DataFrame with a new DataFrame created from the 'emotion' column.", "testing": { "class": "Data_Transform", "subclass": "concatenate", "subclass_id": 11, "predicted_subclass_probability": 0.99380034 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 72, "code": "train_df['sentiment'] = train_df.text.astype(str).apply(lambda x: sia.polarity_scores(x))\ntest_df['sentiment'] = test_df.text.astype(str).apply(lambda x: sia.polarity_scores(x))", "class": "Data Transform", - "desc": "This code adds a new column 'sentiment' to both `train_df` and `test_df` DataFrames, calculating sentiment scores for each tweet using the `SentimentIntensityAnalyzer`.", + "desc": "This code adds a 'sentiment' column to both `train_df` and `test_df` DataFrames, which assigns a dictionary of sentiment scores to each tweet by using the `sia.polarity_scores` function from the `SentimentIntensityAnalyzer`.", "testing": { "class": "Data_Transform", "subclass": "data_type_conversions", "subclass_id": 16, "predicted_subclass_probability": 0.9847083 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 73, "code": "train_df = pd.concat([train_df, pd.DataFrame(train_df['sentiment'].tolist())], axis =1)\ntest_df = pd.concat([test_df, pd.DataFrame(test_df['sentiment'].tolist())], axis =1)", "class": "Data Transform", - "desc": "This code expands the 'sentiment' dictionary column into separate columns for each sentiment score 
(e.g., 'neg', 'neu', 'pos', 'compound'), adding these as new columns to both `train_df` and `test_df` DataFrames.", + "desc": "This code expands the 'sentiment' dictionary into separate columns for both `train_df` and `test_df` DataFrames by using the `pd.concat` method to concatenate the original DataFrame with a new DataFrame created from the 'sentiment' column.", "testing": { "class": "Data_Transform", "subclass": "concatenate", "subclass_id": 11, "predicted_subclass_probability": 0.98916924 }, - "cluster": 1 + "cluster": 3 }, { "cell_id": 78, "code": "#full_train_df = pd.concat([train_df.drop(['location','keyword','text','clean_text','token_text','sentiment','neg','neu','pos','word_count','length_int'], axis=1), counts_df_train], axis=1)\nfull_train_df = pd.concat([train_df.drop(['location','keyword','text','clean_text','token_text','sentiment','neg','neu','pos','word_count','length_int','Happy', 'Angry', 'Surprise', 'Sad', 'Fear'], axis=1), counts_df_train], axis=1)", "class": "Data Transform", - "desc": "This code creates a new DataFrame `full_train_df` by concatenating the `train_df` DataFrame after dropping multiple specified columns with `counts_df_train`, which contains the TF-IDF features.", + "desc": "This code creates a new DataFrame `full_train_df` by concatenating the `train_df` DataFrame, with specified columns dropped, and the `counts_df_train` DataFrame containing TF-IDF features.", "testing": { "class": "Data_Transform", "subclass": "drop_column", "subclass_id": 10, "predicted_subclass_probability": 0.64140093 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 79, "code": "#full_test_df = pd.concat([test_df.drop(['location','keyword','text','clean_text','token_text','sentiment','neg','neu','pos','word_count','length_int'], axis=1), counts_df_test], axis=1)\nfull_test_df = pd.concat([test_df.drop(['location','keyword','text','clean_text','token_text','sentiment','neg','neu','pos','word_count','length_int','Happy', 'Angry', 'Surprise', 'Sad', 'Fear'], axis=1), counts_df_test], axis=1)", "class": "Data Transform", - "desc": "This code creates a new DataFrame `full_test_df` by concatenating the `test_df` DataFrame after dropping multiple specified columns with `counts_df_test`, which contains the TF-IDF features.", + "desc": "This code creates a new DataFrame `full_test_df` by concatenating the `test_df` DataFrame, with specified columns dropped, and the `counts_df_test` DataFrame containing TF-IDF features.", "testing": { "class": "Data_Transform", "subclass": "drop_column", "subclass_id": 10, "predicted_subclass_probability": 0.699594 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 80, "code": "# deleting unnecessary dataframes to save memory\ndel train_df\ndel test_df\ndel counts_df_train\ndel counts_df_test", "class": "Data Transform", - "desc": "This code deletes the `train_df`, `test_df`, `counts_df_train`, and `counts_df_test` DataFrames to free up memory resources.", + "desc": "This code deletes the `train_df`, `test_df`, `counts_df_train`, and `counts_df_test` DataFrames to free up memory.", "testing": { "class": "Data_Transform", "subclass": "drop_column", "subclass_id": 10, "predicted_subclass_probability": 0.9989153 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 3, "code": "train_df.head(5)", "class": "Exploratory Data Analysis", - "desc": "This code displays the first five rows of the `train_df` DataFrame to give a quick overview of the dataset's structure and initial contents.", + "desc": "This code displays the first 5 rows of the `train_df` DataFrame using 
the pandas `head` method.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", @@ -3124,175 +3136,175 @@ "cell_id": 4, "code": "train_df.info()", "class": "Exploratory Data Analysis", - "desc": "This code provides a concise summary of the `train_df` DataFrame, including the number of non-null entries, data types of columns, and memory usage.", + "desc": "This code provides a concise summary of the `train_df` DataFrame using the pandas `info` method, which includes details such as the number of non-null entries and data types for each column.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table_attributes", "subclass_id": 40, "predicted_subclass_probability": 0.9993624 }, - "cluster": 11 + "cluster": 0 }, { "cell_id": 5, "code": "test_df.info()", "class": "Exploratory Data Analysis", - "desc": "This code provides a concise summary of the `test_df` DataFrame, including the number of non-null entries, data types of columns, and memory usage.", + "desc": "This code provides a concise summary of the `test_df` DataFrame using the pandas `info` method, which includes details such as the number of non-null entries and data types for each column.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table_attributes", "subclass_id": 40, "predicted_subclass_probability": 0.9993699 }, - "cluster": 11 + "cluster": 0 }, { "cell_id": 6, "code": "print(f\"Rows | Columns\\n{train_df.shape}\")\nprint(test_df.shape)", "class": "Exploratory Data Analysis", - "desc": "This code prints the number of rows and columns of both the `train_df` and `test_df` DataFrames, providing a quick understanding of the dataset sizes.", + "desc": "This code prints the number of rows and columns in the `train_df` and `test_df` DataFrames by displaying their shapes.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_shape", "subclass_id": 58, "predicted_subclass_probability": 0.99951255 }, - "cluster": 7 + "cluster": 3 }, { "cell_id": 9, "code": "# we notice many missing values for the location column", "class": "Exploratory Data Analysis", - "desc": "This code comments on the observation that there are many missing values in the 'location' column, as revealed by the previous visualizations and analyses.", + "desc": "This comment notes the observation of numerous missing values in the 'location' column of the DataFrames.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, "predicted_subclass_probability": 0.2938043 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 11, "code": "# checking for duplicates\nprint(train_df.duplicated(subset=['text']).sum())\nprint(test_df.duplicated(subset=['text']).sum())", "class": "Exploratory Data Analysis", - "desc": "This code checks for and prints the number of duplicate rows in the 'text' column within both the `train_df` and `test_df` DataFrames.", + "desc": "This code checks for duplicate entries in the 'text' column of both the `train_df` and `test_df` DataFrames and prints the number of detected duplicates.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "count_duplicates", "subclass_id": 38, "predicted_subclass_probability": 0.87618726 }, - "cluster": -1 + "cluster": 2 }, { "cell_id": 12, "code": "print(train_df.duplicated(subset=['text','target']).sum())", "class": "Exploratory Data Analysis", - "desc": "This code checks for and prints the number of duplicate rows in the 'text' and 'target' columns combined within the `train_df` 
DataFrame.", + "desc": "This code checks for duplicate entries based on the 'text' and 'target' columns in the `train_df` DataFrame and prints the number of detected duplicates.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "count_duplicates", "subclass_id": 38, "predicted_subclass_probability": 0.7811897 }, - "cluster": -1 + "cluster": 4 }, { "cell_id": 17, "code": "train_df.shape", "class": "Exploratory Data Analysis", - "desc": "This code displays the dimensions of the `train_df` DataFrame to show the size of the dataset after removing duplicates.", + "desc": "This code outputs the shape of the `train_df` DataFrame after duplicates have been removed, which indicates the number of rows and columns.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_shape", "subclass_id": 58, "predicted_subclass_probability": 0.9995821 }, - "cluster": 10 + "cluster": 1 }, { "cell_id": 22, "code": "# Somes actual tweets (non-disaster)\nprint(train_df.text[train_df['target'] == 0][:10].values)", "class": "Exploratory Data Analysis", - "desc": "This code prints the first ten entries of the 'text' column from the `train_df` DataFrame where the 'target' is 0, showcasing examples of non-disaster tweets.", + "desc": "This code prints the text of the first 10 non-disaster tweets (where 'target' is 0) from the `train_df` DataFrame.", "testing": { "class": "Data_Transform", "subclass": "filter", "subclass_id": 14, "predicted_subclass_probability": 0.8921785 }, - "cluster": 4 + "cluster": -1 }, { "cell_id": 23, "code": "print(train_df.text[train_df['target'] == 1][:10].values)", "class": "Exploratory Data Analysis", - "desc": "This code prints the first ten entries of the 'text' column from the `train_df` DataFrame where the 'target' is 1, showcasing examples of disaster-related tweets.", + "desc": "This code prints the text of the first 10 disaster tweets (where 'target' is 1) from the `train_df` DataFrame.", "testing": { "class": "Data_Transform", "subclass": "filter", "subclass_id": 14, "predicted_subclass_probability": 0.5046063 }, - "cluster": 4 + "cluster": -1 }, { "cell_id": 48, "code": "print(matrix_train.shape)\nprint(matrix_test.shape)", "class": "Exploratory Data Analysis", - "desc": "This code prints the dimensions of the TF-IDF feature matrices for the training (`matrix_train`) and test (`matrix_test`) datasets, giving an overview of the transformed data size.", + "desc": "This code prints the shapes of the `matrix_train` and `matrix_test` TF-IDF matrices to show the dimensions of the transformed training and test datasets.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_shape", "subclass_id": 58, "predicted_subclass_probability": 0.9989506 }, - "cluster": -1 + "cluster": 2 }, { "cell_id": 66, "code": "total_emotions = train_df[['Happy', 'Angry', 'Surprise', 'Sad', 'Fear','target']].groupby('target').sum()", "class": "Exploratory Data Analysis", - "desc": "This code aggregates the total counts of each emotion ('Happy', 'Angry', 'Surprise', 'Sad', 'Fear') grouped by the 'target' variable in the `train_df` DataFrame.", + "desc": "This code calculates the sum of each emotion type for both non-disaster and disaster tweets in the `train_df` DataFrame by grouping the data by the 'target' column.", "testing": { "class": "Data_Transform", "subclass": "groupby", "subclass_id": 60, "predicted_subclass_probability": 0.99574137 }, - "cluster": -1 + "cluster": 2 }, { "cell_id": 67, "code": "mean_emotions = train_df[['Happy', 'Angry', 'Surprise', 'Sad', 
'Fear','target']].groupby('target').mean()", "class": "Exploratory Data Analysis", - "desc": "This code calculates the mean values of each emotion ('Happy', 'Angry', 'Surprise', 'Sad', 'Fear') grouped by the 'target' variable in the `train_df` DataFrame.", + "desc": "This code calculates the mean value of each emotion type for both non-disaster and disaster tweets in the `train_df` DataFrame by grouping the data by the 'target' column.", "testing": { "class": "Data_Transform", "subclass": "groupby", "subclass_id": 60, "predicted_subclass_probability": 0.99561524 }, - "cluster": -1 + "cluster": 2 }, { "cell_id": 70, "code": "# From the graphs and tables above we notice that, in cases of actual disasters the tweets have greater fear score\n# while for the non-disaster the mean score of 'Happy' is higher.\n# however those variables appear not to have a great importance. They might be dropped from the model.", "class": "Exploratory Data Analysis", - "desc": "This code comments on the observation that tweets related to actual disasters have a higher 'Fear' score, while non-disaster tweets have a higher 'Happy' score, but suggests these variables might be dropped from the model due to their low importance.", + "desc": "This comment notes the observed differences in emotion scores for disaster and non-disaster tweets, suggesting that the 'Happy' and 'Fear' variables may be less important and could be dropped from the model.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "commented", "subclass_id": 76, "predicted_subclass_probability": 0.57685375 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 74, "code": "mean_sentiment = train_df[['neg', 'neu', 'pos', 'compound','target']].groupby('target').mean()\ntotal_sentiment = train_df[['neg', 'neu', 'pos', 'compound','target']].groupby('target').sum()", "class": "Exploratory Data Analysis", - "desc": "This code calculates both the mean and total sentiment scores ('neg', 'neu', 'pos', 'compound') grouped by the 'target' variable in the `train_df` DataFrame.", + "desc": "This code calculates the mean and total sentiment scores for both non-disaster and disaster tweets in the `train_df` DataFrame by grouping the data by the 'target' column.", "testing": { "class": "Data_Transform", "subclass": "groupby", "subclass_id": 60, "predicted_subclass_probability": 0.9969091 }, - "cluster": -1 + "cluster": 2 }, { "cell_id": 0, "code": "# importing libraries\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom IPython.display import display\nfrom collections import Counter, defaultdict\n%matplotlib inline", "class": "Imports and Environment", - "desc": "This code imports several essential libraries for data manipulation (pandas and numpy), visualization (matplotlib and seaborn), and other utilities (IPython.display, collections) while setting up inline plotting for Jupyter notebooks.", + "desc": "This code imports necessary libraries including pandas for data manipulation, numpy for numerical operations, matplotlib and seaborn for data visualization, and sets the inline display for matplotlib plots.", "testing": { "class": "Imports_and_Environment", "subclass": "set_options", @@ -3304,7 +3316,7 @@ "cell_id": 1, "code": "# display 100 columns\npd.set_option('display.max_columns', 100)", "class": "Imports and Environment", - "desc": "This code sets the display option in pandas to show up to 100 columns of a DataFrame when printed.", + "desc": "This code sets the pandas display option to show up to 100 
columns when displaying DataFrames in the notebook.", "testing": { "class": "Imports_and_Environment", "subclass": "set_options", @@ -3323,12 +3335,12 @@ "subclass_id": 22, "predicted_subclass_probability": 0.9992773 }, - "cluster": -1 + "cluster": 1 }, { "cell_id": 32, "code": "import string\nimport emoji\nimport nltk\nimport re", "class": "Imports and Environment", - "desc": "This code imports several libraries and modules for text processing and natural language processing, including `string`, `emoji`, `nltk`, and `re`.", + "desc": "This code imports the `string`, `emoji`, and `re` libraries, along with the `nltk` package, which are typically used for text processing and natural language processing tasks.", "testing": { "class": "Imports_and_Environment", "subclass": "import_modules", @@ -3340,19 +3352,19 @@ "cell_id": 44, "code": "from sklearn.feature_extraction.text import TfidfVectorizer", "class": "Imports and Environment", - "desc": "This code imports the `TfidfVectorizer` from the `sklearn.feature_extraction.text` module, which is used to convert text data into TF-IDF feature vectors for machine learning models.", + "desc": "This code imports the `TfidfVectorizer` class from the `sklearn.feature_extraction.text` module, which is used to convert a collection of raw documents to a matrix of TF-IDF features.", "testing": { "class": "Imports_and_Environment", "subclass": "import_modules", "subclass_id": 22, "predicted_subclass_probability": 0.99927324 }, - "cluster": -1 + "cluster": 0 }, { "cell_id": 62, "code": "!pip install text2emotion", "class": "Imports and Environment", - "desc": "This code installs the `text2emotion` library, which is used for extracting emotional information from text data.", + "desc": "This code installs the `text2emotion` library, which is used for extracting emotions from text.", "testing": { "class": "Imports_and_Environment", "subclass": "install_modules", @@ -3364,7 +3376,7 @@ "cell_id": 63, "code": "import text2emotion as te", "class": "Imports and Environment", - "desc": "This code imports the `text2emotion` library, which is used to analyze and extract emotions from text data.", + "desc": "This code imports the `text2emotion` library as `te` for extracting emotions from textual data.", "testing": { "class": "Imports_and_Environment", "subclass": "import_modules", @@ -3376,7 +3388,7 @@ "cell_id": 71, "code": "!pip install twython\nfrom nltk.sentiment import SentimentIntensityAnalyzer\nsia = SentimentIntensityAnalyzer()", "class": "Imports and Environment", - "desc": "This code installs the `twython` library and imports the `SentimentIntensityAnalyzer` from NLTK, initializing it for sentiment analysis.", + "desc": "This code installs the `twython` library and imports the `SentimentIntensityAnalyzer` from the `nltk.sentiment` module to perform sentiment analysis.", "testing": { "class": "Imports_and_Environment", "subclass": "install_modules", @@ -3388,19 +3400,19 @@ "cell_id": 81, "code": "from sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import GridSearchCV", "class": "Imports and Environment", - "desc": "This code imports the `RandomForestClassifier` from `sklearn.ensemble` and `GridSearchCV` from `sklearn.model_selection` for building and tuning a machine learning model.", + "desc": "This code imports the `RandomForestClassifier` from `sklearn.ensemble` and the `GridSearchCV` from `sklearn.model_selection` for model training and hyperparameter tuning.", "testing": { "class": "Imports_and_Environment", "subclass": 
"import_modules", "subclass_id": 22, "predicted_subclass_probability": 0.99927956 }, - "cluster": -1 + "cluster": 0 }, { "cell_id": 85, "code": "pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending = False)", "class": "Model Evaluation", - "desc": "This code creates a DataFrame from the cross-validation results of `GridSearchCV` and sorts it by the 'mean_test_score' in descending order to identify the best-performing hyperparameter configurations.", + "desc": "This code creates a DataFrame from the cross-validation results of the `GridSearchCV` fitting process and sorts it by the 'mean_test_score' in descending order to evaluate the best-performing hyperparameters and their scores.", "testing": { "class": "Data_Transform", "subclass": "sort_values", @@ -3412,6627 +3424,6668 @@ "cell_id": 88, "code": "predictions_rf = rf.predict(full_test_df.drop(['id'], axis=1))", "class": "Model Evaluation", - "desc": "This code generates predictions using the trained `RandomForestClassifier` on the test data in `full_test_df`, excluding the 'id' column.", + "desc": "This code generates predictions using the trained `RandomForestClassifier` on the test data from `full_test_df`, excluding the 'id' column.", "testing": { "class": "Model_Evaluation", "subclass": "predict_on_test", "subclass_id": 48, "predicted_subclass_probability": 0.9943375 }, - "cluster": 0 + "cluster": 1 }, { "cell_id": 82, "code": "forest = RandomForestClassifier()\nparam = {'n_estimators':[200, 500],\n 'max_depth':[200, 300]}", "class": "Model Training", - "desc": "This code initializes a `RandomForestClassifier` and defines a parameter grid `param` for tuning the model, specifically varying the number of estimators and maximum depth.", + "desc": "This code initializes a `RandomForestClassifier` and defines a dictionary `param` specifying the values for the 'n_estimators' and 'max_depth' hyperparameters to be tuned.", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, "predicted_subclass_probability": 0.99327236 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 83, "code": "gs = GridSearchCV(forest, param, cv=3)", "class": "Model Training", - "desc": "This code initializes a `GridSearchCV` object with the `RandomForestClassifier` and the specified parameter grid, using 3-fold cross-validation to find the best hyperparameters.", + "desc": "This code sets up a `GridSearchCV` object named `gs` to perform cross-validated grid search for hyperparameter tuning of the `RandomForestClassifier` using the parameters specified in the `param` dictionary with 3-fold cross-validation.", "testing": { "class": "Model_Train", "subclass": "train_on_grid", "subclass_id": 6, "predicted_subclass_probability": 0.40436822 }, - "cluster": 0 + "cluster": 1 }, { "cell_id": 84, "code": "gs_fit = gs.fit(full_train_df.drop(['target','id'], axis=1), full_train_df['target'])", "class": "Model Training", - "desc": "This code fits the `GridSearchCV` on the training data (`full_train_df`), excluding the 'target' and 'id' columns, to find the best hyperparameters for the `RandomForestClassifier`.", + "desc": "This code fits the `GridSearchCV` object `gs` to the training data from `full_train_df`, excluding the 'target' and 'id' columns, to find the best hyperparameters for the `RandomForestClassifier`.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, "predicted_subclass_probability": 0.9996861 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 86, "code": "rf = 
RandomForestClassifier(n_estimators=200, max_depth=200)", "class": "Model Training", - "desc": "This code initializes a `RandomForestClassifier` with the best-performing hyperparameters found from the GridSearchCV, specifically setting `n_estimators` to 200 and `max_depth` to 200.", + "desc": "This code initializes a `RandomForestClassifier` with the specified hyperparameters: 200 trees (`n_estimators=200`) and a maximum depth of 200 (`max_depth=200`).", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, "predicted_subclass_probability": 0.9980399 }, - "cluster": -1 + "cluster": 1 }, { "cell_id": 87, "code": "rf.fit(full_train_df.drop(['target','id'], axis=1), full_train_df['target'])", "class": "Model Training", - "desc": "This code trains the `RandomForestClassifier` with the specified parameters on the training data in `full_train_df`, excluding the 'target' and 'id' columns.", + "desc": "This code trains the `RandomForestClassifier` using the training data from `full_train_df`, excluding the 'target' and 'id' columns.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, "predicted_subclass_probability": 0.99969363 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 7, "code": "table = plt.table(cellText=[[train_df.location.isnull().sum()], [test_df.location.isnull().sum()]],\n rowLabels=['Train Data', 'Test Data'],\n colLabels=['Number of Missing values'],\n loc='top')\nplt.box(on=None)\nplt.axis('off')\n# plt.subplots_adjust(top = 1, bottom = 0.1, right = 1, left = 0, \n# hspace = 0, wspace = 0)\n#plt.margins(0,0)\ntable.set_fontsize(14);", "class": "Visualization", - "desc": "This code creates and displays a table within a plot to showcase the number of missing values in the 'location' column for both the training and test datasets.", + "desc": "This code creates a table using Matplotlib to display the number of missing values in the 'location' column of both the training and test DataFrames, `train_df` and `test_df`, and formats its appearance.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.86134624 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 8, "code": "# checking for missing values\nfig, (ax1, ax2) = plt.subplots(2,1, figsize=(8,5))\n\nsns.heatmap(train_df.isnull(),yticklabels=False,cbar=False,cmap='flare', ax = ax1)\nax1.tick_params(axis='x', labelsize=13, rotation = 45)\nax1.set_title('train data')\n\nsns.heatmap(test_df.isnull(),yticklabels=False,cbar=False,cmap='flare', ax = ax2)\nax2.tick_params(axis='x', labelsize=13, rotation = 45)\nax2.set_title('test data')\nfig.tight_layout();", "class": "Visualization", - "desc": "This code generates heatmaps to visualize the distribution of missing values in both the training (`train_df`) and test (`test_df`) datasets, highlighting missing data patterns.", + "desc": "This code creates heatmaps using Seaborn to visually represent the location of missing values in the `train_df` and `test_df` DataFrames, arranging them in a vertically stacked layout with customized tick parameters.", "testing": { "class": "Visualization", "subclass": "heatmap", "subclass_id": 80, "predicted_subclass_probability": 0.99526525 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 18, "code": "fig = plt.figure(figsize=(20,5))\ntrain_df.keyword.value_counts().head(25).plot(kind = 'bar')\nplt.tick_params(axis='x', labelsize=11, rotation = 45)\nplt.title('Top keywords');", "class": "Visualization", - "desc": "This 
code generates a bar plot displaying the top 25 most frequent keywords in the `train_df` DataFrame, providing insights into the most common keywords in the training data.", + "desc": "This code generates a bar plot using Matplotlib to visualize the top 25 most frequent keywords in the `train_df` DataFrame, with customized tick parameters and a title.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99566513 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 19, "code": "fig = plt.figure(figsize=(20,5))\ntrain_df.location.value_counts().head(25).plot(kind = 'bar')\nplt.tick_params(axis='x', labelsize=9, rotation = 45)\nplt.title('Top Locations');", "class": "Visualization", - "desc": "This code generates a bar plot displaying the top 25 most frequent locations in the `train_df` DataFrame, providing insights into the most common locations in the training data.", + "desc": "This code generates a bar plot using Matplotlib to visualize the top 25 most frequent locations in the `train_df` DataFrame, with customized tick parameters and a title.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99453616 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 20, "code": "fig = plt.figure(figsize=(7,5))\nsns.countplot(data = train_df, x = \"target\")\nplt.title('Non-actual vs actual distater tweets counts');", "class": "Visualization", - "desc": "This code generates a count plot to visualize the distribution of non-disaster (0) and disaster (1) tweets in the `train_df` DataFrame.", + "desc": "This code generates a count plot using Seaborn to visualize the distribution of non-actual versus actual disaster tweets in the `train_df` DataFrame, with a specified title.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9882289 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 21, "code": "fig = plt.figure(figsize=(7,5))\n(train_df.groupby(['target']).count() / len(train_df['target']))['id'].plot(kind='bar', width = 0.85, color = ['tomato', 'steelblue'])\nplt.title('Non-actual vs actual distater tweets percentage');", "class": "Visualization", - "desc": "This code generates a bar plot to visualize the percentage distribution of non-disaster (0) and disaster (1) tweets in the `train_df` DataFrame.", + "desc": "This code creates a bar plot using Matplotlib to visualize the percentage distribution of non-actual versus actual disaster tweets in the `train_df` DataFrame by grouping based on the 'target' column.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9961461 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 25, "code": "wordcloud_non_dis = WordCloud( background_color='white',\n width=600,\n height=400).generate(\" \".join(train_df.text[train_df['target'] == 0]))\nwordcloud_dis = WordCloud( background_color='white',\n width=600,\n height=400).generate(\" \".join(train_df.text[train_df['target'] == 1]))", "class": "Visualization", - "desc": "This code generates two word clouds, one for non-disaster tweets and one for disaster tweets, by concatenating the text of tweets with respective targets and using the `WordCloud` class.", + "desc": "This code generates two word clouds using the WordCloud library, one for non-disaster tweets and one for disaster tweets, by concatenating the text of tweets with 'target' 0 and 1 
respectively.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.68369985 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 26, "code": "fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,14))\n\nax1.imshow(wordcloud_non_dis, interpolation='bilinear')\nax1.set_title('Non disaster')\nax1.axis(\"off\")\nax2.imshow(wordcloud_dis, interpolation='bilinear')\nax2.set_title('Disaster')\nax2.axis(\"off\");", "class": "Visualization", - "desc": "This code creates a side-by-side display of two word clouds generated from non-disaster and disaster tweets, providing a visual comparison of the common words in each category.", + "desc": "This code creates a side-by-side visualization of the word clouds for non-disaster and disaster tweets using Matplotlib, with titles and axes turned off.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99756926 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 30, "code": "fig, axes = plt.subplots(1,2,figsize=(25,6))\nax1 = train_df[train_df['target']==0]['location'].value_counts().head(25).plot(kind = 'bar', color = 'b', alpha = 0.5, ax=axes[0])\nax1.tick_params(axis='x', labelsize=9, rotation = 45)\nax1.set_title('Non-Disaster')\nax2 = train_df[train_df['target']==1]['location'].value_counts().head(25).plot(kind = 'bar', color = 'r', alpha = 0.5, ax=axes[1])\nax2.tick_params(axis='x', labelsize=9, rotation = 45)\nax2.set_title('Disaster');", "class": "Visualization", - "desc": "This code generates two bar plots to visualize the top 25 locations for non-disaster and disaster tweets separately, providing a side-by-side comparison of their geographical distribution.", + "desc": "This code generates two side-by-side bar plots using Matplotlib to display the top 25 most frequent locations for non-disaster and disaster tweets from the `train_df` DataFrame, with customized tick parameters and titles.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99818254 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 31, "code": "fig, axes = plt.subplots(1,2,figsize=(25,6))\nax1 = train_df[train_df['target']==0]['keyword'].value_counts().head(25).plot(kind = 'bar', color = 'b', alpha = 0.5, ax=axes[0])\nax1.tick_params(axis='x', labelsize=9, rotation = 45)\nax1.set_title('Top Keywords (non disaster)')\nax2 = train_df[train_df['target']==1]['keyword'].value_counts().head(25).plot(kind = 'bar', color = 'r', alpha = 0.5, ax=axes[1])\nax2.tick_params(axis='x', labelsize=9, rotation = 45)\nax2.set_title('Top Keywords (disaster)');", "class": "Visualization", - "desc": "This code generates two bar plots to visualize the top 25 keywords for non-disaster and disaster tweets separately, providing a side-by-side comparison of the most common keywords in each category.", + "desc": "This code generates two side-by-side bar plots using Matplotlib to display the top 25 most frequent keywords for non-disaster and disaster tweets from the `train_df` DataFrame, with customized tick parameters and titles.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.981807 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 53, "code": "fig, axes = plt.subplots(1,2,figsize=(17,5))\nsns.histplot(data = train_df, \n x= 'length', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[0])\nsns.boxplot(data = 
train_df, x = 'target', y = 'length',ax=axes[1]);", "class": "Visualization", - "desc": "This code generates a histogram and a box plot to visualize the distribution and summary statistics of text lengths in the `train_df` DataFrame, with separate categories for non-disaster and disaster tweets.", + "desc": "This code generates a side-by-side visualization consisting of a histogram with density estimates and a box plot using Seaborn to display the distribution and variation of the 'length' feature for non-disaster and disaster tweets in the `train_df` DataFrame.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9957883 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 55, "code": "fig = plt.figure(figsize=(7,5))\ntrain_df[train_df['target']==0]['length_int'].value_counts(sort=False).plot(kind='bar', alpha = 0.5, color='blue', label = 'No')\ntrain_df[train_df['target']==1]['length_int'].value_counts(sort=False).plot(kind='bar', alpha = 0.5, color='orange', label = 'Yes')\nplt.legend(title='Actual Disaster', fontsize=11, title_fontsize=12);\n ", "class": "Visualization", - "desc": "This code generates a bar plot to visualize the distribution of non-disaster and disaster tweets across different text length intervals in the `train_df` DataFrame.", + "desc": "This code creates a bar plot using Matplotlib to visualize the distribution of the 'length_int' bins for non-disaster and disaster tweets in the `train_df` DataFrame, with different colors and a legend for clarity.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9890703 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 56, "code": "fig, axes = plt.subplots(1,2,figsize=(17,5))\nsns.histplot(data = train_df, \n x= 'punct_perc', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[0])\nsns.boxplot(data = train_df, x = 'target', y = 'punct_perc',ax=axes[1]);", "class": "Visualization", - "desc": "This code generates a histogram and a box plot to visualize the distribution and summary statistics of punctuation percentage in the text for non-disaster and disaster tweets in the `train_df` DataFrame.", + "desc": "This code generates a side-by-side visualization consisting of a histogram with density estimates and a box plot using Seaborn to display the distribution and variation of the 'punct_perc' feature for non-disaster and disaster tweets in the `train_df` DataFrame.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99661547 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 57, "code": "fig, axes = plt.subplots(1,2,figsize=(17,5))\nsns.histplot(data = train_df, \n x= 'word_count', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[0])\nsns.boxplot(data = train_df, x = 'target', y = 'word_count',ax=axes[1]);", "class": "Visualization", - "desc": "This code generates a histogram and a box plot to visualize the distribution and summary statistics of word count in the text for non-disaster and disaster tweets in the `train_df` DataFrame.", + "desc": "This code generates a side-by-side visualization consisting of a histogram with density estimates and a box plot using Seaborn to display the distribution and variation of the 'word_count' feature for non-disaster and disaster tweets in the `train_df` DataFrame.", "testing": { "class": "Visualization", "subclass": 
"distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9946801 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 58, "code": "## data transformation\nplt.hist(train_df[train_df.target==1]['length']**2.3, bins = 40, color = 'blue', alpha=0.5)\nplt.hist(train_df[train_df.target==0]['length']**2.3, bins = 40, color = 'red', alpha=0.5);\n## by transforming the distribution into a bimodal we can notice that the data are more separated", "class": "Visualization", - "desc": "This code generates histograms to visualize the lengths of non-disaster and disaster tweets in the `train_df` DataFrame after raising the lengths to the power of 2.3, illustrating that the transformation results in a more bimodal distribution.", + "desc": "This code generates a histogram using Matplotlib to display the distribution of the 'length' feature raised to the 2.3rd power for non-disaster and disaster tweets in the `train_df` DataFrame, illustrating that the transformation creates a more bimodal and separable distribution.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9983096 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 60, "code": "plt.hist(train_df['punct_perc']**(1/3), bins = 40);", "class": "Visualization", - "desc": "This code generates a histogram to visualize the distribution of the transformed punctuation percentage in the `train_df` DataFrame, where the transformation applied is the cube root of punctuation percentage.", + "desc": "This code generates a histogram using Matplotlib to display the distribution of the 'punct_perc' feature raised to the power of one-third (cube root transformation) in the `train_df` DataFrame.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9975884 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 68, "code": "fig, axes = plt.subplots(1,2,figsize=(14,5))\nax1 = total_emotions.plot(kind='bar', ax = axes[0])\nax1.set_title('Total values of emotion scores per target class')\nax2 = mean_emotions.plot(kind='bar', ax = axes[1])\nax2.set_title('Mean Scores of emotions per target class');", "class": "Visualization", - "desc": "This code generates two bar plots to visualize the total and mean values of emotion scores for each target class (non-disaster and disaster tweets) in the `train_df` DataFrame.", + "desc": "This code generates two bar plots using Matplotlib to visualize the total and mean emotion scores for both non-disaster and disaster tweets in the `train_df` DataFrame, with appropriate titles for each plot.", "testing": { "class": "Visualization", "subclass": "time_series", "subclass_id": 75, "predicted_subclass_probability": 0.8920648 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 69, "code": "fig, axes = plt.subplots(2,2,figsize=(20,10))\nsns.histplot(data = train_df, \n x= 'Happy', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[0,0])\nsns.boxplot(data = train_df, x = 'target', y = 'Happy',ax=axes[0,1])\nsns.histplot(data = train_df, \n x= 'Fear', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[1,0])\nsns.boxplot(data = train_df, x = 'target', y = 'Fear',ax=axes[1,1])", "class": "Visualization", - "desc": "This code generates multiple plots, including histograms and box plots, to visualize the distribution and summary statistics of 'Happy' and 'Fear' emotion scores for non-disaster and disaster tweets in the `train_df` DataFrame.", + 
"desc": "This code generates four plots using Seaborn to visualize the distribution and variation of the 'Happy' and 'Fear' emotion scores for non-disaster and disaster tweets in the `train_df` DataFrame, with histograms and box plots for each emotion type.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99813986 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 75, "code": "fig, axes = plt.subplots(1,2,figsize=(14,5))\nax1 = total_sentiment.plot(kind='bar', ax = axes[0])\nax1.set_title('Total values of emotion scores per target class')\nax2 = mean_sentiment.plot(kind='bar', ax = axes[1])\nax2.set_title('Mean Scores of emotions per target class');", "class": "Visualization", - "desc": "This code generates two bar plots to visualize the total and mean sentiment scores for each target class (non-disaster and disaster tweets) in the `train_df` DataFrame.", + "desc": "This code generates two bar plots using Matplotlib to visualize the total and mean sentiment scores for both non-disaster and disaster tweets in the `train_df` DataFrame, with appropriate titles for each plot.", "testing": { "class": "Visualization", "subclass": "time_series", "subclass_id": 75, "predicted_subclass_probability": 0.86384803 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 76, "code": "fig, axes = plt.subplots(1,2,figsize=(17,5))\nsns.histplot(data = train_df, \n x= 'compound', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[0])\nsns.boxplot(data = train_df, x = 'target', y = 'compound',ax=axes[1]);", "class": "Visualization", - "desc": "This code generates a histogram and a box plot to visualize the distribution and summary statistics of the 'compound' sentiment scores for non-disaster and disaster tweets in the `train_df` DataFrame.", + "desc": "This code generates a side-by-side visualization consisting of a histogram with density estimates and a box plot using Seaborn to display the distribution and variation of the 'compound' sentiment score for non-disaster and disaster tweets in the `train_df` DataFrame.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.9953399 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 77, "code": "fig, axes = plt.subplots(2,2,figsize=(20,10))\nsns.histplot(data = train_df, \n x= 'neg', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[0,0])\nsns.boxplot(data = train_df, x = 'target', y = 'neg',ax=axes[0,1])\nsns.histplot(data = train_df, \n x= 'pos', \n hue = 'target',\n element='step',\n stat='probability',\n bins=40,\n ax=axes[1,0])\nsns.boxplot(data = train_df, x = 'target', y = 'pos',ax=axes[1,1])", "class": "Visualization", - "desc": "This code generates multiple plots, including histograms and box plots, to visualize the distribution and summary statistics of 'neg' and 'pos' sentiment scores for non-disaster and disaster tweets in the `train_df` DataFrame.", + "desc": "This code generates four plots using Seaborn to visualize the distribution and variation of the 'neg' and 'pos' sentiment scores for non-disaster and disaster tweets in the `train_df` DataFrame, with histograms and box plots for each sentiment type.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, "predicted_subclass_probability": 0.99770266 }, - "cluster": 0 + "cluster": -1 }], - "notebook_id": 12, - "notebook_name": "nlp-disaster-tweets-text2emotion-vader" + 
"notebook_id": 9, + "notebook_name": "nlp-disaster-tweets-text2emotion-vader.ipynb" }, { "cells": [{ - "cell_id": 77, - "code": "pred_df.to_csv(\"/kaggle/working/nlp_disaster_tweets_tfidf_lr_submission.csv\", index=False)", + "cell_id": 40, + "code": "sub_sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')\n\nsubmit = sub_sample.copy()\nsubmit.target = y_test\nsubmit.to_csv('submit.csv',index=False)", "class": "Data Export", - "desc": "This code snippet exports the modified `pred_df` DataFrame to a CSV file named \"nlp_disaster_tweets_tfidf_lr_submission.csv\" without including the index, saving the predictions for external use or submission.", + "desc": "This code reads a sample submission file, updates the 'target' column with the logistic regression model predictions, and saves the modified DataFrame as 'submit.csv' for submission.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.99896014 - }, - "cluster": -1 - }, { - "cell_id": 2, - "code": "df = pd.read_csv(\"../input/nlp-getting-started/train.csv\", sep=\",\")", - "class": "Data Extraction", - "desc": "This code snippet reads a CSV file named \"train.csv\" from the specified directory into a pandas DataFrame for further processing and analysis.", - "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9997379 + "predicted_subclass_probability": 0.999146 }, "cluster": -1 }, { - "cell_id": 68, - "code": "pred_df = pd.read_csv(\"../input/nlp-getting-started/test.csv\", sep=\",\")", + "cell_id": 1, + "code": "df = pd.read_csv('../input/nlp-getting-started/train.csv')\ntest = pd.read_csv('../input/nlp-getting-started/test.csv')", "class": "Data Extraction", - "desc": "This code snippet reads a CSV file named \"test.csv\" from the specified directory into a pandas DataFrame named `pred_df` for the purpose of making predictions on a new dataset.", + "desc": "This code reads two CSV files, 'train.csv' and 'test.csv', into Pandas DataFrames named 'df' and 'test', respectively, for further processing.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.999752 + "predicted_subclass_probability": 0.9997588 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 17, - "code": "# Since classes are imbalanced, we need to resample the dataframe\n# First divide by class\ndf_class_0 = df[df[\"target\"] == 0]\ndf_class_1 = df[df[\"target\"] == 1]", + "cell_id": 5, + "code": "df = df.drop_duplicates().reset_index(drop = True)", "class": "Data Transform", - "desc": "This code snippet separates the DataFrame into two subsets based on the \"target\" column, one for each class, in preparation for resampling to address class imbalance.", + "desc": "This code removes duplicate rows from the DataFrame 'df' and resets the index, ensuring no duplicates and a clean index structure.", "testing": { "class": "Data_Transform", - "subclass": "filter", - "subclass_id": 14, - "predicted_subclass_probability": 0.45292664 + "subclass": "remove_duplicates", + "subclass_id": 19, + "predicted_subclass_probability": 0.8511636 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 18, - "code": "df_class_0", + "cell_id": 20, + "code": "a = []\nfor i in w_nd:\n if i in nw_nd:\n a.apeend(i)\na\n#there is no common item on both group of tweets", "class": "Data Transform", - "desc": "This code snippet outputs the DataFrame subset where the 
\"target\" column equals 0, which was previously divided for the purpose of resampling to address class imbalance.", + "desc": "This code attempts to create a list of common keywords between non-disastrous and disastrous tweets by iterating through the top keywords and appending the common ones to the list 'a', although it contains a typo in the append method.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9996333 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.3016006 }, - "cluster": 2 + "cluster": 6 }, { - "cell_id": 19, - "code": "df_class_1", + "cell_id": 23, + "code": "#Cleaning the Data", "class": "Data Transform", - "desc": "This code snippet outputs the DataFrame subset where the \"target\" column equals 1, which was previously divided for the purpose of resampling to address class imbalance.", + "desc": "This comment indicates that the subsequent code will focus on cleaning the dataset, including handling missing values, removing noise, and possibly normalizing data.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99961746 + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.892342 }, - "cluster": 2 + "cluster": 1 }, { - "cell_id": 20, - "code": "# Second resample - try both under- and over-sampling\ndf_class_0_under = df_class_0.sample(count_class_1) # undersampling by loosing objects\ndf_under = pd.concat([df_class_0_under, df_class_1], axis=0)\n\ndf_class_1_over = df_class_1.sample(count_class_0, replace=True) # oversampling by duplicaitng objects\ndf_over = pd.concat([df_class_0, df_class_1_over], axis=0)\n\n#df = df_under\n#df = df_over\n\n# Looks like oversampling works better since we use more objects - more training cases", + "cell_id": 24, + "code": "import re\n#Conver lowercase remove punctuation and Character and then strip \ntext = df[\"text\"].iloc[0]\nprint(text)\ntext = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\ntxt = text.split()\nprint(txt)\n\n", "class": "Data Transform", - "desc": "This code snippet performs both undersampling of the majority class and oversampling of the minority class to create two balanced datasets, which helps mitigate the class imbalance issue for machine learning tasks.", + "desc": "This code snippet converts a sample text from the 'text' column of the DataFrame 'df' to lowercase, removes punctuation and extra characters, and then splits the cleaned text into a list of words using regular expressions.", "testing": { "class": "Data_Transform", - "subclass": "concatenate", - "subclass_id": 11, - "predicted_subclass_probability": 0.7727316 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9692003 }, - "cluster": 1 + "cluster": 6 }, { - "cell_id": 22, - "code": "train_df, test_df = train_test_split(df, train_size=0.9)", + "cell_id": 25, + "code": "#remove stopwords\nimport nltk\nlst_stopwords = nltk.corpus.stopwords.words(\"english\")\ntxt = [word for word in txt if word not in lst_stopwords]\nprint(txt)", "class": "Data Transform", - "desc": "This code snippet splits the DataFrame into training and testing sets using a 90-10 split ratio, preparing the data for subsequent model training and evaluation phases.", + "desc": "This code removes English stopwords from the list of words 'txt' using the NLTK library 
and prints the filtered words.", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.9968256 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9619886 }, "cluster": 1 }, { "cell_id": 26, - "code": "#eng_words = words.words(\"en\")", + "code": "#stemming\nps = nltk.stem.porter.PorterStemmer()\nprint([ps.stem(word) for word in txt])", "class": "Data Transform", - "desc": "This commented-out code snippet, if activated, would load a list of English words from the NLTK corpus, which can be used for text preprocessing tasks such as filtering or validation.", + "desc": "This code applies stemming to the list of words 'txt' using the PorterStemmer from the NLTK library and prints the stemmed words.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9975278 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.23080209 }, - "cluster": 0 + "cluster": 1 }, { "cell_id": 27, - "code": "#print(\"wort\" in eng_words)", + "code": "#Lemmentization\nlem = nltk.stem.wordnet.WordNetLemmatizer()\nprint([lem.lemmatize(word) for word in txt])", "class": "Data Transform", - "desc": "This commented-out code snippet, if activated, would check if the word \"wort\" is in the list of English words loaded from the NLTK corpus, which helps validate the presence of specific words during text preprocessing.", + "desc": "This code applies lemmatization to the list of words 'txt' using the WordNetLemmatizer from the NLTK library and prints the lemmatized words.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9939898 - }, - "cluster": 0 - }, { - "cell_id": 28, - "code": "snowball = SnowballStemmer(language=\"english\")", - "class": "Data Transform", - "desc": "This code snippet initializes a SnowballStemmer for the English language from the NLTK library, which will be used to perform stemming on text data to reduce words to their base or root form.", - "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9888525 + "class": "Model_Train", + "subclass": "define_search_model", + "subclass_id": 82, + "predicted_subclass_probability": 0.22034359 }, "cluster": 1 }, { - "cell_id": 29, - "code": "def tokenize_sentence(sentence: str, remove_stop_words: bool = True):\n '''Tokenize sentences with nltk dropping non-english words and punctuation and optionally stop words'''\n tokens = word_tokenize(sentence, language=\"english\")\n #tokens = [i for i in tokens if i in eng_words and i not in string.punctuation]\n tokens = [i for i in tokens if i not in string.punctuation]\n if remove_stop_words:\n tokens = [i for i in tokens if i not in stopwords.words(\"english\")]\n tokens = [snowball.stem(i) for i in tokens]\n return tokens", + "cell_id": 28, + "code": "#to apply all the technique to all the records on dataset\ndef utils_preprocess_text(text, flg_stemm=True, flg_lemm =True, lst_stopwords=None ):\n text = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\n \n #tokenization(convert from string to List)\n lst_text = text.split()\n #remove stopwords\n if lst_stopwords is not None:\n lst_text = [word for word in lst_text if word not in\n lst_stopwords]\n \n #stemming\n if flg_stemm == True:\n ps = 
nltk.stem.porter.PorterStemmer()\n lst_text = [ps.stem(word) for word in lst_text]\n \n #Lemmentization\n if flg_lemm == True:\n lem = nltk.stem.wordnet.WordNetLemmatizer()\n lst_text = [lem.lemmatize(word) for word in lst_text]\n \n # back to string from list\n text = \" \".join(lst_text)\n return text\n ", "class": "Data Transform", - "desc": "This function tokenizes a given sentence by removing punctuation, optionally filtering out stop words, and applying stemming to each token, thereby preparing the text for further analysis or modeling.", + "desc": "This code defines a function 'utils_preprocess_text' that preprocesses the input text by converting to lowercase, removing punctuation, tokenizing, optionally removing stopwords, stemming, lemmatizing, and then recombining the text into a string.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, - "predicted_subclass_probability": 0.99213856 + "predicted_subclass_probability": 0.93941545 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 30, - "code": "tokenize_sentence(\"the sentence and asdf fy krkr\", False)", + "cell_id": 29, + "code": "#apply dataset\ndf['clean_text'] = df['text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))\ntest['clean_text'] = test['text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))", "class": "Data Transform", - "desc": "This code snippet calls the `tokenize_sentence` function on a sample sentence without removing stop words, to demonstrate or test the tokenization and stemming process.", + "desc": "This code applies the 'utils_preprocess_text' function to the 'text' column of both 'df' and 'test' DataFrames, creating a new column 'clean_text' with the preprocessed text, using lemmatization but not stemming.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9920244 + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.96186244 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 31, - "code": "vectorizer_params = {\n #\"max_features\": 500,\n #\"max_features\": None,\n #\"tokenizer\": lambda x: tokenize_sentence(x, remove_stop_words=False),\n #\"tokenizer\": None,\n #\"ngram_range\": (1, 100),\n #\"min_df\": 0,\n #\"max_df\": 100,\n #\"use_idf\": False,\n #\"decode_error\": \"replace\",\n #\"sublinear_tf\": True,\n #\"analyzer\": \"char\"\n}", + "cell_id": 30, + "code": "#Target Encoding\n", "class": "Data Transform", - "desc": "This code snippet defines a dictionary `vectorizer_params` for configuring the parameters of a TfidfVectorizer, though all parameters are currently commented out, allowing for easy adjustment based on experiment needs.", + "desc": "This comment indicates that the subsequent code will focus on encoding the target variable, typically converting categorical labels into numeric form suitable for machine learning algorithms.", "testing": { "class": "Model_Train", - "subclass": "init_hyperparams", - "subclass_id": 59, - "predicted_subclass_probability": 0.91740084 + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.4920049 }, "cluster": 1 }, { - "cell_id": 32, - "code": "vectorizer = TfidfVectorizer(**vectorizer_params)", + "cell_id": 31, + "code": "import category_encoders as ce\n\n# Target encoding\nfeatures = ['keyword', 'location']\nencoder = ce.TargetEncoder(cols=features)\nencoder.fit(df[features],df['target'])\n\ndf = 
df.join(encoder.transform(df[features]).add_suffix('_target'))\ntest = test.join(encoder.transform(test[features]).add_suffix('_target'))\n", "class": "Data Transform", - "desc": "This code snippet initializes a TfidfVectorizer with the previously defined (but currently commented out) parameters, setting up the vectorizer for transforming text data into TF-IDF feature vectors.", + "desc": "This code applies target encoding to the 'keyword' and 'location' columns using the Category Encoders library, effectively creating new target-encoded features suffixed with '_target' and adding them to both 'df' and 'test' DataFrames.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9972772 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9991437 }, - "cluster": 1 + "cluster": 8 }, { "cell_id": 33, - "code": "vectorizer", - "class": "Data Transform", - "desc": "This code snippet outputs the initialized TfidfVectorizer object, which will be used to transform text data into TF-IDF feature vectors for further analysis or model training.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9994728 - }, - "cluster": 2 - }, { - "cell_id": 34, - "code": "features = vectorizer.fit_transform(train_df[\"text\"])", + "code": "from sklearn.feature_extraction.text import TfidfVectorizer\n\nvec_text = TfidfVectorizer(min_df = 10, ngram_range = (1,2), stop_words='english') \n# Only include >=10 occurrences\n# Have unigrams and bigrams\ntext_vec = vec_text.fit_transform(df['clean_text'])\ntext_vec_test = vec_text.transform(test['clean_text'])\nX_train_text = pd.DataFrame(text_vec.toarray(), columns=vec_text.get_feature_names())\nX_test_text = pd.DataFrame(text_vec_test.toarray(), columns=vec_text.get_feature_names())\nprint (X_train_text.shape)", "class": "Data Transform", - "desc": "This code snippet fits the TfidfVectorizer to the \"text\" column of the training DataFrame and transforms the text data into a TF-IDF feature matrix, preparing the text data for model training.", + "desc": "This code utilizes the `TfidfVectorizer` from Scikit-learn to transform the 'clean_text' column in both 'df' and 'test' DataFrames into TF-IDF weighted unigrams and bigrams, creating and printing the shape of the resultant feature matrices for training and testing datasets.", "testing": { "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.99820554 - }, - "cluster": 1 - }, { - "cell_id": 36, - "code": "feature_names = vectorizer.get_feature_names()", - "class": "Data Transform", - "desc": "This code snippet retrieves the list of feature names (terms) from the fitted TfidfVectorizer, which can be useful for understanding the vocabulary used in the TF-IDF representation.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.54732066 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.6342874 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 39, - "code": "X_train = train_df[\"text\"]", + "cell_id": 34, + "code": "df = df.join(X_train_text, rsuffix='_text')\ntest = test.join(X_test_text, rsuffix='_text')\n", "class": "Data Transform", - "desc": "This code snippet assigns the \"text\" column of the 
training DataFrame to the variable `X_train`, isolating the training feature data for further processing or model training.", + "desc": "This code merges the TF-IDF weighted feature matrices 'X_train_text' and 'X_test_text' with the original 'df' and 'test' DataFrames, respectively, appending the new features with a '_text' suffix.", "testing": { "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.9986425 + "subclass": "merge", + "subclass_id": 32, + "predicted_subclass_probability": 0.99786144 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 40, - "code": "X_train", - "class": "Data Transform", - "desc": "This code snippet outputs the training feature data (`X_train`), which consists of the text data from the training DataFrame, allowing for inspection or verification before further processing.", + "cell_id": 2, + "code": "df.head()", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first five rows of the DataFrame 'df' to give an initial look at the dataset's structure and contents.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.99974626 + "predicted_subclass_probability": 0.9997553 }, - "cluster": 2 + "cluster": 0 }, { - "cell_id": 41, - "code": "y_train = train_df[\"target\"]", - "class": "Data Transform", - "desc": "This code snippet assigns the \"target\" column of the training DataFrame to the variable `y_train`, isolating the labels for the training data in preparation for model training.", + "cell_id": 3, + "code": "df.info()", + "class": "Exploratory Data Analysis", + "desc": "This code provides a concise summary of the DataFrame 'df', including the data types, non-null counts, and memory usage.", "testing": { - "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.9991716 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9992442 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 42, - "code": "y_train", - "class": "Data Transform", - "desc": "This code snippet outputs the training labels (`y_train`), which consist of the target values from the training DataFrame, allowing for inspection or verification before further processing.", + "cell_id": 4, + "code": "df.duplicated().sum()", + "class": "Exploratory Data Analysis", + "desc": "This code calculates and returns the total number of duplicate rows in the DataFrame 'df'.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99974114 + "subclass": "count_duplicates", + "subclass_id": 38, + "predicted_subclass_probability": 0.8993749 }, - "cluster": 2 + "cluster": 6 }, { - "cell_id": 43, - "code": "lr_model_params = {\n #\"class_weight\": \"balanced\",\n #\"class_weight\": None,\n #\"class_weight\": {1: 1, 0: 1/class_ratio},\n #\"random_state\": 0,\n #\"Cs\": 5,\n #\"penalty\": \"none\",\n #\"penalty\": \"elasticnet\",\n \"solver\": \"liblinear\",\n #\"l1_ratio\": 0.5,\n #\"max_iter\": 10000,\n #\"cv\": 10\n}", - "class": "Data Transform", - "desc": "This code snippet defines a dictionary `lr_model_params` with commented-out parameters for configuring a logistic regression model, allowing for easy modification of the model's hyperparameters and setup.", + "cell_id": 6, + "code": "df['target'].value_counts()", + 
"class": "Exploratory Data Analysis", + "desc": "This code counts and returns the frequency of each unique value in the 'target' column of the DataFrame 'df' to understand the distribution of the target variable.", "testing": { - "class": "Model_Train", - "subclass": "init_hyperparams", - "subclass_id": 59, - "predicted_subclass_probability": 0.98202676 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9995012 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 46, - "code": "text_n = 10\nfeatures[text_n]", - "class": "Data Transform", - "desc": "This code snippet retrieves the feature vector of the 10th text sample from the transformed features matrix, allowing for inspection or analysis of the specific feature values for this sample.", + "cell_id": 7, + "code": "df[df['target'] == 0][:1]", + "class": "Exploratory Data Analysis", + "desc": "This code filters the DataFrame 'df' to display the first row where the 'target' column value is 0, helping in understanding the characteristics of this subset of data.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9924711 + "class": "Data_Transform", + "subclass": "filter", + "subclass_id": 14, + "predicted_subclass_probability": 0.98627037 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 59, - "code": "#y_test = y_train\ny_test = test_df[\"target\"]", - "class": "Data Transform", - "desc": "This code snippet sets the `y_test` variable to the \"target\" column of the test DataFrame, storing the true labels for the testing set, which are required for evaluating the model's performance.", + "cell_id": 8, + "code": "df[df['target'] == 1][:1]", + "class": "Exploratory Data Analysis", + "desc": "This code filters the DataFrame 'df' to display the first row where the 'target' column value is 1, aiding in comparing characteristics between different target classes.", "testing": { "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.99827564 + "subclass": "filter", + "subclass_id": 14, + "predicted_subclass_probability": 0.98621887 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 71, - "code": "pred_df[\"target\"] = model_pipeline.predict(pred_df[\"text\"])", - "class": "Data Transform", - "desc": "This code snippet adds a new column \"target\" to the `pred_df` DataFrame, containing the predicted target values generated by the logistic regression model within the pipeline for each row of text data.", + "cell_id": 9, + "code": "df.isnull().sum()", + "class": "Exploratory Data Analysis", + "desc": "This code calculates and returns the total number of missing (null) values in each column of the DataFrame 'df' to identify columns that may require data cleaning.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.9946057 + "class": "Exploratory_Data_Analysis", + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.9985019 }, - "cluster": 1 + "cluster": 6 }, { - "cell_id": 75, - "code": "pred_df.drop(columns=[\"keyword\", \"location\", \"text\"], inplace=True)", - "class": "Data Transform", - "desc": "This code snippet removes the \"keyword,\" \"location,\" and \"text\" columns from the `pred_df` DataFrame, retaining only the necessary columns for the final output, such as \"target.\"", + "cell_id": 
10, + "code": "df['keyword'].value_counts()", + "class": "Exploratory Data Analysis", + "desc": "This code counts and returns the frequency of each unique value in the 'keyword' column of the DataFrame 'df', providing insight into the distribution of keywords in the dataset.", "testing": { - "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.99920505 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9995165 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 3, - "code": "df.shape", + "cell_id": 11, + "code": "df['location'].value_counts()", "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the dimensions (number of rows and columns) of the DataFrame, which provides an initial understanding of the dataset's size.", + "desc": "This code counts and returns the frequency of each unique value in the 'location' column of the DataFrame 'df', offering an overview of the distribution of locations in the dataset.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9995491 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9994772 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 4, - "code": "df.head(20)", + "cell_id": 21, + "code": "#check no of unique keyword and location\nprint(df.keyword.nunique())\ndf['location'].nunique()", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 20 rows of the DataFrame to give a preliminary view of the dataset's structure and contents.", + "desc": "This code prints the number of unique keywords and the number of unique locations in the DataFrame 'df' to understand the diversity in these columns.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997683 + "subclass": "count_unique_values", + "subclass_id": 54, + "predicted_subclass_probability": 0.9503388 }, - "cluster": 3 + "cluster": 4 }, { - "cell_id": 5, - "code": "df.head(20)", + "cell_id": 32, + "code": "df.isnull().sum()", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 20 rows of the DataFrame to provide an initial glimpse into the dataset's structure and contents for initial investigation.", + "desc": "This code calculates and returns the total number of missing (null) values in each column of the DataFrame 'df' to identify columns that may require further data cleaning after transformations.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997683 + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.9985019 }, - "cluster": 3 + "cluster": 6 }, { - "cell_id": 6, - "code": "df.tail(20)", + "cell_id": 35, + "code": "df.head(1)", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the last 20 rows of the DataFrame to give insights into the end portion of the dataset, which can be useful for ensuring data consistency and completeness.", + "desc": "This code displays the first row of the DataFrame 'df' to verify the structure and the newly added TF-IDF features.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.99977094 + 
"predicted_subclass_probability": 0.9997688 }, - "cluster": 3 + "cluster": 2 }, { - "cell_id": 7, - "code": "df[\"target\"].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts the occurrences of each unique value in the \"target\" column, providing an overview of the distribution of the target variable in the dataset.", + "cell_id": 0, + "code": "import numpy as np\nimport pandas as pd", + "class": "Imports and Environment", + "desc": "This code imports the NumPy and Pandas libraries, which are essential for numerical and data manipulation tasks in Python.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9994948 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9993284 }, - "cluster": 9 + "cluster": 0 }, { - "cell_id": 8, - "code": "count_class_0, count_class_1 = df[\"target\"].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet assigns the counts of the two unique values in the \"target\" column to the variables `count_class_0` and `count_class_1`, allowing for easy reference and subsequent analysis of class distribution.", + "cell_id": 12, + "code": "import matplotlib.pyplot as plt\n%matplotlib inline\nimport seaborn as sns", + "class": "Imports and Environment", + "desc": "This code imports the Matplotlib Pyplot and Seaborn libraries, and enables inline plotting for visualizations in the Jupyter notebook.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.99944013 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.99939334 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 9, - "code": "print(count_class_0, count_class_1)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the counts of the two unique values in the \"target\" column, providing a quick check of the class distribution in the dataset.", + "cell_id": 37, + "code": "from sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import MinMaxScaler\n\nfeatures_to_drop = ['id', 'keyword','location', 'text','clean_text' ]\nscaler = MinMaxScaler()", + "class": "Imports and Environment", + "desc": "This code imports the LogisticRegression model, Pipeline, and MinMaxScaler from Scikit-learn, and initializes a list of features to drop as well as a MinMaxScaler instance for feature scaling.", + "testing": { + "class": "Data_Transform", + "subclass": "normalization", + "subclass_id": 18, + "predicted_subclass_probability": 0.9946407 + }, + "cluster": 0 + }, { + "cell_id": 39, + "code": "y_test[:10]", + "class": "Model Evaluation", + "desc": "This code retrieves and displays the first ten predictions from the logistic regression model applied to the test dataset to perform an initial evaluation of the model's performance.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.988794 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99966455 }, "cluster": 1 }, { - "cell_id": 10, - "code": "class_ratio = count_class_0 / count_class_1", - "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates the ratio 
between the counts of the two unique values in the \"target\" column, offering insight into the class balance or imbalance within the dataset.", + "cell_id": 41, + "code": "print ('Training accuracy: %.4f' % pipeline.score(X_train, y_train))", + "class": "Model Evaluation", + "desc": "This code calculates and prints the training accuracy of the logistic regression model using the training dataset to evaluate its performance.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.9978563 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.8493974 }, "cluster": 1 }, { - "cell_id": 11, - "code": "print(\"{0:.3f}\".format(class_ratio))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the class ratio to three decimal places, providing a concise and readable metric for understanding the degree of class imbalance in the dataset.", + "cell_id": 42, + "code": "from sklearn.metrics import f1_score\n\nprint ('Training f-1 score: %.4f' % f1_score(y_train, pipeline.predict(X_train)))", + "class": "Model Evaluation", + "desc": "This code calculates and prints the F1 score for the logistic regression model on the training dataset to assess the balance between precision and recall.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.5244426 + }, + "cluster": 1 + }, { + "cell_id": 43, + "code": "# Confusion matrix\nfrom sklearn.metrics import confusion_matrix\npd.DataFrame(confusion_matrix(y_train, pipeline.predict(X_train)))", + "class": "Model Evaluation", + "desc": "This code generates and displays a confusion matrix for the logistic regression model's predictions on the training dataset to visualize the performance in terms of true/false positives and true/false negatives.", + "testing": { + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.91332203 + }, + "cluster": 1 + }, { + "cell_id": 36, + "code": "#Logistic Regression", + "class": "Model Training", + "desc": "This comment indicates that the subsequent code will focus on training a logistic regression model, possibly using the preprocessed and feature-engineered data.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.98255676 + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.962167 }, "cluster": -1 }, { - "cell_id": 12, - "code": "df.head(20)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 20 rows of the DataFrame to provide an initial glimpse into the dataset's structure and contents for initial investigation.", + "cell_id": 38, + "code": "X_train = df.drop(columns = features_to_drop + ['target'])\nX_test = test.drop(columns = features_to_drop)\ny_train = df.target\nlr = LogisticRegression(solver = 'liblinear', random_state = 777)\n\npipeline = Pipeline([('scale', scaler), ('lr',lr),])\npipeline.fit(X_train, y_train)\ny_test = pipeline.predict(X_test)", + "class": "Model Training", + "desc": "This code prepares the training and test data by dropping non-feature columns, then sets up and fits a logistic regression model within a pipeline that includes MinMax scaling, and finally predicts the target variable for the test dataset.", 
"testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997683 + "class": "Model_Train", + "subclass": "find_best_model_class", + "subclass_id": 3, + "predicted_subclass_probability": 0.2473382 }, - "cluster": 3 + "cluster": 0 }, { "cell_id": 13, - "code": "df.loc[df[\"target\"] == 1].head(10)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 10 rows of the DataFrame where the \"target\" column equals 1, allowing examination of a subset of the data corresponding to one class.", + "code": "#Most Common Words", + "class": "Visualization", + "desc": "This comment indicates that the subsequent code will focus on identifying and visualizing the most common words in the dataset.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.99977666 + "predicted_subclass_probability": 0.74166125 }, - "cluster": 4 + "cluster": -1 }, { "cell_id": 14, - "code": "df.loc[df[\"target\"] == 0].head(10)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 10 rows of the DataFrame where the \"target\" column equals 0, allowing examination of a subset of the data corresponding to the other class.", + "code": "# plt.figure(figsize=(10,8))\n# sns.barplot(x = df['keyword'].value_counts().head(5).index, y = df['keyword'].value_counts().head(5))", + "class": "Visualization", + "desc": "This code creates a bar plot using Seaborn to visualize the top 5 most common keywords in the 'keyword' column of the DataFrame 'df', although it is currently commented out.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99977714 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.97318214 }, - "cluster": 4 + "cluster": -1 }, { "cell_id": 15, - "code": "for c in df[df[\"target\"] == 1][\"text\"].head(10):\n print(c)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet iterates through and prints the \"text\" column of the first 10 rows where \"target\" equals 1, aiding in understanding the nature of the textual data associated with this class.", + "code": "plt.figure(figsize= (9,6))\nsns.countplot(y = df.keyword, order = df.keyword.value_counts().iloc[:15].index)\nplt.title('Top 15 Keyword')\nplt.show()", + "class": "Visualization", + "desc": "This code generates a horizontal count plot using Seaborn to visualize the top 15 most common keywords in the 'keyword' column of the DataFrame 'df', with a title 'Top 15 Keyword'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.999645 + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9776137 }, "cluster": -1 }, { "cell_id": 16, - "code": "for c in df[df[\"target\"] == 0][\"text\"].head(10):\n print(c)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet iterates through and prints the \"text\" column of the first 10 rows where \"target\" equals 0, aiding in understanding the nature of the textual data associated with this class.", + "code": "sns.countplot(y = df.target)", + "class": "Visualization", + "desc": "This code generates a horizontal count plot using Seaborn to visualize the distribution of 
the target variable in the DataFrame 'df'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99963677 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.98655856 }, "cluster": -1 }, { - "cell_id": 21, - "code": "df[\"target\"].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts the occurrences of each unique value in the \"target\" column, providing an overview of the distribution of the target variable in the dataset.", + "cell_id": 17, + "code": "#Top 10 words in Disasterous and Non-Disasterous tweets", + "class": "Visualization", + "desc": "This comment indicates that the subsequent code will focus on identifying and visualizing the top 10 most common words in tweets categorized as disastrous and non-disastrous.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9994948 + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.7826869 }, - "cluster": 9 + "cluster": -1 }, { - "cell_id": 23, - "code": "test_df.shape", - "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the dimensions (number of rows and columns) of the test DataFrame, verifying the size of the test set after splitting.", + "cell_id": 18, + "code": "w_nd = df[df.target == 0].keyword.value_counts().head(10)\nsns.barplot(w_nd, w_nd.index, color = 'c')\nplt.title('Top keyword for Disaster tweet')\nplt.show()", + "class": "Visualization", + "desc": "This code generates a horizontal bar plot using Seaborn to visualize the top 10 keywords in non-disastrous tweets from the DataFrame 'df', with a title 'Top keyword for Disaster tweet'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99960893 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.8726834 }, - "cluster": 10 + "cluster": -1 }, { - "cell_id": 24, - "code": "train_df[\"target\"].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts the occurrences of each unique value in the \"target\" column of the training DataFrame, providing insight into the distribution of the target variable within the training set.", + "cell_id": 19, + "code": "nw_nd = df[df.target == 1].keyword.value_counts().head(10)\nsns.barplot(nw_nd, nw_nd.index, color = 'y')\nplt.title('Top keyword for Non-Disaster tweet')\nplt.show()", + "class": "Visualization", + "desc": "This code generates a horizontal bar plot using Seaborn to visualize the top 10 keywords in disastrous tweets from the DataFrame 'df', with a title 'Top keyword for Non-Disaster tweet'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.999521 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9795735 }, - "cluster": 9 + "cluster": -1 }, { - "cell_id": 25, - "code": "test_df[\"target\"].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts the occurrences of each unique value in the \"target\" column of the testing DataFrame, providing insight into the distribution of the target variable within the testing set.", 
+ "cell_id": 22, + "code": "#Most Common Location\nplt.figure(figsize = (9,6))\nsns.countplot(y = df.location, order = df.location.value_counts().iloc[:15].index)\nplt.title('Top 15 Location')\nplt.show()", + "class": "Visualization", + "desc": "This code generates a horizontal count plot using Seaborn to visualize the top 15 most common locations in the 'location' column of the DataFrame 'df', with a title 'Top 15 Location'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9995276 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.97139984 }, - "cluster": 9 - }, { - "cell_id": 35, - "code": "print(features.shape)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the dimensions of the TF-IDF feature matrix, providing insight into the number of samples and the number of features generated by the vectorizer.", + "cluster": -1 + }], + "notebook_id": 10, + "notebook_name": "nlp-from-beginner-to-expert.ipynb" + }, { + "cells": [{ + "cell_id": 26, + "code": "# Copy the results to a pandas dataframe with an \"id\" column and a \"target\" column\nfinal_submission = pd.DataFrame( data={\"id\":test_data[\"id\"], \"target\":y_test_predictions})\n# Save the submission file\nfinal_submission.to_csv(\"submissionTweets.csv\", index=False)", + "class": "Data Export", + "desc": "This code snippet creates a pandas dataframe containing the test data IDs and their corresponding predicted class labels, and then saves this dataframe to a CSV file named \"submissionTweets.csv\" without including the index.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9995484 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.99925834 }, "cluster": -1 }, { - "cell_id": 37, - "code": "print(\"Feature names (unique tokens): {0}.\\nFeature count: {1}\".format(feature_names, len(feature_names)))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the unique tokens (feature names) extracted by the TfidfVectorizer and their count, providing insight into the vocabulary size and the specific terms considered in the TF-IDF representation.", + "cell_id": 2, + "code": "train_data = pd.read_csv('../input/nlp-getting-started/train.csv')\nprint(train_data.shape)\ntrain_data.head(3)", + "class": "Data Extraction", + "desc": "This code snippet reads the training data from a CSV file using pandas, prints the shape of the dataframe, and displays the first three rows.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_unique_values", - "subclass_id": 54, - "predicted_subclass_probability": 0.9350609 - }, - "cluster": 7 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9993906 + }, + "cluster": 1 }, { - "cell_id": 38, - "code": "print('fire' in feature_names)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet checks if the word 'fire' is included in the list of feature names from the TfidfVectorizer, helping to verify whether specific terms are represented in the feature matrix.", + "cell_id": 3, + "code": "# load test dataset\ntest_data = pd.read_csv('../input/nlp-getting-started/test.csv')\nprint(test_data.shape)\ntest_data.head(3)", + "class": "Data Extraction", + 
"desc": "This code snippet reads the test data from a CSV file using pandas, prints the shape of the dataframe, and displays the first three rows.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9059723 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99944836 + }, + "cluster": 1 + }, { + "cell_id": 7, + "code": "# Print some words of the vocabulary\nvocabulary = tokenizer.get_vocab()\nprint(f'Size of the vocabulary: {len(vocabulary)}')\nprint(f'Some tokens of the vocabulary: {list(vocabulary.keys())[5000:5010]}')", + "class": "Data Extraction", + "desc": "This code snippet retrieves the vocabulary from the initialized BERT tokenizer, prints the size of the vocabulary, and displays a sample of tokens from the vocabulary.", + "testing": { + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9397918 + }, + "cluster": 1 + }, { + "cell_id": 11, + "code": "X = train_data[\"text\"]\ny = train_data[\"target\"]", + "class": "Data Extraction", + "desc": "This code snippet extracts the features (text) and the target labels from the training data dataframe into separate variables `X` and `y`.", + "testing": { + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.99927586 }, "cluster": -1 }, { - "cell_id": 49, - "code": "train_df[\"text\"].iloc[text_n]", - "class": "Exploratory Data Analysis", - "desc": "This code snippet retrieves and outputs the original text of the 10th sample from the training DataFrame, allowing comparison between the raw input text and the model's prediction.", + "cell_id": 8, + "code": "def prepare_sequence(text):\n \"\"\"\n Tokenize and prepare a sequence for the model. 
It tokenizes the text sequence\n adding special tokens ([CLS], [SEP]), padding to the max length and truncate \n reviews longer than the max length.\n Return the token IDs, the segment IDs and the mask IDs.\n \"\"\"\n\n prepared_sequence = tokenizer.encode_plus(\n text, \n add_special_tokens = True, \n max_length = MAX_LENGHT, \n padding = 'max_length',\n return_attention_mask = True\n )\n return prepared_sequence", + "class": "Data Transform", + "desc": "This code snippet defines a function named `prepare_sequence` that tokenizes a given text sequence using the BERT tokenizer, adds special tokens, pads or truncates the text to a maximum length, and returns token IDs, segment IDs, and mask IDs.", "testing": { "class": "Data_Transform", - "subclass": "filter", - "subclass_id": 14, - "predicted_subclass_probability": 0.9789242 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9985875 }, - "cluster": 5 + "cluster": 0 }, { - "cell_id": 69, - "code": "pred_df.shape", - "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the dimensions (number of rows and columns) of the `pred_df` DataFrame, providing an initial understanding of the size of the new dataset.", + "cell_id": 9, + "code": "# Prepare a test sentence\ntest_sentence = 'Is this jacksonville?'\ntest_sentence_encoded = prepare_sequence(test_sentence)\ntoken_ids = test_sentence_encoded[\"input_ids\"]\nprint(f'Test sentence: {test_sentence}')\nprint(f'Keys: {test_sentence_encoded.keys()}')\nprint(f'Tokens: {tokenizer.convert_ids_to_tokens(token_ids)[:12]}')\nprint(f'Token IDs: {token_ids[:12]}')\nprint(f'Segment IDs: {test_sentence_encoded[\"token_type_ids\"][:12]}')\nprint(f'Mask IDs {test_sentence_encoded[\"attention_mask\"][:12]}')\nprint(f'Input dimension: {len(token_ids)}')", + "class": "Data Transform", + "desc": "This code snippet prepares a test sentence using the `prepare_sequence` function, prints various aspects of the encoded sequence such as token IDs, segment IDs, mask IDs, and displays the first 12 tokens and their corresponding IDs from the test sentence.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99961746 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.42718402 }, - "cluster": 10 + "cluster": 8 }, { - "cell_id": 70, - "code": "pred_df.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `pred_df` DataFrame, offering a preliminary look at the structure and contents of the new dataset.", + "cell_id": 10, + "code": "def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):\n \"\"\"\n Map to the expected input to TFBertForSequenceClassification.\n \"\"\"\n mapped_example = {\n \"input_ids\": input_ids,\n \"token_type_ids\": token_type_ids,\n \"attention_mask\": attention_masks,\n }\n return mapped_example, label \n\ndef encode_examples(texts_and_labels):\n \"\"\"\n Prepare all sequences of text and build TF dataset.\n \"\"\"\n\n input_ids_list = []\n token_type_ids_list = []\n attention_mask_list = []\n label_list = []\n \n for text, label in texts_and_labels:\n\n bert_input = prepare_sequence(text)\n\n input_ids_list.append(bert_input['input_ids'])\n token_type_ids_list.append(bert_input['token_type_ids'])\n attention_mask_list.append(bert_input['attention_mask'])\n label_list.append([label])\n\n # Create TF dataset\n dataset = 
tf.data.Dataset.from_tensor_slices(\n (input_ids_list, attention_mask_list, token_type_ids_list,\n label_list)\n )\n # Map to the expected input to TFBertForSequenceClassification\n dataset_mapped = dataset.map(map_example_to_dict)\n return dataset_mapped", + "class": "Data Transform", + "desc": "This code snippet defines two functions: `map_example_to_dict`, which maps token IDs, token type IDs, and attention masks to the expected input format for `TFBertForSequenceClassification`, and `encode_examples`, which prepares sequences of text and builds a TensorFlow dataset from them, including mapping to the required input format.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997497 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9974004 }, - "cluster": 11 + "cluster": 0 }, { - "cell_id": 72, - "code": "pred_df.shape", - "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the dimensions (number of rows and columns) of the `pred_df` DataFrame, verifying its size after adding the new \"target\" column with predictions.", + "cell_id": 12, + "code": "# Split the training dataset for training and test\nX_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, \n random_state=1)", + "class": "Data Transform", + "desc": "This code snippet splits the dataset into training and validation sets using the `train_test_split` function from scikit-learn with 10% of the data reserved for validation and a random state for reproducibility.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99961746 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.99780315 }, - "cluster": 10 + "cluster": 0 }, { - "cell_id": 73, - "code": "pred_df.head(20)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 20 rows of the `pred_df` DataFrame, allowing inspection of the original text data alongside the newly added predicted target values.", + "cell_id": 14, + "code": "train_dataset = list(zip(X_train, y_train))\nval_dataset = list(zip(X_val, y_val))", + "class": "Data Transform", + "desc": "This code snippet creates the training and validation datasets by combining the respective feature and target variables into tuples using the `zip` function and converting them to lists.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997557 + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.7672821 }, - "cluster": 2 + "cluster": 8 }, { - "cell_id": 74, - "code": "pred_df.tail(20)", + "cell_id": 15, + "code": "# Prepare sequences of text and build TF train dataset\nds_train_encoded = encode_examples(train_dataset).shuffle(10000).batch(BATCH_SIZE)\n\n# Prepare sequences of text and build TF validation dataset\nds_val_encoded = encode_examples(val_dataset).batch(BATCH_SIZE)", + "class": "Data Transform", + "desc": "This code snippet encodes the training and validation datasets using the `encode_examples` function, then shuffles and batches the encoded training data and batches the encoded validation data using TensorFlow's dataset operations.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + 
"subclass_id": 20, + "predicted_subclass_probability": 0.8710919 + }, + "cluster": 8 + }, { + "cell_id": 23, + "code": "def encode_test_examples(texts):\n \"\"\"\n Prepare all sequences of text and build TF dataset.\n \"\"\"\n\n input_ids_list = []\n token_type_ids_list = []\n attention_mask_list = []\n \n for text in texts:\n\n bert_input = prepare_sequence(text)\n\n input_ids_list.append(bert_input['input_ids'])\n token_type_ids_list.append(bert_input['token_type_ids'])\n attention_mask_list.append(bert_input['attention_mask'])\n\n # Create TF dataset\n dataset = tf.data.Dataset.from_tensor_slices(\n (input_ids_list, attention_mask_list, token_type_ids_list)\n )\n # Map to the expected input to TFBertForSequenceClassification\n dataset_mapped = dataset.map(map_test_example_to_dict)\n return dataset_mapped\n\ndef map_test_example_to_dict(input_ids, attention_masks, token_type_ids):\n \"\"\"\n Map to the expected input to TFBertForSequenceClassification.\n \"\"\"\n mapped_example = {\n \"input_ids\": input_ids,\n \"token_type_ids\": token_type_ids,\n \"attention_mask\": attention_masks,\n }\n return mapped_example", + "class": "Data Transform", + "desc": "This code snippet defines two functions: `encode_test_examples`, which prepares sequences of text and builds a TensorFlow dataset for test data, and `map_test_example_to_dict`, which maps token IDs, token type IDs, and attention masks to the expected input format for `TFBertForSequenceClassification`.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9800521 + }, + "cluster": 0 + }, { + "cell_id": 24, + "code": "X_test = test_data[\"text\"]\ntest_dataset = list(X_test)\nds_test_encoded = encode_test_examples(test_dataset).batch(BATCH_SIZE)", + "class": "Data Transform", + "desc": "This code snippet extracts the feature (text) from the test data dataframe, converts it into a list, encodes the test dataset using the `encode_test_examples` function, and batches the encoded test dataset using TensorFlow's `batch` method.", + "testing": { + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9627081 + }, + "cluster": 4 + }, { + "cell_id": 4, + "code": "for tweet_index in range(1,30,5):\n print(f'Text of the tweet: {train_data[\"text\"][tweet_index]}')\n print(f'Target: {\"Real disaster\" if train_data[\"target\"][tweet_index]==1 else \"Not real disaster\"}\\n')", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the last 20 rows of the `pred_df` DataFrame, allowing inspection of the original text data alongside the newly added predicted target values at the end of the dataset.", + "desc": "This code snippet iterates through specific indices of the training data to print the text of selected tweets and their corresponding target labels, indicating whether each tweet is about a real disaster or not.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997602 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.29876474 }, - "cluster": 2 + "cluster": 3 }, { - "cell_id": 76, - "code": "pred_df.head()", + "cell_id": 13, + "code": "n_training_examples = X_train.shape[0]\nn_positive_training_examples = y_train.value_counts()[1]\nn_negative_training_examples = y_train.value_counts()[0]\nprint(f'Number examples in training dataset: 
{n_training_examples}')\nprint(f'Number of positive examples in training dataset: {n_positive_training_examples}')\nprint(f'Number of negative examples in training dataset: {n_negative_training_examples}')", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `pred_df` DataFrame after dropping the specified columns, allowing verification of the remaining dataset structure and contents.", + "desc": "This code snippet calculates and prints the total number of training examples, as well as the counts of positive and negative examples in the training dataset, by using the `value_counts` method on the target labels.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997497 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9978163 }, - "cluster": 11 + "cluster": 5 }, { "cell_id": 0, - "code": "import pandas as pd\nimport platform\nfrom sklearn.model_selection import train_test_split\nimport nltk\nimport string\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\nfrom nltk.stem import SnowballStemmer\nfrom nltk.corpus import words\n#nltk.download('punkt')\nfrom sklearn.pipeline import Pipeline\n#from sklearn.linear_model import LogisticRegression\nimport sklearn.linear_model as lm\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import precision_score, recall_score, precision_recall_curve, f1_score\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn.metrics import plot_precision_recall_curve\nimport numpy as np\nfrom sklearn.model_selection import GridSearchCV", + "code": "import random\n\nimport pandas as pd\nimport numpy as np \nfrom scipy.special import softmax\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import (roc_auc_score, classification_report, \n confusion_matrix)\nimport tensorflow as tf\nfrom transformers import BertTokenizer\nfrom transformers import TFBertForSequenceClassification\nfrom transformers import AutoConfig\n", "class": "Imports and Environment", - "desc": "This code snippet imports various Python libraries and modules necessary for data manipulation (pandas), natural language processing (nltk), machine learning (sklearn), and data visualization (matplotlib).", + "desc": "This code snippet imports various libraries and modules such as pandas, numpy, seaborn, matplotlib, scikit-learn, tensorflow, and transformers for data manipulation, visualization, model training, and evaluation in a machine learning task.", "testing": { "class": "Imports_and_Environment", "subclass": "import_modules", "subclass_id": 22, - "predicted_subclass_probability": 0.9993006 + "predicted_subclass_probability": 0.99931514 }, "cluster": 0 }, { "cell_id": 1, - "code": "pd.set_option('display.max_colwidth', None)", + "code": "# The name of the BERT model used\nPRETRAINED_MODEL_NAME = 'bert-base-uncased'\n# The number of labels of the target variable\nLABELS_NUMBER = 2\n\n# The max lenght of text can be up to 512 for BERT\nMAX_LENGHT = 512\n\nBATCH_SIZE = 6\nLEARNING_RATE = 2e-5\nEPOCHS_NUMBER = 1\n\nN_PREDICTIONS_TO_SHOW = 10", "class": "Imports and Environment", - "desc": "This code snippet sets the pandas option to display the full contents of columns without truncating, which is useful for visualizing and debugging data frames, especially those 
containing long text entries.", + "desc": "This code snippet sets various configuration parameters for the BERT model, such as the pretrained model name, number of labels, maximum text length, batch size, learning rate, number of epochs, and the number of predictions to show.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.99864024 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.99904436 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 45, - "code": "model.n_features_in_", - "class": "Model Evaluation", - "desc": "This code snippet outputs the number of features used to fit the logistic regression model, providing insight into the dimensionality of the input data for the model.", + "cell_id": 6, + "code": "# Get the Bert tokenizer\ntokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, \n do_lower_case=True)", + "class": "Imports and Environment", + "desc": "This code snippet initializes a BERT tokenizer using the pretrained model specified by the `PRETRAINED_MODEL_NAME` variable with the `do_lower_case` option set to true.", "testing": { "class": "Model_Train", - "subclass": "find_best_params", - "subclass_id": 2, - "predicted_subclass_probability": 0.2740468 + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9954691 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 47, - "code": "test_model_y = model.predict(features[text_n])", + "cell_id": 20, + "code": "# Get predictions in the validation dataset\nval_predictions = model.predict(ds_val_encoded)\nval_probabilities = softmax(val_predictions[0], axis=1)\ny_val_predictions = np.argmax(val_probabilities, axis=1).flatten()", "class": "Model Evaluation", - "desc": "This code snippet uses the trained logistic regression model to predict the target value for the 10th text sample's feature vector, evaluating the model's output for a specific instance.", + "desc": "This code snippet generates predictions for the validation dataset using the trained model, applies the softmax function to obtain class probabilities, and determines the predicted class labels by taking the argmax of the probabilities.", "testing": { "class": "Model_Evaluation", "subclass": "predict_on_test", "subclass_id": 48, - "predicted_subclass_probability": 0.99349266 + "predicted_subclass_probability": 0.9947659 }, "cluster": 0 }, { - "cell_id": 48, - "code": "test_model_y[0]", + "cell_id": 21, + "code": "# Compute metrics to evaluate the model\nclassification_metrics = classification_report(y_val, y_val_predictions)\n# Compute the area under the ROC curve\narea_under_the_curve = roc_auc_score(y_val, val_probabilities[:,1:2], multi_class=\"ovr\")\n# Compute the confusion matrix\nerror_matrix = confusion_matrix(y_val, y_val_predictions)\nprint(f'Area under the ROC curve: {area_under_the_curve}')\nprint(f'Classification metrics:\\n{classification_metrics}')\n# Plot the confusion matrix\nax = plt.axes()\nsns.heatmap(error_matrix, annot=True, fmt=\"d\")\nax.set_title('Confusion matrix Validation set')", "class": "Model Evaluation", - "desc": "This code snippet outputs the predicted target value for the 10th text sample, providing a direct view of the model's prediction for this instance.", + "desc": "This code snippet computes and prints various evaluation metrics such as the classification report, the area under the ROC curve, and the confusion matrix 
for the validation dataset, and also plots the confusion matrix using Seaborn's heatmap function.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9995427 + "class": "Visualization", + "subclass": "heatmap", + "subclass_id": 80, + "predicted_subclass_probability": 0.68382823 }, "cluster": 0 }, { - "cell_id": 52, - "code": "model_pipeline.classes_", + "cell_id": 22, + "code": "# Show some predictions in the validation dataset\nfor i in random.sample(range(len(val_dataset)), k=N_PREDICTIONS_TO_SHOW):\n print(f'\\nText: {X_test.values[i]}')\n print(f'Ground truth: {\"Real disaster\" if y_val.values[i]==1 else \"Not real disaster\"}')\n print(f'Predicted: {\"Real disaster\" if y_val_predictions[i]==1 else \"Not real disaster\"}')", "class": "Model Evaluation", - "desc": "This code snippet outputs the class labels known to the logistic regression model within the pipeline, providing insight into the target labels the model has been trained to predict.", + "desc": "This code snippet randomly selects a specified number of samples from the validation dataset to display their text, actual labels, and predicted labels, indicating whether each sample is about a real disaster or not.", "testing": { - "class": "Model_Train", - "subclass": "find_best_params", - "subclass_id": 2, - "predicted_subclass_probability": 0.4742122 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.49180317 }, "cluster": 0 }, { - "cell_id": 53, - "code": "len(model.coef_[0])", + "cell_id": 25, + "code": "test_predictions = model.predict(ds_test_encoded)\ntest_probabilities = softmax(test_predictions[0], axis=1)\ny_test_predictions = np.argmax(test_probabilities, axis=1).flatten()", "class": "Model Evaluation", - "desc": "This code snippet outputs the length of the coefficient vector for the first class of the logistic regression model, providing the number of features the model has been trained on.", + "desc": "This code snippet generates predictions for the test dataset using the trained model, applies the softmax function to obtain class probabilities, and determines the predicted class labels by taking the argmax of the probabilities.", "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9872142 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99444926 }, "cluster": 0 }, { - "cell_id": 54, - "code": "model.C_", - "class": "Model Evaluation", - "desc": "This code snippet outputs the inverse of the regularization strength parameter for the logistic regression model, providing information on the degree of regularization applied to the model.", - "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.84522194 - }, - "cluster": 0 - }, { - "cell_id": 56, - "code": "#model_pipeline.predict([\"Attention: bush fire reported!\"])", - "class": "Model Evaluation", - "desc": "This commented-out code snippet, if executed, would use the trained pipeline to predict the target value for a new input text sample, allowing for evaluation of the model's performance on unseen data.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9979772 - }, - 
"cluster": 1 - }, { - "cell_id": 57, - "code": "#model_pipeline.predict([\"Kids were playing in the park.\"])", - "class": "Model Evaluation", - "desc": "This commented-out code snippet, if executed, would use the trained pipeline to predict the target value for a new input text about kids playing in the park, allowing for evaluation of the model's performance on non-emergency text data.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.99806374 - }, - "cluster": 1 - }, { - "cell_id": 60, - "code": "#y_pred = model_pipeline.predict(X_train)\ny_pred = model_pipeline.predict(test_df[\"text\"])", - "class": "Model Evaluation", - "desc": "This code snippet uses the trained pipeline to predict target values for the text data in the test DataFrame, generating predicted labels for evaluating the model's performance.", - "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.9517882 - }, - "cluster": 0 - }, { - "cell_id": 61, - "code": "#print(precision_score(y_true=y_test, y_pred=y_pred))", - "class": "Model Evaluation", - "desc": "This commented-out code snippet, if executed, would calculate and print the precision score of the model's predictions on the test set, providing a measure of the model's accuracy in identifying relevant instances.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9712182 - }, - "cluster": 1 - }, { - "cell_id": 62, - "code": "#print(recall_score(y_true=y_test, y_pred=y_pred))", - "class": "Model Evaluation", - "desc": "This commented-out code snippet, if executed, would calculate and print the recall score of the model's predictions on the test set, indicating the model's ability to identify all relevant instances.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9629949 - }, - "cluster": 1 - }, { - "cell_id": 63, - "code": "#print(f1_score(y_true=y_test, y_pred=y_pred))", - "class": "Model Evaluation", - "desc": "This commented-out code snippet, if executed, would calculate and print the F1 score of the model's predictions on the test set, providing a balance between precision and recall for evaluating the model's performance.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.91732484 - }, - "cluster": 1 - }, { - "cell_id": 64, - "code": "print(metrics.classification_report(y_test, y_pred, labels = [1, 0], digits=5))", - "class": "Model Evaluation", - "desc": "This code snippet prints a detailed classification report, including precision, recall, F1 score, and support for each class, offering a comprehensive evaluation of the model's performance on the test set.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.8476077 - }, - "cluster": 0 - }, { - "cell_id": 65, - "code": "f1_1 = metrics.classification_report(y_test, y_pred, output_dict=True)[\"1\"][\"f1-score\"]\nf1_0 = metrics.classification_report(y_test, y_pred, output_dict=True)[\"0\"][\"f1-score\"]\nprint(\"Mean f1 score: {0:.5f}\".format((f1_1 + f1_0)/2))", - "class": "Model Evaluation", - "desc": "This code snippet calculates the F1 scores for both classes, averages 
them, and prints the mean F1 score, providing an overall measure of the model's balanced performance across both classes.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9809368 - }, - "cluster": 0 - }, { - "cell_id": 67, - "code": "model_pipeline.score(test_df[\"text\"], y_test)", - "class": "Model Evaluation", - "desc": "This code snippet computes and outputs the accuracy score of the logistic regression model within the pipeline on the test set, providing a measure of the overall correctness of the model's predictions.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.99815553 - }, - "cluster": 0 - }, { - "cell_id": 44, - "code": "model = lm.LogisticRegressionCV(**lr_model_params)\n#features = features[:,-2000:]\nmodel.fit(features, y_train)", - "class": "Model Training", - "desc": "This code snippet initializes a logistic regression model with cross-validation using the specified parameters and fits it to the TF-IDF features and corresponding training labels, training the model on the provided data.", + "cell_id": 16, + "code": "def get_model():\n # Define the configuration of the model\n config = AutoConfig.from_pretrained(PRETRAINED_MODEL_NAME,\n hidden_dropout_prob=0.2,\n num_labels=LABELS_NUMBER)\n # Model initialization\n model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, \n config=config)\n return model", + "class": "Model Training", + "desc": "This code snippet defines a function `get_model` that initializes the configuration and instance of a BERT model for sequence classification using `AutoConfig` and `TFBertForSequenceClassification` from the transformers library, while setting specific model parameters like hidden dropout probability and the number of labels.", "testing": { "class": "Model_Train", - "subclass": "train_on_grid", - "subclass_id": 6, - "predicted_subclass_probability": 0.86612976 + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.98904186 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 50, - "code": "model_pipeline = Pipeline([\n (\"vectorizer\", vectorizer),\n (\"model\", model)\n]\n)", + "cell_id": 17, + "code": "# Model initialization\nmodel = get_model()\n\n# Define the optimizer, the loss function and metrics\noptimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)\nloss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\nmetric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n# Compile the model\nmodel.compile(optimizer=optimizer, loss=loss, metrics=[metric])", "class": "Model Training", - "desc": "This code snippet sets up a machine learning pipeline that combines the TfidfVectorizer and the logistic regression model into a single workflow, streamlining the process of transforming text data and making predictions.", + "desc": "This code snippet initializes the BERT model using the `get_model` function, defines the optimizer (Adam), loss function (SparseCategoricalCrossentropy), and accuracy metric, and then compiles the model using TensorFlow's `compile` method.", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, - "predicted_subclass_probability": 0.92996585 + "predicted_subclass_probability": 0.9943469 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 51, - "code": "model_pipeline.fit(X_train, y_train)", + "cell_id": 18, 
+ "code": "# Scaling by total/2 helps keep the loss to a similar magnitude.\n# The sum of the weights of all examples stays the same.\nweight_for_0 = (1 / n_negative_training_examples)*(n_training_examples)/2.0 \nweight_for_1 = (1 / n_positive_training_examples)*(n_training_examples)/2.0\n\nclass_weight = {0: weight_for_0, 1: weight_for_1}\n\nprint('Weight for class 0: {:.2f}'.format(weight_for_0))\nprint('Weight for class 1: {:.2f}'.format(weight_for_1))", "class": "Model Training", - "desc": "This code snippet fits the pipeline to the training text data and labels, which involves both transforming the text into TF-IDF features using the vectorizer and training the logistic regression model on the transformed features.", + "desc": "This code snippet calculates and prints the class weights for handling class imbalance during model training, scaling weights based on the number of positive and negative training examples to ensure balanced contributions from each class.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.9997104 - }, - "cluster": -1 - }, { - "cell_id": 55, - "code": "model_pipeline.named_steps", - "class": "Model Training", - "desc": "This code snippet outputs the named steps of the pipeline, showing the components (vectorizer and logistic regression model) and their configuration within the pipeline.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.3446452 + "predicted_subclass_probability": 0.44395435 }, "cluster": -1 }, { - "cell_id": 58, - "code": "#model_pipeline.get_params()", + "cell_id": 19, + "code": "# Train the model\nmodel.fit(ds_train_encoded, epochs=EPOCHS_NUMBER, validation_data=ds_val_encoded,\n class_weight = class_weight)", "class": "Model Training", - "desc": "This commented-out code snippet, if executed, would retrieve the parameters and their values for each step in the pipeline, providing detailed information about the pipeline's configuration.", + "desc": "This code snippet trains the BERT model using the training dataset, specified number of epochs, validation dataset, and class weights to handle class imbalance during training with the `fit` method.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9974099 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9996909 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 66, - "code": "plot_precision_recall_curve(estimator=model_pipeline, X=test_df[\"text\"], y=y_test)", + "cell_id": 5, + "code": "sns.countplot(train_data[\"target\"])", "class": "Visualization", - "desc": "This code snippet generates a precision-recall curve for the logistic regression model's predictions on the test set, visualizing the trade-off between precision and recall at different thresholds.", + "desc": "This code snippet creates and displays a count plot using Seaborn to visualize the distribution of the target variable in the training data.", "testing": { "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.6427428 + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9974095 }, - "cluster": 0 + "cluster": -1 }], - "notebook_id": 13, - "notebook_name": "nlp-disaster-tweets-tfidf-lr" + "notebook_id": 11, + 
"notebook_name": "detecting-disaster-tweets-fine-tuning-bert.ipynb" }, { "cells": [{ - "cell_id": 28, - "code": "submission['target'] = test_pred_BERT_int\nsubmission.to_csv(\"submission_BERT.csv\", index=False, header=True)", + "cell_id": 34, + "code": "def submission(model, test):\n sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\n predictions = model.predict(test)\n y_preds = [ int(i) for i in np.rint(predictions)]\n sub = pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_preds})\n sub.to_csv('submission.csv', index=False)", "class": "Data Export", - "desc": "This code snippet updates the submission DataFrame with the predicted target values from the BERT model and saves it as a CSV file named \"submission_BERT.csv\".", + "desc": "The code defines a function `submission` that generates predictions for the test dataset using the trained model, creates a new DataFrame with the predicted labels, and saves it as a 'submission.csv' file for submission.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9993593 + "predicted_subclass_probability": 0.9988925 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 1, - "code": "train_df = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\nsubmission = pd.read_csv(\"../input/nlp-getting-started/sample_submission.csv\")\n\nprint(\"Training Shape rows = {}, columns = {}\".format(train_df.shape[0],train_df.shape[1]))\nprint(\"Testing Shape rows = {}, columns = {}\".format(test_df.shape[0],test_df.shape[1]))", - "class": "Data Extraction", - "desc": "This code snippet reads training, testing, and sample submission datasets from CSV files and prints their shapes to confirm the number of rows and columns in each.", + "cell_id": 35, + "code": "submission(bert_classifier, test_ds)", + "class": "Data Export", + "desc": "The code calls the `submission` function to generate predictions for the test dataset using the `bert_classifier` model and saves the results to a 'submission.csv' file.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99957687 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.61737144 }, "cluster": -1 }, { - "cell_id": 23, - "code": "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\ndo_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\ntokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)", + "cell_id": 4, + "code": "train_full = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\ntest_full = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')\n\nprint('Training Set Shape = {}'.format(train_full.shape))\nprint('Training Set Memory Usage = {:.2f}MB'.format(train_full.memory_usage().sum()/2**20))\n\nprint('Test Set Shape = {}'.format(test_full.shape))\nprint('Test Set Memory Usage = {:.2f}MB'.format(test_full.memory_usage().sum()/2**20))", "class": "Data Extraction", - "desc": "This code snippet extracts the vocabulary file and lowercase setting from the BERT layer and uses them to initialize a tokenizer for text preprocessing.", + "desc": "The code reads training and test datasets from CSV files into Pandas DataFrames and prints their shapes and memory usage.", "testing": { - "class": "Data_Transform", - "subclass": 
"feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.56229645 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9972459 }, - "cluster": -1 + "cluster": 0 }, { "cell_id": 11, - "code": "keyword_dist = train_df.groupby(\"keyword\")['target'].value_counts().unstack(fill_value=0)\nkeyword_dist = keyword_dist.add_prefix(keyword_dist.columns.name).rename_axis(columns=None).reset_index()", - "class": "Data Transform", - "desc": "This code snippet creates a new DataFrame that groups the training data by 'keyword' and counts the occurrences of each target value, filling missing values with zero and flattening the resulting DataFrame.", + "code": "# Read commited-dataset\ndf_train = pd.read_csv(\"/kaggle/input/disastertweet-prepared2/train_prepared.csv\")\ndf_test = pd.read_csv(\"/kaggle/input/disastertweet-prepared2/test_prepared.csv\")", + "class": "Data Extraction", + "desc": "The code reads prepared training and test datasets from CSV files into Pandas DataFrames.", "testing": { - "class": "Data_Transform", - "subclass": "groupby", - "subclass_id": 60, - "predicted_subclass_probability": 0.8236565 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99974114 }, - "cluster": 1 + "cluster": 0 }, { "cell_id": 12, - "code": "keyword_dist.sort_values('target1',ascending = False).head(10)", + "code": "# Only apply 'keyword' columns in full data, because other features cleaned in df_train/test\ntrain_full = clean_text(train_full,'keyword')\ntest_full = clean_text(test_full, 'keyword')", "class": "Data Transform", - "desc": "This code snippet sorts the previously created keyword distribution DataFrame by the count of 'target1' (disastrous tweets) in descending order and displays the top 10 keywords with the highest count of disastrous tweets.", + "desc": "The code cleans the 'keyword' column in both the training and test datasets using the `clean_text` function from the DataPrep library.", "testing": { "class": "Data_Transform", - "subclass": "sort_values", - "subclass_id": 9, - "predicted_subclass_probability": 0.8409343 + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.9898755 }, - "cluster": 1 + "cluster": 2 }, { "cell_id": 13, - "code": "keyword_dist.sort_values('target0',ascending = False).head(10)", - "class": "Data Transform", - "desc": "This code snippet sorts the keyword distribution DataFrame by the count of 'target0' (non-disastrous tweets) in descending order and displays the top 10 keywords with the highest count of non-disastrous tweets.", - "testing": { - "class": "Data_Transform", - "subclass": "sort_values", - "subclass_id": 9, - "predicted_subclass_probability": 0.7933884 - }, - "cluster": 1 - }, { - "cell_id": 14, - "code": "#word count\ntrain_df['word_count'] = train_df['text'].apply(lambda x : len(str(x).split()))\ntest_df['word_count'] = test_df['text'].apply(lambda x : len(str(x).split()))\n#Unique word count\ntrain_df['unique_word_count'] = train_df['text'].apply(lambda x : len(set(str(x).split())))\ntest_df['unique_word_count'] = test_df['text'].apply(lambda x : len(set(str(x).split())))\n#Count of letters\ntrain_df['count_letters'] = train_df['text'].apply(lambda x : len(str(x)))\ntest_df['count_letters'] = test_df['text'].apply(lambda x : len(str(x)))\n#Count of punctuations\ntrain_df['count_punctuations'] = train_df['text'].apply(lambda x: len([c for c in str(x) if c in 
string.punctuation]))\ntest_df['count_punctuations'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))\n#count of stopwords\ntrain_df['stop_word_count'] = train_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))\ntest_df['stop_word_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))\n#Count of hashtag\ntrain_df['hashtag_count'] = train_df['text'].apply(lambda x : len([c for c in str(x) if c == '#']))\ntest_df['hashtag_count'] = test_df['text'].apply(lambda x : len([c for c in str(x) if c == '#']))\n#Count of mentions\ntrain_df['mention_count'] = train_df['text'].apply(lambda x : len([c for c in str(x) if c=='@']))\ntest_df['mention_count'] = test_df['text'].apply(lambda x : len([c for c in str(x) if c=='@']))", + "code": "# Adding cleaned data into df_train/test\ndf_train['keyword'] = train_full['keyword']\ndf_test['keyword'] = test_full['keyword']", "class": "Data Transform", - "desc": "This code snippet adds several new columns to both the training and testing datasets, computing various text-related features such as word count, unique word count, letter count, punctuation count, stop word count, hashtag count, and mention count for each tweet.", + "desc": "The code updates the prepared training and test datasets by adding the cleaned 'keyword' column from the full datasets into `df_train` and `df_test` DataFrames.", "testing": { "class": "Data_Transform", "subclass": "feature_engineering", "subclass_id": 8, - "predicted_subclass_probability": 0.9991715 + "predicted_subclass_probability": 0.99662316 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 19, - "code": "# Refrenced from Gunes Evitan and Vitalii Mokin Notebook\ndef clean(tweet): \n \n # Special characters\n tweet = re.sub(r\"\\x89\u00db_\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00d2\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00d3\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00cfWhen\", \"When\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00cf\", \"\", tweet)\n tweet = re.sub(r\"China\\x89\u00db\u00aas\", \"China's\", tweet)\n tweet = re.sub(r\"let\\x89\u00db\u00aas\", \"let's\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00f7\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00aa\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\\x9d\", \"\", tweet)\n tweet = re.sub(r\"\u00e5_\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00a2\", \"\", tweet)\n tweet = re.sub(r\"\\x89\u00db\u00a2\u00e5\u00ca\", \"\", tweet)\n tweet = re.sub(r\"from\u00e5\u00cawounds\", \"from wounds\", tweet)\n tweet = re.sub(r\"\u00e5\u00ca\", \"\", tweet)\n tweet = re.sub(r\"\u00e5\u00c8\", \"\", tweet)\n tweet = re.sub(r\"Jap\u00cc_n\", \"Japan\", tweet) \n tweet = re.sub(r\"\u00cc\u00a9\", \"e\", tweet)\n tweet = re.sub(r\"\u00e5\u00a8\", \"\", tweet)\n tweet = re.sub(r\"Suru\u00cc\u00a4\", \"Suruc\", tweet)\n tweet = re.sub(r\"\u00e5\u00c7\", \"\", tweet)\n tweet = re.sub(r\"\u00e5\u00a33million\", \"3 million\", tweet)\n tweet = re.sub(r\"\u00e5\u00c0\", \"\", tweet)\n \n # Contractions\n tweet = re.sub(r\"he's\", \"he is\", tweet)\n tweet = re.sub(r\"there's\", \"there is\", tweet)\n tweet = re.sub(r\"We're\", \"We are\", tweet)\n tweet = re.sub(r\"That's\", \"That is\", tweet)\n tweet = re.sub(r\"won't\", \"will not\", tweet)\n tweet = re.sub(r\"they're\", \"they are\", tweet)\n tweet = re.sub(r\"Can't\", \"Cannot\", tweet)\n tweet = re.sub(r\"wasn't\", \"was not\", tweet)\n tweet = 
re.sub(r\"don\\x89\u00db\u00aat\", \"do not\", tweet)\n tweet = re.sub(r\"aren't\", \"are not\", tweet)\n tweet = re.sub(r\"isn't\", \"is not\", tweet)\n tweet = re.sub(r\"What's\", \"What is\", tweet)\n tweet = re.sub(r\"haven't\", \"have not\", tweet)\n tweet = re.sub(r\"hasn't\", \"has not\", tweet)\n tweet = re.sub(r\"There's\", \"There is\", tweet)\n tweet = re.sub(r\"He's\", \"He is\", tweet)\n tweet = re.sub(r\"It's\", \"It is\", tweet)\n tweet = re.sub(r\"You're\", \"You are\", tweet)\n tweet = re.sub(r\"I'M\", \"I am\", tweet)\n tweet = re.sub(r\"shouldn't\", \"should not\", tweet)\n tweet = re.sub(r\"wouldn't\", \"would not\", tweet)\n tweet = re.sub(r\"i'm\", \"I am\", tweet)\n tweet = re.sub(r\"I\\x89\u00db\u00aam\", \"I am\", tweet)\n tweet = re.sub(r\"I'm\", \"I am\", tweet)\n tweet = re.sub(r\"Isn't\", \"is not\", tweet)\n tweet = re.sub(r\"Here's\", \"Here is\", tweet)\n tweet = re.sub(r\"you've\", \"you have\", tweet)\n tweet = re.sub(r\"you\\x89\u00db\u00aave\", \"you have\", tweet)\n tweet = re.sub(r\"we're\", \"we are\", tweet)\n tweet = re.sub(r\"what's\", \"what is\", tweet)\n tweet = re.sub(r\"couldn't\", \"could not\", tweet)\n tweet = re.sub(r\"we've\", \"we have\", tweet)\n tweet = re.sub(r\"it\\x89\u00db\u00aas\", \"it is\", tweet)\n tweet = re.sub(r\"doesn\\x89\u00db\u00aat\", \"does not\", tweet)\n tweet = re.sub(r\"It\\x89\u00db\u00aas\", \"It is\", tweet)\n tweet = re.sub(r\"Here\\x89\u00db\u00aas\", \"Here is\", tweet)\n tweet = re.sub(r\"who's\", \"who is\", tweet)\n tweet = re.sub(r\"I\\x89\u00db\u00aave\", \"I have\", tweet)\n tweet = re.sub(r\"y'all\", \"you all\", tweet)\n tweet = re.sub(r\"can\\x89\u00db\u00aat\", \"cannot\", tweet)\n tweet = re.sub(r\"would've\", \"would have\", tweet)\n tweet = re.sub(r\"it'll\", \"it will\", tweet)\n tweet = re.sub(r\"we'll\", \"we will\", tweet)\n tweet = re.sub(r\"wouldn\\x89\u00db\u00aat\", \"would not\", tweet)\n tweet = re.sub(r\"We've\", \"We have\", tweet)\n tweet = re.sub(r\"he'll\", \"he will\", tweet)\n tweet = re.sub(r\"Y'all\", \"You all\", tweet)\n tweet = re.sub(r\"Weren't\", \"Were not\", tweet)\n tweet = re.sub(r\"Didn't\", \"Did not\", tweet)\n tweet = re.sub(r\"they'll\", \"they will\", tweet)\n tweet = re.sub(r\"they'd\", \"they would\", tweet)\n tweet = re.sub(r\"DON'T\", \"DO NOT\", tweet)\n tweet = re.sub(r\"That\\x89\u00db\u00aas\", \"That is\", tweet)\n tweet = re.sub(r\"they've\", \"they have\", tweet)\n tweet = re.sub(r\"i'd\", \"I would\", tweet)\n tweet = re.sub(r\"should've\", \"should have\", tweet)\n tweet = re.sub(r\"You\\x89\u00db\u00aare\", \"You are\", tweet)\n tweet = re.sub(r\"where's\", \"where is\", tweet)\n tweet = re.sub(r\"Don\\x89\u00db\u00aat\", \"Do not\", tweet)\n tweet = re.sub(r\"we'd\", \"we would\", tweet)\n tweet = re.sub(r\"i'll\", \"I will\", tweet)\n tweet = re.sub(r\"weren't\", \"were not\", tweet)\n tweet = re.sub(r\"They're\", \"They are\", tweet)\n tweet = re.sub(r\"Can\\x89\u00db\u00aat\", \"Cannot\", tweet)\n tweet = re.sub(r\"you\\x89\u00db\u00aall\", \"you will\", tweet)\n tweet = re.sub(r\"I\\x89\u00db\u00aad\", \"I would\", tweet)\n tweet = re.sub(r\"let's\", \"let us\", tweet)\n tweet = re.sub(r\"it's\", \"it is\", tweet)\n tweet = re.sub(r\"can't\", \"cannot\", tweet)\n tweet = re.sub(r\"don't\", \"do not\", tweet)\n tweet = re.sub(r\"you're\", \"you are\", tweet)\n tweet = re.sub(r\"i've\", \"I have\", tweet)\n tweet = re.sub(r\"that's\", \"that is\", tweet)\n tweet = re.sub(r\"i'll\", \"I will\", tweet)\n tweet = re.sub(r\"doesn't\", \"does not\", 
tweet)\n tweet = re.sub(r\"i'd\", \"I would\", tweet)\n tweet = re.sub(r\"didn't\", \"did not\", tweet)\n tweet = re.sub(r\"ain't\", \"am not\", tweet)\n tweet = re.sub(r\"you'll\", \"you will\", tweet)\n tweet = re.sub(r\"I've\", \"I have\", tweet)\n tweet = re.sub(r\"Don't\", \"do not\", tweet)\n tweet = re.sub(r\"I'll\", \"I will\", tweet)\n tweet = re.sub(r\"I'd\", \"I would\", tweet)\n tweet = re.sub(r\"Let's\", \"Let us\", tweet)\n tweet = re.sub(r\"you'd\", \"You would\", tweet)\n tweet = re.sub(r\"It's\", \"It is\", tweet)\n tweet = re.sub(r\"Ain't\", \"am not\", tweet)\n tweet = re.sub(r\"Haven't\", \"Have not\", tweet)\n tweet = re.sub(r\"Could've\", \"Could have\", tweet)\n tweet = re.sub(r\"youve\", \"you have\", tweet) \n tweet = re.sub(r\"don\u00e5\u00abt\", \"do not\", tweet) \n \n # Character entity references\n tweet = re.sub(r\">\", \">\", tweet)\n tweet = re.sub(r\"<\", \"<\", tweet)\n tweet = re.sub(r\"&\", \"&\", tweet)\n \n # Typos, slang and informal abbreviations\n tweet = re.sub(r\"w/e\", \"whatever\", tweet)\n tweet = re.sub(r\"w/\", \"with\", tweet)\n tweet = re.sub(r\"USAgov\", \"USA government\", tweet)\n tweet = re.sub(r\"recentlu\", \"recently\", tweet)\n tweet = re.sub(r\"Ph0tos\", \"Photos\", tweet)\n tweet = re.sub(r\"amirite\", \"am I right\", tweet)\n tweet = re.sub(r\"exp0sed\", \"exposed\", tweet)\n tweet = re.sub(r\"<3\", \"love\", tweet)\n tweet = re.sub(r\"amageddon\", \"armageddon\", tweet)\n tweet = re.sub(r\"Trfc\", \"Traffic\", tweet)\n tweet = re.sub(r\"8/5/2015\", \"2015-08-05\", tweet)\n tweet = re.sub(r\"WindStorm\", \"Wind Storm\", tweet)\n tweet = re.sub(r\"8/6/2015\", \"2015-08-06\", tweet)\n tweet = re.sub(r\"10:38PM\", \"10:38 PM\", tweet)\n tweet = re.sub(r\"10:30pm\", \"10:30 PM\", tweet)\n tweet = re.sub(r\"16yr\", \"16 year\", tweet)\n tweet = re.sub(r\"lmao\", \"laughing my ass off\", tweet) \n tweet = re.sub(r\"TRAUMATISED\", \"traumatized\", tweet)\n \n # Hashtags and usernames\n tweet = re.sub(r\"IranDeal\", \"Iran Deal\", tweet)\n tweet = re.sub(r\"ArianaGrande\", \"Ariana Grande\", tweet)\n tweet = re.sub(r\"camilacabello97\", \"camila cabello\", tweet) \n tweet = re.sub(r\"RondaRousey\", \"Ronda Rousey\", tweet) \n tweet = re.sub(r\"MTVHottest\", \"MTV Hottest\", tweet)\n tweet = re.sub(r\"TrapMusic\", \"Trap Music\", tweet)\n tweet = re.sub(r\"ProphetMuhammad\", \"Prophet Muhammad\", tweet)\n tweet = re.sub(r\"PantherAttack\", \"Panther Attack\", tweet)\n tweet = re.sub(r\"StrategicPatience\", \"Strategic Patience\", tweet)\n tweet = re.sub(r\"socialnews\", \"social news\", tweet)\n tweet = re.sub(r\"NASAHurricane\", \"NASA Hurricane\", tweet)\n tweet = re.sub(r\"onlinecommunities\", \"online communities\", tweet)\n tweet = re.sub(r\"humanconsumption\", \"human consumption\", tweet)\n tweet = re.sub(r\"Typhoon-Devastated\", \"Typhoon Devastated\", tweet)\n tweet = re.sub(r\"Meat-Loving\", \"Meat Loving\", tweet)\n tweet = re.sub(r\"facialabuse\", \"facial abuse\", tweet)\n tweet = re.sub(r\"LakeCounty\", \"Lake County\", tweet)\n tweet = re.sub(r\"BeingAuthor\", \"Being Author\", tweet)\n tweet = re.sub(r\"withheavenly\", \"with heavenly\", tweet)\n tweet = re.sub(r\"thankU\", \"thank you\", tweet)\n tweet = re.sub(r\"iTunesMusic\", \"iTunes Music\", tweet)\n tweet = re.sub(r\"OffensiveContent\", \"Offensive Content\", tweet)\n tweet = re.sub(r\"WorstSummerJob\", \"Worst Summer Job\", tweet)\n tweet = re.sub(r\"HarryBeCareful\", \"Harry Be Careful\", tweet)\n tweet = re.sub(r\"NASASolarSystem\", \"NASA Solar System\", 
tweet)\n tweet = re.sub(r\"animalrescue\", \"animal rescue\", tweet)\n tweet = re.sub(r\"KurtSchlichter\", \"Kurt Schlichter\", tweet)\n tweet = re.sub(r\"aRmageddon\", \"armageddon\", tweet)\n tweet = re.sub(r\"Throwingknifes\", \"Throwing knives\", tweet)\n tweet = re.sub(r\"GodsLove\", \"God's Love\", tweet)\n tweet = re.sub(r\"bookboost\", \"book boost\", tweet)\n tweet = re.sub(r\"ibooklove\", \"I book love\", tweet)\n tweet = re.sub(r\"NestleIndia\", \"Nestle India\", tweet)\n tweet = re.sub(r\"realDonaldTrump\", \"Donald Trump\", tweet)\n tweet = re.sub(r\"DavidVonderhaar\", \"David Vonderhaar\", tweet)\n tweet = re.sub(r\"CecilTheLion\", \"Cecil The Lion\", tweet)\n tweet = re.sub(r\"weathernetwork\", \"weather network\", tweet)\n tweet = re.sub(r\"withBioterrorism&use\", \"with Bioterrorism & use\", tweet)\n tweet = re.sub(r\"Hostage&2\", \"Hostage & 2\", tweet)\n tweet = re.sub(r\"GOPDebate\", \"GOP Debate\", tweet)\n tweet = re.sub(r\"RickPerry\", \"Rick Perry\", tweet)\n tweet = re.sub(r\"frontpage\", \"front page\", tweet)\n tweet = re.sub(r\"NewsInTweets\", \"News In Tweets\", tweet)\n tweet = re.sub(r\"ViralSpell\", \"Viral Spell\", tweet)\n tweet = re.sub(r\"til_now\", \"until now\", tweet)\n tweet = re.sub(r\"volcanoinRussia\", \"volcano in Russia\", tweet)\n tweet = re.sub(r\"ZippedNews\", \"Zipped News\", tweet)\n tweet = re.sub(r\"MicheleBachman\", \"Michele Bachman\", tweet)\n tweet = re.sub(r\"53inch\", \"53 inch\", tweet)\n tweet = re.sub(r\"KerrickTrial\", \"Kerrick Trial\", tweet)\n tweet = re.sub(r\"abstorm\", \"Alberta Storm\", tweet)\n tweet = re.sub(r\"Beyhive\", \"Beyonce hive\", tweet)\n tweet = re.sub(r\"IDFire\", \"Idaho Fire\", tweet)\n tweet = re.sub(r\"DETECTADO\", \"Detected\", tweet)\n tweet = re.sub(r\"RockyFire\", \"Rocky Fire\", tweet)\n tweet = re.sub(r\"Listen/Buy\", \"Listen / Buy\", tweet)\n tweet = re.sub(r\"NickCannon\", \"Nick Cannon\", tweet)\n tweet = re.sub(r\"FaroeIslands\", \"Faroe Islands\", tweet)\n tweet = re.sub(r\"yycstorm\", \"Calgary Storm\", tweet)\n tweet = re.sub(r\"IDPs:\", \"Internally Displaced People :\", tweet)\n tweet = re.sub(r\"ArtistsUnited\", \"Artists United\", tweet)\n tweet = re.sub(r\"ClaytonBryant\", \"Clayton Bryant\", tweet)\n tweet = re.sub(r\"jimmyfallon\", \"jimmy fallon\", tweet)\n tweet = re.sub(r\"justinbieber\", \"justin bieber\", tweet) \n tweet = re.sub(r\"UTC2015\", \"UTC 2015\", tweet)\n tweet = re.sub(r\"Time2015\", \"Time 2015\", tweet)\n tweet = re.sub(r\"djicemoon\", \"dj icemoon\", tweet)\n tweet = re.sub(r\"LivingSafely\", \"Living Safely\", tweet)\n tweet = re.sub(r\"FIFA16\", \"Fifa 2016\", tweet)\n tweet = re.sub(r\"thisiswhywecanthavenicethings\", \"this is why we cannot have nice things\", tweet)\n tweet = re.sub(r\"bbcnews\", \"bbc news\", tweet)\n tweet = re.sub(r\"UndergroundRailraod\", \"Underground Railraod\", tweet)\n tweet = re.sub(r\"c4news\", \"c4 news\", tweet)\n tweet = re.sub(r\"OBLITERATION\", \"obliteration\", tweet)\n tweet = re.sub(r\"MUDSLIDE\", \"mudslide\", tweet)\n tweet = re.sub(r\"NoSurrender\", \"No Surrender\", tweet)\n tweet = re.sub(r\"NotExplained\", \"Not Explained\", tweet)\n tweet = re.sub(r\"greatbritishbakeoff\", \"great british bake off\", tweet)\n tweet = re.sub(r\"LondonFire\", \"London Fire\", tweet)\n tweet = re.sub(r\"KOTAWeather\", \"KOTA Weather\", tweet)\n tweet = re.sub(r\"LuchaUnderground\", \"Lucha Underground\", tweet)\n tweet = re.sub(r\"KOIN6News\", \"KOIN 6 News\", tweet)\n tweet = re.sub(r\"LiveOnK2\", \"Live On K2\", tweet)\n tweet = 
re.sub(r\"9NewsGoldCoast\", \"9 News Gold Coast\", tweet)\n tweet = re.sub(r\"nikeplus\", \"nike plus\", tweet)\n tweet = re.sub(r\"david_cameron\", \"David Cameron\", tweet)\n tweet = re.sub(r\"peterjukes\", \"Peter Jukes\", tweet)\n tweet = re.sub(r\"JamesMelville\", \"James Melville\", tweet)\n tweet = re.sub(r\"megynkelly\", \"Megyn Kelly\", tweet)\n tweet = re.sub(r\"cnewslive\", \"C News Live\", tweet)\n tweet = re.sub(r\"JamaicaObserver\", \"Jamaica Observer\", tweet)\n tweet = re.sub(r\"TweetLikeItsSeptember11th2001\", \"Tweet like it is september 11th 2001\", tweet)\n tweet = re.sub(r\"cbplawyers\", \"cbp lawyers\", tweet)\n tweet = re.sub(r\"fewmoretweets\", \"few more tweets\", tweet)\n tweet = re.sub(r\"BlackLivesMatter\", \"Black Lives Matter\", tweet)\n tweet = re.sub(r\"cjoyner\", \"Chris Joyner\", tweet)\n tweet = re.sub(r\"ENGvAUS\", \"England vs Australia\", tweet)\n tweet = re.sub(r\"ScottWalker\", \"Scott Walker\", tweet)\n tweet = re.sub(r\"MikeParrActor\", \"Michael Parr\", tweet)\n tweet = re.sub(r\"4PlayThursdays\", \"Foreplay Thursdays\", tweet)\n tweet = re.sub(r\"TGF2015\", \"Tontitown Grape Festival\", tweet)\n tweet = re.sub(r\"realmandyrain\", \"Mandy Rain\", tweet)\n tweet = re.sub(r\"GraysonDolan\", \"Grayson Dolan\", tweet)\n tweet = re.sub(r\"ApolloBrown\", \"Apollo Brown\", tweet)\n tweet = re.sub(r\"saddlebrooke\", \"Saddlebrooke\", tweet)\n tweet = re.sub(r\"TontitownGrape\", \"Tontitown Grape\", tweet)\n tweet = re.sub(r\"AbbsWinston\", \"Abbs Winston\", tweet)\n tweet = re.sub(r\"ShaunKing\", \"Shaun King\", tweet)\n tweet = re.sub(r\"MeekMill\", \"Meek Mill\", tweet)\n tweet = re.sub(r\"TornadoGiveaway\", \"Tornado Giveaway\", tweet)\n tweet = re.sub(r\"GRupdates\", \"GR updates\", tweet)\n tweet = re.sub(r\"SouthDowns\", \"South Downs\", tweet)\n tweet = re.sub(r\"braininjury\", \"brain injury\", tweet)\n tweet = re.sub(r\"auspol\", \"Australian politics\", tweet)\n tweet = re.sub(r\"PlannedParenthood\", \"Planned Parenthood\", tweet)\n tweet = re.sub(r\"calgaryweather\", \"Calgary Weather\", tweet)\n tweet = re.sub(r\"weallheartonedirection\", \"we all heart one direction\", tweet)\n tweet = re.sub(r\"edsheeran\", \"Ed Sheeran\", tweet)\n tweet = re.sub(r\"TrueHeroes\", \"True Heroes\", tweet)\n tweet = re.sub(r\"S3XLEAK\", \"sex leak\", tweet)\n tweet = re.sub(r\"ComplexMag\", \"Complex Magazine\", tweet)\n tweet = re.sub(r\"TheAdvocateMag\", \"The Advocate Magazine\", tweet)\n tweet = re.sub(r\"CityofCalgary\", \"City of Calgary\", tweet)\n tweet = re.sub(r\"EbolaOutbreak\", \"Ebola Outbreak\", tweet)\n tweet = re.sub(r\"SummerFate\", \"Summer Fate\", tweet)\n tweet = re.sub(r\"RAmag\", \"Royal Academy Magazine\", tweet)\n tweet = re.sub(r\"offers2go\", \"offers to go\", tweet)\n tweet = re.sub(r\"foodscare\", \"food scare\", tweet)\n tweet = re.sub(r\"MNPDNashville\", \"Metropolitan Nashville Police Department\", tweet)\n tweet = re.sub(r\"TfLBusAlerts\", \"TfL Bus Alerts\", tweet)\n tweet = re.sub(r\"GamerGate\", \"Gamer Gate\", tweet)\n tweet = re.sub(r\"IHHen\", \"Humanitarian Relief\", tweet)\n tweet = re.sub(r\"spinningbot\", \"spinning bot\", tweet)\n tweet = re.sub(r\"ModiMinistry\", \"Modi Ministry\", tweet)\n tweet = re.sub(r\"TAXIWAYS\", \"taxi ways\", tweet)\n tweet = re.sub(r\"Calum5SOS\", \"Calum Hood\", tweet)\n tweet = re.sub(r\"po_st\", \"po.st\", tweet)\n tweet = re.sub(r\"scoopit\", \"scoop.it\", tweet)\n tweet = re.sub(r\"UltimaLucha\", \"Ultima Lucha\", tweet)\n tweet = re.sub(r\"JonathanFerrell\", \"Jonathan Ferrell\", 
tweet)\n tweet = re.sub(r\"aria_ahrary\", \"Aria Ahrary\", tweet)\n tweet = re.sub(r\"rapidcity\", \"Rapid City\", tweet)\n tweet = re.sub(r\"OutBid\", \"outbid\", tweet)\n tweet = re.sub(r\"lavenderpoetrycafe\", \"lavender poetry cafe\", tweet)\n tweet = re.sub(r\"EudryLantiqua\", \"Eudry Lantiqua\", tweet)\n tweet = re.sub(r\"15PM\", \"15 PM\", tweet)\n tweet = re.sub(r\"OriginalFunko\", \"Funko\", tweet)\n tweet = re.sub(r\"rightwaystan\", \"Richard Tan\", tweet)\n tweet = re.sub(r\"CindyNoonan\", \"Cindy Noonan\", tweet)\n tweet = re.sub(r\"RT_America\", \"RT America\", tweet)\n tweet = re.sub(r\"narendramodi\", \"Narendra Modi\", tweet)\n tweet = re.sub(r\"BakeOffFriends\", \"Bake Off Friends\", tweet)\n tweet = re.sub(r\"TeamHendrick\", \"Hendrick Motorsports\", tweet)\n tweet = re.sub(r\"alexbelloli\", \"Alex Belloli\", tweet)\n tweet = re.sub(r\"itsjustinstuart\", \"Justin Stuart\", tweet)\n tweet = re.sub(r\"gunsense\", \"gun sense\", tweet)\n tweet = re.sub(r\"DebateQuestionsWeWantToHear\", \"debate questions we want to hear\", tweet)\n tweet = re.sub(r\"RoyalCarribean\", \"Royal Carribean\", tweet)\n tweet = re.sub(r\"samanthaturne19\", \"Samantha Turner\", tweet)\n tweet = re.sub(r\"JonVoyage\", \"Jon Stewart\", tweet)\n tweet = re.sub(r\"renew911health\", \"renew 911 health\", tweet)\n tweet = re.sub(r\"SuryaRay\", \"Surya Ray\", tweet)\n tweet = re.sub(r\"pattonoswalt\", \"Patton Oswalt\", tweet)\n tweet = re.sub(r\"minhazmerchant\", \"Minhaz Merchant\", tweet)\n tweet = re.sub(r\"TLVFaces\", \"Israel Diaspora Coalition\", tweet)\n tweet = re.sub(r\"pmarca\", \"Marc Andreessen\", tweet)\n tweet = re.sub(r\"pdx911\", \"Portland Police\", tweet)\n tweet = re.sub(r\"jamaicaplain\", \"Jamaica Plain\", tweet)\n tweet = re.sub(r\"Japton\", \"Arkansas\", tweet)\n tweet = re.sub(r\"RouteComplex\", \"Route Complex\", tweet)\n tweet = re.sub(r\"INSubcontinent\", \"Indian Subcontinent\", tweet)\n tweet = re.sub(r\"NJTurnpike\", \"New Jersey Turnpike\", tweet)\n tweet = re.sub(r\"Politifiact\", \"PolitiFact\", tweet)\n tweet = re.sub(r\"Hiroshima70\", \"Hiroshima\", tweet)\n tweet = re.sub(r\"GMMBC\", \"Greater Mt Moriah Baptist Church\", tweet)\n tweet = re.sub(r\"versethe\", \"verse the\", tweet)\n tweet = re.sub(r\"TubeStrike\", \"Tube Strike\", tweet)\n tweet = re.sub(r\"MissionHills\", \"Mission Hills\", tweet)\n tweet = re.sub(r\"ProtectDenaliWolves\", \"Protect Denali Wolves\", tweet)\n tweet = re.sub(r\"NANKANA\", \"Nankana\", tweet)\n tweet = re.sub(r\"SAHIB\", \"Sahib\", tweet)\n tweet = re.sub(r\"PAKPATTAN\", \"Pakpattan\", tweet)\n tweet = re.sub(r\"Newz_Sacramento\", \"News Sacramento\", tweet)\n tweet = re.sub(r\"gofundme\", \"go fund me\", tweet)\n tweet = re.sub(r\"pmharper\", \"Stephen Harper\", tweet)\n tweet = re.sub(r\"IvanBerroa\", \"Ivan Berroa\", tweet)\n tweet = re.sub(r\"LosDelSonido\", \"Los Del Sonido\", tweet)\n tweet = re.sub(r\"bancodeseries\", \"banco de series\", tweet)\n tweet = re.sub(r\"timkaine\", \"Tim Kaine\", tweet)\n tweet = re.sub(r\"IdentityTheft\", \"Identity Theft\", tweet)\n tweet = re.sub(r\"AllLivesMatter\", \"All Lives Matter\", tweet)\n tweet = re.sub(r\"mishacollins\", \"Misha Collins\", tweet)\n tweet = re.sub(r\"BillNeelyNBC\", \"Bill Neely\", tweet)\n tweet = re.sub(r\"BeClearOnCancer\", \"be clear on cancer\", tweet)\n tweet = re.sub(r\"Kowing\", \"Knowing\", tweet)\n tweet = re.sub(r\"ScreamQueens\", \"Scream Queens\", tweet)\n tweet = re.sub(r\"AskCharley\", \"Ask Charley\", tweet)\n tweet = re.sub(r\"BlizzHeroes\", \"Heroes of the 
Storm\", tweet)\n tweet = re.sub(r\"BradleyBrad47\", \"Bradley Brad\", tweet)\n tweet = re.sub(r\"HannaPH\", \"Typhoon Hanna\", tweet)\n tweet = re.sub(r\"meinlcymbals\", \"MEINL Cymbals\", tweet)\n tweet = re.sub(r\"Ptbo\", \"Peterborough\", tweet)\n tweet = re.sub(r\"cnnbrk\", \"CNN Breaking News\", tweet)\n tweet = re.sub(r\"IndianNews\", \"Indian News\", tweet)\n tweet = re.sub(r\"savebees\", \"save bees\", tweet)\n tweet = re.sub(r\"GreenHarvard\", \"Green Harvard\", tweet)\n tweet = re.sub(r\"StandwithPP\", \"Stand with planned parenthood\", tweet)\n tweet = re.sub(r\"hermancranston\", \"Herman Cranston\", tweet)\n tweet = re.sub(r\"WMUR9\", \"WMUR-TV\", tweet)\n tweet = re.sub(r\"RockBottomRadFM\", \"Rock Bottom Radio\", tweet)\n tweet = re.sub(r\"ameenshaikh3\", \"Ameen Shaikh\", tweet)\n tweet = re.sub(r\"ProSyn\", \"Project Syndicate\", tweet)\n tweet = re.sub(r\"Daesh\", \"ISIS\", tweet)\n tweet = re.sub(r\"s2g\", \"swear to god\", tweet)\n tweet = re.sub(r\"listenlive\", \"listen live\", tweet)\n tweet = re.sub(r\"CDCgov\", \"Centers for Disease Control and Prevention\", tweet)\n tweet = re.sub(r\"FoxNew\", \"Fox News\", tweet)\n tweet = re.sub(r\"CBSBigBrother\", \"Big Brother\", tweet)\n tweet = re.sub(r\"JulieDiCaro\", \"Julie DiCaro\", tweet)\n tweet = re.sub(r\"theadvocatemag\", \"The Advocate Magazine\", tweet)\n tweet = re.sub(r\"RohnertParkDPS\", \"Rohnert Park Police Department\", tweet)\n tweet = re.sub(r\"THISIZBWRIGHT\", \"Bonnie Wright\", tweet)\n tweet = re.sub(r\"Popularmmos\", \"Popular MMOs\", tweet)\n tweet = re.sub(r\"WildHorses\", \"Wild Horses\", tweet)\n tweet = re.sub(r\"FantasticFour\", \"Fantastic Four\", tweet)\n tweet = re.sub(r\"HORNDALE\", \"Horndale\", tweet)\n tweet = re.sub(r\"PINER\", \"Piner\", tweet)\n tweet = re.sub(r\"BathAndNorthEastSomerset\", \"Bath and North East Somerset\", tweet)\n tweet = re.sub(r\"thatswhatfriendsarefor\", \"that is what friends are for\", tweet)\n tweet = re.sub(r\"residualincome\", \"residual income\", tweet)\n tweet = re.sub(r\"YahooNewsDigest\", \"Yahoo News Digest\", tweet)\n tweet = re.sub(r\"MalaysiaAirlines\", \"Malaysia Airlines\", tweet)\n tweet = re.sub(r\"AmazonDeals\", \"Amazon Deals\", tweet)\n tweet = re.sub(r\"MissCharleyWebb\", \"Charley Webb\", tweet)\n tweet = re.sub(r\"shoalstraffic\", \"shoals traffic\", tweet)\n tweet = re.sub(r\"GeorgeFoster72\", \"George Foster\", tweet)\n tweet = re.sub(r\"pop2015\", \"pop 2015\", tweet)\n tweet = re.sub(r\"_PokemonCards_\", \"Pokemon Cards\", tweet)\n tweet = re.sub(r\"DianneG\", \"Dianne Gallagher\", tweet)\n tweet = re.sub(r\"KashmirConflict\", \"Kashmir Conflict\", tweet)\n tweet = re.sub(r\"BritishBakeOff\", \"British Bake Off\", tweet)\n tweet = re.sub(r\"FreeKashmir\", \"Free Kashmir\", tweet)\n tweet = re.sub(r\"mattmosley\", \"Matt Mosley\", tweet)\n tweet = re.sub(r\"BishopFred\", \"Bishop Fred\", tweet)\n tweet = re.sub(r\"EndConflict\", \"End Conflict\", tweet)\n tweet = re.sub(r\"EndOccupation\", \"End Occupation\", tweet)\n tweet = re.sub(r\"UNHEALED\", \"unhealed\", tweet)\n tweet = re.sub(r\"CharlesDagnall\", \"Charles Dagnall\", tweet)\n tweet = re.sub(r\"Latestnews\", \"Latest news\", tweet)\n tweet = re.sub(r\"KindleCountdown\", \"Kindle Countdown\", tweet)\n tweet = re.sub(r\"NoMoreHandouts\", \"No More Handouts\", tweet)\n tweet = re.sub(r\"datingtips\", \"dating tips\", tweet)\n tweet = re.sub(r\"charlesadler\", \"Charles Adler\", tweet)\n tweet = re.sub(r\"twia\", \"Texas Windstorm Insurance Association\", tweet)\n tweet = 
re.sub(r\"txlege\", \"Texas Legislature\", tweet)\n tweet = re.sub(r\"WindstormInsurer\", \"Windstorm Insurer\", tweet)\n tweet = re.sub(r\"Newss\", \"News\", tweet)\n tweet = re.sub(r\"hempoil\", \"hemp oil\", tweet)\n tweet = re.sub(r\"CommoditiesAre\", \"Commodities are\", tweet)\n tweet = re.sub(r\"tubestrike\", \"tube strike\", tweet)\n tweet = re.sub(r\"JoeNBC\", \"Joe Scarborough\", tweet)\n tweet = re.sub(r\"LiteraryCakes\", \"Literary Cakes\", tweet)\n tweet = re.sub(r\"TI5\", \"The International 5\", tweet)\n tweet = re.sub(r\"thehill\", \"the hill\", tweet)\n tweet = re.sub(r\"3others\", \"3 others\", tweet)\n tweet = re.sub(r\"stighefootball\", \"Sam Tighe\", tweet)\n tweet = re.sub(r\"whatstheimportantvideo\", \"what is the important video\", tweet)\n tweet = re.sub(r\"ClaudioMeloni\", \"Claudio Meloni\", tweet)\n tweet = re.sub(r\"DukeSkywalker\", \"Duke Skywalker\", tweet)\n tweet = re.sub(r\"carsonmwr\", \"Fort Carson\", tweet)\n tweet = re.sub(r\"offdishduty\", \"off dish duty\", tweet)\n tweet = re.sub(r\"andword\", \"and word\", tweet)\n tweet = re.sub(r\"rhodeisland\", \"Rhode Island\", tweet)\n tweet = re.sub(r\"easternoregon\", \"Eastern Oregon\", tweet)\n tweet = re.sub(r\"WAwildfire\", \"Washington Wildfire\", tweet)\n tweet = re.sub(r\"fingerrockfire\", \"Finger Rock Fire\", tweet)\n tweet = re.sub(r\"57am\", \"57 am\", tweet)\n tweet = re.sub(r\"fingerrockfire\", \"Finger Rock Fire\", tweet)\n tweet = re.sub(r\"JacobHoggard\", \"Jacob Hoggard\", tweet)\n tweet = re.sub(r\"newnewnew\", \"new new new\", tweet)\n tweet = re.sub(r\"under50\", \"under 50\", tweet)\n tweet = re.sub(r\"getitbeforeitsgone\", \"get it before it is gone\", tweet)\n tweet = re.sub(r\"freshoutofthebox\", \"fresh out of the box\", tweet)\n tweet = re.sub(r\"amwriting\", \"am writing\", tweet)\n tweet = re.sub(r\"Bokoharm\", \"Boko Haram\", tweet)\n tweet = re.sub(r\"Nowlike\", \"Now like\", tweet)\n tweet = re.sub(r\"seasonfrom\", \"season from\", tweet)\n tweet = re.sub(r\"epicente\", \"epicenter\", tweet)\n tweet = re.sub(r\"epicenterr\", \"epicenter\", tweet)\n tweet = re.sub(r\"sicklife\", \"sick life\", tweet)\n tweet = re.sub(r\"yycweather\", \"Calgary Weather\", tweet)\n tweet = re.sub(r\"calgarysun\", \"Calgary Sun\", tweet)\n tweet = re.sub(r\"approachng\", \"approaching\", tweet)\n tweet = re.sub(r\"evng\", \"evening\", tweet)\n tweet = re.sub(r\"Sumthng\", \"something\", tweet)\n tweet = re.sub(r\"EllenPompeo\", \"Ellen Pompeo\", tweet)\n tweet = re.sub(r\"shondarhimes\", \"Shonda Rhimes\", tweet)\n tweet = re.sub(r\"ABCNetwork\", \"ABC Network\", tweet)\n tweet = re.sub(r\"SushmaSwaraj\", \"Sushma Swaraj\", tweet)\n tweet = re.sub(r\"pray4japan\", \"Pray for Japan\", tweet)\n tweet = re.sub(r\"hope4japan\", \"Hope for Japan\", tweet)\n tweet = re.sub(r\"Illusionimagess\", \"Illusion images\", tweet)\n tweet = re.sub(r\"SummerUnderTheStars\", \"Summer Under The Stars\", tweet)\n tweet = re.sub(r\"ShallWeDance\", \"Shall We Dance\", tweet)\n tweet = re.sub(r\"TCMParty\", \"TCM Party\", tweet)\n tweet = re.sub(r\"marijuananews\", \"marijuana news\", tweet)\n tweet = re.sub(r\"onbeingwithKristaTippett\", \"on being with Krista Tippett\", tweet)\n tweet = re.sub(r\"Beingtweets\", \"Being tweets\", tweet)\n tweet = re.sub(r\"newauthors\", \"new authors\", tweet)\n tweet = re.sub(r\"remedyyyy\", \"remedy\", tweet)\n tweet = re.sub(r\"44PM\", \"44 PM\", tweet)\n tweet = re.sub(r\"HeadlinesApp\", \"Headlines App\", tweet)\n tweet = re.sub(r\"40PM\", \"40 PM\", tweet)\n tweet = 
re.sub(r\"myswc\", \"Severe Weather Center\", tweet)\n tweet = re.sub(r\"ithats\", \"that is\", tweet)\n tweet = re.sub(r\"icouldsitinthismomentforever\", \"I could sit in this moment forever\", tweet)\n tweet = re.sub(r\"FatLoss\", \"Fat Loss\", tweet)\n tweet = re.sub(r\"02PM\", \"02 PM\", tweet)\n tweet = re.sub(r\"MetroFmTalk\", \"Metro Fm Talk\", tweet)\n tweet = re.sub(r\"Bstrd\", \"bastard\", tweet)\n tweet = re.sub(r\"bldy\", \"bloody\", tweet)\n tweet = re.sub(r\"MetrofmTalk\", \"Metro Fm Talk\", tweet)\n tweet = re.sub(r\"terrorismturn\", \"terrorism turn\", tweet)\n tweet = re.sub(r\"BBCNewsAsia\", \"BBC News Asia\", tweet)\n tweet = re.sub(r\"BehindTheScenes\", \"Behind The Scenes\", tweet)\n tweet = re.sub(r\"GeorgeTakei\", \"George Takei\", tweet)\n tweet = re.sub(r\"WomensWeeklyMag\", \"Womens Weekly Magazine\", tweet)\n tweet = re.sub(r\"SurvivorsGuidetoEarth\", \"Survivors Guide to Earth\", tweet)\n tweet = re.sub(r\"incubusband\", \"incubus band\", tweet)\n tweet = re.sub(r\"Babypicturethis\", \"Baby picture this\", tweet)\n tweet = re.sub(r\"BombEffects\", \"Bomb Effects\", tweet)\n tweet = re.sub(r\"win10\", \"Windows 10\", tweet)\n tweet = re.sub(r\"idkidk\", \"I do not know I do not know\", tweet)\n tweet = re.sub(r\"TheWalkingDead\", \"The Walking Dead\", tweet)\n tweet = re.sub(r\"amyschumer\", \"Amy Schumer\", tweet)\n tweet = re.sub(r\"crewlist\", \"crew list\", tweet)\n tweet = re.sub(r\"Erdogans\", \"Erdogan\", tweet)\n tweet = re.sub(r\"BBCLive\", \"BBC Live\", tweet)\n tweet = re.sub(r\"TonyAbbottMHR\", \"Tony Abbott\", tweet)\n tweet = re.sub(r\"paulmyerscough\", \"Paul Myerscough\", tweet)\n tweet = re.sub(r\"georgegallagher\", \"George Gallagher\", tweet)\n tweet = re.sub(r\"JimmieJohnson\", \"Jimmie Johnson\", tweet)\n tweet = re.sub(r\"pctool\", \"pc tool\", tweet)\n tweet = re.sub(r\"DoingHashtagsRight\", \"Doing Hashtags Right\", tweet)\n tweet = re.sub(r\"ThrowbackThursday\", \"Throwback Thursday\", tweet)\n tweet = re.sub(r\"SnowBackSunday\", \"Snowback Sunday\", tweet)\n tweet = re.sub(r\"LakeEffect\", \"Lake Effect\", tweet)\n tweet = re.sub(r\"RTphotographyUK\", \"Richard Thomas Photography UK\", tweet)\n tweet = re.sub(r\"BigBang_CBS\", \"Big Bang CBS\", tweet)\n tweet = re.sub(r\"writerslife\", \"writers life\", tweet)\n tweet = re.sub(r\"NaturalBirth\", \"Natural Birth\", tweet)\n tweet = re.sub(r\"UnusualWords\", \"Unusual Words\", tweet)\n tweet = re.sub(r\"wizkhalifa\", \"Wiz Khalifa\", tweet)\n tweet = re.sub(r\"acreativedc\", \"a creative DC\", tweet)\n tweet = re.sub(r\"vscodc\", \"vsco DC\", tweet)\n tweet = re.sub(r\"VSCOcam\", \"vsco camera\", tweet)\n tweet = re.sub(r\"TheBEACHDC\", \"The beach DC\", tweet)\n tweet = re.sub(r\"buildingmuseum\", \"building museum\", tweet)\n tweet = re.sub(r\"WorldOil\", \"World Oil\", tweet)\n tweet = re.sub(r\"redwedding\", \"red wedding\", tweet)\n tweet = re.sub(r\"AmazingRaceCanada\", \"Amazing Race Canada\", tweet)\n tweet = re.sub(r\"WakeUpAmerica\", \"Wake Up America\", tweet)\n tweet = re.sub(r\"\\\\Allahuakbar\\\\\", \"Allahu Akbar\", tweet)\n tweet = re.sub(r\"bleased\", \"blessed\", tweet)\n tweet = re.sub(r\"nigeriantribune\", \"Nigerian Tribune\", tweet)\n tweet = re.sub(r\"HIDEO_KOJIMA_EN\", \"Hideo Kojima\", tweet)\n tweet = re.sub(r\"FusionFestival\", \"Fusion Festival\", tweet)\n tweet = re.sub(r\"50Mixed\", \"50 Mixed\", tweet)\n tweet = re.sub(r\"NoAgenda\", \"No Agenda\", tweet)\n tweet = re.sub(r\"WhiteGenocide\", \"White Genocide\", tweet)\n tweet = re.sub(r\"dirtylying\", \"dirty 
lying\", tweet)\n tweet = re.sub(r\"SyrianRefugees\", \"Syrian Refugees\", tweet)\n tweet = re.sub(r\"changetheworld\", \"change the world\", tweet)\n tweet = re.sub(r\"Ebolacase\", \"Ebola case\", tweet)\n tweet = re.sub(r\"mcgtech\", \"mcg technologies\", tweet)\n tweet = re.sub(r\"withweapons\", \"with weapons\", tweet)\n tweet = re.sub(r\"advancedwarfare\", \"advanced warfare\", tweet)\n tweet = re.sub(r\"letsFootball\", \"let us Football\", tweet)\n tweet = re.sub(r\"LateNiteMix\", \"late night mix\", tweet)\n tweet = re.sub(r\"PhilCollinsFeed\", \"Phil Collins\", tweet)\n tweet = re.sub(r\"RudyHavenstein\", \"Rudy Havenstein\", tweet)\n tweet = re.sub(r\"22PM\", \"22 PM\", tweet)\n tweet = re.sub(r\"54am\", \"54 AM\", tweet)\n tweet = re.sub(r\"38am\", \"38 AM\", tweet)\n tweet = re.sub(r\"OldFolkExplainStuff\", \"Old Folk Explain Stuff\", tweet)\n tweet = re.sub(r\"BlacklivesMatter\", \"Black Lives Matter\", tweet)\n tweet = re.sub(r\"InsaneLimits\", \"Insane Limits\", tweet)\n tweet = re.sub(r\"youcantsitwithus\", \"you cannot sit with us\", tweet)\n tweet = re.sub(r\"2k15\", \"2015\", tweet)\n tweet = re.sub(r\"TheIran\", \"Iran\", tweet)\n tweet = re.sub(r\"JimmyFallon\", \"Jimmy Fallon\", tweet)\n tweet = re.sub(r\"AlbertBrooks\", \"Albert Brooks\", tweet)\n tweet = re.sub(r\"defense_news\", \"defense news\", tweet)\n tweet = re.sub(r\"nuclearrcSA\", \"Nuclear Risk Control Self Assessment\", tweet)\n tweet = re.sub(r\"Auspol\", \"Australia Politics\", tweet)\n tweet = re.sub(r\"NuclearPower\", \"Nuclear Power\", tweet)\n tweet = re.sub(r\"WhiteTerrorism\", \"White Terrorism\", tweet)\n tweet = re.sub(r\"truthfrequencyradio\", \"Truth Frequency Radio\", tweet)\n tweet = re.sub(r\"ErasureIsNotEquality\", \"Erasure is not equality\", tweet)\n tweet = re.sub(r\"ProBonoNews\", \"Pro Bono News\", tweet)\n tweet = re.sub(r\"JakartaPost\", \"Jakarta Post\", tweet)\n tweet = re.sub(r\"toopainful\", \"too painful\", tweet)\n tweet = re.sub(r\"melindahaunton\", \"Melinda Haunton\", tweet)\n tweet = re.sub(r\"NoNukes\", \"No Nukes\", tweet)\n tweet = re.sub(r\"curryspcworld\", \"Currys PC World\", tweet)\n tweet = re.sub(r\"ineedcake\", \"I need cake\", tweet)\n tweet = re.sub(r\"blackforestgateau\", \"black forest gateau\", tweet)\n tweet = re.sub(r\"BBCOne\", \"BBC One\", tweet)\n tweet = re.sub(r\"AlexxPage\", \"Alex Page\", tweet)\n tweet = re.sub(r\"jonathanserrie\", \"Jonathan Serrie\", tweet)\n tweet = re.sub(r\"SocialJerkBlog\", \"Social Jerk Blog\", tweet)\n tweet = re.sub(r\"ChelseaVPeretti\", \"Chelsea Peretti\", tweet)\n tweet = re.sub(r\"irongiant\", \"iron giant\", tweet)\n tweet = re.sub(r\"RonFunches\", \"Ron Funches\", tweet)\n tweet = re.sub(r\"TimCook\", \"Tim Cook\", tweet)\n tweet = re.sub(r\"sebastianstanisaliveandwell\", \"Sebastian Stan is alive and well\", tweet)\n tweet = re.sub(r\"Madsummer\", \"Mad summer\", tweet)\n tweet = re.sub(r\"NowYouKnow\", \"Now you know\", tweet)\n tweet = re.sub(r\"concertphotography\", \"concert photography\", tweet)\n tweet = re.sub(r\"TomLandry\", \"Tom Landry\", tweet)\n tweet = re.sub(r\"showgirldayoff\", \"show girl day off\", tweet)\n tweet = re.sub(r\"Yougslavia\", \"Yugoslavia\", tweet)\n tweet = re.sub(r\"QuantumDataInformatics\", \"Quantum Data Informatics\", tweet)\n tweet = re.sub(r\"FromTheDesk\", \"From The Desk\", tweet)\n tweet = re.sub(r\"TheaterTrial\", \"Theater Trial\", tweet)\n tweet = re.sub(r\"CatoInstitute\", \"Cato Institute\", tweet)\n tweet = re.sub(r\"EmekaGift\", \"Emeka Gift\", tweet)\n tweet = 
re.sub(r\"LetsBe_Rational\", \"Let us be rational\", tweet)\n tweet = re.sub(r\"Cynicalreality\", \"Cynical reality\", tweet)\n tweet = re.sub(r\"FredOlsenCruise\", \"Fred Olsen Cruise\", tweet)\n tweet = re.sub(r\"NotSorry\", \"not sorry\", tweet)\n tweet = re.sub(r\"UseYourWords\", \"use your words\", tweet)\n tweet = re.sub(r\"WordoftheDay\", \"word of the day\", tweet)\n tweet = re.sub(r\"Dictionarycom\", \"Dictionary.com\", tweet)\n tweet = re.sub(r\"TheBrooklynLife\", \"The Brooklyn Life\", tweet)\n tweet = re.sub(r\"jokethey\", \"joke they\", tweet)\n tweet = re.sub(r\"nflweek1picks\", \"NFL week 1 picks\", tweet)\n tweet = re.sub(r\"uiseful\", \"useful\", tweet)\n tweet = re.sub(r\"JusticeDotOrg\", \"The American Association for Justice\", tweet)\n tweet = re.sub(r\"autoaccidents\", \"auto accidents\", tweet)\n tweet = re.sub(r\"SteveGursten\", \"Steve Gursten\", tweet)\n tweet = re.sub(r\"MichiganAutoLaw\", \"Michigan Auto Law\", tweet)\n tweet = re.sub(r\"birdgang\", \"bird gang\", tweet)\n tweet = re.sub(r\"nflnetwork\", \"NFL Network\", tweet)\n tweet = re.sub(r\"NYDNSports\", \"NY Daily News Sports\", tweet)\n tweet = re.sub(r\"RVacchianoNYDN\", \"Ralph Vacchiano NY Daily News\", tweet)\n tweet = re.sub(r\"EdmontonEsks\", \"Edmonton Eskimos\", tweet)\n tweet = re.sub(r\"david_brelsford\", \"David Brelsford\", tweet)\n tweet = re.sub(r\"TOI_India\", \"The Times of India\", tweet)\n tweet = re.sub(r\"hegot\", \"he got\", tweet)\n tweet = re.sub(r\"SkinsOn9\", \"Skins on 9\", tweet)\n tweet = re.sub(r\"sothathappened\", \"so that happened\", tweet)\n tweet = re.sub(r\"LCOutOfDoors\", \"LC Out Of Doors\", tweet)\n tweet = re.sub(r\"NationFirst\", \"Nation First\", tweet)\n tweet = re.sub(r\"IndiaToday\", \"India Today\", tweet)\n tweet = re.sub(r\"HLPS\", \"helps\", tweet)\n tweet = re.sub(r\"HOSTAGESTHROSW\", \"hostages throw\", tweet)\n tweet = re.sub(r\"SNCTIONS\", \"sanctions\", tweet)\n tweet = re.sub(r\"BidTime\", \"Bid Time\", tweet)\n tweet = re.sub(r\"crunchysensible\", \"crunchy sensible\", tweet)\n tweet = re.sub(r\"RandomActsOfRomance\", \"Random acts of romance\", tweet)\n tweet = re.sub(r\"MomentsAtHill\", \"Moments at hill\", tweet)\n tweet = re.sub(r\"eatshit\", \"eat shit\", tweet)\n tweet = re.sub(r\"liveleakfun\", \"live leak fun\", tweet)\n tweet = re.sub(r\"SahelNews\", \"Sahel News\", tweet)\n tweet = re.sub(r\"abc7newsbayarea\", \"ABC 7 News Bay Area\", tweet)\n tweet = re.sub(r\"facilitiesmanagement\", \"facilities management\", tweet)\n tweet = re.sub(r\"facilitydude\", \"facility dude\", tweet)\n tweet = re.sub(r\"CampLogistics\", \"Camp logistics\", tweet)\n tweet = re.sub(r\"alaskapublic\", \"Alaska public\", tweet)\n tweet = re.sub(r\"MarketResearch\", \"Market Research\", tweet)\n tweet = re.sub(r\"AccuracyEsports\", \"Accuracy Esports\", tweet)\n tweet = re.sub(r\"TheBodyShopAust\", \"The Body Shop Australia\", tweet)\n tweet = re.sub(r\"yychail\", \"Calgary hail\", tweet)\n tweet = re.sub(r\"yyctraffic\", \"Calgary traffic\", tweet)\n tweet = re.sub(r\"eliotschool\", \"eliot school\", tweet)\n tweet = re.sub(r\"TheBrokenCity\", \"The Broken City\", tweet)\n tweet = re.sub(r\"OldsFireDept\", \"Olds Fire Department\", tweet)\n tweet = re.sub(r\"RiverComplex\", \"River Complex\", tweet)\n tweet = re.sub(r\"fieldworksmells\", \"field work smells\", tweet)\n tweet = re.sub(r\"IranElection\", \"Iran Election\", tweet)\n tweet = re.sub(r\"glowng\", \"glowing\", tweet)\n tweet = re.sub(r\"kindlng\", \"kindling\", tweet)\n tweet = re.sub(r\"riggd\", 
\"rigged\", tweet)\n tweet = re.sub(r\"slownewsday\", \"slow news day\", tweet)\n tweet = re.sub(r\"MyanmarFlood\", \"Myanmar Flood\", tweet)\n tweet = re.sub(r\"abc7chicago\", \"ABC 7 Chicago\", tweet)\n tweet = re.sub(r\"copolitics\", \"Colorado Politics\", tweet)\n tweet = re.sub(r\"AdilGhumro\", \"Adil Ghumro\", tweet)\n tweet = re.sub(r\"netbots\", \"net bots\", tweet)\n tweet = re.sub(r\"byebyeroad\", \"bye bye road\", tweet)\n tweet = re.sub(r\"massiveflooding\", \"massive flooding\", tweet)\n tweet = re.sub(r\"EndofUS\", \"End of United States\", tweet)\n tweet = re.sub(r\"35PM\", \"35 PM\", tweet)\n tweet = re.sub(r\"greektheatrela\", \"Greek Theatre Los Angeles\", tweet)\n tweet = re.sub(r\"76mins\", \"76 minutes\", tweet)\n tweet = re.sub(r\"publicsafetyfirst\", \"public safety first\", tweet)\n tweet = re.sub(r\"livesmatter\", \"lives matter\", tweet)\n tweet = re.sub(r\"myhometown\", \"my hometown\", tweet)\n tweet = re.sub(r\"tankerfire\", \"tanker fire\", tweet)\n tweet = re.sub(r\"MEMORIALDAY\", \"memorial day\", tweet)\n tweet = re.sub(r\"MEMORIAL_DAY\", \"memorial day\", tweet)\n tweet = re.sub(r\"instaxbooty\", \"instagram booty\", tweet)\n tweet = re.sub(r\"Jerusalem_Post\", \"Jerusalem Post\", tweet)\n tweet = re.sub(r\"WayneRooney_INA\", \"Wayne Rooney\", tweet)\n tweet = re.sub(r\"VirtualReality\", \"Virtual Reality\", tweet)\n tweet = re.sub(r\"OculusRift\", \"Oculus Rift\", tweet)\n tweet = re.sub(r\"OwenJones84\", \"Owen Jones\", tweet)\n tweet = re.sub(r\"jeremycorbyn\", \"Jeremy Corbyn\", tweet)\n tweet = re.sub(r\"paulrogers002\", \"Paul Rogers\", tweet)\n tweet = re.sub(r\"mortalkombatx\", \"Mortal Kombat X\", tweet)\n tweet = re.sub(r\"mortalkombat\", \"Mortal Kombat\", tweet)\n tweet = re.sub(r\"FilipeCoelho92\", \"Filipe Coelho\", tweet)\n tweet = re.sub(r\"OnlyQuakeNews\", \"Only Quake News\", tweet)\n tweet = re.sub(r\"kostumes\", \"costumes\", tweet)\n tweet = re.sub(r\"YEEESSSS\", \"yes\", tweet)\n tweet = re.sub(r\"ToshikazuKatayama\", \"Toshikazu Katayama\", tweet)\n tweet = re.sub(r\"IntlDevelopment\", \"Intl Development\", tweet)\n tweet = re.sub(r\"ExtremeWeather\", \"Extreme Weather\", tweet)\n tweet = re.sub(r\"WereNotGruberVoters\", \"We are not gruber voters\", tweet)\n tweet = re.sub(r\"NewsThousands\", \"News Thousands\", tweet)\n tweet = re.sub(r\"EdmundAdamus\", \"Edmund Adamus\", tweet)\n tweet = re.sub(r\"EyewitnessWV\", \"Eye witness WV\", tweet)\n tweet = re.sub(r\"PhiladelphiaMuseu\", \"Philadelphia Museum\", tweet)\n tweet = re.sub(r\"DublinComicCon\", \"Dublin Comic Con\", tweet)\n tweet = re.sub(r\"NicholasBrendon\", \"Nicholas Brendon\", tweet)\n tweet = re.sub(r\"Alltheway80s\", \"All the way 80s\", tweet)\n tweet = re.sub(r\"FromTheField\", \"From the field\", tweet)\n tweet = re.sub(r\"NorthIowa\", \"North Iowa\", tweet)\n tweet = re.sub(r\"WillowFire\", \"Willow Fire\", tweet)\n tweet = re.sub(r\"MadRiverComplex\", \"Mad River Complex\", tweet)\n tweet = re.sub(r\"feelingmanly\", \"feeling manly\", tweet)\n tweet = re.sub(r\"stillnotoverit\", \"still not over it\", tweet)\n tweet = re.sub(r\"FortitudeValley\", \"Fortitude Valley\", tweet)\n tweet = re.sub(r\"CoastpowerlineTramTr\", \"Coast powerline\", tweet)\n tweet = re.sub(r\"ServicesGold\", \"Services Gold\", tweet)\n tweet = re.sub(r\"NewsbrokenEmergency\", \"News broken emergency\", tweet)\n tweet = re.sub(r\"Evaucation\", \"evacuation\", tweet)\n tweet = re.sub(r\"leaveevacuateexitbe\", \"leave evacuate exit be\", tweet)\n tweet = re.sub(r\"P_EOPLE\", \"PEOPLE\", 
tweet)\n tweet = re.sub(r\"Tubestrike\", \"tube strike\", tweet)\n tweet = re.sub(r\"CLASS_SICK\", \"CLASS SICK\", tweet)\n tweet = re.sub(r\"localplumber\", \"local plumber\", tweet)\n tweet = re.sub(r\"awesomejobsiri\", \"awesome job siri\", tweet)\n tweet = re.sub(r\"PayForItHow\", \"Pay for it how\", tweet)\n tweet = re.sub(r\"ThisIsAfrica\", \"This is Africa\", tweet)\n tweet = re.sub(r\"crimeairnetwork\", \"crime air network\", tweet)\n tweet = re.sub(r\"KimAcheson\", \"Kim Acheson\", tweet)\n tweet = re.sub(r\"cityofcalgary\", \"City of Calgary\", tweet)\n tweet = re.sub(r\"prosyndicate\", \"pro syndicate\", tweet)\n tweet = re.sub(r\"660NEWS\", \"660 NEWS\", tweet)\n tweet = re.sub(r\"BusInsMagazine\", \"Business Insurance Magazine\", tweet)\n tweet = re.sub(r\"wfocus\", \"focus\", tweet)\n tweet = re.sub(r\"ShastaDam\", \"Shasta Dam\", tweet)\n tweet = re.sub(r\"go2MarkFranco\", \"Mark Franco\", tweet)\n tweet = re.sub(r\"StephGHinojosa\", \"Steph Hinojosa\", tweet)\n tweet = re.sub(r\"Nashgrier\", \"Nash Grier\", tweet)\n tweet = re.sub(r\"NashNewVideo\", \"Nash new video\", tweet)\n tweet = re.sub(r\"IWouldntGetElectedBecause\", \"I would not get elected because\", tweet)\n tweet = re.sub(r\"SHGames\", \"Sledgehammer Games\", tweet)\n tweet = re.sub(r\"bedhair\", \"bed hair\", tweet)\n tweet = re.sub(r\"JoelHeyman\", \"Joel Heyman\", tweet)\n tweet = re.sub(r\"viaYouTube\", \"via YouTube\", tweet)\n \n # Urls\n tweet = re.sub(r\"https?:\\/\\/t.co\\/[A-Za-z0-9]+\", \"\", tweet)\n \n # Words with punctuations and special characters\n punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + \"'`\"\n for p in punctuations:\n tweet = tweet.replace(p, f' {p} ')\n \n # ... and ..\n tweet = tweet.replace('...', ' ... ')\n if '...' not in tweet:\n tweet = tweet.replace('..', ' ... 
') \n \n # Acronyms\n tweet = re.sub(r\"MH370\", \"Malaysia Airlines Flight 370\", tweet)\n tweet = re.sub(r\"m\u00cc\u00bcsica\", \"music\", tweet)\n tweet = re.sub(r\"okwx\", \"Oklahoma City Weather\", tweet)\n tweet = re.sub(r\"arwx\", \"Arkansas Weather\", tweet) \n tweet = re.sub(r\"gawx\", \"Georgia Weather\", tweet) \n tweet = re.sub(r\"scwx\", \"South Carolina Weather\", tweet) \n tweet = re.sub(r\"cawx\", \"California Weather\", tweet)\n tweet = re.sub(r\"tnwx\", \"Tennessee Weather\", tweet)\n tweet = re.sub(r\"azwx\", \"Arizona Weather\", tweet) \n tweet = re.sub(r\"alwx\", \"Alabama Weather\", tweet)\n tweet = re.sub(r\"wordpressdotcom\", \"wordpress\", tweet) \n tweet = re.sub(r\"usNWSgov\", \"United States National Weather Service\", tweet)\n tweet = re.sub(r\"Suruc\", \"Sanliurfa\", tweet) \n \n # Grouping same words without embeddings\n tweet = re.sub(r\"Bestnaijamade\", \"bestnaijamade\", tweet)\n tweet = re.sub(r\"SOUDELOR\", \"Soudelor\", tweet)\n \n #Remove Emoji\n tweet = re.sub(u\"\\U0001F600-\\U0001F64F\",\"\", tweet) # emoticons\n tweet = re.sub(u\"\\U0001F300-\\U0001F5FF\",\"\", tweet) # symbols & pictographs\n tweet = re.sub(u\"\\U0001F680-\\U0001F6FF\",\"\", tweet) # transport & map symbols\n tweet = re.sub(u\"\\U0001F1E0-\\U0001F1FF\",\"\", tweet) # flags (iOS)\n tweet = re.sub(u\"\\U00002702-\\U000027B0\",\"\", tweet)\n tweet = re.sub(u\"\\U000024C2-\\U0001F251\",\"\", tweet)\n \n return tweet\n\ntrain_df['text_cleaned'] = train_df['text'].apply(lambda s : clean(s))\ntest_df['text_cleaned'] = test_df['text'].apply(lambda s : clean(s))", + "cell_id": 15, + "code": "def extract_keywords(text):\n potential_keywords = []\n TOP_KEYWORD = -1\n # Create a list for keyword parts of speech\n pos_tag = ['ADJ', 'NOUN', 'PROPN']\n doc = nlp_spacy(text)\n \n for i in doc:\n if i.pos_ in pos_tag:\n potential_keywords.append(i.text)\n\n document_embed = sentence_enc([text])\n potential_embed = sentence_enc(potential_keywords) \n \n vector_distances = cosine_similarity(document_embed, potential_embed)\n keyword = [potential_keywords[i] for i in vector_distances.argsort()[0][TOP_KEYWORD:]]\n\n return keyword\n\ndef keyword_filler(keyword, text):\n if pd.isnull(keyword):\n try:\n keyword = extract_keywords(text)[0]\n except:\n keyword = '' \n \n return keyword", "class": "Data Transform", - "desc": "This code snippet defines a function to clean tweets by removing special characters, expanding contractions, normalizing slang, removing URLs, handling punctuation, acronyms, emojis, and then applies this function to create a cleaned text column in both the training and testing datasets.", + "desc": "The code defines two functions: `extract_keywords`, which uses SpaCy for POS tagging and TensorFlow's Universal Sentence Encoder along with cosine similarity to extract potential keywords from text, and `keyword_filler`, which fills missing keywords by extracting them from the 'text' column.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.9979898 + "predicted_subclass_probability": 0.41046342 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 20, - "code": "def encode(texts, tokenizer, max_len=512):\n all_tokens = []\n all_masks = []\n all_segments = []\n \n for text in texts:\n # Tokenise text\n text = tokenizer.tokenize(text)\n #Reduce 2 slots for start and end tag\n text = text[:max_len-2]\n #Add start and end tag\n input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n #Padding to be added\n pad_len = max_len - 
len(input_sequence)\n #Get token ids\n tokens = tokenizer.convert_tokens_to_ids(input_sequence)\n #add padding\n tokens += [0] * pad_len\n #Create padding mask with 1's of length of input and 0's with padding length\n pad_masks = [1] * len(input_sequence) + [0] * pad_len\n #Create segment ids with all 0's \n segment_ids = [0] * max_len\n \n all_tokens.append(tokens)\n all_masks.append(pad_masks)\n all_segments.append(segment_ids)\n \n return np.array(all_tokens), np.array(all_masks), np.array(all_segments)", + "cell_id": 16, + "code": "df_train.keyword = pd.DataFrame(list(map(keyword_filler, df_train.keyword, df_train.text))).astype(str)\ndf_test.keyword = pd.DataFrame(list(map(keyword_filler, df_test.keyword, df_test.text))).astype(str)\n\nprint('Null Training Keywords => ', df_train['keyword'].isnull().any())\nprint('Null Test Keywords => ', df_test['keyword'].isnull().any())", "class": "Data Transform", - "desc": "This code snippet defines a function called `encode` that tokenizes and processes input text data for use with a BERT model, including adding special tokens, padding, and creating token, mask, and segment ID arrays.", + "desc": "The code fills missing 'keyword' values in both the training and test datasets by applying the `keyword_filler` function and then checks for any remaining null values in the 'keyword' column of each dataset.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9931865 - }, - "cluster": 1 + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.95129913 + }, + "cluster": 3 }, { - "cell_id": 24, - "code": "train_input = encode(train_df.text_cleaned.values, tokenizer, max_len=160)\ntest_input = encode(test_df.text_cleaned.values, tokenizer, max_len=160)\ntrain_labels = train_df.target.values", + "cell_id": 20, + "code": "# Spilt data\nX_train, X_val, y_train, y_val = train_test_split(df_train[['text','keyword']],\n df_train.target, \n test_size=0.2, \n random_state=42)\nX_train.shape, X_val.shape", + "class": "Data Transform", + "desc": "The code splits the preprocessed training dataset into training and validation sets, with 80% of the data for training and 20% for validation, using the `train_test_split` function from Scikit-learn and then outputs the shapes of the resulting sets.", + "testing": { + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.998221 + }, + "cluster": 0 + }, { + "cell_id": 21, + "code": "train_ds = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))\nval_ds = tf.data.Dataset.from_tensor_slices((dict(X_val), y_val))\ntest_ds = tf.data.Dataset.from_tensor_slices(dict(df_test[['text','keyword']]))", + "class": "Data Transform", + "desc": "The code converts the training, validation, and test datasets into TensorFlow `tf.data.Dataset` objects to facilitate efficient data loading and preprocessing during model training and evaluation.", + "testing": { + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.8992567 + }, + "cluster": 5 + }, { + "cell_id": 22, + "code": "AUTOTUNE = tf.data.experimental.AUTOTUNE\n\nBUFFER_SIZE = 1000\nBATCH_SIZE = 32\nRANDOM_SEED = 319\n\ndef configure_dataset(dataset, shuffle=False, test=False):\n if shuffle:\n dataset = dataset.cache()\\\n .shuffle(BUFFER_SIZE, seed=RANDOM_SEED, reshuffle_each_iteration=True)\\\n .batch(BATCH_SIZE, drop_remainder=True)\\\n 
.prefetch(AUTOTUNE)\n elif test:\n dataset = dataset.cache()\\\n .batch(BATCH_SIZE, drop_remainder=False)\\\n .prefetch(AUTOTUNE)\n else:\n dataset = dataset.cache()\\\n .batch(BATCH_SIZE, drop_remainder=True)\\\n .prefetch(AUTOTUNE)\n return dataset", "class": "Data Transform", - "desc": "This code snippet preprocesses the cleaned text data from the training and testing datasets using the previously defined `encode` function, setting a maximum length of 160 tokens, and extracts the target labels for training.", + "desc": "The code defines the `configure_dataset` function, which configures TensorFlow `tf.data.Dataset` objects for training, validation, or testing by caching, shuffling, batching, and prefetching data to optimize performance, with specific parameters for buffer size, batch size, and random seed.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.997544 + "predicted_subclass_probability": 0.49705026 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 2, - "code": "print(\"Train columns = {}\".format(train_df.columns))\nprint(\"Test columns = {}\".format(test_df.columns))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the column names of the training and testing datasets to provide an overview of the available features.", + "cell_id": 23, + "code": "a3 = configure_dataset(train_ds, shuffle=True)\ndict3 = []\nfor elem in a3:\n dict3.append(elem[0]['text'][0])\ndict3[:10]", + "class": "Data Transform", + "desc": "The code configures the training dataset by shuffling, batching, and prefetching it using the `configure_dataset` function, and then extracts and displays the first 10 'text' entries from the configured dataset to verify the shuffling and batching process.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_columns", - "subclass_id": 71, - "predicted_subclass_probability": 0.99450326 + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.61363506 }, - "cluster": 7 + "cluster": 6 }, { - "cell_id": 4, - "code": "print(\"So there are {} occourance of disastrous twitts and {} occourances of non disastrous\".format(x[1],x[0]))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the number of occurrences of disastrous and non-disastrous tweets in the training dataset.", + "cell_id": 24, + "code": "# Configure the datasets\ntrain_ds = configure_dataset(train_ds, shuffle=True)\nval_ds = configure_dataset(val_ds)\ntest_ds = configure_dataset(test_ds, test=True)", + "class": "Data Transform", + "desc": "The code configures the training, validation, and test datasets for efficient performance by calling the `configure_dataset` function with appropriate parameters for shuffling, batching, and prefetching.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.91736937 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.36260855 }, - "cluster": -1 + "cluster": 3 + }, { + "cell_id": 25, + "code": "# Free memory\ndel X_train, X_val, y_train, y_val, df_train, df_test, train_full, test_full", + "class": "Data Transform", + "desc": "The code frees up memory by deleting unnecessary variables and DataFrame objects (`X_train`, `X_val`, `y_train`, `y_val`, `df_train`, `df_test`, `train_full`, `test_full`) from 
the workspace.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.4480442 + }, + "cluster": 8 }, { "cell_id": 5, - "code": "train_df.head(10)", + "code": "plot(train_full)", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 10 rows of the training dataset to give an initial look at the data.", + "desc": "The code generates and displays an exploratory data analysis plot for the training dataset using the DataPrep library.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997634 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99482274 }, "cluster": 2 }, { "cell_id": 6, - "code": "train_df.isnull().sum()", + "code": "create_report(train_full)", "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates and displays the number of missing values in each column of the training dataset.", + "desc": "The code generates a comprehensive report for the training dataset using the DataPrep library.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.99896073 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.5819901 }, - "cluster": 8 + "cluster": 2 }, { "cell_id": 7, - "code": "test_df.isnull().sum()", + "code": "plot(train_full, 'text')", "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates and displays the number of missing values in each column of the testing dataset.", + "desc": "The code generates and displays an exploratory data analysis plot specifically for the 'text' column in the training dataset using the DataPrep library.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.9990055 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99791676 }, - "cluster": 8 + "cluster": 6 }, { "cell_id": 8, - "code": "train_df[train_df.keyword.notnull()].head(10)", + "code": "train_full.text", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 10 rows of the training dataset where the 'keyword' column is not null, providing an overview of the non-missing keyword entries.", + "desc": "The code accesses and displays the 'text' column from the training dataset.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997584 + "predicted_subclass_probability": 0.9863797 }, - "cluster": 4 + "cluster": 2 }, { "cell_id": 9, - "code": "train_df[train_df.keyword.notnull()].tail(10)", + "code": "plot(train_full, \"text\", \"target\")", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the last 10 rows of the training dataset where the 'keyword' column is not null, providing an overview of the non-missing keyword entries towards the end of the dataset.", + "desc": "The code generates and displays an exploratory data analysis plot to visualize the relationship between the 'text' and 'target' columns in the training dataset using the DataPrep library.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - 
"predicted_subclass_probability": 0.9997335 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.988972 }, - "cluster": 4 + "cluster": 6 }, { "cell_id": 10, - "code": "train_df[train_df.keyword.isnull()].head(10)", + "code": "df1 = train_full.text[train_full.target == 0]\ndf2 = train_full.text[train_full.target == 1]\nplot_diff([df1, df2])", + "class": "Exploratory Data Analysis", + "desc": "The code creates two subsets of the 'text' column based on the 'target' column values (0 and 1) and then generates a plot that highlights the differences between these two subsets using the DataPrep library.", + "testing": { + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9446237 + }, + "cluster": 2 + }, { + "cell_id": 17, + "code": "df_train", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first 10 rows of the training dataset where the 'keyword' column is null, providing insight into entries with missing keyword values.", + "desc": "The code displays the `df_train` DataFrame, presumably to allow for inspection of its contents after transformations.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9996885 + "predicted_subclass_probability": 0.9994585 }, - "cluster": 4 + "cluster": 0 }, { "cell_id": 0, - "code": "# Import packeges\nimport os\nimport gc\nimport re\nimport time\nimport warnings\nimport string\nimport numpy as np\nimport pandas as pd\npd.set_option('display.max_rows', 500)\npd.set_option('display.max_columns', 500)\npd.set_option('display.width', 1000)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom nltk.corpus import stopwords\nfrom nltk.util import ngrams # function for making ngrams\nfrom collections import defaultdict\nimport tensorflow as tf\nfrom tensorflow.keras.layers import Dense, Input\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nimport tensorflow_hub as hub\nimport tokenization\n\nwarnings.filterwarnings(\"ignore\")\neng_stopwords = set(stopwords.words(\"english\"))", + "code": "! pip install tf-models-official==2.4.0 -q\n! pip install tensorflow-gpu==2.4.1 -q\n! pip install tensorflow-text==2.4.1 -q\n! python -m spacy download en_core_web_sm -q\n! 
pip install dataprep | grep -v 'already satisfied'", + "class": "Imports and Environment", + "desc": "The code installs specific versions of TensorFlow Models, TensorFlow GPU, TensorFlow Text, 'en_core_web_sm' model for SpaCy, and the 'dataprep' package.", + "testing": { + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.9604686 + }, + "cluster": 0 + }, { + "cell_id": 1, + "code": "import pandas as pd\nimport numpy as np\nnp.set_printoptions(precision=4)\n\nimport tensorflow as tf\nfrom tensorflow import keras\n\n# Visualization\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom dataprep.eda import plot, plot_diff, plot_correlation, create_report\nfrom dataprep.clean import clean_text\n\n# Preprocessing and Modelling\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nimport tensorflow_text as text\nimport tensorflow_hub as hub\nfrom tensorflow.keras.layers import Input, Dense, Flatten, Dropout, concatenate \nfrom tensorflow.keras import Model, regularizers \nfrom tensorflow.keras.metrics import BinaryAccuracy\nfrom tensorflow.keras.losses import BinaryCrossentropy\nfrom official.nlp.optimization import create_optimizer # AdamW optimizer\n# Warning\nimport warnings\nwarnings.filterwarnings('ignore')", "class": "Imports and Environment", - "desc": "This code snippet imports various packages and libraries necessary for data manipulation, visualization, natural language processing, and machine learning, and it also sets some pandas display options and filters warnings.", + "desc": "The code imports essential libraries and modules for data manipulation (Pandas, NumPy), machine learning (TensorFlow, Keras, Scikit-learn), NLP (SpaCy, TensorFlow Text, TensorFlow Hub), visualization (Seaborn, Matplotlib, DataPrep), preprocessing, and modeling, as well as sets print options and configures warning filters.", "testing": { "class": "Imports_and_Environment", "subclass": "set_options", "subclass_id": 23, - "predicted_subclass_probability": 0.9975701 + "predicted_subclass_probability": 0.98988277 }, "cluster": 0 }, { - "cell_id": 27, - "code": "# Thanks to https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub\n# Prediction by BERT model\nmodel.load_weights('model.h5')\ntest_pred_BERT = model.predict(test_input)\ntest_pred_BERT_int = test_pred_BERT.round().astype('int')", - "class": "Model Evaluation", - "desc": "This code snippet loads the best model weights saved during training, makes predictions on the preprocessed test data, and rounds the predictions to the nearest integer for categorical output.", + "cell_id": 2, + "code": "tf.__version__", + "class": "Imports and Environment", + "desc": "The code retrieves and displays the current version of TensorFlow installed in the environment.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.9870588 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9983907 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 21, - "code": "def build_model(bert_layer, max_len=512):\n input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n input_mask = Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n segment_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n\n _, sequence_output = 
bert_layer([input_word_ids, input_mask, segment_ids])\n clf_output = sequence_output[:, 0, :]\n out = Dense(1, activation='sigmoid')(clf_output)\n \n model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])\n \n return model", - "class": "Model Training", - "desc": "This code snippet defines a function called `build_model` that constructs and compiles a deep learning model using a BERT layer for text classification, with sigmoid activation for binary output and binary cross-entropy loss.", + "cell_id": 3, + "code": "# Random seeds\nimport random\nimport numpy as np\nimport tensorflow as tf\nrandom.seed(319)\nnp.random.seed(319)\ntf.random.set_seed(319)", + "class": "Imports and Environment", + "desc": "The code sets the random seeds for the `random`, `NumPy`, and `TensorFlow` libraries to ensure reproducibility.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9912102 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.9086065 }, "cluster": 0 }, { - "cell_id": 22, - "code": "%%time\n\nbert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)", - "class": "Model Training", - "desc": "This code snippet loads a pre-trained BERT model from TensorFlow Hub as a Keras Layer and sets it to be trainable, enabling fine-tuning during the training process.", + "cell_id": 14, + "code": "# Load Spacy Library\nnlp_spacy = spacy.load('en_core_web_sm')\n# Load the sentence encoder\nsentence_enc = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')", + "class": "Imports and Environment", + "desc": "The code loads the 'en_core_web_sm' model from the SpaCy library and the Universal Sentence Encoder from TensorFlow Hub.", "testing": { "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.8483843 + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.99203765 }, "cluster": 0 }, { - "cell_id": 25, - "code": "model = build_model(bert_layer, max_len=160)\nmodel.summary()", - "class": "Model Training", - "desc": "This code snippet builds the BERT-based deep learning model for text classification with a maximum token length of 160 and outputs the model summary to show its architecture.", + "cell_id": 26, + "code": "# Bidirectional Encoder Representations from Transformers (BERT).\nbert_encoder_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4\"\n# Text preprocessing for BERT.\nbert_preprocessor_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3\"\n# Token based text embedding trained on English Google News 200B corpus.\nkeyword_embedding_path = \"https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2\"", + "class": "Imports and Environment", + "desc": "The code defines the URLs for three TensorFlow Hub modules: a BERT encoder, a BERT text preprocessor, and a keyword text embedding model, which will be used in the subsequent model setup and training processes.", "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9372223 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.9857056 }, 
"cluster": 0 }, { - "cell_id": 26, - "code": "checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)\n\ntrain_history = model.fit(\n train_input, train_labels,\n validation_split=0.2,\n epochs=3,\n callbacks=[checkpoint],\n batch_size=32\n)", - "class": "Model Training", - "desc": "This code snippet trains the BERT-based model on the preprocessed training data for 3 epochs, using a validation split of 20% and a model checkpoint callback to save the best model based on validation loss.", + "cell_id": 27, + "code": "bert_encoder = hub.KerasLayer(bert_encoder_path, trainable=True, name=\"BERT_Encoder\")\nbert_preprocessor = hub.KerasLayer(bert_preprocessor_path, name=\"BERT_Preprocessor\")\nnnlm_embed = hub.KerasLayer(keyword_embedding_path, name=\"NNLM_Embedding\")", + "class": "Imports and Environment", + "desc": "The code creates Keras layers for the BERT encoder, BERT text preprocessor, and keyword embedding by loading the specified TensorFlow Hub modules into `hub.KerasLayer` objects, naming them accordingly, and setting the BERT encoder to be trainable.", "testing": { "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9943159 + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.506471 }, "cluster": 0 }, { - "cell_id": 3, - "code": "x=train_df.target.value_counts()\nsns.barplot(x.index,x)\nplt.gca().set_ylabel('# of occurrence')", - "class": "Visualization", - "desc": "This code snippet creates a bar plot to visualize the distribution of the target variable in the training dataset, showing the number of occurrences for each target value.", + "cell_id": 28, + "code": "kernel_initializer = tf.keras.initializers.GlorotNormal(seed=319)\n# Model function\ndef create_model():\n # Keyword Branch\n text_input = Input(shape=(), dtype=tf.string, name=\"text\")\n encoder_inputs = bert_preprocessor(text_input)\n encoder_outputs = bert_encoder(encoder_inputs)\n # Pooled output\n pooled_output = encoder_outputs[\"pooled_output\"]\n bert_branch = Dropout(0.1,\n seed=319,\n name=\"BERT_Dropout\")(pooled_output)\n # Construct keyword layers\n keyword_input = Input(shape=(), dtype=tf.string, name='keyword')\n keyword_embed = nnlm_embed(keyword_input)\n keyword_flat = Flatten(name=\"Keyword_Flatten\")(keyword_embed)\n keyword_dense1 = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4),\n name=\"Keyword_Dense1\"\n )(keyword_flat)\n keyword_branch1 = Dropout(0.5,\n seed=319,\n name='Keyword_dropout1'\n )(keyword_dense1)\n keyword_dense2 = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4),\n name=\"Keyword_Dense2\"\n )(keyword_branch1)\n keyword_branch2 = Dropout(0.5,\n seed=319,\n name='Keyword_dropout2'\n )(keyword_dense2)\n keyword_dense3 = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4),\n name=\"Keyword_Dense3\"\n )(keyword_branch2)\n keyword_branch3 = Dropout(0.5,\n seed=319,\n name='Keyword_dropout3'\n )(keyword_dense3)\n \n # Merge the layers and classify\n merge = concatenate([bert_branch, keyword_branch3], name=\"Concatenate\")\n dense = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4), \n name=\"Merged_Dense\")(merge)\n dropout = Dropout(0.5,\n seed=319,\n name=\"Merged_Dropout\"\n )(dense)\n clf = Dense(1,\n 
activation=\"sigmoid\", \n kernel_initializer=kernel_initializer,\n name=\"Classifier\"\n )(dropout)\n return Model([text_input, keyword_input], \n clf, \n name=\"BERT_Classifier\")", + "class": "Model Training", + "desc": "The code defines a function `create_model` that constructs a neural network combining BERT-based text embeddings and additional keyword-based embeddings, followed by dense and dropout layers, and a final sigmoid-activated layer for binary classification using TensorFlow and Keras.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9978562 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.99885345 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 15, - "code": "plt.figure(figsize=(12,6))\n## sentenses\nplt.subplot(121)\nplt.suptitle(\"Are longer comments more Disastrous\",fontsize=20)\nsns.violinplot(y='word_count',x='target', data=train_df,split=True)\nplt.xlabel('Target?', fontsize=12)\nplt.ylabel('# of words', fontsize=12)\nplt.title(\"Number of words in each comment\", fontsize=15)\n\n# words\nplt.subplot(122)\nsns.violinplot(y='count_letters',x='target', data=train_df,split=True,inner=\"quart\")\nplt.xlabel('Target?', fontsize=12)\nplt.ylabel('# of letters', fontsize=12)\nplt.title(\"Number of letters in each comment\", fontsize=15)\n\nplt.show()", - "class": "Visualization", - "desc": "This code snippet creates a figure with two violin plots to visualize whether longer comments (in terms of word count and letter count) are more likely to be classified as disastrous or non-disastrous.", + "cell_id": 29, + "code": "bert_classifier = create_model()\nbert_classifier.summary()", + "class": "Model Training", + "desc": "The code initializes the BERT classifier model by calling the `create_model` function and then displays a summary of the model architecture using TensorFlow and Keras.", "testing": { "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9839205 - }, - "cluster": 0 + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.99306774 + }, + "cluster": -1 }, { - "cell_id": 16, - "code": "train_df['word_unique_percent']=train_df['unique_word_count']*100/train_df['word_count']\ntest_df['word_unique_percent']=test_df['unique_word_count']*100/test_df['word_count']\nplt.figure(figsize=(12,6))\nplt.subplot(121)\nplt.title(\"Percentage of unique words of total words in comment\")\n#sns.boxplot(x='clean', y='word_unique_percent', data=train_feats)\nax=sns.kdeplot(train_df[train_df.target == 0].word_unique_percent, label=\"Disastrous\",shade=True,color='r')\nax=sns.kdeplot(train_df[train_df.target == 1].word_unique_percent, label=\" Non Disastrous\")\nplt.legend()\nplt.ylabel('Number of occurances', fontsize=12)\nplt.xlabel('Percent unique words', fontsize=12)", + "cell_id": 31, + "code": "EPOCHS = 3\nLEARNING_RATE = 5e-5\n\nSTEPS_PER_EPOCH = int(train_ds.unbatch().cardinality().numpy() / BATCH_SIZE)\nVAL_STEPS = int(val_ds.unbatch().cardinality().numpy() / BATCH_SIZE)\n# Calculate the train and warmup steps for the optimizer\nTRAIN_STEPS = STEPS_PER_EPOCH * EPOCHS\nWARMUP_STEPS = int(TRAIN_STEPS * 0.1)\n\nadamw_optimizer = create_optimizer(\n init_lr=LEARNING_RATE,\n num_train_steps=TRAIN_STEPS,\n num_warmup_steps=WARMUP_STEPS,\n optimizer_type='adamw'\n)", + "class": "Model Training", + "desc": "The code sets training 
parameters such as the number of epochs and learning rate, calculates steps per epoch and validation steps, and creates an AdamW optimizer with a learning rate warm-up using the `create_optimizer` function from TensorFlow's official NLP module.", + "testing": { + "class": "Model_Train", + "subclass": "init_hyperparams", + "subclass_id": 59, + "predicted_subclass_probability": 0.6065405 + }, + "cluster": 2 + }, { + "cell_id": 32, + "code": "STEPS_PER_EPOCH, VAL_STEPS, TRAIN_STEPS, WARMUP_STEPS", + "class": "Model Training", + "desc": "The code outputs the calculated values for steps per epoch, validation steps, total training steps, and warmup steps, which are essential for configuring the optimizer and training process in TensorFlow.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.94944763 + }, + "cluster": -1 + }, { + "cell_id": 33, + "code": "bert_classifier.compile(loss=BinaryCrossentropy(from_logits=True),\n optimizer=adamw_optimizer, \n metrics=[BinaryAccuracy(name=\"accuracy\")]\n )\nhistory = bert_classifier.fit(train_ds, \n epochs=EPOCHS,\n steps_per_epoch=STEPS_PER_EPOCH,\n validation_data=val_ds,\n validation_steps=VAL_STEPS\n )", + "class": "Model Training", + "desc": "The code compiles the BERT classifier model using Binary Crossentropy loss, the AdamW optimizer, and Binary Accuracy metric, and then trains the model for the specified number of epochs while validating it at each epoch using the prepared training and validation datasets.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.97614974 + }, + "cluster": 3 + }, { + "cell_id": 18, + "code": "keyword_non_disaster = df_train.keyword[df_train.target==0].value_counts().reset_index()\nsns.barplot(data=keyword_non_disaster[:10], x='keyword', y='index')\nplt.title('Non-Disaster Keyword Frequency (0)')\nplt.xlabel('Frequency')\nplt.ylabel('Top 10 Keywords')\nplt.show()", "class": "Visualization", - "desc": "This code snippet first calculates the percentage of unique words relative to the total word count for each tweet in both the training and testing datasets, and then creates a KDE plot to visualize the distribution of these percentages for disastrous and non-disastrous tweets in the training dataset.", + "desc": "The code generates a bar plot using Seaborn to display the frequency of the top 10 keywords associated with non-disaster tweets (target=0) in the training dataset.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.97993535 + "predicted_subclass_probability": 0.88048565 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 17, - "code": "def generate_ngrams(text, n_gram=1):\n token = [token for token in text.lower().split(' ') if token != '' if token not in eng_stopwords]\n ngrams = zip(*[token[i:] for i in range(n_gram)])\n return [' '.join(ngram) for ngram in ngrams]\n\n# Bigrams\ndisaster_bigrams = defaultdict(int)\nnondisaster_bigrams = defaultdict(int)\n\nfor tweet in train_df[train_df['target']==1]['text']:\n for word in generate_ngrams(tweet, n_gram=2):\n disaster_bigrams[word] += 1\n \nfor tweet in train_df[train_df['target']==0]['text']:\n for word in generate_ngrams(tweet, n_gram=2):\n nondisaster_bigrams[word] += 1\n \ndf_disaster_bigrams = pd.DataFrame(sorted(disaster_bigrams.items(), key=lambda x: x[1])[::-1])\ndf_nondisaster_bigrams = 
pd.DataFrame(sorted(nondisaster_bigrams.items(), key=lambda x: x[1])[::-1])\n\nfig, axes = plt.subplots(ncols=2, figsize=(10, 10))\nplt.tight_layout()\nsns.barplot(y=df_disaster_bigrams[0].values[:10], x=df_disaster_bigrams[1].values[:10], ax=axes[0], color='cyan')\nsns.barplot(y=df_nondisaster_bigrams[0].values[:10], x=df_nondisaster_bigrams[1].values[:10], ax=axes[1], color='pink')\nfor i in range(2):\n axes[i].spines['right'].set_visible(False)\n axes[i].set_xlabel('')\n axes[i].set_ylabel('')\n axes[i].tick_params(axis='x', labelsize=10)\n axes[i].tick_params(axis='y', labelsize=10)\naxes[0].set_title('most common bigrams in Disaster Tweets', fontsize=15)\naxes[1].set_title('most common bigrams in Non-disaster Tweets', fontsize=15)\nplt.show()", + "cell_id": 19, + "code": "keyword_disaster = df_train.keyword[df_train.target==1].value_counts().reset_index()\nsns.barplot(data=keyword_non_disaster[:10], x='keyword', y='index')\nplt.title('Non-Disaster Keyword Frequency (0)')\nplt.xlabel('Frequency')\nplt.ylabel('Top 10 Keywords')\nplt.show()", "class": "Visualization", - "desc": "This code snippet defines a function to generate n-grams, computes the most common bigrams for both disastrous and non-disastrous tweets in the training dataset, and visualizes the top 10 bigrams for each category using bar plots side by side.", + "desc": "The code generates a bar plot using Seaborn to display the frequency of the top 10 keywords associated with non-disaster tweets (target=0), mistakenly repeating the previous plot instead of focusing on disaster tweets (target=1).", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.95510435 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.8303329 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 18, - "code": "# Trigrams\ndisaster_trigrams = defaultdict(int)\nnondisaster_trigrams = defaultdict(int)\n\nfor tweet in train_df[train_df['target']==1]['text']:\n for word in generate_ngrams(tweet, n_gram=3):\n disaster_trigrams[word] += 1\n \nfor tweet in train_df[train_df['target']==0]['text']:\n for word in generate_ngrams(tweet, n_gram=3):\n nondisaster_trigrams[word] += 1\n \ndf_disaster_trigrams = pd.DataFrame(sorted(disaster_trigrams.items(), key=lambda x: x[1])[::-1])\ndf_nondisaster_trigrams = pd.DataFrame(sorted(nondisaster_trigrams.items(), key=lambda x: x[1])[::-1])\n\nfig, axes = plt.subplots(ncols=2, figsize=(10, 10))\nplt.tight_layout()\nsns.barplot(y=df_disaster_trigrams[0].values[:10], x=df_disaster_trigrams[1].values[:10], ax=axes[0], color='cyan')\nsns.barplot(y=df_nondisaster_trigrams[0].values[:10], x=df_nondisaster_trigrams[1].values[:10], ax=axes[1], color='pink')\nfor i in range(2):\n axes[i].spines['right'].set_visible(False)\n axes[i].set_xlabel('')\n axes[i].set_ylabel('')\n axes[i].tick_params(axis='x', labelsize=10)\n axes[i].tick_params(axis='y', labelsize=10)\naxes[0].set_title('most common trigrams in Disaster Tweets', fontsize=15)\naxes[1].set_title('most common trigrams in Non-disaster Tweets', fontsize=15)\nplt.show()", + "cell_id": 30, + "code": "keras.utils.plot_model(bert_classifier, \n show_shapes=False)", "class": "Visualization", - "desc": "This code snippet calculates the most common trigrams for both disastrous and non-disastrous tweets in the training dataset, and visualizes the top 10 trigrams for each category using bar plots side by side.", + "desc": "The code generates a 
visual diagram of the BERT classifier model architecture using the `plot_model` function from Keras' utilities.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.94842196 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.9785347 }, - "cluster": 0 + "cluster": -1 }], - "notebook_id": 14, - "notebook_name": "nlp-eda-cleaning-bert" + "notebook_id": 12, + "notebook_name": "using-keywords-embedding-to-improve-bert-model.ipynb" }, { "cells": [{ - "cell_id": 43, - "code": "test_bow = vectorizer.transform(test.tokens)\ntest_bow = selector.transform(test_bow)\nclassifier = LogisticRegression(C=0.1)\n\n# use the whole training dataset now\nclassifier.fit(x, y)\npredicted = classifier.predict(test_bow)\nsubmission = pd.DataFrame({'id': test.id, 'target': predicted})\nsubmission.to_csv('bow-linear.csv', index=False)", - "class": "Data Export", - "desc": "This code snippet applies the previously trained vectorizer and feature selector to the test data, retrains a Logistic Regression classifier on the entire training dataset, makes predictions on the test set, and exports the results to a CSV file named 'bow-linear.csv'.", - "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.99929416 - }, - "cluster": 0 - }, { - "cell_id": 62, - "code": "predicted = logits > 0\nsubmission = pd.DataFrame({'id': test.id, 'target': predicted.astype(np.int)})\nsubmission.to_csv('embeddings.csv', index=False)", + "cell_id": 35, + "code": "sample_submission.to_csv(\"submission.csv\", index=False)", "class": "Data Export", - "desc": "This code snippet converts the predicted logits into binary class labels, creates a DataFrame with the test sample IDs and their corresponding predictions, and exports the results to a CSV file named 'embeddings.csv'.", + "desc": "This code snippet saves the sample_submission DataFrame to a CSV file named \"submission.csv\" without including the index, using the to_csv() method.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9992009 + "predicted_subclass_probability": 0.999154 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 73, - "code": "# trainer.predict returns a list with batch results\nlogits = np.concatenate(trainer.predict(model, test_loader), axis=0)\npredicted = logits.argmax(1)\nsubmission = pd.DataFrame({'id': test.id, 'target': predicted})\nsubmission.to_csv('roberta.csv', index=False)", - "class": "Data Export", - "desc": "This code snippet uses the trained `TransformerWrapper` model to make predictions on the test data using the trainer's `predict` method, converts the logits to binary class labels, and exports the results to a CSV file named 'roberta.csv'.", + "cell_id": 1, + "code": "train_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")", + "class": "Data Extraction", + "desc": "This code snippet reads the training and test datasets from CSV files using pandas and stores them in DataFrame objects train_df and test_df.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9993868 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99975425 }, "cluster": 
0 }, { - "cell_id": 74, - "code": "!head *.csv", - "class": "Data Export", - "desc": "This code snippet executes a shell command to display the first few lines of all CSV files in the current directory, likely to verify the contents of the exported results.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99973243 - }, - "cluster": -1 - }, { - "cell_id": 2, - "code": "train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\ntest = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')", + "cell_id": 32, + "code": "sample_submission = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")", "class": "Data Extraction", - "desc": "This code snippet reads the training and test datasets from specified CSV files into Pandas DataFrames.", + "desc": "This code snippet reads the sample submission CSV file into a DataFrame named sample_submission using the pandas read_csv() method.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.99975055 + "predicted_subclass_probability": 0.99969256 }, - "cluster": 2 + "cluster": 0 }, { - "cell_id": 44, - "code": "filename = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'\nword_dict = {}\nembeddings = []\nwith open(filename, 'r') as f:\n for line in tqdm(f, total=400000):\n word, vector_string = line.split(' ', 1)\n vector = [float(value) for value in vector_string.split()]\n embeddings.append(vector)\n word_dict[word] = len(word_dict)\n\nembeddings = torch.tensor(embeddings)", - "class": "Data Extraction", - "desc": "This code snippet reads the GloVe word embeddings from a specified file, constructs a dictionary mapping each word to its index, and creates a tensor to store the embeddings.", + "cell_id": 3, + "code": "train_df = train_df.drop(['id', 'keyword', 'location'], axis = 1)", + "class": "Data Transform", + "desc": "This code snippet removes the columns 'id', 'keyword', and 'location' from the train_df DataFrame using the drop() method with axis set to 1.", "testing": { "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9478619 + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.9992505 }, - "cluster": 0 + "cluster": 7 }, { - "cell_id": 63, - "code": "pretrained_name = 'distilroberta-base'\ntokenizer = RobertaTokenizerFast.from_pretrained(pretrained_name)\nroberta = RobertaForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)", - "class": "Data Extraction", - "desc": "This code snippet initializes a tokenizer and a pretrained RoBERTa model for sequence classification from the 'distilroberta-base' model, configured for a binary classification task.", + "cell_id": 12, + "code": "#remove duplicated rows\ntrain_df.drop_duplicates(inplace=True)", + "class": "Data Transform", + "desc": "This code snippet removes duplicate rows from the train_df DataFrame in-place using the drop_duplicates() method.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9948885 + "class": "Data_Transform", + "subclass": "remove_duplicates", + "subclass_id": 19, + "predicted_subclass_probability": 0.8869491 }, - "cluster": -1 + "cluster": 7 }, { - "cell_id": 9, - "code": "train.drop(['location', 'keyword'], axis=1, 
inplace=True)\ntest.drop(['location', 'keyword'], axis=1, inplace=True)", + "cell_id": 16, + "code": "Real_Disaster_df = train_df[train_df['target'] == 1]\nReal_Disaster_df.head()", + "class": "Data Transform", + "desc": "This code snippet creates a new DataFrame, Real_Disaster_df, containing only the rows from train_df where the 'target' column equals 1, and displays the first few rows using the head() method.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.5334526 + }, + "cluster": 7 + }, { + "cell_id": 17, + "code": "Not_Real_Disaster_df = train_df[train_df['target'] == 0]\nNot_Real_Disaster_df.head()", "class": "Data Transform", - "desc": "This code snippet removes the 'location' and 'keyword' columns from both the training and test datasets.", + "desc": "This code snippet creates a new DataFrame, Not_Real_Disaster_df, containing only the rows from train_df where the 'target' column equals 0, and displays the first few rows using the head() method.", "testing": { "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.9991115 + "subclass": "filter", + "subclass_id": 14, + "predicted_subclass_probability": 0.5007087 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 11, - "code": "nlp = English()\ntokenizer = nlp.tokenizer\ntokens = tokenizer('This is a test!')\nprint(tokens)\nprint(type(tokens))\nprint([t.text for t in tokens])", + "cell_id": 18, + "code": "Real_Disaster_text = ' '.join(Real_Disaster_df.text.tolist())", "class": "Data Transform", - "desc": "This code snippet initializes a tokenizer using SpaCy's English language model, tokenizes a sample sentence, and prints the tokens and their types along with their text representation.", + "desc": "This code snippet concatenates all the text entries from the Real_Disaster_df DataFrame into a single string using the join() method.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, - "predicted_subclass_probability": 0.46164313 + "predicted_subclass_probability": 0.9610504 }, "cluster": 1 }, { - "cell_id": 12, - "code": "text = \"Don't split #hashtags!\"\nprint('Before:', [t for t in tokenizer(text)])\n\nprefixes = list(nlp.Defaults.prefixes)\nprefixes.remove('#')\nprefix_regex = spacy.util.compile_prefix_regex(prefixes)\ntokenizer.prefix_search = prefix_regex.search\n\nprint('After:', [t for t in tokenizer(text)])", + "cell_id": 20, + "code": "Not_Real_Disaster_text = ' '.join(Not_Real_Disaster_df.text.tolist())", "class": "Data Transform", - "desc": "This code snippet customizes the SpaCy tokenizer to ensure that hashtags are not split by removing the '#' symbol from the default prefixes, then demonstrates the effect on a sample text.", + "desc": "This code snippet concatenates all the text entries from the Not_Real_Disaster_df DataFrame into a single string using the join() method.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, - "predicted_subclass_probability": 0.5677337 + "predicted_subclass_probability": 0.9576982 }, "cluster": 1 }, { - "cell_id": 13, - "code": "text = 'This is a test\\n , ok?'\nprint('All tokens:', [t.text for t in tokenizer(text)])\n\nprint('Check for is_space():', [t.text for t in tokenizer(text) if not t.is_space])", + "cell_id": 22, + "code": "# take text and preprocess 'remove stopwords [a, the, and, thus, ... 
etc] and punctations[,%$ ..etc] and len of text less than 3' \ndef clean_text(text):\n \"\"\"\n text: a string \n return: cleaned string\n \"\"\"\n result = []\n for token in simple_preprocess(text):\n if token not in STOPWORDS and token not in punctation and len(token) >= 3 :\n token = token.lower() \n result.append(token) \n return \" \".join(result)", "class": "Data Transform", - "desc": "This code snippet tokenizes a sample text and prints all tokens, then filters and prints tokens that are not spaces by checking the `is_space` attribute.", + "desc": "This code snippet defines a function named clean_text that preprocesses a given text string by removing stopwords, punctuation, and words shorter than three characters using the Gensim simple_preprocess method and Python string operations.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, - "predicted_subclass_probability": 0.9729691 + "predicted_subclass_probability": 0.9118299 }, - "cluster": 1 + "cluster": 8 }, { - "cell_id": 14, - "code": "train['tokens'] = train['text'].apply(lambda row: [t.text.lower() for t in tokenizer(row) if not t.is_space])\ntest['tokens'] = test['text'].apply(lambda row: [t.text.lower() for t in tokenizer(row) if not t.is_space])", + "cell_id": 23, + "code": "train_df['text'] = train_df['text'].map(clean_text)\ntrain_df.head()", "class": "Data Transform", - "desc": "This code snippet creates a new 'tokens' column in both the training and test datasets by tokenizing the 'text' column, converting tokens to lowercase, and excluding space tokens.", + "desc": "This code snippet applies the clean_text function to preprocess the 'text' column of the train_df DataFrame and displays the first few rows using the head() method.", "testing": { "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 16, - "predicted_subclass_probability": 0.5443664 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.99127215 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 18, - "code": "from sklearn.feature_extraction.text import CountVectorizer\n\n# min and max document frequency (ratio of documents containing that token)\nmin_df = 5\nmax_df = 0.6\n\n# limit vocabulary size as a function of the training data\nmax_features = len(train) * 2\n\nvectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x, min_df=min_df, max_df=max_df, max_features=max_features, binary=True)\ntrain_bow = vectorizer.fit_transform(train.tokens)\ntrain_bow", + "cell_id": 24, + "code": "from sklearn.utils import shuffle\ntrain_df_shuffled = shuffle(train_df)\ntrain_df_shuffled.head()", "class": "Data Transform", - "desc": "This code snippet initializes a `CountVectorizer` with specific parameters, including lower and upper document frequency thresholds, and fits it to the tokenized training data to transform it into a binary Bag-of-Words (BoW) representation.", + "desc": "This code snippet shuffles the rows in the train_df DataFrame using the shuffle() method from the sklearn.utils module and displays the first few rows of the shuffled DataFrame.", "testing": { "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.97597075 + "subclass": "normalization", + "subclass_id": 18, + "predicted_subclass_probability": 0.7203666 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 21, - "code": "x = train_bow\ny = train['target']", + "cell_id": 25, + "code": "X = train_df_shuffled['text']\ny = 
train_df_shuffled['target']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42, stratify = y)", "class": "Data Transform", - "desc": "This code snippet extracts the features (Bag-of-Words matrix) and target labels from the training dataset and assigns them to variables `x` and `y` respectively.", + "desc": "This code snippet divides the text data (X) and target labels (y) from the shuffled DataFrame into training and testing sets using the train_test_split() method, with 20% of the data reserved for testing, a random state for reproducibility, and stratification based on the target labels.", "testing": { "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.99928766 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.995934 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 33, - "code": "# min and max document frequency (ratio of documents containing that token)\nmin_df = 10\nmax_df = 0.6\n\n# limit vocabulary size as a function of the training data\nmax_features = len(train) * 2\n\n# single words to 3-grams\nngram_range = (1, 3)\n\nvectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x, min_df=min_df, max_df=max_df, max_features=max_features, binary=True, ngram_range=ngram_range)\nx = train_bow = vectorizer.fit_transform(train.tokens)\n\nvocab = vectorizer.get_feature_names()\nword_count = train_bow.toarray().sum(0)\n\nplot_top_values(word_count, k, vocab, 'Count', 'Type')", + "cell_id": 29, + "code": "test_df = test_df.drop(['id', 'keyword', 'location'], axis = 1)", "class": "Data Transform", - "desc": "This code snippet initializes a `CountVectorizer` to create a binary Bag-of-Words representation of the training data that includes unigrams, bigrams, and trigrams, and then visualizes the top 50 most frequent n-grams.", + "desc": "This code snippet removes the columns 'id', 'keyword', and 'location' from the test_df DataFrame using the drop() method with axis set to 1.", "testing": { - "class": "Visualization", - "subclass": "relationship", - "subclass_id": 81, - "predicted_subclass_probability": 0.74612176 + "class": "Data_Transform", + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.99925584 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 46, - "code": "oov_count = Counter()\nall_tokens = []\n\nfor row in train.tokens:\n tokens = [t[1:] if t.startswith('#') else t for t in row]\n all_tokens.append(tokens)\n oov_count.update(set(t for t in tokens if t not in word_dict))", + "cell_id": 30, + "code": "test_df['text'] = test_df['text'].map(clean_text)\ntest_df.head()", "class": "Data Transform", - "desc": "This code snippet processes the tokens from the training data by stripping hashtags and counts the occurrences of out-of-vocabulary (OOV) tokens that are not found in the GloVe word dictionary.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.68699807 - }, - "cluster": 1 - }, { - "cell_id": 47, - "code": "test_tokens = []\nfor row in test.tokens:\n tokens = [t[1:] if t.startswith('#') else t for t in row]\n test_tokens.append(tokens)", - "class": "Data Transform", - "desc": "This code snippet processes the tokens from the test data by stripping hashtags and stores the processed tokens in the `test_tokens` list.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - 
"subclass_id": 77, - "predicted_subclass_probability": 0.9887138 - }, - "cluster": 1 - }, { - "cell_id": 49, - "code": "words_to_add = [w for w in oov_count if oov_count[w] > 2]\nfor word in words_to_add:\n word_dict[word] = len(word_dict)\n\nnew_vectors = torch.zeros((len(words_to_add), embeddings.shape[1]))\nembeddings = torch.cat([embeddings, new_vectors], dim=0)\nprint(len(word_dict), embeddings.shape)", - "class": "Data Transform", - "desc": "This code snippet updates the word dictionary to include OOV tokens that appear more than twice, initializes zero vectors for these new words, and appends these vectors to the existing embeddings tensor.", - "testing": { - "class": "Data_Transform", - "subclass": "concatenate", - "subclass_id": 11, - "predicted_subclass_probability": 0.89962006 - }, - "cluster": 1 - }, { - "cell_id": 51, - "code": "def convert_to_indices(all_tokens):\n word_indices = []\n\n for tokens in all_tokens:\n tweet_inds = torch.tensor([word_dict[t] for t in tokens if t in word_dict], dtype=torch.long)\n word_indices.append(tweet_inds)\n \n return word_indices\n\nword_indices = convert_to_indices(all_tokens)\ntest_word_indices = convert_to_indices(test_tokens)", - "class": "Data Transform", - "desc": "This code snippet defines a function to convert tokenized texts into lists of word indices based on the word dictionary, then applies this function to both the training and test data tokens.", + "desc": "This code snippet applies the clean_text function to preprocess the 'text' column of the test_df DataFrame and displays the first few rows using the head() method.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.92438155 + "predicted_subclass_probability": 0.99245125 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 54, - "code": "def collate_as_list(samples):\n \"\"\"Function for the DataLoader to combine samples in a batch. 
Each sample is a (x, y) pair.\"\"\"\n x, y = list(zip(*samples))\n if y[0] is None:\n return x\n return x, torch.tensor(y).float()\n\n\nclass WordIndexDataset(Dataset):\n def __init__(self, x, y=None):\n self.x = x\n self.y = y\n \n def __getitem__(self, i):\n if self.y is not None:\n return self.x[i], self.y[i]\n else:\n return self.x[i], None\n \n def __len__(self):\n return len(self.x)\n", + "cell_id": 33, + "code": "sample_submission[\"target\"] = y_pred", "class": "Data Transform", - "desc": "This code snippet defines a function for the DataLoader to properly collate samples into batches and a dataset class `WordIndexDataset` that handles data loading by providing the length and indexing methods for datasets with or without target labels.", + "desc": "This code snippet assigns the predicted target values (y_pred) to the \"target\" column of the sample_submission DataFrame.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.7811093 + "class": "Data_Export", + "subclass": "prepare_output", + "subclass_id": 55, + "predicted_subclass_probability": 0.7656245 }, "cluster": 1 }, { - "cell_id": 55, - "code": "validation_size = int(0.1 * len(train))\nvalidation_inds = np.random.choice(np.arange(len(train)), size=validation_size, replace=False)\nis_train = np.ones(len(train), dtype=np.bool)\nis_train[validation_inds] = False\n\n# use an object array since we have varied size tensors\ntweets = np.array(word_indices, dtype=object)\ntarget = train.target.to_numpy()\n# train_tweets, valid_tweets, train_target, valid_target = train_test_split(tweets, target, test_size=0.1, stratify=target)\ntrain_tweets = tweets[is_train].tolist()\ntrain_target = target[is_train]\nvalid_tweets = tweets[~is_train].tolist()\nvalid_target = target[~is_train]\n\ntrain_data = WordIndexDataset(train_tweets, train_target)\nvalid_data = WordIndexDataset(valid_tweets, valid_target)\ntest_data = WordIndexDataset(test_word_indices)\ntrain_loader = DataLoader(train_data, batch_size=32, collate_fn=collate_as_list)\nvalid_loader = DataLoader(valid_data, batch_size=256, collate_fn=collate_as_list)\ntest_loader = DataLoader(test_data, batch_size=256, collate_fn=collate_as_list)", - "class": "Data Transform", - "desc": "This code snippet creates train-validation splits by randomly selecting a validation set, converts tweets and targets into the appropriate format, and initializes DataLoader objects for the training, validation, and test datasets with specified batch sizes and a custom collation function.", + "cell_id": 2, + "code": "train_df.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first few rows of the training dataset stored in the train_df DataFrame using the head() method.", "testing": { - "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.46862042 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997545 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 64, - "code": "# create tensors of variable sizes\n# note that the tokenizer returns a tensor with shape [1, num_tokens]\ntrain_tokens = train.text[is_train].apply(lambda s: tokenizer.encode(s, return_tensors='pt')[0]).tolist()\nvalid_tokens = train.text[~is_train].apply(lambda s: tokenizer.encode(s, return_tensors='pt')[0]).tolist()\ntest_tokens = test.text.apply(lambda s: tokenizer.encode(s, 
return_tensors='pt')[0]).tolist()\n\n# add padding to have a fixed size matrix. With bigger datasets we should be careful about memory usage, but this is small enough to skip this kind of optimization\npadding = tokenizer.pad_token_id\nx_train = pad_sequence(train_tokens, batch_first=True, padding_value=padding)\nx_valid = pad_sequence(valid_tokens, batch_first=True, padding_value=padding)\nx_test = pad_sequence(test_tokens, batch_first=True, padding_value=padding)\n\nx_train_mask = x_train != padding\nx_valid_mask = x_valid != padding\nx_test_mask = x_test != padding\nprint(f'x_train shape: {x_train.shape}, x_valid shape: {x_valid.shape}, x_test shape: {x_test.shape}')", - "class": "Data Transform", - "desc": "This code snippet tokenizes the text data for training, validation, and test sets using the RoBERTa tokenizer, pads the sequences to ensure uniform lengths, and creates attention masks where padding tokens are marked for the training, validation, and test sets, finally printing the shapes of these padded tensors.", + "cell_id": 4, + "code": "train_df.shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet returns the dimensions (number of rows and columns) of the train_df DataFrame using the shape attribute.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9920512 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9995821 }, "cluster": 1 }, { - "cell_id": 65, - "code": "train_data = TensorDataset(x_train, x_train_mask, torch.tensor(train_target))\nvalid_data = TensorDataset(x_valid, x_valid_mask, torch.tensor(valid_target))\ntest_data = TensorDataset(x_test, x_test_mask)\n\ntrain_loader = DataLoader(train_data, batch_size=32)\nvalid_loader = DataLoader(valid_data, batch_size=256)\ntest_loader = DataLoader(test_data, batch_size=256)", - "class": "Data Transform", - "desc": "This code snippet organizes the padded sequences and their corresponding attention masks, along with their target labels for training and validation sets, into `TensorDataset` objects, and then initializes DataLoaders for the training, validation, and test datasets with specified batch sizes.", + "cell_id": 5, + "code": "train_df.columns", + "class": "Exploratory Data Analysis", + "desc": "This code snippet returns the column labels of the train_df DataFrame using the columns attribute.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.97746176 + "class": "Exploratory_Data_Analysis", + "subclass": "show_columns", + "subclass_id": 71, + "predicted_subclass_probability": 0.9984144 }, "cluster": 1 }, { - "cell_id": 3, - "code": "train.head()", + "cell_id": 6, + "code": "train_df.info()", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the training dataset to provide an initial overview of the data.", + "desc": "This code snippet provides a concise summary of the train_df DataFrame, including the data types and non-null counts for each column by using the info() method.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997507 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9993624 }, - "cluster": 12 + "cluster": 0 }, { - "cell_id": 8, - "code": "min_freq = 
5\nabove_threshold = train.location.value_counts() > min_freq\nfrequent_places = above_threshold.index[above_threshold]\ndata = train[train.location.isin(frequent_places)].location\nprint(f'{data.nunique()} unique locations with more than {min_freq} occurrences')", + "cell_id": 7, + "code": "train_df.describe()", "class": "Exploratory Data Analysis", - "desc": "This code snippet filters the training dataset to find locations with more than a specified minimum frequency and prints the count of unique locations that meet this criterion.", + "desc": "This code snippet generates descriptive statistics for the numeric columns in the train_df DataFrame using the describe() method.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_unique_values", - "subclass_id": 54, - "predicted_subclass_probability": 0.9539694 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9994492 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 10, - "code": "train.text.isna().sum(), test.text.isna().sum()", + "cell_id": 8, + "code": "train_df[train_df[\"target\"] == 1][\"text\"].values[0]", "class": "Exploratory Data Analysis", - "desc": "This code snippet checks and returns the number of missing values in the 'text' column of both the training and test datasets.", + "desc": "This code snippet retrieves the text of the first row in the train_df DataFrame where the \"target\" column has a value of 1, by using the filter and selection methods.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.9989147 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.95915043 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 15, - "code": "train.sample(10)", + "cell_id": 9, + "code": "train_df[train_df[\"target\"] == 1][\"text\"].values[1]", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays a random sample of 10 rows from the training dataset.", + "desc": "This code snippet retrieves the text of the second row in the train_df DataFrame where the \"target\" column has a value of 1, by using the filter and selection methods.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.99975437 + "predicted_subclass_probability": 0.5788779 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 25, - "code": "def get_rows_containing(data, term):\n \"\"\"Return rows containing a term\"\"\"\n has_term = data.tokens.apply(lambda row: term in row)\n return data[has_term]\n\nterms = ['bags', 'australia']\nfor term in terms:\n rows = get_rows_containing(train, term)\n print(f'Distribution containing {term}:')\n print(rows.target.value_counts())\n for i, row in rows.sample(5).iterrows():\n print(row.target, row.text)\n print()", + "cell_id": 10, + "code": "print(\"Number of duplicates in data : {}\".format(len(train_df[train_df.duplicated()])))", "class": "Exploratory Data Analysis", - "desc": "This code snippet defines a function to retrieve dataset rows containing a specific term, and then analyzes and prints the target distribution and sample texts for rows containing certain specified terms ('bags' and 'australia').", + "desc": "This code snippet prints the number of duplicate rows in the train_df DataFrame by using the duplicated() method and the len() function.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": 
"count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.57849276 + "subclass": "count_duplicates", + "subclass_id": 38, + "predicted_subclass_probability": 0.8543922 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 34, - "code": "x.shape", + "cell_id": 11, + "code": "print(\"Duplicated rows before remove them : \")\ntrain_df[train_df.duplicated(keep=False)].sort_values(by=\"text\").head(8)", "class": "Exploratory Data Analysis", - "desc": "This code snippet returns the shape of the feature matrix `x` to provide information on the number of samples and features after transforming the training data with the updated `CountVectorizer`.", + "desc": "This code snippet prints a message and displays the first 8 duplicate rows (including all occurrences of each duplicate) from the train_df DataFrame, sorted by the \"text\" column using the duplicated(keep=False) method and sort_values() method.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9995432 + "subclass": "count_duplicates", + "subclass_id": 38, + "predicted_subclass_probability": 0.859677 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 45, - "code": "print(embeddings.shape)\nprint(len(word_dict))", + "cell_id": 13, + "code": "print(\"Number of duplicates in data : {}\".format(len(train_df[train_df.duplicated()])))", "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the shape of the embeddings tensor and the length of the word dictionary to confirm the dimensions and size of the loaded GloVe embeddings.", + "desc": "This code snippet prints the number of remaining duplicate rows in the train_df DataFrame after duplicates have been removed, by using the duplicated() method and the len() function.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9995413 + "subclass": "count_duplicates", + "subclass_id": 38, + "predicted_subclass_probability": 0.8543922 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 48, - "code": "oov_count.most_common(10)", + "cell_id": 14, + "code": "train_df['target'].value_counts()", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the 10 most common out-of-vocabulary (OOV) tokens from the processed training data.", + "desc": "This code snippet returns the count of occurrences for each unique value in the 'target' column of the train_df DataFrame using the value_counts() method.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.94204676 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9995184 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 50, - "code": "len(oov_count)", + "cell_id": 26, + "code": "X_test", "class": "Exploratory Data Analysis", - "desc": "This code snippet returns the total number of unique out-of-vocabulary (OOV) tokens found in the processed training data.", + "desc": "This code snippet returns the content of the X_test DataFrame, which contains the text data designated for testing.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9988147 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99974364 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 0, - "code": "from 
collections import Counter\n\nimport seaborn as sns\nimport numpy as np \nimport pandas as pd\nfrom matplotlib import pyplot as plt\nimport spacy\nfrom tqdm import tqdm\nfrom spacy.lang.en import English\nimport torch\nfrom torch import nn\nfrom torch.nn.utils.rnn import pad_sequence\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset, TensorDataset, DataLoader\nimport pytorch_lightning as pl\n\nfrom sklearn.model_selection import train_test_split, cross_validate, cross_val_score\nfrom sklearn.metrics import f1_score\nfrom sklearn.linear_model import LogisticRegression, RidgeClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\nfrom sklearn.svm import SVC\nfrom xgboost import XGBClassifier\nfrom transformers import RobertaForSequenceClassification, RobertaTokenizerFast", - "class": "Imports and Environment", - "desc": "This code snippet imports various libraries and modules required for tasks related to data manipulation, visualization, natural language processing, neural networks, and machine learning.", + "cell_id": 34, + "code": "sample_submission.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first few rows of the sample_submission DataFrame using the head() method.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9993081 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99975234 }, "cluster": 0 }, { - "cell_id": 1, - "code": "np.random.seed(42)\n\n# prettier graphs!\nplt.style.use('ggplot')", + "cell_id": 0, + "code": "import numpy as np \nimport pandas as pd \nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nfrom sklearn import feature_extraction, linear_model, model_selection, preprocessing\n\n#sklearn \nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils.class_weight import compute_sample_weight\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.linear_model import SGDClassifier\n\n# nlp preprocessing lib\nimport gensim\nfrom gensim.utils import simple_preprocess\nfrom gensim.parsing.preprocessing import STOPWORDS\nimport string \npunctation = string.punctuation", "class": "Imports and Environment", - "desc": "This snippet sets a random seed for NumPy to ensure reproducibility and configures Matplotlib to use the 'ggplot' style for prettier graphs.", + "desc": "This code snippet imports various libraries and packages essential for data manipulation (NumPy, pandas), visualization (seaborn, matplotlib), natural language processing (Gensim, WordCloud), and machine learning (scikit-learn), and sets a variable for punctuation characters.", "testing": { "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.9983991 + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99901414 }, "cluster": 0 }, { - "cell_id": 22, - "code": "majority = y.mode()[0] == y\nprint(f'Majority class baseline: {majority.mean()}')", + "cell_id": 31, + "code": "y_pred = 
nb_classifier.predict(test_df['text'])", "class": "Model Evaluation", - "desc": "This code snippet calculates the accuracy of a baseline model that always predicts the majority class and prints the result.", + "desc": "This code snippet uses the trained Naive Bayes classifier pipeline (nb_classifier) to predict the target values for the preprocessed text data in the test_df DataFrame.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.91397643 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.994578 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 23, - "code": "classifier = LogisticRegression()\ncv_scores = cross_val_score(classifier, x, y, scoring='f1', cv=10, n_jobs=-1)\nprint(f'Mean F1: {cv_scores.mean()}')", - "class": "Model Evaluation", - "desc": "This code snippet trains a Logistic Regression classifier using 10-fold cross-validation and prints the mean F1 score.", + "cell_id": 27, + "code": "from sklearn.model_selection import cross_val_score\nnb_classifier = Pipeline([('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', MultinomialNB()),])\n\nnb_classifier.fit(X_train, y_train)\n\ny_pred = nb_classifier.predict(X_test)\nprint('accuracy {}'.format(accuracy_score(y_pred, y_test)))", + "class": "Model Training", + "desc": "This code snippet constructs a machine learning pipeline with scikit-learn components\u2014CountVectorizer, TfidfTransformer, and MultinomialNB classifier\u2014fits the pipeline to the training data, makes predictions on the test data, and prints the accuracy score.", "testing": { "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.97591215 + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.7887582 }, "cluster": 0 }, { - "cell_id": 26, - "code": "from sklearn.feature_selection import chi2, SelectKBest\n\nnum_features = [1000, 500, 250, 100, 50]\nf1 = []\nfor k in num_features:\n selector = SelectKBest(chi2, k=k)\n x_selected = selector.fit_transform(x, y)\n scores = cross_val_score(classifier, x_selected, y, scoring='f1', cv=10, n_jobs=-1)\n f1.append(scores.mean())\n", - "class": "Model Evaluation", - "desc": "This code snippet performs feature selection using the chi-squared test and evaluates Logistic Regression classifier performance for different numbers of features, storing the mean F1 scores for each configuration.", + "cell_id": 28, + "code": "sgd = Pipeline([('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier(loss='epsilon_insensitive', penalty='l2',alpha=1e-3, random_state=42, max_iter=1000, tol=None)),])\n\n\nsgd.fit(X_train, y_train)\ny_pred = sgd.predict(X_test)\nprint('accuracy {}'.format(accuracy_score(y_pred, y_test)))", + "class": "Model Training", + "desc": "This code snippet constructs and trains a machine learning pipeline using scikit-learn components\u2014CountVectorizer, TfidfTransformer, and SGDClassifier\u2014with specific hyperparameters, fits the pipeline to the training data, makes predictions on the test data, and prints the accuracy score.", "testing": { "class": "Model_Train", - "subclass": "find_best_params", - "subclass_id": 2, - "predicted_subclass_probability": 0.48363486 - }, - "cluster": 0 - }, { - "cell_id": 30, - "code": "regularization = [1, 0.1, 0.01, 0.001, 0.0001]\nl1_scores = []\nl2_scores = 
[]\nl1_std = []\nl2_std = []\n\nfor value in regularization:\n log_reg = LogisticRegression(C=value)\n results = cross_val_score(log_reg, x_selected, y, scoring='f1', cv=10, n_jobs=-1)\n l2_scores.append(results.mean())\n l2_std.append(results.std())\n \n alpha = 1 / (2 * value) # as defined in sklearn\n ridge = RidgeClassifier(alpha=alpha)\n results = cross_val_score(ridge, x_selected, y, scoring='f1', cv=10, n_jobs=-1)\n l1_scores.append(results.mean())\n l1_std.append(results.std())", - "class": "Model Evaluation", - "desc": "This code snippet evaluates the performance of Logistic Regression and Ridge Classifier models with different regularization strengths using 10-fold cross-validation, storing both the mean F1 scores and their standard deviations for each configuration.", - "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.39513838 + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.95042425 }, "cluster": 0 }, { - "cell_id": 32, - "code": "print(f'Best baseline F1: {l2_scores[1]}')", - "class": "Model Evaluation", - "desc": "This code snippet prints the F1 score corresponding to the second regularization strength value from the list, which is considered the best baseline F1 score from the previously evaluated models.", + "cell_id": 15, + "code": "# count plot \"Histogram\" of Frequencies of Subjects for true news\nplt.figure(figsize=(10,6))\nplt.title(\"Frequencies of tweets for Disaster\")\nsns.countplot(x = 'target', data = train_df)\nplt.xlabel('Disaster Type')", + "class": "Visualization", + "desc": "This code snippet creates a histogram using seaborn's countplot() to visualize the frequency of tweets categorized as disasters in the train_df DataFrame and labels the x-axis as 'Disaster Type'.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.5057336 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9293306 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 35, - "code": "classifier = LogisticRegression(C=0.1)\nselector = SelectKBest(chi2, k=500)\nx = selector.fit_transform(x, y)\ncv_scores = cross_validate(classifier, x, y, scoring='f1', cv=10, n_jobs=-1, return_train_score=True)\nmean_f1 = cv_scores['test_score'].mean()\nprint(f'Mean F1: {mean_f1}')", - "class": "Model Evaluation", - "desc": "This code snippet selects the top 500 features using the chi-squared test, trains a Logistic Regression classifier with a specified regularization strength, evaluates it using 10-fold cross-validation, and prints the mean F1 score of the model.", + "cell_id": 19, + "code": "wordcloud_true = WordCloud().generate(Real_Disaster_text)\nplt.figure(figsize=(10,10))\nplt.imshow(wordcloud_true)\nplt.axis('off')\nplt.title(\"Word Cloud of Real Disaster news\")\nplt.tight_layout(pad=0)\nplt.show()", + "class": "Visualization", + "desc": "This code snippet generates and displays a word cloud from the concatenated text of real disaster tweets using the WordCloud class, and visualizes it with matplotlib by customizing title, size, axis, and layout.", "testing": { - "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.98264277 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.96466434 }, - 
"cluster": 0 + "cluster": -1 }, { - "cell_id": 38, - "code": "c = RandomForestClassifier(n_estimators=100, n_jobs=-1)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", - "class": "Model Evaluation", - "desc": "This code snippet trains a Random Forest classifier using 8-fold cross-validation, evaluates it, and then visualizes the mean F1 scores and their standard deviations for the training and validation datasets to assess model performance.", + "cell_id": 21, + "code": "wordcloud_true = WordCloud().generate(Not_Real_Disaster_text)\nplt.figure(figsize=(10,10))\nplt.imshow(wordcloud_true)\nplt.axis('off')\nplt.title(\"Word Cloud of Not RealDisaster twittes\")\nplt.tight_layout(pad=0)\nplt.show()\n", + "class": "Visualization", + "desc": "This code snippet generates and displays a word cloud from the concatenated text of non-disaster tweets using the WordCloud class, and visualizes it with matplotlib by customizing title, size, axis, and layout.", "testing": { "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.7403155 + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9845956 }, - "cluster": 0 - }, { - "cell_id": 39, - "code": "c = RandomForestClassifier(n_estimators=100, min_samples_leaf=3)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", - "class": "Model Evaluation", - "desc": "This code snippet trains a Random Forest classifier with 100 estimators and a minimum of 3 samples per leaf using 8-fold cross-validation, evaluates it, and then visualizes the mean F1 scores and their standard deviations for the training and validation datasets.", + "cluster": -1 + }], + "notebook_id": 13, + "notebook_name": "nlp-with-disaster-tweets.ipynb" + }, { + "cells": [{ + "cell_id": 26, + "code": "# SAVE SUBMISSION FILE\n\nsubmission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\nsubmission.target = flat_predictions\nsubmission.to_csv('submission.csv', index=False)", + "class": "Data Export", + "desc": "This code loads a sample submission CSV file, updates its target column with the generated predictions, and saves the results to a new CSV file named 'submission.csv'.", "testing": { - "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.8847018 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.99789846 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 40, - "code": "c = RandomForestClassifier(n_estimators=500, min_samples_split=10)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", - "class": "Model Evaluation", - "desc": "This code snippet trains a Random Forest classifier with 500 estimators and a minimum of 10 samples required to split an internal node using 8-fold cross-validation, evaluates it, and then visualizes the mean F1 scores and their standard deviations for the training and validation datasets.", + "cell_id": 0, + "code": "# LOADING THE TRAIN DATA\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename)) \ndata = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\ndata.sample(10)", + "class": "Data Extraction", + "desc": "This code imports essential libraries such as NumPy and Pandas, explores the directory for input files, and loads the train dataset from a CSV file into a Pandas DataFrame, displaying a random sample of 10 rows.", "testing": { - "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.88807696 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9061924 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 41, - "code": "c = RandomForestClassifier(n_estimators=200, min_samples_split=5, max_depth=50)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", - "class": "Model Evaluation", - "desc": "This code snippet trains a Random Forest classifier with 200 estimators, a minimum of 5 samples required to split an internal node, and a maximum depth of 50 using 8-fold cross-validation, evaluates it, and then visualizes the mean F1 scores and their standard deviations for the training and validation datasets.", + "cell_id": 7, + "code": "# GET THE LISTS OF TWEETS AND THEIR LABELS\n\nsentences = data.text.values\nlabels =data.target.values", + "class": "Data Extraction", + "desc": "This code extracts the tweet texts and their corresponding labels from the DataFrame into separate NumPy arrays.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.41321388 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.9971167 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 42, - "code": "c = XGBClassifier()\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", - "class": "Model Evaluation", - "desc": "This code snippet trains an XGBoost classifier using 8-fold cross-validation, evaluates it, and then visualizes the mean F1 scores and their standard deviations for the training and validation datasets.", + "cell_id": 23, + "code": "# PREPARE TEST DATA\n\n# Load the dataset into a pandas dataframe.\ntest_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')\n\n# Report the number of sentences.\nprint('Number of test sentences: {:,}\\n'.format(test_data.shape[0]))\n\n# Create sentence and label lists\nsentences = test_data.text.values\n#labels = test_data.target.values\n\n# Tokenize all of the sentences and map the tokens to thier word IDs.\ninput_ids = []\nattention_masks = []\n\n# For every sentence...\nfor sent in sentences:\n # `encode_plus` will:\n # (1) Tokenize the sentence.\n # (2) Prepend the `[CLS]` token to the start.\n # (3) Append the `[SEP]` token to the end.\n # (4) Map tokens to their IDs.\n # (5) Pad or truncate the sentence to `max_length`\n # (6) Create attention masks for [PAD] tokens.\n encoded_dict = tokenizer.encode_plus(\n sent, # Sentence to encode.\n add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n max_length = 64, # Pad & truncate all sentences.\n pad_to_max_length = True,\n 
return_attention_mask = True, # Construct attn. masks.\n return_tensors = 'pt', # Return pytorch tensors.\n )\n \n # Add the encoded sentence to the list. \n input_ids.append(encoded_dict['input_ids'])\n \n # And its attention mask (simply differentiates padding from non-padding).\n attention_masks.append(encoded_dict['attention_mask'])\n\n# Convert the lists into tensors.\ninput_ids = torch.cat(input_ids, dim=0)\nattention_masks = torch.cat(attention_masks, dim=0)\n#labels = torch.tensor(labels)\n\n# Set the batch size. \nbatch_size = 32 \n\n# Create the DataLoader.\nprediction_data = TensorDataset(input_ids, attention_masks, ) #labels\nprediction_sampler = SequentialSampler(prediction_data)\nprediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)", + "class": "Data Extraction", + "desc": "This code prepares the test data by loading it into a pandas DataFrame, tokenizing the text data, creating attention masks, converting them into PyTorch tensors, and setting up a DataLoader for the test samples.", "testing": { - "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.8236538 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99884653 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 61, - "code": "# trainer.predict returns a list with batch results\nlogits = np.concatenate(trainer.predict(model, test_loader))", - "class": "Model Evaluation", - "desc": "This code snippet uses the trained `BagOfEmbeddingsClassifier` model to make predictions on the test data via the trainer's `predict` method, concatenating the resulting logits from all batches into a single array.", + "cell_id": 3, + "code": "# DROP DUPLICATE SAMPLES WITH CONFLICTING LABELS\n\nconflicting = conflicting_check.loc[(conflicting_check.target != 1) & (conflicting_check.target != 0)].index\ndata = data.drop(data[text.isin(conflicting)].index)\nprint ('Conflicting samples count:', conflicting.shape[0])", + "class": "Data Transform", + "desc": "This code snippet identifies and drops text samples with conflicting labels from the dataset and prints the count of such conflicting samples removed.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99380153 + "class": "Data_Transform", + "subclass": "filter", + "subclass_id": 14, + "predicted_subclass_probability": 0.43520904 }, - "cluster": 0 + "cluster": 6 }, { - "cell_id": 24, - "code": "k = 50\nclassifier = LogisticRegression(max_iter=500)\nclassifier.fit(x, y)\nplot_top_values(classifier.coef_[0], k, vocab, 'Type', 'Weight', use_abs=True)", - "class": "Model Training", - "desc": "This code snippet trains a Logistic Regression classifier on the entire training dataset and uses the previously defined plotting function to visualize the top 50 features based on their absolute weights.", + "cell_id": 11, + "code": "# TOKENIZE ALL THE SENTENCES AND MAP THE TOKENS TO THEIR WORD IDs\n\ninput_ids = []\nattention_masks = []\n\n# For every sentence...\nfor sent in sentences:\n # `encode_plus` will:\n # (1) Tokenize the sentence.\n # (2) Prepend the `[CLS]` token to the start.\n # (3) Append the `[SEP]` token to the end.\n # (4) Map tokens to their IDs.\n # (5) Pad or truncate the sentence to `max_length`\n # (6) Create attention masks for [PAD] tokens.\n encoded_dict = tokenizer.encode_plus(\n sent, # Sentence to encode.\n 
add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n max_length = 64, # Pad & truncate all sentences.\n pad_to_max_length = True,\n return_attention_mask = True, # Construct attn. masks.\n return_tensors = 'pt', # Return pytorch tensors.\n )\n \n # Add the encoded sentence to the list. \n input_ids.append(encoded_dict['input_ids'])\n \n # And its attention mask (simply differentiates padding from non-padding).\n attention_masks.append(encoded_dict['attention_mask'])\n\n# Convert the lists into tensors.\ninput_ids = torch.cat(input_ids, dim=0)\nattention_masks = torch.cat(attention_masks, dim=0)\nlabels = torch.tensor(labels)\n\n# Print sentence 0, now as a list of IDs.\nprint('Original: ', sentences[0])\nprint('Token IDs:', input_ids[0])", + "class": "Data Transform", + "desc": "This code tokenizes all sentences, adds special tokens, maps tokens to their word IDs, pads or truncates sentences to a maximum length, creates attention masks, and converts the results into PyTorch tensors.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.7323191 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.99864405 }, "cluster": 0 }, { - "cell_id": 28, - "code": "selector = SelectKBest(chi2, k=250)\nx_selected = selector.fit_transform(x, y)\nvocab = [vocab[i] for i, selected in enumerate(selector.get_support()) if selected]\nclassifier.fit(x_selected, y)\nplot_top_values(classifier.coef_[0], k, vocab, 'Type', 'Weight', use_abs=True)", - "class": "Model Training", - "desc": "This code snippet selects the top 250 features using the chi-squared test, updates the vocabulary accordingly, trains a Logistic Regression classifier on the selected features, and visualizes the top 50 features based on their absolute weights.", + "cell_id": 12, + "code": "# SPLIT TRAIN DATA INTO TRAIN AND TEST SET\n# I used small test set (SPLIT=0,999) in order to train the model on the majority of the data, after all parameters were tuned\n# Use 0,9 or lower to train the model and look at the perfomance/ tune parameters\n\nSPLIT = 0.999\n\nfrom torch.utils.data import TensorDataset, random_split\n\n# Combine the training inputs into a TensorDataset.\ndataset = TensorDataset(input_ids, attention_masks, labels)\n\n# Create a 90-10 train-validation split.\n\n# Calculate the number of samples to include in each set.\ntrain_size = int(SPLIT * len(dataset))\nval_size = len(dataset) - train_size\n\n# Divide the dataset by randomly selecting samples.\ntrain_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n\nprint('{:>5,} training samples'.format(train_size))\nprint('{:>5,} validation samples'.format(val_size))", + "class": "Data Transform", + "desc": "This code splits the dataset into a training set and a validation set with a specified split ratio using PyTorch's `random_split` method, and prints the number of samples in each subset.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.58552986 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9402513 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 52, - "code": "class BagOfEmbeddingsClassifier(pl.LightningModule):\n def __init__(self, embeddings, learning_rate=0.001, l2=0.001):\n super().__init__()\n self.learning_rate = learning_rate\n self.l2 = l2\n \n vocab_size, 
embedding_dim = embeddings.shape\n self.embedding_bag = nn.EmbeddingBag.from_pretrained(embeddings, freeze=False)\n \n # a single output value determines the probability of class 1 with a sigmoid function\n self.linear = nn.Linear(embedding_dim, 1, bias=True)\n \n def forward(self, x):\n \"\"\"x is a list of tensors with any shape\"\"\"\n # embedding bag operates with a single tensor of concatenated inputs and another of offsets\n lengths = torch.tensor([0] + [len(sample) for sample in x[:-1]])\n offsets = lengths.cumsum(0).to(x[0].device)\n x = torch.cat(x)\n embedded = self.embedding_bag(x, offsets)\n logits = self.linear(embedded).squeeze(-1)\n return logits\n \n def _get_loss_and_acc(self, logits, y):\n \"\"\"Internal function\"\"\"\n predicted = logits > 0\n acc = (predicted == y).float().mean()\n loss = F.binary_cross_entropy_with_logits(logits, y.float())\n \n return loss, acc\n \n def on_fit_start(self): \n self.train_losses = []\n self.train_accs = []\n self.valid_losses = []\n self.valid_accs = []\n \n self.reset_metrics()\n \n def reset_metrics(self):\n self.partial_train_losses = []\n self.partial_train_accs = []\n self.partial_valid_losses = []\n self.partial_valid_accs = []\n \n def on_validation_end(self):\n self.train_losses.append(np.array(self.partial_train_losses).mean())\n self.train_accs.append(np.array(self.partial_train_accs).mean())\n self.valid_losses.append(np.array(self.partial_valid_losses).mean())\n self.valid_accs.append(np.array(self.partial_valid_accs).mean())\n self.reset_metrics()\n \n def training_step(self, batch, batch_idx):\n \"\"\"\n batch is a tuple (x, y)\n x is a list of tensors as in forward\n y is a tensor with the classes\n \"\"\"\n x, y = batch\n logits = self(x)\n loss, acc = self._get_loss_and_acc(logits, y)\n \n # ideally we'd use tensorboard to see the graphs, but currently it is disabled in Kaggle\n # so we resort to manually plotting\n# self.log('train_loss', loss)\n# self.log('train_acc', acc)\n self.partial_train_losses.append(loss.detach().cpu().numpy())\n self.partial_train_accs.append(acc.detach().cpu().numpy())\n \n return loss\n \n def validation_step(self, batch, batch_idx):\n x, y = batch\n logits = self(x)\n loss, acc = self._get_loss_and_acc(logits, y)\n \n# self.log('valid_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)\n# self.log('valid_acc', acc)\n self.partial_valid_losses.append(loss.detach().cpu().numpy())\n self.partial_valid_accs.append(acc.detach().cpu().numpy())\n \n return loss\n \n def configure_optimizers(self):\n optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.l2)\n return optimizer", - "class": "Model Training", - "desc": "This code snippet defines a PyTorch Lightning module class `BagOfEmbeddingsClassifier`, which initializes an embedding bag layer with pretrained embeddings and a linear layer for binary classification, and includes methods for the forward pass, calculating loss and accuracy, training, validation steps, and configuring the optimizer.", + "cell_id": 13, + "code": "# CREATE DATA ITERATOR TO SAVE MEMORY\n\nfrom torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n\n# The DataLoader needs to know our batch size for training, so we specify it \n# here. For fine-tuning BERT on a specific task, the authors recommend a batch \n# size of 16 or 32.\nbatch_size = 32\n\n# Create the DataLoaders for our training and validation sets.\n# We'll take training samples in random order. 
\ntrain_dataloader = DataLoader(\n train_dataset, # The training samples.\n sampler = RandomSampler(train_dataset), # Select batches randomly\n batch_size = batch_size # Trains with this batch size.\n )\n\n# For validation the order doesn't matter, so we'll just read them sequentially.\nvalidation_dataloader = DataLoader(\n val_dataset, # The validation samples.\n sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.\n batch_size = batch_size # Evaluate with this batch size.\n )", + "class": "Data Transform", + "desc": "This code sets up data loaders using PyTorch's `DataLoader` with specified batch size, random sampling for the training set, and sequential sampling for the validation set to efficiently handle data during model training and evaluation.", "testing": { "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.94578296 + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.98473305 }, "cluster": 0 }, { - "cell_id": 56, - "code": "model = BagOfEmbeddingsClassifier(embeddings, 0.001, l2=0)\nbatch = next(iter(train_loader))\n\n# batch is x, y\nlogits = model(batch[0])\nprint(logits)", - "class": "Model Training", - "desc": "This code snippet initializes the `BagOfEmbeddingsClassifier` with the pre-trained embeddings and specified hyperparameters, retrieves the first batch of training data, and tests the model's forward pass by printing the resulting logits.", + "cell_id": 25, + "code": "# PREPARE PREDICTIONS FOR SUBMISSION\n\n# Combine the results across all batches. \nflat_predictions = np.concatenate(predictions, axis=0)\n\n# For each sample, pick the label (0 or 1) with the higher score.\nflat_predictions = np.argmax(flat_predictions, axis=1).flatten()", + "class": "Data Transform", + "desc": "This code combines the prediction results from all batches, selects the labels with the highest scores for each sample, and flattens the array to prepare the predictions for submission.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9270925 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.5844843 }, - "cluster": 0 + "cluster": 4 }, { - "cell_id": 57, - "code": "trainer = pl.Trainer(gpus=1, max_epochs=5, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", - "class": "Model Training", - "desc": "This code snippet initializes a PyTorch Lightning Trainer to use one GPU, run for a maximum of 5 epochs, and validate at specified intervals, then trains the `BagOfEmbeddingsClassifier` model using the training and validation DataLoaders.", + "cell_id": 1, + "code": "print ('Train data shape:', data.shape)", + "class": "Exploratory Data Analysis", + "desc": "This code prints the shape of the train dataset. 
", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.999678 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9942768 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 59, - "code": "model = BagOfEmbeddingsClassifier(embeddings, 0.001, l2=0.0001)\ntrainer = pl.Trainer(gpus=1, max_epochs=6, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", - "class": "Model Training", - "desc": "This code snippet reinitializes the `BagOfEmbeddingsClassifier` with the same embeddings but adds L2 regularization, and then trains the model using the same data configuration for 6 epochs with validation checks at specified intervals.", + "cell_id": 2, + "code": "# CHECK FOR DUPLICATE SAMPLES WITH CONFLICTING LABELS\n\ntext = data.text\nduplicates = data[text.isin(text[text.duplicated()])].sort_values(by='text')\n\n# If the mean target value is different from 0 or 1 - we have duplicate samples with conflicting value\nconflicting_check = pd.DataFrame(duplicates.groupby(['text']).target.mean())\nconflicting_check.sample(10)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet checks for duplicate text samples in the dataset that may have conflicting label values by calculating the mean target value for each duplicated text sample and displaying a random sample of 10 such text samples.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9926635 + "class": "Exploratory_Data_Analysis", + "subclass": "count_duplicates", + "subclass_id": 38, + "predicted_subclass_probability": 0.8007328 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 66, - "code": "class TransformerWrapper(pl.LightningModule):\n def __init__(self, transformer, learning_rate=0.001, l2=0.0001):\n super().__init__()\n self.model = transformer\n self.learning_rate = learning_rate\n self.l2 = l2\n \n def forward(self, batch):\n x, mask = batch\n output = self.model(x, mask)\n return output.logits\n \n def training_step(self, batch, batch_idx):\n loss, acc = self._get_loss_and_acc(batch)\n self.partial_train_losses.append(loss.detach().cpu().numpy())\n self.partial_train_accs.append(acc.detach().cpu().numpy())\n \n return loss\n \n def _get_loss_and_acc(self, batch):\n x, mask, y = batch\n output = self.model(x, mask, labels=y)\n loss = output.loss\n logits = output.logits\n \n predicted = logits.argmax(1)\n acc = (predicted == y).float().mean()\n \n return loss, acc\n \n # these functions are copied from the BagOfWords class to allow ploting without tensorboard\n # ideally, we'd inherit from a common base class. 
well, ideally we'd have access to tensorboard and none of this would exist :)\n def on_fit_start(self): \n self.train_losses = []\n self.train_accs = []\n self.valid_losses = []\n self.valid_accs = []\n \n self.reset_metrics()\n \n def reset_metrics(self):\n self.partial_train_losses = []\n self.partial_train_accs = []\n self.partial_valid_losses = []\n self.partial_valid_accs = []\n \n def on_validation_end(self):\n self.train_losses.append(np.array(self.partial_train_losses).mean())\n self.train_accs.append(np.array(self.partial_train_accs).mean())\n self.valid_losses.append(np.array(self.partial_valid_losses).mean())\n self.valid_accs.append(np.array(self.partial_valid_accs).mean())\n self.reset_metrics()\n \n def validation_step(self, batch, batch_idx):\n loss, acc = self._get_loss_and_acc(batch)\n \n# self.log('valid_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)\n# self.log('valid_acc', acc)\n self.partial_valid_losses.append(loss.cpu().numpy())\n self.partial_valid_accs.append(acc.cpu().numpy())\n \n return loss\n \n def configure_optimizers(self):\n # to make it lighter, fine tune only the classifier on top of the language model\n parameters = [p[1] for p in self.model.named_parameters() if p[0].startswith('classifier')]\n optimizer = torch.optim.AdamW(parameters, lr=self.learning_rate, weight_decay=self.l2)\n return optimizer", - "class": "Model Training", - "desc": "This code snippet defines a PyTorch Lightning module class `TransformerWrapper` for fine-tuning a pretrained transformer model (such as RoBERTa) on a classification task, including methods for forward pass, training and validation steps, calculating loss and accuracy, tracking and resetting metrics, and configuring the optimizer to fine-tune only the classifier layer of the model.", + "cell_id": 9, + "code": "# LOOK HOW THE TOKENIZER WORK\n\n# Print the original sentence.\nprint(' Original: ', sentences[0])\n\n# Print the sentence split into tokens.\nprint('Tokenized: ', tokenizer.tokenize(sentences[0]))\n\n# Print the sentence mapped to token ids.\nprint('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))", + "class": "Exploratory Data Analysis", + "desc": "This code demonstrates how the BERT tokenizer works by printing an original sentence, its tokenized version, and the corresponding token IDs for the first tweet in the dataset.", "testing": { "class": "Data_Transform", "subclass": "categorify", "subclass_id": 20, - "predicted_subclass_probability": 0.48894703 + "predicted_subclass_probability": 0.74620485 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 67, - "code": "model = TransformerWrapper(roberta, 0.001, l2=0)\ntrainer = pl.Trainer(gpus=1, max_epochs=6, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", - "class": "Model Training", - "desc": "This code snippet initializes the `TransformerWrapper` with the pretrained RoBERTa model and specified hyperparameters, then trains it using the training and validation DataLoaders with a PyTorch Lightning Trainer configured to use one GPU and run for a maximum of 6 epochs, performing validation checks at specified intervals.", + "cell_id": 10, + "code": "# GET MAX LENGTH OF THE TWEETS\n\nmax_len = 0\n# For every sentence...\nfor sent in sentences:\n # Tokenize the text and add `[CLS]` and `[SEP]` tokens.\n input_ids = tokenizer.encode(sent, add_special_tokens=True)\n # Update the maximum sentence length.\n max_len = max(max_len, len(input_ids))\n\nprint('Max tweet length: ', max_len)", + 
"class": "Exploratory Data Analysis", + "desc": "This code calculates and prints the maximum length of the tokenized tweets in the dataset by tokenizing each tweet and comparing its length to update the maximum length observed.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9910779 - }, - "cluster": 0 - }, { - "cell_id": 69, - "code": "roberta = RobertaForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)\nmodel = TransformerWrapper(roberta, 0.01, l2=0)\ntrainer = pl.Trainer(gpus=1, max_epochs=4, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", - "class": "Model Training", - "desc": "This code snippet reinitializes the `RobertaForSequenceClassification` model and wraps it in a `TransformerWrapper` with specified hyperparameters, then trains the model using the training and validation DataLoaders with a PyTorch Lightning Trainer configured to use one GPU and run for a maximum of 4 epochs, performing validation checks at specified intervals.", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.98994815 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.858368 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 71, - "code": "roberta = RobertaForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)\n\ntrain_loader = DataLoader(train_data, batch_size=128)\n\nmodel = TransformerWrapper(roberta, 0.005, l2=0)\ntrainer = pl.Trainer(gpus=1, max_epochs=4, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", - "class": "Model Training", - "desc": "This code snippet reinitializes the `RobertaForSequenceClassification` model, updates the training DataLoader with a larger batch size, wraps the updated model in a `TransformerWrapper` with new hyperparameters, and trains it using the updated training and validation DataLoaders with a PyTorch Lightning Trainer configured to use one GPU and run for a maximum of 4 epochs, performing validation checks at specified intervals.", + "cell_id": 15, + "code": "# PRINT NAMES AND DIMENSIONS FOR THE MODEL LAYERS\n\n# Get all of the model's parameters as a list of tuples.\nparams = list(model.named_parameters())\n\nprint('The BERT model has {:} different named parameters.\\n'.format(len(params)))\n\nprint('==== Embedding Layer ====\\n')\n\nfor p in params[0:5]:\n print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n\nprint('\\n==== First Transformer ====\\n')\n\nfor p in params[5:21]:\n print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))\n\nprint('\\n==== Output Layer ====\\n')\n\nfor p in params[-4:]:\n print(\"{:<55} {:>12}\".format(p[0], str(tuple(p[1].size()))))", + "class": "Exploratory Data Analysis", + "desc": "This code prints the names and dimensions of the parameters for the embedding layer, the first transformer layer, and the output layer of the BERT model.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.898724 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.24847664 }, - "cluster": 0 + "cluster": 3 }, { "cell_id": 4, - "code": "target_counts = train.target.value_counts()\nsns.barplot(y=target_counts, 
x=target_counts.index)\nplt.ylabel('Samples')\nplt.title('Target')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet generates a bar plot to visualize the distribution of the target variable in the training dataset.", + "code": "# CONNECT KAGGLE GPU FOR SPEED UP\n\nimport tensorflow as tf\n# Get the GPU device name.\ndevice_name = tf.test.gpu_device_name()\nif device_name == '/device:GPU:0':\n print('Found GPU at: {}'.format(device_name))\nelse:\n raise SystemError('GPU device not found')", + "class": "Imports and Environment", + "desc": "This code imports TensorFlow, verifies the availability of a GPU for training, and prints the GPU device name if found, raising an error if not.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99522096 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9054986 }, "cluster": 0 }, { "cell_id": 5, - "code": "has_kw = ~train.keyword.isna()\nfig, ax = plt.subplots(1, 2, sharey=True)\ntrain[has_kw]\nsns.countplot(data=train[has_kw], x='target', ax=ax[0])\nax[0].set_title('With keyword')\nsns.countplot(data=train[~has_kw], x='target', ax=ax[1])\nax[1].set_title('Without keyword')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet creates count plots to compare the distribution of the target variable between samples that have keywords and those that do not.", + "code": "# SPECIFY THE GPU AS THE TORCH DEVICE\n\nimport torch\nif torch.cuda.is_available(): \n # Tell PyTorch to use the GPU. \n device = torch.device(\"cuda\")\n print('There are %d GPU(s) available.' % torch.cuda.device_count())\n print('We will use the GPU:', torch.cuda.get_device_name(0))\nelse:\n print('No GPU available, using the CPU instead.')\n device = torch.device(\"cpu\")", + "class": "Imports and Environment", + "desc": "This code imports the PyTorch library, checks for GPU availability, and sets the device to GPU if available, otherwise defaults to using the CPU, while printing the number of GPUs and the GPU name if applicable.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.98656905 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.9812268 }, "cluster": 0 }, { "cell_id": 6, - "code": "has_loc = ~train.location.isna()\nsns.countplot(x=has_loc)\nplt.xlabel('Has location')", - "class": "Visualization", - "desc": "This code snippet generates a count plot to visualize the number of samples in the training dataset that have location information versus those that do not.", + "code": "# INSTALL THE TRANSFORMERS PACKAGE TO GET A PYTORCH INTERFACE FOR BERT\n!pip install transformers", + "class": "Imports and Environment", + "desc": "This command installs the 'transformers' package, which provides a PyTorch interface for BERT and other pre-trained models.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99126256 + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.989985 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 7, - "code": "loc_count = train.location.value_counts()\ntop_loc = loc_count.iloc[:50]\nplt.subplots(figsize=(20, 
8))\nplt.xticks(rotation=80)\nsns.barplot(x=top_loc.index, y=top_loc)", - "class": "Visualization", - "desc": "This code snippet creates a bar plot to visualize the top 50 most common locations in the training dataset.", + "cell_id": 8, + "code": "# LOAD THE BERT TOKENIZER\n\nfrom transformers import BertTokenizer\ntokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)", + "class": "Imports and Environment", + "desc": "This code imports the `BertTokenizer` from the `transformers` package and loads the pre-trained 'bert-base-uncased' tokenizer with the option to convert all text to lower case.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99783665 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9928005 }, "cluster": 0 }, { - "cell_id": 16, - "code": "train['num_tokens'] = train.tokens.apply(len)\nplt.hist(train.num_tokens, bins=20)\nplt.show()", - "class": "Visualization", - "desc": "This code snippet adds a new column to the training dataset representing the number of tokens in each text and then creates a histogram to visualize the distribution of token counts.", + "cell_id": 18, + "code": "# HELPER FUNCTION TO CALCULATE ACCURACY\n\nimport numpy as np\n\n# Function to calculate the accuracy of our predictions vs labels\ndef flat_accuracy(preds, labels):\n pred_flat = np.argmax(preds, axis=1).flatten()\n labels_flat = labels.flatten()\n return np.sum(pred_flat == labels_flat) / len(labels_flat)", + "class": "Model Evaluation", + "desc": "This code defines a helper function `flat_accuracy` that calculates the accuracy of predictions by comparing the flattened predicted and true labels using NumPy operations.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99763453 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.984026 }, "cluster": 0 }, { - "cell_id": 17, - "code": "inds40 = train.num_tokens <= 40\nfig, ax = plt.subplots(figsize=(16, 8))\nplt.hist(train[inds40 & train.target].num_tokens, bins=20, alpha=0.5, label='Positive', density=True)\nplt.hist(train[inds40 & ~train.target].num_tokens, bins=20, alpha=0.5, label='Negative', density=True)\nplt.legend()\nplt.title('Tweet length distribution')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet filters the training dataset for texts with 40 or fewer tokens, and then creates a histogram to visualize the distribution of tweet lengths, separated by the target class (Positive and Negative).", + "cell_id": 19, + "code": "# HELPER FUNCTION FOR TIME FORMAT\n\nimport time\nimport datetime\n\ndef format_time(elapsed):\n '''\n Takes a time in seconds and returns a string hh:mm:ss\n '''\n # Round to the nearest second.\n elapsed_rounded = int(round((elapsed)))\n \n # Format as hh:mm:ss\n return str(datetime.timedelta(seconds=elapsed_rounded))", + "class": "Model Evaluation", + "desc": "This code defines a helper function `format_time` that converts elapsed time in seconds to a string in the format hh:mm:ss using the `datetime.timedelta` method.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9983352 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + 
"predicted_subclass_probability": 0.5951723 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 19, - "code": "def plot_top_values(data, k, names, xlabel=None, ylabel=None, use_abs=False):\n \"\"\"\n Function to plot a barplot with counts of the top k items in data and their corresponding names.\n \n Args:\n data: a numpy array\n k: int\n names: list of strings corresponding to the positions in data\n use_abs: if True, take the highest absolute values\n \"\"\"\n if use_abs:\n inds = np.abs(data).argsort()\n else:\n inds = data.argsort()\n \n # inverted argsort and top k\n top_inds = inds[::-1][:k]\n top_values = data[top_inds]\n top_names = [names[i] for i in top_inds]\n \n fig, ax = plt.subplots(figsize=(16, 8))\n plt.bar(np.arange(k), top_values)\n if ylabel:\n ax.set_ylabel(ylabel)\n if xlabel:\n ax.set_xlabel(xlabel)\n ax.set_xticks(np.arange(k))\n ax.set_xticklabels(top_names, rotation=80)\n fig.tight_layout()", - "class": "Visualization", - "desc": "This code snippet defines a function to plot a bar plot showing the counts of the top `k` items in the provided data array along with their corresponding names, with optional parameters for axis labels and the option to use absolute values for sorting.", + "cell_id": 21, + "code": "# THE SUMMARY OF THE TRAIN PROCESS\n\n# Display floats with two decimal places\npd.set_option('precision', 2)\n\n# Create a DataFrame from our training statistics\ndf_stats = pd.DataFrame(data=training_stats)\n\n# Use the 'epoch' as the row index\ndf_stats = df_stats.set_index('epoch')\n\n# A hack to force the column headers to wrap\n#df = df.style.set_table_styles([dict(selector=\"th\",props=[('max-width', '70px')])])\n\n# Display the table\ndf_stats", + "class": "Model Evaluation", + "desc": "This code creates a Pandas DataFrame to summarize and display the recorded training statistics, such as training loss, validation loss, validation accuracy, and timings, using the epoch number as the row index.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9957224 + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.46057627 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 20, - "code": "k = 50\n\nvocab = vectorizer.get_feature_names()\nword_count = train_bow.toarray().sum(0)\n\nplot_top_values(word_count, k, vocab, 'Count', 'Type')", - "class": "Visualization", - "desc": "This code snippet retrieves the feature names (vocabulary) and word counts from the Bag-of-Words representation of the training data, then uses the previously defined function to plot the top 50 most frequent words.", + "cell_id": 24, + "code": "# GET PREDICTIONS\n\nprint('Predicting labels for {:,} test sentences...'.format(len(input_ids)))\n\n# Put model in evaluation mode\nmodel.eval()\n\n# Tracking variables \npredictions = []\n#true_labels = []\n\n# Predict \nfor batch in prediction_dataloader:\n # Add batch to GPU\n batch = tuple(t.to(device) for t in batch)\n \n # Unpack the inputs from our dataloader\n b_input_ids, b_input_mask = batch #b_labels\n \n # Telling the model not to compute or store gradients, saving memory and \n # speeding up prediction\n with torch.no_grad():\n # Forward pass, calculate logit predictions\n outputs = model(b_input_ids, token_type_ids=None, \n attention_mask=b_input_mask)\n\n logits = outputs[0]\n\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n label_ids = b_labels.to('cpu').numpy()\n 
\n # Store predictions and true labels\n predictions.append(logits)\n #true_labels.append(label_ids)\n\nprint(' DONE.')", + "class": "Model Evaluation", + "desc": "This code snippet sets the model to evaluation mode, iterates through the test data to generate predictions using the trained BERT model, and stores the prediction logits while ensuring no gradient calculations are performed.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.97462577 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.75300366 }, "cluster": 0 }, { - "cell_id": 27, - "code": "ticks = np.arange(len(f1))\nplt.plot(ticks, f1)\nplt.xticks(ticks, [str(k) for k in num_features])\nplt.title('F1 per number of features (chi2 selector)')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet plots the mean F1 scores against different numbers of selected features, using a line plot, to visualize how feature selection impacts model performance.", + "cell_id": 14, + "code": "# GET BERT MODEL FOR CLASSIFICATION\n\nfrom transformers import BertForSequenceClassification, AdamW, BertConfig\n\n# Load BertForSequenceClassification, the pretrained BERT model with a single \n# linear classification layer on top. \nmodel = BertForSequenceClassification.from_pretrained(\n \"bert-base-uncased\", # Use the 12-layer BERT model, with an uncased vocab.\n num_labels = 2, # The number of output labels--2 for binary classification.\n # You can increase this for multi-class tasks. \n output_attentions = False, # Whether the model returns attentions weights.\n output_hidden_states = False, # Whether the model returns all hidden-states.\n)\n\n# Tell pytorch to run this model on the GPU.\nmodel.cuda()", + "class": "Model Training", + "desc": "This code loads the pre-trained \"bert-base-uncased\" BERT model configured for sequence classification with two output labels and prepares the model to run on the GPU.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.91531163 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.567976 }, "cluster": 0 }, { - "cell_id": 29, - "code": "rows = get_rows_containing(train, 'ebay')\nsns.countplot(x='target', data=rows)\nplt.title('Target distribution containing \"ebay\"')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet retrieves rows from the training dataset that contain the term 'ebay' and creates a count plot to visualize the target distribution for these rows.", + "cell_id": 16, + "code": "# SET UP THE OPTIMIZER\n\n# Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n# I believe the 'W' stands for 'Weight Decay fix\"\noptimizer = AdamW(model.parameters(),\n lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5\n eps = 1e-8 # args.adam_epsilon - default is 1e-8.\n )", + "class": "Model Training", + "desc": "This code sets up the AdamW optimizer from the Hugging Face library for the BERT model with specified learning rate and epsilon values.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9911773 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9948212 }, - "cluster": 0 + "cluster": 
3 }, { - "cell_id": 31, - "code": "n = np.arange(len(regularization)) + 1\nfig, ax = plt.subplots(figsize=(14, 6))\nwidth = 0.4\n\nax.bar(n, l1_scores, width, label='L1 reg', yerr=l1_std)\nax.bar(n + width, l2_scores, width, label='L2 reg', yerr=l2_std)\nax.set_xlabel('Regularization (lower is stronger)')\nax.set_ylabel('Mean F1')\nax.set_xticks(n + width / 2)\nax.set_xticklabels([str(val) for val in regularization])\nax.legend(loc='best')\n", - "class": "Visualization", - "desc": "This code snippet creates a bar plot to compare the mean F1 scores, including error bars representing standard deviations, for Logistic Regression (L2 regularization) and Ridge Classifier (L1 regularization) models evaluated at different regularization strengths.", + "cell_id": 17, + "code": "# SET UP MODEL HYPERPARAMETERS\n\nfrom transformers import get_linear_schedule_with_warmup\n\n# Number of training epochs. The BERT authors recommend between 2 and 4. \n# We chose to run for 4, but we'll see later that this may be over-fitting the\n# training data.\nepochs = 2\n\n# Total number of training steps is [number of batches] x [number of epochs]. \n# (Note that this is not the same as the number of training samples).\ntotal_steps = len(train_dataloader) * epochs\n\n# Create the learning rate scheduler.\nscheduler = get_linear_schedule_with_warmup(optimizer, \n num_warmup_steps = 0, # Default value in run_glue.py\n num_training_steps = total_steps)", + "class": "Model Training", + "desc": "This code sets up model hyperparameters including the number of training epochs and creates a linear learning rate scheduler with warmup steps for the optimizer, using the Hugging Face library's `get_linear_schedule_with_warmup`.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.97958606 + "class": "Model_Train", + "subclass": "init_hyperparams", + "subclass_id": 59, + "predicted_subclass_probability": 0.6898819 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 36, - "code": "def plot_model_score(train_scores, valid_scores):\n \"\"\"Plot train and validation score for comparison and checking overfitting\"\"\"\n mean_train = train_scores.mean()\n mean_valid = valid_scores.mean()\n fig, ax = plt.subplots()\n plt.bar(0, mean_train, yerr=train_scores.std())\n plt.bar(1, mean_valid, yerr=valid_scores.std())\n ax.text(0, mean_train + 0.01, f'{mean_train:.4f}')\n ax.text(1, mean_valid + 0.01, f'{mean_valid:.4f}')\n plt.title('Model F1 and standard deviation')\n plt.xticks([0, 1], ['Train', 'Validation'])\n ymin = np.min([mean_train, mean_valid]) * 0.8\n plt.ylim(bottom=ymin)\n plt.show()", - "class": "Visualization", - "desc": "This code snippet defines a function to plot and compare the mean and standard deviation of training and validation F1 scores, allowing for the assessment of model performance and potential overfitting.", + "cell_id": 20, + "code": "# TRAINING SCRIPT\n\nimport random\nimport numpy as np\n\n# This training code is based on the `run_glue.py` script here:\n# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128\n\n# Set the seed value all over the place to make this reproducible.\nseed_val = 42\n\nrandom.seed(seed_val)\nnp.random.seed(seed_val)\ntorch.manual_seed(seed_val)\ntorch.cuda.manual_seed_all(seed_val)\n\n# We'll store a number of quantities such as training and validation loss, \n# validation accuracy, and timings.\ntraining_stats = []\n\n# Measure the total 
training time for the whole run.\ntotal_t0 = time.time()\n\n# For each epoch...\nfor epoch_i in range(0, epochs):\n \n # ========================================\n # Training\n # ========================================\n \n # Perform one full pass over the training set.\n\n print(\"\")\n print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n print('Training...')\n\n # Measure how long the training epoch takes.\n t0 = time.time()\n\n # Reset the total loss for this epoch.\n total_train_loss = 0\n\n # Put the model into training mode. Don't be mislead--the call to \n # `train` just changes the *mode*, it doesn't *perform* the training.\n # `dropout` and `batchnorm` layers behave differently during training\n # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)\n model.train()\n\n # For each batch of training data...\n for step, batch in enumerate(train_dataloader):\n\n # Progress update every 40 batches.\n if step % 40 == 0 and not step == 0:\n # Calculate elapsed time in minutes.\n elapsed = format_time(time.time() - t0)\n \n # Report progress.\n print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))\n\n # Unpack this training batch from our dataloader. \n #\n # As we unpack the batch, we'll also copy each tensor to the GPU using the \n # `to` method.\n #\n # `batch` contains three pytorch tensors:\n # [0]: input ids \n # [1]: attention masks\n # [2]: labels \n b_input_ids = batch[0].to(device)\n b_input_mask = batch[1].to(device)\n b_labels = batch[2].to(device)\n\n # Always clear any previously calculated gradients before performing a\n # backward pass. PyTorch doesn't do this automatically because \n # accumulating the gradients is \"convenient while training RNNs\". \n # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)\n model.zero_grad() \n\n # Perform a forward pass (evaluate the model on this training batch).\n # The documentation for this `model` function is here: \n # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n # It returns different numbers of parameters depending on what arguments\n # arge given and what flags are set. For our useage here, it returns\n # the loss (because we provided labels) and the \"logits\"--the model\n # outputs prior to activation.\n outputs = model(b_input_ids, \n token_type_ids=None, \n attention_mask=b_input_mask, \n labels=b_labels)\n \n loss = outputs[0]\n logits = outputs[1]\n\n # Accumulate the training loss over all of the batches so that we can\n # calculate the average loss at the end. 
`loss` is a Tensor containing a\n # single value; the `.item()` function just returns the Python value \n # from the tensor.\n total_train_loss += loss.item()\n\n # Perform a backward pass to calculate the gradients.\n loss.backward()\n\n # Clip the norm of the gradients to 1.0.\n # This is to help prevent the \"exploding gradients\" problem.\n torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n\n # Update parameters and take a step using the computed gradient.\n # The optimizer dictates the \"update rule\"--how the parameters are\n # modified based on their gradients, the learning rate, etc.\n optimizer.step()\n\n # Update the learning rate.\n scheduler.step()\n\n # Calculate the average loss over all of the batches.\n avg_train_loss = total_train_loss / len(train_dataloader) \n \n # Measure how long this epoch took.\n training_time = format_time(time.time() - t0)\n\n print(\"\")\n print(\" Average training loss: {0:.2f}\".format(avg_train_loss))\n print(\" Training epcoh took: {:}\".format(training_time))\n \n # ========================================\n # Validation\n # ========================================\n # After the completion of each training epoch, measure our performance on\n # our validation set.\n\n print(\"\")\n print(\"Running Validation...\")\n\n t0 = time.time()\n\n # Put the model in evaluation mode--the dropout layers behave differently\n # during evaluation.\n model.eval()\n\n # Tracking variables \n total_eval_accuracy = 0\n total_eval_loss = 0\n nb_eval_steps = 0\n\n # Evaluate data for one epoch\n for batch in validation_dataloader:\n \n # Unpack this training batch from our dataloader. \n #\n # As we unpack the batch, we'll also copy each tensor to the GPU using \n # the `to` method.\n #\n # `batch` contains three pytorch tensors:\n # [0]: input ids \n # [1]: attention masks\n # [2]: labels \n b_input_ids = batch[0].to(device)\n b_input_mask = batch[1].to(device)\n b_labels = batch[2].to(device)\n \n # Tell pytorch not to bother with constructing the compute graph during\n # the forward pass, since this is only needed for backprop (training).\n with torch.no_grad(): \n\n # Forward pass, calculate logit predictions.\n # token_type_ids is the same as the \"segment ids\", which \n # differentiates sentence 1 and 2 in 2-sentence tasks.\n # The documentation for this `model` function is here: \n # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification\n # Get the \"logits\" output by the model. 
The \"logits\" are the output\n # values prior to applying an activation function like the softmax.\n output = model(b_input_ids, \n token_type_ids=None, \n attention_mask=b_input_mask,\n labels=b_labels)\n \n loss = output[0]\n logits = output[1]\n \n # Accumulate the validation loss.\n total_eval_loss += loss.item()\n\n # Move logits and labels to CPU\n logits = logits.detach().cpu().numpy()\n label_ids = b_labels.to('cpu').numpy()\n\n # Calculate the accuracy for this batch of test sentences, and\n # accumulate it over all batches.\n total_eval_accuracy += flat_accuracy(logits, label_ids)\n \n\n # Report the final accuracy for this validation run.\n avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)\n print(\" Accuracy: {0:.2f}\".format(avg_val_accuracy))\n\n # Calculate the average loss over all of the batches.\n avg_val_loss = total_eval_loss / len(validation_dataloader)\n \n # Measure how long the validation run took.\n validation_time = format_time(time.time() - t0)\n \n print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n print(\" Validation took: {:}\".format(validation_time))\n\n # Record all statistics from this epoch.\n training_stats.append(\n {\n 'epoch': epoch_i + 1,\n 'Training Loss': avg_train_loss,\n 'Valid. Loss': avg_val_loss,\n 'Valid. Accur.': avg_val_accuracy,\n 'Training Time': training_time,\n 'Validation Time': validation_time\n }\n )\n\nprint(\"\")\nprint(\"Training complete!\")\n\nprint(\"Total training took {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))", + "class": "Model Training", + "desc": "This code snippet is a comprehensive training script that trains a BERT model for sequence classification over multiple epochs, managing elements like setting seeds for reproducibility, computing loss, updating model parameters using an optimizer, and evaluating performance on a validation set, while recording training statistics and timing information.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.91516846 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.7243838 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 37, - "code": "plot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "cell_id": 22, + "code": "# PLOT THE VALIDATION LOSS\n\nimport matplotlib.pyplot as plt\n%matplotlib inline\n\nimport seaborn as sns\n\n# Use plot styling from seaborn.\nsns.set(style='darkgrid')\n\n# Increase the plot size and font size.\nsns.set(font_scale=1.5)\nplt.rcParams[\"figure.figsize\"] = (12,6)\n\n# Plot the learning curve.\nplt.plot(df_stats['Training Loss'], 'b-o', label=\"Training\")\nplt.plot(df_stats['Valid. 
Loss'], 'g-o', label=\"Validation\")\n\n# Label the plot.\nplt.title(\"Training & Validation Loss\")\nplt.xlabel(\"Epoch\")\nplt.ylabel(\"Loss\")\nplt.legend()\nplt.xticks([1, 2, 3, 4])\n\nplt.show()", "class": "Visualization", - "desc": "This code snippet calls the `plot_model_score` function to visualize and compare the mean F1 scores and their standard deviations for the training and validation datasets, obtained from cross-validation.", + "desc": "This code plots the training and validation loss over epochs using Matplotlib and Seaborn, styling the plot and labeling the axes and legend.", "testing": { "class": "Visualization", "subclass": "learning_history", "subclass_id": 35, - "predicted_subclass_probability": 0.977454 + "predicted_subclass_probability": 0.9963574 }, - "cluster": 0 - }, { - "cell_id": 53, - "code": "def plot_model_performance(model):\n fig, ax = plt.subplots(2, 1, figsize=(16, 8))\n ax[0].set_title('Loss')\n ax[1].set_title('Accuracy')\n\n n = np.arange(len(model.train_losses))\n ax[0].plot(n, model.train_losses, 'bo', label='Train', linestyle='--')\n ax[1].plot(n, model.train_accs, 'bo', linestyle='--')\n ax[0].plot(n, model.valid_losses, 'ro', label='Validation', linestyle='--')\n ax[1].plot(n, model.valid_accs, 'ro', linestyle='--')\n ax[0].legend()\n plt.show()", - "class": "Visualization", - "desc": "This code snippet defines a function to plot the training and validation loss and accuracy over epochs for a given model, using line plots with different colors and styles to distinguish between training and validation metrics.", + "cluster": -1 + }], + "notebook_id": 14, + "notebook_name": "bert-with-disaster-tweets.ipynb" + }, { + "cells": [{ + "cell_id": 17, + "code": "df2.to_csv(\"submission.csv\", index=False)", + "class": "Data Export", + "desc": "The code snippet exports the processed DataFrame containing the 'id' and 'target' columns to a CSV file named \"submission.csv\" without including DataFrame indices.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.9959293 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9993235 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 58, - "code": "plot_model_performance(model)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_model_performance` function to visualize the training and validation loss and accuracy over the epochs for the trained `BagOfEmbeddingsClassifier` model.", + "cell_id": 4, + "code": "# read training data\ndf = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ndisplay(df.head())\ndisplay(df.shape)", + "class": "Data Extraction", + "desc": "The code snippet reads the training data from a CSV file using pandas, and displays the first few rows and the shape of the DataFrame to understand its structure.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.8608386 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99592197 }, - "cluster": 2 + "cluster": 1 }, { - "cell_id": 60, - "code": "plot_model_performance(model)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_model_performance` function to visualize the training and validation loss and accuracy over the epochs for the newly trained `BagOfEmbeddingsClassifier` model with L2 regularization.", + 
"cell_id": 12, + "code": "# apply model to test data\ndf1 = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\ndisplay(df1.head())\ndisplay(df1.shape)", + "class": "Data Extraction", + "desc": "The code snippet reads the test data from a CSV file using pandas, and displays the first few rows and the shape of the DataFrame to understand its structure.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.8608386 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99765056 }, - "cluster": 2 + "cluster": 1 }, { - "cell_id": 68, - "code": "plot_model_performance(model)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_model_performance` function to visualize the training and validation loss and accuracy over the epochs for the trained `TransformerWrapper` model.", + "cell_id": 5, + "code": "# clean training text\nl=len(df)\ndisplay(l)\ncleanlist=[]\ntextlength=[]\nfor i in range(l):\n ct=cleantext.clean(df.iloc[i,3], clean_all= True)\n cleanlist.append(ct)\n lct=len(ct)\n textlength.append(lct)\n ", + "class": "Data Transform", + "desc": "The code snippet iterates over the training data, cleans the text in a specific column using the cleantext library, appends the cleaned text to a list, and records the length of each cleaned text.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.8608386 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9911814 }, - "cluster": 2 + "cluster": 6 }, { - "cell_id": 70, - "code": "plot_model_performance(model)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_model_performance` function to visualize the training and validation loss and accuracy over the epochs for the newly trained `TransformerWrapper` model.", + "cell_id": 6, + "code": "# combine clean text with training data\ndf_clean=pd.DataFrame(cleanlist)\ndf_clean.columns=['cleantext']\nframes=[df,df_clean]\nnewdf=pd.concat(frames, axis=1)\ndisplay(newdf)", + "class": "Data Transform", + "desc": "The code snippet creates a DataFrame from the cleaned text list, then combines this DataFrame with the original training DataFrame along the column axis to include the cleaned text as a new column, and displays the resulting DataFrame.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.8608386 + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.9970824 }, "cluster": 2 }, { - "cell_id": 72, - "code": "plot_model_performance(model)", - "class": "Visualization", - "desc": "This code snippet calls the `plot_model_performance` function to visualize the training and validation loss and accuracy over the epochs for the newly trained `TransformerWrapper` model with the updated configuration.", - "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.8608386 - }, - "cluster": 2 - }], - "notebook_id": 15, - "notebook_name": "nlp-for-tweets-from-bag-of-words-to-transformers" - }, { - "cells": [{ - "cell_id": 40, - "code": "sub_sample = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')\n\nsubmit = sub_sample.copy()\nsubmit.target = 
y_test\nsubmit.to_csv('submit.csv',index=False)", - "class": "Data Export", - "desc": "This code snippet reads a sample submission file into a DataFrame, updates the 'target' column with the logistic regression model's predictions, and saves the updated DataFrame as a CSV file named 'submit.csv', preparing the submission for a competition or further use.", + "cell_id": 14, + "code": "l=len(df1)\ndisplay(l)\npredlist=[]\n#l=1\nfor i in range(l):\n ct=cleantext.clean(df1.iloc[i,3], clean_all= True)\n new=predictor.predict(ct)\n predlist.append(new)", + "class": "Data Transform", + "desc": "The code snippet iterates over the test data, cleans each text entry using the cleantext library, makes predictions using the trained predictor on the cleaned text, and appends the prediction results to a list.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.999146 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99204236 }, - "cluster": 0 + "cluster": 6 }, { - "cell_id": 1, - "code": "df = pd.read_csv('../input/nlp-getting-started/train.csv')\ntest = pd.read_csv('../input/nlp-getting-started/test.csv')", - "class": "Data Extraction", - "desc": "This code snippet reads the training and test datasets from CSV files into pandas DataFrame objects named `df` and `test`, respectively, using the `pd.read_csv` method.", + "cell_id": 15, + "code": "df_pred=pd.DataFrame(predlist)\ndf_pred.columns=['target']\nframes=[df1,df_pred]\ndf2=pd.concat(frames, axis=1)\ndisplay(df2.head())", + "class": "Data Transform", + "desc": "The code snippet creates a DataFrame from the prediction results list, adds a 'target' column, combines this DataFrame with the original test DataFrame along the column axis, and displays the first few rows of the resulting DataFrame.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9997588 + "class": "Data_Export", + "subclass": "prepare_output", + "subclass_id": 55, + "predicted_subclass_probability": 0.9379232 }, - "cluster": 3 + "cluster": 2 }, { - "cell_id": 5, - "code": "df = df.drop_duplicates().reset_index(drop = True)", + "cell_id": 16, + "code": "df2.loc[df2['target']=='target','target']=1\ndf2.loc[df2['target']=='not_target','target']=0\ndisplay(df2['target'].mean())\ndf2=df2[['id','target']]\ndisplay(df2.shape)\ndisplay(df2.head())", "class": "Data Transform", - "desc": "This code snippet removes duplicate rows from the `df` DataFrame and resets the index, ensuring that the data is clean and properly indexed.", + "desc": "The code snippet converts categorical prediction results ('target' and 'not_target') into numerical values (1 and 0), calculates and displays the mean of the 'target' column, and then selects and displays the shape and first few rows of the DataFrame containing only the 'id' and 'target' columns.", "testing": { "class": "Data_Transform", - "subclass": "remove_duplicates", - "subclass_id": 19, - "predicted_subclass_probability": 0.8511636 + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.8699204 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 20, - "code": "a = []\nfor i in w_nd:\n if i in nw_nd:\n a.apeend(i)\na\n#there is no common item on both group of tweets", - "class": "Data Transform", - "desc": "This code snippet attempts to identify common keywords between disasterous and 
non-disasterous tweets by checking if elements of `w_nd` are in `nw_nd`, but contains a typo (`apeend` instead of `append`), which would lead to an intended list `a` holding no common items, concluding that there are no shared keywords between the two groups.", + "cell_id": 7, + "code": "# distribution of clean text length\ndisplay(pd.Series(textlength).describe())", + "class": "Exploratory Data Analysis", + "desc": "The code snippet generates and displays descriptive statistics of the cleaned text length using pandas, providing insights into the distribution of text lengths in the dataset.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.3016006 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9994604 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 23, - "code": "#Cleaning the Data", - "class": "Data Transform", - "desc": "This comment likely indicates that the following code will be focused on performing data cleaning operations to prepare the dataset for further analysis or model training.", + "cell_id": 13, + "code": "# example\ndf1.iloc[0,3]", + "class": "Exploratory Data Analysis", + "desc": "The code snippet retrieves and displays the content of the fourth column for the first row in the test DataFrame to examine an example data entry.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.892342 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9183029 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 24, - "code": "import re\n#Conver lowercase remove punctuation and Character and then strip \ntext = df[\"text\"].iloc[0]\nprint(text)\ntext = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\ntxt = text.split()\nprint(txt)\n\n", - "class": "Data Transform", - "desc": "This code snippet illustrates converting text to lowercase, removing punctuation and special characters, and then splitting the cleaned text into words, using regular expressions and string operations on the first element of the 'text' column in the `df` DataFrame to demonstrate the cleaning process.", + "cell_id": 0, + "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "class": "Imports and Environment", + "desc": "The code snippet sets up the environment by importing necessary libraries such as NumPy and pandas for data manipulation and processing, and lists the files available for input using the os library.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9692003 + "class": "Exploratory_Data_Analysis", + "subclass": "list_files", + "subclass_id": 88, + "predicted_subclass_probability": 0.99921954 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 25, - "code": "#remove stopwords\nimport nltk\nlst_stopwords = nltk.corpus.stopwords.words(\"english\")\ntxt = [word for word in txt if word not in lst_stopwords]\nprint(txt)", - "class": "Data Transform", - "desc": "This code snippet removes common English stopwords from the previously cleaned and split text using a list comprehension, leveraging the NLTK library's list of stopwords to filter out these words, and then prints the resulting list.", + "cell_id": 1, + "code": "pip install cleantext", + "class": "Imports and Environment", + "desc": "The code snippet installs the 'cleantext' library using the pip package manager.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9619886 + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.9904579 }, "cluster": 1 }, { - "cell_id": 26, - "code": "#stemming\nps = nltk.stem.porter.PorterStemmer()\nprint([ps.stem(word) for word in txt])", - "class": "Data Transform", - "desc": "This code snippet applies stemming to the cleaned and filtered text using the Porter Stemmer from the NLTK library, transforming each word in `txt` to its root form and printing the results.", + "cell_id": 2, + "code": "pip install ktrain", + "class": "Imports and Environment", + "desc": "The code snippet installs the 'ktrain' library using the pip package manager.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.23080209 + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.99148107 }, "cluster": 1 }, { - "cell_id": 27, - "code": "#Lemmentization\nlem = nltk.stem.wordnet.WordNetLemmatizer()\nprint([lem.lemmatize(word) for word in txt])", - "class": "Data Transform", - "desc": "This code snippet applies lemmatization to the cleaned and filtered text using the WordNet Lemmatizer from the NLTK library, transforming each word in `txt` to its base or dictionary form, and prints the results.", + "cell_id": 3, + "code": "import pandas as pd\nimport ktrain\nfrom ktrain import text\nimport cleantext\nimport warnings\nwarnings.filterwarnings(\"ignore\")", + "class": "Imports and 
Environment", + "desc": "The code snippet imports pandas for data manipulation, ktrain and its text module for machine learning model training and inference, cleantext for cleaning text data, and suppresses warnings using the warnings library.", "testing": { - "class": "Model_Train", - "subclass": "define_search_model", - "subclass_id": 82, - "predicted_subclass_probability": 0.22034359 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.9993291 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 28, - "code": "#to apply all the technique to all the records on dataset\ndef utils_preprocess_text(text, flg_stemm=True, flg_lemm =True, lst_stopwords=None ):\n text = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\n \n #tokenization(convert from string to List)\n lst_text = text.split()\n #remove stopwords\n if lst_stopwords is not None:\n lst_text = [word for word in lst_text if word not in\n lst_stopwords]\n \n #stemming\n if flg_stemm == True:\n ps = nltk.stem.porter.PorterStemmer()\n lst_text = [ps.stem(word) for word in lst_text]\n \n #Lemmentization\n if flg_lemm == True:\n lem = nltk.stem.wordnet.WordNetLemmatizer()\n lst_text = [lem.lemmatize(word) for word in lst_text]\n \n # back to string from list\n text = \" \".join(lst_text)\n return text\n ", - "class": "Data Transform", - "desc": "This code snippet defines a function `utils_preprocess_text` that applies a series of preprocessing techniques including text normalization, tokenization, stopword removal, stemming, and lemmatization to a given text input, and then converts the processed list of words back into a string, enabling these transformations to be applied to all records in the dataset.", + "cell_id": 11, + "code": "# example\npredictor.predict(['calm','earthquake'])", + "class": "Model Evaluation", + "desc": "The code snippet uses the trained predictor to make predictions on example input text instances ('calm' and 'earthquake').", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.93941545 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.9897764 }, "cluster": 1 }, { - "cell_id": 29, - "code": "#apply dataset\ndf['clean_text'] = df['text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))\ntest['clean_text'] = test['text'].apply(lambda x: utils_preprocess_text(x, flg_stemm = False, flg_lemm=True))", - "class": "Data Transform", - "desc": "This code snippet applies the `utils_preprocess_text` function to the 'text' column of both the training (`df`) and test datasets (`test`), creating new columns 'clean_text' with the preprocessed text without stemming, ensuring the dataset is uniformly cleaned and formatted for further analysis or model training.", + "cell_id": 8, + "code": "#https://github.com/amaiya/ktrain\n# train model\n(x_train, y_train), (x_test, y_test), preproc=text.texts_from_df(newdf, 'cleantext',label_columns=['target'],\n maxlen=127,max_features=100000,\n preprocess_mode='bert', val_pct=.1)", + "class": "Model Training", + "desc": "The code snippet extracts training and testing datasets, along with the preprocessing steps using ktrain's texts_from_df method, setting up the data for model training with BERT by specifying parameters like max text length, maximum features, and validation split.", "testing": { - "class": "Data_Transform", - "subclass": 
"feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.96186244 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.38557464 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 30, - "code": "#Target Encoding\n", - "class": "Data Transform", - "desc": "This comment likely indicates that the following code will focus on encoding the target variable, possibly converting categorical labels into numerical format for use in machine learning models.", + "cell_id": 9, + "code": "model=text.text_classifier('bert', (x_train, y_train), preproc=preproc)\nlearner=ktrain.get_learner(model, train_data=(x_train, y_train),\n val_data=(x_test, y_test),\n batch_size=32)", + "class": "Model Training", + "desc": "The code snippet initializes a BERT text classification model using ktrain's text_classifier method, then creates a learner object with the model and training/testing data, specifying a batch size for training.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.4920049 + "predicted_subclass_probability": 0.87104684 }, "cluster": 0 }, { - "cell_id": 31, - "code": "import category_encoders as ce\n\n# Target encoding\nfeatures = ['keyword', 'location']\nencoder = ce.TargetEncoder(cols=features)\nencoder.fit(df[features],df['target'])\n\ndf = df.join(encoder.transform(df[features]).add_suffix('_target'))\ntest = test.join(encoder.transform(test[features]).add_suffix('_target'))\n", - "class": "Data Transform", - "desc": "This code snippet uses the `category_encoders` library's `TargetEncoder` to encode the 'keyword' and 'location' columns based on the target variable in the training dataset (`df`), and applies the same transformation to the test dataset (`test`), appending the encoded features with a '_target' suffix.", + "cell_id": 10, + "code": "learner.fit_onecycle(2e-5, 3)\npredictor=ktrain.get_predictor(learner.model, preproc)", + "class": "Model Training", + "desc": "The code snippet trains the initialized BERT model using the One Cycle learning rate policy over 3 epochs with a learning rate of 2e-5 and then creates a predictor object to facilitate easier predictions using the trained model.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9991437 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9968579 }, "cluster": 1 - }, { - "cell_id": 33, - "code": "from sklearn.feature_extraction.text import TfidfVectorizer\n\nvec_text = TfidfVectorizer(min_df = 10, ngram_range = (1,2), stop_words='english') \n# Only include >=10 occurrences\n# Have unigrams and bigrams\ntext_vec = vec_text.fit_transform(df['clean_text'])\ntext_vec_test = vec_text.transform(test['clean_text'])\nX_train_text = pd.DataFrame(text_vec.toarray(), columns=vec_text.get_feature_names())\nX_test_text = pd.DataFrame(text_vec_test.toarray(), columns=vec_text.get_feature_names())\nprint (X_train_text.shape)", - "class": "Data Transform", - "desc": "This code snippet utilizes `TfidfVectorizer` from scikit-learn to convert the 'clean_text' column in both the training and test datasets into TF-IDF features for unigrams and bigrams occurring at least 10 times, and then transforms these features into DataFrame objects named `X_train_text` and `X_test_text`.", + }], + "notebook_id": 15, + "notebook_name": "ktrain-disaster-tweet-model.ipynb" + }, { 
+ "cells": [{ + "cell_id": 43, + "code": "test_bow = vectorizer.transform(test.tokens)\ntest_bow = selector.transform(test_bow)\nclassifier = LogisticRegression(C=0.1)\n\n# use the whole training dataset now\nclassifier.fit(x, y)\npredicted = classifier.predict(test_bow)\nsubmission = pd.DataFrame({'id': test.id, 'target': predicted})\nsubmission.to_csv('bow-linear.csv', index=False)", + "class": "Data Export", + "desc": "This code snippet transforms the test dataset using the previously fitted vectorizer and feature selector, trains a Logistic Regression classifier on the entire training dataset, predicts the target variable for the test dataset, and saves the predictions to a CSV file named 'bow-linear.csv' using pandas.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.6342874 - }, - "cluster": 1 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.99929416 + }, + "cluster": -1 }, { - "cell_id": 34, - "code": "df = df.join(X_train_text, rsuffix='_text')\ntest = test.join(X_test_text, rsuffix='_text')\n", - "class": "Data Transform", - "desc": "This code snippet joins the TF-IDF features from `X_train_text` and `X_test_text` back to the original training (`df`) and test (`test`) DataFrames, appending the new feature columns to these datasets.", + "cell_id": 62, + "code": "predicted = logits > 0\nsubmission = pd.DataFrame({'id': test.id, 'target': predicted.astype(np.int)})\nsubmission.to_csv('embeddings.csv', index=False)", + "class": "Data Export", + "desc": "This code snippet generates predictions for the target variable by thresholding the logits, creates a DataFrame containing the test IDs and the predictions, and saves it to a CSV file named 'embeddings.csv' using pandas.", "testing": { - "class": "Data_Transform", - "subclass": "merge", - "subclass_id": 32, - "predicted_subclass_probability": 0.99786144 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9992009 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 2, - "code": "df.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `df` DataFrame, giving an initial glimpse of the dataset's structure and contents.", + "cell_id": 73, + "code": "# trainer.predict returns a list with batch results\nlogits = np.concatenate(trainer.predict(model, test_loader), axis=0)\npredicted = logits.argmax(1)\nsubmission = pd.DataFrame({'id': test.id, 'target': predicted})\nsubmission.to_csv('roberta.csv', index=False)", + "class": "Data Export", + "desc": "This code snippet uses the PyTorch Lightning `Trainer` to predict logits for the test dataset with the trained `TransformerWrapper` model, generates predictions by taking the argmax of the logits, creates a DataFrame containing the test IDs and predictions, and saves it to a CSV file named 'roberta.csv' using pandas.", + "testing": { + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9993868 + }, + "cluster": -1 + }, { + "cell_id": 74, + "code": "!head *.csv", + "class": "Data Export", + "desc": "This code snippet executes a shell command to display the first few lines of all CSV files in the current directory, showcasing the top rows of each generated submission file.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - 
"predicted_subclass_probability": 0.9997553 + "predicted_subclass_probability": 0.99973243 }, - "cluster": 12 + "cluster": -1 }, { - "cell_id": 3, - "code": "df.info()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet provides a concise summary of the `df` DataFrame, including the data types, non-null counts, and memory usage of each column.", + "cell_id": 2, + "code": "train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\ntest = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')", + "class": "Data Extraction", + "desc": "This code snippet reads the training and test datasets from CSV files located at the specified paths using pandas.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9992442 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99975055 }, - "cluster": 12 + "cluster": 0 }, { - "cell_id": 4, - "code": "df.duplicated().sum()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates and returns the total number of duplicated rows in the `df` DataFrame, which helps in understanding the dataset's quality and cleanliness.", + "cell_id": 21, + "code": "x = train_bow\ny = train['target']", + "class": "Data Extraction", + "desc": "This code snippet assigns the binary bag-of-words representation of the training data to `x` and the target column to `y`, in preparation for model training.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_duplicates", - "subclass_id": 38, - "predicted_subclass_probability": 0.8993749 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.99928766 }, - "cluster": 8 + "cluster": -1 }, { - "cell_id": 6, - "code": "df['target'].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts and displays the frequency of each unique value in the 'target' column of the `df` DataFrame, providing insight into the class distribution of the target variable.", + "cell_id": 44, + "code": "filename = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'\nword_dict = {}\nembeddings = []\nwith open(filename, 'r') as f:\n for line in tqdm(f, total=400000):\n word, vector_string = line.split(' ', 1)\n vector = [float(value) for value in vector_string.split()]\n embeddings.append(vector)\n word_dict[word] = len(word_dict)\n\nembeddings = torch.tensor(embeddings)", + "class": "Data Extraction", + "desc": "This code snippet reads GloVe pre-trained word embeddings from a specified file, creating a dictionary that maps words to their corresponding vector indices and storing the embeddings in a tensor using PyTorch, with a progress bar provided by tqdm.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9995012 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9478619 }, - "cluster": 9 + "cluster": 1 }, { - "cell_id": 7, - "code": "df[df['target'] == 0][:1]", - "class": "Exploratory Data Analysis", - "desc": "This code snippet filters the `df` DataFrame to show the first row where the 'target' column is equal to 0, allowing for a specific examination of a negative class example.", + "cell_id": 63, + "code": "pretrained_name = 
'distilroberta-base'\ntokenizer = RobertaTokenizerFast.from_pretrained(pretrained_name)\nroberta = RobertaForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)", + "class": "Data Extraction", + "desc": "This code snippet loads the `RobertaTokenizerFast` and `RobertaForSequenceClassification` models pre-trained on the 'distilroberta-base' dataset from the transformers library, setting up for fine-tuning on a classification task with 2 labels.", "testing": { - "class": "Data_Transform", - "subclass": "filter", - "subclass_id": 14, - "predicted_subclass_probability": 0.98627037 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.9948885 }, - "cluster": 5 + "cluster": 1 }, { - "cell_id": 8, - "code": "df[df['target'] == 1][:1]", - "class": "Exploratory Data Analysis", - "desc": "This code snippet filters the `df` DataFrame to show the first row where the 'target' column is equal to 1, allowing for a specific examination of a positive class example.", + "cell_id": 9, + "code": "train.drop(['location', 'keyword'], axis=1, inplace=True)\ntest.drop(['location', 'keyword'], axis=1, inplace=True)", + "class": "Data Transform", + "desc": "This code snippet removes the 'location' and 'keyword' columns from both the training and test datasets using the `drop` method from pandas.", "testing": { "class": "Data_Transform", - "subclass": "filter", - "subclass_id": 14, - "predicted_subclass_probability": 0.98621887 + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.9991115 }, - "cluster": 5 + "cluster": 3 }, { - "cell_id": 9, - "code": "df.isnull().sum()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet calculates and displays the total number of missing (null) values in each column of the `df` DataFrame, helping to identify any data completeness issues.", + "cell_id": 12, + "code": "text = \"Don't split #hashtags!\"\nprint('Before:', [t for t in tokenizer(text)])\n\nprefixes = list(nlp.Defaults.prefixes)\nprefixes.remove('#')\nprefix_regex = spacy.util.compile_prefix_regex(prefixes)\ntokenizer.prefix_search = prefix_regex.search\n\nprint('After:', [t for t in tokenizer(text)])", + "class": "Data Transform", + "desc": "This code snippet customizes the Spacy tokenizer to avoid splitting hashtags by modifying the default prefix rules and re-compiling the tokenizer using the updated prefixes.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.9985019 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.5677337 }, - "cluster": 8 + "cluster": 6 }, { - "cell_id": 10, - "code": "df['keyword'].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts and displays the frequency of each unique value in the 'keyword' column of the `df` DataFrame, providing an overview of the distribution of keywords in the dataset.", + "cell_id": 14, + "code": "train['tokens'] = train['text'].apply(lambda row: [t.text.lower() for t in tokenizer(row) if not t.is_space])\ntest['tokens'] = test['text'].apply(lambda row: [t.text.lower() for t in tokenizer(row) if not t.is_space])", + "class": "Data Transform", + "desc": "This code snippet adds a new column 'tokens' to both the training and test datasets, containing lowercased tokens from the 'text' column with whitespace tokens removed, using 
Spacy's tokenizer and pandas' `apply` method.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9995165 + "class": "Data_Transform", + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.5443664 }, - "cluster": 9 + "cluster": 3 }, { - "cell_id": 11, - "code": "df['location'].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts and displays the frequency of each unique value in the 'location' column of the `df` DataFrame, providing an overview of the distribution of locations in the dataset.", + "cell_id": 18, + "code": "from sklearn.feature_extraction.text import CountVectorizer\n\n# min and max document frequency (ratio of documents containing that token)\nmin_df = 5\nmax_df = 0.6\n\n# limit vocabulary size as a function of the training data\nmax_features = len(train) * 2\n\nvectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x, min_df=min_df, max_df=max_df, max_features=max_features, binary=True)\ntrain_bow = vectorizer.fit_transform(train.tokens)\ntrain_bow", + "class": "Data Transform", + "desc": "This code snippet initializes a `CountVectorizer` from scikit-learn with specified minimum and maximum document frequencies and maximum vocabulary size, then transforms the tokenized training data into a binary bag-of-words representation.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9994772 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.97597075 }, - "cluster": 9 + "cluster": 5 }, { - "cell_id": 21, - "code": "#check no of unique keyword and location\nprint(df.keyword.nunique())\ndf['location'].nunique()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the number of unique keywords in the 'keyword' column and calculates the number of unique values in the 'location' column of the `df` DataFrame, providing insights into the diversity of these features.", + "cell_id": 33, + "code": "# min and max document frequency (ratio of documents containing that token)\nmin_df = 10\nmax_df = 0.6\n\n# limit vocabulary size as a function of the training data\nmax_features = len(train) * 2\n\n# single words to 3-grams\nngram_range = (1, 3)\n\nvectorizer = CountVectorizer(lowercase=False, tokenizer=lambda x: x, min_df=min_df, max_df=max_df, max_features=max_features, binary=True, ngram_range=ngram_range)\nx = train_bow = vectorizer.fit_transform(train.tokens)\n\nvocab = vectorizer.get_feature_names()\nword_count = train_bow.toarray().sum(0)\n\nplot_top_values(word_count, k, vocab, 'Count', 'Type')", + "class": "Data Transform", + "desc": "This code snippet initializes a `CountVectorizer` with specified document frequency, vocabulary size, and n-gram range, vectorizes the tokenized training data into a binary bag-of-words representation, and then visualizes the top k most frequent n-grams using the `plot_top_values` function.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_unique_values", - "subclass_id": 54, - "predicted_subclass_probability": 0.9503388 + "class": "Visualization", + "subclass": "relationship", + "subclass_id": 81, + "predicted_subclass_probability": 0.74612176 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 32, - "code": "df.isnull().sum()", - "class": "Exploratory Data 
Analysis", - "desc": "This code snippet calculates and displays the total number of missing (null) values in each column of the `df` DataFrame, helping to identify any remaining data completeness issues after preprocessing.", + "cell_id": 46, + "code": "oov_count = Counter()\nall_tokens = []\n\nfor row in train.tokens:\n tokens = [t[1:] if t.startswith('#') else t for t in row]\n all_tokens.append(tokens)\n oov_count.update(set(t for t in tokens if t not in word_dict))", + "class": "Data Transform", + "desc": "This code snippet processes the tokenized training dataset, adjusting tokens by removing hashtags and counting the occurrences of tokens not found in the word dictionary (out-of-vocabulary tokens) using Counter from the collections module.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.9985019 + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.68699807 }, - "cluster": 8 + "cluster": 6 }, { - "cell_id": 35, - "code": "df.head(1)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first row of the `df` DataFrame, allowing for the inspection of the DataFrame's current structure and the newly added features after joining the TF-IDF representations.", + "cell_id": 47, + "code": "test_tokens = []\nfor row in test.tokens:\n tokens = [t[1:] if t.startswith('#') else t for t in row]\n test_tokens.append(tokens)", + "class": "Data Transform", + "desc": "This code snippet processes the tokenized test dataset, adjusting tokens by removing hashtags and storing the modified tokens in the `test_tokens` list.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997688 + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.9887138 }, - "cluster": 2 + "cluster": 6 }, { - "cell_id": 0, - "code": "import numpy as np\nimport pandas as pd", - "class": "Imports and Environment", - "desc": "This code snippet imports the NumPy and pandas libraries, which are essential for numerical computations and data manipulation in Python, respectively.", + "cell_id": 49, + "code": "words_to_add = [w for w in oov_count if oov_count[w] > 2]\nfor word in words_to_add:\n word_dict[word] = len(word_dict)\n\nnew_vectors = torch.zeros((len(words_to_add), embeddings.shape[1]))\nembeddings = torch.cat([embeddings, new_vectors], dim=0)\nprint(len(word_dict), embeddings.shape)", + "class": "Data Transform", + "desc": "This code snippet adds frequently occurring out-of-vocabulary words to the word dictionary, initializes new zero vectors for these words, and concatenates these vectors to the existing embeddings tensor, then prints the updated length of the word dictionary and the shape of the embeddings tensor.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9993284 + "class": "Data_Transform", + "subclass": "concatenate", + "subclass_id": 11, + "predicted_subclass_probability": 0.89962006 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 12, - "code": "import matplotlib.pyplot as plt\n%matplotlib inline\nimport seaborn as sns", - "class": "Imports and Environment", - "desc": "This code snippet imports the Matplotlib and Seaborn libraries to enable data visualization and sets up inline plotting for Jupyter Notebooks.", + "cell_id": 51, + 
"code": "def convert_to_indices(all_tokens):\n word_indices = []\n\n for tokens in all_tokens:\n tweet_inds = torch.tensor([word_dict[t] for t in tokens if t in word_dict], dtype=torch.long)\n word_indices.append(tweet_inds)\n \n return word_indices\n\nword_indices = convert_to_indices(all_tokens)\ntest_word_indices = convert_to_indices(test_tokens)", + "class": "Data Transform", + "desc": "This code snippet defines a function `convert_to_indices` that converts tokenized text data into lists of corresponding word indices using the word dictionary, then applies this function to the training and test token lists to create index representations of the datasets.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.99939334 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.92438155 }, - "cluster": 0 + "cluster": 8 }, { - "cell_id": 39, - "code": "y_test[:10]", - "class": "Model Evaluation", - "desc": "This code snippet displays the first 10 predicted target values from the logistic regression model, allowing for an initial assessment of the model's output.", + "cell_id": 54, + "code": "def collate_as_list(samples):\n \"\"\"Function for the DataLoader to combine samples in a batch. Each sample is a (x, y) pair.\"\"\"\n x, y = list(zip(*samples))\n if y[0] is None:\n return x\n return x, torch.tensor(y).float()\n\n\nclass WordIndexDataset(Dataset):\n def __init__(self, x, y=None):\n self.x = x\n self.y = y\n \n def __getitem__(self, i):\n if self.y is not None:\n return self.x[i], self.y[i]\n else:\n return self.x[i], None\n \n def __len__(self):\n return len(self.x)\n", + "class": "Data Transform", + "desc": "This code snippet defines a custom collate function `collate_as_list` for combining samples in a batch and a PyTorch Dataset class `WordIndexDataset` for handling datasets with text indices and optional targets.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99966455 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.7811093 }, "cluster": 0 }, { - "cell_id": 41, - "code": "print ('Training accuracy: %.4f' % pipeline.score(X_train, y_train))", - "class": "Model Evaluation", - "desc": "This code snippet prints the training accuracy of the logistic regression model, using the defined pipeline to score the model's performance on the training dataset.", + "cell_id": 55, + "code": "validation_size = int(0.1 * len(train))\nvalidation_inds = np.random.choice(np.arange(len(train)), size=validation_size, replace=False)\nis_train = np.ones(len(train), dtype=np.bool)\nis_train[validation_inds] = False\n\n# use an object array since we have varied size tensors\ntweets = np.array(word_indices, dtype=object)\ntarget = train.target.to_numpy()\n# train_tweets, valid_tweets, train_target, valid_target = train_test_split(tweets, target, test_size=0.1, stratify=target)\ntrain_tweets = tweets[is_train].tolist()\ntrain_target = target[is_train]\nvalid_tweets = tweets[~is_train].tolist()\nvalid_target = target[~is_train]\n\ntrain_data = WordIndexDataset(train_tweets, train_target)\nvalid_data = WordIndexDataset(valid_tweets, valid_target)\ntest_data = WordIndexDataset(test_word_indices)\ntrain_loader = DataLoader(train_data, batch_size=32, collate_fn=collate_as_list)\nvalid_loader = DataLoader(valid_data, 
batch_size=256, collate_fn=collate_as_list)\ntest_loader = DataLoader(test_data, batch_size=256, collate_fn=collate_as_list)", + "class": "Data Transform", + "desc": "This code snippet splits the dataset into training and validation sets, creates custom PyTorch datasets for these splits, and defines DataLoaders for batching and collating the training, validation, and test data.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.8493974 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.46862042 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 42, - "code": "from sklearn.metrics import f1_score\n\nprint ('Training f-1 score: %.4f' % f1_score(y_train, pipeline.predict(X_train)))", - "class": "Model Evaluation", - "desc": "This code snippet calculates and prints the F1 score for the logistic regression model on the training dataset, providing a measure of the model's classification performance in terms of precision and recall balance.", + "cell_id": 64, + "code": "# create tensors of variable sizes\n# note that the tokenizer returns a tensor with shape [1, num_tokens]\ntrain_tokens = train.text[is_train].apply(lambda s: tokenizer.encode(s, return_tensors='pt')[0]).tolist()\nvalid_tokens = train.text[~is_train].apply(lambda s: tokenizer.encode(s, return_tensors='pt')[0]).tolist()\ntest_tokens = test.text.apply(lambda s: tokenizer.encode(s, return_tensors='pt')[0]).tolist()\n\n# add padding to have a fixed size matrix. With bigger datasets we should be careful about memory usage, but this is small enough to skip this kind of optimization\npadding = tokenizer.pad_token_id\nx_train = pad_sequence(train_tokens, batch_first=True, padding_value=padding)\nx_valid = pad_sequence(valid_tokens, batch_first=True, padding_value=padding)\nx_test = pad_sequence(test_tokens, batch_first=True, padding_value=padding)\n\nx_train_mask = x_train != padding\nx_valid_mask = x_valid != padding\nx_test_mask = x_test != padding\nprint(f'x_train shape: {x_train.shape}, x_valid shape: {x_valid.shape}, x_test shape: {x_test.shape}')", + "class": "Data Transform", + "desc": "This code snippet tokenizes and encodes the text data from the training, validation, and test sets using the pre-trained Roberta tokenizer, pads the sequences to create fixed-size matrices, and generates corresponding attention masks, then prints the shapes of the padded datasets.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.5244426 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9920512 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 43, - "code": "# Confusion matrix\nfrom sklearn.metrics import confusion_matrix\npd.DataFrame(confusion_matrix(y_train, pipeline.predict(X_train)))", - "class": "Model Evaluation", - "desc": "This code snippet calculates and displays the confusion matrix for the logistic regression model's predictions on the training dataset, offering insight into the number of true positives, true negatives, false positives, and false negatives.", + "cell_id": 65, + "code": "train_data = TensorDataset(x_train, x_train_mask, torch.tensor(train_target))\nvalid_data = TensorDataset(x_valid, x_valid_mask, torch.tensor(valid_target))\ntest_data = TensorDataset(x_test, x_test_mask)\n\ntrain_loader = DataLoader(train_data, 
batch_size=32)\nvalid_loader = DataLoader(valid_data, batch_size=256)\ntest_loader = DataLoader(test_data, batch_size=256)", + "class": "Data Transform", + "desc": "This code snippet creates PyTorch TensorDatasets for the training, validation, and test sets, including the input data and attention masks, and defines DataLoaders for batching these datasets with specified batch sizes.", "testing": { "class": "Data_Transform", "subclass": "create_dataframe", "subclass_id": 12, - "predicted_subclass_probability": 0.91332203 + "predicted_subclass_probability": 0.97746176 + }, + "cluster": 5 + }, { + "cell_id": 3, + "code": "train.head()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays the first five rows of the training dataset using the `head` method from pandas.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997507 }, "cluster": 0 }, { - "cell_id": 36, - "code": "#Logistic Regression", - "class": "Model Training", - "desc": "This comment likely indicates that the following code will focus on training a logistic regression model.", + "cell_id": 8, + "code": "min_freq = 5\nabove_threshold = train.location.value_counts() > min_freq\nfrequent_places = above_threshold.index[above_threshold]\ndata = train[train.location.isin(frequent_places)].location\nprint(f'{data.nunique()} unique locations with more than {min_freq} occurrences')", + "class": "Exploratory Data Analysis", + "desc": "This code snippet filters and prints the number of unique locations in the training dataset that have more than a specified minimum frequency of occurrences, using pandas.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.962167 + "subclass": "count_unique_values", + "subclass_id": 54, + "predicted_subclass_probability": 0.9539694 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 37, - "code": "from sklearn.linear_model import LogisticRegression\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.preprocessing import MinMaxScaler\n\nfeatures_to_drop = ['id', 'keyword','location', 'text','clean_text' ]\nscaler = MinMaxScaler()", - "class": "Model Training", - "desc": "This code snippet prepares for model training by importing necessary modules, initializing a `LogisticRegression` model, dropping unnecessary features, and initializing a `MinMaxScaler` for scaling the features.", + "cell_id": 10, + "code": "train.text.isna().sum(), test.text.isna().sum()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet counts and outputs the number of missing values in the 'text' column for both the training and test datasets using the `isna` and `sum` methods from pandas.", "testing": { - "class": "Data_Transform", - "subclass": "normalization", - "subclass_id": 18, - "predicted_subclass_probability": 0.9946407 + "class": "Exploratory_Data_Analysis", + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.9989147 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 38, - "code": "X_train = df.drop(columns = features_to_drop + ['target'])\nX_test = test.drop(columns = features_to_drop)\ny_train = df.target\nlr = LogisticRegression(solver = 'liblinear', random_state = 777)\n\npipeline = Pipeline([('scale', scaler), ('lr',lr),])\npipeline.fit(X_train, y_train)\ny_test = pipeline.predict(X_test)", - "class": "Model Training", - "desc": "This code snippet defines 
feature and target training datasets (`X_train` and `y_train` respectively), drops unnecessary columns for the test set (`X_test`), then constructs and trains a logistic regression model within a pipeline that includes MinMax scaling, and finally makes predictions on the test set.", + "cell_id": 11, + "code": "nlp = English()\ntokenizer = nlp.tokenizer\ntokens = tokenizer('This is a test!')\nprint(tokens)\nprint(type(tokens))\nprint([t.text for t in tokens])", + "class": "Exploratory Data Analysis", + "desc": "This code snippet initializes a tokenizer from the `spacy` English language model and demonstrates tokenization on a sample text string, printing the resulting tokens and their type, as well as the token texts.", "testing": { - "class": "Model_Train", - "subclass": "find_best_model_class", - "subclass_id": 3, - "predicted_subclass_probability": 0.2473382 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.46164313 }, - "cluster": 0 + "cluster": 3 }, { "cell_id": 13, - "code": "#Most Common Words", - "class": "Visualization", - "desc": "This comment is likely a heading or a section marker indicating that the following code will visualize the most common words in the dataset.", + "code": "text = 'This is a test\\n , ok?'\nprint('All tokens:', [t.text for t in tokenizer(text)])\n\nprint('Check for is_space():', [t.text for t in tokenizer(text) if not t.is_space])", + "class": "Exploratory Data Analysis", + "desc": "This code snippet tokenizes a sample text string using Spacy and prints all tokens first, and then filters out and prints tokens that are not whitespace.", + "testing": { + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9729691 + }, + "cluster": 3 + }, { + "cell_id": 15, + "code": "train.sample(10)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet displays a random sample of 10 rows from the training dataset using the `sample` method from pandas.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.74166125 + "predicted_subclass_probability": 0.99975437 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 14, - "code": "# plt.figure(figsize=(10,8))\n# sns.barplot(x = df['keyword'].value_counts().head(5).index, y = df['keyword'].value_counts().head(5))", - "class": "Visualization", - "desc": "This commented-out code was intended to create a bar plot of the top 5 most common keywords in the `df` DataFrame, using Seaborn to visualize their frequencies.", + "cell_id": 25, + "code": "def get_rows_containing(data, term):\n \"\"\"Return rows containing a term\"\"\"\n has_term = data.tokens.apply(lambda row: term in row)\n return data[has_term]\n\nterms = ['bags', 'australia']\nfor term in terms:\n rows = get_rows_containing(train, term)\n print(f'Distribution containing {term}:')\n print(rows.target.value_counts())\n for i, row in rows.sample(5).iterrows():\n print(row.target, row.text)\n print()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet defines a function to retrieve rows containing specific terms from the training dataset and then prints the distribution of the target variable and sample texts for each specified term.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.97318214 + "class": "Exploratory_Data_Analysis", + "subclass": 
"count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.57849276 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 15, - "code": "plt.figure(figsize= (9,6))\nsns.countplot(y = df.keyword, order = df.keyword.value_counts().iloc[:15].index)\nplt.title('Top 15 Keyword')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet creates a horizontal bar plot of the top 15 most common keywords in the `df` DataFrame using Seaborn, with the count of each keyword displayed, and a title 'Top 15 Keyword'.", + "cell_id": 34, + "code": "x.shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the shape of the matrix `x`, which is the binary bag-of-words representation of the tokenized training data, created using the `CountVectorizer`.", "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9776137 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9995432 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 16, - "code": "sns.countplot(y = df.target)", - "class": "Visualization", - "desc": "This code snippet generates a horizontal count plot using Seaborn to visualize the distribution of the target variable in the `df` DataFrame, showing the frequency of each class.", + "cell_id": 45, + "code": "print(embeddings.shape)\nprint(len(word_dict))", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints the shape of the embeddings tensor and the length of the word dictionary to verify the dimensions and the number of words loaded from the GloVe embeddings file.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.98655856 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9995413 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 17, - "code": "#Top 10 words in Disasterous and Non-Disasterous tweets", - "class": "Visualization", - "desc": "This comment likely indicates that the following code will visualize the top 10 most frequent words in tweets categorized as disasterous and non-disasterous.", + "cell_id": 48, + "code": "oov_count.most_common(10)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints the 10 most common out-of-vocabulary tokens found in the training dataset by utilizing the `most_common` method of the `Counter` object.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.7826869 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.94204676 }, "cluster": -1 }, { - "cell_id": 18, - "code": "w_nd = df[df.target == 0].keyword.value_counts().head(10)\nsns.barplot(w_nd, w_nd.index, color = 'c')\nplt.title('Top keyword for Disaster tweet')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet creates a horizontal bar plot using Seaborn to display the top 10 keywords in tweets labeled as non-disasterous (target == 0) in the `df` DataFrame, with a title 'Top keyword for Disaster tweet'.", + "cell_id": 50, + "code": "len(oov_count)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the number of unique out-of-vocabulary tokens found in the training dataset by printing the length of the `oov_count` Counter object.", 
"testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.8726834 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9988147 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 19, - "code": "nw_nd = df[df.target == 1].keyword.value_counts().head(10)\nsns.barplot(nw_nd, nw_nd.index, color = 'y')\nplt.title('Top keyword for Non-Disaster tweet')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet creates a horizontal bar plot using Seaborn to display the top 10 keywords in tweets labeled as disasterous (target == 1) in the `df` DataFrame, with a title 'Top keyword for Non-Disaster tweet'.", + "cell_id": 0, + "code": "from collections import Counter\n\nimport seaborn as sns\nimport numpy as np \nimport pandas as pd\nfrom matplotlib import pyplot as plt\nimport spacy\nfrom tqdm import tqdm\nfrom spacy.lang.en import English\nimport torch\nfrom torch import nn\nfrom torch.nn.utils.rnn import pad_sequence\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset, TensorDataset, DataLoader\nimport pytorch_lightning as pl\n\nfrom sklearn.model_selection import train_test_split, cross_validate, cross_val_score\nfrom sklearn.metrics import f1_score\nfrom sklearn.linear_model import LogisticRegression, RidgeClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\nfrom sklearn.svm import SVC\nfrom xgboost import XGBClassifier\nfrom transformers import RobertaForSequenceClassification, RobertaTokenizerFast", + "class": "Imports and Environment", + "desc": "This code snippet imports various libraries and modules essential for data manipulation (pandas, numpy), visualization (seaborn, matplotlib), natural language processing (spacy, transformers), machine learning (scikit-learn, xgboost), deep learning (torch, pytorch_lightning), and performance tracking (tqdm).", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9795735 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9993081 }, "cluster": 0 }, { - "cell_id": 22, - "code": "#Most Common Location\nplt.figure(figsize = (9,6))\nsns.countplot(y = df.location, order = df.location.value_counts().iloc[:15].index)\nplt.title('Top 15 Location')\nplt.show()", - "class": "Visualization", - "desc": "This code snippet creates a horizontal count plot using Seaborn to visualize the top 15 most common locations in the `df` DataFrame, displaying the frequency of each location with the title 'Top 15 Location'.", + "cell_id": 1, + "code": "np.random.seed(42)\n\n# prettier graphs!\nplt.style.use('ggplot')", + "class": "Imports and Environment", + "desc": "This code snippet sets a random seed using NumPy for reproducibility and configures Matplotlib to use the 'ggplot' style for prettier graphs.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.97139984 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.9983991 }, "cluster": 0 - }], - "notebook_id": 16, - "notebook_name": "nlp-from-beginner-to-expert" - }, { - "cells": [{ - "cell_id": 65, - "code": "# submit\nsubmission = 
pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")\nsubmission['target'] = np.round(test_pred).astype('int')\nsubmission.to_csv('submission.csv', index=False)\nsubmission.groupby('target').count()", - "class": "Data Export", - "desc": "This code populates a sample submission file with the predicted labels, saves it as a new CSV file for submission, and checks the distribution of the target variable in the submission dataset.", + }, { + "cell_id": 22, + "code": "majority = y.mode()[0] == y\nprint(f'Majority class baseline: {majority.mean()}')", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the baseline accuracy of predicting the majority class for the target variable in the training dataset and prints the result.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.99921453 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.91397643 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 1, - "code": "train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\nsubmit_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')", - "class": "Data Extraction", - "desc": "This code loads the training and test datasets from CSV files into pandas DataFrames for further processing.", + "cell_id": 23, + "code": "classifier = LogisticRegression()\ncv_scores = cross_val_score(classifier, x, y, scoring='f1', cv=10, n_jobs=-1)\nprint(f'Mean F1: {cv_scores.mean()}')", + "class": "Model Evaluation", + "desc": "This code snippet trains a Logistic Regression classifier and evaluates its performance using 10-fold cross-validation to compute the mean F1 score, using scikit-learn's `cross_val_score` method.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9997477 + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.97591215 }, - "cluster": 2 + "cluster": 0 }, { "cell_id": 26, - "code": "import requests\nurl = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'\nfilename = url.split('/')[-1]\nr = requests.get(url)\nwith open(filename, \"wb\") as file:\n file.write(r.content)\n \n!ls", - "class": "Data Extraction", - "desc": "This code downloads the pre-trained Google News word2vec model file from an S3 bucket and saves it locally, then lists the files in the directory to confirm the download.", + "code": "from sklearn.feature_selection import chi2, SelectKBest\n\nnum_features = [1000, 500, 250, 100, 50]\nf1 = []\nfor k in num_features:\n selector = SelectKBest(chi2, k=k)\n x_selected = selector.fit_transform(x, y)\n scores = cross_val_score(classifier, x_selected, y, scoring='f1', cv=10, n_jobs=-1)\n f1.append(scores.mean())\n", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the performance of a Logistic Regression classifier using 10-fold cross-validation to compute the mean F1 scores after selecting varying numbers of top features based on the chi-squared test, using scikit-learn's `SelectKBest`.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "list_files", - "subclass_id": 88, - "predicted_subclass_probability": 0.48958522 + "class": "Model_Train", + "subclass": "find_best_params", + "subclass_id": 2, + "predicted_subclass_probability": 0.48363486 }, "cluster": 0 
}, { - "cell_id": 58, - "code": "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py", - "class": "Data Extraction", - "desc": "This code downloads the 'tokenization.py' script from the TensorFlow models repository to facilitate tokenization for BERT-based models.", + "cell_id": 30, + "code": "regularization = [1, 0.1, 0.01, 0.001, 0.0001]\nl1_scores = []\nl2_scores = []\nl1_std = []\nl2_std = []\n\nfor value in regularization:\n log_reg = LogisticRegression(C=value)\n results = cross_val_score(log_reg, x_selected, y, scoring='f1', cv=10, n_jobs=-1)\n l2_scores.append(results.mean())\n l2_std.append(results.std())\n \n alpha = 1 / (2 * value) # as defined in sklearn\n ridge = RidgeClassifier(alpha=alpha)\n results = cross_val_score(ridge, x_selected, y, scoring='f1', cv=10, n_jobs=-1)\n l1_scores.append(results.mean())\n l1_std.append(results.std())", + "class": "Model Evaluation", + "desc": "This code snippet evaluates Logistic Regression with L2 regularization and Ridge Classifier with different regularization strengths, computing the mean and standard deviation of the F1 scores using 10-fold cross-validation.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_url", - "subclass_id": 42, - "predicted_subclass_probability": 0.8866123 + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.39513838 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 9, - "code": "data = pd.concat([train_data, submit_data])\ndata.shape", - "class": "Data Transform", - "desc": "This code concatenates the training and test datasets and then outputs the shape of the combined DataFrame.", + "cell_id": 32, + "code": "print(f'Best baseline F1: {l2_scores[1]}')", + "class": "Model Evaluation", + "desc": "This code snippet prints the best F1 score among the evaluated Logistic Regression models with L2 regularization.", "testing": { - "class": "Data_Transform", - "subclass": "concatenate", - "subclass_id": 11, - "predicted_subclass_probability": 0.9939737 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.5057336 }, "cluster": 1 }, { - "cell_id": 10, - "code": "data['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'https?\\S+'), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'[\\//:,.!?@&\\-\\'\\`\\\"\\_\\n\\#]'), ' ', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'<.*?>'), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(\"[\"\n u\"\\U0001F600-\\U0001F64F\" \n u\"\\U0001F300-\\U0001F5FF\" \n u\"\\U0001F680-\\U0001F6FF\" \n u\"\\U0001F1E0-\\U0001F1FF\" \n u\"\\U00002702-\\U000027B0\"\n u\"\\U000024C2-\\U0001F251\"\n \"]+\", flags=re.UNICODE), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'\\d'), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'[^\\w]'), ' ', x))\ndata['text'] = data['text'].str.lower()", - "class": "Data Transform", - "desc": "This code snippet applies multiple regular expressions to clean the text data by removing URLs, punctuation, HTML tags, emojis, digits, and non-word characters, and then converts all text to lowercase.", + "cell_id": 35, + "code": "classifier = LogisticRegression(C=0.1)\nselector = SelectKBest(chi2, k=500)\nx = selector.fit_transform(x, y)\ncv_scores = cross_validate(classifier, x, y, scoring='f1', cv=10, n_jobs=-1, 
return_train_score=True)\nmean_f1 = cv_scores['test_score'].mean()\nprint(f'Mean F1: {mean_f1}')", + "class": "Model Evaluation", + "desc": "This code snippet selects the top 500 features based on the chi-squared test, trains a Logistic Regression classifier with C=0.1, and evaluates its performance using 10-fold cross-validation to compute the mean F1 score, using scikit-learn's `cross_validate` method.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9965412 + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.98264277 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 11, - "code": "'''\ntext_series = data.loc[:,'text']\nfor i in range(len(text_series)):\n content = text_series.iloc[i]\n textblob = TextBlob(content)\n text_series.iloc[i] = textblob.correct()\n'''", - "class": "Data Transform", - "desc": "This commented-out code snippet iterates through text data to correct spelling errors using TextBlob, though it is not executed in this instance.", + "cell_id": 38, + "code": "c = RandomForestClassifier(n_estimators=100, n_jobs=-1)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "class": "Model Evaluation", + "desc": "This code snippet trains a Random Forest classifier with 100 estimators and evaluates its performance using 8-fold cross-validation to compute the train and validation F1 scores, then visualizes these scores using the `plot_model_score` function.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9023682 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.7403155 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 12, - "code": "clean_train = data[0:train_data.shape[0]]\nclean_submit = data[train_data.shape[0]:-1]\n\nX_train, X_test, y_train, y_test = train_test_split(clean_train['text'], clean_train['target'],\n test_size = 0.2, random_state = 4)", - "class": "Data Transform", - "desc": "This code separates the cleaned combined data back into training and submission datasets and then splits the training data into training and testing sets with a test size of 20%.", + "cell_id": 39, + "code": "c = RandomForestClassifier(n_estimators=100, min_samples_leaf=3)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "class": "Model Evaluation", + "desc": "This code snippet trains a Random Forest classifier with 100 estimators and a minimum of 3 samples per leaf, evaluates its performance using 8-fold cross-validation to compute the train and validation F1 scores, and visualizes these scores using the `plot_model_score` function.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.9978915 + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.8847018 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 13, - "code": "def tfidf(words):\n tfidf_vectorizer = TfidfVectorizer()\n data_feature = tfidf_vectorizer.fit_transform(words)\n return data_feature, tfidf_vectorizer\n\nX_train_tfidf, tfidf_vectorizer = 
tfidf(X_train.tolist())\nX_test_tfidf = tfidf_vectorizer.transform(X_test.tolist())", - "class": "Data Transform", - "desc": "This code defines a function to convert text data into TF-IDF features and then applies this function to both the training and test data for use in machine learning models.", + "cell_id": 40, + "code": "c = RandomForestClassifier(n_estimators=500, min_samples_split=10)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "class": "Model Evaluation", + "desc": "This code snippet trains a Random Forest classifier with 500 estimators and a minimum of 10 samples required to split a node, evaluates its performance using 8-fold cross-validation to compute the train and validation F1 scores, and visualizes these scores using the `plot_model_score` function.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9976406 + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.88807696 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 27, - "code": "stop_words = stopwords.words('english')\nfor word in ['us','no','yet']:\n stop_words.append(word)\n\ndata_list = []\ntext_series = data['text']\nfor i in range(len(text_series)):\n content = text_series.iloc[i]\n cutwords = [word for word in content.split(' ') if word not in stop_words if len(word) != 0]\n data_list.append(cutwords)", - "class": "Data Transform", - "desc": "This code enhances the English stopwords list by adding specific words, then tokenizes the text data by removing stopwords and empty tokens, and stores the cleaned text as lists of words.", + "cell_id": 41, + "code": "c = RandomForestClassifier(n_estimators=200, min_samples_split=5, max_depth=50)\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "class": "Model Evaluation", + "desc": "This code snippet trains a Random Forest classifier with 200 estimators, a minimum of 5 samples required to split a node, and a maximum depth of 50, evaluates its performance using 8-fold cross-validation to compute the train and validation F1 scores, and visualizes these scores using the `plot_model_score` function.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9944119 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.41321388 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 33, - "code": "def get_textVector(data_list, word2vec, textsVectors_list):\n for i in range(len(data_list)):\n words_perText = data_list[i]\n if len(words_perText) < 1:\n words_vector = [np.zeros(300)]\n else:\n words_vector = [word2vec.wv[k] if k in word2vec_model else np.zeros(300) for k in words_perText]\n text_vector = np.array(words_vector).mean(axis=0)\n textsVectors_list.append(text_vector)\n return textsVectors_list", - "class": "Data Transform", - "desc": "This code snippet defines a function that computes the averaged word vectors for each text in the dataset using a pre-trained word2vec model, storing the results in a list.", + "cell_id": 42, + "code": "c = XGBClassifier()\ncv_scores = cross_validate(c, x, y, scoring='f1', cv=8, n_jobs=-1, 
return_train_score=True)\nplot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "class": "Model Evaluation", + "desc": "This code snippet trains an XGBoost Classifier, evaluates its performance using 8-fold cross-validation to compute the train and validation F1 scores, and visualizes these scores using the `plot_model_score` function.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.966133 + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.8236538 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 34, - "code": "textsVectors_list = []\nget_textVector(data_list, word2vec_model, textsVectors_list)\nX = np.array(textsVectors_list)", - "class": "Data Transform", - "desc": "This code applies the `get_textVector` function to the tokenized text data to generate averaged word vectors, then stores these vectors in a NumPy array for use in machine learning models.", + "cell_id": 61, + "code": "# trainer.predict returns a list with batch results\nlogits = np.concatenate(trainer.predict(model, test_loader))", + "class": "Model Evaluation", + "desc": "This code snippet uses the PyTorch Lightning `Trainer` to predict logits for the test dataset using the trained `BagOfEmbeddingsClassifier` model, and concatenates the batch results into a single array.", "testing": { - "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.91989833 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99380153 }, "cluster": 1 }, { - "cell_id": 36, - "code": "word2vec_X = X[0:train_data.shape[0]]\ny = data['target'][0:train_data.shape[0]]\nword2vec_submit = X[train_data.shape[0]:-1]\n\nX_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(word2vec_X, y,\n test_size = 0.2, random_state = 4)", - "class": "Data Transform", - "desc": "This code separates the averaged word vectors back into training and submission datasets, then splits the training data into training and testing sets with a test size of 20%.", + "cell_id": 24, + "code": "k = 50\nclassifier = LogisticRegression(max_iter=500)\nclassifier.fit(x, y)\nplot_top_values(classifier.coef_[0], k, vocab, 'Type', 'Weight', use_abs=True)", + "class": "Model Training", + "desc": "This code snippet trains a Logistic Regression classifier with a maximum of 500 iterations on the bag-of-words representation of the training data, then visualizes the top k feature weights of the model using the `plot_top_values` function.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.9981791 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.7323191 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 44, - "code": "tokenizer = Tokenizer()\ntokenizer.fit_on_texts(data_list)\nsequences = tokenizer.texts_to_sequences(data_list)\nword_index = tokenizer.word_index\ncnn_data = pad_sequences(sequences, maxlen = max_sequence_length)\ncnn_label = to_categorical(np.asarray(train_data['target']))\nprint('len of word_index:', len(word_index))\nprint('shape of data tensor:', cnn_data.shape)\nprint('shape of label tensoe:', cnn_label.shape)", - "class": "Data Transform", - "desc": "This code initializes a tokenizer, fits it 
to the tokenized text data, converts the text data to sequences of integers, pads these sequences to the maximum length, and converts the labels to a categorical format, while printing the sizes of the word index, data tensor, and label tensor.", + "cell_id": 28, + "code": "selector = SelectKBest(chi2, k=250)\nx_selected = selector.fit_transform(x, y)\nvocab = [vocab[i] for i, selected in enumerate(selector.get_support()) if selected]\nclassifier.fit(x_selected, y)\nplot_top_values(classifier.coef_[0], k, vocab, 'Type', 'Weight', use_abs=True)", + "class": "Model Training", + "desc": "This code snippet selects the top 250 features based on the chi-squared test, updates the vocabulary, trains a Logistic Regression classifier on the selected features, and visualizes the top k feature weights of the model using the `plot_top_values` function.", + "testing": { + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.58552986 + }, + "cluster": 0 + }, { + "cell_id": 52, + "code": "class BagOfEmbeddingsClassifier(pl.LightningModule):\n def __init__(self, embeddings, learning_rate=0.001, l2=0.001):\n super().__init__()\n self.learning_rate = learning_rate\n self.l2 = l2\n \n vocab_size, embedding_dim = embeddings.shape\n self.embedding_bag = nn.EmbeddingBag.from_pretrained(embeddings, freeze=False)\n \n # a single output value determines the probability of class 1 with a sigmoid function\n self.linear = nn.Linear(embedding_dim, 1, bias=True)\n \n def forward(self, x):\n \"\"\"x is a list of tensors with any shape\"\"\"\n # embedding bag operates with a single tensor of concatenated inputs and another of offsets\n lengths = torch.tensor([0] + [len(sample) for sample in x[:-1]])\n offsets = lengths.cumsum(0).to(x[0].device)\n x = torch.cat(x)\n embedded = self.embedding_bag(x, offsets)\n logits = self.linear(embedded).squeeze(-1)\n return logits\n \n def _get_loss_and_acc(self, logits, y):\n \"\"\"Internal function\"\"\"\n predicted = logits > 0\n acc = (predicted == y).float().mean()\n loss = F.binary_cross_entropy_with_logits(logits, y.float())\n \n return loss, acc\n \n def on_fit_start(self): \n self.train_losses = []\n self.train_accs = []\n self.valid_losses = []\n self.valid_accs = []\n \n self.reset_metrics()\n \n def reset_metrics(self):\n self.partial_train_losses = []\n self.partial_train_accs = []\n self.partial_valid_losses = []\n self.partial_valid_accs = []\n \n def on_validation_end(self):\n self.train_losses.append(np.array(self.partial_train_losses).mean())\n self.train_accs.append(np.array(self.partial_train_accs).mean())\n self.valid_losses.append(np.array(self.partial_valid_losses).mean())\n self.valid_accs.append(np.array(self.partial_valid_accs).mean())\n self.reset_metrics()\n \n def training_step(self, batch, batch_idx):\n \"\"\"\n batch is a tuple (x, y)\n x is a list of tensors as in forward\n y is a tensor with the classes\n \"\"\"\n x, y = batch\n logits = self(x)\n loss, acc = self._get_loss_and_acc(logits, y)\n \n # ideally we'd use tensorboard to see the graphs, but currently it is disabled in Kaggle\n # so we resort to manually plotting\n# self.log('train_loss', loss)\n# self.log('train_acc', acc)\n self.partial_train_losses.append(loss.detach().cpu().numpy())\n self.partial_train_accs.append(acc.detach().cpu().numpy())\n \n return loss\n \n def validation_step(self, batch, batch_idx):\n x, y = batch\n logits = self(x)\n loss, acc = self._get_loss_and_acc(logits, y)\n \n# self.log('valid_loss', loss, 
on_step=True, on_epoch=True, prog_bar=True, logger=True)\n# self.log('valid_acc', acc)\n self.partial_valid_losses.append(loss.detach().cpu().numpy())\n self.partial_valid_accs.append(acc.detach().cpu().numpy())\n \n return loss\n \n def configure_optimizers(self):\n optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=self.l2)\n return optimizer", + "class": "Model Training", + "desc": "This code snippet defines a PyTorch Lightning module class `BagOfEmbeddingsClassifier` that uses an embedding bag layer for text classification, includes methods for forward propagation, training and validation steps, optimizer configuration, and metrics tracking.", + "testing": { + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.94578296 + }, + "cluster": 3 + }, { + "cell_id": 56, + "code": "model = BagOfEmbeddingsClassifier(embeddings, 0.001, l2=0)\nbatch = next(iter(train_loader))\n\n# batch is x, y\nlogits = model(batch[0])\nprint(logits)", + "class": "Model Training", + "desc": "This code snippet initializes an instance of the `BagOfEmbeddingsClassifier` model with specified learning rate and L2 regularization, retrieves a batch from the training DataLoader, and performs a forward pass to print the logits.", + "testing": { + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9270925 + }, + "cluster": 0 + }, { + "cell_id": 57, + "code": "trainer = pl.Trainer(gpus=1, max_epochs=5, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", + "class": "Model Training", + "desc": "This code snippet initializes a PyTorch Lightning `Trainer` to run on one GPU for a maximum of 5 epochs, with validation checks at 50% intervals, and fits the `BagOfEmbeddingsClassifier` model using the training and validation DataLoaders.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.6572918 + "predicted_subclass_probability": 0.999678 }, "cluster": 1 }, { - "cell_id": 45, - "code": "trainCNN_data = cnn_data[0:train_data.shape[0]]\nX_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(trainCNN_data, cnn_label,\n test_size = 0.2, random_state = 4)\nX_cnn, X_val_cnn, y_cnn, y_val_cnn = train_test_split(X_train_cnn, y_train_cnn,\n test_size = 0.2, random_state = 4)", - "class": "Data Transform", - "desc": "This code separates the padded text sequences back into training and submission datasets, then further splits the training data into training, validation, and testing sets, with a validation split to fine-tune model performance.", + "cell_id": 59, + "code": "model = BagOfEmbeddingsClassifier(embeddings, 0.001, l2=0.0001)\ntrainer = pl.Trainer(gpus=1, max_epochs=6, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", + "class": "Model Training", + "desc": "This code snippet initializes a new instance of the `BagOfEmbeddingsClassifier` with updated L2 regularization, and fits it using a PyTorch Lightning `Trainer` set to run on one GPU for a maximum of 6 epochs with validation checks at 50% intervals.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.99836665 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9926635 }, "cluster": 1 }, { - "cell_id": 50, - "code": "embedding_matrix = 
np.zeros((len(word_index) + 1, embedding_dim))\nfor word, i in word_index.items(): \n if word in word2vec_model:\n embedding_matrix[i] = np.asarray(word2vec_model.wv[word])", - "class": "Data Transform", - "desc": "This code creates an embedding matrix initialized with zeros, and then fills it with word vectors from the pre-trained word2vec model for each word in the tokenizer's word index to be used in embedding layers.", + "cell_id": 66, + "code": "class TransformerWrapper(pl.LightningModule):\n def __init__(self, transformer, learning_rate=0.001, l2=0.0001):\n super().__init__()\n self.model = transformer\n self.learning_rate = learning_rate\n self.l2 = l2\n \n def forward(self, batch):\n x, mask = batch\n output = self.model(x, mask)\n return output.logits\n \n def training_step(self, batch, batch_idx):\n loss, acc = self._get_loss_and_acc(batch)\n self.partial_train_losses.append(loss.detach().cpu().numpy())\n self.partial_train_accs.append(acc.detach().cpu().numpy())\n \n return loss\n \n def _get_loss_and_acc(self, batch):\n x, mask, y = batch\n output = self.model(x, mask, labels=y)\n loss = output.loss\n logits = output.logits\n \n predicted = logits.argmax(1)\n acc = (predicted == y).float().mean()\n \n return loss, acc\n \n # these functions are copied from the BagOfWords class to allow ploting without tensorboard\n # ideally, we'd inherit from a common base class. well, ideally we'd have access to tensorboard and none of this would exist :)\n def on_fit_start(self): \n self.train_losses = []\n self.train_accs = []\n self.valid_losses = []\n self.valid_accs = []\n \n self.reset_metrics()\n \n def reset_metrics(self):\n self.partial_train_losses = []\n self.partial_train_accs = []\n self.partial_valid_losses = []\n self.partial_valid_accs = []\n \n def on_validation_end(self):\n self.train_losses.append(np.array(self.partial_train_losses).mean())\n self.train_accs.append(np.array(self.partial_train_accs).mean())\n self.valid_losses.append(np.array(self.partial_valid_losses).mean())\n self.valid_accs.append(np.array(self.partial_valid_accs).mean())\n self.reset_metrics()\n \n def validation_step(self, batch, batch_idx):\n loss, acc = self._get_loss_and_acc(batch)\n \n# self.log('valid_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)\n# self.log('valid_acc', acc)\n self.partial_valid_losses.append(loss.cpu().numpy())\n self.partial_valid_accs.append(acc.cpu().numpy())\n \n return loss\n \n def configure_optimizers(self):\n # to make it lighter, fine tune only the classifier on top of the language model\n parameters = [p[1] for p in self.model.named_parameters() if p[0].startswith('classifier')]\n optimizer = torch.optim.AdamW(parameters, lr=self.learning_rate, weight_decay=self.l2)\n return optimizer", + "class": "Model Training", + "desc": "This code snippet defines a PyTorch Lightning module class `TransformerWrapper` that wraps a transformer model for text classification, including methods for forward propagation, training and validation steps, optimizer configuration, and metrics tracking, with a focus on fine-tuning only the classification layer.", "testing": { "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.739272 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.48894703 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 62, - "code": "# read and encode train data\ntrain = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\n\ntrain_input 
= bert_encode(train.text.values, bert_layer, max_len=128)\ntrain_labels = np.array(train.target)", - "class": "Data Transform", - "desc": "This code reads the training data from a CSV file, encodes the text data using the BERT tokenizer, and converts the labels to a NumPy array for use in model training.", + "cell_id": 67, + "code": "model = TransformerWrapper(roberta, 0.001, l2=0)\ntrainer = pl.Trainer(gpus=1, max_epochs=6, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", + "class": "Model Training", + "desc": "This code snippet initializes an instance of the `TransformerWrapper` with the pre-trained RoBERTa model, creates a PyTorch Lightning `Trainer` to run with one GPU for up to 6 epochs with validation checks at 50% intervals, and fits the model using the training and validation DataLoaders.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99927765 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9910779 }, "cluster": 1 }, { - "cell_id": 2, - "code": "train_data[train_data['text'].isna()]", - "class": "Exploratory Data Analysis", - "desc": "This code snippet checks for rows in the training data where the 'text' column has missing values. ", + "cell_id": 69, + "code": "roberta = RobertaForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)\nmodel = TransformerWrapper(roberta, 0.01, l2=0)\ntrainer = pl.Trainer(gpus=1, max_epochs=4, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", + "class": "Model Training", + "desc": "This code snippet reinitializes the RoBERTa model for sequence classification, instantiates a new `TransformerWrapper` with a higher learning rate, creates a PyTorch Lightning `Trainer` to run on one GPU for up to 4 epochs with validation checks at 50% intervals, and fits the model using the training and validation DataLoaders.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.88515306 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.98994815 }, - "cluster": 4 + "cluster": 1 }, { - "cell_id": 3, - "code": "train_data.info()", - "class": "Exploratory Data Analysis", - "desc": "This code provides concise summary information about the DataFrame, including the data types, non-null counts, and memory usage of each column.", + "cell_id": 71, + "code": "roberta = RobertaForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)\n\ntrain_loader = DataLoader(train_data, batch_size=128)\n\nmodel = TransformerWrapper(roberta, 0.005, l2=0)\ntrainer = pl.Trainer(gpus=1, max_epochs=4, val_check_interval=0.5)\ntrainer.fit(model, train_loader, valid_loader)", + "class": "Model Training", + "desc": "This code snippet reinitializes the RoBERTa model for sequence classification, recreates the training DataLoader with a larger batch size of 128, instantiates a new `TransformerWrapper` with an adjusted learning rate, and fits the model using the PyTorch Lightning `Trainer` over 4 epochs with validation checks at 50% intervals.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.99936634 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 
0.898724 }, - "cluster": 11 + "cluster": 1 }, { "cell_id": 4, - "code": "train_data.groupby('target').count()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet groups the training data by the 'target' column and counts the number of occurrences in each group to understand the distribution of the target variable.", + "code": "target_counts = train.target.value_counts()\nsns.barplot(y=target_counts, x=target_counts.index)\nplt.ylabel('Samples')\nplt.title('Target')\nplt.show()", + "class": "Visualization", + "desc": "This code snippet visualizes the distribution of the target variable in the training dataset by creating a bar plot using Seaborn and Matplotlib.", "testing": { - "class": "Data_Transform", - "subclass": "groupby", - "subclass_id": 60, - "predicted_subclass_probability": 0.9970409 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99522096 }, "cluster": -1 }, { - "cell_id": 14, - "code": "X_train_tfidf.shape", - "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the shape of the TF-IDF transformed training data to check its dimensions.", + "cell_id": 5, + "code": "has_kw = ~train.keyword.isna()\nfig, ax = plt.subplots(1, 2, sharey=True)\ntrain[has_kw]\nsns.countplot(data=train[has_kw], x='target', ax=ax[0])\nax[0].set_title('With keyword')\nsns.countplot(data=train[~has_kw], x='target', ax=ax[1])\nax[1].set_title('Without keyword')\nplt.show()", + "class": "Visualization", + "desc": "This code snippet creates count plots using Seaborn and Matplotlib to compare the distribution of the target variable for samples with and without keywords in the training dataset.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9994598 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.98656905 }, "cluster": -1 }, { - "cell_id": 28, - "code": "for i in range(len(data_list)):\n content = data_list[i]\n if len(content) <1:\n print(i)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet iterates through the tokenized text data to print the indices of any entries that result in empty lists after stopword removal, identifying potential issues in the data.", + "cell_id": 6, + "code": "has_loc = ~train.location.isna()\nsns.countplot(x=has_loc)\nplt.xlabel('Has location')", + "class": "Visualization", + "desc": "This code snippet creates a count plot using Seaborn and Matplotlib to visualize the number of samples in the training dataset that have or do not have the 'location' field specified.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.8840107 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99126256 }, - "cluster": 6 + "cluster": -1 }, { - "cell_id": 29, - "code": "data_list[7626]", - "class": "Exploratory Data Analysis", - "desc": "This code retrieves and displays the tokenized text for the entry at index 7626 to inspect its contents following text preprocessing.", + "cell_id": 7, + "code": "loc_count = train.location.value_counts()\ntop_loc = loc_count.iloc[:50]\nplt.subplots(figsize=(20, 8))\nplt.xticks(rotation=80)\nsns.barplot(x=top_loc.index, y=top_loc)", + "class": "Visualization", + "desc": "This code snippet visualizes the top 50 most common 
locations in the training dataset by creating a bar plot using Seaborn and Matplotlib, with rotated x-axis labels for readability.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9993057 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99783665 }, "cluster": -1 }, { - "cell_id": 32, - "code": "word2vec_model.wv['earthquake'].shape", - "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the shape of the word vector for the word 'earthquake' from the loaded word2vec model, confirming the dimensionality of the word embeddings.", + "cell_id": 16, + "code": "train['num_tokens'] = train.tokens.apply(len)\nplt.hist(train.num_tokens, bins=20)\nplt.show()", + "class": "Visualization", + "desc": "This code snippet adds a column 'num_tokens' to the training dataset, representing the number of tokens in each row, and then plots a histogram of these token counts using Matplotlib.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9989441 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.99763453 }, "cluster": -1 }, { - "cell_id": 35, - "code": "pd.isnull(X).any()", - "class": "Exploratory Data Analysis", - "desc": "This code checks the array of averaged word vectors for any null values to ensure data integrity before further processing.", + "cell_id": 17, + "code": "inds40 = train.num_tokens <= 40\nfig, ax = plt.subplots(figsize=(16, 8))\nplt.hist(train[inds40 & train.target].num_tokens, bins=20, alpha=0.5, label='Positive', density=True)\nplt.hist(train[inds40 & ~train.target].num_tokens, bins=20, alpha=0.5, label='Negative', density=True)\nplt.legend()\nplt.title('Tweet length distribution')\nplt.show()", + "class": "Visualization", + "desc": "This code snippet filters the training dataset for entries with 40 or fewer tokens, then plots overlapping histograms to compare the distribution of tweet lengths for positive and negative target classes using Matplotlib.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_missing_values", - "subclass_id": 39, - "predicted_subclass_probability": 0.9978543 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9983352 }, "cluster": -1 }, { - "cell_id": 37, - "code": "print(X_train_word2vec.shape, y_train_word2vec.shape)", - "class": "Exploratory Data Analysis", - "desc": "This code prints the shapes of the averaged word vector training data and its corresponding labels to confirm the dimensions are as expected for model training.", + "cell_id": 19, + "code": "def plot_top_values(data, k, names, xlabel=None, ylabel=None, use_abs=False):\n \"\"\"\n Function to plot a barplot with counts of the top k items in data and their corresponding names.\n \n Args:\n data: a numpy array\n k: int\n names: list of strings corresponding to the positions in data\n use_abs: if True, take the highest absolute values\n \"\"\"\n if use_abs:\n inds = np.abs(data).argsort()\n else:\n inds = data.argsort()\n \n # inverted argsort and top k\n top_inds = inds[::-1][:k]\n top_values = data[top_inds]\n top_names = [names[i] for i in top_inds]\n \n fig, ax = plt.subplots(figsize=(16, 8))\n plt.bar(np.arange(k), top_values)\n if ylabel:\n ax.set_ylabel(ylabel)\n if 
xlabel:\n ax.set_xlabel(xlabel)\n ax.set_xticks(np.arange(k))\n ax.set_xticklabels(top_names, rotation=80)\n fig.tight_layout()", + "class": "Visualization", + "desc": "This code snippet defines a function `plot_top_values`, which plots a bar plot of the top k items in a given data array along with their corresponding names, using Matplotlib, with options to label the x and y axes and to use absolute values.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9980696 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9957224 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 42, - "code": "lenlen = []\nfor i in range(len(data_list)):\n content = data_list[i]\n perlen = len(content)\n lenlen.append(perlen)\nprint(max(lenlen))", - "class": "Exploratory Data Analysis", - "desc": "This code calculates the length of each tokenized text entry in the dataset and then prints the maximum length to provide insights into the data's distribution.", + "cell_id": 20, + "code": "k = 50\n\nvocab = vectorizer.get_feature_names()\nword_count = train_bow.toarray().sum(0)\n\nplot_top_values(word_count, k, vocab, 'Count', 'Type')", + "class": "Visualization", + "desc": "This code snippet identifies the top 50 most frequent words in the training dataset's bag-of-words representation and plots their counts using the `plot_top_values` function and Matplotlib.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.7533818 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.97462577 }, - "cluster": 6 + "cluster": -1 }, { - "cell_id": 0, - "code": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom nltk.corpus import stopwords\nfrom nltk.util import ngrams\nfrom nltk.stem import WordNetLemmatizer\nimport re\nfrom textblob import TextBlob\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\nimport tensorflow as tf\nfrom keras.preprocessing.text import Tokenizer\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.utils import to_categorical\nfrom gensim.models import Word2Vec\nfrom gensim.models.keyedvectors import KeyedVectors\nimport time\nfrom keras.layers import Dense, Input, Flatten, Dropout\nfrom keras.layers import Conv1D, MaxPooling1D, Embedding\nfrom keras.models import Sequential\nfrom keras import losses\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.models import Model", - "class": "Imports and Environment", - "desc": "This code imports numerous libraries and modules necessary for data manipulation, visualization, natural language processing, machine learning, and deep learning model implementation.", + "cell_id": 27, + "code": "ticks = np.arange(len(f1))\nplt.plot(ticks, f1)\nplt.xticks(ticks, [str(k) for k in num_features])\nplt.title('F1 per number of features (chi2 selector)')\nplt.show()", + "class": "Visualization", + "desc": "This code snippet plots the mean F1 
scores for different numbers of selected features to visualize the relationship between the number of features and model performance using Matplotlib.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9993228 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.91531163 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 5, - "code": "%matplotlib inline", - "class": "Imports and Environment", - "desc": "This command configures the Jupyter notebook to display matplotlib plots inline within the notebook cells.", + "cell_id": 29, + "code": "rows = get_rows_containing(train, 'ebay')\nsns.countplot(x='target', data=rows)\nplt.title('Target distribution containing \"ebay\"')\nplt.show()", + "class": "Visualization", + "desc": "This code snippet visualizes the distribution of the target variable for rows in the training dataset that contain the token 'ebay' by creating a count plot using Seaborn and Matplotlib.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.9991172 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9911773 }, "cluster": -1 }, { - "cell_id": 25, - "code": "!pip install gensim -i http://pypi.douban.com/simple --trusted-host pypi.douban.com", - "class": "Imports and Environment", - "desc": "This code installs the 'gensim' library from the specified PyPI source to ensure that the necessary package for word embeddings is available.", + "cell_id": 31, + "code": "n = np.arange(len(regularization)) + 1\nfig, ax = plt.subplots(figsize=(14, 6))\nwidth = 0.4\n\nax.bar(n, l1_scores, width, label='L1 reg', yerr=l1_std)\nax.bar(n + width, l2_scores, width, label='L2 reg', yerr=l2_std)\nax.set_xlabel('Regularization (lower is stronger)')\nax.set_ylabel('Mean F1')\nax.set_xticks(n + width / 2)\nax.set_xticklabels([str(val) for val in regularization])\nax.legend(loc='best')\n", + "class": "Visualization", + "desc": "This code snippet creates a bar plot to compare the mean F1 scores of Logistic Regression with L2 regularization and Ridge Classifier across different regularization strengths, including error bars for the standard deviation, using Matplotlib.", "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.9962657 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.97958606 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 56, - "code": "tf.__version__", - "class": "Imports and Environment", - "desc": "This code outputs the version of the installed TensorFlow library to confirm its compatibility and configuration.", + "cell_id": 36, + "code": "def plot_model_score(train_scores, valid_scores):\n \"\"\"Plot train and validation score for comparison and checking overfitting\"\"\"\n mean_train = train_scores.mean()\n mean_valid = valid_scores.mean()\n fig, ax = plt.subplots()\n plt.bar(0, mean_train, yerr=train_scores.std())\n plt.bar(1, mean_valid, yerr=valid_scores.std())\n ax.text(0, mean_train + 0.01, f'{mean_train:.4f}')\n ax.text(1, mean_valid + 0.01, f'{mean_valid:.4f}')\n plt.title('Model F1 and standard deviation')\n plt.xticks([0, 1], ['Train', 'Validation'])\n ymin = np.min([mean_train, mean_valid]) * 0.8\n 
plt.ylim(bottom=ymin)\n plt.show()", + "class": "Visualization", + "desc": "This code snippet defines a function `plot_model_score` that creates a bar plot comparing the mean train and validation F1 scores for a model, including error bars for standard deviation, using Matplotlib.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9983907 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.91516846 }, "cluster": -1 }, { - "cell_id": 57, - "code": "import tensorflow_hub as hub\nhub.__version__", - "class": "Imports and Environment", - "desc": "This code imports the TensorFlow Hub library and outputs its version to confirm its compatibility and configuration.", + "cell_id": 37, + "code": "plot_model_score(cv_scores['train_score'], cv_scores['test_score'])", + "class": "Visualization", + "desc": "This code snippet calls the `plot_model_score` function to visualize and compare the train and validation F1 scores, including their standard deviations, using data from the cross-validation results.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99918145 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.977454 }, "cluster": -1 }, { - "cell_id": 59, - "code": "import tensorflow as tf\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nimport tensorflow_hub as hub\n\nimport tokenization", - "class": "Imports and Environment", - "desc": "This code imports TensorFlow, model checkpointing functionality from Keras, TensorFlow Hub, and the downloaded tokenization script to prepare for using BERT-based models.", + "cell_id": 53, + "code": "def plot_model_performance(model):\n fig, ax = plt.subplots(2, 1, figsize=(16, 8))\n ax[0].set_title('Loss')\n ax[1].set_title('Accuracy')\n\n n = np.arange(len(model.train_losses))\n ax[0].plot(n, model.train_losses, 'bo', label='Train', linestyle='--')\n ax[1].plot(n, model.train_accs, 'bo', linestyle='--')\n ax[0].plot(n, model.valid_losses, 'ro', label='Validation', linestyle='--')\n ax[1].plot(n, model.valid_accs, 'ro', linestyle='--')\n ax[0].legend()\n plt.show()", + "class": "Visualization", + "desc": "This code snippet defines a function `plot_model_performance` that creates line plots to visualize the training and validation loss and accuracy metrics over epochs for a given model using Matplotlib.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99932754 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.9959293 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 16, - "code": "def score_metrics(y_test, y_predicted):\n accuracy = accuracy_score(y_test, y_predicted)\n precision = precision_score(y_test, y_predicted)\n recall = recall_score(y_test, y_predicted)\n print(\"accuracy = %0.3f, precision = %0.3f, recall = %0.3f\" % (accuracy, precision, recall))", - "class": "Model Evaluation", - "desc": "This code defines a function to calculate and print the accuracy, precision, and recall metrics for the given true and predicted labels to evaluate the performance of a model.", + "cell_id": 58, + "code": "plot_model_performance(model)", + "class": "Visualization", + "desc": 
"This code snippet visualizes the training and validation loss and accuracy of the `BagOfEmbeddingsClassifier` model over epochs using the `plot_model_performance` function and Matplotlib.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9981325 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.8608386 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 17, - "code": "score_metrics(y_test, y_predicted_lr)", - "class": "Model Evaluation", - "desc": "This code calculates and prints the accuracy, precision, and recall for the Logistic Regression model's predictions on the test data.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9981065 - }, - "cluster": 0 - }, { - "cell_id": 18, - "code": "def plot_confusion_matrix(y_test, y_predicted, title='Confusion Matrix'):\n cm = confusion_matrix(y_test, y_predicted)\n plt.figure(figsize=(8,6))\n sns.heatmap(cm,annot=True, fmt='.20g')\n plt.title(title)\n plt.ylabel('True label')\n plt.xlabel('Predicted label')", - "class": "Model Evaluation", - "desc": "This code defines a function to plot a confusion matrix using a heatmap for visualizing the performance of a classification model by comparing true and predicted labels.", + "cell_id": 60, + "code": "plot_model_performance(model)", + "class": "Visualization", + "desc": "This code snippet visualizes the training and validation loss and accuracy of the new `BagOfEmbeddingsClassifier` model over epochs using the `plot_model_performance` function and Matplotlib.", "testing": { "class": "Visualization", - "subclass": "heatmap", - "subclass_id": 80, - "predicted_subclass_probability": 0.7525936 + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.8608386 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 19, - "code": "plot_confusion_matrix(y_test, y_predicted_lr)", - "class": "Model Evaluation", - "desc": "This code generates and visualizes the confusion matrix for the Logistic Regression model's predictions on the test data using the previously defined function.", + "cell_id": 68, + "code": "plot_model_performance(model)", + "class": "Visualization", + "desc": "This code snippet visualizes the training and validation loss and accuracy of the `TransformerWrapper` model over epochs using the `plot_model_performance` function and Matplotlib.", "testing": { "class": "Visualization", - "subclass": "plot_predictions", - "subclass_id": 56, - "predicted_subclass_probability": 0.76843596 + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.8608386 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 20, - "code": "# fail to sort and plot the top 10 most important features in disaster and non-disaster text\n'''\nindex_to_word = [(v,k) for k,v in tfidf_vectorizer.vocabulary_.items()]\nsorted(index_to_word, key=lambda x: x[0], reverse=True)\n'''", - "class": "Model Evaluation", - "desc": "This commented-out code snippet attempts to create a list of (index, word) tuples from the TF-IDF vectorizer's vocabulary and then sort them in reverse order to identify the top 10 most important features for disaster and non-disaster texts.", + "cell_id": 70, + "code": "plot_model_performance(model)", + "class": "Visualization", + "desc": "This code snippet visualizes the training and validation 
loss and accuracy of the newly initialized `TransformerWrapper` model over epochs using the `plot_model_performance` function and Matplotlib.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.8911329 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.8608386 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 23, - "code": "score_metrics(y_test, y_predicted_dt)", - "class": "Model Evaluation", - "desc": "This code calculates and prints the accuracy, precision, and recall metrics for the Decision Tree model's predictions on the test data.", + "cell_id": 72, + "code": "plot_model_performance(model)", + "class": "Visualization", + "desc": "This code snippet visualizes the training and validation loss and accuracy of the updated `TransformerWrapper` model over epochs using the `plot_model_performance` function and Matplotlib.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9980464 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.8608386 }, - "cluster": 0 - }, { - "cell_id": 24, - "code": "plot_confusion_matrix(y_test, y_predicted_dt)", - "class": "Model Evaluation", - "desc": "This code generates and visualizes the confusion matrix for the Decision Tree model's predictions on the test data using the previously defined function.", + "cluster": -1 + }], + "notebook_id": 16, + "notebook_name": "nlp-for-tweets-from-bag-of-words-to-transformers.ipynb" + }, { + "cells": [{ + "cell_id": 13, + "code": "def submission(submission_file_path,model,test_vectors):\n sample_submission = pd.read_csv(submission_file_path)\n sample_submission[\"target\"] = model.predict(test_vectors)\n sample_submission.to_csv(\"submission.csv\", index=False)", + "class": "Data Export", + "desc": "The code snippet defines a `submission` function that reads a sample submission file, uses the given model to predict target values for the provided test vectors, and writes the results to a new CSV file named \"submission.csv\".", "testing": { - "class": "Visualization", - "subclass": "plot_predictions", - "subclass_id": 56, - "predicted_subclass_probability": 0.7584198 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9992994 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 39, - "code": "score_metrics(y_test_word2vec, y_predicted_word2vec_lr)", - "class": "Model Evaluation", - "desc": "This code calculates and prints the accuracy, precision, and recall metrics for the Logistic Regression model's predictions on the test data of averaged word vectors.", + "cell_id": 14, + "code": "submission_file_path = \"../input/nlp-getting-started/sample_submission.csv\"\ntest_vectors=test_tfidf\nclf = clf_xgb_TFIDF\nsubmission(submission_file_path,clf,test_vectors)", + "class": "Data Export", + "desc": "The code snippet sets the paths and variables for the submission file, the test vectors, and the classifier model, then calls the `submission` function to generate and save the final predictions in a CSV file named \"submission.csv\".", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9982146 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + 
"subclass_id": 77, + "predicted_subclass_probability": 0.7684079 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 40, - "code": "plot_confusion_matrix(y_test_word2vec, y_predicted_word2vec_lr)", - "class": "Model Evaluation", - "desc": "This code generates and visualizes the confusion matrix for the Logistic Regression model's predictions on the test data of averaged word vectors using the previously defined function.", + "cell_id": 1, + "code": "train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\nX_train = train.iloc[:, :4]\ny_train = train.iloc[:, 4]\nX_test = test\nprint(X_train.shape, y_train.shape, X_test.shape)", + "class": "Data Extraction", + "desc": "The code snippet reads the training and testing datasets from CSV files into pandas DataFrames, and then splits the training data into features and target variables, printing the shapes of these datasets.", "testing": { - "class": "Visualization", - "subclass": "plot_predictions", - "subclass_id": 56, - "predicted_subclass_probability": 0.84631115 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99967587 }, "cluster": 0 }, { - "cell_id": 41, - "code": "compare_list = []\nfor (i,j) in zip(y_test_word2vec, y_predicted_word2vec_lr):\n k = i - j\n compare_list.append(k)\n\nwrong_num = [i for i,j in enumerate(compare_list) if j != 0]\ntext_series[0:train_data.shape[0]][wrong_num]", - "class": "Model Evaluation", - "desc": "This code identifies the indices of misclassified samples by comparing the true and predicted labels, and then retrieves the corresponding text entries for these misclassified instances.", + "cell_id": 2, + "code": "def lowercase_text(text):\n return text.lower()\n\nX_train.text=X_train.text.apply(lambda x: lowercase_text(x))\nX_test.text=X_test.text.apply(lambda x: lowercase_text(x))\nX_train.head()", + "class": "Data Transform", + "desc": "The code snippet defines a function to convert text to lowercase and applies this transformation to the 'text' column of both the training and testing features using the pandas `apply` method.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.96445125 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9221044 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 49, - "code": "test_loss, test_acc = CNNmodel.evaluate(X_test_cnn, y_test_cnn, verbose=2)\nprint('test loss:',test_loss)\nprint('test acc:',test_acc)", - "class": "Model Evaluation", - "desc": "This code evaluates the CNN model on the test dataset, printing the test loss and accuracy to assess the model's final performance.", + "cell_id": 3, + "code": "import re\nimport string\ndef remove_noise(text):\n text = re.sub('\\[.*?\\]', '', text)\n text = re.sub('https?://\\S+|www\\.\\S+', '', text)\n text = re.sub('<.*?>+', '', text)\n text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n text = re.sub('\\n', '', text)\n text = re.sub('\\w*\\d\\w*', '', text)\n text = re.sub('\u0089\u00fb\u00f2', '', text)\n return text\nX_train.text=X_train.text.apply(lambda x: remove_noise(x))\nX_test.text=X_test.text.apply(lambda x: remove_noise(x))\nX_train.head()", + "class": "Data Transform", + "desc": "The code snippet defines and applies a `remove_noise` function using regular expressions to clean text by removing various 
unwanted patterns such as square brackets, URLs, HTML tags, punctuations, newlines, and digits from the 'text' column of both training and testing features.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9945109 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.7514645 }, "cluster": 0 }, { - "cell_id": 55, - "code": "test_loss, test_acc = model.evaluate(X_test_cnn, y_test_cnn, verbose=2)\nprint('test loss:',test_loss)\nprint('test acc:',test_acc)", - "class": "Model Evaluation", - "desc": "This code evaluates the CNN model on the test dataset, printing the test loss and accuracy to assess the model's final performance.", + "cell_id": 4, + "code": "# Tokenizing the training and the test set\nimport nltk\nfrom nltk.corpus import stopwords\ntokenizer = nltk.tokenize.RegexpTokenizer(r'\\w+')\nX_train['text'] = X_train['text'].apply(lambda x: tokenizer.tokenize(x))\nX_test['text'] = X_test['text'].apply(lambda x: tokenizer.tokenize(x))\nX_train['text'].head()", + "class": "Data Transform", + "desc": "The code snippet tokenizes the text in the 'text' column of both the training and testing datasets using NLTK's `RegexpTokenizer`, which splits the text into words based on a regular expression.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9897344 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.92383534 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 64, - "code": "# predict\ntest = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")\n\ntest_input = bert_encode(test.text.values, bert_layer, max_len=128)\nmodel.load_weights('model.h5')\ntest_pred = model.predict(test_input)", - "class": "Model Evaluation", - "desc": "This code reads and encodes the test data using the BERT tokenizer, loads the best model weights from training, and predicts the labels for the test data.", + "cell_id": 5, + "code": "# Removing stopwords belonging to english language\ndef remove_stopwords(text):\n words = [w for w in text if w not in stopwords.words('english')]\n return words\n\nX_train['text'] = X_train['text'].apply(lambda x : remove_stopwords(x))\nX_test['text'] = X_test['text'].apply(lambda x : remove_stopwords(x))\nX_train.head()", + "class": "Data Transform", + "desc": "The code snippet defines and applies a `remove_stopwords` function to filter out English stopwords from the tokenized text in the 'text' column of both the training and testing datasets using NLTK's `stopwords`.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.93680686 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.98372287 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 15, - "code": "lr_tfidf = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', n_jobs = -1)\nlr_tfidf.fit(X_train_tfidf, y_train)\ny_predicted_lr = lr_tfidf.predict(X_test_tfidf)", - "class": "Model Training", - "desc": "This code initializes a Logistic Regression model with balanced class weights and fits it to the TF-IDF transformed training data, then uses the model to predict labels on the TF-IDF transformed test data.", + "cell_id": 6, + "code": "# After 
preprocessing, the text format\ndef combine_text(list_of_text):\n '''Takes a list of text and combines them into one large chunk of text.'''\n combined_text = ' '.join(list_of_text)\n return combined_text\n\nX_train['text'] = X_train['text'].apply(lambda x : combine_text(x))\nX_test['text'] = X_test['text'].apply(lambda x : combine_text(x))\n# X_train['text']\nX_train.head()", + "class": "Data Transform", + "desc": "The code snippet defines and applies a `combine_text` function to concatenate the list of tokenized words back into a single string of text for each entry in the 'text' column of both the training and testing datasets.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.6878745 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.7641233 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 21, - "code": "pipeline = Pipeline([\n ('clf', DecisionTreeClassifier(splitter='random', class_weight='balanced'))\n])\nparameters = {\n 'clf__max_depth':(150,160,165),\n 'clf__min_samples_split':(18,20,23),\n 'clf__min_samples_leaf':(5,6,7)\n}\n\ndf_tfidf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=-1, scoring='f1')\ndf_tfidf.fit(X_train_tfidf, y_train)\n\nprint(df_tfidf.best_estimator_.get_params())", - "class": "Model Training", - "desc": "This code sets up a machine learning pipeline with a Decision Tree Classifier and uses GridSearchCV to perform hyperparameter tuning to find the best combination of parameters for the classifier when fitted to the TF-IDF transformed training data.", + "cell_id": 7, + "code": "# Stemming\nfrom nltk.stem.snowball import SnowballStemmer\nstemmer = SnowballStemmer(\"english\")\n\ndef stemming(text):\n text = [stemmer.stem(word) for word in text.split()]\n return ' '.join(text)\n\n#X_train['text'] = X_train['text'].apply(lambda x : stemming(x))\n#X_test['text'] = X_test['text'].apply(lambda x : stemming(x))\n#X_train", + "class": "Data Transform", + "desc": "The code snippet defines a `stemming` function that applies NLTK's `SnowballStemmer` to reduce each word in the 'text' column to its stem, although the function is commented out and not applied to the datasets.", "testing": { - "class": "Model_Train", - "subclass": "train_on_grid", - "subclass_id": 6, - "predicted_subclass_probability": 0.9916215 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.6512052 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 22, - "code": "y_predicted_dt = df_tfidf.predict(X_test_tfidf)", - "class": "Model Training", - "desc": "This code snippet uses the best Decision Tree model from the GridSearchCV tuning to predict labels on the TF-IDF transformed test data.", + "cell_id": 10, + "code": "from sklearn.feature_extraction.text import CountVectorizer\ncount_vectorizer=CountVectorizer() # analyzer='word', stop_words = \"english\"\ntrain_vec = count_vectorizer.fit_transform(X_train.text)\ntest_vec = count_vectorizer.transform(X_test.text)\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nTfidf_vectorizer = TfidfVectorizer() # min_df=2, max_df=0.5, ngram_range=(1, 2)analyzer='word', stop_words = \"english\"analyzer='word', stop_words='english'# , ngram_range=(1, 2), lowercase=True, max_features=150000\ntrain_tfidf = Tfidf_vectorizer.fit_transform(X_train.text)\ntest_tfidf = Tfidf_vectorizer.transform(X_test.text)\n\nprint(\"train_vec\" 
,train_vec[7].todense())\nprint(\"test_vec\", test_vec[7].todense())\n\nprint(\"train_tfidf\" ,train_tfidf[7].todense())\nprint(\"test_tfidf\", test_vec[7].todense())", + "class": "Data Transform", + "desc": "The code snippet uses Scikit-learn's `CountVectorizer` and `TfidfVectorizer` to convert the preprocessed text in the 'text' column of both the training and testing datasets into sparse matrices of token counts and TF-IDF features, respectively, and prints the dense representations of specific transformed samples for verification.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99451977 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.8695518 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 30, - "code": "'''\nstarttime = time.time()\nword2vec_model = Word2Vec(data_list, size=300, iter=10, min_count=10)\nusedtime = time.time() - starttime\nprint('It took %.2fseconds to train word2vec' %usedtime)\n'''", - "class": "Model Training", - "desc": "This commented-out code snippet trains a Word2Vec model on the tokenized text data, prints the time taken for training, with parameters specifying a vector size of 300, 10 iterations, and a minimum word count of 10.", + "cell_id": 0, + "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "class": "Imports and Environment", + "desc": "The code snippet sets up the environment by importing essential libraries such as `numpy` for linear algebra and `pandas` for data processing, and lists all files available in the specified directory using the `os` module.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9858438 + "subclass": "list_files", + "subclass_id": 88, + "predicted_subclass_probability": 0.99921954 }, "cluster": 0 }, { - "cell_id": 31, - "code": "import gensim\nword2vec_path='./GoogleNews-vectors-negative300.bin.gz'\nword2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)", - "class": "Model Training", - "desc": "This code loads the pre-trained Google News word2vec model from a binary file using Gensim's KeyedVectors class for further use in the NLP task.", + "cell_id": 12, + "code": "# MultinomialNB\nclf.fit(train_vec,y_train)\ny_pred = clf.predict(test_vec)\nscores = model_selection.cross_val_score(clf,test_vec,y_pred,cv=kF,scoring='f1')\nprint(\"MultinomialNB prediction score: \" ,scores.mean())\n\n# LogisticRegression\nclf_tfidf.fit(train_tfidf, y_train)\ny_pred_tfidf 
= clf_tfidf.predict(test_tfidf)\nscores_tfidf = model_selection.cross_val_score(clf_tfidf,test_tfidf,y_pred_tfidf,cv=kF,\n scoring='f1')\nprint(\"LogisticRegretion prediction score: \" ,scores_tfidf.mean())\n\n# SVC\nclf_svc.fit(train_tfidf, y_train)\ny_pred_svc = clf_svc.predict(test_tfidf)\nscores_svc = model_selection.cross_val_score(clf_svc,test_tfidf,y_pred_svc, cv=kF,\n scoring='f1') \nprint(\"SVC prediction score: \" ,scores_svc.mean())\n\n# XGBoost\nclf_xgb_TFIDF.fit(train_tfidf, y_train)\ny_pred_xgb = clf_xgb_TFIDF.predict(test_tfidf)\nscores_xgb = model_selection.cross_val_score(clf_xgb_TFIDF,test_tfidf,y_pred_xgb, cv=kF,\n scoring='f1') \nprint(\"XGBoosting prediction score: \" ,scores_xgb.mean())", + "class": "Model Evaluation", + "desc": "The code snippet fits the previously trained `MultinomialNB`, `LogisticRegression`, `SVC`, and `XGBClassifier` models to the training data, makes predictions on the testing data, and evaluates these predictions using K-Fold cross-validation to compute the mean F1 scores for each model.", "testing": { "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.99531686 + "subclass": "find_best_model_class", + "subclass_id": 3, + "predicted_subclass_probability": 0.7733168 }, "cluster": 0 }, { - "cell_id": 38, - "code": "word2vec_lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', n_jobs = -1)\nword2vec_lr.fit(X_train_word2vec, y_train_word2vec)\ny_predicted_word2vec_lr = word2vec_lr.predict(X_test_word2vec)", + "cell_id": 11, + "code": "from sklearn.model_selection import KFold\nkF = KFold(shuffle=True, random_state=241) # \u0440\u0430\u0437\u0434\u0435\u043b\u0435\u043d\u0438\u0435 \u043d\u0430 5 \u0432\u044b\u0431\u043e\u0440\u043e\u043a\n# MultinomialNB\nfrom sklearn import model_selection\nfrom sklearn.naive_bayes import MultinomialNB\nclf = MultinomialNB() \nscores = model_selection.cross_val_score(clf,train_vec,y_train,cv=kF,scoring='f1')\nprint(\"MultinomialNB score: \" ,scores.mean())\n\n# LogisticRegression\nfrom sklearn.linear_model import LogisticRegression\nclf_tfidf = LogisticRegression()\nscores_tfidf = model_selection.cross_val_score(clf_tfidf,train_tfidf,y_train,\n cv=kF,scoring='f1')\nprint(\"LogisticRegretion score: \" ,scores_tfidf.mean())\n\n# SVC\nfrom sklearn.svm import SVC # \u0440\u0435\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u044f \u043c\u0435\u0442\u043e\u0434\u0430 \u043e\u043f\u043e\u0440\u043d\u044b\u0445 \u0432\u0435\u043a\u0442\u043e\u0440\u043e\u0432\nclf_svc = SVC()#kernel='linear', random_state=241\nscores_svc = model_selection.cross_val_score(clf_svc,train_tfidf,y_train,\n cv=kF,scoring='f1')\nprint(\"SVC score: \" ,scores_svc.mean())\n\n# XGBoost\nimport xgboost as xgb\nclf_xgb_TFIDF = xgb.XGBClassifier()#max_depth=7, n_estimators=150, colsample_bytree=0.8, \n #subsample=0.8, nthread=10, learning_rate=0.1\nscores_xgb = model_selection.cross_val_score(clf_xgb_TFIDF, train_tfidf, y_train, cv=kF, scoring=\"f1\")\nprint(\"XGBost score: \" ,scores_xgb.mean())\n\n", "class": "Model Training", - "desc": "This code initializes a Logistic Regression model with balanced class weights and fits it to the training data of averaged word vectors, then uses the model to predict labels on the test data.", + "desc": "The code snippet sets up K-Fold cross-validation and trains multiple machine learning classifiers\u2014`MultinomialNB`, `LogisticRegression`, `SVC`, and `XGBClassifier`\u2014using both count vectorized and TF-IDF-transformed training data, 
evaluating their performance by calculating the mean F1 scores.", "testing": { "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.57594216 - }, - "cluster": 0 - }, { - "cell_id": 43, - "code": "max_sequence_length = 26\nembedding_dim = 300", - "class": "Model Training", - "desc": "This code sets hyperparameters for a deep learning model by defining the maximum sequence length of tokenized text entries and the embedding dimension size for word vectors.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.9975586 - }, - "cluster": -1 - }, { - "cell_id": 46, - "code": "CNNmodel = Sequential()\nCNNmodel.add(Embedding(len(word_index)+1, embedding_dim, input_length = max_sequence_length))\nCNNmodel.add(Conv1D(filters=250, kernel_size=3, strides=1, padding='valid', activation = 'relu'))\nCNNmodel.add(MaxPooling1D(pool_size=3))\nCNNmodel.add(Flatten())\nCNNmodel.add(Dense(embedding_dim, activation='relu'))\nCNNmodel.add(Dropout(0.8))\nCNNmodel.add(Dense(cnn_label.shape[1], activation='sigmoid'))\n\nCNNmodel.summary()", - "class": "Model Training", - "desc": "This code constructs a Sequential Convolutional Neural Network (CNN) model for text classification, comprising embedding, convolutional, max pooling, flattening, dense, and dropout layers, and then prints the model summary.", - "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9960479 - }, - "cluster": 0 - }, { - "cell_id": 47, - "code": "CNNmodel.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])\nhistory = CNNmodel.fit(X_cnn, y_cnn, epochs=3, validation_data=(X_val_cnn, y_val_cnn))", - "class": "Model Training", - "desc": "This code compiles the CNN model with the Adam optimizer and binary cross-entropy loss function, and trains the model on the training data for three epochs while validating performance on the validation dataset.", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.925682 - }, - "cluster": 0 - }, { - "cell_id": 51, - "code": "embedding_layer = Embedding(len(word_index)+1,\n embedding_dim,\n weights = [embedding_matrix],\n input_length = max_sequence_length,\n trainable = False)", - "class": "Model Training", - "desc": "This code initializes an embedding layer using the previously created embedding matrix, sets the maximum input length to match the sequence length, and marks the embeddings as non-trainable for use in a neural network model.", - "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.99493694 - }, - "cluster": 0 - }, { - "cell_id": 52, - "code": "model = Sequential()\nmodel.add(embedding_layer)\nmodel.add(Conv1D(filters=150, kernel_size=3, strides=1, padding='valid', activation = 'relu'))\nmodel.add(MaxPooling1D(pool_size=3))\nmodel.add(Flatten())\nmodel.add(Dense(embedding_dim, activation='relu'))\nmodel.add(Dropout(0.8))\nmodel.add(Dense(cnn_label.shape[1], activation='sigmoid'))\n\nmodel.summary()", - "class": "Model Training", - "desc": "This code constructs a Sequential Convolutional Neural Network (CNN) model for text classification using the pre-trained embedding layer, and adds convolutional, max pooling, flattening, dense, and dropout layers, followed by printing the model summary.", 
- "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.99737453 - }, - "cluster": 0 - }, { - "cell_id": 53, - "code": "model.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])\nhistory = model.fit(X_cnn, y_cnn, epochs=10, validation_data=(X_val_cnn, y_val_cnn))", - "class": "Model Training", - "desc": "This code compiles the CNN model with the Adam optimizer and binary cross-entropy loss function, and trains the model on the training data for ten epochs while validating its performance on the validation dataset.", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9875873 - }, - "cluster": 0 - }, { - "cell_id": 60, - "code": "def bert_encode(texts, bert_layer, max_len=128):\n vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)\n \n all_tokens = []\n all_masks = []\n all_segments = []\n \n for text in texts:\n text = tokenizer.tokenize(text)\n text = text[:max_len - 2]\n input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n pad_len = max_len - len(input_sequence)\n \n tokens = tokenizer.convert_tokens_to_ids(input_sequence)\n input_ids = tokens + [0]* pad_len\n all_tokens.append(input_ids)\n\n masks = [1]*len(input_sequence) + [0]* pad_len\n all_masks.append(masks)\n \n segments = [0]* max_len\n all_segments.append(segments)\n \n return np.array(all_tokens), np.array(all_masks), np.array(all_segments)\n\n \ndef build_model(bert_layer, max_len = 128, lr = 1e-5):\n input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name=\"input_word_ids\")\n input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name=\"input_mask\")\n segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name=\"segment_ids\")\n \n pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n dense_out = Dense(1,activation=\"relu\")(pooled_output)\n drop_out = tf.keras.layers.Dropout(0.8)(dense_out)\n out = Dense(1,activation=\"sigmoid\")(pooled_output)\n \n model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n adam = tf.keras.optimizers.Adam(lr)\n model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])\n \n return model\n\n\ndef plot_curve(history):\n plt.plot(history.history['accuracy'], label='accuracy')\n plt.plot(history.history['val_accuracy'], label='val_accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Accuracy')\n plt.ylim([0.5,1])\n plt.legend()\n plt.show()", - "class": "Model Training", - "desc": "This code defines three functions: one to encode texts for BERT input, another to build a BERT-based model, and the third to plot the accuracy curve for training and validation data across epochs.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9129451 - }, - "cluster": 0 - }, { - "cell_id": 61, - "code": "%%time\nmodule_url = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1\"\nbert_layer = hub.KerasLayer(module_url, trainable=True)", - "class": "Model Training", - "desc": "This code downloads and initializes a BERT layer from TensorFlow Hub, making it trainable for fine-tuning in a custom model.", - "testing": { - "class": "Model_Train", - "subclass": 
"choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.856542 - }, - "cluster": 0 - }, { - "cell_id": 63, - "code": "# train model\nmodel = build_model(bert_layer, max_len=128, lr = 1e-5)\nmodel.summary()\n\ncheckpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)\n\ntrain_history = model.fit(\n train_input, train_labels,\n validation_split=0.2,\n epochs=3,\n callbacks=[checkpoint],\n batch_size=16\n)\n\nplot_curve(train_history)", - "class": "Model Training", - "desc": "This code builds and summarizes a BERT-based model, sets up a model checkpointing callback to save the best model, trains the model on the encoded text data and labels, and plots the training and validation accuracy curves.", - "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.61692107 - }, - "cluster": 0 - }, { - "cell_id": 6, - "code": "piedata = train_data['target']\nplt.figure(figsize=(6,6))\npiedata.value_counts().plot(kind = 'pie',autopct = '%.2f%%')", - "class": "Visualization", - "desc": "This code snippet creates a pie chart to visualize the distribution of the 'target' variable in the training data, showing the percentage of each class.", - "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9974884 - }, - "cluster": 0 - }, { - "cell_id": 7, - "code": "num_words_0 = train_data[train_data['target']==0]['text'].apply(lambda x: len(x.split()))\nnum_words_1 = train_data[train_data['target']==1]['text'].apply(lambda x: len(x.split()))\nplt.figure(figsize=(12,6))\nsns.kdeplot(num_words_0, shade=True, color = 'b').set_title('Kernel distribution of number of words')\nsns.kdeplot(num_words_1, shade=True, color = 'r')\nplt.legend(labels=['0_no disaster', '1_disaster'])", - "class": "Visualization", - "desc": "This code snippet calculates the word count for texts labeled with '0' and '1' in the training data and then visualizes their distribution using Kernel Density Estimation (KDE) plots.", - "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.97927135 + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.6108105 }, - "cluster": 0 + "cluster": 2 }, { "cell_id": 8, - "code": "len_word_0 = train_data[train_data['target']==0]['text'].str.split().map(lambda x: [len(i) for i in x])\nave_len_0 = len_word_0.map(lambda x: np.mean(x))\nlen_word_1 = train_data[train_data['target']==1]['text'].str.split().map(lambda x: [len(i) for i in x])\nave_len_1 = len_word_1.map(lambda x: np.mean(x))\nplt.figure(figsize=(12,6))\nsns.kdeplot(ave_len_0, shade=True, color='b').set_title('Kernel distribution of average words lenth')\nsns.kdeplot(ave_len_1, shade=True, color='r')\nplt.legend(labels=['0_no disaster', '1_disaster'])", - "class": "Visualization", - "desc": "This code snippet computes the average word length for texts labeled '0' and '1' and visualizes their distributions using Kernel Density Estimation (KDE) plots.", - "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.98665464 - }, - "cluster": 0 - }, { - "cell_id": 48, - "code": "plt.plot(history.history['accuracy'], label='accuracy')\nplt.plot(history.history['val_accuracy'], 
label='val_accuracy')\nplt.xlabel('Epoch')\nplt.ylabel('Accuracy')\nplt.ylim([0.5,1])\nplt.legend()\nplt.show()", + "code": "from wordcloud import WordCloud\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n%matplotlib inline\ndef wordsCloud (dF):\n fig , ax1 = plt.subplots(1,figsize=(12,12))\n stopword_list = stopwords.words(\"english\")\n wordcloud=WordCloud(stopwords = stopword_list, background_color='white',collocations = False , width=600,height=600).generate(\" \".join(dF))\n ax1.imshow(wordcloud)\n ax1.axis('off')\n ax1.set_title(\"Frequent Words\",fontsize=24) \n # print(stopword_list)\n return\nwordsCloud(X_train.text)", "class": "Visualization", - "desc": "This code snippet plots the training and validation accuracy over the epochs to visualize the performance and learning progress of the CNN model during training.", + "desc": "The code snippet defines and calls a `wordsCloud` function that uses the WordCloud library to generate and display a word cloud of the most frequent words in the 'text' column of the given DataFrame, using Matplotlib and Seaborn for visualization.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.9901933 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.55752915 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 54, - "code": "plt.plot(history.history['accuracy'], label='accuracy')\nplt.plot(history.history['val_accuracy'], label='val_accuracy')\nplt.xlabel('Epoch')\nplt.ylabel('Accuracy')\nplt.ylim([0.5,1])\nplt.legend()\nplt.show()", + "cell_id": 9, + "code": "wordsCloud(X_test.text)", "class": "Visualization", - "desc": "This code snippet plots the training and validation accuracy over the epochs to visualize the performance and learning progress of the CNN model during training.", + "desc": "The code snippet calls the previously defined `wordsCloud` function to generate and display a word cloud for the most frequent words in the 'text' column of the testing dataset.", "testing": { - "class": "Visualization", - "subclass": "learning_history", - "subclass_id": 35, - "predicted_subclass_probability": 0.9901933 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.8625756 }, - "cluster": 0 + "cluster": -1 }], "notebook_id": 17, - "notebook_name": "nlp-getting-started" + "notebook_name": "distweetrhinosceros.ipynb" }, { "cells": [{ - "cell_id": 17, - "code": "my_submission = pd.DataFrame({'Id': dataset_test_original.id, 'target': test_dataframe_prediction})\nmy_submission.to_csv('submission.csv', index=False)", + "cell_id": 44, + "code": "submission = pd.read_csv(\"../input/nlp-getting-started/sample_submission.csv\")\ntest_pred = model_glove.predict(X_test_seq)\ntest_pred_int = test_pred.round().astype('int')\nsubmission['target'] = test_pred_int\nsubmission.to_csv('submission.csv', index=False)", "class": "Data Export", - "desc": "This snippet creates a DataFrame with the test dataset IDs and their corresponding predictions, and exports it to a CSV file named 'submission.csv'.", + "desc": "This code snippet reads the sample submission file, uses the GloVe-based model to predict target values for the test data, rounds the predictions to integer values, updates the 'target' column of the submission DataFrame with these predictions, and writes the final submission to a CSV file.", "testing": { "class": 
"Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.9993311 + "predicted_subclass_probability": 0.99928826 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 1, - "code": "dataset_test_original = dataset_test", + "code": "train_data = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntrain_data.head(5)", "class": "Data Extraction", - "desc": "The code snippet creates a copy of the original test dataset by assigning `dataset_test` to `dataset_test_original`.", + "desc": "This code snippet reads the training data from a CSV file into a pandas DataFrame and displays the first five rows.", "testing": { - "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.40974942 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9996433 }, - "cluster": -1 + "cluster": 1 }, { "cell_id": 2, - "code": "dataset_test_original", + "code": "test_data = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\ntest_data.head(5)", "class": "Data Extraction", - "desc": "The code snippet outputs the contents of the `dataset_test_original` DataFrame.", + "desc": "This code snippet reads the test data from a CSV file into a pandas DataFrame and displays the first five rows.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99960965 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9996711 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 4, - "code": "index_train = dataset_train.index\nindex_test = dataset_test.index\ntrain_len = index_train\ntest_len = index_test", + "cell_id": 34, + "code": "# Loading the embedding dictionary from file\n\nembedding_dict={}\nwith open('../input/glovetwitter27b100dtxt/glove.twitter.27B.100d.txt','r') as f:\n for line in f:\n values=line.split()\n word = values[0]\n vectors=np.asarray(values[1:],'float32')\n embedding_dict[word]=vectors\nf.close()", "class": "Data Extraction", - "desc": "The snippet extracts the indices of the training and test datasets and assigns them to `index_train` and `index_test`, while also assigning these indices to `train_len` and `test_len`.", + "desc": "This code snippet loads pre-trained word embeddings from a GloVe file into a dictionary, where each word is mapped to its corresponding embedding vector.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "define_variables", "subclass_id": 77, - "predicted_subclass_probability": 0.99675506 - }, - "cluster": -1 - }, { - "cell_id": 5, - "code": "dataset_train = dataset_train[['text','target']]\nprint(dataset_train)", - "class": "Data Extraction", - "desc": "The snippet selects and prints the 'text' and 'target' columns from the `dataset_train` DataFrame.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.3228975 - }, - "cluster": -1 - }, { - "cell_id": 12, - "code": "dataset_test = dataset_test[['text']]", - "class": "Data Extraction", - "desc": "This snippet selects the 'text' column from the `dataset_test` DataFrame.", - "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.8538315 - }, - "cluster": -1 - }, { - "cell_id": 13, - "code": "print(len(test_len))", - "class": "Data 
Extraction", - "desc": "This snippet prints the length of the `test_len` index, effectively showing the number of entries in the test dataset.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99938715 - }, - "cluster": -1 - }, { - "cell_id": 16, - "code": "print(dataset_test.head())", - "class": "Data Extraction", - "desc": "This snippet prints the first few rows of the `dataset_test` DataFrame.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99974805 - }, - "cluster": -1 - }, { - "cell_id": 6, - "code": "corpus = []\nfor i in range(0, len(train_len)):\n review = re.sub('[^a-zA-Z]', ' ', dataset_train['text'][i])\n review = review.lower()\n review = review.split()\n ps = PorterStemmer()\n all_stopwords = stopwords.words('english')\n all_stopwords.remove('not')\n review = [ps.stem(word) for word in review if not word in set(all_stopwords)]\n review = ' '.join(review)\n corpus.append(review)", - "class": "Data Transform", - "desc": "The code snippet preprocesses the text data in the training dataset by cleaning, converting to lowercase, tokenizing, stemming, and removing stopwords, then appends the processed text to the `corpus` list.", - "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.91630965 + "predicted_subclass_probability": 0.23091643 }, "cluster": 1 }, { "cell_id": 7, - "code": "from sklearn.feature_extraction.text import CountVectorizer\ncv = CountVectorizer(max_features = 1500)\nX = cv.fit_transform(corpus).toarray()\ny = dataset_train.iloc[:, -1].values", + "code": "from bs4 import BeautifulSoup # Text Cleaning\nimport re, string # Regular Expressions, String\nfrom nltk.corpus import stopwords # stopwords\nfrom nltk.stem.porter import PorterStemmer # for word stemming\nfrom nltk.stem import WordNetLemmatizer # for word lemmatization\nimport unicodedata\nimport html\n\n# set of stopwords to be removed from text\nstop = set(stopwords.words('english'))\n\n# update stopwords to have punctuation too\nstop.update(list(string.punctuation))\n\ndef clean_tweets(text):\n \n # Remove unwanted html characters\n re1 = re.compile(r' +')\n x1 = text.lower().replace('#39;', \"'\").replace('amp;', '&').replace('#146;', \"'\").replace(\n 'nbsp;', ' ').replace('#36;', '$').replace('\\\\n', \"\\n\").replace('quot;', \"'\").replace(\n '
', \"\\n\").replace('\\\\\"', '\"').replace('', 'u_n').replace(' @.@ ', '.').replace(\n ' @-@ ', '-').replace('\\\\', ' \\\\ ')\n text = re1.sub(' ', html.unescape(x1))\n \n # remove non-ascii characters\n text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n \n # strip html\n soup = BeautifulSoup(text, 'html.parser')\n text = soup.get_text()\n \n # remove between square brackets\n text = re.sub('\\[[^]]*\\]', '', text)\n \n # remove URLs\n text = re.sub(r'http\\S+', '', text)\n \n # remove twitter tags\n text = text.replace(\"@\", \"\")\n \n # remove hashtags\n text = text.replace(\"#\", \"\")\n \n # remove all non-alphabetic characters\n text = re.sub(r'[^a-zA-Z ]', '', text)\n \n # remove stopwords from text\n final_text = []\n for word in text.split():\n if word.strip().lower() not in stop:\n final_text.append(word.strip().lower())\n \n text = \" \".join(final_text)\n \n # lemmatize words\n lemmatizer = WordNetLemmatizer() \n text = \" \".join([lemmatizer.lemmatize(word) for word in text.split()])\n text = \" \".join([lemmatizer.lemmatize(word, pos = 'v') for word in text.split()])\n \n # replace all numbers with \"num\"\n text = re.sub(\"\\d\", \"num\", text)\n \n return text.lower()\n\ntrain_data['prep_text'] = train_data['text'].apply(clean_tweets)\ntrain_data['prep_text'].head(5)", "class": "Data Transform", - "desc": "This snippet vectorizes the preprocessed text data using `CountVectorizer` to create a feature matrix `X` and extracts the target values into `y`.", + "desc": "This code snippet defines a function to clean tweet text by removing HTML and non-alphabetic characters, stopwords, URLs, and then lemmatizing the text, and applies this function to a new 'prep_text' column in the training data using pandas' apply method.", "testing": { "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.99771535 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.27900088 }, - "cluster": 1 + "cluster": 0 }, { "cell_id": 8, - "code": "from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)", + "code": "test_data['text'] = test_data['text'].apply(clean_tweets)\ntest_data['text'].head(5)", "class": "Data Transform", - "desc": "The snippet splits the feature matrix `X` and target variable `y` into training and test sets using an 80-20 split ratio with a fixed random state for reproducibility.", + "desc": "This code snippet applies the previously defined `clean_tweets` function to the 'text' column of the test data to clean the tweets, updating the 'text' column with the cleaned version.", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.99815947 + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9986481 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 14, - "code": "testcorpus = []\nfor i in range(0, len(test_len)):\n review = re.sub('[^a-zA-Z]', ' ', dataset_test['text'][i])\n review = review.lower()\n review = review.split()\n ps = PorterStemmer()\n all_stopwords = stopwords.words('english')\n all_stopwords.remove('not')\n review = [ps.stem(word) for word in review if not word in set(all_stopwords)]\n review = ' '.join(review)\n testcorpus.append(review)\n\nxtest = cv.transform(testcorpus).toarray()\ntest_dataframe_prediction = 
classifier.predict(xtest)", + "cell_id": 9, + "code": "from keras.preprocessing.text import Tokenizer # Text tokenization\n\n# Setting up the tokenizer\nvocab_size = 1000\ntokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')\ntokenizer.fit_on_texts(list(train_data['prep_text']) + list(test_data['text']))", "class": "Data Transform", - "desc": "This code preprocesses the text data in the test dataset, vectorizes it using the already-fitted `CountVectorizer`, and makes predictions using the trained classifier, storing the results in `test_dataframe_prediction`.", - "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.39147574 - }, - "cluster": 1 - }, { - "cell_id": 0, - "code": "import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n if filename == 'train.csv':\n dataset_train = pd.read_csv(os.path.join(dirname, filename))\n elif filename == 'test.csv':\n dataset_test = pd.read_csv(os.path.join(dirname, filename))\n else:\n dataset_sample = pd.read_csv(os.path.join(dirname, filename))", - "class": "Imports and Environment", - "desc": "The snippet imports necessary libraries, scans the input directory for files, and reads specific CSV files into pandas DataFrames. ", - "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.999468 - }, - "cluster": -1 - }, { - "cell_id": 3, - "code": "import re\nimport nltk\nnltk.download('stopwords')\nfrom nltk.corpus import stopwords\nfrom nltk.stem.porter import PorterStemmer", - "class": "Imports and Environment", - "desc": "The snippet imports the `re`, `nltk`, and specific modules from `nltk`, and downloads the 'stopwords' dataset for natural language processing tasks.", + "desc": "This code snippet sets up a Keras Tokenizer with a specified vocabulary size and an out-of-vocabulary token, fitting it on the combined 'prep_text' from the training data and 'text' from the test data.", "testing": { "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9607691 + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9994017 }, - "cluster": 0 + "cluster": 2 }, { "cell_id": 10, - "code": "y_pred = classifier.predict(X_test)\nprint(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))", - "class": "Model Evaluation", - "desc": "This snippet uses the trained Gaussian Naive Bayes classifier to predict the target values for the test set and prints the predicted values alongside the actual test values for evaluation.", + "code": "# Representing texts as one hot encoded sequence\n\nX_train_ohe = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'binary')\nX_test_ohe = tokenizer.texts_to_matrix(test_data['text'], mode = 'binary')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_ohe.shape}\")\nprint(f\"X_test shape: {X_test_ohe.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")", + "class": "Data Transform", + "desc": "This code snippet converts the cleaned texts from the training and test data into one-hot encoded matrices using the Keras Tokenizer, and prepares the target variable as an integer numpy array, then prints the shapes of these 
matrices.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99374145 + "class": "Data_Transform", + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.5156552 }, - "cluster": 0 + "cluster": 5 }, { "cell_id": 11, - "code": "from sklearn.metrics import confusion_matrix, accuracy_score\ncm = confusion_matrix(y_test, y_pred)\nprint(cm)\naccuracy_score(y_test, y_pred)", - "class": "Model Evaluation", - "desc": "The snippet calculates and prints the confusion matrix, and computes the accuracy score of the model's predictions on the test set.", - "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.996639 - }, - "cluster": 0 - }, { - "cell_id": 15, - "code": "print(test_dataframe_prediction)", - "class": "Model Evaluation", - "desc": "This snippet outputs the predictions made by the classifier on the processed test data `xtest`.", + "code": "from sklearn.model_selection import train_test_split\nX_train_ohe, X_val_ohe, y_train, y_val = train_test_split(X_train_ohe, y_train, random_state = 42, test_size = 0.2)\n\nprint(f\"X_train shape: {X_train_ohe.shape}\")\nprint(f\"X_val shape: {X_val_ohe.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "class": "Data Transform", + "desc": "This code snippet splits the one-hot encoded training data and target variable into training and validation sets using sklearn's train_test_split function, and then prints the shapes of these resulting datasets.", "testing": { "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.9981881 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.961794 }, - "cluster": 0 + "cluster": 5 }, { - "cell_id": 9, - "code": "from sklearn.naive_bayes import GaussianNB\nclassifier = GaussianNB()\nclassifier.fit(X_train, y_train)", - "class": "Model Training", - "desc": "This snippet initializes a Gaussian Naive Bayes classifier and fits it to the training data (`X_train`, `y_train`).", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9989604 - }, - "cluster": 0 - }], - "notebook_id": 18, - "notebook_name": "nlp-prediction" - }, { - "cells": [{ - "cell_id": 19, - "code": "my_submission_preds = pipe.predict(test['text']+ ' ' + test['keyword'].astype(str) + ' ' + test['location'].astype(str))\n\nmy_submission = pd.DataFrame({\"id\":test['id'], 'target':my_submission_preds})", - "class": "Data Export", - "desc": "This code snippet generates predictions for the test dataset using the trained pipeline and creates a DataFrame for submission that includes the test IDs and the corresponding predicted labels.", + "cell_id": 16, + "code": "X_train_wc = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'count')\nX_test_wc = tokenizer.texts_to_matrix(test_data['text'], mode = 'count')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_test shape: {X_test_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\n", + "class": "Data Transform", + "desc": "This code snippet converts the cleaned texts from the training and test data into count-encoded matrices using the Keras Tokenizer, prepares the target variable as an integer numpy array, and prints 
the shapes of these matrices.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.7191785 + "class": "Data_Transform", + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.61748403 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 20, - "code": "my_submission.head()", - "class": "Data Export", - "desc": "This code snippet displays the first five rows of the submission DataFrame to verify the format and content before exporting the submission.", + "cell_id": 17, + "code": "X_train_wc, X_val_wc, y_train, y_val = train_test_split(X_train_wc, y_train, random_state = 42, test_size = 0.2)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_val shape: {X_val_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "class": "Data Transform", + "desc": "This code snippet splits the count-encoded training data and target variable into training and validation sets using sklearn's train_test_split function, and then prints the shapes of these resulting datasets.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997489 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.80229384 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 21, - "code": "len(my_submission)", - "class": "Data Export", - "desc": "This code snippet calculates and displays the total number of rows in the submission DataFrame to ensure that the submission includes predictions for all test samples.", + "cell_id": 22, + "code": "X_train_freq = tokenizer.texts_to_matrix(train_data['prep_text'], mode = 'freq')\nX_test_freq = tokenizer.texts_to_matrix(test_data['text'], mode = 'freq')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_freq.shape}\")\nprint(f\"X_test shape: {X_test_freq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")", + "class": "Data Transform", + "desc": "This code snippet converts the cleaned texts from the training and test data into frequency-encoded matrices using the Keras Tokenizer, prepares the target variable as an integer numpy array, and prints the shapes of these matrices.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99885213 + "class": "Data_Transform", + "subclass": "data_type_conversions", + "subclass_id": 16, + "predicted_subclass_probability": 0.74439836 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 22, - "code": "my_submission.to_csv('submission.csv', index=False)", - "class": "Data Export", - "desc": "This code snippet exports the submission DataFrame to a CSV file named 'submission.csv' without including the DataFrame index, preparing it for submission.", + "cell_id": 23, + "code": "X_train_freq, X_val_freq, y_train, y_val = train_test_split(X_train_freq, y_train, test_size = 0.2, random_state = 42)\nprint(f\"X_train shape: {X_train_freq.shape}\")\nprint(f\"X_val shape: {X_val_freq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "class": "Data Transform", + "desc": "This code snippet splits the frequency-encoded training data and target variable into training and validation sets using sklearn's train_test_split function, and then prints the shapes of these resulting 
datasets.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.99912554 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.84670115 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 1, - "code": "train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest = pd.read_csv(\"../input/nlp-getting-started/test.csv\")\nsample_submission = pd.read_csv(\"../input/nlp-getting-started/sample_submission.csv\")", - "class": "Data Extraction", - "desc": "This code snippet reads the training data, test data, and sample submission files from the specified input directory into pandas DataFrames.", + "cell_id": 28, + "code": "from sklearn.feature_extraction.text import TfidfVectorizer # Term Frequency - Inverse Document Frequency\n\nvectorizer = TfidfVectorizer(max_features = vocab_size)\nvectorizer.fit(list(train_data['prep_text']) + list(test_data['text']))\n\n# Fitting on training and testing data\nX_train_tfidf = vectorizer.transform(list(train_data['prep_text'])).toarray() \nX_test_tfidf = vectorizer.transform(list(test_data['text'])).toarray()\n\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape {X_train_tfidf.shape}\")\nprint(f\"X_test shape {X_test_tfidf.shape}\")\nprint(f\"y_train shape {y_train.shape}\")", + "class": "Data Transform", + "desc": "This code snippet creates a TfidfVectorizer with a specified maximum number of features, fits it on the combined cleaned texts from the training and test data, transforms the training and test texts into TF-IDF matrices, and prepares the target variable as an integer numpy array, then prints the shapes of these matrices.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99971575 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.57227826 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 7, - "code": "from sklearn.model_selection import train_test_split\n\nX = train['text'] + ' ' + train['keyword'].astype(str) + ' ' + train['location'].astype(str) # the features we want to analyze\nylabels = train['target'] # the labels, or answers, we want to test against\n\nX_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)", + "cell_id": 29, + "code": "X_train_tfidf, X_val_tfidf, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size = 0.2, random_state = 42)\nprint(f\"X_train shape: {X_train_tfidf.shape}\")\nprint(f\"X_val shape: {X_val_tfidf.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", "class": "Data Transform", - "desc": "This code snippet concatenates text, keyword, and location columns to form a feature set, and then splits the data into training and testing sets with a 70-30 ratio.", + "desc": "This code snippet splits the TF-IDF encoded training data and target variable into training and validation sets using sklearn's train_test_split function, and then prints the shapes of these resulting datasets.", "testing": { "class": "Data_Transform", "subclass": "split", "subclass_id": 13, - "predicted_subclass_probability": 0.9936372 + "predicted_subclass_probability": 0.70971876 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 2, - "code": "train.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first five rows of the 
training DataFrame to give an initial view of the dataset. ", + "cell_id": 35, + "code": "# Sequences creation, truncation and padding\n\nfrom keras.preprocessing.sequence import pad_sequences\n\n# Setting up the tokenizer\nvocab_size = 10000\ntokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')\ntokenizer.fit_on_texts(list(train_data['prep_text']) + list(test_data['text']))\n\nmax_len = 15\nX_train_seq = tokenizer.texts_to_sequences(train_data['prep_text'])\nX_test_seq = tokenizer.texts_to_sequences(test_data['text'])\n\nX_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')\nX_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_seq.shape}\")\nprint(f\"X_test shape: {X_test_seq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")", + "class": "Data Transform", + "desc": "This code snippet sets up a Keras Tokenizer with a specified vocabulary size, converts the cleaned texts from the training and test data into sequences, and then pads and truncates these sequences to a specified maximum length, preparing the target variable as an integer numpy array, and prints the shapes of these matrices.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997507 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.7396719 }, - "cluster": 12 + "cluster": 5 }, { - "cell_id": 3, - "code": "test.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first five rows of the test DataFrame to provide an initial overview of the test dataset.", + "cell_id": 36, + "code": "X_train_seq, X_val_seq, y_train, y_val = train_test_split(X_train_seq, y_train, test_size = 0.2, random_state = 42)\nprint(f\"X_train shape: {X_train_seq.shape}\")\nprint(f\"X_val shape: {X_val_seq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\")", + "class": "Data Transform", + "desc": "This code snippet splits the padded and truncated sequence-encoded training data and target variable into training and validation sets using sklearn's train_test_split function, and then prints the shapes of these resulting datasets.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997483 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9086002 }, - "cluster": 12 + "cluster": 5 }, { - "cell_id": 4, - "code": "print(train.apply(lambda col: col.unique()))\nprint(train.apply(lambda col: col.nunique()))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the unique values in each column and the number of unique values in each column of the training DataFrame to help understand the dataset's structure and variability.", + "cell_id": 38, + "code": "# Applying GloVE representations on our corpus\n\nembedding_matrix=np.zeros((num_words,100))\n\nfor word,i in tokenizer.word_index.items():\n if i < num_words:\n emb_vec = embedding_dict.get(word)\n if emb_vec is not None:\n embedding_matrix[i] = emb_vec ", + "class": "Data Transform", + "desc": "This code snippet creates an embedding matrix where each row corresponds to the GloVe embedding vector for a word in the tokenizer's 
word index, filling in zero vectors for words not found in the GloVe embeddings.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_unique_values", - "subclass_id": 54, - "predicted_subclass_probability": 0.95746493 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.91961074 }, - "cluster": -1 + "cluster": 8 }, { - "cell_id": 8, - "code": "X_train[100:500]\n#type(X_train[1])\n#y_train[:100]", + "cell_id": 3, + "code": "train_data.info()", "class": "Exploratory Data Analysis", - "desc": "This code snippet selects and displays a slice of the training features from index 100 to 500 to inspect a portion of the training data.", + "desc": "This code snippet displays a concise summary of the training DataFrame, including the number of non-null entries and data types of each column.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9915615 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.99936634 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 15, - "code": "test.head()", + "cell_id": 4, + "code": "test_data.info()", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first five rows of the test DataFrame to provide an initial overview of the test dataset.", + "desc": "This code snippet displays a concise summary of the test DataFrame, including the number of non-null entries and data types of each column.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997483 + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9993579 }, - "cluster": 12 + "cluster": 0 }, { - "cell_id": 16, - "code": "sample_submission.head()", + "cell_id": 27, + "code": "train_data.head()", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first five rows of the sample submission DataFrame to provide an initial view of the expected format for submission.", + "desc": "This code snippet displays the first five rows of the training DataFrame to inspect the initial data and transformations applied.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.99975234 + "predicted_subclass_probability": 0.9997532 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 18, - "code": "test", + "cell_id": 37, + "code": "num_words = len(tokenizer.word_index)\nprint(f\"Number of unique words: {num_words}\")", "class": "Exploratory Data Analysis", - "desc": "This code snippet, when executed, will display the entire test DataFrame, providing detailed insights into the test dataset.", + "desc": "This code snippet prints the number of unique words in the tokenizer's word index, which indicates the vocabulary size after fitting on the texts.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997199 + "subclass": "count_unique_values", + "subclass_id": 54, + "predicted_subclass_probability": 0.9865096 }, - "cluster": 0 + "cluster": 3 }, { "cell_id": 0, - "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several 
helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", + "code": "# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport seaborn as sns # data visualization\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session", "class": "Imports and Environment", - "desc": "This code snippet imports essential libraries for data processing and lists all files in the input directory in a Kaggle environment.", + "desc": "This code snippet imports essential packages such as numpy, pandas, seaborn, and uses os to list available data files in a read-only directory on Kaggle.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "list_files", "subclass_id": 88, - "predicted_subclass_probability": 0.99921954 + "predicted_subclass_probability": 0.99922085 }, "cluster": 0 }, { - "cell_id": 5, - "code": "!pip install spacy -q\n!python -m spacy download en_core_web_sm -q", + "cell_id": 6, + "code": "!pip install BeautifulSoup4", "class": "Imports and Environment", - "desc": "This code snippet installs the spaCy library and downloads the English core model necessary for natural language processing tasks.", + "desc": "This code snippet installs the BeautifulSoup4 library using pip.", "testing": { "class": "Imports_and_Environment", "subclass": "install_modules", "subclass_id": 87, - "predicted_subclass_probability": 0.993651 + "predicted_subclass_probability": 0.9954203 + }, + "cluster": 1 + }, { + "cell_id": 14, + "code": "_, accuracy = model.evaluate(X_val_ohe, y_val)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the trained model on the one-hot encoded validation data and retrieves the accuracy metric.", + "testing": { + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.99220455 + }, + "cluster": 2 + }, { + "cell_id": 20, + "code": "_, accuracy = model.evaluate(X_val_wc, y_val)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the newly trained model on the count-encoded validation data and 
retrieves the accuracy metric.", + "testing": { + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9896338 + }, + "cluster": 2 + }, { + "cell_id": 12, + "code": "from keras.models import Sequential\nfrom keras import layers, metrics, optimizers, losses\n\ndef setup_model():\n \n model = Sequential()\n# model.add(layers.Dense(16, activation='relu', input_shape=(vocab_size,)))\n# model.add(layers.Dense(16, activation='relu'))\n model.add(layers.Dense(1, activation='sigmoid', input_shape=(vocab_size,)))\n \n model.compile(optimizer=optimizers.RMSprop(lr=0.001),\n loss=losses.binary_crossentropy,\n metrics=[metrics.binary_accuracy])\n \n return model\n\nmodel = setup_model()\nmodel.summary()", + "class": "Model Training", + "desc": "This code snippet defines and compiles a Sequential neural network model in Keras with a single Dense layer using the sigmoid activation function, and a binary crossentropy loss function, and then outputs the model summary.", + "testing": { + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.990493 + }, + "cluster": 0 + }, { + "cell_id": 13, + "code": "history = model.fit(X_train_ohe, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_ohe, y_val))", + "class": "Model Training", + "desc": "This code snippet trains the compiled neural network model on the one-hot encoded training data for 20 epochs with a batch size of 512, using the one-hot encoded validation data for validation, and stores the training history.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9996803 }, "cluster": -1 }, { - "cell_id": 6, - "code": "import matplotlib.pyplot as plt\nfrom sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer\nfrom sklearn.base import TransformerMixin\nfrom sklearn.pipeline import Pipeline\nimport string\nfrom spacy.lang.en.stop_words import STOP_WORDS\nfrom spacy.lang.en import English\nimport spacy\nimport en_core_web_sm\nimport re", - "class": "Imports and Environment", - "desc": "This code snippet imports various libraries and modules necessary for visualization, text feature extraction, custom transformations, NLP, and regular expressions.", + "cell_id": 18, + "code": "model = setup_model()\nmodel.summary()", + "class": "Model Training", + "desc": "This code snippet initializes a new instance of the predefined neural network model using the setup_model function and outputs the model summary.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9992192 + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9821353 + }, + "cluster": -1 + }, { + "cell_id": 19, + "code": "history = model.fit(X_train_wc, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_wc, y_val))", + "class": "Model Training", + "desc": "This code snippet trains the newly initialized neural network model on the count-encoded training data for 20 epochs with a batch size of 512, using the count-encoded validation data for validation, and stores the training history.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.99967945 + }, + "cluster": -1 + }, { + "cell_id": 24, + "code": "model = 
setup_model()\nmodel.summary()", + "class": "Model Training", + "desc": "This code snippet initializes a new instance of the predefined neural network model using the setup_model function and outputs the model summary.", + "testing": { + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9821353 + }, + "cluster": -1 + }, { + "cell_id": 25, + "code": "history = model.fit(X_train_freq, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_freq, y_val))", + "class": "Model Training", + "desc": "This code snippet trains the newly initialized neural network model on the frequency-encoded training data for 20 epochs with a batch size of 512, using the frequency-encoded validation data for validation, and stores the training history.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.99967873 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 11, - "code": "from sklearn import metrics\n# Predicting with a test dataset\npredicted = pipe.predict(X_test)\n\n# Model Accuracy\nprint(\"Accuracy:\",metrics.accuracy_score(y_test, predicted))\nprint(\"Precision:\",metrics.precision_score(y_test, predicted))\nprint(\"Recall:\",metrics.recall_score(y_test, predicted))", - "class": "Model Evaluation", - "desc": "This code snippet uses the trained pipeline to predict the test dataset and then calculates and prints the model's accuracy, precision, and recall to evaluate its performance.", + "cell_id": 30, + "code": "model = setup_model()\nmodel.summary()", + "class": "Model Training", + "desc": "This code snippet initializes a new instance of the predefined neural network model using the setup_model function and outputs the model summary.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.758966 + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9821353 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 12, - "code": "predicted_df = pd.DataFrame(predicted)\npredicted_df.value_counts()", - "class": "Model Evaluation", - "desc": "This code snippet creates a DataFrame from the predicted labels and counts the frequency of each unique predicted label to understand the distribution of the predictions.", + "cell_id": 31, + "code": "history = model.fit(X_train_tfidf, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_tfidf, y_val))", + "class": "Model Training", + "desc": "This code snippet trains the newly initialized neural network model on the TF-IDF encoded training data for 20 epochs with a batch size of 512, using the TF-IDF encoded validation data for validation, and stores the training history.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.99671495 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9996803 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 14, - "code": "predicted_df.head()", - "class": "Model Evaluation", - "desc": "This code snippet displays the first five rows of the DataFrame containing the predicted labels to provide an initial view of the prediction results.", + "cell_id": 39, + "code": "# Setting up the model\n\nn_latent_factors = 100\nmodel_glove = 
Sequential()\nmodel_glove.add(layers.Embedding(num_words, n_latent_factors, weights = [embedding_matrix], \n input_length = max_len, trainable=True))\nmodel_glove.add(layers.Flatten())\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dropout(0.5))\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dense(1, activation='sigmoid'))\nmodel_glove.summary()", + "class": "Model Training", + "desc": "This code snippet sets up a Sequential neural network model in Keras with an embedding layer initialized with the GloVe embedding matrix, followed by a Flatten layer, a Dropout layer, and a Dense output layer with sigmoid activation, and then outputs the model summary.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9997552 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9871293 }, "cluster": 0 }, { - "cell_id": 17, - "code": "predicted", - "class": "Model Evaluation", - "desc": "This code snippet, when executed, will display the array of predicted labels generated by the model, providing a complete view of the prediction results.", + "cell_id": 40, + "code": "model_glove.compile(optimizer = optimizers.RMSprop(lr=0.001),\n loss = losses.binary_crossentropy,\n metrics = [metrics.binary_accuracy])\n\nhistory = model_glove.fit(X_train_seq,\n y_train,\n epochs=20,\n batch_size=512,\n validation_data=(X_val_seq, y_val))", + "class": "Model Training", + "desc": "This code snippet compiles the GloVe-based neural network model with the RMSprop optimizer and binary crossentropy loss function, then trains it on the sequence-encoded training data for 20 epochs with a batch size of 512, using the sequence-encoded validation data for validation, and stores the training history.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99976486 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.99457717 }, - "cluster": 0 + "cluster": 3 }, { - "cell_id": 9, - "code": "\npunctuations = string.punctuation \nnlp = spacy.load('en_core_web_sm') #, exclude=[\"tok2vec\", \"parser\", \"ner\", \"attribute_ruler\"]\nstop_words = spacy.lang.en.stop_words.STOP_WORDS\nparser = English() # Load English tokenizer, tagger, parser, NER and word vectors\n\ndef spacy_tokenizer(sentence):\n mytokens = str(sentence)\n mytokens = nlp(mytokens)\n #mytokens = parser(sentence) \n mytokens = [ word.lemma_.lower().strip() if word.lemma_ != \"-PRON-\" else word.lower_ for word in mytokens ] \n mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ] \n return mytokens # return preprocessed list of tokens\n\nclass predictors(TransformerMixin):\n def transform(self, X, **transform_params):\n return [clean_text(text) for text in X]\n\n def fit(self, X, y=None, **fit_params):\n return self\n\n def get_params(self, deep=True):\n return {}\n\ndef clean_text(text):\n text = text.strip().lower()\n #text = re.sub(r'[^A-Za-z0-9 ]+', '', text)\n return text #.split()\n\nbow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1), stop_words = None)\ntfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer, stop_words = None) #token_pattern='(?u)\\b\\w\\w+\\b', stop_words = 'english'\n\nfrom sklearn.linear_model import 
LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier\nclassifier = LogisticRegression()\n# classifier = RandomForestClassifier()\n\npipe = Pipeline([(\"cleaner\", predictors()),\n ('vectorizer', tfidf_vector),\n ('classifier', classifier)])\n\n#clean_text(X_train[1773])\n#spacy_tokenizer(X_train[1773])\n#mytokens = parser(X_train[1773])\n\n# mytokens = str(X_train[1773])\n# #mytokens = re.sub(r'[^A-Za-z0-9 ]+', '', mytokens)\n# #mytokens = parser(mytokens)\n# mytokens = nlp(mytokens)\n# mytokens = [ word.lemma_.lower().strip() if word.lemma_ != \"-PRON-\" else word.lower_ for word in mytokens ]\n# print(mytokens)", + "cell_id": 42, + "code": "max_len = 15\nX_train_seq = tokenizer.texts_to_sequences(train_data['prep_text'])\nX_test_seq = tokenizer.texts_to_sequences(test_data['text'])\n\nX_train_seq = pad_sequences(X_train_seq, maxlen = max_len, truncating = 'post', padding = 'post')\nX_test_seq = pad_sequences(X_test_seq, maxlen = max_len, truncating = 'post', padding = 'post')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_seq.shape}\")\nprint(f\"X_test shape: {X_test_seq.shape}\")\nprint(f\"y_train shape: {y_train.shape}\\n\")\n\n# Setting up the model\n\nn_latent_factors = 100\nmodel_glove = Sequential()\nmodel_glove.add(layers.Embedding(num_words, n_latent_factors, weights = [embedding_matrix], \n input_length = max_len, trainable=True))\nmodel_glove.add(layers.Flatten())\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dropout(0.5))\n# model_glove.add(layers.Dense(16, activation='relu'))\nmodel_glove.add(layers.Dense(1, activation='sigmoid'))\nprint(f\"{model_glove.summary()}\\n\")\n\n\nmodel_glove.compile(optimizer = optimizers.RMSprop(lr=0.001),\n loss = losses.binary_crossentropy,\n metrics = [metrics.binary_accuracy])\n\nhistory = model_glove.fit(X_train_seq,\n y_train,\n epochs=20,\n batch_size=512)", "class": "Model Training", - "desc": "This code snippet defines the necessary functions and classes for text preprocessing and tokenization, creates vectorizers for converting text to numerical features, and sets up a logistic regression model wrapped in a pipeline for training.", + "desc": "This code snippet prepares the sequence-encoded data, sets up and initializes a GloVe-based neural network model, compiles it with the RMSprop optimizer and binary crossentropy loss function, and trains it on the sequence-encoded training data for 20 epochs with a batch size of 512 while outputting the model summary.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.5153541 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.80148226 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 10, - "code": "pipe.fit(X_train, y_train)", + "cell_id": 43, + "code": "# Setting up the tokenizer\nvocab_size = 1000\ntokenizer = Tokenizer(num_words = vocab_size, oov_token = 'UNK')\ntokenizer.fit_on_texts(list(train_data['text']) + list(test_data['text']))\n\n# Word count representation\nX_train_wc = tokenizer.texts_to_matrix(train_data['text'], mode = 'count')\nX_test_wc = tokenizer.texts_to_matrix(test_data['text'], mode = 'count')\ny_train = np.array(train_data['target']).astype(int)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_test shape: {X_test_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\n\n# Train Validation Split\nX_train_wc, X_val_wc, y_train, y_val = 
train_test_split(X_train_wc, y_train, test_size = 0.2, random_state = 42)\n\nprint(f\"X_train shape: {X_train_wc.shape}\")\nprint(f\"X_val shape: {X_val_wc.shape}\")\nprint(f\"y_train shape: {y_train.shape}\")\nprint(f\"y_val shape: {y_val.shape}\\n\")\n\n# Setting up the model\nmodel = setup_model()\n\n# Fitting the model on un-preprocessed text\nhistory = model.fit(X_train_wc, y_train, epochs = 20, batch_size = 512, validation_data = (X_val_wc, y_val))", "class": "Model Training", - "desc": "This code snippet fits the defined pipeline, which includes text cleaning, vectorization, and the logistic regression classifier, to the training data.", + "desc": "This code snippet sets up a Keras Tokenizer for raw text data, converts the texts to word count matrices, splits the data into training and validation sets, initializes a new neural network model, and trains the model on the un-preprocessed text data for 20 epochs with a batch size of 512 while including validation.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.99970585 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.59386915 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 13, - "code": "predicted_df.plot.hist()", + "cell_id": 5, + "code": "sns.countplot(train_data['target'])", "class": "Visualization", - "desc": "This code snippet generates a histogram plot from the DataFrame of predicted labels to visualize the distribution of the predictions.", + "desc": "This code snippet generates a count plot using Seaborn to visualize the distribution of the 'target' variable in the training data.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.9975446 - }, - "cluster": 0 - }], - "notebook_id": 19, - "notebook_name": "nlp-starter-spacy-binary-text-classifier" - }, { - "cells": [{ - "cell_id": 39, - "code": "submission = pd.DataFrame({'id':test['id'].values.tolist(),'target':predictions})\nsubmission.to_csv('submission.csv',index=False)", - "class": "Data Export", - "desc": "The code creates a DataFrame with the test IDs and corresponding predictions, and then exports this DataFrame to a CSV file named 'submission.csv' without including the index.", - "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9992361 + "predicted_subclass_probability": 0.99602413 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 1, - "code": "train = pd.read_csv(\"../input/nlp-getting-started/train.csv\")\ntest = pd.read_csv('../input/nlp-getting-started/test.csv')", - "class": "Data Extraction", - "desc": "The code reads the training and test datasets from CSV files into pandas DataFrame objects named 'train' and 'test'. 
", + "cell_id": 15, + "code": "import matplotlib.pyplot as plt\n\ndef plot_history(history): \n\n history_dict = history.history\n history_dict.keys()\n\n\n acc = history.history['binary_accuracy']\n val_acc = history.history['val_binary_accuracy']\n loss = history.history['loss']\n val_loss = history.history['val_loss']\n\n epochs = range(1, len(acc) + 1)\n\n # \"bo\" is for \"blue dot\"\n plt.plot(epochs, loss, 'bo', label='Training loss')\n # b is for \"solid blue line\"\n plt.plot(epochs, val_loss, 'b', label='Validation loss')\n plt.title('Training and validation loss')\n plt.xlabel('Epochs')\n plt.ylabel('Loss')\n plt.legend()\n\n plt.show()\n \nplot_history(history)", + "class": "Visualization", + "desc": "This code snippet defines a function that plots the training and validation loss over epochs using Matplotlib and calls this function to visualize the history obtained from model training.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9997564 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.9967168 }, - "cluster": 3 + "cluster": -1 }, { - "cell_id": 5, - "code": "string.punctuation", - "class": "Data Transform", - "desc": "The code snippet likely refers to the list of all punctuation characters present in the `string` module, which is typically used for text cleaning and preprocessing.", + "cell_id": 21, + "code": "plot_history(history)", + "class": "Visualization", + "desc": "This code snippet calls the previously defined `plot_history` function to visualize the training and validation loss over epochs for the recent model training history.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.73409814 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9654706 }, "cluster": -1 }, { - "cell_id": 6, - "code": "def remove_URL(text):\n url = re.compile(r\"https?://\\S+|www\\.\\S+\")\n return url.sub(r\"\", text)\n\ndef remove_punct(text):\n translator = str.maketrans(\"\", \"\", string.punctuation)\n return text.translate(translator)", - "class": "Data Transform", - "desc": "The code defines two functions for text preprocessing: `remove_URL` to remove URLs from text using regular expressions, and `remove_punct` to remove punctuation using string translation.", + "cell_id": 26, + "code": "plot_history(history)", + "class": "Visualization", + "desc": "This code snippet calls the previously defined `plot_history` function to visualize the training and validation loss over epochs for the recent model training history.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.7244959 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9654706 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 7, - "code": "#regex pattern to remove links\npattern = re.compile(r\"https?://(\\S+|www)\\.\\S+\")\n#for train\nfor t in train.text:\n matches = pattern.findall(t)\n for match in matches:\n print(t)\n print('After Transformation:')\n print(pattern.sub(r\"\", t))\n if len(matches) > 0:\n break", - "class": "Data Transform", - "desc": "The code snippet defines a regular expression pattern to remove links from text and applies it to the 'text' column in 
the 'train' DataFrame, printing the original and transformed text for inspection.", + "cell_id": 32, + "code": "plot_history(history)", + "class": "Visualization", + "desc": "This code snippet calls the previously defined `plot_history` function to visualize the training and validation loss over epochs for the recent model training history.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9533664 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9654706 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 8, - "code": "#for test:\nfor t in test.text:\n matches = pattern.findall(t)\n for match in matches:\n print(t)\n print('After Transformation:')\n print(pattern.sub(r\"\", t))\n if len(matches) > 0:\n break", - "class": "Data Transform", - "desc": "The code applies the same regular expression pattern to remove links from the 'text' column in the 'test' DataFrame, printing the original and transformed text for verification.", + "cell_id": 33, + "code": "plt.hist(list(train_data['prep_text'].str.split().map(lambda x: len(x))))", + "class": "Visualization", + "desc": "This code snippet generates a histogram using Matplotlib to visualize the distribution of word counts in the cleaned text entries of the training data.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9523689 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9977956 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 9, - "code": "#preprocess data frames:\n#train\ntrain[\"text\"] = train.text.map(remove_URL) \ntrain[\"text\"] = train.text.map(remove_punct)\n#test\ntest[\"text\"] = test.text.map(remove_URL) \ntest[\"text\"] = test.text.map(remove_punct)", - "class": "Data Transform", - "desc": "This code applies the `remove_URL` and `remove_punct` functions to the 'text' column in both 'train' and 'test' DataFrames to preprocess the text data by removing URLs and punctuation.", + "cell_id": 41, + "code": "plot_history(history)", + "class": "Visualization", + "desc": "This code snippet calls the previously defined `plot_history` function to visualize the training and validation loss over epochs for the recent model training history.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.90332294 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9654706 }, - "cluster": 1 - }, { - "cell_id": 10, - "code": "# remove stopwords\nnltk.download('stopwords')\n\nstop = set(stopwords.words(\"english\"))\n\ndef remove_stopwords(text):\n filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]\n return \" \".join(filtered_words)", - "class": "Data Transform", - "desc": "The code downloads the list of English stopwords from the NLTK library and defines a function `remove_stopwords` to remove these stopwords from text by filtering out common words, thus aiding in text preprocessing.", + "cluster": -1 + }], + "notebook_id": 18, + "notebook_name": "baseline-nlp.ipynb" + }, { + "cells": [{ + "cell_id": 77, + "code": "pred_df.to_csv(\"/kaggle/working/nlp_disaster_tweets_tfidf_lr_submission.csv\", index=False)", + "class": "Data Export", + "desc": "This code exports the DataFrame `pred_df` 
to a CSV file named \"nlp_disaster_tweets_tfidf_lr_submission.csv\" in the specified directory without including the index.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9945669 + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.99896014 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 11, - "code": "stop", - "class": "Data Transform", - "desc": "The code likely refers to the set of English stopwords obtained from the NLTK library, which is used for filtering out common words in text preprocessing.", + "cell_id": 2, + "code": "df = pd.read_csv(\"../input/nlp-getting-started/train.csv\", sep=\",\")", + "class": "Data Extraction", + "desc": "This code reads a CSV file named \"train.csv\" from the specified directory using pandas and stores it in a DataFrame named `df`.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9975351 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.9997379 }, "cluster": 0 - }, { - "cell_id": 12, - "code": "#train\ntrain[\"text\"] = train.text.map(remove_stopwords)\n#test\ntest[\"text\"] = test.text.map(remove_stopwords)", - "class": "Data Transform", - "desc": "The code applies the `remove_stopwords` function to the 'text' column in both 'train' and 'test' DataFrames to preprocess the text by removing stopwords.", - "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.76800144 - }, - "cluster": 1 - }, { - "cell_id": 19, - "code": "# Split dataset into training and validation set\nX = train.text\ny = train.target\ntrain_sentences, val_sentences , train_labels, val_labels = train_test_split(X, y, test_size=0.2)", - "class": "Data Transform", - "desc": "The code splits the 'text' and 'target' columns of the 'train' DataFrame into training and validation sets using an 80-20 split ratio, with `train_sentences` and `val_sentences` containing the text data and `train_labels` and `val_labels` containing the labels.", + }, { + "cell_id": 22, + "code": "train_df, test_df = train_test_split(df, train_size=0.9)", + "class": "Data Extraction", + "desc": "This code uses the `train_test_split` function from scikit-learn to split the DataFrame `df` into `train_df` and `test_df` sets with a 90% training size.", "testing": { "class": "Data_Transform", "subclass": "split", "subclass_id": 13, - "predicted_subclass_probability": 0.9967854 + "predicted_subclass_probability": 0.9968256 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 20, - "code": "#train/val\ntrain_sentences = train_sentences.to_numpy()\ntrain_labels = train_labels.to_numpy()\nval_sentences = val_sentences.to_numpy()\nval_labels = val_labels.to_numpy()", - "class": "Data Transform", - "desc": "The code converts the training and validation sentences and labels from pandas Series objects to NumPy arrays, facilitating their use in machine learning models.", + "cell_id": 68, + "code": "pred_df = pd.read_csv(\"../input/nlp-getting-started/test.csv\", sep=\",\")", + "class": "Data Extraction", + "desc": "This code reads a CSV file named \"test.csv\" from the specified directory using pandas and stores it in a DataFrame named `pred_df`.", "testing": { - "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 
16, - "predicted_subclass_probability": 0.9735998 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.999752 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 21, - "code": "#test\ntest_sentences = test.text.to_numpy()", + "cell_id": 8, + "code": "count_class_0, count_class_1 = df[\"target\"].value_counts()", "class": "Data Transform", - "desc": "The code converts the 'text' column of the 'test' DataFrame to a NumPy array, storing it in the `test_sentences` variable for further processing or model predictions.", + "desc": "This code assigns the counts of each unique value in the \"target\" column of the DataFrame `df` to the variables `count_class_0` and `count_class_1`. ", "testing": { - "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 16, - "predicted_subclass_probability": 0.98378074 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.99944013 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 23, - "code": "# Tokenize\n# vectorize a text corpus by turning each text into a sequence of integers\n\ntokenizer = Tokenizer(num_words=num_unique_words)\ntokenizer.fit_on_texts(train_sentences) # fit only to training", + "cell_id": 9, + "code": "print(count_class_0, count_class_1)", "class": "Data Transform", - "desc": "The code initializes a `Tokenizer` with the specified number of unique words and fits it on the training sentences to create a vocabulary and prepare the text data for conversion into sequences of integers.", + "desc": "This code prints the count values of `count_class_0` and `count_class_1`, showing the distribution of the target variable.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.6493907 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.988794 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 24, - "code": "# Now each word has unique index\nword_index = tokenizer.word_index\nword_index", + "cell_id": 10, + "code": "class_ratio = count_class_0 / count_class_1", "class": "Data Transform", - "desc": "The code retrieves and outputs the `word_index` dictionary from the tokenizer, which maps each word to a unique integer index based on the training corpus.", + "desc": "This code calculates the ratio between the counts of class 0 and class 1 in the target variable and stores the result in the variable `class_ratio`.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.66235614 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.9978563 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 25, - "code": "#apply on train, validation, and test sentences\n\ntrain_sequences = tokenizer.texts_to_sequences(train_sentences)\nval_sequences = tokenizer.texts_to_sequences(val_sentences)\ntest_sequences = tokenizer.texts_to_sequences(test_sentences)", + "cell_id": 11, + "code": "print(\"{0:.3f}\".format(class_ratio))", "class": "Data Transform", - "desc": "The code applies the tokenizer to convert the training, validation, and test sentences into sequences of integers where each integer represents a word's unique index.", + "desc": "This code prints the `class_ratio` with three decimal places, 
providing a formatted representation of the class distribution ratio.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.5915242 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.98255676 }, "cluster": 1 }, { - "cell_id": 27, - "code": "# Pad the sequences to have the same length\nmax_length = 15 #arbitrary number\n\ntrain_padded = pad_sequences(train_sequences, maxlen=max_length, padding=\"post\", truncating=\"post\") #post-> 0\nval_padded = pad_sequences(val_sequences, maxlen=max_length, padding=\"post\", truncating=\"post\")\ntest_padded = pad_sequences(test_sequences, maxlen=max_length, padding=\"post\", truncating=\"post\")", + "cell_id": 17, + "code": "# Since classes are imbalanced, we need to resample the dataframe\n# First divide by class\ndf_class_0 = df[df[\"target\"] == 0]\ndf_class_1 = df[df[\"target\"] == 1]", "class": "Data Transform", - "desc": "The code pads and truncates the sequences of integers for the training, validation, and test sets to ensure they all have a uniform length of 15, adding padding zeros at the end if necessary.", + "desc": "This code divides the DataFrame `df` into two separate DataFrames, `df_class_0` and `df_class_1`, based on the values in the \"target\" column to address class imbalance.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.717097 + "subclass": "filter", + "subclass_id": 14, + "predicted_subclass_probability": 0.45292664 }, - "cluster": 1 + "cluster": 7 }, { - "cell_id": 31, - "code": "# flip (key, value)\nreverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])", + "cell_id": 18, + "code": "df_class_0", "class": "Data Transform", - "desc": "The code creates a `reverse_word_index` dictionary by flipping the keys and values of the `word_index`, mapping integer indices back to their corresponding words.", + "desc": "This code outputs the DataFrame `df_class_0`, which consists of all rows from `df` where the \"target\" column is 0.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.9888599 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9996333 }, "cluster": 1 }, { - "cell_id": 33, - "code": "#decoding\ndef decode(sequence):\n return \" \".join([reverse_word_index.get(idx, \"?\") for idx in sequence])", + "cell_id": 19, + "code": "df_class_1", "class": "Data Transform", - "desc": "The code defines a `decode` function that converts a sequence of integers back into a string of words using the `reverse_word_index` dictionary, aiding in the interpretation of tokenized sequences.", - "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.95032895 - }, - "cluster": 1 - }, { - "cell_id": 2, - "code": "train.head()", - "class": "Exploratory Data Analysis", - "desc": "The code displays the first few rows of the 'train' DataFrame to give an overview of the training data.", + "desc": "This code outputs the DataFrame `df_class_1`, which consists of all rows from `df` where the \"target\" column is 1.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997507 - }, - "cluster": 12 - 
}, { - "cell_id": 3, - "code": "train.shape", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the dimensions (number of rows and columns) of the 'train' DataFrame, providing a quick sense of the dataset's size.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.99950194 + "predicted_subclass_probability": 0.99961746 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 4, - "code": "print((train.target == 1).sum()) # Disaster\nprint((train.target == 0).sum()) # No Disaster", - "class": "Exploratory Data Analysis", - "desc": "The code prints the counts of rows in the 'train' DataFrame where the target column is 1 (indicating a disaster) and 0 (indicating no disaster), helping to understand the class distribution.", + "cell_id": 20, + "code": "# Second resample - try both under- and over-sampling\ndf_class_0_under = df_class_0.sample(count_class_1) # undersampling by loosing objects\ndf_under = pd.concat([df_class_0_under, df_class_1], axis=0)\n\ndf_class_1_over = df_class_1.sample(count_class_0, replace=True) # oversampling by duplicaitng objects\ndf_over = pd.concat([df_class_0, df_class_1_over], axis=0)\n\n#df = df_under\n#df = df_over\n\n# Looks like oversampling works better since we use more objects - more training cases", + "class": "Data Transform", + "desc": "This code performs both undersampling and oversampling to balance the classes in the dataset: `df_class_0_under` contains a random sample of `df_class_0` with a size equal to `count_class_1`, and `df_class_1_over` contains a random sample of `df_class_1` with replacement to match `count_class_0`; `df_under` and `df_over` are combined DataFrames using these samples to create balanced datasets.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.99325174 + "class": "Data_Transform", + "subclass": "concatenate", + "subclass_id": 11, + "predicted_subclass_probability": 0.7727316 }, - "cluster": 7 + "cluster": 8 }, { - "cell_id": 13, - "code": "#Check\ntrain.text", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the 'text' column of the 'train' DataFrame to inspect and verify the preprocessed text data.", + "cell_id": 26, + "code": "#eng_words = words.words(\"en\")", + "class": "Data Transform", + "desc": "This line of code (which is currently commented out) fetches a list of valid English words from the nltk corpus to potentially be used for text processing tasks such as filtering or validation.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.53716487 + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9975278 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 14, - "code": "# Count unique words\ndef counter_word(text_col):\n count = Counter()\n for text in text_col.values:\n for word in text.split():\n count[word] += 1\n return count\n\n\ncounter = counter_word(train.text)", - "class": "Exploratory Data Analysis", - "desc": "The code defines a function `counter_word` to count the frequency of unique words in a given text column and then applies it to the 'text' column of the 'train' DataFrame, storing the result in the variable `counter`.", + "cell_id": 27, + "code": "#print(\"wort\" in eng_words)", + "class": "Data Transform", + "desc": "This line of code (which is currently 
commented out) checks if the string \"wort\" exists in the list `eng_words` and prints the result, possibly for validation or testing purposes in text processing.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.94201726 + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9939898 }, - "cluster": 6 + "cluster": 1 }, { - "cell_id": 15, - "code": "len(counter)", - "class": "Exploratory Data Analysis", - "desc": "The code calculates and outputs the number of unique words in the 'text' column of the 'train' DataFrame by finding the length of the `counter` dictionary.", + "cell_id": 28, + "code": "snowball = SnowballStemmer(language=\"english\")", + "class": "Data Transform", + "desc": "This code initializes a SnowballStemmer object for the English language, which can be used for stemming words during text preprocessing.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.98219407 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9888525 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 16, - "code": "# counter", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the `counter` dictionary, which contains the frequency of each unique word in the 'text' column of the 'train' DataFrame, providing insight into word distribution.", + "cell_id": 29, + "code": "def tokenize_sentence(sentence: str, remove_stop_words: bool = True):\n '''Tokenize sentences with nltk dropping non-english words and punctuation and optionally stop words'''\n tokens = word_tokenize(sentence, language=\"english\")\n #tokens = [i for i in tokens if i in eng_words and i not in string.punctuation]\n tokens = [i for i in tokens if i not in string.punctuation]\n if remove_stop_words:\n tokens = [i for i in tokens if i not in stopwords.words(\"english\")]\n tokens = [snowball.stem(i) for i in tokens]\n return tokens", + "class": "Data Transform", + "desc": "This code defines a function called `tokenize_sentence` that tokenizes an input `sentence` using NLTK, optionally removes stop words, filters out punctuation, and applies stemming using the SnowballStemmer.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "commented", - "subclass_id": 76, - "predicted_subclass_probability": 0.9903482 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.99213856 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 17, - "code": "counter.most_common(5)", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the five most common words in the 'text' column of the 'train' DataFrame by accessing the `most_common` method of the `counter` dictionary, providing insight into frequently occurring terms.", + "cell_id": 30, + "code": "tokenize_sentence(\"the sentence and asdf fy krkr\", False)", + "class": "Data Transform", + "desc": "This code calls the `tokenize_sentence` function with the input string \"the sentence and asdf fy krkr\" and the `remove_stop_words` parameter set to `False`, which will tokenize the sentence, remove punctuation, and apply stemming without removing stop words.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9768316 + "class": 
"Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9920244 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 18, - "code": "num_unique_words = len(counter)\nnum_unique_words", - "class": "Exploratory Data Analysis", - "desc": "The code assigns the number of unique words in the 'text' column of the 'train' DataFrame to the variable `num_unique_words` and then outputs this value, summarizing the vocabulary size of the dataset.", + "cell_id": 31, + "code": "vectorizer_params = {\n #\"max_features\": 500,\n #\"max_features\": None,\n #\"tokenizer\": lambda x: tokenize_sentence(x, remove_stop_words=False),\n #\"tokenizer\": None,\n #\"ngram_range\": (1, 100),\n #\"min_df\": 0,\n #\"max_df\": 100,\n #\"use_idf\": False,\n #\"decode_error\": \"replace\",\n #\"sublinear_tf\": True,\n #\"analyzer\": \"char\"\n}", + "class": "Data Transform", + "desc": "This code defines a dictionary named `vectorizer_params`, which includes various commented-out parameters intended for configuring a `TfidfVectorizer` for text processing tasks.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_unique_values", - "subclass_id": 54, - "predicted_subclass_probability": 0.9817012 + "class": "Model_Train", + "subclass": "init_hyperparams", + "subclass_id": 59, + "predicted_subclass_probability": 0.91740084 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 22, - "code": "train_sentences.shape, val_sentences.shape", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the shapes of the `train_sentences` and `val_sentences` NumPy arrays, providing an overview of the number of samples in the training and validation sets.", + "cell_id": 32, + "code": "vectorizer = TfidfVectorizer(**vectorizer_params)", + "class": "Data Transform", + "desc": "This code initializes a `TfidfVectorizer` object with the parameters specified in the `vectorizer_params` dictionary, setting up text vectorization for subsequent machine learning tasks.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9996413 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9972772 }, - "cluster": 1 - }, { - "cell_id": 26, - "code": "#Check\nprint(train_sentences[10:15])\nprint(train_sequences[10:15])", - "class": "Exploratory Data Analysis", - "desc": "The code prints a sample of original training sentences and their corresponding sequences of integers, allowing for verification of the tokenization process.", + "cluster": 1 + }, { + "cell_id": 33, + "code": "vectorizer", + "class": "Data Transform", + "desc": "This code outputs the current state of the `vectorizer` object, providing the configuration and parameters used in the `TfidfVectorizer`.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9992624 + "predicted_subclass_probability": 0.9994728 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 28, - "code": "#Check\ntrain_padded.shape, val_padded.shape", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the shapes of the padded training and validation sequences, providing a quick check to confirm that all sequences have been correctly padded to the specified length.", + "cell_id": 34, + "code": "features = vectorizer.fit_transform(train_df[\"text\"])", + "class": "Data Transform", + "desc": "This code 
applies the `fit_transform` method of the `TfidfVectorizer` to the \"text\" column of `train_df`, creating a matrix of TF-IDF features for the training data.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_shape", - "subclass_id": 58, - "predicted_subclass_probability": 0.9994609 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.99820554 }, "cluster": 1 }, { - "cell_id": 29, - "code": "#Check\ntrain_padded[10]", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the padded sequence of integers for a specific training sample (at index 10), allowing for inspection of the padding process and verification of the sequence length.", + "cell_id": 36, + "code": "feature_names = vectorizer.get_feature_names()", + "class": "Data Transform", + "desc": "This code retrieves the feature names (i.e., the terms or tokens) from the fitted `TfidfVectorizer` and stores them in the `feature_names` list.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9994702 + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.54732066 }, - "cluster": -1 + "cluster": 7 }, { - "cell_id": 30, - "code": "#Check\nprint(train_sentences[10])\nprint(train_sequences[10])\nprint(train_padded[10])", - "class": "Exploratory Data Analysis", - "desc": "The code prints the original training sentence, its tokenized sequence, and the corresponding padded sequence for a specific index (10), allowing for thorough verification of the tokenization and padding process.", + "cell_id": 39, + "code": "X_train = train_df[\"text\"]", + "class": "Data Transform", + "desc": "This code stores the \"text\" column of `train_df` in the variable `X_train`, representing the input features for the training dataset.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99758375 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.9986425 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 32, - "code": "#Check\nreverse_word_index", - "class": "Exploratory Data Analysis", - "desc": "The code outputs the `reverse_word_index` dictionary, allowing verification that each integer index correctly maps back to its corresponding word.", + "cell_id": 40, + "code": "X_train", + "class": "Data Transform", + "desc": "This code outputs the `X_train` variable, displaying the \"text\" column of the training DataFrame.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.98285496 + "predicted_subclass_probability": 0.99974626 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 34, - "code": "decoded_text = decode(train_sequences[10])\n#Check\nprint(train_sequences[10])\nprint(decoded_text)", - "class": "Exploratory Data Analysis", - "desc": "The code decodes a specific tokenized training sequence back into text using the `decode` function and prints both the sequence and the decoded text for verification.", + "cell_id": 41, + "code": "y_train = train_df[\"target\"]", + "class": "Data Transform", + "desc": "This code extracts the \"target\" column from `train_df` and stores it in the variable `y_train`, representing the target labels for the training dataset.", "testing": { "class": 
"Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.42135346 - }, - "cluster": -1 - }, { - "cell_id": 0, - "code": "import numpy as np \nimport pandas as pd \nimport re\nimport string\nimport nltk\nfrom nltk.corpus import stopwords\nfrom collections import Counter\nfrom sklearn.model_selection import train_test_split\nfrom tensorflow.keras.preprocessing.text import Tokenizer\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\nfrom tensorflow import keras\nfrom tensorflow.keras import layers", - "class": "Imports and Environment", - "desc": "The code imports various libraries and modules necessary for data manipulation (numpy, pandas), text processing (re, string, nltk), machine learning (sklearn), and deep learning (tensorflow and keras).", - "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9993357 + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.9991716 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 38, - "code": "predictions = model.predict(test_padded)\npredictions = [1 if p > 0.5 else 0 for p in predictions]", - "class": "Model Evaluation", - "desc": "The code generates predictions from the LSTM model on the padded test data, and then converts these predictions into binary class labels based on a threshold of 0.5.", + "cell_id": 42, + "code": "y_train", + "class": "Data Transform", + "desc": "This code outputs the `y_train` variable, displaying the target labels of the training DataFrame.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.99410737 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99974114 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 35, - "code": "# Create LSTM model\n\n# Embedding: Turns positive integers (indexes) into dense vectors of fixed size.\n\nmodel = keras.models.Sequential()\nmodel.add(layers.Embedding(num_unique_words, 100, input_length=max_length))\n\nmodel.add(layers.LSTM(32, dropout=0.25))\nmodel.add(layers.Dense(1, activation=\"sigmoid\"))\n\nmodel.summary()", - "class": "Model Training", - "desc": "The code creates and summarizes a Sequential LSTM model architecture, including an embedding layer to convert word indices to dense vectors, an LSTM layer with dropout for sequence processing, and a dense output layer with a sigmoid activation function for binary classification.", + "cell_id": 43, + "code": "lr_model_params = {\n #\"class_weight\": \"balanced\",\n #\"class_weight\": None,\n #\"class_weight\": {1: 1, 0: 1/class_ratio},\n #\"random_state\": 0,\n #\"Cs\": 5,\n #\"penalty\": \"none\",\n #\"penalty\": \"elasticnet\",\n \"solver\": \"liblinear\",\n #\"l1_ratio\": 0.5,\n #\"max_iter\": 10000,\n #\"cv\": 10\n}", + "class": "Data Transform", + "desc": "This code defines a dictionary named `lr_model_params` with various commented-out potential parameters and the \"solver\" parameter set to \"liblinear\", intended for configuring a Logistic Regression model.", "testing": { "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.984528 + "subclass": "init_hyperparams", + "subclass_id": 59, + "predicted_subclass_probability": 0.98202676 }, "cluster": 0 }, { - "cell_id": 36, - "code": "loss = 
keras.losses.BinaryCrossentropy(from_logits=False)\noptim = keras.optimizers.Adam(learning_rate=0.001)\nmetrics = [\"accuracy\"]\n\nmodel.compile(loss=loss, optimizer=optim, metrics=metrics)", - "class": "Model Training", - "desc": "The code compiles the LSTM model with binary cross-entropy as the loss function, Adam as the optimizer, and accuracy as the evaluation metric, preparing the model for training.", + "cell_id": 59, + "code": "#y_test = y_train\ny_test = test_df[\"target\"]", + "class": "Data Transform", + "desc": "This code assigns the \"target\" column from `test_df` to the variable `y_test`, representing the target labels for the test dataset.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9955751 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.99827564 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 37, - "code": "model.fit(train_padded, train_labels, epochs=25, validation_data=(val_padded, val_labels), verbose=2)", - "class": "Model Training", - "desc": "The code trains the LSTM model on the padded training data for 25 epochs and validates it on the padded validation data, outputting detailed progress information for each epoch.", - "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9996841 - }, - "cluster": 0 - }], - "notebook_id": 20, - "notebook_name": "nlp-tokenization-embedding-lstm" - }, { - "cells": [{ - "cell_id": 31, - "code": "def Kaggle_submission(file_name,model,test_data,ids_list):\n #if TARGET in test_data.columns:\n # test_data.drop([TARGET],axis=1,inplace=True)\n #test_pred=model.predict(test_data)[:,1]\n test_pred=model.predict(test_data)\n predictions = []\n predictions = oc.adjusted_classes(test_pred, 0.5)\n\n submit=pd.DataFrame()\n submit['id'] = ids_list\n submit['target'] = predictions\n submit.to_csv(file_name,index=False)\n return submit", - "class": "Data Export", - "desc": "This cell defines a function for generating a Kaggle submission file by using the trained model to make predictions on the test data, adjusting the predicted classes based on a threshold, and saving the results along with the IDs to a CSV file.", + "cell_id": 71, + "code": "pred_df[\"target\"] = model_pipeline.predict(pred_df[\"text\"])", + "class": "Data Transform", + "desc": "This code uses the `model_pipeline` to predict the target labels for the text data in `pred_df` and assigns these predictions to a new column named \"target\" in `pred_df`.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9981369 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.9946057 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 45, - "code": "submit=pd.DataFrame()\nsubmit['id'] = test_df['id'].tolist()\nsubmit['target'] = test_pred_BERT_int", - "class": "Data Export", - "desc": "This cell creates a submission DataFrame containing the IDs from the test DataFrame and the predicted integer class labels from the BERT model.", + "cell_id": 75, + "code": "pred_df.drop(columns=[\"keyword\", \"location\", \"text\"], inplace=True)", + "class": "Data Transform", + "desc": "This code removes the \"keyword\", \"location\", and \"text\" columns from the DataFrame `pred_df`, retaining only necessary columns, such as 
\"target\".", "testing": { "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.90852714 + "subclass": "drop_column", + "subclass_id": 10, + "predicted_subclass_probability": 0.99920505 }, - "cluster": -1 + "cluster": 7 }, { - "cell_id": 46, - "code": "submit.to_csv('BERT_model_v3.csv',index=False)", - "class": "Data Export", - "desc": "This cell saves the submission DataFrame to a CSV file named 'BERT_model_v3.csv' for upload and evaluation in a Kaggle competition.", + "cell_id": 3, + "code": "df.shape", + "class": "Exploratory Data Analysis", + "desc": "This code outputs the dimensions (number of rows and columns) of the DataFrame `df`.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9991627 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9995491 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 47, - "code": "submit.head(3)", - "class": "Data Export", - "desc": "This cell displays the first three rows of the submission DataFrame to verify the format and content before final submission.", + "cell_id": 4, + "code": "df.head(20)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first 20 rows of the DataFrame `df` to provide an initial view of the dataset.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997651 + "predicted_subclass_probability": 0.9997683 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 2, - "code": "train_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")", - "class": "Data Extraction", - "desc": "This cell reads the training and testing datasets from CSV files into Pandas DataFrames.", + "cell_id": 5, + "code": "df.head(20)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first 20 rows of the DataFrame `df` to provide an initial view of the dataset.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99975425 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997683 }, "cluster": 2 }, { "cell_id": 6, - "code": "def wordcount(x):\n length = len(str(x).split())\n return length\ndef charcount(x):\n s = x.split()\n x = ''.join(s)\n return len(x)\n\ndef hashtag_count(x):\n l = len([t for t in x.split() if t.startswith('#')])\n return l\n\ndef mentions_count(x):\n l = len([t for t in x.split() if t.startswith('@')])\n return l\n\n\ntrain_df['char_count'] = train_df['text'].apply(lambda x: charcount(x))\ntrain_df['word_count'] = train_df['text'].apply(lambda x: wordcount(x))\ntrain_df['hashtag_count'] = train_df['text'].apply(lambda x: hashtag_count(x))\ntrain_df['mention_count'] = train_df['text'].apply(lambda x: mentions_count(x))\ntrain_df['length']=train_df['text'].apply(len)\n\ntest_df['char_count'] = test_df['text'].apply(lambda x: charcount(x))\ntest_df['word_count'] = test_df['text'].apply(lambda x: wordcount(x))\ntest_df['hashtag_count'] = test_df['text'].apply(lambda x: hashtag_count(x))\ntest_df['mention_count'] = test_df['text'].apply(lambda x: mentions_count(x))\ntest_df['length']=test_df['text'].apply(len)\n\ntrain_df.head(2)", - "class": "Data Transform", 
- "desc": "This cell defines functions to compute various textual features and applies these functions to the text columns in the training and testing DataFrames, adding new columns for character count, word count, hashtag count, mention count, and text length.", + "code": "df.tail(20)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the last 20 rows of the DataFrame `df` to examine the end of the dataset.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9946966 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99977094 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 11, - "code": "# Taken from - Craig Thomas https://www.kaggle.com/craigmthomas/logistic-regression-lightgbm-fe\ntrain_df.drop(\n [\n 6449, 7034, 3589, 3591, 3597, 3600, 3603, 3604, 3610, 3613, 3614, 119, 106, 115,\n 2666, 2679, 1356, 7609, 3382, 1335, 2655, 2674, 1343, 4291, 4303, 1345, 48, 3374,\n 7600, 164, 5292, 2352, 4308, 4306, 4310, 1332, 1156, 7610, 2441, 2449, 2454, 2477,\n 2452, 2456, 3390, 7611, 6656, 1360, 5771, 4351, 5073, 4601, 5665, 7135, 5720, 5723,\n 5734, 1623, 7533, 7537, 7026, 4834, 4631, 3461, 6366, 6373, 6377, 6378, 6392, 2828,\n 2841, 1725, 3795, 1251, 7607\n ], inplace=True\n)\n\ntrain_df.drop(\n [\n 4290, 4299, 4312, 4221, 4239, 4244, 2830, 2831, 2832, 2833, 4597, 4605, 4618, 4232, 4235, 3240,\n 3243, 3248, 3251, 3261, 3266, 4285, 4305, 4313, 1214, 1365, 6614, 6616, 1197, 1331, 4379, 4381,\n 4284, 4286, 4292, 4304, 4309, 4318, 610, 624, 630, 634, 3985, 4013, 4019, 1221, 1349, 6091, 6094, \n 6103, 6123, 5620, 5641\n ], inplace=True\n)", - "class": "Data Transform", - "desc": "This cell removes rows from the training DataFrame corresponding to specific index values to clean the data.", + "cell_id": 7, + "code": "df[\"target\"].value_counts()", + "class": "Exploratory Data Analysis", + "desc": "This code counts the occurrences of each unique value in the \"target\" column of the DataFrame `df` to provide insight into the distribution of the target variable.", "testing": { - "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.9402679 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9994948 }, - "cluster": 1 + "cluster": 4 + }, { + "cell_id": 12, + "code": "df.head(20)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first 20 rows of the DataFrame `df` to provide an initial view of the dataset.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997683 + }, + "cluster": 2 }, { "cell_id": 13, - "code": "def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):\n \n text = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\n lst_text = text.split()\n if lst_stopwords is not None:\n lst_text = [word for word in lst_text if word not in \n lst_stopwords]\n \n ## Stemming (remove -ing, -ly, ...)\n if flg_stemm == True:\n ps = nltk.stem.porter.PorterStemmer()\n lst_text = [ps.stem(word) for word in lst_text]\n\n if flg_lemm == True:\n lem = nltk.stem.wordnet.WordNetLemmatizer()\n lst_text = [lem.lemmatize(word) for word in lst_text]\n \n \n ## back to string from list\n text = \" \".join(lst_text)\n return text", - "class": "Data Transform", - 
"desc": "This cell defines a function for preprocessing text, including steps for lowercasing, removing punctuation, stopword removal, and optionally applying stemming or lemmatization.", + "code": "df.loc[df[\"target\"] == 1].head(10)", + "class": "Exploratory Data Analysis", + "desc": "This code filters the DataFrame `df` to show the first 10 rows where the \"target\" column value is 1, for inspecting instances of this class.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9333879 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99977666 }, - "cluster": 1 + "cluster": -1 }, { "cell_id": 14, - "code": "lst_stopwords = nltk.corpus.stopwords.words(\"english\")\n#lst_stopwords\n", - "class": "Data Transform", - "desc": "This cell creates a list of English stopwords from the NLTK corpus for use in text preprocessing.", + "code": "df.loc[df[\"target\"] == 0].head(10)", + "class": "Exploratory Data Analysis", + "desc": "This code filters the DataFrame `df` to show the first 10 rows where the \"target\" column value is 0, for inspecting instances of this class.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9620199 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.99977714 }, - "cluster": 1 + "cluster": -1 }, { "cell_id": 15, - "code": "contractions = { \n\"ain't\": \"am not\",\n\"aren't\": \"are not\",\n\"can't\": \"cannot\",\n\"can't've\": \"cannot have\",\n\"'cause\": \"because\",\n\"could've\": \"could have\",\n\"couldn't\": \"could not\",\n\"couldn't've\": \"could not have\",\n\"didn't\": \"did not\",\n\"doesn't\": \"does not\",\n\"don't\": \"do not\",\n\"hadn't\": \"had not\",\n\"hadn't've\": \"had not have\",\n\"hasn't\": \"has not\",\n\"haven't\": \"have not\",\n\"he'd\": \"he would\",\n\"he'd've\": \"he would have\",\n\"he'll\": \"he will\",\n\"he'll've\": \"he will have\",\n\"he's\": \"he is\",\n\"how'd\": \"how did\",\n\"how'd'y\": \"how do you\",\n\"how'll\": \"how will\",\n\"how's\": \"how does\",\n\"i'd\": \"i would\",\n\"i'd've\": \"i would have\",\n\"i'll\": \"i will\",\n\"i'll've\": \"i will have\",\n\"i'm\": \"i am\",\n\"i've\": \"i have\",\n\"isn't\": \"is not\",\n\"it'd\": \"it would\",\n\"it'd've\": \"it would have\",\n\"it'll\": \"it will\",\n\"it'll've\": \"it will have\",\n\"it's\": \"it is\",\n\"let's\": \"let us\",\n\"ma'am\": \"madam\",\n\"mayn't\": \"may not\",\n\"might've\": \"might have\",\n\"mightn't\": \"might not\",\n\"mightn't've\": \"might not have\",\n\"must've\": \"must have\",\n\"mustn't\": \"must not\",\n\"mustn't've\": \"must not have\",\n\"needn't\": \"need not\",\n\"needn't've\": \"need not have\",\n\"o'clock\": \"of the clock\",\n\"oughtn't\": \"ought not\",\n\"oughtn't've\": \"ought not have\",\n\"shan't\": \"shall not\",\n\"sha'n't\": \"shall not\",\n\"shan't've\": \"shall not have\",\n\"she'd\": \"she would\",\n\"she'd've\": \"she would have\",\n\"she'll\": \"she will\",\n\"she'll've\": \"she will have\",\n\"she's\": \"she is\",\n\"should've\": \"should have\",\n\"shouldn't\": \"should not\",\n\"shouldn't've\": \"should not have\",\n\"so've\": \"so have\",\n\"so's\": \"so is\",\n\"that'd\": \"that would\",\n\"that'd've\": \"that would have\",\n\"that's\": \"that is\",\n\"there'd\": \"there would\",\n\"there'd've\": \"there 
would have\",\n\"there's\": \"there is\",\n\"they'd\": \"they would\",\n\"they'd've\": \"they would have\",\n\"they'll\": \"they will\",\n\"they'll've\": \"they will have\",\n\"they're\": \"they are\",\n\"they've\": \"they have\",\n\"to've\": \"to have\",\n\"wasn't\": \"was not\",\n\" u \": \" you \",\n\" ur \": \" your \",\n\" n \": \" and \",\n\"won't\": \"would not\",\n'dis': 'this',\n'bak': 'back',\n'brng': 'bring'}\n\ndef cont_to_exp(x):\n if type(x) is str:\n for key in contractions:\n value = contractions[key]\n x = x.replace(key, value)\n return x\n else:\n return x\n \ntrain_df['text_clean'] = train_df['text'].apply(lambda x: cont_to_exp(x))\ntest_df['text_clean'] = test_df['text'].apply(lambda x: cont_to_exp(x))\n\n\ndef remove_emails(x):\n return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\\.[a-z0-9+_-]+)',\"\", x)\n\n\ndef remove_urls(x):\n return re.sub(r'(http|https|ftp|ssh)://([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?', '' , x)\n\ndef remove_rt(x):\n return re.sub(r'\\brt\\b', '', x).strip()\n\ndef remove_special_chars(x):\n x = re.sub(r'[^\\w ]+', \"\", x)\n x = ' '.join(x.split())\n return x\n\n\ndef remove_accented_chars(x):\n x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n return x\n\n\n\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_emails(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_urls(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_rt(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_special_chars(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_accented_chars(x))", - "class": "Data Transform", - "desc": "This cell defines a function to expand contractions and additional functions to clean text by removing emails, URLs, retweets, special characters, and accented characters, and applies these functions to a new 'text_clean' column in the training and testing DataFrames.", + "code": "for c in df[df[\"target\"] == 1][\"text\"].head(10):\n print(c)", + "class": "Exploratory Data Analysis", + "desc": "This code iterates through the first 10 rows of the \"text\" column in the DataFrame `df` where the \"target\" column is 1, printing each text entry for inspection.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.99555004 + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.999645 }, - "cluster": 1 + "cluster": -1 }, { "cell_id": 16, - "code": "train_df[\"text_clean\"] = train_df[\"text_clean\"].apply(lambda x: preprocess_text(x, flg_stemm=True, flg_lemm=False, lst_stopwords=lst_stopwords))\ntrain_df.head()", - "class": "Data Transform", - "desc": "This cell applies the previously defined `preprocess_text` function to the 'text_clean' column in the training DataFrame, incorporating stemming and stopword removal.", + "code": "for c in df[df[\"target\"] == 0][\"text\"].head(10):\n print(c)", + "class": "Exploratory Data Analysis", + "desc": "This code iterates through the first 10 rows of the \"text\" column in the DataFrame `df` where the \"target\" column is 0, printing each text entry for inspection.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9974553 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + 
"predicted_subclass_probability": 0.99963677 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 17, - "code": "vec=TfidfVectorizer(max_features = 10000,ngram_range=(1,4))\nvec.fit(train_df['text_clean'])", - "class": "Data Transform", - "desc": "This cell initializes a TF-IDF Vectorizer with specific parameters for extracting features from text and fits it on the cleaned text data from the training DataFrame.", + "cell_id": 21, + "code": "df[\"target\"].value_counts()", + "class": "Exploratory Data Analysis", + "desc": "This code counts the occurrences of each unique value in the \"target\" column of the DataFrame `df` to provide insight into the distribution of the target variable.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.5606868 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9994948 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 18, - "code": "matrix = vec.transform(train_df['text_clean']).toarray()\nfeatures = vec.get_feature_names()\nmatrix_df = pd.DataFrame(data=matrix, columns=features)\n", - "class": "Data Transform", - "desc": "This cell transforms the cleaned text data into a TF-IDF feature matrix, retrieves the feature names, and then stores the matrix as a DataFrame with the feature names as columns.", + "cell_id": 23, + "code": "test_df.shape", + "class": "Exploratory Data Analysis", + "desc": "This code outputs the dimensions (number of rows and columns) of the DataFrame `test_df` to inspect the size of the test set.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.99546874 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99960893 }, "cluster": 1 }, { - "cell_id": 21, - "code": "matrix_df['length']=train_df['length']\nmatrix_df['char_count']=train_df['char_count']\nmatrix_df['word_count']=train_df['word_count']\nmatrix_df['hashtag_count']=train_df['hashtag_count']\nmatrix_df['mention_count']=train_df['mention_count']\ny=train_df['target']", - "class": "Data Transform", - "desc": "This cell adds additional columns for 'length', 'char_count', 'word_count', 'hashtag_count', and 'mention_count' from the original training DataFrame to the TF-IDF feature matrix DataFrame and stores the target variable in a separate variable `y`.", + "cell_id": 24, + "code": "train_df[\"target\"].value_counts()", + "class": "Exploratory Data Analysis", + "desc": "This code counts the occurrences of each unique value in the \"target\" column of the DataFrame `train_df` to observe the distribution of the target variable in the training set.", "testing": { - "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.8321087 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.999521 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 32, - "code": "test_df[\"text_clean\"]=test_df['text']\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_emails(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_urls(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_rt(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_special_chars(x))\ntest_df['text_clean'] = 
test_df['text_clean'].apply(lambda x: remove_accented_chars(x))\n\ntest_df[\"text_clean\"] = test_df[\"text\"].apply(lambda x: preprocess_text(x, flg_stemm=True, flg_lemm=False, lst_stopwords=lst_stopwords))\ntest_df['length']=test_df['text'].apply(len)\n\ntest_df.head()\n\n#vec=TfidfVectorizer(max_features = 20000,ngram_range=(1,4))\n#vec.fit(test_df['text_clean'])\n\n\n\nmatrix = vec.transform(test_df['text_clean']).toarray()\nfeatures = vec.get_feature_names()\nmatrix_df = pd.DataFrame(data=matrix, columns=features)\n\nmatrix_df['length']=test_df['length']\nmatrix_df['char_count']=test_df['char_count']\nmatrix_df['word_count']=test_df['word_count']\nmatrix_df['hashtag_count']=test_df['hashtag_count']\nmatrix_df['mention_count']=test_df['mention_count']", - "class": "Data Transform", - "desc": "This cell cleans and preprocesses the text data in the test DataFrame, then applies the previously fitted TF-IDF Vectorizer to transform the cleaned text into a TF-IDF feature matrix and adds additional columns for 'length', 'char_count', 'word_count', 'hashtag_count', and 'mention_count'.", + "cell_id": 25, + "code": "test_df[\"target\"].value_counts()", + "class": "Exploratory Data Analysis", + "desc": "This code counts the occurrences of each unique value in the \"target\" column of the DataFrame `test_df` to observe the distribution of the target variable in the test set.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.99814296 + "class": "Exploratory_Data_Analysis", + "subclass": "count_values", + "subclass_id": 72, + "predicted_subclass_probability": 0.9995276 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 36, - "code": "#Credit: https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub\ndef bert_encode(texts, tokenizer, max_len=512):\n all_tokens = []\n all_masks = []\n all_segments = []\n \n for text in texts:\n text = tokenizer.tokenize(text)\n \n text = text[:max_len-2]\n input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n pad_len = max_len - len(input_sequence)\n \n tokens = tokenizer.convert_tokens_to_ids(input_sequence)\n tokens += [0] * pad_len\n pad_masks = [1] * len(input_sequence) + [0] * pad_len\n segment_ids = [0] * max_len\n \n all_tokens.append(tokens)\n all_masks.append(pad_masks)\n all_segments.append(segment_ids)\n \n return np.array(all_tokens), np.array(all_masks), np.array(all_segments)", - "class": "Data Transform", - "desc": "This cell defines a function to encode text using BERT tokenization, producing token IDs, attention masks, and segment IDs, which are necessary inputs for a BERT model.", + "cell_id": 35, + "code": "print(features.shape)", + "class": "Exploratory Data Analysis", + "desc": "This code prints the dimensions (number of rows and columns) of the TF-IDF features matrix created from the training data's text content.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9645982 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9995484 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 39, - "code": "# Load tokenizer from the bert layer\nvocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\ndo_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\ntokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)", - "class": "Data Transform", - "desc": "This cell loads the BERT 
tokenizer using the vocabulary file and 'do_lower_case' setting from the pretrained BERT model to facilitate tokenization of text data.", + "cell_id": 37, + "code": "print(\"Feature names (unique tokens): {0}.\\nFeature count: {1}\".format(feature_names, len(feature_names)))", + "class": "Exploratory Data Analysis", + "desc": "This code prints the unique tokens extracted as feature names by the `TfidfVectorizer` and the total count of these features.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.4494561 + "class": "Exploratory_Data_Analysis", + "subclass": "count_unique_values", + "subclass_id": 54, + "predicted_subclass_probability": 0.9350609 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 40, - "code": "# Encode the text into tokens, masks, and segment flags\ntrain_input = bert_encode(train_df.text_clean.values, tokenizer, max_len=160)\ntest_input = bert_encode(test_df.text_clean.values, tokenizer, max_len=160)\ntrain_labels = train_df.target.values", - "class": "Data Transform", - "desc": "This cell encodes the cleaned text data from the training and test DataFrames into BERT-compatible tokens, masks, and segment flags, setting a maximum length of 160 tokens for each input sequence.", + "cell_id": 38, + "code": "print('fire' in feature_names)", + "class": "Exploratory Data Analysis", + "desc": "This code checks if the token 'fire' is present in the list of feature names extracted by the `TfidfVectorizer` and prints the result.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.9929838 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.9059723 }, - "cluster": 1 + "cluster": 5 }, { - "cell_id": 3, - "code": "train_df.head(5)", + "cell_id": 46, + "code": "text_n = 10\nfeatures[text_n]", "class": "Exploratory Data Analysis", - "desc": "This cell displays the first five rows of the training DataFrame to give an initial look at the dataset.", + "desc": "This code outputs the feature vector corresponding to the 10th text sample in the `features` matrix, allowing inspection of its TF-IDF representation.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997615 + "predicted_subclass_probability": 0.9924711 }, - "cluster": 2 + "cluster": 3 }, { - "cell_id": 4, - "code": "# DataFrane Summary by pandas summary package (extension of pandas.describe method) \ndfs = DataFrameSummary(train_df)\ndfs.summary()", + "cell_id": 49, + "code": "train_df[\"text\"].iloc[text_n]", "class": "Exploratory Data Analysis", - "desc": "This cell generates and displays a summary of the training DataFrame using the pandas-summary package, which provides an extended description of the DataFrame's columns.", + "desc": "This code outputs the text content of the 10th sample in the \"text\" column of `train_df`, allowing inspection of the original text that corresponds to the feature vector and prediction.", "testing": { "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.988304 + "subclass": "filter", + "subclass_id": 14, + "predicted_subclass_probability": 0.9789242 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 10, - "code": "duplicates = pd.concat(x for _, x in train_df.groupby([\"text\"]) if len(x) > 
1)\n\n#with pd.option_context(\"display.max_rows\", None, \"max_colwidth\", 80):\n# display(duplicates[[\"id\", \"target\", \"text\"]])", + "cell_id": 69, + "code": "pred_df.shape", "class": "Exploratory Data Analysis", - "desc": "This cell identifies and stores duplicate rows in the training DataFrame based on the 'text' column into a new DataFrame named 'duplicates'.", + "desc": "This code outputs the dimensions (number of rows and columns) of the DataFrame `pred_df`.", "testing": { - "class": "Data_Transform", - "subclass": "concatenate", - "subclass_id": 11, - "predicted_subclass_probability": 0.87380403 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.99961746 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 19, - "code": "matrix_df.head(2)", + "cell_id": 70, + "code": "pred_df.head()", "class": "Exploratory Data Analysis", - "desc": "This cell displays the first two rows of the DataFrame containing the TF-IDF feature matrix to provide an initial look at the transformed text data.", + "desc": "This code displays the first five rows of the DataFrame `pred_df` to provide an initial view of the test dataset.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997633 + "predicted_subclass_probability": 0.9997497 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 20, - "code": "matrix_df.shape", + "cell_id": 72, + "code": "pred_df.shape", "class": "Exploratory Data Analysis", - "desc": "This cell outputs the shape (number of rows and columns) of the DataFrame containing the TF-IDF feature matrix to understand the dimensionality of the transformed text data.", + "desc": "This code outputs the dimensions (number of rows and columns) of the updated DataFrame `pred_df`, now including the \"target\" column with predictions.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_shape", "subclass_id": 58, - "predicted_subclass_probability": 0.9996574 - }, - "cluster": 10 - }, { - "cell_id": 30, - "code": "list_for_correlations=top_features['Feature'].to_list()\nlist_for_correlations.append('target')\noc.correlations(matrix_df,list_for_correlations)", - "class": "Exploratory Data Analysis", - "desc": "This cell creates a list of the top 20 most important features along with the target variable and computes their correlations to understand the relationships between these features and the target.", - "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.9603935 - }, - "cluster": -1 - }, { - "cell_id": 0, - "code": "# Octopus ML pakage - github.com/gershonc/octopus-ml\n!pip install octopus-ml", - "class": "Imports and Environment", - "desc": "This cell installs the 'octopus-ml' package from GitHub, which appears to be a machine learning-related library.", - "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.99379325 + "predicted_subclass_probability": 0.99961746 }, "cluster": 1 }, { - "cell_id": 1, - "code": "import warnings\nwarnings.simplefilter(\"ignore\")\nimport seaborn as sns \nimport matplotlib.pyplot as plt\nimport time\nimport pandas as pd\nimport numpy as np\nimport lightgbm as lgb\nimport tracemalloc\nfrom pandas_summary import DataFrameSummary\nfrom sklearn.metrics import classification_report\n\nfrom sklearn import feature_extraction, 
linear_model, model_selection, preprocessing\n\n%matplotlib inline\nsns.set_style(\"whitegrid\")\n\npd.set_option('display.max_columns', None) # or 1000\npd.set_option('display.max_rows', None) # or 1000\npd.set_option('display.max_colwidth', -1) # or 199\n\n#check out https://github.com/gershonc/octopus-ml\nimport octopus_ml as oc\n\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split", - "class": "Imports and Environment", - "desc": "This cell imports multiple Python libraries and settings necessary for data manipulation, visualization, machine learning, and configuring the working environment.", + "cell_id": 73, + "code": "pred_df.head(20)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first 20 rows of the DataFrame `pred_df` to provide an initial view of the data along with the predicted \"target\" labels.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.9992041 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997557 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 12, - "code": "## for data\nimport json\nimport pandas as pd\nimport numpy as np\n## for plotting\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n## for processing\nimport re\nimport nltk\n## for bag-of-words\nfrom sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing\n## for explainer\nfrom lime import lime_text\n## for word embedding\nimport gensim\nimport gensim.downloader as gensim_api\n## for deep learning\nfrom tensorflow.keras import models, layers, preprocessing as kprocessing\nfrom tensorflow.keras import backend as K\n## for bert language model\nimport transformers\nimport unicodedata", - "class": "Imports and Environment", - "desc": "This cell imports additional Python libraries for data processing, visualization, text processing, machine learning, model explanation, word embedding, deep learning, and BERT language modeling.", + "cell_id": 74, + "code": "pred_df.tail(20)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the last 20 rows of the DataFrame `pred_df` to inspect the data and the predicted \"target\" labels at the end of the dataset.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9993574 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997602 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 34, - "code": "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py", - "class": "Imports and Environment", - "desc": "This cell downloads the BERT tokenization script from TensorFlow's GitHub repository for use in subsequent tokenization processes.", + "cell_id": 76, + "code": "pred_df.head()", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first five rows of the modified DataFrame `pred_df`, now without the \"keyword\", \"location\", and \"text\" columns.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_url", - "subclass_id": 42, - "predicted_subclass_probability": 
0.8866123 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997497 }, "cluster": 0 }, { - "cell_id": 35, - "code": "import tensorflow as tf\nfrom tensorflow.keras.layers import Dense, Input\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nimport tensorflow_hub as hub\n\nimport tokenization", + "cell_id": 0, + "code": "import pandas as pd\nimport platform\nfrom sklearn.model_selection import train_test_split\nimport nltk\nimport string\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\nfrom nltk.stem import SnowballStemmer\nfrom nltk.corpus import words\n#nltk.download('punkt')\nfrom sklearn.pipeline import Pipeline\n#from sklearn.linear_model import LogisticRegression\nimport sklearn.linear_model as lm\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import precision_score, recall_score, precision_recall_curve, f1_score\nfrom sklearn import metrics\nfrom matplotlib import pyplot as plt\nfrom sklearn.metrics import plot_precision_recall_curve\nimport numpy as np\nfrom sklearn.model_selection import GridSearchCV", "class": "Imports and Environment", - "desc": "This cell imports TensorFlow and associated libraries for model building, optimization, and callback management, as well as the BERT tokenization script for preprocessing text data.", + "desc": "This code imports libraries and modules required for data manipulation (pandas), machine learning (scikit-learn), natural language processing (nltk), plotting (matplotlib), and numerical operations (numpy).", "testing": { "class": "Imports_and_Environment", "subclass": "import_modules", "subclass_id": 22, - "predicted_subclass_probability": 0.99932015 + "predicted_subclass_probability": 0.9993006 }, "cluster": 0 }, { - "cell_id": 38, - "code": "# Load BERT from the Tensorflow Hub\nmodule_url = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1\"\nbert_layer = hub.KerasLayer(module_url, trainable=True)", + "cell_id": 1, + "code": "pd.set_option('display.max_colwidth', None)", "class": "Imports and Environment", - "desc": "This cell loads a pre-trained BERT model from TensorFlow Hub as a Keras layer, enabling it to be incorporated into a trainable neural network model.", + "desc": "This code sets the pandas display option to ensure that the full content of each column is shown without truncation.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.87859446 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.99864024 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 23, - "code": "oc.cv_plot(metrics['f1_weighted'],metrics['f1_macro'],metrics['f1_positive'],'Titanic Kaggle competition')", + "cell_id": 45, + "code": "model.n_features_in_", "class": "Model Evaluation", - "desc": "This cell plots the weighted F1 score, macro F1 score, and positive class F1 score obtained from cross-validation to evaluate the performance of the LightGBM model using the octopus-ml package.", + "desc": "This code outputs the number of features that were used to fit the `LogisticRegressionCV` model, stored in the attribute `n_features_in_`.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 
0.64448214 + "class": "Model_Train", + "subclass": "find_best_params", + "subclass_id": 2, + "predicted_subclass_probability": 0.2740468 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 24, - "code": "print(classification_report(metrics['y'], metrics['predictions_folds']))", + "cell_id": 47, + "code": "test_model_y = model.predict(features[text_n])", "class": "Model Evaluation", - "desc": "This cell prints the classification report, which includes precision, recall, F1-score, and support, using the true labels and predicted values obtained from cross-validation.", + "desc": "This code uses the trained `LogisticRegressionCV` model to predict the target label for the 10th text sample in the `features` matrix and stores the result in `test_model_y`.", "testing": { "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.9977271 + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99349266 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 25, - "code": "oc.roc_curve_plot(metrics['y'], metrics['predictions_proba'])", + "cell_id": 48, + "code": "test_model_y[0]", "class": "Model Evaluation", - "desc": "This cell plots the ROC curve using the true labels and predicted probabilities from the LightGBM model to visualize the model's performance in distinguishing between classes.", + "desc": "This code retrieves the first element from the `test_model_y` array, showing the predicted target label for the 10th text sample.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.67355084 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9995427 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 26, - "code": "oc.confusion_matrix_plot(metrics['y'], metrics['predictions_folds'])", + "cell_id": 52, + "code": "model_pipeline.classes_", + "class": "Model Evaluation", + "desc": "This code outputs the classes that have been identified and used by the `model_pipeline` during training, stored in the attribute `classes_`.", + "testing": { + "class": "Model_Train", + "subclass": "find_best_params", + "subclass_id": 2, + "predicted_subclass_probability": 0.4742122 + }, + "cluster": 2 + }, { + "cell_id": 53, + "code": "len(model.coef_[0])", "class": "Model Evaluation", - "desc": "This cell generates a confusion matrix plot using the true labels and predicted values obtained from cross-validation to assess the accuracy and error rates of the LightGBM model.", + "desc": "This code outputs the number of coefficients or features used by the `LogisticRegressionCV` model within the pipeline, by accessing the length of the first element in `model.coef_`.", "testing": { "class": "Visualization", "subclass": "model_coefficients", "subclass_id": 79, - "predicted_subclass_probability": 0.65878206 + "predicted_subclass_probability": 0.9872142 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 27, - "code": "feature_imp_list=oc.plot_imp(metrics['final_clf'],matrix_df,'LightGBM Mortality Kaggle',num=40)", + "cell_id": 54, + "code": "model.C_", "class": "Model Evaluation", - "desc": "This cell plots the feature importance for the LightGBM model, focusing on the top 40 most important features, and returns the list of feature importances.", + "desc": "This code outputs the inverse of regularization parameter values estimated by the `LogisticRegressionCV` model, stored in the 
attribute `C_`.", "testing": { "class": "Visualization", "subclass": "model_coefficients", "subclass_id": 79, - "predicted_subclass_probability": 0.9963425 + "predicted_subclass_probability": 0.84522194 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 28, - "code": "oc.preds_distribution(metrics['y'], metrics['predictions_proba'], bins=40)", + "cell_id": 55, + "code": "model_pipeline.named_steps", "class": "Model Evaluation", - "desc": "This cell creates a distribution plot of the true labels versus the predicted probabilities to visualize the prediction confidence for the LightGBM model.", + "desc": "This code outputs the named steps of the `model_pipeline`, showing the components (vectorizer and model) involved in the pipeline.", "testing": { - "class": "Model_Train", - "subclass": "compute_train_metric", - "subclass_id": 28, - "predicted_subclass_probability": 0.558016 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.3446452 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 29, - "code": "top_features=feature_imp_list.sort_values(by='Value', ascending=False).head(20)\ntop_features", + "cell_id": 56, + "code": "#model_pipeline.predict([\"Attention: bush fire reported!\"])", "class": "Model Evaluation", - "desc": "This cell sorts the feature importance list in descending order and selects the top 20 most important features to provide insight into which features the model relies on most.", + "desc": "This code (which is currently commented out) uses the `model_pipeline` to predict the target label for the input text \"Attention: bush fire reported!\", demonstrating the pipeline's end-to-end prediction capability.", "testing": { - "class": "Data_Transform", - "subclass": "sort_values", - "subclass_id": 9, - "predicted_subclass_probability": 0.992605 + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9979772 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 22, - "code": "params = {\n 'boosting_type': 'gbdt',\n 'objective': 'binary',\n 'metric': 'auc',\n 'learning_rate': 0.01,\n 'num_leaves':32,\n 'subsample': 1,\n #'colsample_bytree': 0.25,\n #'reg_alpha': 0,\n #'reg_lambda': 1,\n #'scale_pos_weight': 5,\n 'n_estimators': 10000,\n 'verbose': -1,\n 'max_depth': -1,\n 'seed':100, \n 'colsample_bytree':0.4,\n 'force_col_wise': True\n\n\n}\n\"\"\"\n boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,\n importance_type='split', learning_rate=0.04, max_depth=-1,\n metric='auc', min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=1500, n_jobs=-1, num_leaves=31,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=1.0, subsample_for_bin=200000,\n subsample_freq=0 \n\"\"\"\nmetrics = oc.cv_adv(matrix_df,y,0.5,2000,shuffle=True,params=params)", - "class": "Model Training", - "desc": "This cell defines the parameters for a LightGBM model and performs advanced cross-validation using the octopus-ml package to evaluate the model on the training data.", + "cell_id": 57, + "code": "#model_pipeline.predict([\"Kids were playing in the park.\"])", + "class": "Model Evaluation", + "desc": "This code (which is currently commented out) uses the `model_pipeline` to predict the target label for the input text \"Kids were playing in the park.\", demonstrating the model's prediction capability for another example text.", "testing": { - "class": "Model_Train", - "subclass": 
"init_hyperparams", - "subclass_id": 59, - "predicted_subclass_probability": 0.99348336 + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.99806374 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 33, - "code": "test_pred=metrics['final_clf'].predict(matrix_df)\npredictions = []\n#predictions = oc.adjusted_classes(test_pred, 0.5)", - "class": "Model Training", - "desc": "This cell uses the final trained LightGBM model to predict on the processed test data, storing the raw prediction results in the `test_pred` variable.", + "cell_id": 58, + "code": "#model_pipeline.get_params()", + "class": "Model Evaluation", + "desc": "This code (which is currently commented out) retrieves the parameters of the `model_pipeline`, providing a detailed view of the configuration settings of both the vectorizer and the model.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9974099 + }, + "cluster": 2 + }, { + "cell_id": 60, + "code": "#y_pred = model_pipeline.predict(X_train)\ny_pred = model_pipeline.predict(test_df[\"text\"])", + "class": "Model Evaluation", + "desc": "This code uses the `model_pipeline` to predict the target labels for the text data in `test_df` and stores the predictions in `y_pred`.", "testing": { "class": "Model_Evaluation", "subclass": "predict_on_test", "subclass_id": 48, - "predicted_subclass_probability": 0.9896577 + "predicted_subclass_probability": 0.9517882 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 37, - "code": "def build_model(bert_layer, max_len=512):\n input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n input_mask = Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n segment_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n\n _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n clf_output = sequence_output[:, 0, :]\n \n if Dropout_num == 0:\n # Without Dropout\n out = Dense(1, activation='sigmoid')(clf_output)\n else:\n # With Dropout(Dropout_num), Dropout_num > 0\n x = Dropout(Dropout_num)(clf_output)\n out = Dense(1, activation='sigmoid')(x)\n\n model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n model.compile(Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])\n \n return model", - "class": "Model Training", - "desc": "This cell defines a function to build a BERT-based model using the specified BERT layer, with inputs for token IDs, attention masks, and segment IDs, and includes an optional dropout layer for regularization before the output layer.", + "cell_id": 61, + "code": "#print(precision_score(y_true=y_test, y_pred=y_pred))", + "class": "Model Evaluation", + "desc": "This code (which is currently commented out) calculates and prints the precision score by comparing the true target labels (`y_test`) with the predicted labels (`y_pred`).", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.9445593 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9712182 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 41, - "code": "random_state_split = 2\nDropout_num = 0\nlearning_rate = 6e-6\nvalid = 0.2\nepochs_num = 3\nbatch_size_num = 16\ntarget_corrected = False\ntarget_big_corrected = False\n\n# Build BERT 
model with my tuning\nmodel_BERT = build_model(bert_layer, max_len=160)\nmodel_BERT.summary()", - "class": "Model Training", - "desc": "This cell initializes and summarizes the BERT-based model using specified hyperparameters, including no dropout, a learning rate of 6e-6, and a maximum token length of 160.", + "cell_id": 62, + "code": "#print(recall_score(y_true=y_test, y_pred=y_pred))", + "class": "Model Evaluation", + "desc": "This code (which is currently commented out) calculates and prints the recall score by comparing the true target labels (`y_test`) with the predicted labels (`y_pred`).", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.66016126 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9629949 }, - "cluster": 0 + "cluster": 2 }, { - "cell_id": 42, - "code": "checkpoint = ModelCheckpoint('model_BERT.h5', monitor='val_loss', save_best_only=True)\n\ntrain_history = model_BERT.fit(\n train_input, train_labels,\n validation_split = valid,\n epochs = epochs_num, # recomended 3-5 epochs\n callbacks=[checkpoint],\n batch_size = batch_size_num\n)", - "class": "Model Training", - "desc": "This cell trains the BERT-based model on the encoded training data with a validation split, for a specified number of epochs and batch size, and saves the best model based on validation loss using the ModelCheckpoint callback.", + "cell_id": 63, + "code": "#print(f1_score(y_true=y_test, y_pred=y_pred))", + "class": "Model Evaluation", + "desc": "This code (which is currently commented out) calculates and prints the F1 score by comparing the true target labels (`y_test`) with the predicted labels (`y_pred`).", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.99751294 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.91732484 + }, + "cluster": 2 + }, { + "cell_id": 64, + "code": "print(metrics.classification_report(y_test, y_pred, labels = [1, 0], digits=5))", + "class": "Model Evaluation", + "desc": "This code prints a detailed classification report using `metrics.classification_report` for the predicted vs. 
true target labels (`y_pred` and `y_test`), including precision, recall, F1-score, and support for each class (1 and 0) with five decimal places of precision.", + "testing": { + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.8476077 }, "cluster": 0 }, { - "cell_id": 43, - "code": "model_BERT.load_weights('model_BERT.h5')\ntest_pred_BERT = model_BERT.predict(test_input)\ntest_pred_BERT_int = test_pred_BERT.round().astype('int')", - "class": "Model Training", - "desc": "This cell loads the best weights for the BERT-based model and generates predictions on the encoded test data, rounding the predicted probabilities to obtain integer class labels.", + "cell_id": 65, + "code": "f1_1 = metrics.classification_report(y_test, y_pred, output_dict=True)[\"1\"][\"f1-score\"]\nf1_0 = metrics.classification_report(y_test, y_pred, output_dict=True)[\"0\"][\"f1-score\"]\nprint(\"Mean f1 score: {0:.5f}\".format((f1_1 + f1_0)/2))", + "class": "Model Evaluation", + "desc": "This code calculates the F1-scores for classes \"1\" and \"0\" from the classification report, computes their mean, and prints the result with five decimal places of precision.", "testing": { "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.991269 + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9809368 }, "cluster": 0 + }, { + "cell_id": 67, + "code": "model_pipeline.score(test_df[\"text\"], y_test)", + "class": "Model Evaluation", + "desc": "This code calculates and returns the mean accuracy score of the `model_pipeline` on the text data from `test_df` and the corresponding target labels `y_test`.", + "testing": { + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.99815553 + }, + "cluster": 1 }, { "cell_id": 44, - "code": "train_pred_BERT = model_BERT.predict(train_input)\ntrain_pred_BERT_int = train_pred_BERT.round().astype('int')", + "code": "model = lm.LogisticRegressionCV(**lr_model_params)\n#features = features[:,-2000:]\nmodel.fit(features, y_train)", "class": "Model Training", - "desc": "This cell generates predictions on the encoded training data using the BERT-based model and rounds the predicted probabilities to obtain integer class labels.", - "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.990164 - }, - "cluster": 0 - }, { - "cell_id": 5, - "code": "# Target distribution analysis\nfig, ax =plt.subplots(1,2)\n\n\nplt.style.use('fivethirtyeight')\nplt.figure(figsize=(3,4))\nsns.set_context(\"paper\", font_scale=1.2) \nsns.countplot('target',data=train_df, ax=ax[0])\ntrain_df['target'].value_counts().plot.pie(explode=[0,0.2],autopct='%1.2f%%',ax=ax[1])\nfig.show()", - "class": "Visualization", - "desc": "This cell visualizes the distribution of the target variable in the training dataset using a count plot and a pie chart.", + "desc": "This code initializes a `LogisticRegressionCV` model with the parameters specified in `lr_model_params` and then fits the model to the `features` matrix and the target labels `y_train`.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9649334 + "class": "Model_Train", + "subclass": "train_on_grid", + "subclass_id": 6, + "predicted_subclass_probability": 
0.86612976 }, "cluster": 0 }, { - "cell_id": 7, - "code": "sns.displot(data = train_df, kind = 'hist', x = 'length', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)\n\n# The distibution of tweet text length vs target - there is a correlation between tweet length and target ", - "class": "Visualization", - "desc": "This cell creates a stacked histogram to visualize the distribution of tweet text lengths across different target values in the training DataFrame, suggesting a potential correlation between text length and target.", + "cell_id": 50, + "code": "model_pipeline = Pipeline([\n (\"vectorizer\", vectorizer),\n (\"model\", model)\n]\n)", + "class": "Model Training", + "desc": "This code creates a pipeline named `model_pipeline` that sequentially applies a `TfidfVectorizer` and a `LogisticRegressionCV` model to streamline the text preprocessing and classification in one cohesive process.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9802619 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.92996585 }, "cluster": 0 }, { - "cell_id": 8, - "code": "sns.displot(data = train_df, kind = 'hist', x = 'hashtag_count', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)", - "class": "Visualization", - "desc": "This cell creates a stacked histogram to visualize the distribution of the number of hashtags in tweets across different target values in the training DataFrame.", + "cell_id": 51, + "code": "model_pipeline.fit(X_train, y_train)", + "class": "Model Training", + "desc": "This code fits the `model_pipeline` to the training data `X_train` and `y_train`, integrating both the vectorization and model training steps.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9985002 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9997104 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 9, - "code": "sns.displot(data = train_df, kind = 'hist', x = 'word_count', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)\n", + "cell_id": 66, + "code": "plot_precision_recall_curve(estimator=model_pipeline, X=test_df[\"text\"], y=y_test)", "class": "Visualization", - "desc": "This cell creates a stacked histogram to visualize the distribution of word counts in tweets across different target values in the training DataFrame.", + "desc": "This code plots the precision-recall curve for the `model_pipeline` using the text data `X=test_df[\"text\"]` and the corresponding target labels `y=y_test`, to visualize the trade-off between precision and recall.", "testing": { "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9985385 + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.6427428 }, - "cluster": 0 + "cluster": -1 }], - "notebook_id": 21, - "notebook_name": "nlp-twitter-tuned-lgbm-model-tfidf-bert" + "notebook_id": 19, + "notebook_name": "nlp-disaster-tweets-tfidf-lr.ipynb" }, { "cells": [{ - "cell_id": 35, - "code": "sample_submission.to_csv(\"submission.csv\", index=False)", + "cell_id": 31, + "code": "def Kaggle_submission(file_name,model,test_data,ids_list):\n #if TARGET in test_data.columns:\n # test_data.drop([TARGET],axis=1,inplace=True)\n 
#test_pred=model.predict(test_data)[:,1]\n test_pred=model.predict(test_data)\n predictions = []\n predictions = oc.adjusted_classes(test_pred, 0.5)\n\n submit=pd.DataFrame()\n submit['id'] = ids_list\n submit['target'] = predictions\n submit.to_csv(file_name,index=False)\n return submit", "class": "Data Export", - "desc": "This code snippet writes the `sample_submission` DataFrame to a CSV file named \"submission.csv\" without including the DataFrame index, finalizing the submission file for the machine learning competition or task.", + "desc": "This code defines a function `Kaggle_submission` that generates predictions using a given trained model on the test data, adjusts the predictions based on a threshold of 0.5, and then creates a submission DataFrame with 'id' and 'target' columns, which it exports to a CSV file with the specified filename.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.999154 + "predicted_subclass_probability": 0.9981369 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 1, + "cell_id": 45, + "code": "submit=pd.DataFrame()\nsubmit['id'] = test_df['id'].tolist()\nsubmit['target'] = test_pred_BERT_int", + "class": "Data Export", + "desc": "This code creates a DataFrame named `submit` with two columns: 'id' containing the IDs from the test DataFrame and 'target' containing the rounded predictions made by the BERT model.", + "testing": { + "class": "Data_Transform", + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.90852714 + }, + "cluster": -1 + }, { + "cell_id": 46, + "code": "submit.to_csv('BERT_model_v3.csv',index=False)", + "class": "Data Export", + "desc": "This code exports the `submit` DataFrame to a CSV file named 'BERT_model_v3.csv' without including the DataFrame index.", + "testing": { + "class": "Data_Export", + "subclass": "save_to_csv", + "subclass_id": 25, + "predicted_subclass_probability": 0.9991627 + }, + "cluster": -1 + }, { + "cell_id": 47, + "code": "submit.head(3)", + "class": "Data Export", + "desc": "This code displays the first three rows of the `submit` DataFrame, which contains the IDs and the rounded prediction labels generated by the BERT model for the test dataset.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997651 + }, + "cluster": -1 + }, { + "cell_id": 2, "code": "train_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\ntest_df = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")", "class": "Data Extraction", - "desc": "This code snippet reads the training and test datasets from CSV files into pandas DataFrames named `train_df` and `test_df`, respectively.", + "desc": "This code reads the training and testing datasets from CSV files located in the specified directory paths into pandas DataFrames.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, "predicted_subclass_probability": 0.99975425 }, - "cluster": 3 + "cluster": 0 }, { - "cell_id": 32, - "code": "sample_submission = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")", - "class": "Data Extraction", - "desc": "This code snippet reads the sample submission file from a CSV into a pandas DataFrame named `sample_submission` to prepare for the final submission format.", + "cell_id": 6, + "code": "def wordcount(x):\n length = len(str(x).split())\n return length\ndef charcount(x):\n 
s = x.split()\n x = ''.join(s)\n return len(x)\n\ndef hashtag_count(x):\n l = len([t for t in x.split() if t.startswith('#')])\n return l\n\ndef mentions_count(x):\n l = len([t for t in x.split() if t.startswith('@')])\n return l\n\n\ntrain_df['char_count'] = train_df['text'].apply(lambda x: charcount(x))\ntrain_df['word_count'] = train_df['text'].apply(lambda x: wordcount(x))\ntrain_df['hashtag_count'] = train_df['text'].apply(lambda x: hashtag_count(x))\ntrain_df['mention_count'] = train_df['text'].apply(lambda x: mentions_count(x))\ntrain_df['length']=train_df['text'].apply(len)\n\ntest_df['char_count'] = test_df['text'].apply(lambda x: charcount(x))\ntest_df['word_count'] = test_df['text'].apply(lambda x: wordcount(x))\ntest_df['hashtag_count'] = test_df['text'].apply(lambda x: hashtag_count(x))\ntest_df['mention_count'] = test_df['text'].apply(lambda x: mentions_count(x))\ntest_df['length']=test_df['text'].apply(len)\n\ntrain_df.head(2)", + "class": "Data Transform", + "desc": "This code defines functions to calculate various text-based features like character count, word count, hashtag count, mention count, and text length, and then applies these functions to the training and testing DataFrames to add these features as new columns.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99969256 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9946966 }, - "cluster": -1 + "cluster": 8 }, { - "cell_id": 3, - "code": "train_df = train_df.drop(['id', 'keyword', 'location'], axis = 1)", + "cell_id": 11, + "code": "# Taken from - Craig Thomas https://www.kaggle.com/craigmthomas/logistic-regression-lightgbm-fe\ntrain_df.drop(\n [\n 6449, 7034, 3589, 3591, 3597, 3600, 3603, 3604, 3610, 3613, 3614, 119, 106, 115,\n 2666, 2679, 1356, 7609, 3382, 1335, 2655, 2674, 1343, 4291, 4303, 1345, 48, 3374,\n 7600, 164, 5292, 2352, 4308, 4306, 4310, 1332, 1156, 7610, 2441, 2449, 2454, 2477,\n 2452, 2456, 3390, 7611, 6656, 1360, 5771, 4351, 5073, 4601, 5665, 7135, 5720, 5723,\n 5734, 1623, 7533, 7537, 7026, 4834, 4631, 3461, 6366, 6373, 6377, 6378, 6392, 2828,\n 2841, 1725, 3795, 1251, 7607\n ], inplace=True\n)\n\ntrain_df.drop(\n [\n 4290, 4299, 4312, 4221, 4239, 4244, 2830, 2831, 2832, 2833, 4597, 4605, 4618, 4232, 4235, 3240,\n 3243, 3248, 3251, 3261, 3266, 4285, 4305, 4313, 1214, 1365, 6614, 6616, 1197, 1331, 4379, 4381,\n 4284, 4286, 4292, 4304, 4309, 4318, 610, 624, 630, 634, 3985, 4013, 4019, 1221, 1349, 6091, 6094, \n 6103, 6123, 5620, 5641\n ], inplace=True\n)", "class": "Data Transform", - "desc": "This code snippet removes the columns 'id', 'keyword', and 'location' from the `train_df` DataFrame, likely in preparation for focusing on the remaining relevant data features.", + "desc": "This code snippet removes rows with specific indices from the training DataFrame to clean or preprocess the data, possibly based on prior analysis or domain knowledge.", "testing": { "class": "Data_Transform", "subclass": "drop_column", "subclass_id": 10, - "predicted_subclass_probability": 0.9992505 + "predicted_subclass_probability": 0.9402679 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 12, - "code": "#remove duplicated rows\ntrain_df.drop_duplicates(inplace=True)", + "cell_id": 13, + "code": "def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):\n \n text = re.sub(r'[^\\w\\s]', '', str(text).lower().strip())\n lst_text = 
text.split()\n if lst_stopwords is not None:\n lst_text = [word for word in lst_text if word not in \n lst_stopwords]\n \n ## Stemming (remove -ing, -ly, ...)\n if flg_stemm == True:\n ps = nltk.stem.porter.PorterStemmer()\n lst_text = [ps.stem(word) for word in lst_text]\n\n if flg_lemm == True:\n lem = nltk.stem.wordnet.WordNetLemmatizer()\n lst_text = [lem.lemmatize(word) for word in lst_text]\n \n \n ## back to string from list\n text = \" \".join(lst_text)\n return text", "class": "Data Transform", - "desc": "This code snippet removes duplicate rows from the `train_df` DataFrame in place, ensuring the dataset contains only unique entries.", + "desc": "This code defines a function called `preprocess_text` to clean and preprocess text by removing punctuation, converting to lowercase, optionally removing stopwords, and performing stemming or lemmatization, using Python's `re` library and NLTK.", "testing": { "class": "Data_Transform", - "subclass": "remove_duplicates", - "subclass_id": 19, - "predicted_subclass_probability": 0.8869491 + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9333879 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 18, - "code": "Real_Disaster_text = ' '.join(Real_Disaster_df.text.tolist())", + "cell_id": 14, + "code": "lst_stopwords = nltk.corpus.stopwords.words(\"english\")\n#lst_stopwords\n", "class": "Data Transform", - "desc": "This code snippet concatenates all text entries from the `Real_Disaster_df` DataFrame into a single string, facilitating further text analysis or visualization.", + "desc": "This code initializes a list of English stopwords using NLTK's corpus of stopwords for subsequent text preprocessing tasks.", "testing": { "class": "Data_Transform", "subclass": "string_transform", "subclass_id": 78, - "predicted_subclass_probability": 0.9610504 + "predicted_subclass_probability": 0.9620199 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 20, - "code": "Not_Real_Disaster_text = ' '.join(Not_Real_Disaster_df.text.tolist())", + "cell_id": 15, + "code": "contractions = { \n\"ain't\": \"am not\",\n\"aren't\": \"are not\",\n\"can't\": \"cannot\",\n\"can't've\": \"cannot have\",\n\"'cause\": \"because\",\n\"could've\": \"could have\",\n\"couldn't\": \"could not\",\n\"couldn't've\": \"could not have\",\n\"didn't\": \"did not\",\n\"doesn't\": \"does not\",\n\"don't\": \"do not\",\n\"hadn't\": \"had not\",\n\"hadn't've\": \"had not have\",\n\"hasn't\": \"has not\",\n\"haven't\": \"have not\",\n\"he'd\": \"he would\",\n\"he'd've\": \"he would have\",\n\"he'll\": \"he will\",\n\"he'll've\": \"he will have\",\n\"he's\": \"he is\",\n\"how'd\": \"how did\",\n\"how'd'y\": \"how do you\",\n\"how'll\": \"how will\",\n\"how's\": \"how does\",\n\"i'd\": \"i would\",\n\"i'd've\": \"i would have\",\n\"i'll\": \"i will\",\n\"i'll've\": \"i will have\",\n\"i'm\": \"i am\",\n\"i've\": \"i have\",\n\"isn't\": \"is not\",\n\"it'd\": \"it would\",\n\"it'd've\": \"it would have\",\n\"it'll\": \"it will\",\n\"it'll've\": \"it will have\",\n\"it's\": \"it is\",\n\"let's\": \"let us\",\n\"ma'am\": \"madam\",\n\"mayn't\": \"may not\",\n\"might've\": \"might have\",\n\"mightn't\": \"might not\",\n\"mightn't've\": \"might not have\",\n\"must've\": \"must have\",\n\"mustn't\": \"must not\",\n\"mustn't've\": \"must not have\",\n\"needn't\": \"need not\",\n\"needn't've\": \"need not have\",\n\"o'clock\": \"of the clock\",\n\"oughtn't\": \"ought not\",\n\"oughtn't've\": \"ought not have\",\n\"shan't\": \"shall not\",\n\"sha'n't\": \"shall 
not\",\n\"shan't've\": \"shall not have\",\n\"she'd\": \"she would\",\n\"she'd've\": \"she would have\",\n\"she'll\": \"she will\",\n\"she'll've\": \"she will have\",\n\"she's\": \"she is\",\n\"should've\": \"should have\",\n\"shouldn't\": \"should not\",\n\"shouldn't've\": \"should not have\",\n\"so've\": \"so have\",\n\"so's\": \"so is\",\n\"that'd\": \"that would\",\n\"that'd've\": \"that would have\",\n\"that's\": \"that is\",\n\"there'd\": \"there would\",\n\"there'd've\": \"there would have\",\n\"there's\": \"there is\",\n\"they'd\": \"they would\",\n\"they'd've\": \"they would have\",\n\"they'll\": \"they will\",\n\"they'll've\": \"they will have\",\n\"they're\": \"they are\",\n\"they've\": \"they have\",\n\"to've\": \"to have\",\n\"wasn't\": \"was not\",\n\" u \": \" you \",\n\" ur \": \" your \",\n\" n \": \" and \",\n\"won't\": \"would not\",\n'dis': 'this',\n'bak': 'back',\n'brng': 'bring'}\n\ndef cont_to_exp(x):\n if type(x) is str:\n for key in contractions:\n value = contractions[key]\n x = x.replace(key, value)\n return x\n else:\n return x\n \ntrain_df['text_clean'] = train_df['text'].apply(lambda x: cont_to_exp(x))\ntest_df['text_clean'] = test_df['text'].apply(lambda x: cont_to_exp(x))\n\n\ndef remove_emails(x):\n return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\\.[a-z0-9+_-]+)',\"\", x)\n\n\ndef remove_urls(x):\n return re.sub(r'(http|https|ftp|ssh)://([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?', '' , x)\n\ndef remove_rt(x):\n return re.sub(r'\\brt\\b', '', x).strip()\n\ndef remove_special_chars(x):\n x = re.sub(r'[^\\w ]+', \"\", x)\n x = ' '.join(x.split())\n return x\n\n\ndef remove_accented_chars(x):\n x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n return x\n\n\n\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_emails(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_urls(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_rt(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_special_chars(x))\ntrain_df['text_clean'] = train_df['text_clean'].apply(lambda x: remove_accented_chars(x))", + "class": "Data Transform", + "desc": "This code block defines multiple functions to expand contractions, remove emails, URLs, retweet tags, special characters, and accented characters from text, and applies these preprocessing steps to the 'text' column in the training DataFrame, storing the cleaned text in a new column 'text_clean'.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.99555004 + }, + "cluster": 0 + }, { + "cell_id": 16, + "code": "train_df[\"text_clean\"] = train_df[\"text_clean\"].apply(lambda x: preprocess_text(x, flg_stemm=True, flg_lemm=False, lst_stopwords=lst_stopwords))\ntrain_df.head()", "class": "Data Transform", - "desc": "This code snippet concatenates all text entries from the `Not_Real_Disaster_df` DataFrame into a single string, facilitating further text analysis or visualization.", + "desc": "This code applies the previously defined `preprocess_text` function to the 'text_clean' column in the training DataFrame, performing text preprocessing with stemming enabled, lemmatization disabled, and removing stopwords, and updates the 'text_clean' column with the processed text.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - 
"predicted_subclass_probability": 0.9576982 + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9974553 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 22, - "code": "# take text and preprocess 'remove stopwords [a, the, and, thus, ... etc] and punctations[,%$ ..etc] and len of text less than 3' \ndef clean_text(text):\n \"\"\"\n text: a string \n return: cleaned string\n \"\"\"\n result = []\n for token in simple_preprocess(text):\n if token not in STOPWORDS and token not in punctation and len(token) >= 3 :\n token = token.lower() \n result.append(token) \n return \" \".join(result)", + "cell_id": 17, + "code": "vec=TfidfVectorizer(max_features = 10000,ngram_range=(1,4))\nvec.fit(train_df['text_clean'])", + "class": "Data Transform", + "desc": "This code initializes a `TfidfVectorizer` with a maximum of 10,000 features and a range of 1 to 4 n-grams, and fits it to the 'text_clean' column in the training DataFrame to create a TF-IDF representation of the text data.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.5606868 + }, + "cluster": 4 + }, { + "cell_id": 18, + "code": "matrix = vec.transform(train_df['text_clean']).toarray()\nfeatures = vec.get_feature_names()\nmatrix_df = pd.DataFrame(data=matrix, columns=features)\n", "class": "Data Transform", - "desc": "This code snippet defines a function `clean_text` that preprocesses a given text by removing stopwords, punctuation, and tokens with fewer than three characters, standardizing the text for further analysis or model training.", + "desc": "This code transforms the 'text_clean' column of the training DataFrame into a TF-IDF matrix, converts it to a NumPy array, retrieves the feature names, and constructs a new DataFrame with the TF-IDF values using these feature names as column headers.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9118299 + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.99546874 }, - "cluster": 1 + "cluster": 4 }, { - "cell_id": 23, - "code": "train_df['text'] = train_df['text'].map(clean_text)\ntrain_df.head()", + "cell_id": 21, + "code": "matrix_df['length']=train_df['length']\nmatrix_df['char_count']=train_df['char_count']\nmatrix_df['word_count']=train_df['word_count']\nmatrix_df['hashtag_count']=train_df['hashtag_count']\nmatrix_df['mention_count']=train_df['mention_count']\ny=train_df['target']", "class": "Data Transform", - "desc": "This code snippet applies the `clean_text` function to the 'text' column of the `train_df` DataFrame and updates it with the cleaned text, then displays the first few rows of the cleaned DataFrame.", + "desc": "This code appends additional features from the original training DataFrame, such as 'length', 'char_count', 'word_count', 'hashtag_count', and 'mention_count', to the TF-IDF matrix DataFrame, and extracts the target variable 'target' into a separate Series 'y'.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.99127215 + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.8321087 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 24, - "code": "from sklearn.utils import shuffle\ntrain_df_shuffled = shuffle(train_df)\ntrain_df_shuffled.head()", + "cell_id": 32, + "code": 
"test_df[\"text_clean\"]=test_df['text']\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_emails(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_urls(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_rt(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_special_chars(x))\ntest_df['text_clean'] = test_df['text_clean'].apply(lambda x: remove_accented_chars(x))\n\ntest_df[\"text_clean\"] = test_df[\"text\"].apply(lambda x: preprocess_text(x, flg_stemm=True, flg_lemm=False, lst_stopwords=lst_stopwords))\ntest_df['length']=test_df['text'].apply(len)\n\ntest_df.head()\n\n#vec=TfidfVectorizer(max_features = 20000,ngram_range=(1,4))\n#vec.fit(test_df['text_clean'])\n\n\n\nmatrix = vec.transform(test_df['text_clean']).toarray()\nfeatures = vec.get_feature_names()\nmatrix_df = pd.DataFrame(data=matrix, columns=features)\n\nmatrix_df['length']=test_df['length']\nmatrix_df['char_count']=test_df['char_count']\nmatrix_df['word_count']=test_df['word_count']\nmatrix_df['hashtag_count']=test_df['hashtag_count']\nmatrix_df['mention_count']=test_df['mention_count']", "class": "Data Transform", - "desc": "This code snippet shuffles the rows of the `train_df` DataFrame to create a new DataFrame `train_df_shuffled`, ensuring that the data is randomly mixed, which is useful for unbiased model training and evaluation, and then displays the first few rows.", + "desc": "This code preprocesses the 'text' column in the test DataFrame by performing text cleaning (removing emails, URLs, retweet tags, special characters, accented characters), and by applying stemming and stopwords removal; it then transforms the cleaned text into a TF-IDF matrix, adds additional features such as length, char_count, word_count, hashtag_count, and mention_count from the original test DataFrame, and stores these values in a new DataFrame.", "testing": { "class": "Data_Transform", - "subclass": "normalization", - "subclass_id": 18, - "predicted_subclass_probability": 0.7203666 + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.99814296 }, - "cluster": 1 + "cluster": 8 }, { - "cell_id": 25, - "code": "X = train_df_shuffled['text']\ny = train_df_shuffled['target']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42, stratify = y)", + "cell_id": 36, + "code": "#Credit: https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub\ndef bert_encode(texts, tokenizer, max_len=512):\n all_tokens = []\n all_masks = []\n all_segments = []\n \n for text in texts:\n text = tokenizer.tokenize(text)\n \n text = text[:max_len-2]\n input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n pad_len = max_len - len(input_sequence)\n \n tokens = tokenizer.convert_tokens_to_ids(input_sequence)\n tokens += [0] * pad_len\n pad_masks = [1] * len(input_sequence) + [0] * pad_len\n segment_ids = [0] * max_len\n \n all_tokens.append(tokens)\n all_masks.append(pad_masks)\n all_segments.append(segment_ids)\n \n return np.array(all_tokens), np.array(all_masks), np.array(all_segments)", "class": "Data Transform", - "desc": "This code snippet splits the shuffled dataset into training and testing sets for both features (`X_train`, `X_test`) and labels (`y_train`, `y_test`), with 20% of the data reserved for testing, while maintaining the target variable's class distribution using stratified sampling.", + "desc": "This code defines a function `bert_encode` that tokenizes and encodes a list of text 
inputs into tokens, masks, and segment IDs suitable for input to a BERT model, using the `tokenizer` provided, and ensures each input sequence fits a specified maximum length (`max_len`).", "testing": { "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.995934 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9645982 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 29, - "code": "test_df = test_df.drop(['id', 'keyword', 'location'], axis = 1)", + "cell_id": 40, + "code": "# Encode the text into tokens, masks, and segment flags\ntrain_input = bert_encode(train_df.text_clean.values, tokenizer, max_len=160)\ntest_input = bert_encode(test_df.text_clean.values, tokenizer, max_len=160)\ntrain_labels = train_df.target.values", "class": "Data Transform", - "desc": "This code snippet removes the columns 'id', 'keyword', and 'location' from the `test_df` DataFrame, likely to focus on the remaining relevant data features for subsequent steps.", + "desc": "This code encodes the cleaned text from the training and test DataFrames into BERT-compatible tokens, attention masks, and segment flags using the previously defined `bert_encode` function with a maximum length of 160 tokens and also extracts the target labels from the training DataFrame.", "testing": { "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.99925584 + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9929838 + }, + "cluster": 2 + }, { + "cell_id": 3, + "code": "train_df.head(5)", + "class": "Exploratory Data Analysis", + "desc": "This code displays the first five rows of the training DataFrame to provide an initial glimpse of the data.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9997615 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 30, - "code": "test_df['text'] = test_df['text'].map(clean_text)\ntest_df.head()", - "class": "Data Transform", - "desc": "This code snippet applies the `clean_text` function to the 'text' column of the `test_df` DataFrame and updates it with the cleaned text, then displays the first few rows of the cleaned DataFrame.", + "cell_id": 4, + "code": "# DataFrane Summary by pandas summary package (extension of pandas.describe method) \ndfs = DataFrameSummary(train_df)\ndfs.summary()", + "class": "Exploratory Data Analysis", + "desc": "This code generates a detailed summary of the training DataFrame using the DataFrameSummary class from the pandas_summary package, which extends the functionality of the pandas `describe` method.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.99245125 + "subclass": "create_dataframe", + "subclass_id": 12, + "predicted_subclass_probability": 0.988304 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 33, - "code": "sample_submission[\"target\"] = y_pred", - "class": "Data Transform", - "desc": "This code snippet assigns the predicted target values `y_pred` to the 'target' column of the `sample_submission` DataFrame to prepare it for submission.", + "cell_id": 10, + "code": "duplicates = pd.concat(x for _, x in train_df.groupby([\"text\"]) if len(x) > 1)\n\n#with pd.option_context(\"display.max_rows\", None, \"max_colwidth\", 80):\n# display(duplicates[[\"id\", \"target\", \"text\"]])", + "class": "Exploratory 
Data Analysis", + "desc": "This code identifies duplicate rows in the training DataFrame based on the 'text' column and concatenates them into a new DataFrame for further analysis.", "testing": { - "class": "Data_Export", - "subclass": "prepare_output", - "subclass_id": 55, - "predicted_subclass_probability": 0.7656245 + "class": "Data_Transform", + "subclass": "concatenate", + "subclass_id": 11, + "predicted_subclass_probability": 0.87380403 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 2, - "code": "train_df.head()", + "cell_id": 19, + "code": "matrix_df.head(2)", "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the training DataFrame `train_df` to give an initial look at the data structure and contents.", + "desc": "This code displays the first two rows of the newly created DataFrame containing the TF-IDF values for the features extracted from the 'text_clean' column.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_table", "subclass_id": 41, - "predicted_subclass_probability": 0.9997545 + "predicted_subclass_probability": 0.9997633 }, - "cluster": 11 + "cluster": 2 }, { - "cell_id": 4, - "code": "train_df.shape", + "cell_id": 20, + "code": "matrix_df.shape", "class": "Exploratory Data Analysis", - "desc": "This code snippet retrieves the dimensions (number of rows and columns) of the `train_df` DataFrame to understand the dataset size after dropping certain columns.", + "desc": "This code outputs the shape of the DataFrame containing the TF-IDF values, providing the number of rows and columns in the DataFrame.", "testing": { "class": "Exploratory_Data_Analysis", "subclass": "show_shape", "subclass_id": 58, - "predicted_subclass_probability": 0.9995821 + "predicted_subclass_probability": 0.9996574 }, - "cluster": 10 + "cluster": 1 }, { - "cell_id": 5, - "code": "train_df.columns", - "class": "Exploratory Data Analysis", - "desc": "This code snippet lists the column names in the `train_df` DataFrame to confirm the currently available features after column removal.", + "cell_id": 0, + "code": "# Octopus ML pakage - github.com/gershonc/octopus-ml\n!pip install octopus-ml", + "class": "Imports and Environment", + "desc": "This code installs the 'octopus-ml' package from GitHub using the pip package manager.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_columns", - "subclass_id": 71, - "predicted_subclass_probability": 0.9984144 + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.99379325 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 6, - "code": "train_df.info()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet provides a summary of the `train_df` DataFrame, including the data types and non-null counts for each column, aiding in understanding the dataset's structure and identifying potential data quality issues.", + "cell_id": 1, + "code": "import warnings\nwarnings.simplefilter(\"ignore\")\nimport seaborn as sns \nimport matplotlib.pyplot as plt\nimport time\nimport pandas as pd\nimport numpy as np\nimport lightgbm as lgb\nimport tracemalloc\nfrom pandas_summary import DataFrameSummary\nfrom sklearn.metrics import classification_report\n\nfrom sklearn import feature_extraction, linear_model, model_selection, preprocessing\n\n%matplotlib inline\nsns.set_style(\"whitegrid\")\n\npd.set_option('display.max_columns', None) # or 1000\npd.set_option('display.max_rows', None) # or 
1000\npd.set_option('display.max_colwidth', -1) # or 199\n\n#check out https://github.com/gershonc/octopus-ml\nimport octopus_ml as oc\n\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.model_selection import train_test_split", + "class": "Imports and Environment", + "desc": "This code imports various libraries and modules required for machine learning tasks, data manipulation, visualization, and environment settings, including seaborn, matplotlib, pandas, numpy, LightGBM, scikit-learn, and the custom package 'octopus-ml', and configures some display options for pandas.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9993624 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.9992041 }, - "cluster": 11 + "cluster": 1 }, { - "cell_id": 7, - "code": "train_df.describe()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet generates descriptive statistics for the numeric columns in the `train_df` DataFrame, offering insights into the data distribution, central tendency, and variability.", + "cell_id": 12, + "code": "## for data\nimport json\nimport pandas as pd\nimport numpy as np\n## for plotting\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n## for processing\nimport re\nimport nltk\n## for bag-of-words\nfrom sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing\n## for explainer\nfrom lime import lime_text\n## for word embedding\nimport gensim\nimport gensim.downloader as gensim_api\n## for deep learning\nfrom tensorflow.keras import models, layers, preprocessing as kprocessing\nfrom tensorflow.keras import backend as K\n## for bert language model\nimport transformers\nimport unicodedata", + "class": "Imports and Environment", + "desc": "This code imports various libraries for data handling, plotting, text processing, machine learning, word embedding, deep learning, and language models, including pandas, numpy, matplotlib, seaborn, nltk, scikit-learn, lime, gensim, tensorflow.keras, and transformers.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.9994492 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9993574 }, - "cluster": 11 + "cluster": 0 }, { - "cell_id": 8, - "code": "train_df[train_df[\"target\"] == 1][\"text\"].values[0]", - "class": "Exploratory Data Analysis", - "desc": "This code snippet retrieves and displays the text of the first record in the `train_df` DataFrame where the `target` column has a value of 1, helping to understand the nature of positive examples in the dataset.", + "cell_id": 34, + "code": "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py", + "class": "Imports and Environment", + "desc": "This code downloads the `tokenization.py` file from the TensorFlow BERT repository on GitHub using the `wget` command.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.95915043 + "class": "Data_Extraction", + "subclass": 
"load_from_url", + "subclass_id": 42, + "predicted_subclass_probability": 0.8866123 }, - "cluster": 5 + "cluster": 0 }, { - "cell_id": 9, - "code": "train_df[train_df[\"target\"] == 1][\"text\"].values[1]", - "class": "Exploratory Data Analysis", - "desc": "This code snippet retrieves and displays the text of the second record in the `train_df` DataFrame where the `target` column has a value of 1, providing additional context for positive examples in the dataset.", + "cell_id": 35, + "code": "import tensorflow as tf\nfrom tensorflow.keras.layers import Dense, Input\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.models import Model\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nimport tensorflow_hub as hub\n\nimport tokenization", + "class": "Imports and Environment", + "desc": "This code imports various TensorFlow and Keras modules for building and training neural network models, as well as the `tokenization` module that was downloaded using `wget`.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.5788779 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99932015 }, - "cluster": 5 + "cluster": 0 }, { - "cell_id": 10, - "code": "print(\"Number of duplicates in data : {}\".format(len(train_df[train_df.duplicated()])))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the number of duplicate rows in the `train_df` DataFrame, helping to identify and quantify redundancy in the dataset.", + "cell_id": 38, + "code": "# Load BERT from the Tensorflow Hub\nmodule_url = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1\"\nbert_layer = hub.KerasLayer(module_url, trainable=True)", + "class": "Imports and Environment", + "desc": "This code loads a pre-trained BERT model from TensorFlow Hub using the specified URL and wraps it in a Keras layer, setting it to be trainable.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_duplicates", - "subclass_id": 38, - "predicted_subclass_probability": 0.8543922 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.87859446 }, - "cluster": 7 + "cluster": 0 }, { - "cell_id": 11, - "code": "print(\"Duplicated rows before remove them : \")\ntrain_df[train_df.duplicated(keep=False)].sort_values(by=\"text\").head(8)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints and displays the first 8 duplicate rows in the `train_df` DataFrame, sorted by the 'text' column, providing a closer look at the duplicated entries before they are removed.", + "cell_id": 39, + "code": "# Load tokenizer from the bert layer\nvocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\ndo_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\ntokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)", + "class": "Imports and Environment", + "desc": "This code loads the tokenizer associated with the pre-trained BERT model by accessing the vocabulary file and the `do_lower_case` flag from the BERT layer, and initializes a `FullTokenizer` using the `tokenization` module.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_duplicates", - "subclass_id": 38, - "predicted_subclass_probability": 0.859677 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + 
"predicted_subclass_probability": 0.4494561 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 13, - "code": "print(\"Number of duplicates in data : {}\".format(len(train_df[train_df.duplicated()])))", - "class": "Exploratory Data Analysis", - "desc": "This code snippet prints the number of duplicate rows in the `train_df` DataFrame after duplicates have been removed, to confirm the operation's success.", + "cell_id": 23, + "code": "oc.cv_plot(metrics['f1_weighted'],metrics['f1_macro'],metrics['f1_positive'],'Titanic Kaggle competition')", + "class": "Model Evaluation", + "desc": "This code generates plots to visualize the cross-validated F1 scores (weighted, macro, and positive) using the `cv_plot` function from the 'octopus_ml' package, with the title 'Titanic Kaggle competition'.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_duplicates", - "subclass_id": 38, - "predicted_subclass_probability": 0.8543922 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.64448214 }, - "cluster": 7 + "cluster": 0 }, { - "cell_id": 14, - "code": "train_df['target'].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet counts and displays the occurrences of each unique value in the 'target' column of the `train_df` DataFrame, providing insight into the class distribution in the dataset.", + "cell_id": 24, + "code": "print(classification_report(metrics['y'], metrics['predictions_folds']))", + "class": "Model Evaluation", + "desc": "This code prints a detailed classification report, including precision, recall, and F1 score, by comparing the true labels to the predicted labels from the cross-validated model using scikit-learn's `classification_report` function.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.9995184 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9977271 }, - "cluster": 9 + "cluster": 1 }, { - "cell_id": 16, - "code": "Real_Disaster_df = train_df[train_df['target'] == 1]\nReal_Disaster_df.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet filters the `train_df` DataFrame for rows where the 'target' column equals 1 and then displays the first few rows of this filtered DataFrame, examining instances of tweets classified as real disasters.", + "cell_id": 25, + "code": "oc.roc_curve_plot(metrics['y'], metrics['predictions_proba'])", + "class": "Model Evaluation", + "desc": "This code generates a ROC curve plot using the `roc_curve_plot` function from the 'octopus_ml' package to visualize the performance of the model based on the true labels and predicted probabilities from the cross-validated results.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.5334526 + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.67355084 + }, + "cluster": 1 + }, { + "cell_id": 26, + "code": "oc.confusion_matrix_plot(metrics['y'], metrics['predictions_folds'])", + "class": "Model Evaluation", + "desc": "This code generates and visualizes a confusion matrix using the `confusion_matrix_plot` function from the 'octopus_ml' package, based on the true labels and the predicted labels from the cross-validated results.", + "testing": { + "class": 
"Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.65878206 + }, + "cluster": 1 + }, { + "cell_id": 27, + "code": "feature_imp_list=oc.plot_imp(metrics['final_clf'],matrix_df,'LightGBM Mortality Kaggle',num=40)", + "class": "Model Evaluation", + "desc": "This code generates a plot to visualize the importance of the top 40 features in the final LightGBM classifier using the `plot_imp` function from the 'octopus_ml' package and returns the feature importance list.", + "testing": { + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9963425 + }, + "cluster": 1 + }, { + "cell_id": 28, + "code": "oc.preds_distribution(metrics['y'], metrics['predictions_proba'], bins=40)", + "class": "Model Evaluation", + "desc": "This code visualizes the distribution of the predicted probabilities against the true labels using the `preds_distribution` function from the 'octopus_ml' package, with the histogram divided into 40 bins.", + "testing": { + "class": "Model_Train", + "subclass": "compute_train_metric", + "subclass_id": 28, + "predicted_subclass_probability": 0.558016 + }, + "cluster": 3 + }, { + "cell_id": 29, + "code": "top_features=feature_imp_list.sort_values(by='Value', ascending=False).head(20)\ntop_features", + "class": "Model Evaluation", + "desc": "This code sorts the feature importance list in descending order by their importance values and displays the top 20 most important features.", + "testing": { + "class": "Data_Transform", + "subclass": "sort_values", + "subclass_id": 9, + "predicted_subclass_probability": 0.992605 }, "cluster": -1 }, { - "cell_id": 17, - "code": "Not_Real_Disaster_df = train_df[train_df['target'] == 0]\nNot_Real_Disaster_df.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet filters the `train_df` DataFrame for rows where the 'target' column equals 0 and then displays the first few rows of this filtered DataFrame, examining instances of tweets classified as not real disasters.", + "cell_id": 30, + "code": "list_for_correlations=top_features['Feature'].to_list()\nlist_for_correlations.append('target')\noc.correlations(matrix_df,list_for_correlations)", + "class": "Model Evaluation", + "desc": "This code extracts the top 20 most important features, appends the target variable 'target' to the list, and generates a correlation matrix using the `correlations` function from the 'octopus_ml' package based on these features and the target variable from the TF-IDF matrix DataFrame.", + "testing": { + "class": "Visualization", + "subclass": "model_coefficients", + "subclass_id": 79, + "predicted_subclass_probability": 0.9603935 + }, + "cluster": 0 + }, { + "cell_id": 22, + "code": "params = {\n 'boosting_type': 'gbdt',\n 'objective': 'binary',\n 'metric': 'auc',\n 'learning_rate': 0.01,\n 'num_leaves':32,\n 'subsample': 1,\n #'colsample_bytree': 0.25,\n #'reg_alpha': 0,\n #'reg_lambda': 1,\n #'scale_pos_weight': 5,\n 'n_estimators': 10000,\n 'verbose': -1,\n 'max_depth': -1,\n 'seed':100, \n 'colsample_bytree':0.4,\n 'force_col_wise': True\n\n\n}\n\"\"\"\n boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,\n importance_type='split', learning_rate=0.04, max_depth=-1,\n metric='auc', min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=1500, n_jobs=-1, num_leaves=31,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=1.0, 
subsample_for_bin=200000,\n subsample_freq=0 \n\"\"\"\nmetrics = oc.cv_adv(matrix_df,y,0.5,2000,shuffle=True,params=params)", + "class": "Model Training", + "desc": "This code defines a set of parameters for training a LightGBM model using gradient boosting decision trees (GBDT) with a binary objective and executes a cross-validation process using the `cv_adv` function from the 'octopus_ml' package on the TF-IDF matrix DataFrame and target variable 'y'.", "testing": { - "class": "Data_Transform", - "subclass": "filter", - "subclass_id": 14, - "predicted_subclass_probability": 0.5007087 + "class": "Model_Train", + "subclass": "init_hyperparams", + "subclass_id": 59, + "predicted_subclass_probability": 0.99348336 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 26, - "code": "X_test", - "class": "Exploratory Data Analysis", - "desc": "This code snippet outputs the test set features `X_test`, providing an opportunity to inspect the data allocated for model evaluation.", + "cell_id": 33, + "code": "test_pred=metrics['final_clf'].predict(matrix_df)\npredictions = []\n#predictions = oc.adjusted_classes(test_pred, 0.5)", + "class": "Model Training", + "desc": "This code generates predictions for the test DataFrame using the final classifier stored in `metrics['final_clf']` on the transformed test data matrix.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99974364 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.9896577 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 34, - "code": "sample_submission.head()", - "class": "Exploratory Data Analysis", - "desc": "This code snippet displays the first few rows of the `sample_submission` DataFrame to verify that the predicted target values have been correctly assigned.", + "cell_id": 37, + "code": "def build_model(bert_layer, max_len=512):\n input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n input_mask = Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n segment_ids = Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n\n _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n clf_output = sequence_output[:, 0, :]\n \n if Dropout_num == 0:\n # Without Dropout\n out = Dense(1, activation='sigmoid')(clf_output)\n else:\n # With Dropout(Dropout_num), Dropout_num > 0\n x = Dropout(Dropout_num)(clf_output)\n out = Dense(1, activation='sigmoid')(x)\n\n model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n model.compile(Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])\n \n return model", + "class": "Model Training", + "desc": "This code defines a function `build_model` to create a Keras model using a BERT layer, where it takes the encoded word IDs, attention masks, and segment IDs as inputs, extracts the sequence output from the BERT layer, and uses this output to make predictions through a dense layer with a sigmoid activation function, optionally including a dropout layer, before compiling the model with the Adam optimizer and binary crossentropy loss.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99975234 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9445593 }, - "cluster": -1 + "cluster": 0 }, 
{ - "cell_id": 0, - "code": "import numpy as np \nimport pandas as pd \nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom wordcloud import WordCloud\nfrom sklearn import feature_extraction, linear_model, model_selection, preprocessing\n\n#sklearn \nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils.class_weight import compute_sample_weight\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.metrics import accuracy_score, confusion_matrix\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.linear_model import SGDClassifier\n\n# nlp preprocessing lib\nimport gensim\nfrom gensim.utils import simple_preprocess\nfrom gensim.parsing.preprocessing import STOPWORDS\nimport string \npunctation = string.punctuation", - "class": "Imports and Environment", - "desc": "This code snippet imports various libraries and modules necessary for data manipulation (e.g., pandas, numpy), visualization (e.g., seaborn, matplotlib, WordCloud), machine learning (e.g., sklearn), and natural language processing (e.g., gensim), along with defining a variable 'punctation' containing punctuation characters.", + "cell_id": 41, + "code": "random_state_split = 2\nDropout_num = 0\nlearning_rate = 6e-6\nvalid = 0.2\nepochs_num = 3\nbatch_size_num = 16\ntarget_corrected = False\ntarget_big_corrected = False\n\n# Build BERT model with my tuning\nmodel_BERT = build_model(bert_layer, max_len=160)\nmodel_BERT.summary()", + "class": "Model Training", + "desc": "This code sets several hyperparameters and configurations, including random state, dropout number, learning rate, validation split, number of epochs, batch size, and flags for target corrections, and then initializes the BERT model using these parameters by calling the `build_model` function with the BERT layer and maximum length of 160 tokens, followed by printing the model summary.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99901414 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.66016126 }, "cluster": 0 }, { - "cell_id": 27, - "code": "from sklearn.model_selection import cross_val_score\nnb_classifier = Pipeline([('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', MultinomialNB()),])\n\nnb_classifier.fit(X_train, y_train)\n\ny_pred = nb_classifier.predict(X_test)\nprint('accuracy {}'.format(accuracy_score(y_pred, y_test)))", + "cell_id": 42, + "code": "checkpoint = ModelCheckpoint('model_BERT.h5', monitor='val_loss', save_best_only=True)\n\ntrain_history = model_BERT.fit(\n train_input, train_labels,\n validation_split = valid,\n epochs = epochs_num, # recomended 3-5 epochs\n callbacks=[checkpoint],\n batch_size = batch_size_num\n)", "class": "Model Training", - "desc": "This code snippet creates a machine learning pipeline using a CountVectorizer, TfidfTransformer, and Multinomial Naive Bayes classifier, then fits this pipeline to the training data and evaluates prediction accuracy on the test data.", + "desc": "This code initializes a `ModelCheckpoint` to save the best version of the BERT model based on validation loss, and then trains the BERT model using the encoded training inputs and labels, with a specified validation split, number of epochs, and batch size, while applying the checkpoint callback 
to save the model during training.", "testing": { "class": "Model_Train", "subclass": "train_model", "subclass_id": 7, - "predicted_subclass_probability": 0.7887582 + "predicted_subclass_probability": 0.99751294 }, "cluster": 0 }, { - "cell_id": 28, - "code": "sgd = Pipeline([('vect', CountVectorizer()),\n ('tfidf', TfidfTransformer()),\n ('clf', SGDClassifier(loss='epsilon_insensitive', penalty='l2',alpha=1e-3, random_state=42, max_iter=1000, tol=None)),])\n\n\nsgd.fit(X_train, y_train)\ny_pred = sgd.predict(X_test)\nprint('accuracy {}'.format(accuracy_score(y_pred, y_test)))", + "cell_id": 43, + "code": "model_BERT.load_weights('model_BERT.h5')\ntest_pred_BERT = model_BERT.predict(test_input)\ntest_pred_BERT_int = test_pred_BERT.round().astype('int')", "class": "Model Training", - "desc": "This code snippet creates a machine learning pipeline using a CountVectorizer, TfidfTransformer, and an SGDClassifier, fits the pipeline to the training data, and evaluates the prediction accuracy on the test data.", + "desc": "This code loads the best weights saved during training into the BERT model and makes predictions on the encoded test inputs, rounding the predicted probabilities to the nearest integer for final prediction labels.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.95042425 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.991269 }, "cluster": 0 }, { - "cell_id": 31, - "code": "y_pred = nb_classifier.predict(test_df['text'])", + "cell_id": 44, + "code": "train_pred_BERT = model_BERT.predict(train_input)\ntrain_pred_BERT_int = train_pred_BERT.round().astype('int')", "class": "Model Training", - "desc": "This code snippet uses the previously trained Naive Bayes classifier pipeline to predict the target values for the cleaned text data in the `test_df` DataFrame.", + "desc": "This code makes predictions on the encoded training inputs using the BERT model, rounding the predicted probabilities to the nearest integer to obtain final prediction labels.", "testing": { "class": "Model_Evaluation", "subclass": "predict_on_test", "subclass_id": 48, - "predicted_subclass_probability": 0.994578 + "predicted_subclass_probability": 0.990164 + }, + "cluster": 0 + }, { + "cell_id": 5, + "code": "# Target distribution analysis\nfig, ax =plt.subplots(1,2)\n\n\nplt.style.use('fivethirtyeight')\nplt.figure(figsize=(3,4))\nsns.set_context(\"paper\", font_scale=1.2) \nsns.countplot('target',data=train_df, ax=ax[0])\ntrain_df['target'].value_counts().plot.pie(explode=[0,0.2],autopct='%1.2f%%',ax=ax[1])\nfig.show()", + "class": "Visualization", + "desc": "This code visualizes the distribution of the target variable in the training DataFrame using a count plot and a pie chart, utilizing seaborn and matplotlib for plotting.", + "testing": { + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.9649334 }, "cluster": -1 }, { - "cell_id": 15, - "code": "# count plot \"Histogram\" of Frequencies of Subjects for true news\nplt.figure(figsize=(10,6))\nplt.title(\"Frequencies of tweets for Disaster\")\nsns.countplot(x = 'target', data = train_df)\nplt.xlabel('Disaster Type')", + "cell_id": 7, + "code": "sns.displot(data = train_df, kind = 'hist', x = 'length', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)\n\n# The distibution of tweet text length vs target - there is a correlation 
between tweet length and target ", "class": "Visualization", - "desc": "This code snippet generates a count plot (or histogram) using Seaborn to visualize the frequencies of tweets categorized under each target type (disaster or non-disaster) within the `train_df` DataFrame, aiding in understanding the class distribution visually.", + "desc": "This code creates a stacked histogram to visualize the distribution of tweet text length versus the target variable in the training DataFrame, using seaborn\u2019s `displot` function.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.9293306 + "predicted_subclass_probability": 0.9802619 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 19, - "code": "wordcloud_true = WordCloud().generate(Real_Disaster_text)\nplt.figure(figsize=(10,10))\nplt.imshow(wordcloud_true)\nplt.axis('off')\nplt.title(\"Word Cloud of Real Disaster news\")\nplt.tight_layout(pad=0)\nplt.show()", + "cell_id": 8, + "code": "sns.displot(data = train_df, kind = 'hist', x = 'hashtag_count', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)", "class": "Visualization", - "desc": "This code snippet generates and displays a word cloud from the concatenated text of real disaster tweets, providing a visual representation of the most common words used in tweets classified as real disasters.", + "desc": "This code creates a stacked histogram to visualize the distribution of the hashtag count versus the target variable in the training DataFrame, using seaborn\u2019s `displot` function.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.96466434 + "predicted_subclass_probability": 0.9985002 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 21, - "code": "wordcloud_true = WordCloud().generate(Not_Real_Disaster_text)\nplt.figure(figsize=(10,10))\nplt.imshow(wordcloud_true)\nplt.axis('off')\nplt.title(\"Word Cloud of Not RealDisaster twittes\")\nplt.tight_layout(pad=0)\nplt.show()\n", + "cell_id": 9, + "code": "sns.displot(data = train_df, kind = 'hist', x = 'word_count', hue = 'target', multiple = 'stack',bins=50,height = 5, aspect = 1.9)\n", "class": "Visualization", - "desc": "This code snippet generates and displays a word cloud from the concatenated text of \"not real disaster\" tweets, providing a visual representation of the most common words used in tweets classified as not real disasters.", + "desc": "This code creates a stacked histogram to visualize the distribution of word count versus the target variable in the training DataFrame, using seaborn\u2019s `displot` function.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.9845956 + "predicted_subclass_probability": 0.9985385 }, - "cluster": 0 + "cluster": -1 }], - "notebook_id": 23, - "notebook_name": "nlp-with-disaster-tweets" + "notebook_id": 20, + "notebook_name": "nlp-twitter-tuned-lgbm-model-tfidf-bert.ipynb" }, { "cells": [{ - "cell_id": 22, - "code": "sub = pd.read_csv(dir_path + \"sample_submission.csv\")\nprediction = (F.softmax(test_preds[0], dim=1)[:, 1]>min_threshold).int()\nsub = pd.read_csv(dir_path + \"sample_submission.csv\")\nsub[\"target\"] = prediction\nsub.to_csv(\"submission.csv\", index=False)", + "cell_id": 65, + "code": "# submit\nsubmission = pd.read_csv(\"/kaggle/input/nlp-getting-started/sample_submission.csv\")\nsubmission['target'] = 
np.round(test_pred).astype('int')\nsubmission.to_csv('submission.csv', index=False)\nsubmission.groupby('target').count()", "class": "Data Export", - "desc": "This snippet reads the sample submission file, assigns the model predictions to the \"target\" column after applying the determined threshold, and saves the updated DataFrame to a CSV file named \"submission.csv\".", + "desc": "This code snippet updates the sample submission file with the rounded predictions for the 'target' column and saves it as 'submission.csv', then displays a count of each target class using groupby and count methods from pandas.", "testing": { "class": "Data_Export", "subclass": "save_to_csv", "subclass_id": 25, - "predicted_subclass_probability": 0.974422 + "predicted_subclass_probability": 0.99921453 }, - "cluster": 0 + "cluster": -1 }, { "cell_id": 1, - "code": "dir_path = \"/kaggle/input/nlp-getting-started/\"\ntrain_df = pd.read_csv(dir_path + \"train.csv\")\ntest_df = pd.read_csv(dir_path + \"test.csv\")", + "code": "train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\nsubmit_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')", "class": "Data Extraction", - "desc": "This snippet defines the directory path for the dataset and loads the training and testing data from CSV files into pandas DataFrames.", + "desc": "This code snippet reads training and test datasets from CSV files using pandas' `read_csv` function.", "testing": { "class": "Data_Extraction", "subclass": "load_from_csv", "subclass_id": 45, - "predicted_subclass_probability": 0.99974483 + "predicted_subclass_probability": 0.9997477 }, - "cluster": 2 + "cluster": 0 }, { - "cell_id": 3, - "code": "train_df = train_df.drop(columns=[\"id\", \"keyword\", \"location\"])", - "class": "Data Transform", - "desc": "This snippet removes the columns \"id\", \"keyword\", and \"location\" from the `train_df` DataFrame to clean the data and focus on the essential features for analysis.", + "cell_id": 26, + "code": "import requests\nurl = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'\nfilename = url.split('/')[-1]\nr = requests.get(url)\nwith open(filename, \"wb\") as file:\n file.write(r.content)\n \n!ls", + "class": "Data Extraction", + "desc": "This code snippet downloads the GoogleNews Word2Vec model file using the `requests` library and writes it to the local filesystem.", "testing": { - "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.99919885 + "class": "Exploratory_Data_Analysis", + "subclass": "list_files", + "subclass_id": 88, + "predicted_subclass_probability": 0.48958522 }, "cluster": 1 }, { - "cell_id": 5, - "code": "def remove_URL(text):\n url = re.compile(r'https?://\\S+|www\\.\\S+')\n return url.sub(r'',text)\n\ntrain_df[\"text\"] = train_df[\"text\"].apply(remove_URL)\ntest_df[\"text\"] = test_df[\"text\"].apply(remove_URL)", - "class": "Data Transform", - "desc": "This snippet defines a function to remove URLs from text and applies it to the \"text\" column of both the `train_df` and `test_df` DataFrames to clean the text data.", + "cell_id": 31, + "code": "import gensim\nword2vec_path='./GoogleNews-vectors-negative300.bin.gz'\nword2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)", + "class": "Data Extraction", + "desc": "This code snippet loads the pre-trained GoogleNews Word2Vec model in binary format using the Gensim library.", "testing": { - "class": 
"Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.9983814 + "class": "Model_Train", + "subclass": "load_pretrained", + "subclass_id": 30, + "predicted_subclass_probability": 0.99531686 }, "cluster": 1 }, { - "cell_id": 6, - "code": "def remove_html(text):\n html=re.compile(r'<.*?>')\n return html.sub(r'',text)\n\ntrain_df[\"text\"] = train_df[\"text\"].apply(remove_html)\ntest_df[\"text\"] = test_df[\"text\"].apply(remove_html)", - "class": "Data Transform", - "desc": "This snippet defines a function to remove HTML tags from text and applies it to the \"text\" column of both the `train_df` and `test_df` DataFrames to further clean the text data.", + "cell_id": 58, + "code": "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py", + "class": "Data Extraction", + "desc": "This code snippet downloads the `tokenization.py` script from the TensorFlow Models GitHub repository using the `wget` command.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.995577 + "class": "Data_Extraction", + "subclass": "load_from_url", + "subclass_id": 42, + "predicted_subclass_probability": 0.8866123 }, "cluster": 1 }, { - "cell_id": 7, - "code": "def remove_emoji(text):\n emoji_pattern = re.compile(\"[\"\n u\"\\U0001F600-\\U0001F64F\" # emoticons\n u\"\\U0001F300-\\U0001F5FF\" # symbols & pictographs\n u\"\\U0001F680-\\U0001F6FF\" # transport & map symbols\n u\"\\U0001F1E0-\\U0001F1FF\" # flags (iOS)\n u\"\\U00002702-\\U000027B0\"\n u\"\\U000024C2-\\U0001F251\"\n \"]+\", flags=re.UNICODE)\n return emoji_pattern.sub(r'', text)\n\ntrain_df[\"text\"] = train_df[\"text\"].apply(remove_emoji)\ntest_df[\"text\"] = test_df[\"text\"].apply(remove_emoji)", - "class": "Data Transform", - "desc": "This snippet defines a function to remove emojis from text using a regex pattern and applies it to the \"text\" column of both the `train_df` and `test_df` DataFrames to ensure the text data is cleaned of emojis.", + "cell_id": 61, + "code": "%%time\nmodule_url = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1\"\nbert_layer = hub.KerasLayer(module_url, trainable=True)", + "class": "Data Extraction", + "desc": "This code snippet downloads and initializes a BERT layer from TensorFlow Hub, making it trainable within the model, and measures the time taken to do so.", "testing": { - "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.62529504 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.856542 }, "cluster": 1 }, { - "cell_id": 12, - "code": "train_tensor = tokenizer(list(train_df[\"text\"]), padding=\"max_length\",\n truncation=True, max_length=30,\n return_tensors=\"pt\")[\"input_ids\"]", - "class": "Data Transform", - "desc": "This snippet tokenizes the text data from the `train_df` DataFrame using the previously initialized tokenizer, with specified padding, truncation, and maximum length to convert the text into tensors suitable for model input.", + "cell_id": 62, + "code": "# read and encode train data\ntrain = pd.read_csv(\"/kaggle/input/nlp-getting-started/train.csv\")\n\ntrain_input = bert_encode(train.text.values, bert_layer, max_len=128)\ntrain_labels = np.array(train.target)", + "class": "Data Extraction", + "desc": "This code snippet reads the training data from a CSV 
file and uses the `bert_encode` function to encode the text column, also converting the target column to a NumPy array.", "testing": { - "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.96896535 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.99927765 }, "cluster": 1 }, { - "cell_id": 19, - "code": "test_tensor = tokenizer(list(test_df[\"text\"]),\n padding=\"max_length\",\n truncation=True,\n max_length=30,\n return_tensors=\"pt\")[\"input_ids\"]", + "cell_id": 9, + "code": "data = pd.concat([train_data, submit_data])\ndata.shape", "class": "Data Transform", - "desc": "This snippet tokenizes the text data from the `test_df` DataFrame using the same tokenizer settings as for the training data, converting the text into tensors suitable for model inference.", + "desc": "This code snippet concatenates the `train_data` and `submit_data` dataframes and then displays the shape of the resulting dataframe using pandas' `concat` and `shape` attributes.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.9433598 - }, - "cluster": 1 - }, { - "cell_id": 2, - "code": "train_df", - "class": "Exploratory Data Analysis", - "desc": "This snippet displays the content of the `train_df` DataFrame, which is typically used to perform an initial inspection of the training data.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99972683 + "subclass": "concatenate", + "subclass_id": 11, + "predicted_subclass_probability": 0.9939737 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 4, - "code": "train_df[\"target\"].value_counts()", - "class": "Exploratory Data Analysis", - "desc": "This snippet counts the occurrences of each unique value in the \"target\" column of the `train_df` DataFrame, providing insights into the class distribution of the target variable.", + "cell_id": 10, + "code": "data['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'https?\\S+'), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'[\\//:,.!?@&\\-\\'\\`\\\"\\_\\n\\#]'), ' ', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'<.*?>'), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(\"[\"\n u\"\\U0001F600-\\U0001F64F\" \n u\"\\U0001F300-\\U0001F5FF\" \n u\"\\U0001F680-\\U0001F6FF\" \n u\"\\U0001F1E0-\\U0001F1FF\" \n u\"\\U00002702-\\U000027B0\"\n u\"\\U000024C2-\\U0001F251\"\n \"]+\", flags=re.UNICODE), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'\\d'), '', x))\ndata['text'] = data['text'].apply(lambda x: re.sub(re.compile(r'[^\\w]'), ' ', x))\ndata['text'] = data['text'].str.lower()", + "class": "Data Transform", + "desc": "This code snippet performs a series of text cleaning operations on the 'text' column, removing URLs, punctuation, HTML tags, emojis, digits, and non-word characters, and converting the text to lowercase using regular expressions and pandas' `apply` method.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "count_values", - "subclass_id": 72, - "predicted_subclass_probability": 0.999521 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9965412 }, - "cluster": 9 + "cluster": 0 }, { - "cell_id": 8, - "code": 
"train_df", - "class": "Exploratory Data Analysis", - "desc": "This snippet displays the content of the `train_df` DataFrame, which is typically used to review the changes made during the data cleaning process.", - "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.99972683 + "cell_id": 11, + "code": "'''\ntext_series = data.loc[:,'text']\nfor i in range(len(text_series)):\n content = text_series.iloc[i]\n textblob = TextBlob(content)\n text_series.iloc[i] = textblob.correct()\n'''", + "class": "Data Transform", + "desc": "This code snippet (currently commented out) iterates over the 'text' column and corrects spelling errors in each text entry using the TextBlob library.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9023682 }, - "cluster": -1 + "cluster": 6 }, { - "cell_id": 0, - "code": "import numpy as np\nimport pandas as pd\nfrom fastai.text.all import *\nimport re", - "class": "Imports and Environment", - "desc": "This snippet imports essential libraries including NumPy for numerical computations, pandas for data manipulation, the fastai library for text processing and model training, and the regex library (re) for regular expression operations.", + "cell_id": 12, + "code": "clean_train = data[0:train_data.shape[0]]\nclean_submit = data[train_data.shape[0]:-1]\n\nX_train, X_test, y_train, y_test = train_test_split(clean_train['text'], clean_train['target'],\n test_size = 0.2, random_state = 4)", + "class": "Data Transform", + "desc": "This code snippet splits the concatenated data back into training and submission datasets and then splits the cleaned training data into training and test sets for model training and evaluation using `train_test_split` from Scikit-learn.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99934 + "class": "Data_Transform", + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9978915 }, "cluster": 0 }, { - "cell_id": 10, - "code": "from transformers import AutoTokenizer, AutoModelForSequenceClassification", - "class": "Imports and Environment", - "desc": "This snippet imports the `AutoTokenizer` and `AutoModelForSequenceClassification` classes from the transformers library for tokenizing text data and building a sequence classification model, respectively.", + "cell_id": 13, + "code": "def tfidf(words):\n tfidf_vectorizer = TfidfVectorizer()\n data_feature = tfidf_vectorizer.fit_transform(words)\n return data_feature, tfidf_vectorizer\n\nX_train_tfidf, tfidf_vectorizer = tfidf(X_train.tolist())\nX_test_tfidf = tfidf_vectorizer.transform(X_test.tolist())", + "class": "Data Transform", + "desc": "This code snippet defines a function `tfidf` to convert text data into TF-IDF features using Scikit-learn's `TfidfVectorizer`, and then applies this function to transform the `X_train` and `X_test` datasets.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.99929786 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.9976406 }, - "cluster": -1 + "cluster": 5 }, { - "cell_id": 18, - "code": "from sklearn.metrics import f1_score\n\npreds, targs = learn.get_preds()\n\nmin_threshold = None\nmax_f1 = 
-float(\"inf\")\nthresholds = np.linspace(0.3, 0.7, 50)\nfor threshold in thresholds:\n f1 = f1_score(targs, F.softmax(preds, dim=1)[:, 1]>threshold)\n if f1 > max_f1:\n min_threshold = threshold\n min_f1 = f1\n print(f\"threshold:{threshold:.4f} - f1:{f1:.4f}\")", - "class": "Model Evaluation", - "desc": "This snippet computes the F1 score for different probability thresholds on the validation set predictions, identifying the threshold that maximizes the F1 score.", + "cell_id": 27, + "code": "stop_words = stopwords.words('english')\nfor word in ['us','no','yet']:\n stop_words.append(word)\n\ndata_list = []\ntext_series = data['text']\nfor i in range(len(text_series)):\n content = text_series.iloc[i]\n cutwords = [word for word in content.split(' ') if word not in stop_words if len(word) != 0]\n data_list.append(cutwords)", + "class": "Data Transform", + "desc": "This code snippet creates a list of stop words, adds custom words to this list, and then processes the text data to remove the stop words, producing a list of cleaned words for each text entry.", "testing": { - "class": "Model_Train", - "subclass": "find_best_params", - "subclass_id": 2, - "predicted_subclass_probability": 0.6100683 + "class": "Data_Transform", + "subclass": "string_transform", + "subclass_id": 78, + "predicted_subclass_probability": 0.9944119 }, - "cluster": 0 + "cluster": 6 }, { - "cell_id": 21, - "code": "test_preds = learn.get_preds(dl=test_dl)", - "class": "Model Evaluation", - "desc": "This snippet generates predictions for the test dataset using the trained model and the previously defined test data loader.", + "cell_id": 33, + "code": "def get_textVector(data_list, word2vec, textsVectors_list):\n for i in range(len(data_list)):\n words_perText = data_list[i]\n if len(words_perText) < 1:\n words_vector = [np.zeros(300)]\n else:\n words_vector = [word2vec.wv[k] if k in word2vec_model else np.zeros(300) for k in words_perText]\n text_vector = np.array(words_vector).mean(axis=0)\n textsVectors_list.append(text_vector)\n return textsVectors_list", + "class": "Data Transform", + "desc": "This code snippet defines a function `get_textVector` that converts each list of words into a text vector by averaging the word vectors from a Word2Vec model and appends these text vectors to a provided list.", "testing": { - "class": "Model_Evaluation", - "subclass": "predict_on_test", - "subclass_id": 48, - "predicted_subclass_probability": 0.9943605 + "class": "Data_Transform", + "subclass": "feature_engineering", + "subclass_id": 8, + "predicted_subclass_probability": 0.966133 }, "cluster": 0 }, { - "cell_id": 11, - "code": "tokenizer = AutoTokenizer.from_pretrained(\"roberta-large\")", - "class": "Model Training", - "desc": "This snippet initializes a tokenizer from the pre-trained \"roberta-large\" model, which will be used to convert text data into the format required by the model for training and inference.", + "cell_id": 34, + "code": "textsVectors_list = []\nget_textVector(data_list, word2vec_model, textsVectors_list)\nX = np.array(textsVectors_list)", + "class": "Data Transform", + "desc": "This code snippet generates text vectors for the processed text data using the `get_textVector` function and stores them into an array `X`.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9938804 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.91989833 }, - "cluster": 1 + 
"cluster": 4 }, { - "cell_id": 13, - "code": "class TweetDataset:\n def __init__(self, tensors, targ, ids):\n self.text = tensors[ids, :]\n self.targ = targ[ids].reset_index(drop=True)\n \n def __len__(self):\n return len(self.text)\n \n def __getitem__(self, idx):\n \n t = self.text[idx]\n y = self.targ[idx]\n \n return t, tensor(y)", - "class": "Model Training", - "desc": "This snippet defines a custom dataset class `TweetDataset` that initializes with text tensors and target labels, providing methods to get the length of the dataset and retrieve individual items, structured for use in model training.", + "cell_id": 36, + "code": "word2vec_X = X[0:train_data.shape[0]]\ny = data['target'][0:train_data.shape[0]]\nword2vec_submit = X[train_data.shape[0]:-1]\n\nX_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(word2vec_X, y,\n test_size = 0.2, random_state = 4)", + "class": "Data Transform", + "desc": "This code snippet splits the array of text vectors back into training and submission datasets and then splits the training text vectors and labels into training and test sets using `train_test_split` from Scikit-learn.", "testing": { "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.69012034 + "subclass": "split", + "subclass_id": 13, + "predicted_subclass_probability": 0.9981791 }, "cluster": 0 }, { - "cell_id": 14, - "code": "train_ids, valid_ids = RandomSplitter()(train_df)\n\n\ntarget = train_df[\"target\"]\n\ntrain_ds = TweetDataset(train_tensor, target, train_ids)\nvalid_ds = TweetDataset(train_tensor, target, valid_ids)\n\ntrain_dl = DataLoader(train_ds, bs=64)\nvalid_dl = DataLoader(valid_ds, bs=512)\ndls = DataLoaders(train_dl, valid_dl).to(\"cuda\")", - "class": "Model Training", - "desc": "This snippet splits the training data into training and validation sets using random splitting, creates datasets and corresponding data loaders for both sets, and moves the data loaders to the GPU for model training.", + "cell_id": 44, + "code": "tokenizer = Tokenizer()\ntokenizer.fit_on_texts(data_list)\nsequences = tokenizer.texts_to_sequences(data_list)\nword_index = tokenizer.word_index\ncnn_data = pad_sequences(sequences, maxlen = max_sequence_length)\ncnn_label = to_categorical(np.asarray(train_data['target']))\nprint('len of word_index:', len(word_index))\nprint('shape of data tensor:', cnn_data.shape)\nprint('shape of label tensoe:', cnn_label.shape)", + "class": "Data Transform", + "desc": "This code snippet tokenizes the processed text data, converts the tokens into sequences, pads these sequences to a maximum length, and converts labels into categorical format using Keras' `Tokenizer`, `texts_to_sequences`, `pad_sequences`, and `to_categorical` methods.", + "testing": { + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.6572918 + }, + "cluster": 5 + }, { + "cell_id": 45, + "code": "trainCNN_data = cnn_data[0:train_data.shape[0]]\nX_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(trainCNN_data, cnn_label,\n test_size = 0.2, random_state = 4)\nX_cnn, X_val_cnn, y_cnn, y_val_cnn = train_test_split(X_train_cnn, y_train_cnn,\n test_size = 0.2, random_state = 4)", + "class": "Data Transform", + "desc": "This code snippet splits the padded sequence data into CNN training and submission datasets, then further splits the CNN training data into training and test sets, and finally splits the training data into training 
and validation sets using `train_test_split` from Scikit-learn.", "testing": { "class": "Data_Transform", "subclass": "split", "subclass_id": 13, - "predicted_subclass_probability": 0.50315964 + "predicted_subclass_probability": 0.99836665 }, "cluster": 0 }, { - "cell_id": 15, - "code": "bert = AutoModelForSequenceClassification.from_pretrained(\"roberta-large\", num_labels=2).train().to(\"cuda\")\n\nclass BertClassifier(Module):\n def __init__(self, bert):\n self.bert = bert\n def forward(self, x):\n return self.bert(x).logits\n\nmodel = BertClassifier(bert)", - "class": "Model Training", - "desc": "This snippet initializes a pre-trained \"roberta-large\" model for sequence classification with two labels, encapsulates it within a custom `BertClassifier` module to define its forward pass, and transfers the model to the GPU for training.", + "cell_id": 50, + "code": "embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))\nfor word, i in word_index.items(): \n if word in word2vec_model:\n embedding_matrix[i] = np.asarray(word2vec_model.wv[word])", + "class": "Data Transform", + "desc": "This code snippet creates an embedding matrix by mapping the words in the tokenizer's word index to their corresponding word vectors from the pre-trained Word2Vec model, filling in a zero vector for missing words.", "testing": { - "class": "Model_Train", - "subclass": "choose_model_class", - "subclass_id": 4, - "predicted_subclass_probability": 0.86304575 + "class": "Data_Transform", + "subclass": "prepare_x_and_y", + "subclass_id": 21, + "predicted_subclass_probability": 0.739272 }, - "cluster": 0 + "cluster": 8 }, { - "cell_id": 16, - "code": "learn = Learner(dls, model, metrics=[accuracy, F1Score()]).to_fp16()\nlearn.lr_find()", - "class": "Model Training", - "desc": "This snippet creates a `Learner` object with the specified data loaders, model, and evaluation metrics, enables mixed precision training, and performs a learning rate finder analysis to identify an optimal learning rate for model training.", + "cell_id": 51, + "code": "embedding_layer = Embedding(len(word_index)+1,\n embedding_dim,\n weights = [embedding_matrix],\n input_length = max_sequence_length,\n trainable = False)", + "class": "Data Transform", + "desc": "This code snippet creates an embedding layer for a neural network using the pre-trained Word2Vec embeddings, ensuring that the embeddings are not trainable during model training by setting `trainable = False`.", "testing": { "class": "Model_Train", "subclass": "choose_model_class", "subclass_id": 4, - "predicted_subclass_probability": 0.9948531 + "predicted_subclass_probability": 0.99493694 }, "cluster": 0 }, { - "cell_id": 17, - "code": "learn.fit_one_cycle(3, lr_max=1e-5)", - "class": "Model Training", - "desc": "This snippet trains the model using the one-cycle policy for 3 epochs with a maximum learning rate of \\(1 \\times 10^{-5}\\).", + "cell_id": 2, + "code": "train_data[train_data['text'].isna()]", + "class": "Exploratory Data Analysis", + "desc": "This code snippet filters the `train_data` dataframe to display rows where the 'text' column has missing values using the `isna()` function.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.9996973 + "class": "Exploratory_Data_Analysis", + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.88515306 }, - "cluster": -1 + "cluster": 4 }, { - "cell_id": 20, - "code": "class TestDS:\n def __init__(self, 
tensors):\n self.tensors = tensors\n \n def __len__(self):\n return len(self.tensors)\n \n def __getitem__(self, idx):\n t = self.tensors[idx]\n return t, tensor(0)\n\ntest_dl = DataLoader(TestDS(test_tensor), bs=128)", - "class": "Model Training", - "desc": "This snippet defines a `TestDS` dataset class for the test data, providing methods to get the length of the dataset and retrieve individual items, and then creates a data loader for this test dataset for use in model inference.", + "cell_id": 3, + "code": "train_data.info()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet provides a concise summary of the `train_data` dataframe, including the data types and non-null counts for each column using the `info()` method.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.9415805 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table_attributes", + "subclass_id": 40, + "predicted_subclass_probability": 0.99936634 }, "cluster": 0 }, { - "cell_id": 9, - "code": "train_df[\"text\"].apply(lambda x:len(x.split())).plot(kind=\"hist\");", - "class": "Visualization", - "desc": "This snippet generates a histogram plot to visualize the distribution of word counts in the \"text\" column of the `train_df` DataFrame.", + "cell_id": 4, + "code": "train_data.groupby('target').count()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet groups the `train_data` dataframe by the 'target' column and counts the number of occurrences in each group using the `groupby` and `count` methods.", "testing": { "class": "Data_Transform", - "subclass": "string_transform", - "subclass_id": 78, - "predicted_subclass_probability": 0.6361653 + "subclass": "groupby", + "subclass_id": 60, + "predicted_subclass_probability": 0.9970409 }, - "cluster": 0 - }], - "notebook_id": 24, - "notebook_name": "roberta-with-pytorch-and-fastai" - }, { - "cells": [{ - "cell_id": 34, - "code": "def submission(model, test):\n sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')\n predictions = model.predict(test)\n y_preds = [ int(i) for i in np.rint(predictions)]\n sub = pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_preds})\n sub.to_csv('submission.csv', index=False)", - "class": "Data Export", - "desc": "This code snippet defines a function `submission` that generates predictions using the trained model on a test dataset, prepares the submission DataFrame with the required format, and writes it to a CSV file named 'submission.csv'.", + "cluster": 4 + }, { + "cell_id": 14, + "code": "X_train_tfidf.shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the shape of the TF-IDF transformed training dataset `X_train_tfidf` to understand its dimensionality.", "testing": { - "class": "Data_Export", - "subclass": "save_to_csv", - "subclass_id": 25, - "predicted_subclass_probability": 0.9988925 + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9994598 }, - "cluster": 0 + "cluster": 1 }, { - "cell_id": 35, - "code": "submission(bert_classifier, test_ds)", - "class": "Data Export", - "desc": "This code snippet calls the `submission` function to generate predictions using the `bert_classifier` model on the `test_ds` dataset and exports the results to a CSV file named 'submission.csv'.", + "cell_id": 28, + "code": "for i in range(len(data_list)):\n content = data_list[i]\n 
if len(content) <1:\n print(i)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet iterates through the `data_list` to identify and print the indices of entries that have no content after stop word removal.", "testing": { - "class": "Model_Evaluation", - "subclass": "compute_test_metric", - "subclass_id": 49, - "predicted_subclass_probability": 0.61737144 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.8840107 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 4, - "code": "train_full = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\ntest_full = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')\n\nprint('Training Set Shape = {}'.format(train_full.shape))\nprint('Training Set Memory Usage = {:.2f}MB'.format(train_full.memory_usage().sum()/2**20))\n\nprint('Test Set Shape = {}'.format(test_full.shape))\nprint('Test Set Memory Usage = {:.2f}MB'.format(test_full.memory_usage().sum()/2**20))", - "class": "Data Extraction", - "desc": "This code snippet reads the training and test datasets from CSV files into pandas DataFrames, then prints their shapes and memory usage to provide an overview of the data size.", + "cell_id": 29, + "code": "data_list[7626]", + "class": "Exploratory Data Analysis", + "desc": "This code snippet outputs the contents of the `data_list` at index 7626 to inspect the text entry at that specific index after stop word removal.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.9972459 + "class": "Exploratory_Data_Analysis", + "subclass": "show_table", + "subclass_id": 41, + "predicted_subclass_probability": 0.9993057 }, - "cluster": -1 + "cluster": 5 + }, { + "cell_id": 32, + "code": "word2vec_model.wv['earthquake'].shape", + "class": "Exploratory Data Analysis", + "desc": "This code snippet retrieves and displays the shape of the word vector for the word 'earthquake' from the loaded Word2Vec model using the Gensim library.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9989441 + }, + "cluster": 4 + }, { + "cell_id": 35, + "code": "pd.isnull(X).any()", + "class": "Exploratory Data Analysis", + "desc": "This code snippet checks if there are any null values in the array `X` and returns a boolean array indicating the presence of nulls for each feature.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "count_missing_values", + "subclass_id": 39, + "predicted_subclass_probability": 0.9978543 + }, + "cluster": 6 }, { - "cell_id": 8, - "code": "train_full.text", - "class": "Data Extraction", - "desc": "This code snippet accesses and displays the 'text' column of the training dataset DataFrame named `train_full`.", + "cell_id": 37, + "code": "print(X_train_word2vec.shape, y_train_word2vec.shape)", + "class": "Exploratory Data Analysis", + "desc": "This code snippet prints the shapes of the `X_train_word2vec` and `y_train_word2vec` arrays to understand their dimensionality.", "testing": { "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9863797 + "subclass": "show_shape", + "subclass_id": 58, + "predicted_subclass_probability": 0.9980696 }, - "cluster": -1 + "cluster": 2 }, { - "cell_id": 11, - "code": "# Read commited-dataset\ndf_train = 
pd.read_csv(\"/kaggle/input/disastertweet-prepared2/train_prepared.csv\")\ndf_test = pd.read_csv(\"/kaggle/input/disastertweet-prepared2/test_prepared.csv\")", - "class": "Data Extraction", - "desc": "This code snippet reads prepared training and test datasets from CSV files into pandas DataFrames named `df_train` and `df_test`.", + "cell_id": 42, + "code": "lenlen = []\nfor i in range(len(data_list)):\n content = data_list[i]\n perlen = len(content)\n lenlen.append(perlen)\nprint(max(lenlen))", + "class": "Exploratory Data Analysis", + "desc": "This code snippet calculates the length of each processed text entry in terms of the number of words, stores these lengths in a list, and prints the maximum length.", "testing": { - "class": "Data_Extraction", - "subclass": "load_from_csv", - "subclass_id": 45, - "predicted_subclass_probability": 0.99974114 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.7533818 }, "cluster": 3 }, { - "cell_id": 17, - "code": "df_train", - "class": "Data Extraction", - "desc": "This code snippet accesses and displays the `df_train` DataFrame to inspect its content.", + "cell_id": 0, + "code": "import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom nltk.corpus import stopwords\nfrom nltk.util import ngrams\nfrom nltk.stem import WordNetLemmatizer\nimport re\nfrom textblob import TextBlob\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import GridSearchCV\nimport tensorflow as tf\nfrom keras.preprocessing.text import Tokenizer\nfrom keras.preprocessing.sequence import pad_sequences\nfrom keras.utils import to_categorical\nfrom gensim.models import Word2Vec\nfrom gensim.models.keyedvectors import KeyedVectors\nimport time\nfrom keras.layers import Dense, Input, Flatten, Dropout\nfrom keras.layers import Conv1D, MaxPooling1D, Embedding\nfrom keras.models import Sequential\nfrom keras import losses\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.models import Model", + "class": "Imports and Environment", + "desc": "This code snippet imports various libraries and modules required for data manipulation, visualization, natural language processing, machine learning, and deep learning tasks including pandas, NumPy, Matplotlib, Seaborn, NLTK, Scikit-learn, TensorFlow, Keras, and Gensim.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.9994585 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9993228 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 20, - "code": "# Spilt data\nX_train, X_val, y_train, y_val = train_test_split(df_train[['text','keyword']],\n df_train.target, \n test_size=0.2, \n random_state=42)\nX_train.shape, X_val.shape", - "class": "Data Extraction", - "desc": "This code snippet splits the `df_train` data into training and validation sets with an 80-20 split, based on both 'text' and 'keyword' features, and prints their shapes.", + "cell_id": 5, + "code": "%matplotlib inline", + "class": "Imports and 
Environment", + "desc": "This code snippet enables the inline display of Matplotlib plots directly within the Jupyter notebook interface.", "testing": { - "class": "Data_Transform", - "subclass": "split", - "subclass_id": 13, - "predicted_subclass_probability": 0.998221 + "class": "Imports_and_Environment", + "subclass": "set_options", + "subclass_id": 23, + "predicted_subclass_probability": 0.9991172 }, - "cluster": -1 + "cluster": 1 }, { - "cell_id": 23, - "code": "a3 = configure_dataset(train_ds, shuffle=True)\ndict3 = []\nfor elem in a3:\n dict3.append(elem[0]['text'][0])\ndict3[:10]", - "class": "Data Extraction", - "desc": "This code snippet configures the `train_ds` dataset using the `configure_dataset` function with shuffling enabled, extracts the 'text' element from the first batch of data, and displays the first 10 text samples to inspect the data.", + "cell_id": 25, + "code": "!pip install gensim -i http://pypi.douban.com/simple --trusted-host pypi.douban.com", + "class": "Imports and Environment", + "desc": "This code snippet installs the `gensim` library using pip from a specific PyPI mirror and sets it as a trusted host.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.61363506 + "class": "Imports_and_Environment", + "subclass": "install_modules", + "subclass_id": 87, + "predicted_subclass_probability": 0.9962657 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 12, - "code": "# Only apply 'keyword' columns in full data, because other features cleaned in df_train/test\ntrain_full = clean_text(train_full,'keyword')\ntest_full = clean_text(test_full, 'keyword')", - "class": "Data Transform", - "desc": "This code snippet applies text cleaning to the 'keyword' columns in both the full training and test datasets using the `clean_text` function from the `dataprep.clean` library.", + "cell_id": 43, + "code": "max_sequence_length = 26\nembedding_dim = 300", + "class": "Imports and Environment", + "desc": "This code snippet defines two variables, `max_sequence_length` and `embedding_dim`, to specify the maximum length of input sequences and the dimensionality of word embeddings, respectively.", "testing": { - "class": "Data_Transform", - "subclass": "drop_column", - "subclass_id": 10, - "predicted_subclass_probability": 0.9898755 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.9975586 }, - "cluster": 1 + "cluster": 0 }, { - "cell_id": 13, - "code": "# Adding cleaned data into df_train/test\ndf_train['keyword'] = train_full['keyword']\ndf_test['keyword'] = test_full['keyword']", - "class": "Data Transform", - "desc": "This code snippet integrates the cleaned 'keyword' data from the `train_full` and `test_full` datasets into the `df_train` and `df_test` DataFrames, respectively.", + "cell_id": 56, + "code": "tf.__version__", + "class": "Imports and Environment", + "desc": "This code snippet retrieves and prints the version of the TensorFlow library currently being used.", "testing": { - "class": "Data_Transform", - "subclass": "feature_engineering", - "subclass_id": 8, - "predicted_subclass_probability": 0.99662316 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.9983907 }, "cluster": 1 }, { - "cell_id": 15, - "code": "def extract_keywords(text):\n potential_keywords = []\n TOP_KEYWORD = -1\n # Create a list for keyword parts of speech\n 
pos_tag = ['ADJ', 'NOUN', 'PROPN']\n doc = nlp_spacy(text)\n \n for i in doc:\n if i.pos_ in pos_tag:\n potential_keywords.append(i.text)\n\n document_embed = sentence_enc([text])\n potential_embed = sentence_enc(potential_keywords) \n \n vector_distances = cosine_similarity(document_embed, potential_embed)\n keyword = [potential_keywords[i] for i in vector_distances.argsort()[0][TOP_KEYWORD:]]\n\n return keyword\n\ndef keyword_filler(keyword, text):\n if pd.isnull(keyword):\n try:\n keyword = extract_keywords(text)[0]\n except:\n keyword = '' \n \n return keyword", - "class": "Data Transform", - "desc": "This code snippet defines two functions, `extract_keywords` to extract keywords from a given text using spaCy for part-of-speech tagging and the Universal Sentence Encoder for embedding comparison, and `keyword_filler` to fill in missing keywords by extracting them from the given text or assigning an empty string if extraction fails.", + "cell_id": 57, + "code": "import tensorflow_hub as hub\nhub.__version__", + "class": "Imports and Environment", + "desc": "This code snippet imports the TensorFlow Hub library and then prints its version to verify the installation.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.41046342 + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99918145 }, "cluster": 1 + }, { + "cell_id": 59, + "code": "import tensorflow as tf\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nimport tensorflow_hub as hub\n\nimport tokenization", + "class": "Imports and Environment", + "desc": "This code snippet imports TensorFlow, the ModelCheckpoint callback from Keras, TensorFlow Hub, and the `tokenization` script for BERT tokenization.", + "testing": { + "class": "Imports_and_Environment", + "subclass": "import_modules", + "subclass_id": 22, + "predicted_subclass_probability": 0.99932754 + }, + "cluster": 0 }, { "cell_id": 16, - "code": "df_train.keyword = pd.DataFrame(list(map(keyword_filler, df_train.keyword, df_train.text))).astype(str)\ndf_test.keyword = pd.DataFrame(list(map(keyword_filler, df_test.keyword, df_test.text))).astype(str)\n\nprint('Null Training Keywords => ', df_train['keyword'].isnull().any())\nprint('Null Test Keywords => ', df_test['keyword'].isnull().any())", - "class": "Data Transform", - "desc": "This code snippet applies the `keyword_filler` function to fill missing keywords in both `df_train` and `df_test` DataFrames and then verifies that there are no null values left in the 'keyword' columns.", + "code": "def score_metrics(y_test, y_predicted):\n accuracy = accuracy_score(y_test, y_predicted)\n precision = precision_score(y_test, y_predicted)\n recall = recall_score(y_test, y_predicted)\n print(\"accuracy = %0.3f, precision = %0.3f, recall = %0.3f\" % (accuracy, precision, recall))", + "class": "Model Evaluation", + "desc": "This code snippet defines a function `score_metrics` that calculates and prints the accuracy, precision, and recall scores of the model predictions compared to the test labels using Scikit-learn's evaluation metrics.", "testing": { - "class": "Data_Transform", - "subclass": "data_type_conversions", - "subclass_id": 16, - "predicted_subclass_probability": 0.95129913 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9981325 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 21, - "code": 
"train_ds = tf.data.Dataset.from_tensor_slices((dict(X_train), y_train))\nval_ds = tf.data.Dataset.from_tensor_slices((dict(X_val), y_val))\ntest_ds = tf.data.Dataset.from_tensor_slices(dict(df_test[['text','keyword']]))", - "class": "Data Transform", - "desc": "This code snippet converts the training, validation, and test sets into TensorFlow datasets (`train_ds`, `val_ds`, `test_ds`) from the respective pandas DataFrames to facilitate efficient data pipeline handling in TensorFlow models.", + "cell_id": 17, + "code": "score_metrics(y_test, y_predicted_lr)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the Logistic Regression model's performance on the test data by calling the `score_metrics` function to print accuracy, precision, and recall scores.", "testing": { - "class": "Data_Transform", - "subclass": "create_dataframe", - "subclass_id": 12, - "predicted_subclass_probability": 0.8992567 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9981065 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 22, - "code": "AUTOTUNE = tf.data.experimental.AUTOTUNE\n\nBUFFER_SIZE = 1000\nBATCH_SIZE = 32\nRANDOM_SEED = 319\n\ndef configure_dataset(dataset, shuffle=False, test=False):\n if shuffle:\n dataset = dataset.cache()\\\n .shuffle(BUFFER_SIZE, seed=RANDOM_SEED, reshuffle_each_iteration=True)\\\n .batch(BATCH_SIZE, drop_remainder=True)\\\n .prefetch(AUTOTUNE)\n elif test:\n dataset = dataset.cache()\\\n .batch(BATCH_SIZE, drop_remainder=False)\\\n .prefetch(AUTOTUNE)\n else:\n dataset = dataset.cache()\\\n .batch(BATCH_SIZE, drop_remainder=True)\\\n .prefetch(AUTOTUNE)\n return dataset", - "class": "Data Transform", - "desc": "This code snippet defines the `configure_dataset` function, which configures TensorFlow datasets for training, validation, or testing by applying caching, shuffling (if specified), batching, and prefetching to optimize data loading performance.", + "cell_id": 18, + "code": "def plot_confusion_matrix(y_test, y_predicted, title='Confusion Matrix'):\n cm = confusion_matrix(y_test, y_predicted)\n plt.figure(figsize=(8,6))\n sns.heatmap(cm,annot=True, fmt='.20g')\n plt.title(title)\n plt.ylabel('True label')\n plt.xlabel('Predicted label')", + "class": "Model Evaluation", + "desc": "This code snippet defines a function `plot_confusion_matrix` that creates and displays a heatmap of the confusion matrix for the true versus predicted labels using Seaborn and Matplotlib.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.49705026 + "class": "Visualization", + "subclass": "heatmap", + "subclass_id": 80, + "predicted_subclass_probability": 0.7525936 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 24, - "code": "# Configure the datasets\ntrain_ds = configure_dataset(train_ds, shuffle=True)\nval_ds = configure_dataset(val_ds)\ntest_ds = configure_dataset(test_ds, test=True)", - "class": "Data Transform", - "desc": "This code snippet configures the `train_ds`, `val_ds`, and `test_ds` datasets by applying the `configure_dataset` function with appropriate parameters for shuffling, caching, batching, and prefetching to optimize data loading.", + "cell_id": 19, + "code": "plot_confusion_matrix(y_test, y_predicted_lr)", + "class": "Model Evaluation", + "desc": "This code snippet calls the `plot_confusion_matrix` function to visualize the confusion matrix for the Logistic Regression model's predictions on the 
test data.", "testing": { - "class": "Data_Transform", - "subclass": "prepare_x_and_y", - "subclass_id": 21, - "predicted_subclass_probability": 0.36260855 + "class": "Visualization", + "subclass": "plot_predictions", + "subclass_id": 56, + "predicted_subclass_probability": 0.76843596 }, - "cluster": 1 + "cluster": 3 }, { - "cell_id": 25, - "code": "# Free memory\ndel X_train, X_val, y_train, y_val, df_train, df_test, train_full, test_full", - "class": "Data Transform", - "desc": "This code snippet deletes the intermediate DataFrames and datasets (`X_train`, `X_val`, `y_train`, `y_val`, `df_train`, `df_test`, `train_full`, and `test_full`) to free up memory resources.", + "cell_id": 22, + "code": "y_predicted_dt = df_tfidf.predict(X_test_tfidf)", + "class": "Model Evaluation", + "desc": "This code snippet uses the best estimator from the grid search to make predictions on the TF-IDF transformed test data.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.4480442 + "class": "Model_Evaluation", + "subclass": "predict_on_test", + "subclass_id": 48, + "predicted_subclass_probability": 0.99451977 }, - "cluster": 1 + "cluster": 2 }, { - "cell_id": 5, - "code": "plot(train_full)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet generates various exploratory data analysis visualizations for the training dataset using the `plot` function from the `dataprep.eda` library.", + "cell_id": 23, + "code": "score_metrics(y_test, y_predicted_dt)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the Decision Tree model's performance on the test data by calling the `score_metrics` function to print accuracy, precision, and recall scores.", + "testing": { + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9980464 + }, + "cluster": 3 + }, { + "cell_id": 24, + "code": "plot_confusion_matrix(y_test, y_predicted_dt)", + "class": "Model Evaluation", + "desc": "This code snippet calls the `plot_confusion_matrix` function to visualize the confusion matrix for the Decision Tree model's predictions on the test data.", "testing": { "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99482274 + "subclass": "plot_predictions", + "subclass_id": 56, + "predicted_subclass_probability": 0.7584198 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 6, - "code": "create_report(train_full)", - "class": "Exploratory Data Analysis", - "desc": "This code snippet creates an automated exploratory data analysis report for the training dataset using the `create_report` function from the `dataprep.eda` library.", + "cell_id": 39, + "code": "score_metrics(y_test_word2vec, y_predicted_word2vec_lr)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the Logistic Regression model's performance on the Word2Vec transformed test data by calling the `score_metrics` function to print accuracy, precision, and recall scores.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table_attributes", - "subclass_id": 40, - "predicted_subclass_probability": 0.5819901 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9982146 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 7, - "code": "plot(train_full, 'text')", - "class": "Exploratory Data Analysis", - "desc": "This code 
snippet generates visualizations specifically for the 'text' column of the training dataset using the `plot` function from the `dataprep.eda` library.", + "cell_id": 40, + "code": "plot_confusion_matrix(y_test_word2vec, y_predicted_word2vec_lr)", + "class": "Model Evaluation", + "desc": "This code snippet calls the `plot_confusion_matrix` function to visualize the confusion matrix for the Logistic Regression model's predictions on the Word2Vec transformed test data.", "testing": { "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.99791676 + "subclass": "plot_predictions", + "subclass_id": 56, + "predicted_subclass_probability": 0.84631115 }, - "cluster": -1 + "cluster": 3 }, { - "cell_id": 9, - "code": "plot(train_full, \"text\", \"target\")", - "class": "Exploratory Data Analysis", - "desc": "This code snippet generates visualizations comparing the 'text' column against the 'target' column in the training dataset using the `plot` function from the `dataprep.eda` library.", + "cell_id": 41, + "code": "compare_list = []\nfor (i,j) in zip(y_test_word2vec, y_predicted_word2vec_lr):\n k = i - j\n compare_list.append(k)\n\nwrong_num = [i for i,j in enumerate(compare_list) if j != 0]\ntext_series[0:train_data.shape[0]][wrong_num]", + "class": "Model Evaluation", + "desc": "This code snippet creates a list of differences between actual and predicted labels to identify misclassified instances and then displays the corresponding text entries from the test set that were misclassified.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.988972 + "class": "Exploratory_Data_Analysis", + "subclass": "define_variables", + "subclass_id": 77, + "predicted_subclass_probability": 0.96445125 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 10, - "code": "df1 = train_full.text[train_full.target == 0]\ndf2 = train_full.text[train_full.target == 1]\nplot_diff([df1, df2])", - "class": "Exploratory Data Analysis", - "desc": "This code snippet segments the 'text' column into two groups based on the 'target' value (0 or 1) and then generates visualizations comparing these two groups using the `plot_diff` function from the `dataprep.eda` library.", + "cell_id": 49, + "code": "test_loss, test_acc = CNNmodel.evaluate(X_test_cnn, y_test_cnn, verbose=2)\nprint('test loss:',test_loss)\nprint('test acc:',test_acc)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the CNN model's performance on the test data by calculating the loss and accuracy, and then prints these metrics.", "testing": { - "class": "Visualization", - "subclass": "distribution", - "subclass_id": 33, - "predicted_subclass_probability": 0.9446237 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9945109 }, - "cluster": -1 + "cluster": 0 }, { - "cell_id": 0, - "code": "! pip install tf-models-official==2.4.0 -q\n! pip install tensorflow-gpu==2.4.1 -q\n! pip install tensorflow-text==2.4.1 -q\n! python -m spacy download en_core_web_sm -q\n! 
pip install dataprep | grep -v 'already satisfied'", - "class": "Imports and Environment", - "desc": "This code snippet installs specific versions of necessary machine learning and NLP libraries, including TensorFlow, tf-models-official, tensorflow-text, and spaCy, as well as the dataprep library.", + "cell_id": 55, + "code": "test_loss, test_acc = model.evaluate(X_test_cnn, y_test_cnn, verbose=2)\nprint('test loss:',test_loss)\nprint('test acc:',test_acc)", + "class": "Model Evaluation", + "desc": "This code snippet evaluates the CNN model's performance on the test data by calculating the loss and accuracy, and then prints these metrics.", "testing": { - "class": "Imports_and_Environment", - "subclass": "install_modules", - "subclass_id": 87, - "predicted_subclass_probability": 0.9604686 + "class": "Model_Evaluation", + "subclass": "compute_test_metric", + "subclass_id": 49, + "predicted_subclass_probability": 0.9897344 }, "cluster": 0 }, { - "cell_id": 1, - "code": "import pandas as pd\nimport numpy as np\nnp.set_printoptions(precision=4)\n\nimport tensorflow as tf\nfrom tensorflow import keras\n\n# Visualization\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nfrom dataprep.eda import plot, plot_diff, plot_correlation, create_report\nfrom dataprep.clean import clean_text\n\n# Preprocessing and Modelling\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport spacy\nimport tensorflow_text as text\nimport tensorflow_hub as hub\nfrom tensorflow.keras.layers import Input, Dense, Flatten, Dropout, concatenate \nfrom tensorflow.keras import Model, regularizers \nfrom tensorflow.keras.metrics import BinaryAccuracy\nfrom tensorflow.keras.losses import BinaryCrossentropy\nfrom official.nlp.optimization import create_optimizer # AdamW optimizer\n# Warning\nimport warnings\nwarnings.filterwarnings('ignore')", - "class": "Imports and Environment", - "desc": "This code snippet imports various libraries and modules for data manipulation (pandas, numpy), machine learning (TensorFlow, sklearn), natural language processing (spacy, tensorflow-text), visualization (seaborn, matplotlib, dataprep), and sets some configurations like numpy print options and ignoring warnings.", + "cell_id": 64, + "code": "# predict\ntest = pd.read_csv(\"/kaggle/input/nlp-getting-started/test.csv\")\n\ntest_input = bert_encode(test.text.values, bert_layer, max_len=128)\nmodel.load_weights('model.h5')\ntest_pred = model.predict(test_input)", + "class": "Model Evaluation", + "desc": "This code snippet reads the test data from a CSV file, encodes the text column using BERT, loads the best model weights saved during training, and makes predictions on the encoded test data.", "testing": { - "class": "Imports_and_Environment", - "subclass": "set_options", - "subclass_id": 23, - "predicted_subclass_probability": 0.98988277 + "class": "Data_Extraction", + "subclass": "load_from_csv", + "subclass_id": 45, + "predicted_subclass_probability": 0.93680686 }, "cluster": 0 }, { - "cell_id": 2, - "code": "tf.__version__", - "class": "Imports and Environment", - "desc": "This code snippet prints the version of the TensorFlow library currently installed in the environment to verify its installation and version.", + "cell_id": 15, + "code": "lr_tfidf = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', n_jobs = -1)\nlr_tfidf.fit(X_train_tfidf, y_train)\ny_predicted_lr = lr_tfidf.predict(X_test_tfidf)", + "class": "Model Training", + "desc": "This code snippet 
trains a Logistic Regression model with class balancing on the TF-IDF transformed training data and then makes predictions on the TF-IDF transformed test data using Scikit-learn's `LogisticRegression`.", "testing": { - "class": "Imports_and_Environment", - "subclass": "import_modules", - "subclass_id": 22, - "predicted_subclass_probability": 0.9983907 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.6878745 }, "cluster": -1 }, { - "cell_id": 3, - "code": "# Random seeds\nimport random\nimport numpy as np\nimport tensorflow as tf\nrandom.seed(319)\nnp.random.seed(319)\ntf.random.set_seed(319)", - "class": "Imports and Environment", - "desc": "This code snippet sets random seeds for the `random`, `numpy`, and `tensorflow` libraries to ensure reproducibility of results by fixing the randomness.", + "cell_id": 21, + "code": "pipeline = Pipeline([\n ('clf', DecisionTreeClassifier(splitter='random', class_weight='balanced'))\n])\nparameters = {\n 'clf__max_depth':(150,160,165),\n 'clf__min_samples_split':(18,20,23),\n 'clf__min_samples_leaf':(5,6,7)\n}\n\ndf_tfidf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=-1, scoring='f1')\ndf_tfidf.fit(X_train_tfidf, y_train)\n\nprint(df_tfidf.best_estimator_.get_params())", + "class": "Model Training", + "desc": "This code snippet defines a machine learning pipeline with a Decision Tree classifier, specifies hyperparameters for grid search, performs grid search cross-validation using `GridSearchCV` from Scikit-learn to find the best hyperparameters, and then prints the best parameters.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.9086065 + "class": "Model_Train", + "subclass": "train_on_grid", + "subclass_id": 6, + "predicted_subclass_probability": 0.9916215 }, "cluster": 0 }, { - "cell_id": 14, - "code": "# Load Spacy Library\nnlp_spacy = spacy.load('en_core_web_sm')\n# Load the sentence encoder\nsentence_enc = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')", - "class": "Imports and Environment", - "desc": "This code snippet loads the pre-trained 'en_core_web_sm' model from spaCy for natural language processing tasks and the Universal Sentence Encoder from TensorFlow Hub for sentence embeddings.", + "cell_id": 30, + "code": "'''\nstarttime = time.time()\nword2vec_model = Word2Vec(data_list, size=300, iter=10, min_count=10)\nusedtime = time.time() - starttime\nprint('It took %.2fseconds to train word2vec' %usedtime)\n'''", + "class": "Model Training", + "desc": "This code snippet (currently commented out) measures the time taken to train a Word2Vec model on the processed text data with specific parameters using the Gensim library.", "testing": { - "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.99203765 + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.9858438 }, "cluster": 0 }, { - "cell_id": 26, - "code": "# Bidirectional Encoder Representations from Transformers (BERT).\nbert_encoder_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4\"\n# Text preprocessing for BERT.\nbert_preprocessor_path = \"https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3\"\n# Token based text embedding trained on English Google News 200B corpus.\nkeyword_embedding_path = 
\"https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2\"", - "class": "Imports and Environment", - "desc": "This code snippet specifies the paths to three TensorFlow Hub models: a BERT encoder, a BERT preprocessor, and a token-based text embedding model trained on the Google News corpus, which are to be used for natural language processing tasks.", + "cell_id": 38, + "code": "word2vec_lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', n_jobs = -1)\nword2vec_lr.fit(X_train_word2vec, y_train_word2vec)\ny_predicted_word2vec_lr = word2vec_lr.predict(X_test_word2vec)", + "class": "Model Training", + "desc": "This code snippet trains a Logistic Regression model with class balancing on the Word2Vec transformed training data and then makes predictions on the Word2Vec transformed test data using Scikit-learn's `LogisticRegression`.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "define_variables", - "subclass_id": 77, - "predicted_subclass_probability": 0.9857056 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.57594216 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 27, - "code": "bert_encoder = hub.KerasLayer(bert_encoder_path, trainable=True, name=\"BERT_Encoder\")\nbert_preprocessor = hub.KerasLayer(bert_preprocessor_path, name=\"BERT_Preprocessor\")\nnnlm_embed = hub.KerasLayer(keyword_embedding_path, name=\"NNLM_Embedding\")", - "class": "Imports and Environment", - "desc": "This code snippet loads the BERT encoder, BERT preprocessor, and NNLM embedding models from TensorFlow Hub as Keras layers, making them ready for integration into a neural network model.", + "cell_id": 46, + "code": "CNNmodel = Sequential()\nCNNmodel.add(Embedding(len(word_index)+1, embedding_dim, input_length = max_sequence_length))\nCNNmodel.add(Conv1D(filters=250, kernel_size=3, strides=1, padding='valid', activation = 'relu'))\nCNNmodel.add(MaxPooling1D(pool_size=3))\nCNNmodel.add(Flatten())\nCNNmodel.add(Dense(embedding_dim, activation='relu'))\nCNNmodel.add(Dropout(0.8))\nCNNmodel.add(Dense(cnn_label.shape[1], activation='sigmoid'))\n\nCNNmodel.summary()", + "class": "Model Training", + "desc": "This code snippet defines a Convolutional Neural Network model using Keras' `Sequential`, `Embedding`, `Conv1D`, `MaxPooling1D`, `Flatten`, `Dense`, and `Dropout` layers, and then prints the model's summary.", "testing": { "class": "Model_Train", - "subclass": "load_pretrained", - "subclass_id": 30, - "predicted_subclass_probability": 0.506471 + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.9960479 }, "cluster": -1 }, { - "cell_id": 28, - "code": "kernel_initializer = tf.keras.initializers.GlorotNormal(seed=319)\n# Model function\ndef create_model():\n # Keyword Branch\n text_input = Input(shape=(), dtype=tf.string, name=\"text\")\n encoder_inputs = bert_preprocessor(text_input)\n encoder_outputs = bert_encoder(encoder_inputs)\n # Pooled output\n pooled_output = encoder_outputs[\"pooled_output\"]\n bert_branch = Dropout(0.1,\n seed=319,\n name=\"BERT_Dropout\")(pooled_output)\n # Construct keyword layers\n keyword_input = Input(shape=(), dtype=tf.string, name='keyword')\n keyword_embed = nnlm_embed(keyword_input)\n keyword_flat = Flatten(name=\"Keyword_Flatten\")(keyword_embed)\n keyword_dense1 = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4),\n name=\"Keyword_Dense1\"\n )(keyword_flat)\n 
keyword_branch1 = Dropout(0.5,\n seed=319,\n name='Keyword_dropout1'\n )(keyword_dense1)\n keyword_dense2 = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4),\n name=\"Keyword_Dense2\"\n )(keyword_branch1)\n keyword_branch2 = Dropout(0.5,\n seed=319,\n name='Keyword_dropout2'\n )(keyword_dense2)\n keyword_dense3 = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4),\n name=\"Keyword_Dense3\"\n )(keyword_branch2)\n keyword_branch3 = Dropout(0.5,\n seed=319,\n name='Keyword_dropout3'\n )(keyword_dense3)\n \n # Merge the layers and classify\n merge = concatenate([bert_branch, keyword_branch3], name=\"Concatenate\")\n dense = Dense(128, \n activation='relu',\n kernel_initializer=kernel_initializer,\n kernel_regularizer=regularizers.l2(1e-4), \n name=\"Merged_Dense\")(merge)\n dropout = Dropout(0.5,\n seed=319,\n name=\"Merged_Dropout\"\n )(dense)\n clf = Dense(1,\n activation=\"sigmoid\", \n kernel_initializer=kernel_initializer,\n name=\"Classifier\"\n )(dropout)\n return Model([text_input, keyword_input], \n clf, \n name=\"BERT_Classifier\")", + "cell_id": 47, + "code": "CNNmodel.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])\nhistory = CNNmodel.fit(X_cnn, y_cnn, epochs=3, validation_data=(X_val_cnn, y_val_cnn))", "class": "Model Training", - "desc": "This code snippet defines a function `create_model` that constructs a neural network model combining a BERT branch for encoding text and an NNLM embedding branch for keywords, then merges them, applies multiple dense layers with dropout for regularization, and finally classifies the output using a sigmoid activation.", + "desc": "This code snippet compiles the CNN model with the Adam optimizer and binary crossentropy loss, and then trains the model for 3 epochs while validating its performance using the validation data.", "testing": { - "class": "Data_Transform", - "subclass": "categorify", - "subclass_id": 20, - "predicted_subclass_probability": 0.99885345 + "class": "Model_Train", + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.925682 }, "cluster": 0 }, { - "cell_id": 29, - "code": "bert_classifier = create_model()\nbert_classifier.summary()", + "cell_id": 52, + "code": "model = Sequential()\nmodel.add(embedding_layer)\nmodel.add(Conv1D(filters=150, kernel_size=3, strides=1, padding='valid', activation = 'relu'))\nmodel.add(MaxPooling1D(pool_size=3))\nmodel.add(Flatten())\nmodel.add(Dense(embedding_dim, activation='relu'))\nmodel.add(Dropout(0.8))\nmodel.add(Dense(cnn_label.shape[1], activation='sigmoid'))\n\nmodel.summary()", "class": "Model Training", - "desc": "This code snippet creates an instance of the BERT-based classifier model by calling the `create_model` function and then prints a summary of the model architecture to inspect its layers and parameters.", + "desc": "This code snippet defines a Convolutional Neural Network model using the pre-trained embedding layer, `Conv1D`, `MaxPooling1D`, `Flatten`, `Dense`, and `Dropout` layers from Keras, and then prints the model's summary.", "testing": { - "class": "Visualization", - "subclass": "model_coefficients", - "subclass_id": 79, - "predicted_subclass_probability": 0.99306774 + "class": "Model_Train", + "subclass": "choose_model_class", + "subclass_id": 4, + "predicted_subclass_probability": 0.99737453 }, - "cluster": 1 + "cluster": -1 }, { - "cell_id": 31, - "code": "EPOCHS = 
3\nLEARNING_RATE = 5e-5\n\nSTEPS_PER_EPOCH = int(train_ds.unbatch().cardinality().numpy() / BATCH_SIZE)\nVAL_STEPS = int(val_ds.unbatch().cardinality().numpy() / BATCH_SIZE)\n# Calculate the train and warmup steps for the optimizer\nTRAIN_STEPS = STEPS_PER_EPOCH * EPOCHS\nWARMUP_STEPS = int(TRAIN_STEPS * 0.1)\n\nadamw_optimizer = create_optimizer(\n init_lr=LEARNING_RATE,\n num_train_steps=TRAIN_STEPS,\n num_warmup_steps=WARMUP_STEPS,\n optimizer_type='adamw'\n)", + "cell_id": 53, + "code": "model.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=['accuracy'])\nhistory = model.fit(X_cnn, y_cnn, epochs=10, validation_data=(X_val_cnn, y_val_cnn))", "class": "Model Training", - "desc": "This code snippet sets up the training parameters, calculates the number of steps per epoch and validation steps, and initializes the AdamW optimizer with learning rate and warmup steps for training the model over three epochs.", + "desc": "This code snippet compiles the CNN model with the Adam optimizer and binary crossentropy loss, and then trains the model for 10 epochs while validating its performance using the validation data.", "testing": { "class": "Model_Train", - "subclass": "init_hyperparams", - "subclass_id": 59, - "predicted_subclass_probability": 0.6065405 + "subclass": "train_model", + "subclass_id": 7, + "predicted_subclass_probability": 0.9875873 }, "cluster": 0 }, { - "cell_id": 32, - "code": "STEPS_PER_EPOCH, VAL_STEPS, TRAIN_STEPS, WARMUP_STEPS", + "cell_id": 60, + "code": "def bert_encode(texts, bert_layer, max_len=128):\n vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)\n \n all_tokens = []\n all_masks = []\n all_segments = []\n \n for text in texts:\n text = tokenizer.tokenize(text)\n text = text[:max_len - 2]\n input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n pad_len = max_len - len(input_sequence)\n \n tokens = tokenizer.convert_tokens_to_ids(input_sequence)\n input_ids = tokens + [0]* pad_len\n all_tokens.append(input_ids)\n\n masks = [1]*len(input_sequence) + [0]* pad_len\n all_masks.append(masks)\n \n segments = [0]* max_len\n all_segments.append(segments)\n \n return np.array(all_tokens), np.array(all_masks), np.array(all_segments)\n\n \ndef build_model(bert_layer, max_len = 128, lr = 1e-5):\n input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name=\"input_word_ids\")\n input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name=\"input_mask\")\n segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32,name=\"segment_ids\")\n \n pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n dense_out = Dense(1,activation=\"relu\")(pooled_output)\n drop_out = tf.keras.layers.Dropout(0.8)(dense_out)\n out = Dense(1,activation=\"sigmoid\")(pooled_output)\n \n model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n adam = tf.keras.optimizers.Adam(lr)\n model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])\n \n return model\n\n\ndef plot_curve(history):\n plt.plot(history.history['accuracy'], label='accuracy')\n plt.plot(history.history['val_accuracy'], label='val_accuracy')\n plt.xlabel('Epoch')\n plt.ylabel('Accuracy')\n plt.ylim([0.5,1])\n plt.legend()\n plt.show()", "class": "Model Training", - "desc": "This code snippet prints the calculated values for steps per epoch, validation steps, 
total training steps, and warmup steps, which are essential for configuring the optimizer and training process.", + "desc": "This code snippet defines three functions: `bert_encode` to tokenize and encode texts using a BERT layer, `build_model` to construct a neural network with BERT embeddings and train it using Adam optimizer, and `plot_curve` to plot the accuracy of the model over epochs using Matplotlib.", "testing": { - "class": "Exploratory_Data_Analysis", - "subclass": "show_table", - "subclass_id": 41, - "predicted_subclass_probability": 0.94944763 + "class": "Data_Transform", + "subclass": "categorify", + "subclass_id": 20, + "predicted_subclass_probability": 0.9129451 }, "cluster": -1 }, { - "cell_id": 33, - "code": "bert_classifier.compile(loss=BinaryCrossentropy(from_logits=True),\n optimizer=adamw_optimizer, \n metrics=[BinaryAccuracy(name=\"accuracy\")]\n )\nhistory = bert_classifier.fit(train_ds, \n epochs=EPOCHS,\n steps_per_epoch=STEPS_PER_EPOCH,\n validation_data=val_ds,\n validation_steps=VAL_STEPS\n )", + "cell_id": 63, + "code": "# train model\nmodel = build_model(bert_layer, max_len=128, lr = 1e-5)\nmodel.summary()\n\ncheckpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)\n\ntrain_history = model.fit(\n train_input, train_labels,\n validation_split=0.2,\n epochs=3,\n callbacks=[checkpoint],\n batch_size=16\n)\n\nplot_curve(train_history)", "class": "Model Training", - "desc": "This code snippet compiles the BERT-based classifier model with binary cross-entropy loss, the AdamW optimizer, and binary accuracy as the evaluation metric, and then fits the model using the training and validation datasets over the specified number of epochs.", + "desc": "This code snippet builds and summarizes a BERT-based model, sets up a callback to save the best model based on validation loss, trains the model on the encoded training data for 3 epochs with a validation split of 20%, and plots the training and validation accuracy over epochs.", "testing": { - "class": "Model_Train", - "subclass": "train_model", - "subclass_id": 7, - "predicted_subclass_probability": 0.97614974 + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.61692107 }, "cluster": 0 }, { - "cell_id": 18, - "code": "keyword_non_disaster = df_train.keyword[df_train.target==0].value_counts().reset_index()\nsns.barplot(data=keyword_non_disaster[:10], x='keyword', y='index')\nplt.title('Non-Disaster Keyword Frequency (0)')\nplt.xlabel('Frequency')\nplt.ylabel('Top 10 Keywords')\nplt.show()", + "cell_id": 6, + "code": "piedata = train_data['target']\nplt.figure(figsize=(6,6))\npiedata.value_counts().plot(kind = 'pie',autopct = '%.2f%%')", "class": "Visualization", - "desc": "This code snippet creates a bar plot of the top 10 most frequent keywords associated with non-disaster tweets (label 0) in the `df_train` dataset using seaborn and matplotlib for visualization.", + "desc": "This code snippet creates a pie chart to visualize the distribution of the 'target' column in the `train_data` dataframe, with percentage labels, using Matplotlib.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.88048565 + "predicted_subclass_probability": 0.9974884 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 19, - "code": "keyword_disaster = df_train.keyword[df_train.target==1].value_counts().reset_index()\nsns.barplot(data=keyword_non_disaster[:10], x='keyword', 
y='index')\nplt.title('Non-Disaster Keyword Frequency (0)')\nplt.xlabel('Frequency')\nplt.ylabel('Top 10 Keywords')\nplt.show()", + "cell_id": 7, + "code": "num_words_0 = train_data[train_data['target']==0]['text'].apply(lambda x: len(x.split()))\nnum_words_1 = train_data[train_data['target']==1]['text'].apply(lambda x: len(x.split()))\nplt.figure(figsize=(12,6))\nsns.kdeplot(num_words_0, shade=True, color = 'b').set_title('Kernel distribution of number of words')\nsns.kdeplot(num_words_1, shade=True, color = 'r')\nplt.legend(labels=['0_no disaster', '1_disaster'])", "class": "Visualization", - "desc": "This code snippet creates a bar plot of the top 10 most frequent keywords associated with non-disaster tweets (label 0) in the `df_train` dataset using seaborn and matplotlib for visualization. ", + "desc": "This code snippet calculates the number of words in the 'text' column for each class in the 'target' column and then plots Kernel Density Estimates (KDE) for these distributions using Seaborn and Matplotlib.", "testing": { "class": "Visualization", "subclass": "distribution", "subclass_id": 33, - "predicted_subclass_probability": 0.8303329 + "predicted_subclass_probability": 0.97927135 }, - "cluster": 0 + "cluster": -1 }, { - "cell_id": 30, - "code": "keras.utils.plot_model(bert_classifier, \n show_shapes=False)", + "cell_id": 8, + "code": "len_word_0 = train_data[train_data['target']==0]['text'].str.split().map(lambda x: [len(i) for i in x])\nave_len_0 = len_word_0.map(lambda x: np.mean(x))\nlen_word_1 = train_data[train_data['target']==1]['text'].str.split().map(lambda x: [len(i) for i in x])\nave_len_1 = len_word_1.map(lambda x: np.mean(x))\nplt.figure(figsize=(12,6))\nsns.kdeplot(ave_len_0, shade=True, color='b').set_title('Kernel distribution of average words lenth')\nsns.kdeplot(ave_len_1, shade=True, color='r')\nplt.legend(labels=['0_no disaster', '1_disaster'])", + "class": "Visualization", + "desc": "This code snippet calculates the average word length for each class in the 'target' column and plots Kernel Density Estimates (KDE) for these distributions using Seaborn and Matplotlib.", + "testing": { + "class": "Visualization", + "subclass": "distribution", + "subclass_id": 33, + "predicted_subclass_probability": 0.98665464 + }, + "cluster": -1 + }, { + "cell_id": 20, + "code": "# fail to sort and plot the top 10 most important features in disaster and non-disaster text\n'''\nindex_to_word = [(v,k) for k,v in tfidf_vectorizer.vocabulary_.items()]\nsorted(index_to_word, key=lambda x: x[0], reverse=True)\n'''", + "class": "Visualization", + "desc": "This code snippet (currently commented out) attempts to sort and plot the top 10 most important features by extracting terms and their corresponding scores from the TF-IDF vectorizer's vocabulary.", + "testing": { + "class": "Exploratory_Data_Analysis", + "subclass": "commented", + "subclass_id": 76, + "predicted_subclass_probability": 0.8911329 + }, + "cluster": -1 + }, { + "cell_id": 48, + "code": "plt.plot(history.history['accuracy'], label='accuracy')\nplt.plot(history.history['val_accuracy'], label='val_accuracy')\nplt.xlabel('Epoch')\nplt.ylabel('Accuracy')\nplt.ylim([0.5,1])\nplt.legend()\nplt.show()", "class": "Visualization", - "desc": "This code snippet generates a plot of the architecture of the BERT-based classifier model using Keras' `plot_model` function to visualize its structure.", + "desc": "This code snippet plots the training and validation accuracy of the CNN model over epochs using Matplotlib to visualize the 
model's performance during training.", "testing": { "class": "Visualization", "subclass": "learning_history", "subclass_id": 35, - "predicted_subclass_probability": 0.9785347 + "predicted_subclass_probability": 0.9901933 }, - "cluster": 0 + "cluster": -1 + }, { + "cell_id": 54, + "code": "plt.plot(history.history['accuracy'], label='accuracy')\nplt.plot(history.history['val_accuracy'], label='val_accuracy')\nplt.xlabel('Epoch')\nplt.ylabel('Accuracy')\nplt.ylim([0.5,1])\nplt.legend()\nplt.show()", + "class": "Visualization", + "desc": "This code snippet plots the training and validation accuracy of the CNN model over epochs using Matplotlib to visualize the model's performance during training.", + "testing": { + "class": "Visualization", + "subclass": "learning_history", + "subclass_id": 35, + "predicted_subclass_probability": 0.9901933 + }, + "cluster": -1 }], - "notebook_id": 27, - "notebook_name": "using-keywords-embedding-to-improve-bert-model" + "notebook_id": 21, + "notebook_name": "nlp-getting-started.ipynb" }], "metadata": { "clusters": { "Data Transform": { - "0": "NLTK Text Preprocessing for Machine Learning", - "1": "Text Preprocessing for ML and NLP", - "2": "Data Preprocessing for Model Training", - "-1": "Punctuation Characters in `string` Module" + "titles": { + "0": "Text Preprocessing with NLP, PyTorch, TensorFlow", + "1": "Text Preprocessing with Pandas and NLTK", + "2": "Text Preprocessing and TF-IDF Encoding using Pandas, NLTK, and Scikit-learn", + "3": "Text Preprocessing with Pandas and NLTK", + "4": "Text Preprocessing with SpaCy, NLTK, TensorFlow", + "5": "Text Data Processing with PyTorch, TensorFlow, Scikit-learn", + "6": "Text Processing with SpaCy, regex, TextBlob", + "7": "Pandas DataFrame Manipulation and Analysis Methods", + "8": "Text Processing with Pandas and scikit-learn" + }, + "accuracy": { + "silhouette_score": 0.06279852515334622, + "ch_index": 21.950849543931543, + "db_index": 2.8032018648947274 + } }, "Data Extraction": { - "0": "Loading and Managing Word Embeddings", - "1": "Reading and Preparing Data for Analysis", - "2": "Loading Data from CSV Files", - "3": "Load Datasets from CSV to DataFrames", - "-1": "DataFrame Processing for Text Classification" + "titles": { + "0": "Reading CSV Files using Pandas", + "1": "Data Processing and Embedding Extraction with BERT", + "-1": "DataFrame Handling and Preparation in Pandas & Scikit-learn" + }, + "accuracy": { + "silhouette_score": 0.1670813390079446, + "ch_index": 8.93159409687521, + "db_index": 1.9368944754222406 + } }, "Visualization": { - "0": "Visualizing Data Distributions and Model Performance", - "1": "Visualizing Model Training and Validation Loss", - "2": "Visualize Model Training and Validation Performance", - "-1": "Visualizing Most Common Words Frequencies" + "titles": { + "-1": "Data Visualization Using Pandas, Seaborn, Matplotlib" + }, + "accuracy": { + "silhouette_score": 0, + "ch_index": 0, + "db_index": 0 + } }, "Model Training": { - "0": "Advanced Text Classification Model Development", - "1": "Initializing and Summarizing Neural Network Models", - "-1": "NLP Models Training and Evaluation Tools" + "titles": { + "0": "Machine Learning and NLP with Transformers", + "1": "Machine Learning Training Techniques: Fastai, TensorFlow, PyTorch", + "2": "TensorFlow, Keras, Sklearn Models for Classification", + "3": "Optuna, BERT, Tf, PyTorch, and LightGBM", + "-1": "Hyperparameter Tuning and Model Training Methods" + }, + "accuracy": { + "silhouette_score": 0.10146063623166789, + "ch_index": 
12.201029830435502, + "db_index": 2.5141778747956103 + } }, "Model Evaluation": { - "0": "Model Evaluation and Performance Analysis", - "1": "Commented-Out Model Evaluation Snippets" + "titles": { + "0": "Model Evaluation Using Fastai, Keras, Scikit-learn", + "1": "Model Evaluation with Sklearn and PyTorch", + "2": "Model Evaluation and Prediction Using SVC, Naive Bayes and Logistic Regression", + "3": "Model Evaluation with Scikit-learn and Matplotlib", + "-1": "Fastai Predictions and Feature Importance Analysis" + }, + "accuracy": { + "silhouette_score": 0.08900721844672672, + "ch_index": 14.883553354543007, + "db_index": 1.9602302292000342 + } }, "Imports and Environment": { - "0": "Importing Libraries for Data and NLP", - "1": "Installing Various Python Libraries for NLP", - "-1": "NLP, Data Manipulation, and Machine Learning" + "titles": { + "0": "NLP and ML Using TensorFlow, PyTorch, Transformers", + "1": "Python Libraries and Visualization Methods", + "-1": "BERT Tokenizer, Pandas Data Display Configuration" + }, + "accuracy": { + "silhouette_score": 0.16938431790553382, + "ch_index": 11.01896957823955, + "db_index": 1.7204516936504224 + } }, "Data Export": { - "0": "Predict and Save Model Submissions", - "1": "Export DataFrame to CSV File", - "-1": "Creating and Exporting Submission DataFrames" + "titles": { + "-1": "Pandas Model Predictions to CSV Submissions" + }, + "accuracy": { + "silhouette_score": 0, + "ch_index": 0, + "db_index": 0 + } }, "Exploratory Data Analysis": { - "0": "Dataset Observations, Analysis, and Insights", - "1": "Dataset Summary and Class Distribution Checks", - "2": "Display Initial Rows of DataFrames", - "3": "Display Initial and Final DataFrame Rows", - "4": "DataFrame Filtering and Display Samples", - "5": "DataFrame Text Filtering and Display", - "6": "Analyzing Tokenized Text Data Lengths", - "7": "Dataset Overview and Null Value Analysis", - "8": "Data Quality and Completeness Checks", - "9": "Counting Unique Values in DataFrame Columns", - "10": "DataFrame Dimensions Display and Verification", - "11": "DataFrame Summary and Initial Insights", - "12": "Displaying Initial Rows of DataFrames", - "-1": "Analyzing and Visualizing Dataset Characteristics" + "titles": { + "0": "Pandas DataFrame Display and Summary Methods", + "1": "Pandas DataFrame Shape and Column Analysis", + "2": "Pandas Data Analysis with TF-IDF", + "3": "Pandas, BERT, Spacy: Data Exploration & Preparation", + "4": "Data Analysis with Pandas, Seaborn, Matplotlib, Gensim", + "5": "Data Processing Verification with Pandas", + "6": "Data Cleaning and Verification with Pandas and DataPrep", + "-1": "Data Analysis with DataFrames and `Counter`" + }, + "accuracy": { + "silhouette_score": 0.08713711589873052, + "ch_index": 12.531779174999915, + "db_index": 2.525932192414787 + } } }, - "clustering_accuracy": 0.3300970873786408 + "clustering_accuracy": 0.3616504854368932 } } \ No newline at end of file diff --git a/src/VizContent.tsx b/src/VizContent.tsx index 56be9db..33d51a1 100644 --- a/src/VizContent.tsx +++ b/src/VizContent.tsx @@ -52,7 +52,7 @@ class VizContent extends ReactWidget { jsonData.notebooks.forEach(notebook => { notebook.cells.forEach(cell => { - const classMetadata = data["metadata"]["clusters"][cell.class]; + const classMetadata = data["metadata"]["clusters"][cell.class]["titles"]; if (classMetadata && classMetadata[cell.cluster]) { cell.cluster = classMetadata[cell.cluster]; // Replace cluster ID with the title @@ -119,7 +119,7 @@ class VizContent extends ReactWidget { 
jsonData.notebooks.forEach(notebook => { notebook.cells.forEach(cell => { - const classMetadata = data["metadata"]["clusters"][cell.class]; + const classMetadata = data["metadata"]["clusters"][cell.class]["titles"]; if (classMetadata && classMetadata[cell.cluster]) { cell.cluster = classMetadata[cell.cluster]; // Replace cluster ID with the title @@ -134,7 +134,7 @@ class VizContent extends ReactWidget { jsonData.notebooks.forEach(notebook => { notebook.cells.forEach(cell => { - const classMetadata = data["metadata"]["clusters"][cell.class]; + const classMetadata = data["metadata"]["clusters"][cell.class]["titles"]; if (classMetadata && classMetadata[cell.cluster]) { cell.cluster = classMetadata[cell.cluster]; // Replace cluster ID with the title diff --git a/src/colorScheme.ts b/src/colorScheme.ts index e16170f..2c2725b 100644 --- a/src/colorScheme.ts +++ b/src/colorScheme.ts @@ -22,3 +22,4 @@ const colorScheme: ColorScheme = { }; export default colorScheme; + \ No newline at end of file
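
Note on the schema change carried by this patch: the regenerated final_file.viz now nests each class's cluster titles under a "titles" key (with new per-class "accuracy" metrics alongside), which is why VizContent.tsx indexes data["metadata"]["clusters"][cell.class]["titles"] instead of the class entry directly. The TypeScript sketch below only illustrates the shape the patched lookup expects; the interface and helper names (ClusterGroup, ClusterMetadata, resolveClusterTitle) are hypothetical and not part of this diff.

// Hypothetical types for the cluster metadata written to final_file.viz after this patch.
interface ClusterGroup {
  titles: Record<string, string>;   // cluster id -> human-readable cluster title
  accuracy?: {                      // new per-class clustering metrics
    silhouette_score: number;
    ch_index: number;
    db_index: number;
  };
}

type ClusterMetadata = Record<string, ClusterGroup>; // keyed by cell class, e.g. "Data Transform"

// Mirrors the patched lookup: clusters[cell.class]["titles"][cell.cluster]
function resolveClusterTitle(
  clusters: ClusterMetadata,
  cellClass: string,
  clusterId: string | number
): string | number {
  const titles = clusters[cellClass]?.titles;
  const title = titles ? titles[String(clusterId)] : undefined;
  // Fall back to the raw cluster id when no title exists (e.g. cluster -1).
  return title !== undefined ? title : clusterId;
}

// Example (shape only): resolveClusterTitle(metadata.clusters, "Data Export", -1)
// returns "Pandas Model Predictions to CSV Submissions" for the data in this patch.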