diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 10e3ed15b4..0000000000
--- a/.gitignore
+++ /dev/null
@@ -1,180 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-*.pretrain*
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-license.txt
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-junit
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Locust files:
-locustfile.py
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env*
-.venv*
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-
-# Tensorflow
-*model_checkpoints
-**/outputs
-
-# Azure ML
-config.json
-aml_config/
-aml_scripts/
-aml_data/
-
-# Spark
-spark-warehouse/
-
-##########################
-.DS_Store
-.~*
-Untitled*.ipynb
-*-Copy*.ipynb
-~$*
-output.ipynb
-conda*.yaml
-reco_*.yaml
-.idea/
-*.npz
-*.data
-*.dat
-*.csv
-*.zip
-*.7z
-.vscode/
-u.item
-ml-100k/
-ml-10M100K/
-ml-1m/
-ml-20m/
-*.jar
-*.item
-*.pkl
-*.pdf
-.pretrain
-*.npy
-*.ckpt*
-*.png
-*.jpg
-*.jpeg
-*.gif
-*.model
-*.mml
-nohup.out
-*.svg
-*.html
-*.js
-*.css
-*.tff
-*.woff
-*.woff2
-*.eot
-
-##### kdd 2020 tutorial data folder
-examples/07_tutorials/KDD2020-tutorial/data_folder/
-
-*.vec
-*.tsv
-*.sh
-
-tests/**/resources/
-reports/
-
-### pip folders
-pip-wheel*
diff --git a/AUTHORS.md b/AUTHORS.md
deleted file mode 100644
index 54664fe0c2..0000000000
--- a/AUTHORS.md
+++ /dev/null
@@ -1,138 +0,0 @@
-
-
-Contributors to Recommenders
-============================
-Recommenders is developed and maintained by a community of people interested in exploring recommendation algorithms and how best to deploy them in industry settings. The goal is to accelerate the workflow of any individual or organization working on recommender systems. Everyone is encouraged to contribute at any level to add and improve the implemented algorithms, notebooks and utilities.
-
-
-Maintainers (sorted alphabetically)
----------------------------------------
-Maintainers are actively supporting the project and have made substantial contributions to the repository.
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import shutil
+import pandas as pd
+import gzip
+import random
+import logging
+import _pickle as cPickle
+
+from recommenders.utils.constants import SEED
+from recommenders.datasets.download_utils import maybe_download
+
+
+random.seed(SEED)
+logger = logging.getLogger()
+
+
+def get_review_data(reviews_file):
+    """Downloads the Amazon review data (only), prepares it in the required format,
+    and stores it in the same location.
+
+    Args:
+        reviews_file (str): Filename for the downloaded reviews dataset.
+
+    Returns:
+        str: File path of the preprocessed reviews file.
+    """
+ reviews_name = reviews_file.split("/")[-1] # *.json (for url)
+ download_and_extract(reviews_name, reviews_file)
+ reviews_output = _reviews_preprocessing(reviews_file)
+ return reviews_output
+
+
+def data_preprocessing(
+ reviews_file,
+ meta_file,
+ train_file,
+ valid_file,
+ test_file,
+ user_vocab,
+ item_vocab,
+ cate_vocab,
+ sample_rate=0.01,
+ valid_num_ngs=4,
+ test_num_ngs=9,
+ is_history_expanding=True,
+):
+ """Create data for training, validation and testing from original dataset
+
+ Args:
+ reviews_file (str): Reviews dataset downloaded from former operations.
+ meta_file (str): Meta dataset downloaded from former operations.
+ """
+ reviews_output = _reviews_preprocessing(reviews_file)
+ meta_output = _meta_preprocessing(meta_file)
+ instance_output = _create_instance(reviews_output, meta_output)
+ _create_item2cate(instance_output)
+ sampled_instance_file = _get_sampled_data(instance_output, sample_rate=sample_rate)
+ preprocessed_output = _data_processing(sampled_instance_file)
+ if is_history_expanding:
+ _data_generating(preprocessed_output, train_file, valid_file, test_file)
+ else:
+ _data_generating_no_history_expanding(
+ preprocessed_output, train_file, valid_file, test_file
+ )
+ _create_vocab(train_file, user_vocab, item_vocab, cate_vocab)
+ _negative_sampling_offline(
+ sampled_instance_file, valid_file, test_file, valid_num_ngs, test_num_ngs
+ )
+
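+# Hedged usage sketch (not part of the original module): one way the helpers in
+# this module might be chained end to end. The Amazon category and output file
+# names below are illustrative placeholders, not values taken from the library.
+def _example_amazon_preprocessing(data_dir="./amazon_data"):
+    """Illustrative only; downloads and preprocesses data when called."""
+    reviews_file = os.path.join(data_dir, "reviews_Movies_and_TV_5.json")
+    meta_file = os.path.join(data_dir, "meta_Movies_and_TV.json")
+    # download_and_extract() (defined below) skips files that already exist
+    download_and_extract("reviews_Movies_and_TV_5.json", reviews_file)
+    download_and_extract("meta_Movies_and_TV.json", meta_file)
+    data_preprocessing(
+        reviews_file,
+        meta_file,
+        train_file=os.path.join(data_dir, "train_data"),
+        valid_file=os.path.join(data_dir, "valid_data"),
+        test_file=os.path.join(data_dir, "test_data"),
+        user_vocab=os.path.join(data_dir, "user_vocab.pkl"),
+        item_vocab=os.path.join(data_dir, "item_vocab.pkl"),
+        cate_vocab=os.path.join(data_dir, "category_vocab.pkl"),
+        sample_rate=0.01,
+        valid_num_ngs=4,
+        test_num_ngs=9,
+    )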
+
+def _create_vocab(train_file, user_vocab, item_vocab, cate_vocab):
+
+ f_train = open(train_file, "r")
+
+ user_dict = {}
+ item_dict = {}
+ cat_dict = {}
+
+ logger.info("vocab generating...")
+ for line in f_train:
+ arr = line.strip("\n").split("\t")
+ uid = arr[1]
+ mid = arr[2]
+ cat = arr[3]
+ mid_list = arr[5]
+ cat_list = arr[6]
+
+ if uid not in user_dict:
+ user_dict[uid] = 0
+ user_dict[uid] += 1
+ if mid not in item_dict:
+ item_dict[mid] = 0
+ item_dict[mid] += 1
+ if cat not in cat_dict:
+ cat_dict[cat] = 0
+ cat_dict[cat] += 1
+ if len(mid_list) == 0:
+ continue
+ for m in mid_list.split(","):
+ if m not in item_dict:
+ item_dict[m] = 0
+ item_dict[m] += 1
+ for c in cat_list.split(","):
+ if c not in cat_dict:
+ cat_dict[c] = 0
+ cat_dict[c] += 1
+
+ sorted_user_dict = sorted(user_dict.items(), key=lambda x: x[1], reverse=True)
+ sorted_item_dict = sorted(item_dict.items(), key=lambda x: x[1], reverse=True)
+ sorted_cat_dict = sorted(cat_dict.items(), key=lambda x: x[1], reverse=True)
+
+ uid_voc = {}
+ index = 0
+ for key, value in sorted_user_dict:
+ uid_voc[key] = index
+ index += 1
+
+ mid_voc = {}
+ mid_voc["default_mid"] = 0
+ index = 1
+ for key, value in sorted_item_dict:
+ mid_voc[key] = index
+ index += 1
+
+ cat_voc = {}
+ cat_voc["default_cat"] = 0
+ index = 1
+ for key, value in sorted_cat_dict:
+ cat_voc[key] = index
+ index += 1
+
+ cPickle.dump(uid_voc, open(user_vocab, "wb"))
+ cPickle.dump(mid_voc, open(item_vocab, "wb"))
+ cPickle.dump(cat_voc, open(cate_vocab, "wb"))
+
+
+def _negative_sampling_offline(
+ instance_input_file, valid_file, test_file, valid_neg_nums=4, test_neg_nums=49
+):
+
+ columns = ["label", "user_id", "item_id", "timestamp", "cate_id"]
+ ns_df = pd.read_csv(instance_input_file, sep="\t", names=columns)
+ items_with_popular = list(ns_df["item_id"])
+
+ global item2cate
+
+ # valid negative sampling
+ logger.info("start valid negative sampling")
+ with open(valid_file, "r") as f:
+ valid_lines = f.readlines()
+ write_valid = open(valid_file, "w")
+ for line in valid_lines:
+ write_valid.write(line)
+ words = line.strip().split("\t")
+ positive_item = words[2]
+ count = 0
+ neg_items = set()
+ while count < valid_neg_nums:
+ neg_item = random.choice(items_with_popular)
+ if neg_item == positive_item or neg_item in neg_items:
+ continue
+ count += 1
+ neg_items.add(neg_item)
+ words[0] = "0"
+ words[2] = neg_item
+ words[3] = item2cate[neg_item]
+ write_valid.write("\t".join(words) + "\n")
+
+ # test negative sampling
+ logger.info("start test negative sampling")
+ with open(test_file, "r") as f:
+ test_lines = f.readlines()
+ write_test = open(test_file, "w")
+ for line in test_lines:
+ write_test.write(line)
+ words = line.strip().split("\t")
+ positive_item = words[2]
+ count = 0
+ neg_items = set()
+ while count < test_neg_nums:
+ neg_item = random.choice(items_with_popular)
+ if neg_item == positive_item or neg_item in neg_items:
+ continue
+ count += 1
+ neg_items.add(neg_item)
+ words[0] = "0"
+ words[2] = neg_item
+ words[3] = item2cate[neg_item]
+ write_test.write("\t".join(words) + "\n")
+
+
+def _data_generating(input_file, train_file, valid_file, test_file, min_sequence=1):
+ """produce train, valid and test file from processed_output file
+ Each user's behavior sequence will be unfolded and produce multiple lines in trian file.
+ Like, user's behavior sequence: 12345, and this function will write into train file:
+ 1, 12, 123, 1234, 12345
+ """
+ f_input = open(input_file, "r")
+ f_train = open(train_file, "w")
+ f_valid = open(valid_file, "w")
+ f_test = open(test_file, "w")
+ logger.info("data generating...")
+ last_user_id = None
+ for line in f_input:
+ line_split = line.strip().split("\t")
+ tfile = line_split[0]
+ label = int(line_split[1])
+ user_id = line_split[2]
+ movie_id = line_split[3]
+ date_time = line_split[4]
+ category = line_split[5]
+
+ if tfile == "train":
+ fo = f_train
+ elif tfile == "valid":
+ fo = f_valid
+ elif tfile == "test":
+ fo = f_test
+ if user_id != last_user_id:
+ movie_id_list = []
+ cate_list = []
+ dt_list = []
+ else:
+ history_clk_num = len(movie_id_list)
+ cat_str = ""
+ mid_str = ""
+ dt_str = ""
+ for c1 in cate_list:
+ cat_str += c1 + ","
+ for mid in movie_id_list:
+ mid_str += mid + ","
+ for dt_time in dt_list:
+ dt_str += dt_time + ","
+ if len(cat_str) > 0:
+ cat_str = cat_str[:-1]
+ if len(mid_str) > 0:
+ mid_str = mid_str[:-1]
+ if len(dt_str) > 0:
+ dt_str = dt_str[:-1]
+ if history_clk_num >= min_sequence:
+ fo.write(
+ line_split[1]
+ + "\t"
+ + user_id
+ + "\t"
+ + movie_id
+ + "\t"
+ + category
+ + "\t"
+ + date_time
+ + "\t"
+ + mid_str
+ + "\t"
+ + cat_str
+ + "\t"
+ + dt_str
+ + "\n"
+ )
+ last_user_id = user_id
+ if label:
+ movie_id_list.append(movie_id)
+ cate_list.append(category)
+ dt_list.append(date_time)
+
+
+def _data_generating_no_history_expanding(
+ input_file, train_file, valid_file, test_file, min_sequence=1
+):
+ """Produce train, valid and test file from processed_output file
+ Each user's behavior sequence will only produce one line in train file.
+ Like, user's behavior sequence: 12345, and this function will write into train file: 12345
+ """
+ f_input = open(input_file, "r")
+ f_train = open(train_file, "w")
+ f_valid = open(valid_file, "w")
+ f_test = open(test_file, "w")
+ logger.info("data generating...")
+
+ last_user_id = None
+ last_movie_id = None
+ last_category = None
+ last_datetime = None
+ last_tfile = None
+ for line in f_input:
+ line_split = line.strip().split("\t")
+ tfile = line_split[0]
+ label = int(line_split[1])
+ user_id = line_split[2]
+ movie_id = line_split[3]
+ date_time = line_split[4]
+ category = line_split[5]
+
+ if last_tfile == "train":
+ fo = f_train
+ elif last_tfile == "valid":
+ fo = f_valid
+ elif last_tfile == "test":
+ fo = f_test
+ if user_id != last_user_id or tfile == "valid" or tfile == "test":
+ if last_user_id is not None:
+ history_clk_num = len(
+ movie_id_list # noqa: F821 undefined name 'movie_id_list'
+ )
+ cat_str = ""
+ mid_str = ""
+ dt_str = ""
+ for c1 in cate_list[:-1]: # noqa: F821 undefined name 'cate_list'
+ cat_str += c1 + ","
+ for mid in movie_id_list[ # noqa: F821 undefined name 'movie_id_list'
+ :-1
+ ]:
+ mid_str += mid + ","
+ for dt_time in dt_list[:-1]: # noqa: F821 undefined name 'dt_list'
+ dt_str += dt_time + ","
+ if len(cat_str) > 0:
+ cat_str = cat_str[:-1]
+ if len(mid_str) > 0:
+ mid_str = mid_str[:-1]
+ if len(dt_str) > 0:
+ dt_str = dt_str[:-1]
+ if history_clk_num > min_sequence:
+ fo.write(
+ line_split[1]
+ + "\t"
+ + last_user_id
+ + "\t"
+ + last_movie_id
+ + "\t"
+ + last_category
+ + "\t"
+ + last_datetime
+ + "\t"
+ + mid_str
+ + "\t"
+ + cat_str
+ + "\t"
+ + dt_str
+ + "\n"
+ )
+ if tfile == "train" or last_user_id is None:
+ movie_id_list = []
+ cate_list = []
+ dt_list = []
+ last_user_id = user_id
+ last_movie_id = movie_id
+ last_category = category
+ last_datetime = date_time
+ last_tfile = tfile
+ if label:
+ movie_id_list.append(movie_id)
+ cate_list.append(category)
+ dt_list.append(date_time)
+
+
+def _create_item2cate(instance_file):
+ logger.info("creating item2cate dict")
+ global item2cate
+ instance_df = pd.read_csv(
+ instance_file,
+ sep="\t",
+ names=["label", "user_id", "item_id", "timestamp", "cate_id"],
+ )
+ item2cate = instance_df.set_index("item_id")["cate_id"].to_dict()
+
+
+def _get_sampled_data(instance_file, sample_rate):
+ logger.info("getting sampled data...")
+ global item2cate
+ output_file = instance_file + "_" + str(sample_rate)
+ columns = ["label", "user_id", "item_id", "timestamp", "cate_id"]
+ ns_df = pd.read_csv(instance_file, sep="\t", names=columns)
+ items_num = ns_df["item_id"].nunique()
+ items_with_popular = list(ns_df["item_id"])
+ items_sample, count = set(), 0
+ while count < int(items_num * sample_rate):
+ random_item = random.choice(items_with_popular)
+ if random_item not in items_sample:
+ items_sample.add(random_item)
+ count += 1
+ ns_df_sample = ns_df[ns_df["item_id"].isin(items_sample)]
+ ns_df_sample.to_csv(output_file, sep="\t", index=None, header=None)
+ return output_file
+
+
+def _meta_preprocessing(meta_readfile):
+ logger.info("start meta preprocessing...")
+ meta_writefile = meta_readfile + "_output"
+ meta_r = open(meta_readfile, "r")
+ meta_w = open(meta_writefile, "w")
+ for line in meta_r:
+ line_new = eval(line)
+ meta_w.write(line_new["asin"] + "\t" + line_new["categories"][0][-1] + "\n")
+ meta_r.close()
+ meta_w.close()
+ return meta_writefile
+
+
+def _reviews_preprocessing(reviews_readfile):
+ logger.info("start reviews preprocessing...")
+ reviews_writefile = reviews_readfile + "_output"
+ reviews_r = open(reviews_readfile, "r")
+ reviews_w = open(reviews_writefile, "w")
+ for line in reviews_r:
+ line_new = eval(line.strip())
+ reviews_w.write(
+ str(line_new["reviewerID"])
+ + "\t"
+ + str(line_new["asin"])
+ + "\t"
+ + str(line_new["unixReviewTime"])
+ + "\n"
+ )
+ reviews_r.close()
+ reviews_w.close()
+ return reviews_writefile
+
+
+def _create_instance(reviews_file, meta_file):
+ logger.info("start create instances...")
+ dirs, _ = os.path.split(reviews_file)
+ output_file = os.path.join(dirs, "instance_output")
+
+ f_reviews = open(reviews_file, "r")
+ user_dict = {}
+ item_list = []
+ for line in f_reviews:
+ line = line.strip()
+ reviews_things = line.split("\t")
+ if reviews_things[0] not in user_dict:
+ user_dict[reviews_things[0]] = []
+ user_dict[reviews_things[0]].append((line, float(reviews_things[-1])))
+ item_list.append(reviews_things[1])
+
+ f_meta = open(meta_file, "r")
+ meta_dict = {}
+ for line in f_meta:
+ line = line.strip()
+ meta_things = line.split("\t")
+ if meta_things[0] not in meta_dict:
+ meta_dict[meta_things[0]] = meta_things[1]
+
+ f_output = open(output_file, "w")
+ for user_behavior in user_dict:
+ sorted_user_behavior = sorted(user_dict[user_behavior], key=lambda x: x[1])
+ for line, _ in sorted_user_behavior:
+ user_things = line.split("\t")
+ asin = user_things[1]
+ if asin in meta_dict:
+ f_output.write("1" + "\t" + line + "\t" + meta_dict[asin] + "\n")
+ else:
+ f_output.write("1" + "\t" + line + "\t" + "default_cat" + "\n")
+
+ f_reviews.close()
+ f_meta.close()
+ f_output.close()
+ return output_file
+
+
+def _data_processing(input_file):
+ logger.info("start data processing...")
+ dirs, _ = os.path.split(input_file)
+ output_file = os.path.join(dirs, "preprocessed_output")
+
+ f_input = open(input_file, "r")
+ f_output = open(output_file, "w")
+ user_count = {}
+ for line in f_input:
+ line = line.strip()
+ user = line.split("\t")[1]
+ if user not in user_count:
+ user_count[user] = 0
+ user_count[user] += 1
+ f_input.seek(0)
+ i = 0
+ last_user = None
+ for line in f_input:
+ line = line.strip()
+ user = line.split("\t")[1]
+ if user == last_user:
+ if i < user_count[user] - 2:
+ f_output.write("train" + "\t" + line + "\n")
+ elif i < user_count[user] - 1:
+ f_output.write("valid" + "\t" + line + "\n")
+ else:
+ f_output.write("test" + "\t" + line + "\n")
+ else:
+ last_user = user
+ i = 0
+ if i < user_count[user] - 2:
+ f_output.write("train" + "\t" + line + "\n")
+ elif i < user_count[user] - 1:
+ f_output.write("valid" + "\t" + line + "\n")
+ else:
+ f_output.write("test" + "\t" + line + "\n")
+ i += 1
+ return output_file
+
+
+def download_and_extract(name, dest_path):
+ """Downloads and extracts Amazon reviews and meta datafiles if they don’t already exist
+
+ Args:
+ name (str): Category of reviews.
+ dest_path (str): File path for the downloaded file.
+
+ Returns:
+ str: File path for the extracted file.
+ """
+ dirs, _ = os.path.split(dest_path)
+ if not os.path.exists(dirs):
+ os.makedirs(dirs)
+
+ file_path = os.path.join(dirs, name)
+ if not os.path.exists(file_path):
+ _download_reviews(name, dest_path)
+ _extract_reviews(file_path, dest_path)
+
+ return file_path
+
+
+def _download_reviews(name, dest_path):
+ """Downloads Amazon reviews datafile.
+
+ Args:
+ name (str): Category of reviews
+ dest_path (str): File path for the downloaded file
+ """
+
+ url = (
+ "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
+ + name
+ + ".gz"
+ )
+
+ dirs, file = os.path.split(dest_path)
+ maybe_download(url, file + ".gz", work_directory=dirs)
+
+
+def _extract_reviews(file_path, zip_path):
+ """Extract Amazon reviews and meta datafiles from the raw zip files.
+
+ To extract all files,
+ use ZipFile's extractall(path) instead.
+
+ Args:
+ file_path (str): Destination path for datafile
+ zip_path (str): zipfile path
+ """
+ with gzip.open(zip_path + ".gz", "rb") as zf, open(file_path, "wb") as f:
+ shutil.copyfileobj(zf, f)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+import requests
+
+
+def load_pandas_df(
+ azure_storage_account_name="azureopendatastorage",
+ azure_storage_sas_token="",
+ container_name="covid19temp",
+ metadata_filename="metadata.csv",
+):
+ """Loads the Azure Open Research COVID-19 dataset as a pd.DataFrame.
+
+ The Azure COVID-19 Open Research Dataset may be found at https://azure.microsoft.com/en-us/services/open-datasets/catalog/covid-19-open-research/
+
+ Args:
+ azure_storage_account_name (str): Azure storage account name.
+ azure_storage_sas_token (str): Azure storage SAS token.
+ container_name (str): Azure storage container name.
+ metadata_filename (str): Name of file containing top-level metadata for the dataset.
+
+ Returns:
+ metadata (pandas.DataFrame): Metadata dataframe.
+ """
+
+ # Load into dataframe
+ uri = "https://{acct}.blob.core.windows.net/{container}/{filename}{sas}".format(
+ acct=azure_storage_account_name,
+ container=container_name,
+ filename=metadata_filename,
+ sas=azure_storage_sas_token,
+ )
+ return pd.read_csv(uri)
+
+
+def remove_duplicates(df, cols):
+ """Remove duplicated entries.
+
+ Args:
+ df (pd.DataFrame): Pandas dataframe.
+ cols (list of str): Name of columns in which to look for duplicates.
+
+ Returns:
+ df (pandas.DataFrame): Pandas dataframe with duplicate rows dropped.
+
+ """
+ for col in cols:
+ # Reset index
+ df = df.reset_index(drop=True)
+
+ # Find where the identifier variable is duplicated
+ dup_rows = np.where(df.duplicated([col]))[0]
+
+ # Drop duplicated rows
+ df = df.drop(dup_rows)
+
+ return df
+
+
+def remove_nan(df, cols):
+ """Remove rows with NaN values in specified column.
+
+ Args:
+ df (pandas.DataFrame): Pandas dataframe.
+ cols (list of str): Name of columns in which to look for NaN.
+
+ Returns:
+ df (pandas.DataFrame): Pandas dataframe with invalid rows dropped.
+
+ """
+ for col in cols:
+ # Convert any empty string cells to nan
+ df[col].replace("", np.nan, inplace=True)
+
+ # Remove NaN rows
+ df = df[df[col].notna()]
+
+ return df
+
+
+def clean_dataframe(df):
+ """Clean up the dataframe.
+
+ Args:
+ df (pandas.DataFrame): Pandas dataframe.
+
+ Returns:
+ df (pandas.DataFrame): Cleaned pandas dataframe.
+ """
+
+ # Remove duplicated rows
+ cols = ["cord_uid", "doi"]
+ df = remove_duplicates(df, cols)
+
+ # Remove rows without values in specified columns
+ cols = ["cord_uid", "doi", "title", "license", "url"]
+ df = remove_nan(df, cols)
+
+ return df
+
+
+def retrieve_text(
+ entry,
+ container_name,
+ azure_storage_account_name="azureopendatastorage",
+ azure_storage_sas_token="",
+):
+ """Retrieve body text from article of interest.
+
+ Args:
+ entry (pd.Series): A single row from the dataframe (df.iloc[n]).
+ container_name (str): Azure storage container name.
+ azure_storage_account_name (str): Azure storage account name.
+ azure_storage_sas_token (str): Azure storage SAS token.
+
+    Returns:
+ text (str): Full text of the blob as a single string.
+ """
+
+ try:
+ filename = entry["pdf_json_files"] or entry["pmc_json_files"]
+
+ # Extract text
+ uri = "https://{acct}.blob.core.windows.net/{container}/{filename}{sas}".format(
+ acct=azure_storage_account_name,
+ container=container_name,
+ filename=filename,
+ sas=azure_storage_sas_token,
+ )
+
+ data = requests.get(uri, headers={"Content-type": "application/json"}).json()
+ text = " ".join([paragraph["text"] for paragraph in data["body_text"]])
+
+ except Exception:
+ text = ""
+
+ return text
+
+
+def get_public_domain_text(
+ df,
+ container_name,
+ azure_storage_account_name="azureopendatastorage",
+ azure_storage_sas_token="",
+):
+ """Get all public domain text.
+
+ Args:
+ df (pandas.DataFrame): Metadata dataframe for public domain text.
+ container_name (str): Azure storage container name.
+ azure_storage_account_name (str): Azure storage account name.
+ azure_storage_sas_token (str): Azure storage SAS token.
+
+ Returns:
+ df_full (pandas.DataFrame): Dataframe with select metadata and full article text.
+ """
+ # reset index
+ df = df.reset_index(drop=True)
+
+ # Add in full_text
+ df["full_text"] = df.apply(
+ lambda row: retrieve_text(
+ row, container_name, azure_storage_account_name, azure_storage_sas_token
+ ),
+ axis=1,
+ )
+
+ # Remove rows with empty full_text
+ empty_rows = np.where(df["full_text"] == "")[0]
+ df = df.drop(empty_rows)
+
+ # Only keep columns of interest
+ df_full = df[
+ [
+ "cord_uid",
+ "doi",
+ "title",
+ "publish_time",
+ "authors",
+ "journal",
+ "url",
+ "abstract",
+ "full_text",
+ ]
+ ]
+ df_full = df_full.reset_index()
+
+ return df_full
+
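+# Hedged usage sketch (not part of the original module): the helpers above appear
+# to compose into a metadata -> clean -> full-text pipeline. The empty SAS token
+# mirrors the defaults; supply one if the storage container requires it.
+def _example_covid_pipeline(container_name="covid19temp"):
+    """Illustrative only; issues network requests when called."""
+    metadata = load_pandas_df(container_name=container_name)
+    metadata = clean_dataframe(metadata)
+    # Restrict to a handful of rows to keep the example cheap.
+    return get_public_domain_text(metadata.head(5), container_name)
+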
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+import pandas as pd
+import os
+import tarfile
+
+try:
+ from pyspark.sql.types import StructType, StructField, IntegerType, StringType
+except ImportError:
+ pass # so the environment without spark doesn't break
+
+from recommenders.datasets.download_utils import maybe_download, download_path
+from recommenders.utils.notebook_utils import is_databricks
+
+
+CRITEO_URL = {
+ "full": "https://ndownloader.figshare.com/files/10082655",
+ "sample": "http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz",
+}
+DEFAULT_HEADER = (
+ ["label"]
+ + ["int{0:02d}".format(i) for i in range(13)]
+ + ["cat{0:02d}".format(i) for i in range(26)]
+)
+
+
+[docs]def load_pandas_df(size="sample", local_cache_path=None, header=DEFAULT_HEADER):
+ """Loads the Criteo DAC dataset as `pandas.DataFrame`. This function download, untar, and load the dataset.
+
+ The dataset consists of a portion of Criteo’s traffic over a period
+ of 24 days. Each row corresponds to a display ad served by Criteo and the first
+ column indicates whether this ad has been clicked or not.
+
+ There are 13 features taking integer values (mostly count features) and 26
+ categorical features. The values of the categorical features have been hashed
+ onto 32 bits for anonymization purposes.
+
+ The schema is:
+
+ .. code-block:: python
+
+ <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
+
+ More details (need to accept user terms to see the information):
+ http://labs.criteo.com/2013/12/download-terabyte-click-logs/
+
+ Args:
+ size (str): Dataset size. It can be "sample" or "full".
+ local_cache_path (str): Path where to cache the tar.gz file locally
+ header (list): Dataset header names.
+
+ Returns:
+ pandas.DataFrame: Criteo DAC sample dataset.
+ """
+ with download_path(local_cache_path) as path:
+ filepath = download_criteo(size, path)
+ filepath = extract_criteo(size, filepath)
+ df = pd.read_csv(filepath, sep="\t", header=None, names=header)
+ return df
+
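+# Hedged usage sketch (not part of the original module): load the small "sample"
+# split into pandas and keep the download cached in a local directory. The cache
+# directory name is a hypothetical placeholder.
+def _example_load_criteo_sample(cache_dir="./criteo_cache"):
+    """Illustrative only; downloads the sample archive when called."""
+    df = load_pandas_df(size="sample", local_cache_path=cache_dir)
+    # DEFAULT_HEADER yields "label", "int00".."int12" and "cat00".."cat25"
+    return df[["label", "int00", "cat00"]].head()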
+
+def load_spark_df(
+ spark,
+ size="sample",
+ header=DEFAULT_HEADER,
+ local_cache_path=None,
+ dbfs_datapath="dbfs:/FileStore/dac",
+ dbutils=None,
+):
+ """Loads the Criteo DAC dataset as `pySpark.DataFrame`.
+
+ The dataset consists of a portion of Criteo’s traffic over a period
+ of 24 days. Each row corresponds to a display ad served by Criteo and the first
+    column indicates whether this ad has been clicked or not.
+
+ There are 13 features taking integer values (mostly count features) and 26
+ categorical features. The values of the categorical features have been hashed
+ onto 32 bits for anonymization purposes.
+
+ The schema is:
+
+ .. code-block:: python
+
+ <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
+
+ More details (need to accept user terms to see the information):
+ http://labs.criteo.com/2013/12/download-terabyte-click-logs/
+
+ Args:
+ spark (pySpark.SparkSession): Spark session.
+ size (str): Dataset size. It can be "sample" or "full".
+ local_cache_path (str): Path where to cache the tar.gz file locally.
+ header (list): Dataset header names.
+ dbfs_datapath (str): Where to store the extracted files on Databricks.
+ dbutils (Databricks.dbutils): Databricks utility object.
+
+ Returns:
+ pyspark.sql.DataFrame: Criteo DAC training dataset.
+ """
+ with download_path(local_cache_path) as path:
+ filepath = download_criteo(size, path)
+ filepath = extract_criteo(size, filepath)
+
+ if is_databricks():
+ try:
+ # Driver node's file path
+ node_path = "file:" + filepath
+ # needs to be on dbfs to load
+ dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
+ path = dbfs_datapath
+ except Exception:
+ raise ValueError(
+ "To use on a Databricks notebook, dbutils object should be passed as an argument"
+ )
+ else:
+ path = filepath
+
+ schema = get_spark_schema(header)
+ df = spark.read.csv(path, schema=schema, sep="\t", header=False)
+ df.cache().count() # trigger execution to overcome spark's lazy evaluation
+ return df
+
+
+[docs]def download_criteo(size="sample", work_directory="."):
+ """Download criteo dataset as a compressed file.
+
+ Args:
+ size (str): Size of criteo dataset. It can be "full" or "sample".
+ work_directory (str): Working directory.
+
+ Returns:
+ str: Path of the downloaded file.
+
+ """
+ url = CRITEO_URL[size]
+ return maybe_download(url, work_directory=work_directory)
+
+
+def extract_criteo(size, compressed_file, path=None):
+ """Extract Criteo dataset tar.
+
+ Args:
+ size (str): Size of Criteo dataset. It can be "full" or "sample".
+ compressed_file (str): Path to compressed file.
+ path (str): Path to extract the file.
+
+ Returns:
+ str: Path to the extracted file.
+
+ """
+ if path is None:
+ folder = os.path.dirname(compressed_file)
+ extracted_dir = os.path.join(folder, "dac")
+ else:
+ extracted_dir = path
+
+ with tarfile.open(compressed_file) as tar:
+
+ def is_within_directory(directory, target):
+
+ abs_directory = os.path.abspath(directory)
+ abs_target = os.path.abspath(target)
+
+ prefix = os.path.commonprefix([abs_directory, abs_target])
+
+ return prefix == abs_directory
+
+ def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+
+ for member in tar.getmembers():
+ member_path = os.path.join(path, member.name)
+ if not is_within_directory(path, member_path):
+ raise Exception("Attempted Path Traversal in Tar File")
+
+ tar.extractall(path, members, numeric_owner=numeric_owner)
+
+ safe_extract(tar, extracted_dir)
+
+ filename_selector = {"sample": "dac_sample.txt", "full": "train.txt"}
+ return os.path.join(extracted_dir, filename_selector[size])
+
+
+def get_spark_schema(header=DEFAULT_HEADER):
+ """Get Spark schema from header.
+
+ Args:
+ header (list): Dataset header names.
+
+ Returns:
+ pyspark.sql.types.StructType: Spark schema.
+ """
+ # create schema
+ schema = StructType()
+ # do label + ints
+ n_ints = 14
+ for i in range(n_ints):
+ schema.add(StructField(header[i], IntegerType()))
+ # do categoricals
+ for i in range(26):
+ schema.add(StructField(header[i + n_ints], StringType()))
+ return schema
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import logging
+import requests
+import math
+import zipfile
+from contextlib import contextmanager
+from tempfile import TemporaryDirectory
+from tqdm import tqdm
+from retrying import retry
+
+
+log = logging.getLogger(__name__)
+
+
+@retry(wait_random_min=1000, wait_random_max=5000, stop_max_attempt_number=5)
+def maybe_download(url, filename=None, work_directory=".", expected_bytes=None):
+ """Download a file if it is not already downloaded.
+
+ Args:
+        url (str): URL of the file to download.
+        filename (str): File name. If None, the file name is taken from the URL.
+        work_directory (str): Working directory.
+        expected_bytes (int): Expected file size in bytes.
+
+ Returns:
+ str: File path of the file downloaded.
+ """
+ if filename is None:
+ filename = url.split("/")[-1]
+ os.makedirs(work_directory, exist_ok=True)
+ filepath = os.path.join(work_directory, filename)
+ if not os.path.exists(filepath):
+ r = requests.get(url, stream=True)
+ if r.status_code == 200:
+ log.info(f"Downloading {url}")
+ total_size = int(r.headers.get("content-length", 0))
+ block_size = 1024
+ num_iterables = math.ceil(total_size / block_size)
+ with open(filepath, "wb") as file:
+ for data in tqdm(
+ r.iter_content(block_size),
+ total=num_iterables,
+ unit="KB",
+ unit_scale=True,
+ ):
+ file.write(data)
+ else:
+ log.error(f"Problem downloading {url}")
+ r.raise_for_status()
+ else:
+ log.info(f"File {filepath} already downloaded")
+ if expected_bytes is not None:
+ statinfo = os.stat(filepath)
+ if statinfo.st_size != expected_bytes:
+ os.remove(filepath)
+ raise IOError(f"Failed to verify {filepath}")
+
+ return filepath
+
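+# Hedged sketch (not part of the original module): maybe_download skips the HTTP
+# request when the target file already exists, so repeated calls are cheap. The
+# URL below is a hypothetical placeholder.
+def _example_maybe_download(url="http://example.com/file.zip", work_directory="."):
+    """Illustrative only; performs a download when called."""
+    first = maybe_download(url=url, work_directory=work_directory)
+    # Second call finds the file on disk and only logs "already downloaded".
+    second = maybe_download(url=url, work_directory=work_directory)
+    assert first == second
+    return first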
+
+@contextmanager
+def download_path(path=None):
+ """Return a path to download data. If `path=None`, then it yields a temporal path that is eventually deleted,
+ otherwise the real path of the input.
+
+ Args:
+ path (str): Path to download data.
+
+ Returns:
+ str: Real path where the data is stored.
+
+ Examples:
+ >>> with download_path() as path:
+ >>> ... maybe_download(url="http://example.com/file.zip", work_directory=path)
+
+ """
+ if path is None:
+ tmp_dir = TemporaryDirectory()
+ try:
+ yield tmp_dir.name
+ finally:
+ tmp_dir.cleanup()
+ else:
+ path = os.path.realpath(path)
+ yield path
+
+
+def unzip_file(zip_src, dst_dir, clean_zip_file=False):
+ """Unzip a file
+
+ Args:
+ zip_src (str): Zip file.
+ dst_dir (str): Destination folder.
+ clean_zip_file (bool): Whether or not to clean the zip file.
+ """
+ fz = zipfile.ZipFile(zip_src, "r")
+ for file in fz.namelist():
+ fz.extract(file, dst_dir)
+ if clean_zip_file:
+ os.remove(zip_src)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import random
+import logging
+import json
+import numpy as np
+import re
+from tqdm import tqdm
+from nltk.tokenize import RegexpTokenizer
+
+from recommenders.datasets.download_utils import (
+ maybe_download,
+ download_path,
+ unzip_file,
+)
+
+
+URL_MIND_LARGE_TRAIN = (
+ "https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip"
+)
+URL_MIND_LARGE_VALID = (
+ "https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip"
+)
+URL_MIND_SMALL_TRAIN = (
+ "https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip"
+)
+URL_MIND_SMALL_VALID = (
+ "https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip"
+)
+URL_MIND_DEMO_TRAIN = (
+ "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip"
+)
+URL_MIND_DEMO_VALID = (
+ "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip"
+)
+URL_MIND_DEMO_UTILS = (
+ "https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip"
+)
+
+URL_MIND = {
+ "large": (URL_MIND_LARGE_TRAIN, URL_MIND_LARGE_VALID),
+ "small": (URL_MIND_SMALL_TRAIN, URL_MIND_SMALL_VALID),
+ "demo": (URL_MIND_DEMO_TRAIN, URL_MIND_DEMO_VALID),
+}
+
+logger = logging.getLogger()
+
+
+[docs]def download_mind(size="small", dest_path=None):
+ """Download MIND dataset
+
+ Args:
+ size (str): Dataset size. One of ["small", "large"]
+ dest_path (str): Download path. If path is None, it will download the dataset on a temporal path
+
+ Returns:
+ str, str: Path to train and validation sets.
+ """
+ size_options = ["small", "large", "demo"]
+ if size not in size_options:
+ raise ValueError(f"Wrong size option, available options are {size_options}")
+ url_train, url_valid = URL_MIND[size]
+ with download_path(dest_path) as path:
+ train_path = maybe_download(url=url_train, work_directory=path)
+ valid_path = maybe_download(url=url_valid, work_directory=path)
+ return train_path, valid_path
+
+
+def extract_mind(
+ train_zip,
+ valid_zip,
+ train_folder="train",
+ valid_folder="valid",
+ clean_zip_file=True,
+):
+ """Extract MIND dataset
+
+ Args:
+ train_zip (str): Path to train zip file
+ valid_zip (str): Path to valid zip file
+ train_folder (str): Destination forder for train set
+ valid_folder (str): Destination forder for validation set
+
+ Returns:
+ str, str: Train and validation folders
+ """
+ root_folder = os.path.basename(train_zip)
+ train_path = os.path.join(root_folder, train_folder)
+ valid_path = os.path.join(root_folder, valid_folder)
+ unzip_file(train_zip, train_path, clean_zip_file=clean_zip_file)
+ unzip_file(valid_zip, valid_path, clean_zip_file=clean_zip_file)
+ return train_path, valid_path
+
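+# Hedged sketch (not part of the original module): download and extract the
+# "demo" split, then read its click histories. "behaviors.tsv" is the behaviors
+# file name shipped inside the MIND zips.
+def _example_download_mind_demo(dest_dir="./mind_demo"):
+    """Illustrative only; downloads the MIND demo zips when called."""
+    train_zip, valid_zip = download_mind(size="demo", dest_path=dest_dir)
+    train_dir, _ = extract_mind(train_zip, valid_zip, clean_zip_file=False)
+    sessions, history = read_clickhistory(train_dir, "behaviors.tsv")
+    return len(sessions), len(history)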
+
+def read_clickhistory(path, filename):
+    """Read click history file
+
+    Args:
+        path (str): Folder path
+        filename (str): Filename
+
+    Returns:
+        list, dict:
+            - A list of user sessions with user_id, clicks, positive and negative interactions.
+ - A dictionary with user_id click history.
+ """
+ userid_history = {}
+ with open(os.path.join(path, filename)) as f:
+ lines = f.readlines()
+ sessions = []
+ for i in range(len(lines)):
+ _, userid, imp_time, click, imps = lines[i].strip().split("\t")
+ clicks = click.split(" ")
+ pos = []
+ neg = []
+ imps = imps.split(" ")
+ for imp in imps:
+ if imp.split("-")[1] == "1":
+ pos.append(imp.split("-")[0])
+ else:
+ neg.append(imp.split("-")[0])
+ userid_history[userid] = clicks
+ sessions.append([userid, clicks, pos, neg])
+ return sessions, userid_history
+
+
+def _newsample(nnn, ratio):
+ if ratio > len(nnn):
+ return random.sample(nnn * (ratio // len(nnn) + 1), ratio)
+ else:
+ return random.sample(nnn, ratio)
+
+
+def get_train_input(session, train_file_path, npratio=4):
+    """Generate train file.
+
+    Args:
+        session (list): List of user sessions with user_id, clicks, positive and negative interactions.
+        train_file_path (str): Path to file.
+        npratio (int): Ratio for negative sampling.
+ """
+ fp_train = open(train_file_path, "w", encoding="utf-8")
+ for sess_id in range(len(session)):
+ sess = session[sess_id]
+ userid, _, poss, negs = sess
+ for i in range(len(poss)):
+ pos = poss[i]
+ neg = _newsample(negs, npratio)
+ fp_train.write("1 " + "train_" + userid + " " + pos + "\n")
+ for neg_ins in neg:
+ fp_train.write("0 " + "train_" + userid + " " + neg_ins + "\n")
+ fp_train.close()
+ if os.path.isfile(train_file_path):
+ logger.info(f"Train file {train_file_path} successfully generated")
+ else:
+ raise FileNotFoundError(f"Error when generating {train_file_path}")
+
+
+def get_valid_input(session, valid_file_path):
+    """Generate validation file.
+
+    Args:
+        session (list): List of user sessions with user_id, clicks, positive and negative interactions.
+ valid_file_path (str): Path to file.
+ """
+ fp_valid = open(valid_file_path, "w", encoding="utf-8")
+ for sess_id in range(len(session)):
+ userid, _, poss, negs = session[sess_id]
+ for i in range(len(poss)):
+ fp_valid.write(
+ "1 " + "valid_" + userid + " " + poss[i] + "%" + str(sess_id) + "\n"
+ )
+ for i in range(len(negs)):
+ fp_valid.write(
+ "0 " + "valid_" + userid + " " + negs[i] + "%" + str(sess_id) + "\n"
+ )
+ fp_valid.close()
+ if os.path.isfile(valid_file_path):
+ logger.info(f"Validation file {valid_file_path} successfully generated")
+ else:
+ raise FileNotFoundError(f"Error when generating {valid_file_path}")
+
+
+def get_user_history(train_history, valid_history, user_history_path):
+ """Generate user history file.
+
+ Args:
+ train_history (list): Train history.
+ valid_history (list): Validation history
+ user_history_path (str): Path to file.
+ """
+ fp_user_history = open(user_history_path, "w", encoding="utf-8")
+ for userid in train_history:
+ fp_user_history.write(
+ "train_" + userid + " " + ",".join(train_history[userid]) + "\n"
+ )
+ for userid in valid_history:
+ fp_user_history.write(
+ "valid_" + userid + " " + ",".join(valid_history[userid]) + "\n"
+ )
+ fp_user_history.close()
+ if os.path.isfile(user_history_path):
+ logger.info(f"User history file {user_history_path} successfully generated")
+ else:
+ raise FileNotFoundError(f"Error when generating {user_history_path}")
+
+
+def _read_news(filepath, news_words, news_entities, tokenizer):
+ with open(filepath, encoding="utf-8") as f:
+ lines = f.readlines()
+ for line in lines:
+ splitted = line.strip("\n").split("\t")
+ news_words[splitted[0]] = tokenizer.tokenize(splitted[3].lower())
+ news_entities[splitted[0]] = []
+ for entity in json.loads(splitted[6]):
+ news_entities[splitted[0]].append(
+ (entity["SurfaceForms"], entity["WikidataId"])
+ )
+ return news_words, news_entities
+
+
+def get_words_and_entities(train_news, valid_news):
+ """Load words and entities
+
+ Args:
+ train_news (str): News train file.
+ valid_news (str): News validation file.
+
+ Returns:
+ dict, dict: Words and entities dictionaries.
+ """
+ news_words = {}
+ news_entities = {}
+ tokenizer = RegexpTokenizer(r"\w+")
+ news_words, news_entities = _read_news(
+ train_news, news_words, news_entities, tokenizer
+ )
+ news_words, news_entities = _read_news(
+ valid_news, news_words, news_entities, tokenizer
+ )
+ return news_words, news_entities
+
+
+def download_and_extract_glove(dest_path):
+ """Download and extract the Glove embedding
+
+ Args:
+ dest_path (str): Destination directory path for the downloaded file
+
+ Returns:
+ str: File path where Glove was extracted.
+ """
+ # url = "http://nlp.stanford.edu/data/glove.6B.zip"
+ url = "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip"
+ filepath = maybe_download(url=url, work_directory=dest_path)
+ glove_path = os.path.join(dest_path, "glove")
+ unzip_file(filepath, glove_path, clean_zip_file=False)
+ return glove_path
+
+
+def generate_embeddings(
+ data_path,
+ news_words,
+ news_entities,
+ train_entities,
+ valid_entities,
+ max_sentence=10,
+ word_embedding_dim=100,
+):
+ """Generate embeddings.
+
+ Args:
+ data_path (str): Data path.
+ news_words (dict): News word dictionary.
+ news_entities (dict): News entity dictionary.
+ train_entities (str): Train entity file.
+ valid_entities (str): Validation entity file.
+ max_sentence (int): Max sentence size.
+ word_embedding_dim (int): Word embedding dimension.
+
+ Returns:
+ str, str, str: File paths to news, word and entity embeddings.
+ """
+ embedding_dimensions = [50, 100, 200, 300]
+ if word_embedding_dim not in embedding_dimensions:
+ raise ValueError(
+ f"Wrong embedding dimension, available options are {embedding_dimensions}"
+ )
+
+ logger.info("Downloading glove...")
+ glove_path = download_and_extract_glove(data_path)
+
+ word_set = set()
+ word_embedding_dict = {}
+ entity_embedding_dict = {}
+
+ logger.info(f"Loading glove with embedding dimension {word_embedding_dim}...")
+ glove_file = "glove.6B." + str(word_embedding_dim) + "d.txt"
+ fp_pretrain_vec = open(os.path.join(glove_path, glove_file), "r", encoding="utf-8")
+ for line in fp_pretrain_vec:
+ linesplit = line.split(" ")
+ word_set.add(linesplit[0])
+ word_embedding_dict[linesplit[0]] = np.asarray(list(map(float, linesplit[1:])))
+ fp_pretrain_vec.close()
+
+ logger.info("Reading train entities...")
+ fp_entity_vec_train = open(train_entities, "r", encoding="utf-8")
+ for line in fp_entity_vec_train:
+ linesplit = line.split()
+ entity_embedding_dict[linesplit[0]] = np.asarray(
+ list(map(float, linesplit[1:]))
+ )
+ fp_entity_vec_train.close()
+
+ logger.info("Reading valid entities...")
+ fp_entity_vec_valid = open(valid_entities, "r", encoding="utf-8")
+ for line in fp_entity_vec_valid:
+ linesplit = line.split()
+ entity_embedding_dict[linesplit[0]] = np.asarray(
+ list(map(float, linesplit[1:]))
+ )
+ fp_entity_vec_valid.close()
+
+ logger.info("Generating word and entity indexes...")
+ word_dict = {}
+ word_index = 1
+ news_word_string_dict = {}
+ news_entity_string_dict = {}
+ entity2index = {}
+ entity_index = 1
+ for doc_id in news_words:
+ news_word_string_dict[doc_id] = [0 for n in range(max_sentence)]
+ news_entity_string_dict[doc_id] = [0 for n in range(max_sentence)]
+ surfaceform_entityids = news_entities[doc_id]
+ for item in surfaceform_entityids:
+ if item[1] not in entity2index and item[1] in entity_embedding_dict:
+ entity2index[item[1]] = entity_index
+ entity_index = entity_index + 1
+ for i in range(len(news_words[doc_id])):
+ if news_words[doc_id][i] in word_embedding_dict:
+ if news_words[doc_id][i] not in word_dict:
+ word_dict[news_words[doc_id][i]] = word_index
+ word_index = word_index + 1
+ news_word_string_dict[doc_id][i] = word_dict[news_words[doc_id][i]]
+ else:
+ news_word_string_dict[doc_id][i] = word_dict[news_words[doc_id][i]]
+ for item in surfaceform_entityids:
+ for surface in item[0]:
+ for surface_word in surface.split(" "):
+ if news_words[doc_id][i] == surface_word.lower():
+ if item[1] in entity_embedding_dict:
+ news_entity_string_dict[doc_id][i] = entity2index[
+ item[1]
+ ]
+ if i == max_sentence - 1:
+ break
+
+ logger.info("Generating word embeddings...")
+ word_embeddings = np.zeros([word_index, word_embedding_dim])
+ for word in word_dict:
+ word_embeddings[word_dict[word]] = word_embedding_dict[word]
+
+ logger.info("Generating entity embeddings...")
+ entity_embeddings = np.zeros([entity_index, word_embedding_dim])
+ for entity in entity2index:
+ entity_embeddings[entity2index[entity]] = entity_embedding_dict[entity]
+
+ news_feature_path = os.path.join(data_path, "doc_feature.txt")
+ logger.info(f"Saving word and entity features in {news_feature_path}")
+ fp_doc_string = open(news_feature_path, "w", encoding="utf-8")
+ for doc_id in news_word_string_dict:
+ fp_doc_string.write(
+ doc_id
+ + " "
+ + ",".join(list(map(str, news_word_string_dict[doc_id])))
+ + " "
+ + ",".join(list(map(str, news_entity_string_dict[doc_id])))
+ + "\n"
+ )
+
+ word_embeddings_path = os.path.join(
+ data_path, "word_embeddings_5w_" + str(word_embedding_dim) + ".npy"
+ )
+ logger.info(f"Saving word embeddings in {word_embeddings_path}")
+ np.save(word_embeddings_path, word_embeddings)
+
+ entity_embeddings_path = os.path.join(
+ data_path, "entity_embeddings_5w_" + str(word_embedding_dim) + ".npy"
+ )
+ logger.info(f"Saving word embeddings in {entity_embeddings_path}")
+ np.save(entity_embeddings_path, entity_embeddings)
+
+ return news_feature_path, word_embeddings_path, entity_embeddings_path
+
+
+def load_glove_matrix(path_emb, word_dict, word_embedding_dim):
+    """Load the pretrained embedding matrix of the words in word_dict.
+
+    Args:
+        path_emb (str): Folder path of the downloaded GloVe file.
+        word_dict (dict): Word dictionary.
+        word_embedding_dim (int): Dimension of the word embedding vectors.
+
+    Returns:
+        numpy.ndarray, list: Pretrained word embedding matrix, and the list of words found in the GloVe file.
+    """
+
+ embedding_matrix = np.zeros((len(word_dict) + 1, word_embedding_dim))
+ exist_word = []
+
+ with open(os.path.join(path_emb, f"glove.6B.{word_embedding_dim}d.txt"), "rb") as f:
+ for l in tqdm(f): # noqa: E741 ambiguous variable name 'l'
+ l = l.split() # noqa: E741 ambiguous variable name 'l'
+ word = l[0].decode()
+ if len(word) != 0:
+ if word in word_dict:
+ wordvec = [float(x) for x in l[1:]]
+ index = word_dict[word]
+ embedding_matrix[index] = np.array(wordvec)
+ exist_word.append(word)
+
+ return embedding_matrix, exist_word
+
+
+def word_tokenize(sent):
+    """Tokenize a sentence.
+
+    Args:
+        sent (str): The sentence to be tokenized.
+
+    Returns:
+        list: Words in the sentence.
+    """
+
+ # treat consecutive words or special punctuation as words
+ pat = re.compile(r"[\w]+|[.,!?;|]")
+ if isinstance(sent, str):
+ return pat.findall(sent.lower())
+ else:
+ return []
+
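+# Hedged sketch (not part of the original module): word_tokenize lower-cases its
+# input and keeps word characters plus the punctuation marks . , ! ? ; |
+def _example_word_tokenize():
+    """Illustrative only."""
+    tokens = word_tokenize("Breaking News: MIND, a large-scale dataset!")
+    # -> ['breaking', 'news', 'mind', ',', 'a', 'large', 'scale', 'dataset', '!']
+    return tokens
+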
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import re
+import random
+import shutil
+import warnings
+import pandas as pd
+from typing import Optional
+from zipfile import ZipFile
+from recommenders.datasets.download_utils import maybe_download, download_path
+from recommenders.utils.notebook_utils import is_databricks
+from recommenders.utils.constants import (
+ DEFAULT_HEADER,
+ DEFAULT_ITEM_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_TIMESTAMP_COL,
+ DEFAULT_TITLE_COL,
+ DEFAULT_GENRE_COL,
+)
+
+try:
+ from pyspark.sql.types import (
+ StructType,
+ StructField,
+ StringType,
+ IntegerType,
+ FloatType,
+ LongType,
+ )
+except ImportError:
+ pass # so the environment without spark doesn't break
+
+import pandera as pa
+import pandera.extensions as extensions
+from pandera import Field
+from pandera.typing import Series
+
+
+class _DataFormat:
+ def __init__(
+ self,
+ sep,
+ path,
+ has_header=False,
+ item_sep=None,
+ item_path=None,
+ item_has_header=False,
+ ):
+ """MovieLens data format container as a different size of MovieLens data file
+ has a different format
+
+ Args:
+ sep (str): Rating data delimiter
+ path (str): Rating data path within the original zip file
+ has_header (bool): Whether the rating data contains a header line or not
+ item_sep (str): Item data delimiter
+ item_path (str): Item data path within the original zip file
+ item_has_header (bool): Whether the item data contains a header line or not
+ """
+
+ # Rating file
+ self._sep = sep
+ self._path = path
+ self._has_header = has_header
+
+ # Item file
+ self._item_sep = item_sep
+ self._item_path = item_path
+ self._item_has_header = item_has_header
+
+ @property
+ def separator(self):
+ return self._sep
+
+ @property
+ def path(self):
+ return self._path
+
+ @property
+ def has_header(self):
+ return self._has_header
+
+ @property
+ def item_separator(self):
+ return self._item_sep
+
+ @property
+ def item_path(self):
+ return self._item_path
+
+ @property
+ def item_has_header(self):
+ return self._item_has_header
+
+
+# 10m and 20m data do not have user data
+DATA_FORMAT = {
+ "100k": _DataFormat("\t", "ml-100k/u.data", False, "|", "ml-100k/u.item", False),
+ "1m": _DataFormat(
+ "::", "ml-1m/ratings.dat", False, "::", "ml-1m/movies.dat", False
+ ),
+ "10m": _DataFormat(
+ "::", "ml-10M100K/ratings.dat", False, "::", "ml-10M100K/movies.dat", False
+ ),
+ "20m": _DataFormat(",", "ml-20m/ratings.csv", True, ",", "ml-20m/movies.csv", True),
+}
+
+# Fake data for testing only
+MOCK_DATA_FORMAT = {
+ "mock100": {"size": 100, "seed": 6},
+}
+
+# 100K data genres index to string mapper. For 1m, 10m, and 20m, the genres labels are already in the dataset.
+GENRES = (
+ "unknown",
+ "Action",
+ "Adventure",
+ "Animation",
+ "Children's",
+ "Comedy",
+ "Crime",
+ "Documentary",
+ "Drama",
+ "Fantasy",
+ "Film-Noir",
+ "Horror",
+ "Musical",
+ "Mystery",
+ "Romance",
+ "Sci-Fi",
+ "Thriller",
+ "War",
+ "Western",
+)
+
+
+# Warning and error messages
+WARNING_MOVIE_LENS_HEADER = """MovieLens rating dataset has four columns
+ (user id, movie id, rating, and timestamp), but more than four column names are provided.
+ Will only use the first four column names."""
+WARNING_HAVE_SCHEMA_AND_HEADER = """Both schema and header are provided.
+ The header argument will be ignored."""
+ERROR_MOVIE_LENS_SIZE = (
+ "Invalid data size. Should be one of {100k, 1m, 10m, or 20m, or mock100}"
+)
+ERROR_HEADER = "Header error. At least user and movie column names should be provided"
+
+
+def load_pandas_df(
+ size="100k",
+ header=None,
+ local_cache_path=None,
+ title_col=None,
+ genres_col=None,
+ year_col=None,
+):
+ """Loads the MovieLens dataset as pd.DataFrame.
+
+ Download the dataset from https://files.grouplens.org/datasets/movielens, unzip, and load.
+ To load movie information only, you can use load_item_df function.
+
+ Args:
+ size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100").
+ header (list or tuple or None): Rating dataset header.
+ If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored and data is rendered using the 'DEFAULT_HEADER' instead.
+ local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
+ If None, all the intermediate files will be stored in a temporary directory and removed after use.
+ If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored.
+ title_col (str): Movie title column name. If None, the column will not be loaded.
+ genres_col (str): Genres column name. Genres are '|' separated string.
+ If None, the column will not be loaded.
+ year_col (str): Movie release year column name. If None, the column will not be loaded.
+ If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored.
+
+ Returns:
+ pandas.DataFrame: Movie rating dataset.
+
+
+ **Examples**
+
+ .. code-block:: python
+
+ # To load just user-id, item-id, and ratings from MovieLens-1M dataset,
+ df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating'))
+
+ # To load rating's timestamp together,
+ df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))
+
+ # To load movie's title, genres, and released year info along with the ratings data,
+ df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
+ title_col='Title',
+ genres_col='Genres',
+ year_col='Year'
+ )
+ """
+ size = size.lower()
+ if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT:
+ raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)
+
+ if header is None:
+ header = DEFAULT_HEADER
+ elif len(header) < 2:
+ raise ValueError(ERROR_HEADER)
+ elif len(header) > 4:
+ warnings.warn(WARNING_MOVIE_LENS_HEADER)
+ header = header[:4]
+
+ if size in MOCK_DATA_FORMAT:
+ # generate fake data
+ return MockMovielensSchema.get_df(
+ keep_first_n_cols=len(header),
+ keep_title_col=(title_col is not None),
+ keep_genre_col=(genres_col is not None),
+ **MOCK_DATA_FORMAT[
+ size
+ ], # supply the rest of the kwarg with the dictionary
+ )
+
+ movie_col = header[1]
+
+ with download_path(local_cache_path) as path:
+ filepath = os.path.join(path, "ml-{}.zip".format(size))
+ datapath, item_datapath = _maybe_download_and_extract(size, filepath)
+
+ # Load movie features such as title, genres, and release year
+ item_df = _load_item_df(
+ size, item_datapath, movie_col, title_col, genres_col, year_col
+ )
+
+ # Load rating data
+ df = pd.read_csv(
+ datapath,
+ sep=DATA_FORMAT[size].separator,
+ engine="python",
+ names=header,
+ usecols=[*range(len(header))],
+ header=0 if DATA_FORMAT[size].has_header else None,
+ )
+
+ # Convert 'rating' type to float
+ if len(header) > 2:
+ df[header[2]] = df[header[2]].astype(float)
+
+ # Merge rating df w/ item_df
+ if item_df is not None:
+ df = df.merge(item_df, on=header[1])
+
+ return df
+
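+# Hedged sketch (not part of the original module): the "mock100" size renders a
+# small fake dataset through MockMovielensSchema, so nothing is downloaded. This
+# is handy for unit tests and quick smoke runs.
+def _example_load_mock_movielens():
+    """Illustrative only; no network access is needed for the mock size."""
+    df = load_pandas_df(size="mock100")
+    return df.head()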
+
+def load_item_df(
+ size="100k",
+ local_cache_path=None,
+ movie_col=DEFAULT_ITEM_COL,
+ title_col=None,
+ genres_col=None,
+ year_col=None,
+):
+ """Loads Movie info.
+
+ Args:
+ size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
+ local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
+ If None, all the intermediate files will be stored in a temporary directory and removed after use.
+ movie_col (str): Movie id column name.
+ title_col (str): Movie title column name. If None, the column will not be loaded.
+ genres_col (str): Genres column name. Genres are '|' separated string.
+ If None, the column will not be loaded.
+ year_col (str): Movie release year column name. If None, the column will not be loaded.
+
+ Returns:
+ pandas.DataFrame: Movie information data, such as title, genres, and release year.
+ """
+ size = size.lower()
+ if size not in DATA_FORMAT:
+ raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)
+
+ with download_path(local_cache_path) as path:
+ filepath = os.path.join(path, "ml-{}.zip".format(size))
+ _, item_datapath = _maybe_download_and_extract(size, filepath)
+ item_df = _load_item_df(
+ size, item_datapath, movie_col, title_col, genres_col, year_col
+ )
+
+ return item_df
+
+
+def _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col):
+ """Loads Movie info"""
+ if title_col is None and genres_col is None and year_col is None:
+ return None
+
+ item_header = [movie_col]
+ usecols = [0]
+
+ # Year is parsed from title
+ if title_col is not None or year_col is not None:
+ item_header.append("title_year")
+ usecols.append(1)
+
+ genres_header_100k = None
+ if genres_col is not None:
+ # 100k data's movie genres are encoded as a binary array (the last 19 fields)
+ # For details, see https://files.grouplens.org/datasets/movielens/ml-100k-README.txt
+ if size == "100k":
+ genres_header_100k = [*(str(i) for i in range(19))]
+ item_header.extend(genres_header_100k)
+ usecols.extend([*range(5, 24)]) # genres columns
+ else:
+ item_header.append(genres_col)
+ usecols.append(2) # genres column
+
+ item_df = pd.read_csv(
+ item_datapath,
+ sep=DATA_FORMAT[size].item_separator,
+ engine="python",
+ names=item_header,
+ usecols=usecols,
+ header=0 if DATA_FORMAT[size].item_has_header else None,
+ encoding="ISO-8859-1",
+ )
+
+ # Convert 100k data's format: '0|0|1|...' to 'Action|Romance|..."
+ if genres_header_100k is not None:
+ item_df[genres_col] = item_df[genres_header_100k].values.tolist()
+ item_df[genres_col] = item_df[genres_col].map(
+ lambda l: "|".join([GENRES[i] for i, v in enumerate(l) if v == 1])
+ )
+
+ item_df.drop(genres_header_100k, axis=1, inplace=True)
+
+ # Parse year from movie title. Note, MovieLens title format is "title (year)"
+ # Note, there are very few records that are missing the year info.
+ if year_col is not None:
+
+ def parse_year(t):
+ parsed = re.split("[()]", t)
+ if len(parsed) > 2 and parsed[-2].isdecimal():
+ return parsed[-2]
+ else:
+ return None
+
+ item_df[year_col] = item_df["title_year"].map(parse_year)
+ if title_col is None:
+ item_df.drop("title_year", axis=1, inplace=True)
+
+ if title_col is not None:
+ item_df.rename(columns={"title_year": title_col}, inplace=True)
+
+ return item_df
+
+
+def load_spark_df(
+ spark,
+ size="100k",
+ header=None,
+ schema=None,
+ local_cache_path=None,
+ dbutils=None,
+ title_col=None,
+ genres_col=None,
+ year_col=None,
+):
+ """Loads the MovieLens dataset as `pyspark.sql.DataFrame`.
+
+ Download the dataset from https://files.grouplens.org/datasets/movielens, unzip, and load as `pyspark.sql.DataFrame`.
+
+ To load movie information only, you can use `load_item_df` function.
+
+ Args:
+ spark (pyspark.SparkSession): Spark session.
+ size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100").
+ header (list or tuple): Rating dataset header.
+ If `schema` is provided or `size` is set to any of 'MOCK_DATA_FORMAT', this argument is ignored.
+ schema (pyspark.StructType): Dataset schema.
+ If `size` is set to any of 'MOCK_DATA_FORMAT', data is rendered in the 'MockMovielensSchema' instead.
+ local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
+ If None, all the intermediate files will be stored in a temporary directory and removed after use.
+ dbutils (Databricks.dbutils): Databricks utility object
+ If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored.
+ title_col (str): Title column name. If None, the column will not be loaded.
+ genres_col (str): Genres column name. Genres are '|' separated string.
+ If None, the column will not be loaded.
+ year_col (str): Movie release year column name. If None, the column will not be loaded.
+ If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored.
+
+ Returns:
+ pyspark.sql.DataFrame: Movie rating dataset.
+
+ **Examples**
+
+ .. code-block:: python
+
+ # To load just user-id, item-id, and ratings from MovieLens-1M dataset:
+ spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))
+
+ # The schema can be defined as well:
+ schema = StructType([
+ StructField(DEFAULT_USER_COL, IntegerType()),
+ StructField(DEFAULT_ITEM_COL, IntegerType()),
+ StructField(DEFAULT_RATING_COL, FloatType()),
+ StructField(DEFAULT_TIMESTAMP_COL, LongType()),
+ ])
+ spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'), schema=schema)
+
+ # To load rating's timestamp together:
+ spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))
+
+ # To load the movie title, genres, and release year info along with the ratings data:
+ spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
+ title_col='Title',
+ genres_col='Genres',
+ year_col='Year'
+ )
+
+ # On Databricks, pass the dbutils argument as follows:
+ spark_df = load_spark_df(spark, dbutils=dbutils)
+ """
+ size = size.lower()
+ if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT:
+ raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)
+
+ if size in MOCK_DATA_FORMAT:
+ # generate fake data
+ return MockMovielensSchema.get_spark_df(
+ spark,
+ keep_title_col=(title_col is not None),
+ keep_genre_col=(genres_col is not None),
+ **MOCK_DATA_FORMAT[
+ size
+ ], # supply the remaining kwargs from the dictionary
+ )
+
+ schema = _get_schema(header, schema)
+ if len(schema) < 2:
+ raise ValueError(ERROR_HEADER)
+
+ movie_col = schema[1].name
+
+ with download_path(local_cache_path) as path:
+ filepath = os.path.join(path, "ml-{}.zip".format(size))
+ datapath, item_datapath = _maybe_download_and_extract(size, filepath)
+ spark_datapath = "file:///" + datapath # shortened form of file://localhost/
+
+ # Load movie features such as title, genres, and release year.
+ # Since the file size is small, we directly load as pd.DataFrame from the driver node
+ # and then convert into pyspark.sql.DataFrame
+ item_pd_df = _load_item_df(
+ size, item_datapath, movie_col, title_col, genres_col, year_col
+ )
+ item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None
+
+ if is_databricks():
+ if dbutils is None:
+ raise ValueError(
+ """
+ To use on Databricks, the dbutils object should be passed as an argument.
+ E.g. load_spark_df(spark, dbutils=dbutils)
+ """
+ )
+
+ # Move rating file to DBFS in order to load into pyspark.sql.DataFrame
+ dbfs_datapath = "dbfs:/tmp/" + datapath
+ dbutils.fs.mv(spark_datapath, dbfs_datapath)
+ spark_datapath = dbfs_datapath
+
+ # pyspark's CSV reader currently doesn't support multi-character delimiters, so we handle that manually here
+ separator = DATA_FORMAT[size].separator
+ if len(separator) > 1:
+ raw_data = spark.sparkContext.textFile(spark_datapath)
+ data_rdd = raw_data.map(lambda l: l.split(separator)).map(
+ lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
+ )
+ df = spark.createDataFrame(data_rdd, schema)
+ else:
+ df = spark.read.csv(
+ spark_datapath,
+ schema=schema,
+ sep=separator,
+ header=DATA_FORMAT[size].has_header,
+ )
+
+ # Merge rating df w/ item_df
+ if item_df is not None:
+ df = df.join(item_df, movie_col, "left")
+
+ # Cache and force trigger action since data-file might be removed.
+ df.cache()
+ df.count()
+
+ return df
+
+
+def _get_schema(header, schema):
+ if schema is None or len(schema) == 0:
+ # Use header to generate schema
+ if header is None or len(header) == 0:
+ header = DEFAULT_HEADER
+ elif len(header) > 4:
+ warnings.warn(WARNING_MOVIE_LENS_HEADER)
+ header = header[:4]
+
+ schema = StructType()
+ try:
+ (
+ schema.add(StructField(header[0], IntegerType()))
+ .add(StructField(header[1], IntegerType()))
+ .add(StructField(header[2], FloatType()))
+ .add(StructField(header[3], LongType()))
+ )
+ except IndexError:
+ pass
+ else:
+ if header is not None:
+ warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)
+
+ if len(schema) > 4:
+ warnings.warn(WARNING_MOVIE_LENS_HEADER)
+ schema = schema[:4]
+
+ return schema
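+
+# Illustrative sketch (not part of the original module): a three-element header builds a
+# three-field schema; the IndexError raised when header[3] is missing is swallowed on
+# purpose, so the timestamp field is simply omitted.
+#
+#   >>> schema = _get_schema(("UserId", "ItemId", "Rating"), None)
+#   >>> [field.name for field in schema.fields]
+#   ['UserId', 'ItemId', 'Rating']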
+
+
+def _maybe_download_and_extract(size, dest_path):
+ """Downloads and extracts MovieLens rating and item datafiles if they don’t already exist"""
+ dirs, _ = os.path.split(dest_path)
+ if not os.path.exists(dirs):
+ os.makedirs(dirs)
+
+ _, rating_filename = os.path.split(DATA_FORMAT[size].path)
+ rating_path = os.path.join(dirs, rating_filename)
+ _, item_filename = os.path.split(DATA_FORMAT[size].item_path)
+ item_path = os.path.join(dirs, item_filename)
+
+ if not os.path.exists(rating_path) or not os.path.exists(item_path):
+ download_movielens(size, dest_path)
+ extract_movielens(size, rating_path, item_path, dest_path)
+
+ return rating_path, item_path
+
+
+[docs]def download_movielens(size, dest_path):
+ """Downloads MovieLens datafile.
+
+ Args:
+ size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
+ dest_path (str): File path for the downloaded file
+ """
+ if size not in DATA_FORMAT:
+ raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)
+
+ url = "https://files.grouplens.org/datasets/movielens/ml-" + size + ".zip"
+ dirs, file = os.path.split(dest_path)
+ maybe_download(url, file, work_directory=dirs)
+
+
+[docs]def extract_movielens(size, rating_path, item_path, zip_path):
+ """Extract MovieLens rating and item datafiles from the MovieLens raw zip file.
+
+ To extract all files instead of just rating and item datafiles,
+ use ZipFile's extractall(path) instead.
+
+ Args:
+ size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
+ rating_path (str): Destination path for rating datafile
+ item_path (str): Destination path for item datafile
+ zip_path (str): zipfile path
+ """
+ with ZipFile(zip_path, "r") as z:
+ with z.open(DATA_FORMAT[size].path) as zf, open(rating_path, "wb") as f:
+ shutil.copyfileobj(zf, f)
+ with z.open(DATA_FORMAT[size].item_path) as zf, open(item_path, "wb") as f:
+ shutil.copyfileobj(zf, f)
+
+
+# For more information on data synthesis, see https://pandera.readthedocs.io/en/latest/data_synthesis_strategies.html
+@extensions.register_check_method(statistics=["columns"], supported_types=pd.DataFrame)
+def unique_columns(df, *, columns):
+ return not df[columns].duplicated().any()
+
+
+[docs]class MockMovielensSchema(pa.SchemaModel):
+ """
+ Mock dataset schema to generate fake data for testing purposes.
+ This schema is configured to mimic the MovieLens dataset
+
+ https://files.grouplens.org/datasets/movielens/ml-100k/
+
+ Dataset schema and generation is configured using pandera.
+ Please see https://pandera.readthedocs.io/en/latest/schema_models.html
+ for more information.
+ """
+
+ # Some notebooks will do a cross join on userID and itemID;
+ # a sparse range for these IDs can slow down the notebook tests
+ userID: Series[int] = Field(
+ in_range={"min_value": 1, "max_value": 50}, alias=DEFAULT_USER_COL
+ )
+ itemID: Series[int] = Field(
+ in_range={"min_value": 1, "max_value": 50}, alias=DEFAULT_ITEM_COL
+ )
+ rating: Series[float] = Field(
+ in_range={"min_value": 1, "max_value": 5}, alias=DEFAULT_RATING_COL
+ )
+ timestamp: Series[int] = Field(
+ in_range={"min_value": 0, "max_value": 1e9}, alias=DEFAULT_TIMESTAMP_COL
+ )
+ title: Series[str] = Field(eq="foo", alias=DEFAULT_TITLE_COL)
+ genre: Series[str] = Field(eq="genreA|0", alias=DEFAULT_GENRE_COL)
+
+[docs] @classmethod
+ def get_df(
+ cls,
+ size: int = 3,
+ seed: int = 100,
+ keep_first_n_cols: Optional[int] = None,
+ keep_title_col: bool = False,
+ keep_genre_col: bool = False,
+ ) -> pd.DataFrame:
+ """Return fake movielens dataset as a Pandas Dataframe with specified rows.
+
+ Args:
+ size (int): number of rows to generate
+ seed (int, optional): seed for the pseudo-random number generation. Defaults to 100.
+ keep_first_n_cols (int, optional): keep the first n default movielens columns.
+ keep_title_col (bool): remove the title column if False. Defaults to False.
+ keep_genre_col (bool): remove the genre column if False. Defaults to False.
+
+ Returns:
+ pandas.DataFrame: a mock dataset
+ """
+ schema = cls.to_schema()
+ if keep_first_n_cols is not None:
+ if keep_first_n_cols < 1 or keep_first_n_cols > len(DEFAULT_HEADER):
+ raise ValueError(
+ f"Invalid value for 'keep_first_n_cols': {keep_first_n_cols}. Valid range: [1-{len(DEFAULT_HEADER)}]"
+ )
+ schema = schema.remove_columns(DEFAULT_HEADER[keep_first_n_cols:])
+ if not keep_title_col:
+ schema = schema.remove_columns([DEFAULT_TITLE_COL])
+ if not keep_genre_col:
+ schema = schema.remove_columns([DEFAULT_GENRE_COL])
+
+ random.seed(seed)
+ schema.checks = [pa.Check.unique_columns([DEFAULT_USER_COL, DEFAULT_ITEM_COL])]
+ return schema.example(size=size)
+
+[docs] @classmethod
+ def get_spark_df(
+ cls,
+ spark,
+ size: int = 3,
+ seed: int = 100,
+ keep_title_col: bool = False,
+ keep_genre_col: bool = False,
+ tmp_path: Optional[str] = None,
+ ):
+ """Return fake movielens dataset as a Spark Dataframe with specified rows
+
+ Args:
+ spark (SparkSession): spark session to load the dataframe into
+ size (int): number of rows to generate
+ seed (int): seed for the pseudo-random number generation. Defaults to 100.
+ keep_title_col (bool): remove the title column if False. Defaults to False.
+ keep_genre_col (bool): remove the genre column if False. Defaults to False.
+ tmp_path (str, optional): path to store files for serialization purposes
+ when transferring data from Python to Java.
+ If None, a temporary path is used instead
+
+ Returns:
+ pyspark.sql.DataFrame: a mock dataset
+ """
+ pandas_df = cls.get_df(
+ size=size, seed=seed, keep_title_col=True, keep_genre_col=True
+ )
+
+ # generate temp folder
+ with download_path(tmp_path) as tmp_folder:
+ filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv")
+ # serialize the pandas DataFrame as a CSV to avoid expensive Java <-> Python communication
+ pandas_df.to_csv(filepath, header=False, index=False)
+ spark_df = spark.read.csv(
+ filepath, schema=cls._get_spark_deserialization_schema()
+ )
+ # Cache and force trigger action since data-file might be removed.
+ spark_df.cache()
+ spark_df.count()
+
+ if not keep_title_col:
+ spark_df = spark_df.drop(DEFAULT_TITLE_COL)
+ if not keep_genre_col:
+ spark_df = spark_df.drop(DEFAULT_GENRE_COL)
+ return spark_df
+
+ @classmethod
+ def _get_spark_deserialization_schema(cls):
+ return StructType(
+ [
+ StructField(DEFAULT_USER_COL, IntegerType()),
+ StructField(DEFAULT_ITEM_COL, IntegerType()),
+ StructField(DEFAULT_RATING_COL, FloatType()),
+ StructField(DEFAULT_TIMESTAMP_COL, StringType()),
+ StructField(DEFAULT_TITLE_COL, StringType()),
+ StructField(DEFAULT_GENRE_COL, StringType()),
+ ]
+ )
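+
+# Illustrative usage sketch (not part of the original module): generate a small fake
+# ratings frame for tests; column names come from the DEFAULT_* constants.
+#
+#   >>> mock_df = MockMovielensSchema.get_df(size=5, keep_title_col=True, keep_genre_col=True)
+#   >>> len(mock_df)
+#   5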
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import logging
+import pandas as pd
+import numpy as np
+from functools import lru_cache, wraps
+
+from recommenders.utils.constants import (
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_LABEL_COL,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+[docs]def user_item_pairs(
+ user_df,
+ item_df,
+ user_col=DEFAULT_USER_COL,
+ item_col=DEFAULT_ITEM_COL,
+ user_item_filter_df=None,
+ shuffle=True,
+ seed=None,
+):
+ """Get all pairs of users and items data.
+
+ Args:
+ user_df (pandas.DataFrame): User data containing unique user ids and optionally their features.
+ item_df (pandas.DataFrame): Item data containing unique item ids and optionally their features.
+ user_col (str): User id column name.
+ item_col (str): Item id column name.
+ user_item_filter_df (pd.DataFrame): User-item pairs to be used as a filter.
+ shuffle (bool): If True, shuffles the result.
+ seed (int): Random seed for shuffle
+
+ Returns:
+ pandas.DataFrame: All user-item pairs from user_df and item_df, excluding the pairs in user_item_filter_df.
+ """
+
+ # Get all user-item pairs
+ user_df["key"] = 1
+ item_df["key"] = 1
+ users_items = user_df.merge(item_df, on="key")
+
+ user_df.drop("key", axis=1, inplace=True)
+ item_df.drop("key", axis=1, inplace=True)
+ users_items.drop("key", axis=1, inplace=True)
+
+ # Filter
+ if user_item_filter_df is not None:
+ users_items = filter_by(users_items, user_item_filter_df, [user_col, item_col])
+
+ if shuffle:
+ users_items = users_items.sample(frac=1, random_state=seed).reset_index(
+ drop=True
+ )
+
+ return users_items
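+
+# Illustrative sketch (not part of the original module): the key-column merge acts as a
+# cross join, producing every user-item combination.
+#
+#   >>> import pandas as pd
+#   >>> users = pd.DataFrame({"userID": [1, 2]})
+#   >>> items = pd.DataFrame({"itemID": [10, 11]})
+#   >>> user_item_pairs(users, items, shuffle=False).values.tolist()
+#   [[1, 10], [1, 11], [2, 10], [2, 11]]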
+
+
+[docs]def filter_by(df, filter_by_df, filter_by_cols):
+ """From the input DataFrame `df`, remove the records whose values in the target columns `filter_by_cols`
+ exist in the filter-by DataFrame `filter_by_df`.
+
+ Args:
+ df (pandas.DataFrame): Source dataframe.
+ filter_by_df (pandas.DataFrame): Filter dataframe.
+ filter_by_cols (iterable of str): Filter columns.
+
+ Returns:
+ pandas.DataFrame: Dataframe filtered by `filter_by_df` on `filter_by_cols`.
+
+ """
+
+ return df.loc[
+ ~df.set_index(filter_by_cols).index.isin(
+ filter_by_df.set_index(filter_by_cols).index
+ )
+ ]
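+
+# Illustrative sketch (not part of the original module): rows whose (user, item)
+# combination appears in the filter frame are dropped.
+#
+#   >>> import pandas as pd
+#   >>> candidates = pd.DataFrame({"userID": [1, 1, 2], "itemID": [10, 11, 10]})
+#   >>> seen = pd.DataFrame({"userID": [1], "itemID": [10]})
+#   >>> filter_by(candidates, seen, ["userID", "itemID"]).values.tolist()
+#   [[1, 11], [2, 10]]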
+
+
+[docs]class LibffmConverter:
+ """Converts an input dataframe to another dataframe in libffm format. A text file of the converted
+ Dataframe is optionally generated.
+
+ Note:
+
+ The input dataframe is expected to represent the feature data in the following schema:
+
+ .. code-block:: python
+
+ |field-1|field-2|...|field-n|rating|
+ |feature-1-1|feature-2-1|...|feature-n-1|1|
+ |feature-1-2|feature-2-2|...|feature-n-2|0|
+ ...
+ |feature-1-i|feature-2-j|...|feature-n-k|0|
+
+ Where
+ 1. each `field-*` is the column name of the dataframe (column of label/rating is excluded), and
+ 2. `feature-*-*` can be either a string or a numerical value, representing the categorical variable or
+ actual numerical variable of the feature value in the field, respectively.
+ 3. If there are ordinal variables represented in int types, users should make sure these columns
+ are properly converted to string type.
+
+ The above data will be converted to the libffm format by following the convention as explained in
+ `this paper <https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf>`_.
+
+ i.e. `<field_index>:<field_feature_index>:1` or `<field_index>:<field_feature_index>:<field_feature_value>`,
+ depending on the data type of the features in the original dataframe.
+
+ Args:
+ filepath (str): path to save the converted data.
+
+ Attributes:
+ field_count (int): count of fields in the libffm format data
+ feature_count (int): count of features in the libffm format data
+ filepath (str or None): file path where the output is stored - it can be None or a string
+
+ Examples:
+ >>> import pandas as pd
+ >>> df_feature = pd.DataFrame({
+ 'rating': [1, 0, 0, 1, 1],
+ 'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4'],
+ 'field2': [3, 4, 5, 6, 7],
+ 'field3': [1.0, 2.0, 3.0, 4.0, 5.0],
+ 'field4': ['1', '2', '3', '4', '5']
+ })
+ >>> converter = LibffmConverter().fit(df_feature, col_rating='rating')
+ >>> df_out = converter.transform(df_feature)
+ >>> df_out
+ rating field1 field2 field3 field4
+ 0 1 1:1:1 2:4:3 3:5:1.0 4:6:1
+ 1 0 1:2:1 2:4:4 3:5:2.0 4:7:1
+ 2 0 1:3:1 2:4:5 3:5:3.0 4:8:1
+ 3 1 1:3:1 2:4:6 3:5:4.0 4:9:1
+ 4 1 1:3:1 2:4:7 3:5:5.0 4:10:1
+ """
+
+ def __init__(self, filepath=None):
+ self.filepath = filepath
+ self.col_rating = None
+ self.field_names = None
+ self.field_count = None
+ self.feature_count = None
+
+[docs] def fit(self, df, col_rating=DEFAULT_RATING_COL):
+ """Fit the dataframe for libffm format.
+ This method checks the validity of the input columns and records the rating column and field names
+
+ Args:
+ df (pandas.DataFrame): input Pandas dataframe.
+ col_rating (str): name of the rating column.
+
+ Return:
+ object: the instance of the converter
+ """
+
+ # Check column types.
+ types = df.dtypes
+ if not all(
+ [
+ x == object or np.issubdtype(x, np.integer) or np.issubdtype(x, np.floating)
+ for x in types
+ ]
+ ):
+ raise TypeError("Input columns should be only object and/or numeric types.")
+
+ if col_rating not in df.columns:
+ raise TypeError(
+ "Column of {} is not in input dataframe columns".format(col_rating)
+ )
+
+ self.col_rating = col_rating
+ self.field_names = list(df.drop(col_rating, axis=1).columns)
+
+ return self
+
+[docs] def transform(self, df):
+ """Transform an input dataset with the same schema (column names and dtypes) to libffm format
+ by using the fitted converter.
+
+ Args:
+ df (pandas.DataFrame): input Pandas dataframe.
+
+ Return:
+ pandas.DataFrame: Output libffm format dataframe.
+ """
+ if self.col_rating not in df.columns:
+ raise ValueError(
+ "Input dataset does not contain the label column {} that was used in the fitting dataset".format(
+ self.col_rating
+ )
+ )
+
+ if not all([x in df.columns for x in self.field_names]):
+ raise ValueError(
+ "Not all columns in the input dataset appear in the fitting dataset"
+ )
+
+ # Encode field-feature.
+ idx = 1
+ self.field_feature_dict = {}
+ for field in self.field_names:
+ for feature in df[field].values:
+ # Check whether (field, feature) tuple exists in the dict or not.
+ # If not, put them into the key-values of the dict and count the index.
+ if (field, feature) not in self.field_feature_dict:
+ self.field_feature_dict[(field, feature)] = idx
+ if df[field].dtype == object:
+ idx += 1
+ if df[field].dtype != object:
+ idx += 1
+
+ self.field_count = len(self.field_names)
+ self.feature_count = idx - 1
+
+ def _convert(field, feature, field_index, field_feature_index_dict):
+ field_feature_index = field_feature_index_dict[(field, feature)]
+ if isinstance(feature, str):
+ feature = 1
+ return "{}:{}:{}".format(field_index, field_feature_index, feature)
+
+ for col_index, col in enumerate(self.field_names):
+ df[col] = df[col].apply(
+ lambda x: _convert(col, x, col_index + 1, self.field_feature_dict)
+ )
+
+ # Move rating column to the first.
+ column_names = self.field_names[:]
+ column_names.insert(0, self.col_rating)
+ df = df[column_names]
+
+ if self.filepath is not None:
+ np.savetxt(self.filepath, df.values, delimiter=" ", fmt="%s")
+
+ return df
+
+[docs] def fit_transform(self, df, col_rating=DEFAULT_RATING_COL):
+ """Do fit and transform in sequence
+
+ Args:
+ df (pandas.DataFrame): input Pandas dataframe.
+ col_rating (str): name of the rating column.
+
+ Return:
+ pandas.DataFrame: Output libffm format dataframe.
+ """
+ return self.fit(df, col_rating=col_rating).transform(df)
+
+[docs] def get_params(self):
+ """Get parameters (attributes) of the libffm converter
+
+ Return:
+ dict: A dictionary that contains parameters field count, feature count, and file path.
+ """
+ return {
+ "field count": self.field_count,
+ "feature count": self.feature_count,
+ "file path": self.filepath,
+ }
+
+
+[docs]def negative_feedback_sampler(
+ df,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_label=DEFAULT_LABEL_COL,
+ col_feedback="feedback",
+ ratio_neg_per_user=1,
+ pos_value=1,
+ neg_value=0,
+ seed=42,
+):
+ """Utility function to sample negative feedback from user-item interaction dataset.
+ This negative sampling function will take the user-item interaction data to create
+ binarized feedback, i.e., 1 and 0 indicate positive and negative feedback,
+ respectively.
+
+ Negative sampling is used in the literature frequently to generate negative samples
+ from user-item interaction data.
+
+ See for example the `neural collaborative filtering paper <https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf>`_.
+
+ Args:
+ df (pandas.DataFrame): input data that contains user-item tuples.
+ col_user (str): user id column name.
+ col_item (str): item id column name.
+ col_label (str): label column name in df.
+ col_feedback (str): feedback column name in the returned data frame; it is used for the generated column
+ of positive and negative feedback.
+ ratio_neg_per_user (int): ratio of negative feedback w.r.t. the number of positive feedback samples for each user.
+ If the samples exceed the number of total possible negative feedback samples, it will be reduced to the
+ number of all the possible samples.
+ pos_value (float): value of positive feedback.
+ neg_value (float): value of negative feedback.
+ seed (int): seed for the random state of the sampling function.
+
+ Returns:
+ pandas.DataFrame: Data with negative feedback.
+
+ Examples:
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({
+ 'userID': [1, 2, 3],
+ 'itemID': [1, 2, 3],
+ 'rating': [5, 5, 5]
+ })
+ >>> df_neg_sampled = negative_feedback_sampler(
+ df, col_user='userID', col_item='itemID', ratio_neg_per_user=1
+ )
+ >>> df_neg_sampled
+ userID itemID feedback
+ 1 1 1
+ 1 2 0
+ 2 2 1
+ 2 1 0
+ 3 3 1
+ 3 1 0
+ """
+ # Get all of the users and items.
+ items = df[col_item].unique()
+ rng = np.random.default_rng(seed=seed)
+
+ def sample_items(user_df):
+ # Sample negative items for the data frame restricted to a specific user
+ n_u = len(user_df)
+ neg_sample_size = max(round(n_u * ratio_neg_per_user), 1)
+ # Draw (n_u + neg_sample_size) items and keep neg_sample_size of these
+ # that are not already in user_df. This requires a set difference from items_sample
+ # instead of items, which is more efficient when len(items) is large.
+ sample_size = min(n_u + neg_sample_size, len(items))
+ items_sample = rng.choice(items, sample_size, replace=False)
+ new_items = np.setdiff1d(items_sample, user_df[col_item])[:neg_sample_size]
+ new_df = pd.DataFrame(
+ data={
+ col_user: user_df.name,
+ col_item: new_items,
+ col_label: neg_value,
+ }
+ )
+ return pd.concat([user_df, new_df], ignore_index=True)
+
+ res_df = df.copy()
+ res_df[col_label] = pos_value
+ return (
+ res_df.groupby(col_user)
+ .apply(sample_items)
+ .reset_index(drop=True)
+ .rename(columns={col_label: col_feedback})
+ )
+
+
+[docs]def has_columns(df, columns):
+ """Check if DataFrame has necessary columns
+
+ Args:
+ df (pandas.DataFrame): DataFrame
+ columns (iterable(str)): columns to check for
+
+ Returns:
+ bool: True if DataFrame has specified columns.
+ """
+ if not isinstance(columns, set):
+ columns = set(columns)
+ return columns.issubset(df.columns)
+
+
+[docs]def has_same_base_dtype(df_1, df_2, columns=None):
+ """Check if specified columns have the same base dtypes across both DataFrames
+
+ Args:
+ df_1 (pandas.DataFrame): first DataFrame
+ df_2 (pandas.DataFrame): second DataFrame
+ columns (list(str)): columns to check, None checks all columns
+
+ Returns:
+ bool: True if DataFrames columns have the same base dtypes.
+ """
+
+ if columns is None:
+ if any(set(df_1.columns).symmetric_difference(set(df_2.columns))):
+ logger.error(
+ "Cannot test all columns because they are not all shared across DataFrames"
+ )
+ return False
+ columns = df_1.columns
+
+ if not (
+ has_columns(df=df_1, columns=columns) and has_columns(df=df_2, columns=columns)
+ ):
+ return False
+
+ result = True
+ for column in columns:
+ if df_1[column].dtype.type.__base__ != df_2[column].dtype.type.__base__:
+ logger.error("Column {} does not have the same base datatype".format(column))
+ result = False
+
+ return result
+
+
+[docs]class PandasHash:
+ """Wrapper class to allow pandas objects (DataFrames or Series) to be hashable"""
+
+ # reserve space just for a single pandas object
+ __slots__ = "pandas_object"
+
+ def __init__(self, pandas_object):
+ """Initialize class
+
+ Args:
+ pandas_object (pandas.DataFrame|pandas.Series): pandas object
+ """
+
+ if not isinstance(pandas_object, (pd.DataFrame, pd.Series)):
+ raise TypeError("Can only wrap pandas DataFrame or Series objects")
+ self.pandas_object = pandas_object
+
+ def __eq__(self, other):
+ """Overwrite equality comparison
+
+ Args:
+ other (pandas.DataFrame|pandas.Series): pandas object to compare
+
+ Returns:
+ bool: whether other object is the same as this one
+ """
+
+ return hash(self) == hash(other)
+
+ def __hash__(self):
+ """Overwrite hash operator for use with pandas objects
+
+ Returns:
+ int: hashed value of object
+ """
+
+ hashable = tuple(self.pandas_object.values.tobytes())
+ if isinstance(self.pandas_object, pd.DataFrame):
+ hashable += tuple(self.pandas_object.columns)
+ else:
+ hashable += tuple(self.pandas_object.name)
+ return hash(hashable)
+
+
+[docs]def lru_cache_df(maxsize, typed=False):
+ """Least-recently-used cache decorator for pandas DataFrames.
+
+ Decorator to wrap a function with a memoizing callable that saves up to the maxsize most recent calls. It can
+ save time when an expensive or I/O bound function is periodically called with the same arguments.
+
+ Inspired by the `lru_cache function <https://docs.python.org/3/library/functools.html#functools.lru_cache>`_.
+
+ Args:
+ maxsize (int|None): max size of cache, if set to None cache is boundless
+ typed (bool): arguments of different types are cached separately
+ """
+
+ def to_pandas_hash(val):
+ """Return a PandasHash object if the input is a DataFrame, otherwise return the input unchanged"""
+ return PandasHash(val) if isinstance(val, pd.DataFrame) else val
+
+ def from_pandas_hash(val):
+ """Extract the DataFrame if the input is a PandasHash object, otherwise return the input unchanged"""
+ return val.pandas_object if isinstance(val, PandasHash) else val
+
+ def decorating_function(user_function):
+ @wraps(user_function)
+ def wrapper(*args, **kwargs):
+ # convert DataFrames in args and kwargs to PandasHash objects
+ args = tuple([to_pandas_hash(a) for a in args])
+ kwargs = {k: to_pandas_hash(v) for k, v in kwargs.items()}
+ return cached_wrapper(*args, **kwargs)
+
+ @lru_cache(maxsize=maxsize, typed=typed)
+ def cached_wrapper(*args, **kwargs):
+ # get DataFrames from PandasHash objects in args and kwargs
+ args = tuple([from_pandas_hash(a) for a in args])
+ kwargs = {k: from_pandas_hash(v) for k, v in kwargs.items()}
+ return user_function(*args, **kwargs)
+
+ # retain lru_cache attributes
+ wrapper.cache_info = cached_wrapper.cache_info
+ wrapper.cache_clear = cached_wrapper.cache_clear
+
+ return wrapper
+
+ return decorating_function
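+
+# Illustrative usage sketch (not part of the original module); the `user_means` function
+# below is hypothetical and only shows how the decorator is applied.
+#
+#   >>> @lru_cache_df(maxsize=4)
+#   ... def user_means(df):
+#   ...     return df.groupby("userID")["rating"].mean()
+#
+# Repeated calls with an identical DataFrame hit the cache; `user_means.cache_info()`
+# reports hits and misses just like functools.lru_cache.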
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split as sk_split
+
+from recommenders.utils.constants import (
+ DEFAULT_ITEM_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_TIMESTAMP_COL,
+)
+from recommenders.datasets.split_utils import (
+ process_split_ratio,
+ min_rating_filter_pandas,
+ split_pandas_data_with_ratios,
+)
+
+
+[docs]def python_random_split(data, ratio=0.75, seed=42):
+ """Pandas random splitter.
+
+ The splitter randomly splits the input data.
+
+ Args:
+ data (pandas.DataFrame): Pandas DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio
+ of training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ seed (int): Seed.
+
+ Returns:
+ list: Splits of the input data as pandas.DataFrame.
+ """
+ multi_split, ratio = process_split_ratio(ratio)
+
+ if multi_split:
+ splits = split_pandas_data_with_ratios(data, ratio, shuffle=True, seed=seed)
+ splits_new = [x.drop("split_index", axis=1) for x in splits]
+
+ return splits_new
+ else:
+ return sk_split(data, test_size=None, train_size=ratio, random_state=seed)
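+
+# Illustrative usage sketch (not part of the original module); `ratings_df` stands for
+# any pandas DataFrame of ratings and is assumed here.
+#
+#   >>> train, test = python_random_split(ratings_df, ratio=0.75, seed=42)
+#   >>> train, valid, test = python_random_split(ratings_df, ratio=[0.7, 0.2, 0.1], seed=42)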
+
+
+def _do_stratification(
+ data,
+ ratio=0.75,
+ min_rating=1,
+ filter_by="user",
+ is_random=True,
+ seed=42,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+):
+ # A few preliminary checks.
+ if not (filter_by == "user" or filter_by == "item"):
+ raise ValueError("filter_by should be either 'user' or 'item'.")
+
+ if min_rating < 1:
+ raise ValueError("min_rating should be integer and larger than or equal to 1.")
+
+ if col_user not in data.columns:
+ raise ValueError("Schema of data not valid. Missing User Col")
+
+ if col_item not in data.columns:
+ raise ValueError("Schema of data not valid. Missing Item Col")
+
+ if not is_random:
+ if col_timestamp not in data.columns:
+ raise ValueError("Schema of data not valid. Missing Timestamp Col")
+
+ multi_split, ratio = process_split_ratio(ratio)
+
+ split_by_column = col_user if filter_by == "user" else col_item
+
+ ratio = ratio if multi_split else [ratio, 1 - ratio]
+
+ if min_rating > 1:
+ data = min_rating_filter_pandas(
+ data,
+ min_rating=min_rating,
+ filter_by=filter_by,
+ col_user=col_user,
+ col_item=col_item,
+ )
+
+ if is_random:
+ np.random.seed(seed)
+ data["random"] = np.random.rand(data.shape[0])
+ order_by = "random"
+ else:
+ order_by = col_timestamp
+
+ data = data.sort_values([split_by_column, order_by])
+
+ groups = data.groupby(split_by_column)
+
+ data["count"] = groups[split_by_column].transform("count")
+ data["rank"] = groups.cumcount() + 1
+
+ if is_random:
+ data = data.drop("random", axis=1)
+
+ splits = []
+ prev_threshold = None
+ for threshold in np.cumsum(ratio):
+ condition = data["rank"] <= round(threshold * data["count"])
+ if prev_threshold is not None:
+ condition &= data["rank"] > round(prev_threshold * data["count"])
+ splits.append(data[condition].drop(["rank", "count"], axis=1))
+ prev_threshold = threshold
+
+ return splits
+
+
+[docs]def python_chrono_split(
+ data,
+ ratio=0.75,
+ min_rating=1,
+ filter_by="user",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+):
+ """Pandas chronological splitter.
+
+ This function splits data in a chronological manner. That is, for each user / item, the
+ split function takes proportions of ratings which is specified by the split ratio(s).
+ The split is stratified.
+
+ Args:
+ data (pandas.DataFrame): Pandas DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to
+ filter with min_rating.
+ col_user (str): column name of user IDs.
+ col_item (str): column name of item IDs.
+ col_timestamp (str): column name of timestamps.
+
+ Returns:
+ list: Splits of the input data as pandas.DataFrame.
+ """
+ return _do_stratification(
+ data,
+ ratio=ratio,
+ min_rating=min_rating,
+ filter_by=filter_by,
+ col_user=col_user,
+ col_item=col_item,
+ col_timestamp=col_timestamp,
+ is_random=False,
+ )
+
+
+[docs]def python_stratified_split(
+ data,
+ ratio=0.75,
+ min_rating=1,
+ filter_by="user",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ seed=42,
+):
+ """Pandas stratified splitter.
+
+ For each user / item, the split function takes proportions of ratings which is
+ specified by the split ratio(s). The split is stratified.
+
+ Args:
+ data (pandas.DataFrame): Pandas DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ seed (int): Seed.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to
+ filter with min_rating.
+ col_user (str): column name of user IDs.
+ col_item (str): column name of item IDs.
+
+ Returns:
+ list: Splits of the input data as pandas.DataFrame.
+ """
+ return _do_stratification(
+ data,
+ ratio=ratio,
+ min_rating=min_rating,
+ filter_by=filter_by,
+ col_user=col_user,
+ col_item=col_item,
+ is_random=True,
+ seed=seed,
+ )
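+
+# Illustrative usage sketch (not part of the original module); `ratings_df` is assumed to
+# contain at least the default userID and itemID columns. 75% of each user's ratings go to
+# the train split and the rest to the test split.
+#
+#   >>> train, test = python_stratified_split(ratings_df, ratio=0.75, filter_by="user", seed=42)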
+
+
+[docs]def numpy_stratified_split(X, ratio=0.75, seed=42):
+ """Split the user/item affinity matrix (sparse matrix) into train and test set matrices while maintaining
+ local (i.e. per user) ratios.
+
+ Main points :
+
+ 1. In a typical recommender problem, different users rate a different number of items,
+ and therefore the user/item affinity matrix has a sparse structure with a variable number
+ of zeroes (unrated items) per row (user). Cutting a total amount of ratings will
+ result in a non-homogeneous distribution between train and test set, i.e. some test
+ users may have many ratings while others have very few, if any.
+
+ 2. In an unsupervised learning problem, no explicit answer is given. For this reason
+ the split needs to be implemented in a different way than in supervised learning.
+ In the latter, one typically splits the dataset by rows (by examples), ending up with
+ the same number of features but a different number of examples in the train/test sets.
+ This scheme does not work in the unsupervised case, as part of the rated items needs to
+ be used as a test set for a fixed number of users.
+
+ Solution:
+
+ 1. Instead of cutting a total percentage, for each user we cut a relative ratio of the rated
+ items. For example, if user1 has rated 4 items and user2 has rated 10, cutting 25% will correspond to
+ 1 and 2.5 ratings in the test set, rounded to 1 and 2 by numpy's round-half-to-even rule.
+ In this way, the 0.75 ratio is approximately satisfied both locally and globally, preserving the original
+ distribution of ratings across the train and test set.
+
+ 2. It is easy (and fast) to satisfy these requirements by creating the test set via element subtraction
+ from the original dataset X. We first create two copies of X; for each user we select a random
+ sample of local size ratio (point 1) and erase the remaining ratings, obtaining in this way the
+ train set matrix Xtr. The test set matrix Xtst is obtained in the opposite way.
+
+ Args:
+ X (numpy.ndarray, int): a sparse matrix to be split
+ ratio (float): fraction of the entire dataset to constitute the train set
+ seed (int): random seed
+
+ Returns:
+ numpy.ndarray, numpy.ndarray:
+ - Xtr: The train set user/item affinity matrix.
+ - Xtst: The test set user/item affinity matrix.
+ """
+
+ np.random.seed(seed) # set the random seed
+ test_cut = int((1 - ratio) * 100) # percentage of ratings to go in the test set
+
+ # initialize train and test set matrices
+ Xtr = X.copy()
+ Xtst = X.copy()
+
+ # find the number of rated movies per user
+ rated = np.sum(Xtr != 0, axis=1)
+
+ # for each user, set aside test_cut% of the rated items for the test set
+ tst = np.around((rated * test_cut) / 100).astype(int)
+
+ for u in range(X.shape[0]):
+ # For each user obtain the index of rated movies
+ idx = np.asarray(np.where(Xtr[u] != 0))[0].tolist()
+
+ # extract a random subset of size n from the set of rated movies without repetition
+ idx_tst = np.random.choice(idx, tst[u], replace=False)
+ idx_train = list(set(idx).difference(set(idx_tst)))
+
+ # change the selected rated movies to unrated in the train set
+ Xtr[u, idx_tst] = 0
+ # zero out, in the test set, the movies kept in the train set
+ Xtst[u, idx_train] = 0
+
+ del idx, idx_train, idx_tst
+
+ return Xtr, Xtst
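+
+# Illustrative sketch (not part of the original module): the two returned matrices have the
+# same shape as the input, and each non-zero entry of X ends up in exactly one of them.
+#
+#   >>> import numpy as np
+#   >>> X = np.array([[5, 3, 0, 1], [4, 0, 0, 1], [1, 1, 0, 5]])
+#   >>> Xtr, Xtst = numpy_stratified_split(X, ratio=0.75, seed=42)
+#   >>> Xtr.shape == Xtst.shape == X.shape
+#   True
+#   >>> np.array_equal(Xtr + Xtst, X)
+#   True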
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+
+try:
+ from pyspark.sql import functions as F, Window
+ from pyspark.storagelevel import StorageLevel
+except ImportError:
+ pass # skip this import if we are in pure python environment
+
+from recommenders.utils.constants import (
+ DEFAULT_ITEM_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_TIMESTAMP_COL,
+)
+from recommenders.datasets.split_utils import (
+ process_split_ratio,
+ min_rating_filter_spark,
+)
+
+
+[docs]def spark_random_split(data, ratio=0.75, seed=42):
+ """Spark random splitter.
+
+ Randomly split the data into several splits.
+
+ Args:
+ data (pyspark.sql.DataFrame): Spark DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list
+ is provided and the ratios are not summed to 1, they will be normalized.
+ seed (int): Seed.
+
+ Returns:
+ list: Splits of the input data as pyspark.sql.DataFrame.
+ """
+ multi_split, ratio = process_split_ratio(ratio)
+
+ if multi_split:
+ return data.randomSplit(ratio, seed=seed)
+ else:
+ return data.randomSplit([ratio, 1 - ratio], seed=seed)
+
+
+def _do_stratification_spark(
+ data,
+ ratio=0.75,
+ min_rating=1,
+ filter_by="user",
+ is_partitioned=True,
+ is_random=True,
+ seed=42,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+):
+ """Helper function to perform stratified splits.
+
+ This function splits data in a stratified manner. That is, the same values for the
+ filter_by column are retained in each split, but the corresponding set of entries
+ are divided according to the ratio provided.
+
+ Args:
+ data (pyspark.sql.DataFrame): Spark DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to filter
+ with min_rating.
+ is_partitioned (bool): flag to partition data by filter_by column
+ is_random (bool): flag to make split randomly or use timestamp column
+ seed (int): Seed.
+ col_user (str): column name of user IDs.
+ col_item (str): column name of item IDs.
+ col_timestamp (str): column name of timestamps.
+
+ Returns:
+ list: Splits of the input data as pyspark.sql.DataFrame.
+ """
+ # A few preliminary checks.
+ if filter_by not in ["user", "item"]:
+ raise ValueError("filter_by should be either 'user' or 'item'.")
+
+ if min_rating < 1:
+ raise ValueError("min_rating should be integer and larger than or equal to 1.")
+
+ if col_user not in data.columns:
+ raise ValueError("Schema of data not valid. Missing User Col")
+
+ if col_item not in data.columns:
+ raise ValueError("Schema of data not valid. Missing Item Col")
+
+ if not is_random:
+ if col_timestamp not in data.columns:
+ raise ValueError("Schema of data not valid. Missing Timestamp Col")
+
+ if min_rating > 1:
+ data = min_rating_filter_spark(
+ data=data,
+ min_rating=min_rating,
+ filter_by=filter_by,
+ col_user=col_user,
+ col_item=col_item,
+ )
+
+ split_by = col_user if filter_by == "user" else col_item
+ partition_by = split_by if is_partitioned else []
+
+ col_random = "_random"
+ if is_random:
+ data = data.withColumn(col_random, F.rand(seed=seed))
+ order_by = F.col(col_random)
+ else:
+ order_by = F.col(col_timestamp)
+
+ window_count = Window.partitionBy(partition_by)
+ window_spec = Window.partitionBy(partition_by).orderBy(order_by)
+
+ data = (
+ data.withColumn("_count", F.count(split_by).over(window_count))
+ .withColumn("_rank", F.row_number().over(window_spec) / F.col("_count"))
+ .drop("_count", col_random)
+ )
+ # Persist to avoid duplicate rows in splits caused by lazy evaluation
+ data.persist(StorageLevel.MEMORY_AND_DISK_2).count()
+
+ multi_split, ratio = process_split_ratio(ratio)
+ ratio = ratio if multi_split else [ratio, 1 - ratio]
+
+ splits = []
+ prev_split = None
+ for split in np.cumsum(ratio):
+ condition = F.col("_rank") <= split
+ if prev_split is not None:
+ condition &= F.col("_rank") > prev_split
+ splits.append(data.filter(condition).drop("_rank"))
+ prev_split = split
+
+ return splits
+
+
+[docs]def spark_chrono_split(
+ data,
+ ratio=0.75,
+ min_rating=1,
+ filter_by="user",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+ no_partition=False,
+):
+ """Spark chronological splitter.
+
+ This function splits data in a chronological manner. That is, for each user / item, the
+ split function takes proportions of ratings which is specified by the split ratio(s).
+ The split is stratified.
+
+ Args:
+ data (pyspark.sql.DataFrame): Spark DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to filter
+ with min_rating.
+ col_user (str): column name of user IDs.
+ col_item (str): column name of item IDs.
+ col_timestamp (str): column name of timestamps.
+ no_partition (bool): set to enable more accurate and less efficient splitting.
+
+ Returns:
+ list: Splits of the input data as pyspark.sql.DataFrame.
+ """
+
+ return _do_stratification_spark(
+ data=data,
+ ratio=ratio,
+ min_rating=min_rating,
+ filter_by=filter_by,
+ is_random=False,
+ col_user=col_user,
+ col_item=col_item,
+ col_timestamp=col_timestamp,
+ )
+
+
+[docs]def spark_stratified_split(
+ data,
+ ratio=0.75,
+ min_rating=1,
+ filter_by="user",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ seed=42,
+):
+ """Spark stratified splitter.
+
+ For each user / item, the split function takes proportions of ratings which is
+ specified by the split ratio(s). The split is stratified.
+
+ Args:
+ data (pyspark.sql.DataFrame): Spark DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ seed (int): Seed.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to filter
+ with min_rating.
+ col_user (str): column name of user IDs.
+ col_item (str): column name of item IDs.
+
+ Returns:
+ list: Splits of the input data as pyspark.sql.DataFrame.
+ """
+ return _do_stratification_spark(
+ data=data,
+ ratio=ratio,
+ min_rating=min_rating,
+ filter_by=filter_by,
+ seed=seed,
+ col_user=col_user,
+ col_item=col_item,
+ )
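+
+# Illustrative usage sketch (not part of the original module); `spark_ratings_df` stands for
+# any pyspark.sql.DataFrame of ratings with the default userID and itemID columns.
+#
+#   >>> train, test = spark_stratified_split(spark_ratings_df, ratio=0.75, seed=42)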
+
+
+[docs]def spark_timestamp_split(
+ data,
+ ratio=0.75,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+):
+ """Spark timestamp based splitter.
+
+ The splitter splits the data into sets by timestamps without stratification on either user or item.
+ The ratios are applied on the timestamp column which is divided accordingly into several partitions.
+
+ Args:
+ data (pyspark.sql.DataFrame): Spark DataFrame to be split.
+ ratio (float or list): Ratio for splitting data. If it is a single float number
+ it splits data into two sets and the ratio argument indicates the ratio of
+ training data set; if it is a list of float numbers, the splitter splits
+ data into several portions corresponding to the split ratios. If a list is
+ provided and the ratios are not summed to 1, they will be normalized.
+ Earlier indexed splits will have earlier times
+ (e.g. the latest time in split[0] <= the earliest time in split[1])
+ col_user (str): column name of user IDs.
+ col_item (str): column name of item IDs.
+ col_timestamp (str): column name of timestamps. Float number represented in
+ seconds since Epoch.
+
+ Returns:
+ list: Splits of the input data as pyspark.sql.DataFrame.
+ """
+ return _do_stratification_spark(
+ data=data,
+ ratio=ratio,
+ is_random=False,
+ is_partitioned=False,
+ col_user=col_user,
+ col_item=col_item,
+ col_timestamp=col_timestamp,
+ )
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import pandas as pd
+import numpy as np
+import itertools
+
+from scipy.sparse import coo_matrix
+import logging
+
+# import default parameters
+from recommenders.utils.constants import (
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_PREDICTION_COL,
+)
+
+
+log = logging.getLogger(__name__)
+
+
+[docs]class AffinityMatrix:
+ """Generate the user/item affinity matrix from a pandas dataframe and vice versa"""
+
+ def __init__(
+ self,
+ df,
+ items_list=None,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_pred=DEFAULT_PREDICTION_COL,
+ save_path=None,
+ ):
+ """Initialize class parameters
+
+ Args:
+ df (pandas.DataFrame): a dataframe containing the data
+ items_list (numpy.ndarray): a list of unique items to use (if provided)
+ col_user (str): default name for user column
+ col_item (str): default name for item column
+ col_rating (str): default name for rating columns
+ col_pred (str): default name for prediction column
+ save_path (str): default path to save item/user maps
+ """
+ self.df = df # dataframe
+ self.items_list = items_list # list of unique items
+
+ # pandas DF parameters
+ self.col_item = col_item
+ self.col_user = col_user
+ self.col_rating = col_rating
+ self.col_pred = col_pred
+
+ # Options to save the model for future use
+ self.save_path = save_path
+
+ def _gen_index(self):
+ """
+ Generate the user/item index:
+ map_users, map_items: dictionaries mapping the original user/item index to matrix indices
+ map_back_users, map_back_items: dictionaries to map back the matrix elements to the original
+ dataframe indices
+
+ Basic mechanics:
+ As a first step we retrieve the unique elements in the dataset. In this way we can take care
+ of either completely missing rows (a user with no ratings) or completely missing columns
+ (an item that has not been reviewed by anyone). The original indices in the dataframe are
+ then mapped to an ordered, contiguous integer series to generate a compact matrix representation.
+ Functions to map back to the original indices are also provided and can be saved in order to use
+ a pretrained model.
+ """
+ # sort entries by user index
+ self.df_ = self.df.sort_values(by=[self.col_user])
+
+ # find unique user and item index
+ unique_users = self.df_[self.col_user].unique()
+
+ if self.items_list is not None:
+ unique_items = self.items_list # use this list if provided
+ else:
+ unique_items = self.df_[
+ self.col_item
+ ].unique() # otherwise use unique items from DF
+
+ self.Nusers = len(unique_users)
+ self.Nitems = len(unique_items)
+
+ # create a dictionary to map unique users/items to hashed values to generate the matrix
+ self.map_users = {x: i for i, x in enumerate(unique_users)}
+ self.map_items = {x: i for i, x in enumerate(unique_items)}
+
+ # map back functions used to get back the original dataframe
+ self.map_back_users = {i: x for i, x in enumerate(unique_users)}
+ self.map_back_items = {i: x for i, x in enumerate(unique_items)}
+
+ self.df_.loc[:, "hashedItems"] = self.df_[self.col_item].map(self.map_items)
+ self.df_.loc[:, "hashedUsers"] = self.df_[self.col_user].map(self.map_users)
+
+ # optionally save the inverse dictionary to work with trained models
+ if self.save_path is not None:
+
+ np.save(self.save_path + "/user_dict", self.map_users)
+ np.save(self.save_path + "/item_dict", self.map_items)
+
+ np.save(self.save_path + "/user_back_dict", self.map_back_users)
+ np.save(self.save_path + "/item_back_dict", self.map_back_items)
+
+[docs] def gen_affinity_matrix(self):
+ """Generate the user/item affinity matrix.
+
+ As a first step, two new columns are added to the input DF, containing the index maps
+ generated by the gen_index() method. The new indices, together with the ratings, are
+ then used to generate the user/item affinity matrix using scipy's sparse matrix method
+ coo_matrix; for reference see:
+ https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html.
+ The input format is: `coo_matrix((data, (rows, columns)), shape=(rows, columns))`
+
+ Returns:
+ numpy.ndarray, dict, dict: Dense user/item affinity matrix of dimensions (Nusers, Nitems),
+ the user index map, and the item index map. Unrated movies are assigned a value of 0.
+ """
+
+ log.info("Generating the user/item affinity matrix...")
+
+ self._gen_index()
+
+ ratings = self.df_[self.col_rating] # ratings
+ itm_id = self.df_["hashedItems"] # itm_id serving as columns
+ usr_id = self.df_["hashedUsers"] # usr_id serving as rows
+
+ # generate a sparse matrix representation using scipy's coo_matrix and convert to array format
+ self.AM = coo_matrix(
+ (ratings, (usr_id, itm_id)), shape=(self.Nusers, self.Nitems)
+ ).toarray()
+
+ zero = (self.AM == 0).sum() # number of unrated items
+ total = self.AM.shape[0] * self.AM.shape[1] # number of elements in the matrix
+ sparsity = zero / total * 100 # Percentage of zeros in the matrix
+
+ log.info("Matrix generated, sparsity percentage: %d" % sparsity)
+
+ return self.AM, self.map_users, self.map_items
+
+[docs] def map_back_sparse(self, X, kind):
+ """Map back the user/item affinity matrix to a pandas DataFrame
+
+ Args:
+ X (numpy.ndarray, int32): user/item affinity matrix
+ kind (string): specify if the output values are ratings or predictions
+ Returns:
+ pandas.DataFrame: the generated pandas dataframe
+ """
+ m, n = X.shape
+
+ # 1) Create a DF from a sparse matrix
+ # obtain the non zero items
+ items = [np.asanyarray(np.where(X[i, :] != 0)).flatten() for i in range(m)]
+ ratings = [X[i, items[i]] for i in range(m)] # obtain the non-zero ratings
+
+ # Creates user ids following the DF format
+ userids = []
+ for i in range(0, m):
+ userids.extend([i] * len(items[i]))
+
+ # Flatten the lists to follow the DF input format
+ items = list(itertools.chain.from_iterable(items))
+ ratings = list(itertools.chain.from_iterable(ratings))
+
+ if kind == "ratings":
+ col_out = self.col_rating
+ else:
+ col_out = self.col_pred
+
+ # create a df
+ out_df = pd.DataFrame.from_dict(
+ {self.col_user: userids, self.col_item: items, col_out: ratings}
+ )
+
+ # 2) map back user/item ids to their original value
+
+ out_df[self.col_user] = out_df[self.col_user].map(self.map_back_users)
+ out_df[self.col_item] = out_df[self.col_item].map(self.map_back_items)
+
+ return out_df
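+
+# Illustrative usage sketch (not part of the original module); `ratings_df` is assumed to use
+# the default userID/itemID/rating column names.
+#
+#   >>> am = AffinityMatrix(df=ratings_df)
+#   >>> X, user_map, item_map = am.gen_affinity_matrix()
+#   >>> df_back = am.map_back_sparse(X, kind="ratings")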
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import math
+import logging
+
+from recommenders.utils.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL
+
+logger = logging.getLogger(__name__)
+
+try:
+ from pyspark.sql import functions as F, Window
+except ImportError:
+ pass # so the environment without spark doesn't break
+
+
+[docs]def process_split_ratio(ratio):
+ """Generate split ratio lists.
+
+ Args:
+ ratio (float or list): a float number that indicates split ratio or a list of float
+ numbers that indicate split ratios (if it is a multi-split).
+
+ Returns:
+ tuple:
+ - bool: A boolean variable multi that indicates if the splitting is multi or single.
+ - list: A list of normalized split ratios.
+ """
+ if isinstance(ratio, float):
+ if ratio <= 0 or ratio >= 1:
+ raise ValueError("Split ratio has to be between 0 and 1")
+
+ multi = False
+ elif isinstance(ratio, list):
+ if any([x <= 0 for x in ratio]):
+ raise ValueError(
+ "All split ratios in the ratio list should be larger than 0."
+ )
+
+ # normalize split ratios if they are not summed to 1
+ if math.fsum(ratio) != 1.0:
+ ratio = [x / math.fsum(ratio) for x in ratio]
+
+ multi = True
+ else:
+ raise TypeError("Split ratio should be either float or a list of floats.")
+
+ return multi, ratio
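+
+# Illustrative sketch (not part of the original module): a float passes through unchanged,
+# while a list of weights is normalized so it sums to 1.
+#
+#   >>> process_split_ratio(0.75)
+#   (False, 0.75)
+#   >>> process_split_ratio([3, 1])
+#   (True, [0.75, 0.25])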
+
+
+[docs]def min_rating_filter_pandas(
+ data,
+ min_rating=1,
+ filter_by="user",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+):
+ """Filter rating DataFrame for each user with minimum rating.
+
+ Filtering the rating data frame by a minimum number of ratings per user/item is usually useful to
+ generate a new data frame with warm users/items. The warmth is defined by the min_rating argument. For
+ example, a user is called warm if they have rated at least 4 items.
+
+ Args:
+ data (pandas.DataFrame): DataFrame of user-item tuples. Columns of user and item
+ should be present in the DataFrame while other columns like rating,
+ timestamp, etc. can be optional.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to
+ filter with min_rating.
+ col_user (str): column name of user ID.
+ col_item (str): column name of item ID.
+
+ Returns:
+ pandas.DataFrame: DataFrame with at least columns of user and item that has been filtered by the given specifications.
+ """
+ split_by_column = _get_column_name(filter_by, col_user, col_item)
+
+ if min_rating < 1:
+ raise ValueError("min_rating should be integer and larger than or equal to 1.")
+
+ return data.groupby(split_by_column).filter(lambda x: len(x) >= min_rating)
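+
+# Illustrative sketch (not part of the original module): keep only users with at least two ratings.
+#
+#   >>> import pandas as pd
+#   >>> df = pd.DataFrame({"userID": [1, 1, 2], "itemID": [10, 11, 10]})
+#   >>> min_rating_filter_pandas(df, min_rating=2, filter_by="user").values.tolist()
+#   [[1, 10], [1, 11]]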
+
+
+[docs]def min_rating_filter_spark(
+ data,
+ min_rating=1,
+ filter_by="user",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+):
+ """Filter rating DataFrame for each user with minimum rating.
+
+ Filtering the rating data frame by a minimum number of ratings per user/item is usually useful to
+ generate a new data frame with warm users/items. The warmth is defined by the min_rating argument. For
+ example, a user is called warm if they have rated at least 4 items.
+
+ Args:
+ data (pyspark.sql.DataFrame): DataFrame of user-item tuples. Columns of user and item
+ should be present in the DataFrame while other columns like rating,
+ timestamp, etc. can be optional.
+ min_rating (int): minimum number of ratings for user or item.
+ filter_by (str): either "user" or "item", depending on which of the two is to
+ filter with min_rating.
+ col_user (str): column name of user ID.
+ col_item (str): column name of item ID.
+
+ Returns:
+ pyspark.sql.DataFrame: DataFrame with at least columns of user and item that has been filtered by the given specifications.
+ """
+
+ split_by_column = _get_column_name(filter_by, col_user, col_item)
+
+ if min_rating < 1:
+ raise ValueError("min_rating should be integer and larger than or equal to 1.")
+
+ if min_rating > 1:
+ window = Window.partitionBy(split_by_column)
+ data = (
+ data.withColumn("_count", F.count(split_by_column).over(window))
+ .where(F.col("_count") >= min_rating)
+ .drop("_count")
+ )
+
+ return data
+
+
+def _get_column_name(name, col_user, col_item):
+ if name == "user":
+ return col_user
+ elif name == "item":
+ return col_item
+ else:
+ raise ValueError("name should be either 'user' or 'item'.")
+
+
+[docs]def split_pandas_data_with_ratios(data, ratios, seed=42, shuffle=False):
+ """Helper function to split pandas DataFrame with given ratios
+
+ Note:
+ Implementation referenced from `this source <https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test>`_.
+
+ Args:
+ data (pandas.DataFrame): Pandas data frame to be split.
+ ratios (list of floats): list of ratios for split. The ratios have to sum to 1.
+ seed (int): random seed.
+ shuffle (bool): whether data will be shuffled when being split.
+
+ Returns:
+ list: List of pd.DataFrame split by the given specifications.
+ """
+ if math.fsum(ratios) != 1.0:
+ raise ValueError("The ratios have to sum to 1")
+
+ split_index = np.cumsum(ratios).tolist()[:-1]
+
+ if shuffle:
+ data = data.sample(frac=1, random_state=seed)
+
+ splits = np.split(data, [round(x * len(data)) for x in split_index])
+
+ # Add split index (this makes splitting by group more efficient).
+ for i in range(len(ratios)):
+ splits[i]["split_index"] = i
+
+ return splits
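+
+# Illustrative sketch (not part of the original module): each returned split carries a
+# `split_index` column identifying the portion it belongs to.
+#
+#   >>> import pandas as pd
+#   >>> df = pd.DataFrame({"x": range(10)})
+#   >>> splits = split_pandas_data_with_ratios(df, [0.6, 0.2, 0.2])
+#   >>> [len(s) for s in splits]
+#   [6, 2, 2]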
+
+
+[docs]def filter_k_core(data, core_num=0, col_user="userID", col_item="itemID"):
+ """Filter the rating dataframe so that every user and every item has at least
+ `core_num` interactions, by repeatedly applying min_rating_filter_pandas until
+ the condition is satisfied.
+
+ """
+ num_users, num_items = len(data[col_user].unique()), len(data[col_item].unique())
+ logger.info("Original: %d users and %d items", num_users, num_items)
+ df_inp = data.copy()
+
+ if core_num > 0:
+ while True:
+ df_inp = min_rating_filter_pandas(
+ df_inp, min_rating=core_num, filter_by="item"
+ )
+ df_inp = min_rating_filter_pandas(
+ df_inp, min_rating=core_num, filter_by="user"
+ )
+ count_u = df_inp.groupby(col_user)[col_item].count()
+ count_i = df_inp.groupby(col_item)[col_user].count()
+ if (
+ len(count_i[count_i < core_num]) == 0
+ and len(count_u[count_u < core_num]) == 0
+ ):
+ break
+ df_inp = df_inp.sort_values(by=[col_user])
+ num_users = len(df_inp[col_user].unique())
+ num_items = len(df_inp[col_item].unique())
+ logger.info("Final: %d users and %d items", num_users, num_items)
+
+ return df_inp
+
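+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). It shows a 2-core filter on a toy interaction frame.
+def _example_filter_k_core():
+    """Minimal usage sketch: keep only users and items with at least 2 interactions."""
+    import pandas as pd  # local import keeps the sketch self-contained
+
+    ratings = pd.DataFrame(
+        {"userID": [1, 1, 2, 2, 3], "itemID": [10, 20, 10, 20, 30]}
+    )
+    # User 3 and item 30 each appear only once, so that row is removed.
+    return filter_k_core(ratings, core_num=2)
+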
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import pandas as pd
+import requests
+import logging
+from retrying import retry
+
+
+logger = logging.getLogger(__name__)
+
+
+API_URL_WIKIPEDIA = "https://en.wikipedia.org/w/api.php"
+API_URL_WIKIDATA = "https://query.wikidata.org/sparql"
+SESSION = None
+
+
+[docs]def get_session(session=None):
+ """Get session object
+
+ Args:
+ session (requests.Session): request session object
+
+ Returns:
+ requests.Session: request session object
+ """
+
+ if session is None:
+ global SESSION
+ if SESSION is None:
+ SESSION = requests.Session()
+ session = SESSION
+
+ return session
+
+
+[docs]@retry(wait_random_min=1000, wait_random_max=5000, stop_max_attempt_number=5)
+def find_wikidata_id(name, limit=1, session=None):
+ """Find the entity ID in wikidata from a title string.
+
+ Args:
+        name (str): A string with search terms (e.g. "Batman (1989) film")
+ limit (int): Number of results to return
+ session (requests.Session): requests session to reuse connections
+
+ Returns:
+ str: wikidata entityID corresponding to the title string. 'entityNotFound' will be returned if no page is found
+ """
+
+ session = get_session(session=session)
+
+ params = dict(
+ action="query",
+ list="search",
+ srsearch=bytes(name, encoding="utf8"),
+ srlimit=limit,
+ srprop="",
+ format="json",
+ )
+
+ try:
+ response = session.get(API_URL_WIKIPEDIA, params=params)
+ page_id = response.json()["query"]["search"][0]["pageid"]
+ except Exception:
+ # TODO: distinguish between connection error and entity not found
+ logger.warning("ENTITY NOT FOUND")
+ return "entityNotFound"
+
+ params = dict(
+ action="query",
+ prop="pageprops",
+ ppprop="wikibase_item",
+ pageids=[page_id],
+ format="json",
+ )
+
+ try:
+ response = session.get(API_URL_WIKIPEDIA, params=params)
+ entity_id = response.json()["query"]["pages"][str(page_id)]["pageprops"][
+ "wikibase_item"
+ ]
+ except Exception:
+ # TODO: distinguish between connection error and entity not found
+ logger.warning("ENTITY NOT FOUND")
+ return "entityNotFound"
+
+ return entity_id
+
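+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). It performs live HTTP requests to the Wikipedia API, so it needs
+# network access; the title string is arbitrary.
+def _example_find_wikidata_id():
+    """Minimal usage sketch: look up the Wikidata entity ID for a film title."""
+    # Returns an ID such as "Q..." on success, or "entityNotFound" otherwise.
+    return find_wikidata_id("The Lego Movie")
+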
+
+[docs]@retry(wait_random_min=1000, wait_random_max=5000, stop_max_attempt_number=5)
+def query_entity_links(entity_id, session=None):
+ """Query all linked pages from a wikidata entityID
+
+ Args:
+ entity_id (str): A wikidata entity ID
+ session (requests.Session): requests session to reuse connections
+
+ Returns:
+ json: Dictionary with linked pages.
+ """
+ query = (
+ """
+ PREFIX entity: <http://www.wikidata.org/entity/>
+ #partial results
+
+ SELECT ?propUrl ?propLabel ?valUrl ?valLabel
+ WHERE
+ {
+ hint:Query hint:optimizer 'None' .
+ { BIND(entity:"""
+ + entity_id
+ + """ AS ?valUrl) .
+ BIND("N/A" AS ?propUrl ) .
+ BIND("identity"@en AS ?propLabel ) .
+ }
+ UNION
+ { entity:"""
+ + entity_id
+ + """ ?propUrl ?valUrl .
+ ?property ?ref ?propUrl .
+ ?property rdf:type wikibase:Property .
+ ?property rdfs:label ?propLabel
+ }
+
+ ?valUrl rdfs:label ?valLabel
+ FILTER (LANG(?valLabel) = 'en') .
+ OPTIONAL{ ?valUrl wdt:P18 ?picture .}
+ FILTER (lang(?propLabel) = 'en' )
+ }
+ ORDER BY ?propUrl ?valUrl
+ LIMIT 500
+ """
+ )
+
+ session = get_session(session=session)
+
+ try:
+ data = session.get(
+ API_URL_WIKIDATA, params=dict(query=query, format="json")
+ ).json()
+ except Exception as e: # noqa: F841
+ logger.warning("ENTITY NOT FOUND")
+ return {}
+
+ return data
+
+
+[docs]def read_linked_entities(data):
+ """Obtain lists of liken entities (IDs and names) from dictionary
+
+ Args:
+ data (json): dictionary with linked pages
+
+ Returns:
+ list, list:
+            - List of linked entityIDs.
+            - List of linked entity names.
+ """
+
+ return [
+ (
+ c.get("valUrl").get("value").replace("http://www.wikidata.org/entity/", ""),
+ c.get("valLabel").get("value"),
+ )
+ for c in data.get("results", {}).get("bindings", [])
+ ]
+
+
+[docs]@retry(wait_random_min=1000, wait_random_max=5000, stop_max_attempt_number=5)
+def query_entity_description(entity_id, session=None):
+ """Query entity wikidata description from entityID
+
+ Args:
+ entity_id (str): A wikidata page ID.
+ session (requests.Session): requests session to reuse connections
+
+ Returns:
+        str: Wikidata short description of the entityID.
+        'descriptionNotFound' will be returned if no description is found.
+ """
+ query = (
+ """
+ PREFIX wd: <http://www.wikidata.org/entity/>
+ PREFIX schema: <http://schema.org/>
+
+ SELECT ?o
+ WHERE
+ {
+ wd:"""
+ + entity_id
+ + """ schema:description ?o.
+ FILTER ( lang(?o) = "en" )
+ }
+ """
+ )
+
+ session = get_session(session=session)
+
+ try:
+ r = session.get(API_URL_WIKIDATA, params=dict(query=query, format="json"))
+ description = r.json()["results"]["bindings"][0]["o"]["value"]
+ except Exception as e: # noqa: F841
+ logger.warning("DESCRIPTION NOT FOUND")
+ return "descriptionNotFound"
+
+ return description
+
+
+[docs]def search_wikidata(names, extras=None, describe=True, verbose=False):
+ """Create DataFrame of Wikidata search results
+
+ Args:
+ names (list[str]): List of names to search for
+ extras (dict(str: list)): Optional extra items to assign to results for corresponding name
+ describe (bool): Optional flag to include description of entity
+ verbose (bool): Optional flag to print out intermediate data
+
+ Returns:
+ pandas.DataFrame: Wikipedia results for all names with found entities
+
+ """
+
+ results = []
+ for idx, name in enumerate(names):
+ entity_id = find_wikidata_id(name)
+ if verbose:
+ print("name: {name}, entity_id: {id}".format(name=name, id=entity_id))
+
+ if entity_id == "entityNotFound":
+ continue
+
+ json_links = query_entity_links(entity_id)
+ related_links = read_linked_entities(json_links)
+ description = query_entity_description(entity_id) if describe else ""
+
+ for related_entity, related_name in related_links:
+ result = dict(
+ name=name,
+ original_entity=entity_id,
+ linked_entities=related_entity,
+ name_linked_entities=related_name,
+ )
+ if describe:
+ result["description"] = description
+ if extras is not None:
+ for field, lst in extras.items():
+ result[field] = lst[idx]
+ results.append(result)
+
+ return pd.DataFrame(results)
+
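+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). It chains find_wikidata_id, query_entity_links, read_linked_entities and
+# query_entity_description via search_wikidata, so it needs network access.
+def _example_search_wikidata():
+    """Minimal usage sketch: collect linked entities and descriptions for one title."""
+    # One row per linked entity, with columns such as name, original_entity,
+    # linked_entities, name_linked_entities and description.
+    return search_wikidata(["The Godfather"], describe=True, verbose=False)
+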
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+from functools import wraps
+from sklearn.metrics import (
+ mean_squared_error,
+ mean_absolute_error,
+ r2_score,
+ explained_variance_score,
+ roc_auc_score,
+ log_loss,
+)
+
+from recommenders.utils.constants import (
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_PREDICTION_COL,
+ DEFAULT_RELEVANCE_COL,
+ DEFAULT_SIMILARITY_COL,
+ DEFAULT_ITEM_FEATURES_COL,
+ DEFAULT_ITEM_SIM_MEASURE,
+ DEFAULT_K,
+ DEFAULT_THRESHOLD,
+)
+from recommenders.datasets.pandas_df_utils import (
+ has_columns,
+ has_same_base_dtype,
+ lru_cache_df,
+)
+
+
+[docs]class ColumnMismatchError(Exception):
+ """Exception raised when there is a mismatch in columns.
+
+ This exception is raised when an operation involving columns
+ encounters a mismatch or inconsistency.
+
+ Attributes:
+ message (str): Explanation of the error.
+ """
+
+ pass
+
+
+[docs]class ColumnTypeMismatchError(Exception):
+ """Exception raised when there is a mismatch in column types.
+
+ This exception is raised when an operation involving column types
+ encounters a mismatch or inconsistency.
+
+ Attributes:
+ message (str): Explanation of the error.
+ """
+
+ pass
+
+
+def _check_column_dtypes(func):
+ """Checks columns of DataFrame inputs
+
+ This includes the checks on:
+
+ * whether the input columns exist in the input DataFrames
+ * whether the data types of col_user as well as col_item are matched in the two input DataFrames.
+
+ Args:
+ func (function): function that will be wrapped
+
+ Returns:
+ function: Wrapper function for checking dtypes.
+ """
+
+ @wraps(func)
+ def check_column_dtypes_wrapper(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ *args,
+ **kwargs,
+ ):
+ """Check columns of DataFrame inputs
+
+ Args:
+ rating_true (pandas.DataFrame): True data
+ rating_pred (pandas.DataFrame): Predicted data
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+ """
+ # Some ranking metrics don't have the rating column, so we don't need to check.
+ expected_true_columns = {col_user, col_item}
+ if "col_rating" in kwargs:
+ expected_true_columns.add(kwargs["col_rating"])
+ if not has_columns(rating_true, expected_true_columns):
+ raise ColumnMismatchError("Missing columns in true rating DataFrame")
+
+ if not has_columns(rating_pred, {col_user, col_item, col_prediction}):
+ raise ColumnMismatchError("Missing columns in predicted rating DataFrame")
+
+ if not has_same_base_dtype(
+ rating_true, rating_pred, columns=[col_user, col_item]
+ ):
+ raise ColumnTypeMismatchError(
+ "Columns in provided DataFrames are not the same datatype"
+ )
+
+ return func(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ *args,
+ **kwargs,
+ )
+
+ return check_column_dtypes_wrapper
+
+
+[docs]@_check_column_dtypes
+@lru_cache_df(maxsize=1)
+def merge_rating_true_pred(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Join truth and prediction data frames on userID and itemID and return the true
+    and predicted ratings with the correct index.
+
+ Args:
+ rating_true (pandas.DataFrame): True data
+ rating_pred (pandas.DataFrame): Predicted data
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+ numpy.ndarray: Array with the true ratings
+ numpy.ndarray: Array with the predicted ratings
+
+ """
+
+ # pd.merge will apply suffixes to columns which have the same name across both dataframes
+ suffixes = ["_true", "_pred"]
+ rating_true_pred = pd.merge(
+ rating_true, rating_pred, on=[col_user, col_item], suffixes=suffixes
+ )
+ if col_rating in rating_pred.columns:
+ col_rating = col_rating + suffixes[0]
+ if col_prediction in rating_true.columns:
+ col_prediction = col_prediction + suffixes[1]
+ return rating_true_pred[col_rating], rating_true_pred[col_prediction]
+
+
+[docs]def rmse(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Calculate Root Mean Squared Error
+
+ Args:
+ rating_true (pandas.DataFrame): True data. There should be no duplicate (userID, itemID) pairs
+ rating_pred (pandas.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+ float: Root mean squared error
+ """
+
+ y_true, y_pred = merge_rating_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_rating=col_rating,
+ col_prediction=col_prediction,
+ )
+ return np.sqrt(mean_squared_error(y_true, y_pred))
+
+
+[docs]def mae(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Calculate Mean Absolute Error.
+
+ Args:
+ rating_true (pandas.DataFrame): True data. There should be no duplicate (userID, itemID) pairs
+ rating_pred (pandas.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+ float: Mean Absolute Error.
+ """
+
+ y_true, y_pred = merge_rating_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_rating=col_rating,
+ col_prediction=col_prediction,
+ )
+ return mean_absolute_error(y_true, y_pred)
+
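+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). It assumes the default column names userID/itemID/rating/prediction.
+def _example_rating_metrics():
+    """Minimal usage sketch: RMSE and MAE on three overlapping user-item pairs."""
+    rating_true = pd.DataFrame(
+        {"userID": [1, 1, 2], "itemID": [1, 2, 1], "rating": [5, 3, 4]}
+    )
+    rating_pred = pd.DataFrame(
+        {"userID": [1, 1, 2], "itemID": [1, 2, 1], "prediction": [4.5, 3.0, 4.0]}
+    )
+    # Only the first pair differs (by 0.5), so RMSE ~= 0.29 and MAE ~= 0.17.
+    return rmse(rating_true, rating_pred), mae(rating_true, rating_pred)
+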
+
+[docs]def rsquared(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Calculate R squared
+
+ Args:
+ rating_true (pandas.DataFrame): True data. There should be no duplicate (userID, itemID) pairs
+ rating_pred (pandas.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+        float: R squared (max=1; can be negative when the model fits worse than predicting the mean).
+ """
+
+ y_true, y_pred = merge_rating_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_rating=col_rating,
+ col_prediction=col_prediction,
+ )
+ return r2_score(y_true, y_pred)
+
+
+[docs]def exp_var(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Calculate explained variance.
+
+ Args:
+ rating_true (pandas.DataFrame): True data. There should be no duplicate (userID, itemID) pairs
+ rating_pred (pandas.DataFrame): Predicted data. There should be no duplicate (userID, itemID) pairs
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+ float: Explained variance (min=0, max=1).
+ """
+
+ y_true, y_pred = merge_rating_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_rating=col_rating,
+ col_prediction=col_prediction,
+ )
+ return explained_variance_score(y_true, y_pred)
+
+
+[docs]def auc(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Calculate the Area-Under-Curve metric for implicit feedback typed
+ recommender, where rating is binary and prediction is float number ranging
+ from 0 to 1.
+
+ https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
+
+ Note:
+ The evaluation does not require a leave-one-out scenario.
+ This metric does not calculate group-based AUC which considers the AUC scores
+ averaged across users. It is also not limited to k. Instead, it calculates the
+        scores on the entire prediction results regardless of the users.
+
+ Args:
+ rating_true (pandas.DataFrame): True data
+ rating_pred (pandas.DataFrame): Predicted data
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+ float: auc_score (min=0, max=1)
+ """
+
+ y_true, y_pred = merge_rating_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_rating=col_rating,
+ col_prediction=col_prediction,
+ )
+ return roc_auc_score(y_true, y_pred)
+
+
+[docs]def logloss(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+):
+ """Calculate the logloss metric for implicit feedback typed
+ recommender, where rating is binary and prediction is float number ranging
+ from 0 to 1.
+
+ https://en.wikipedia.org/wiki/Loss_functions_for_classification#Cross_entropy_loss_(Log_Loss)
+
+ Args:
+ rating_true (pandas.DataFrame): True data
+ rating_pred (pandas.DataFrame): Predicted data
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+
+ Returns:
+        float: log_loss_score (min=0, max=inf)
+ """
+
+ y_true, y_pred = merge_rating_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_rating=col_rating,
+ col_prediction=col_prediction,
+ )
+ return log_loss(y_true, y_pred)
+
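+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). Ratings are binary and predictions are probability-like scores, as the
+# docstrings above require.
+def _example_classification_metrics():
+    """Minimal usage sketch: AUC and log loss on four binary-labelled pairs."""
+    rating_true = pd.DataFrame(
+        {"userID": [1, 1, 2, 2], "itemID": [1, 2, 1, 2], "rating": [1, 0, 1, 0]}
+    )
+    rating_pred = pd.DataFrame(
+        {"userID": [1, 1, 2, 2], "itemID": [1, 2, 1, 2], "prediction": [0.9, 0.2, 0.7, 0.4]}
+    )
+    # Every positive is scored above every negative, so AUC is 1.0.
+    return auc(rating_true, rating_pred), logloss(rating_true, rating_pred)
+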
+
+[docs]@_check_column_dtypes
+@lru_cache_df(maxsize=1)
+def merge_ranking_true_pred(
+ rating_true,
+ rating_pred,
+ col_user,
+ col_item,
+ col_prediction,
+ relevancy_method,
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+ **_,
+):
+ """Filter truth and prediction data frames on common users
+
+ Args:
+ rating_true (pandas.DataFrame): True DataFrame
+ rating_pred (pandas.DataFrame): Predicted DataFrame
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_prediction (str): column name for prediction
+ relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+ top k items are directly provided, so there is no need to compute the relevancy operation.
+ k (int): number of top k items per user (optional)
+ threshold (float): threshold of top items per user (optional)
+
+ Returns:
+        pandas.DataFrame, pandas.DataFrame, int:
+        - DataFrame of recommendation hits, sorted by `col_user` and `rank`.
+        - DataFrame of hit counts vs actual relevant items per user.
+        - Number of unique user ids.
+ """
+
+ # Make sure the prediction and true data frames have the same set of users
+ common_users = set(rating_true[col_user]).intersection(set(rating_pred[col_user]))
+ rating_true_common = rating_true[rating_true[col_user].isin(common_users)]
+ rating_pred_common = rating_pred[rating_pred[col_user].isin(common_users)]
+ n_users = len(common_users)
+
+ # Return hit items in prediction data frame with ranking information. This is used for calculating NDCG and MAP.
+ # Use first to generate unique ranking values for each item. This is to align with the implementation in
+ # Spark evaluation metrics, where index of each recommended items (the indices are unique to items) is used
+ # to calculate penalized precision of the ordered items.
+ if relevancy_method == "top_k":
+ top_k = k
+ elif relevancy_method == "by_threshold":
+ top_k = threshold
+ elif relevancy_method is None:
+ top_k = None
+ else:
+ raise NotImplementedError("Invalid relevancy_method")
+ df_hit = get_top_k_items(
+ dataframe=rating_pred_common,
+ col_user=col_user,
+ col_rating=col_prediction,
+ k=top_k,
+ )
+ df_hit = pd.merge(df_hit, rating_true_common, on=[col_user, col_item])[
+ [col_user, col_item, "rank"]
+ ]
+
+ # count the number of hits vs actual relevant items per user
+ df_hit_count = pd.merge(
+ df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
+ rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
+ {"actual": "count"}
+ ),
+ on=col_user,
+ )
+
+ return df_hit, df_hit_count, n_users
+
+
+[docs]def precision_at_k(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ relevancy_method="top_k",
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+ **_,
+):
+ """Precision at K.
+
+ Note:
+ We use the same formula to calculate precision@k as that in Spark.
+ More details can be found at
+ http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.precisionAt
+ In particular, the maximum achievable precision may be < 1, if the number of items for a
+ user in rating_pred is less than k.
+
+ Args:
+ rating_true (pandas.DataFrame): True DataFrame
+ rating_pred (pandas.DataFrame): Predicted DataFrame
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_prediction (str): column name for prediction
+ relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+ top k items are directly provided, so there is no need to compute the relevancy operation.
+ k (int): number of top k items per user
+ threshold (float): threshold of top items per user (optional)
+
+ Returns:
+ float: precision at k (min=0, max=1)
+ """
+ df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ relevancy_method=relevancy_method,
+ k=k,
+ threshold=threshold,
+ )
+
+ if df_hit.shape[0] == 0:
+ return 0.0
+
+ return (df_hit_count["hit"] / k).sum() / n_users
+
+
+[docs]def recall_at_k(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ relevancy_method="top_k",
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+ **_,
+):
+ """Recall at K.
+
+ Args:
+ rating_true (pandas.DataFrame): True DataFrame
+ rating_pred (pandas.DataFrame): Predicted DataFrame
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_prediction (str): column name for prediction
+ relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+ top k items are directly provided, so there is no need to compute the relevancy operation.
+ k (int): number of top k items per user
+ threshold (float): threshold of top items per user (optional)
+
+ Returns:
+ float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than
+ k items exist for a user in rating_true.
+ """
+ df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ relevancy_method=relevancy_method,
+ k=k,
+ threshold=threshold,
+ )
+
+ if df_hit.shape[0] == 0:
+ return 0.0
+
+ return (df_hit_count["hit"] / df_hit_count["actual"]).sum() / n_users
+
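+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). One user has three relevant items, and the top-2 recommendation list
+# hits exactly one of them.
+def _example_precision_recall_at_k():
+    """Minimal usage sketch: precision@2 and recall@2 for a single user."""
+    rating_true = pd.DataFrame(
+        {"userID": [1, 1, 1], "itemID": [1, 2, 3], "rating": [1, 1, 1]}
+    )
+    rating_pred = pd.DataFrame(
+        {"userID": [1, 1, 1], "itemID": [1, 4, 2], "prediction": [0.9, 0.8, 0.7]}
+    )
+    # Top-2 recommendations are items 1 and 4; only item 1 is relevant,
+    # so precision@2 = 0.5 and recall@2 = 1/3.
+    return (
+        precision_at_k(rating_true, rating_pred, k=2),
+        recall_at_k(rating_true, rating_pred, k=2),
+    )
+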
+
+[docs]def ndcg_at_k(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ relevancy_method="top_k",
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+ score_type="binary",
+ discfun_type="loge",
+ **_,
+):
+ """Normalized Discounted Cumulative Gain (nDCG).
+
+ Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain
+
+ Args:
+ rating_true (pandas.DataFrame): True DataFrame
+ rating_pred (pandas.DataFrame): Predicted DataFrame
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_rating (str): column name for rating
+ col_prediction (str): column name for prediction
+ relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+ top k items are directly provided, so there is no need to compute the relevancy operation.
+ k (int): number of top k items per user
+ threshold (float): threshold of top items per user (optional)
+ score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the
+ relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score.
+ Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score
+ discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG.
+
+ Returns:
+ float: nDCG at k (min=0, max=1).
+ """
+ df_hit, _, _ = merge_ranking_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ relevancy_method=relevancy_method,
+ k=k,
+ threshold=threshold,
+ )
+
+ if df_hit.shape[0] == 0:
+ return 0.0
+
+ df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge(
+ rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None)
+ )
+
+ if score_type == "binary":
+ df_dcg["rel"] = 1
+ elif score_type == "raw":
+ df_dcg["rel"] = df_dcg[col_rating]
+ elif score_type == "exp":
+ df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1
+ else:
+ raise ValueError("score_type must be one of 'binary', 'raw', 'exp'")
+
+ if discfun_type == "loge":
+ discfun = np.log
+ elif discfun_type == "log2":
+ discfun = np.log2
+ else:
+ raise ValueError("discfun_type must be one of 'loge', 'log2'")
+
+ # Calculate the actual discounted gain for each record
+ df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"])
+
+ # Calculate the ideal discounted gain for each record
+ df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False)
+ df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[
+ col_rating
+ ].rank("first", ascending=False)
+ df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"])
+
+ # Calculate the actual DCG for each user
+ df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
+
+ # Calculate the ideal DCG for each user
+ df_user = df_user.merge(
+ df_idcg.groupby(col_user, as_index=False, sort=False)
+ .head(k)
+ .groupby(col_user, as_index=False, sort=False)
+ .agg({"idcg": "sum"}),
+ on=col_user,
+ )
+
+ # DCG over IDCG is the normalized DCG
+ df_user["ndcg"] = df_user["dcg"] / df_user["idcg"]
+ return df_user["ndcg"].mean()
+
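+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). With the default binary score type, only the ranks of the hits matter.
+def _example_ndcg_at_k():
+    """Minimal usage sketch: nDCG@3 when two of three relevant items are recommended."""
+    rating_true = pd.DataFrame(
+        {"userID": [1, 1, 1], "itemID": [1, 2, 3], "rating": [5, 4, 3]}
+    )
+    rating_pred = pd.DataFrame(
+        {"userID": [1, 1, 1], "itemID": [2, 3, 4], "prediction": [0.9, 0.8, 0.7]}
+    )
+    # Hits at ranks 1 and 2 out of three relevant items give an nDCG@3 below 1.
+    return ndcg_at_k(rating_true, rating_pred, k=3)
+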
+
+@lru_cache_df(maxsize=1)
+def _get_reciprocal_rank(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ relevancy_method="top_k",
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+):
+ df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ relevancy_method=relevancy_method,
+ k=k,
+ threshold=threshold,
+ )
+
+ if df_hit.shape[0] == 0:
+ return None, n_users
+
+ # calculate reciprocal rank of items for each user and sum them up
+ df_hit_sorted = df_hit.copy()
+ df_hit_sorted["rr"] = (
+ df_hit_sorted.groupby(col_user).cumcount() + 1
+ ) / df_hit_sorted["rank"]
+ df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()
+
+ return pd.merge(df_hit_sorted, df_hit_count, on=col_user), n_users
+
+
+[docs]def map(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ relevancy_method="top_k",
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+ **_,
+):
+ """Mean Average Precision for top k prediction items
+
+ The implementation of MAP is referenced from Spark MLlib evaluation metrics.
+ https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems
+
+ A good reference can be found at:
+ http://web.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
+
+ Note:
+ The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of
+ relevant items in the ground truth data, instead of k.
+
+ Args:
+ rating_true (pandas.DataFrame): True DataFrame
+ rating_pred (pandas.DataFrame): Predicted DataFrame
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_prediction (str): column name for prediction
+ relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+ top k items are directly provided, so there is no need to compute the relevancy operation.
+ k (int): number of top k items per user
+ threshold (float): threshold of top items per user (optional)
+
+ Returns:
+ float: MAP (min=0, max=1)
+ """
+ df_merge, n_users = _get_reciprocal_rank(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ relevancy_method=relevancy_method,
+ k=k,
+ threshold=threshold,
+ )
+
+ if df_merge is None:
+ return 0.0
+ else:
+ return (df_merge["rr"] / df_merge["actual"]).sum() / n_users
+
+
+[docs]def map_at_k(
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ relevancy_method="top_k",
+ k=DEFAULT_K,
+ threshold=DEFAULT_THRESHOLD,
+ **_,
+):
+ """Mean Average Precision at k
+
+ The implementation of MAP@k is referenced from Spark MLlib evaluation metrics.
+ https://github.com/apache/spark/blob/b938ff9f520fd4e4997938284ffa0aba9ea271fc/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala#L99
+
+ Args:
+ rating_true (pandas.DataFrame): True DataFrame
+ rating_pred (pandas.DataFrame): Predicted DataFrame
+ col_user (str): column name for user
+ col_item (str): column name for item
+ col_prediction (str): column name for prediction
+ relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+ top k items are directly provided, so there is no need to compute the relevancy operation.
+ k (int): number of top k items per user
+ threshold (float): threshold of top items per user (optional)
+
+ Returns:
+ float: MAP@k (min=0, max=1)
+ """
+ df_merge, n_users = _get_reciprocal_rank(
+ rating_true=rating_true,
+ rating_pred=rating_pred,
+ col_user=col_user,
+ col_item=col_item,
+ col_prediction=col_prediction,
+ relevancy_method=relevancy_method,
+ k=k,
+ threshold=threshold,
+ )
+
+ if df_merge is None:
+ return 0.0
+ else:
+ return (
+ df_merge["rr"] / df_merge["actual"].apply(lambda x: min(x, k))
+ ).sum() / n_users
+
+
+[docs]def get_top_k_items(
+ dataframe, col_user=DEFAULT_USER_COL, col_rating=DEFAULT_RATING_COL, k=DEFAULT_K
+):
+ """Get the input customer-item-rating tuple in the format of Pandas
+ DataFrame, output a Pandas DataFrame in the dense format of top k items
+ for each user.
+
+ Note:
+ If it is implicit rating, just append a column of constants to be
+ ratings.
+
+ Args:
+ dataframe (pandas.DataFrame): DataFrame of rating data (in the format
+ customerID-itemID-rating)
+ col_user (str): column name for user
+ col_rating (str): column name for rating
+        k (int or None): number of items for each user; None means the input has already been
+            filtered to the top k items and sorted by rating, so there is no need to do that again.
+
+ Returns:
+ pandas.DataFrame: DataFrame of top k items for each user, sorted by `col_user` and `rank`
+ """
+ # Sort dataframe by col_user and (top k) col_rating
+ if k is None:
+ top_k_items = dataframe
+ else:
+ top_k_items = (
+ dataframe.sort_values([col_user, col_rating], ascending=[True, False])
+ .groupby(col_user, as_index=False)
+ .head(k)
+ .reset_index(drop=True)
+ )
+ # Add ranks
+ top_k_items["rank"] = top_k_items.groupby(col_user, sort=False).cumcount() + 1
+ return top_k_items
+
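+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). It shows the "rank" column added by get_top_k_items.
+def _example_get_top_k_items():
+    """Minimal usage sketch: keep each user's two highest-rated items."""
+    df = pd.DataFrame(
+        {"userID": [1, 1, 1, 2], "itemID": [1, 2, 3, 1], "rating": [3, 5, 4, 2]}
+    )
+    # User 1 keeps items 2 and 3 (ranks 1 and 2); user 2 keeps item 1 (rank 1).
+    return get_top_k_items(df, k=2)
+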
+
+"""Function name and function mapper.
+Useful when we have to serialize evaluation metric names
+and call the functions based on deserialized names"""
+metrics = {
+ rmse.__name__: rmse,
+ mae.__name__: mae,
+ rsquared.__name__: rsquared,
+ exp_var.__name__: exp_var,
+ precision_at_k.__name__: precision_at_k,
+ recall_at_k.__name__: recall_at_k,
+ ndcg_at_k.__name__: ndcg_at_k,
+ map_at_k.__name__: map_at_k,
+ map.__name__: map,
+}
+
+
+# diversity metrics
+def _check_column_dtypes_diversity_serendipity(func):
+ """Checks columns of DataFrame inputs
+
+ This includes the checks on:
+
+ * whether the input columns exist in the input DataFrames
+ * whether the data types of col_user as well as col_item are matched in the two input DataFrames.
+ * whether reco_df contains any user_item pairs that are already shown in train_df
+ * check relevance column in reco_df
+ * check column names in item_feature_df
+
+ Args:
+ func (function): function that will be wrapped
+
+ Returns:
+ function: Wrapper function for checking dtypes.
+ """
+
+ @wraps(func)
+ def check_column_dtypes_diversity_serendipity_wrapper(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+ col_relevance=None,
+ *args,
+ **kwargs,
+ ):
+ """Check columns of DataFrame inputs
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+            item_sim_measure (str): (Optional) The item similarity measure to use.
+                Available measures include item_cooccurrence_count (default) and item_feature_vector.
+            col_item_features (str): item feature column name.
+            col_user (str): User id column name.
+            col_item (str): Item id column name.
+            col_sim (str): column name for item similarity.
+ col_relevance (str): This column indicates whether the recommended item is actually
+ relevant to the user or not.
+ """
+
+ if not has_columns(train_df, [col_user, col_item]):
+ raise ValueError("Missing columns in train_df DataFrame")
+ if not has_columns(reco_df, [col_user, col_item]):
+ raise ValueError("Missing columns in reco_df DataFrame")
+ if not has_same_base_dtype(train_df, reco_df, columns=[col_user, col_item]):
+ raise ValueError("Columns in provided DataFrames are not the same datatype")
+ if col_relevance is None:
+ col_relevance = DEFAULT_RELEVANCE_COL
+ # relevance term, default is 1 (relevant) for all
+ reco_df = reco_df[[col_user, col_item]]
+ reco_df[col_relevance] = 1.0
+        else:
+ reco_df = reco_df[[col_user, col_item, col_relevance]].astype(
+ {col_relevance: np.float16}
+ )
+ if item_sim_measure == "item_feature_vector":
+ required_columns = [col_item, col_item_features]
+ if item_feature_df is not None:
+ if not has_columns(item_feature_df, required_columns):
+ raise ValueError("Missing columns in item_feature_df DataFrame")
+ else:
+ raise Exception(
+ "item_feature_df not specified! item_feature_df must be provided "
+ "if choosing to use item_feature_vector to calculate item similarity. "
+ "item_feature_df should have columns: " + str(required_columns)
+ )
+ # check if reco_df contains any user_item pairs that are already shown in train_df
+ count_intersection = pd.merge(
+ train_df, reco_df, how="inner", on=[col_user, col_item]
+ ).shape[0]
+ if count_intersection != 0:
+ raise Exception(
+ "reco_df should not contain any user_item pairs that are already shown in train_df"
+ )
+
+ return func(
+ train_df=train_df,
+ reco_df=reco_df,
+ item_feature_df=item_feature_df,
+ item_sim_measure=item_sim_measure,
+ col_user=col_user,
+ col_item=col_item,
+ col_sim=col_sim,
+ col_relevance=col_relevance,
+ *args,
+ **kwargs,
+ )
+
+ return check_column_dtypes_diversity_serendipity_wrapper
+
+
+def _check_column_dtypes_novelty_coverage(func):
+ """Checks columns of DataFrame inputs
+
+ This includes the checks on:
+
+ * whether the input columns exist in the input DataFrames
+ * whether the data types of col_user as well as col_item are matched in the two input DataFrames.
+ * whether reco_df contains any user_item pairs that are already shown in train_df
+
+ Args:
+ func (function): function that will be wrapped
+
+ Returns:
+ function: Wrapper function for checking dtypes.
+ """
+
+ @wraps(func)
+ def check_column_dtypes_novelty_coverage_wrapper(
+ train_df,
+ reco_df,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ *args,
+ **kwargs,
+ ):
+ """Check columns of DataFrame inputs
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ Interaction here follows the *item choice model* from Castells et al.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+
+ """
+
+ if not has_columns(train_df, [col_user, col_item]):
+ raise ValueError("Missing columns in train_df DataFrame")
+ if not has_columns(reco_df, [col_user, col_item]):
+ raise ValueError("Missing columns in reco_df DataFrame")
+ if not has_same_base_dtype(train_df, reco_df, columns=[col_user, col_item]):
+ raise ValueError("Columns in provided DataFrames are not the same datatype")
+
+ count_intersection = pd.merge(
+ train_df, reco_df, how="inner", on=[col_user, col_item]
+ ).shape[0]
+ if count_intersection != 0:
+ raise Exception(
+ "reco_df should not contain any user_item pairs that are already shown in train_df"
+ )
+
+ return func(
+ train_df=train_df,
+ reco_df=reco_df,
+ col_user=col_user,
+ col_item=col_item,
+ *args,
+ **kwargs,
+ )
+
+ return check_column_dtypes_novelty_coverage_wrapper
+
+
+@lru_cache_df(maxsize=1)
+def _get_pairwise_items(
+ df,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+):
+ """Get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1])"""
+ df_user_i1 = df[[col_user, col_item]]
+ df_user_i1.columns = [col_user, "i1"]
+
+ df_user_i2 = df[[col_user, col_item]]
+ df_user_i2.columns = [col_user, "i2"]
+
+ df_user_i1_i2 = pd.merge(df_user_i1, df_user_i2, how="inner", on=[col_user])
+
+ df_pairwise_items = df_user_i1_i2[(df_user_i1_i2["i1"] <= df_user_i1_i2["i2"])][
+ [col_user, "i1", "i2"]
+ ].reset_index(drop=True)
+ return df_pairwise_items
+
+
+@lru_cache_df(maxsize=1)
+def _get_cosine_similarity(
+ train_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+):
+ if item_sim_measure == "item_cooccurrence_count":
+ # calculate item-item similarity based on item co-occurrence count
+ df_cosine_similarity = _get_cooccurrence_similarity(
+ train_df, col_user, col_item, col_sim
+ )
+ elif item_sim_measure == "item_feature_vector":
+        # calculate item-item similarity based on item feature vectors
+ df_cosine_similarity = _get_item_feature_similarity(
+ item_feature_df, col_item_features, col_user, col_item
+ )
+ else:
+ raise Exception(
+ "item_sim_measure not recognized! The available options include 'item_cooccurrence_count' and 'item_feature_vector'."
+ )
+ return df_cosine_similarity
+
+
+@lru_cache_df(maxsize=1)
+def _get_cooccurrence_similarity(
+ train_df,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+):
+ """Cosine similarity metric from
+
+ :Citation:
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
+ introducing serendipity into music recommendation, WSDM 2012
+
+ The item indexes in the result are such that i1 <= i2.
+ """
+ pairs = _get_pairwise_items(train_df, col_user, col_item)
+ pairs_count = pd.DataFrame(
+ {"count": pairs.groupby(["i1", "i2"]).size()}
+ ).reset_index()
+ item_count = pd.DataFrame(
+ {"count": train_df.groupby([col_item]).size()}
+ ).reset_index()
+ item_count["item_sqrt_count"] = item_count["count"] ** 0.5
+ item_co_occur = pairs_count.merge(
+ item_count[[col_item, "item_sqrt_count"]],
+ left_on=["i1"],
+ right_on=[col_item],
+ ).drop(columns=[col_item])
+
+ item_co_occur.columns = ["i1", "i2", "count", "i1_sqrt_count"]
+
+ item_co_occur = item_co_occur.merge(
+ item_count[[col_item, "item_sqrt_count"]],
+ left_on=["i2"],
+ right_on=[col_item],
+ ).drop(columns=[col_item])
+ item_co_occur.columns = [
+ "i1",
+ "i2",
+ "count",
+ "i1_sqrt_count",
+ "i2_sqrt_count",
+ ]
+
+ item_co_occur[col_sim] = item_co_occur["count"] / (
+ item_co_occur["i1_sqrt_count"] * item_co_occur["i2_sqrt_count"]
+ )
+ df_cosine_similarity = (
+ item_co_occur[["i1", "i2", col_sim]]
+ .sort_values(["i1", "i2"])
+ .reset_index(drop=True)
+ )
+
+ return df_cosine_similarity
+
+
+@lru_cache_df(maxsize=1)
+def _get_item_feature_similarity(
+ item_feature_df,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+):
+ """Cosine similarity metric based on item feature vectors
+
+ The item indexes in the result are such that i1 <= i2.
+ """
+ df1 = item_feature_df[[col_item, col_item_features]]
+ df1.columns = ["i1", "f1"]
+ df1["key"] = 0
+ df2 = item_feature_df[[col_item, col_item_features]]
+ df2.columns = ["i2", "f2"]
+ df2["key"] = 0
+
+ df = pd.merge(df1, df2, on="key", how="outer").drop("key", axis=1)
+ df_item_feature_pair = df[(df["i1"] <= df["i2"])].reset_index(drop=True)
+
+ df_item_feature_pair[col_sim] = df_item_feature_pair.apply(
+ lambda x: float(x.f1.dot(x.f2))
+ / float(np.linalg.norm(x.f1, 2) * np.linalg.norm(x.f2, 2)),
+ axis=1,
+ )
+
+ df_cosine_similarity = df_item_feature_pair[["i1", "i2", col_sim]].sort_values(
+ ["i1", "i2"]
+ )
+
+ return df_cosine_similarity
+
+
+# Diversity metrics
+@lru_cache_df(maxsize=1)
+def _get_intralist_similarity(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+):
+ """Intra-list similarity from
+
+ :Citation:
+
+ "Improving Recommendation Lists Through Topic Diversification",
+ Ziegler, McNee, Konstan and Lausen, 2005.
+ """
+ pairs = _get_pairwise_items(reco_df, col_user, col_item)
+ similarity_df = _get_cosine_similarity(
+ train_df,
+ item_feature_df,
+ item_sim_measure,
+ col_item_features,
+ col_user,
+ col_item,
+ col_sim,
+ )
+ # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items.
+ # e.g. i1 and i2 have never occurred together.
+
+ item_pair_sim = pairs.merge(similarity_df, on=["i1", "i2"], how="left")
+ item_pair_sim[col_sim].fillna(0, inplace=True)
+ item_pair_sim = item_pair_sim.loc[
+ item_pair_sim["i1"] != item_pair_sim["i2"]
+ ].reset_index(drop=True)
+ df_intralist_similarity = (
+ item_pair_sim.groupby([col_user]).agg({col_sim: "mean"}).reset_index()
+ )
+ df_intralist_similarity.columns = [col_user, "avg_il_sim"]
+
+ return df_intralist_similarity
+
+
+[docs]@_check_column_dtypes_diversity_serendipity
+@lru_cache_df(maxsize=1)
+def user_diversity(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+ col_relevance=None,
+):
+ """Calculate average diversity of recommendations for each user.
+ The metric definition is based on formula (3) in the following reference:
+
+ :Citation:
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
+ introducing serendipity into music recommendation, WSDM 2012
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they have interacted with;
+ contains col_user, col_item. Assumed to not contain any duplicate rows.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item, col_relevance (optional).
+ Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+        item_sim_measure (str): (Optional) The item similarity measure to use.
+            Available measures include item_cooccurrence_count (default) and item_feature_vector.
+        col_item_features (str): item feature column name.
+        col_user (str): User id column name.
+        col_item (str): Item id column name.
+        col_sim (str): column name for item similarity.
+ col_relevance (str): This column indicates whether the recommended item is actually relevant to the user or not.
+
+ Returns:
+ pandas.DataFrame: A dataframe with the following columns: col_user, user_diversity.
+ """
+
+ df_intralist_similarity = _get_intralist_similarity(
+ train_df,
+ reco_df,
+ item_feature_df,
+ item_sim_measure,
+ col_item_features,
+ col_user,
+ col_item,
+ col_sim,
+ )
+ df_user_diversity = df_intralist_similarity
+ df_user_diversity["user_diversity"] = 1 - df_user_diversity["avg_il_sim"]
+ df_user_diversity = (
+ df_user_diversity[[col_user, "user_diversity"]]
+ .sort_values(col_user)
+ .reset_index(drop=True)
+ )
+
+ return df_user_diversity
+
+
+[docs]@_check_column_dtypes_diversity_serendipity
+def diversity(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+ col_relevance=None,
+):
+ """Calculate average diversity of recommendations across all users.
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they have interacted with;
+ contains col_user, col_item. Assumed to not contain any duplicate rows.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item, col_relevance (optional).
+ Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+        item_sim_measure (str): (Optional) The item similarity measure to use.
+            Available measures include item_cooccurrence_count (default) and item_feature_vector.
+        col_item_features (str): item feature column name.
+        col_user (str): User id column name.
+        col_item (str): Item id column name.
+        col_sim (str): column name for item similarity.
+ col_relevance (str): This column indicates whether the recommended item is actually relevant to the user or not.
+
+ Returns:
+ float: diversity.
+ """
+ df_user_diversity = user_diversity(
+ train_df,
+ reco_df,
+ item_feature_df,
+ item_sim_measure,
+ col_item_features,
+ col_user,
+ col_item,
+ col_sim,
+ )
+ avg_diversity = df_user_diversity.agg({"user_diversity": "mean"})[0]
+ return avg_diversity
+
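+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). Item 4 never appears in the training data, so its co-occurrence
+# similarity with every other item is 0 and both recommendation lists are fully diverse.
+def _example_diversity():
+    """Minimal usage sketch: co-occurrence-based diversity on toy data."""
+    train_df = pd.DataFrame(
+        {"userID": [1, 1, 2, 2, 3, 3], "itemID": [1, 2, 1, 3, 2, 3]}
+    )
+    reco_df = pd.DataFrame(
+        {"userID": [1, 1, 2, 2], "itemID": [3, 4, 2, 4]}
+    )
+    return diversity(train_df, reco_df)  # -> 1.0 for this toy data
+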
+
+# Novelty metrics
+[docs]@_check_column_dtypes_novelty_coverage
+@lru_cache_df(maxsize=1)
+def historical_item_novelty(
+ train_df,
+ reco_df,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+):
+ """Calculate novelty for each item. Novelty is computed as the minus logarithm of
+ (number of interactions with item / total number of interactions). The definition of the metric
+ is based on the following reference using the choice model (eqs. 1 and 6):
+
+ :Citation:
+
+ P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
+ choice, discovery and relevance, ECIR 2011
+
+ The novelty of an item can be defined relative to a set of observed events on the set of all items.
+ These can be events of user choice (item "is picked" by a random user) or user discovery
+ (item "is known" to a random user). The above definition of novelty reflects a factor of item popularity.
+    High novelty values correspond to long-tail items in the density function that few users have
+    interacted with, while low novelty values correspond to popular head items.
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ Interaction here follows the *item choice model* from Castells et al.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+
+ Returns:
+ pandas.DataFrame: A dataframe with the following columns: col_item, item_novelty.
+ """
+
+ n_records = train_df.shape[0]
+ item_count = pd.DataFrame(
+ {"count": train_df.groupby([col_item]).size()}
+ ).reset_index()
+ item_count["item_novelty"] = -np.log2(item_count["count"] / n_records)
+ df_item_novelty = (
+ item_count[[col_item, "item_novelty"]]
+ .sort_values(col_item)
+ .reset_index(drop=True)
+ )
+
+ return df_item_novelty
+
+
+[docs]@_check_column_dtypes_novelty_coverage
+def novelty(train_df, reco_df, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL):
+ """Calculate the average novelty in a list of recommended items (this assumes that the recommendation list
+ is already computed). Follows section 5 from
+
+ :Citation:
+
+ P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
+ choice, discovery and relevance, ECIR 2011
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ Interaction here follows the *item choice model* from Castells et al.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+
+ Returns:
+ float: novelty.
+ """
+
+ df_item_novelty = historical_item_novelty(train_df, reco_df, col_user, col_item)
+ n_recommendations = reco_df.shape[0]
+ reco_item_count = pd.DataFrame(
+ {"count": reco_df.groupby([col_item]).size()}
+ ).reset_index()
+ reco_item_novelty = reco_item_count.merge(df_item_novelty, on=col_item)
+ reco_item_novelty["product"] = (
+ reco_item_novelty["count"] * reco_item_novelty["item_novelty"]
+ )
+ avg_novelty = reco_item_novelty.agg({"product": "sum"})[0] / n_recommendations
+
+ return avg_novelty
+
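+
+# Editor's note: the sketch below is an illustrative, hypothetical example (not part of
+# the library). Item 1 is popular in the training data and item 3 is a long-tail item.
+def _example_novelty():
+    """Minimal usage sketch: recommending only the long-tail item maximizes novelty."""
+    train_df = pd.DataFrame(
+        {"userID": [1, 2, 3, 4, 1], "itemID": [1, 1, 1, 1, 3]}
+    )
+    reco_df = pd.DataFrame({"userID": [2, 3], "itemID": [3, 3]})
+    # Item 3 accounts for 1 of 5 training interactions, so its novelty is -log2(1/5).
+    return novelty(train_df, reco_df)  # ~= 2.32
+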
+
+# Serendipity metrics
+[docs]@_check_column_dtypes_diversity_serendipity
+@lru_cache_df(maxsize=1)
+def user_item_serendipity(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+ col_relevance=None,
+):
+ """Calculate serendipity of each item in the recommendations for each user.
+ The metric definition is based on the following references:
+
+ :Citation:
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
+ introducing serendipity into music recommendation, WSDM 2012
+
+ Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems,
+ eugeneyan.com, April 2020
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+        item_sim_measure (str): (Optional) The item similarity measure to use.
+            Available measures include item_cooccurrence_count (default) and item_feature_vector.
+        col_item_features (str): item feature column name.
+        col_user (str): User id column name.
+        col_item (str): Item id column name.
+        col_sim (str): column name for item similarity.
+ col_relevance (str): This column indicates whether the recommended item is actually
+ relevant to the user or not.
+ Returns:
+ pandas.DataFrame: A dataframe with columns: col_user, col_item, user_item_serendipity.
+ """
+ # for every col_user, col_item in reco_df, join all interacted items from train_df.
+ # These interacted items are repeated for each item in reco_df for a specific user.
+ df_cosine_similarity = _get_cosine_similarity(
+ train_df,
+ item_feature_df,
+ item_sim_measure,
+ col_item_features,
+ col_user,
+ col_item,
+ col_sim,
+ )
+ reco_user_item = reco_df[[col_user, col_item]]
+ reco_user_item["reco_item_tmp"] = reco_user_item[col_item]
+
+ train_user_item = train_df[[col_user, col_item]]
+ train_user_item.columns = [col_user, "train_item_tmp"]
+
+ reco_train_user_item = reco_user_item.merge(train_user_item, on=[col_user])
+ reco_train_user_item["i1"] = reco_train_user_item[
+ ["reco_item_tmp", "train_item_tmp"]
+ ].min(axis=1)
+ reco_train_user_item["i2"] = reco_train_user_item[
+ ["reco_item_tmp", "train_item_tmp"]
+ ].max(axis=1)
+
+ reco_train_user_item_sim = reco_train_user_item.merge(
+ df_cosine_similarity, on=["i1", "i2"], how="left"
+ )
+ reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
+
+ reco_user_item_avg_sim = (
+ reco_train_user_item_sim.groupby([col_user, col_item])
+ .agg({col_sim: "mean"})
+ .reset_index()
+ )
+ reco_user_item_avg_sim.columns = [
+ col_user,
+ col_item,
+ "avg_item2interactedHistory_sim",
+ ]
+
+ df_user_item_serendipity = reco_user_item_avg_sim.merge(
+ reco_df, on=[col_user, col_item]
+ )
+ df_user_item_serendipity["user_item_serendipity"] = (
+ 1 - df_user_item_serendipity["avg_item2interactedHistory_sim"]
+ ) * df_user_item_serendipity[col_relevance]
+ df_user_item_serendipity = (
+ df_user_item_serendipity[[col_user, col_item, "user_item_serendipity"]]
+ .sort_values([col_user, col_item])
+ .reset_index(drop=True)
+ )
+
+ return df_user_item_serendipity
+
+
+[docs]@lru_cache_df(maxsize=1)
+@_check_column_dtypes_diversity_serendipity
+def user_serendipity(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+ col_relevance=None,
+):
+ """Calculate average serendipity for each user's recommendations.
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+        item_sim_measure (str): (Optional) The item similarity measure to use.
+            Available measures include item_cooccurrence_count (default) and item_feature_vector.
+        col_item_features (str): item feature column name.
+        col_user (str): User id column name.
+        col_item (str): Item id column name.
+        col_sim (str): column name for item similarity.
+ col_relevance (str): This column indicates whether the recommended item is actually
+ relevant to the user or not.
+ Returns:
+        pandas.DataFrame: A dataframe with the following columns: col_user, user_serendipity.
+ """
+ df_user_item_serendipity = user_item_serendipity(
+ train_df,
+ reco_df,
+ item_feature_df,
+ item_sim_measure,
+ col_item_features,
+ col_user,
+ col_item,
+ col_sim,
+ col_relevance,
+ )
+ df_user_serendipity = (
+ df_user_item_serendipity.groupby(col_user)
+ .agg({"user_item_serendipity": "mean"})
+ .reset_index()
+ )
+ df_user_serendipity.columns = [col_user, "user_serendipity"]
+ df_user_serendipity = df_user_serendipity.sort_values(col_user).reset_index(
+ drop=True
+ )
+
+ return df_user_serendipity
+
+
+[docs]@_check_column_dtypes_diversity_serendipity
+def serendipity(
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_item_features=DEFAULT_ITEM_FEATURES_COL,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_sim=DEFAULT_SIMILARITY_COL,
+ col_relevance=None,
+):
+ """Calculate average serendipity for recommendations across all users.
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+ item_sim_measure (str): (Optional) Which item similarity measure to use.
+ Available measures include item_cooccurrence_count (default) and item_feature_vector.
+ col_item_features (str): Item feature column name.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+ col_sim (str): Column name holding the item similarity values.
+ col_relevance (str): Name of the column indicating whether the recommended item is actually
+ relevant to the user or not.
+ Returns:
+ float: serendipity.
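+
+ Examples:
+ A minimal sketch with toy data (illustrative only; the ids below are hypothetical,
+ every recommended item appears in train_df, and no recommended user-item pair does):
+
+ >>> import pandas as pd
+ >>> train_df = pd.DataFrame({"userID": [1, 1, 2, 2], "itemID": [1, 2, 1, 3]})
+ >>> reco_df = pd.DataFrame({"userID": [1, 2], "itemID": [3, 2]})
+ >>> avg_serendipity = serendipity(train_df, reco_df, col_user="userID", col_item="itemID")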
+ """
+ df_user_serendipity = user_serendipity(
+ train_df,
+ reco_df,
+ item_feature_df,
+ item_sim_measure,
+ col_item_features,
+ col_user,
+ col_item,
+ col_sim,
+ col_relevance,
+ )
+ avg_serendipity = df_user_serendipity.agg({"user_serendipity": "mean"})[0]
+ return avg_serendipity
+
+
+# Coverage metrics
+[docs]@_check_column_dtypes_novelty_coverage
+def catalog_coverage(
+ train_df, reco_df, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL
+):
+ """Calculate catalog coverage for recommendations across all users.
+ The metric definition is based on the "catalog coverage" definition in the following reference:
+
+ :Citation:
+
+ G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
+ Recommender Systems Handbook pp. 257-297, 2010.
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ Interaction here follows the *item choice model* from Castells et al.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+
+ Returns:
+ float: catalog coverage
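+
+ Examples:
+ A minimal sketch with toy data (illustrative only): two distinct recommended items out of
+ a four-item training catalog give a coverage of 0.5.
+
+ >>> import pandas as pd
+ >>> train_df = pd.DataFrame({"userID": [1, 1, 2, 2], "itemID": [1, 2, 3, 4]})
+ >>> reco_df = pd.DataFrame({"userID": [1, 2], "itemID": [3, 1]})
+ >>> coverage = catalog_coverage(train_df, reco_df, col_user="userID", col_item="itemID")  # 2 / 4 = 0.5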
+ """
+ # distinct item count in reco_df
+ count_distinct_item_reco = reco_df[col_item].nunique()
+ # distinct item count in train_df
+ count_distinct_item_train = train_df[col_item].nunique()
+
+ # catalog coverage
+ c_coverage = count_distinct_item_reco / count_distinct_item_train
+ return c_coverage
+
+
+[docs]@_check_column_dtypes_novelty_coverage
+def distributional_coverage(
+ train_df, reco_df, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL
+):
+ """Calculate distributional coverage for recommendations across all users.
+ The metric definition is based on formula (21) in the following reference:
+
+ :Citation:
+
+ G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
+ Recommender Systems Handbook pp. 257-297, 2010.
+
+ Args:
+ train_df (pandas.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ Interaction here follows the *item choice model* from Castells et al.
+ reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+
+ Returns:
+ float: distributional coverage
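+
+ Examples:
+ A minimal sketch with toy data (illustrative only): four items, each recommended equally
+ often, give an entropy of 2.0 bits.
+
+ >>> import pandas as pd
+ >>> train_df = pd.DataFrame({"userID": [1, 1, 2, 2], "itemID": [1, 2, 3, 4]})
+ >>> reco_df = pd.DataFrame({"userID": [1, 1, 2, 2], "itemID": [3, 4, 1, 2]})
+ >>> d_coverage = distributional_coverage(train_df, reco_df, col_user="userID", col_item="itemID")  # 2.0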
+ """
+ # In reco_df, how many times each col_item is being recommended
+ df_itemcnt_reco = pd.DataFrame(
+ {"count": reco_df.groupby([col_item]).size()}
+ ).reset_index()
+
+ # the number of total recommendations
+ count_row_reco = reco_df.shape[0]
+
+ df_entropy = df_itemcnt_reco
+ df_entropy["p(i)"] = df_entropy["count"] / count_row_reco
+ df_entropy["entropy(i)"] = df_entropy["p(i)"] * np.log2(df_entropy["p(i)"])
+
+ d_coverage = -df_entropy.agg({"entropy(i)": "sum"})[0]
+
+ return d_coverage
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+
+try:
+ from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
+ from pyspark.sql import Window, DataFrame
+ from pyspark.sql.functions import col, row_number, expr
+ from pyspark.sql.functions import udf
+ import pyspark.sql.functions as F
+ from pyspark.sql.types import IntegerType, DoubleType, StructType, StructField
+ from pyspark.ml.linalg import VectorUDT
+except ImportError:
+ pass # skip this import if we are in pure python environment
+
+from recommenders.utils.constants import (
+ DEFAULT_PREDICTION_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_RELEVANCE_COL,
+ DEFAULT_SIMILARITY_COL,
+ DEFAULT_ITEM_FEATURES_COL,
+ DEFAULT_ITEM_SIM_MEASURE,
+ DEFAULT_TIMESTAMP_COL,
+ DEFAULT_K,
+ DEFAULT_THRESHOLD,
+)
+
+
+[docs]class SparkRatingEvaluation:
+ """Spark Rating Evaluator"""
+
+ def __init__(
+ self,
+ rating_true,
+ rating_pred,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ ):
+ """Initializer.
+
+ This is the Spark version of the rating metrics evaluator.
+ The methods of this class calculate rating metrics such as root mean squared error, mean absolute error,
+ R squared, and explained variance.
+
+ Args:
+ rating_true (pyspark.sql.DataFrame): True labels.
+ rating_pred (pyspark.sql.DataFrame): Predicted labels.
+ col_user (str): column name for user.
+ col_item (str): column name for item.
+ col_rating (str): column name for rating.
+ col_prediction (str): column name for prediction.
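+
+ Examples:
+ A minimal sketch, assuming an active SparkSession named `spark` (toy data, illustrative only):
+
+ >>> true = spark.createDataFrame([(1, 1, 5.0), (1, 2, 3.0)], ["userID", "itemID", "rating"])
+ >>> pred = spark.createDataFrame([(1, 1, 4.5), (1, 2, 3.5)], ["userID", "itemID", "prediction"])
+ >>> evaluator = SparkRatingEvaluation(
+ ... true, pred, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction"
+ ... )
+ >>> rmse = evaluator.rmse()  # 0.5 for this toy data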
+ """
+ self.rating_true = rating_true
+ self.rating_pred = rating_pred
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.col_prediction = col_prediction
+
+ # Check if inputs are Spark DataFrames.
+ if not isinstance(self.rating_true, DataFrame):
+ raise TypeError(
+ "rating_true should be but is not a Spark DataFrame"
+ ) # pragma : No Cover
+
+ if not isinstance(self.rating_pred, DataFrame):
+ raise TypeError(
+ "rating_pred should be but is not a Spark DataFrame"
+ ) # pragma : No Cover
+
+ # Check if columns exist.
+ true_columns = self.rating_true.columns
+ pred_columns = self.rating_pred.columns
+
+ if rating_true.count() == 0:
+ raise ValueError("Empty input dataframe")
+ if rating_pred.count() == 0:
+ raise ValueError("Empty input dataframe")
+
+ if self.col_user not in true_columns:
+ raise ValueError("Schema of rating_true not valid. Missing User Col")
+ if self.col_item not in true_columns:
+ raise ValueError("Schema of rating_true not valid. Missing Item Col")
+ if self.col_rating not in true_columns:
+ raise ValueError("Schema of rating_true not valid. Missing Rating Col")
+
+ if self.col_user not in pred_columns:
+ raise ValueError(
+ "Schema of rating_pred not valid. Missing User Col"
+ ) # pragma : No Cover
+ if self.col_item not in pred_columns:
+ raise ValueError(
+ "Schema of rating_pred not valid. Missing Item Col"
+ ) # pragma : No Cover
+ if self.col_prediction not in pred_columns:
+ raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")
+
+ self.rating_true = self.rating_true.select(
+ col(self.col_user),
+ col(self.col_item),
+ col(self.col_rating).cast("double").alias("label"),
+ )
+ self.rating_pred = self.rating_pred.select(
+ col(self.col_user),
+ col(self.col_item),
+ col(self.col_prediction).cast("double").alias("prediction"),
+ )
+
+ self.y_pred_true = (
+ self.rating_true.join(
+ self.rating_pred, [self.col_user, self.col_item], "inner"
+ )
+ .drop(self.col_user)
+ .drop(self.col_item)
+ )
+
+ self.metrics = RegressionMetrics(
+ self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label))
+ )
+
+[docs] def rmse(self):
+ """Calculate Root Mean Squared Error.
+
+ Returns:
+ float: Root mean squared error.
+ """
+ return self.metrics.rootMeanSquaredError
+
+[docs] def mae(self):
+ """Calculate Mean Absolute Error.
+
+ Returns:
+ float: Mean Absolute Error.
+ """
+ return self.metrics.meanAbsoluteError
+
+[docs] def rsquared(self):
+ """Calculate R squared.
+
+ Returns:
+ float: R squared.
+ """
+ return self.metrics.r2
+
+[docs] def exp_var(self):
+ """Calculate explained variance.
+
+ Note:
+ Spark MLlib's implementation is buggy (it can lead to values > 1), hence we compute it from variances directly.
+
+ Returns:
+ float: Explained variance (min=0, max=1).
+ """
+ var1 = self.y_pred_true.selectExpr("variance(label-prediction)").collect()[0][0]
+ var2 = self.y_pred_true.selectExpr("variance(label)").collect()[0][0]
+
+ if var1 is None or var2 is None:
+ return -np.inf
+ else:
+ # numpy divide is more tolerant to var2 being zero
+ return 1 - np.divide(var1, var2)
+
+
+[docs]class SparkRankingEvaluation:
+ """Spark Ranking Evaluator"""
+
+ def __init__(
+ self,
+ rating_true,
+ rating_pred,
+ k=DEFAULT_K,
+ relevancy_method="top_k",
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ threshold=DEFAULT_THRESHOLD,
+ ):
+ """Initialization.
+ This is the Spark version of the ranking metrics evaluator.
+ The methods of this class calculate ranking metrics such as precision@k, recall@k, ndcg@k, and mean average
+ precision.
+
+ The implementations of precision@k, ndcg@k, and mean average precision are referenced from Spark MLlib, which
+ can be found at `the link <https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems>`_.
+
+ Args:
+ rating_true (pyspark.sql.DataFrame): DataFrame of true rating data (in the
+ format of customerID-itemID-rating tuple).
+ rating_pred (pyspark.sql.DataFrame): DataFrame of predicted rating data (in
+ the format of customerID-itemID-rating tuple).
+ col_user (str): column name for user.
+ col_item (str): column name for item.
+ col_rating (str): column name for rating.
+ col_prediction (str): column name for prediction.
+ k (int): number of items to recommend to each user.
+ relevancy_method (str): method for determining relevant items. Possible
+ values are "top_k", "by_time_stamp", and "by_threshold".
+ threshold (float): threshold for determining the relevant recommended items.
+ This is used for the case that predicted ratings follow a known
+ distribution. NOTE: this option is only activated if `relevancy_method` is
+ set to "by_threshold".
+ """
+ self.rating_true = rating_true
+ self.rating_pred = rating_pred
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.col_prediction = col_prediction
+ self.threshold = threshold
+
+ # Check if inputs are Spark DataFrames.
+ if not isinstance(self.rating_true, DataFrame):
+ raise TypeError(
+ "rating_true should be but is not a Spark DataFrame"
+ ) # pragma : No Cover
+
+ if not isinstance(self.rating_pred, DataFrame):
+ raise TypeError(
+ "rating_pred should be but is not a Spark DataFrame"
+ ) # pragma : No Cover
+
+ # Check if columns exist.
+ true_columns = self.rating_true.columns
+ pred_columns = self.rating_pred.columns
+
+ if self.col_user not in true_columns:
+ raise ValueError(
+ "Schema of rating_true not valid. Missing User Col: "
+ + str(true_columns)
+ )
+ if self.col_item not in true_columns:
+ raise ValueError("Schema of rating_true not valid. Missing Item Col")
+ if self.col_rating not in true_columns:
+ raise ValueError("Schema of rating_true not valid. Missing Rating Col")
+
+ if self.col_user not in pred_columns:
+ raise ValueError(
+ "Schema of rating_pred not valid. Missing User Col"
+ ) # pragma : No Cover
+ if self.col_item not in pred_columns:
+ raise ValueError(
+ "Schema of rating_pred not valid. Missing Item Col"
+ ) # pragma : No Cover
+ if self.col_prediction not in pred_columns:
+ raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")
+
+ self.k = k
+
+ relevant_func = {
+ "top_k": _get_top_k_items,
+ "by_time_stamp": _get_relevant_items_by_timestamp,
+ "by_threshold": _get_relevant_items_by_threshold,
+ }
+
+ if relevancy_method not in relevant_func:
+ raise ValueError(
+ "relevancy_method should be one of {}".format(
+ list(relevant_func.keys())
+ )
+ )
+
+ self.rating_pred = (
+ relevant_func[relevancy_method](
+ dataframe=self.rating_pred,
+ col_user=self.col_user,
+ col_item=self.col_item,
+ col_rating=self.col_prediction,
+ threshold=self.threshold,
+ )
+ if relevancy_method == "by_threshold"
+ else relevant_func[relevancy_method](
+ dataframe=self.rating_pred,
+ col_user=self.col_user,
+ col_item=self.col_item,
+ col_rating=self.col_prediction,
+ k=self.k,
+ )
+ )
+
+ self._metrics = self._calculate_metrics()
+
+ def _calculate_metrics(self):
+ """Calculate ranking metrics."""
+ self._items_for_user_pred = self.rating_pred
+
+ self._items_for_user_true = (
+ self.rating_true.groupBy(self.col_user)
+ .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
+ .select(self.col_user, "ground_truth")
+ )
+
+ self._items_for_user_all = self._items_for_user_pred.join(
+ self._items_for_user_true, on=self.col_user
+ ).drop(self.col_user)
+
+ return RankingMetrics(self._items_for_user_all.rdd)
+
+[docs] def precision_at_k(self):
+ """Get precision@k.
+
+ Note:
+ More details can be found
+ `on the precisionAt PySpark documentation <http://spark.apache.org/docs/3.0.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.precisionAt>`_.
+
+ Return:
+ float: precision at k (min=0, max=1)
+ """
+ return self._metrics.precisionAt(self.k)
+
+[docs] def recall_at_k(self):
+ """Get recall@K.
+
+ Note:
+ More details can be found
+ `on the recallAt PySpark documentation <http://spark.apache.org/docs/3.0.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.recallAt>`_.
+
+ Return:
+ float: recall at k (min=0, max=1).
+ """
+ return self._metrics.recallAt(self.k)
+
+[docs] def ndcg_at_k(self):
+ """Get Normalized Discounted Cumulative Gain (NDCG)
+
+ Note:
+ More details can be found
+ `on the ndcgAt PySpark documentation <http://spark.apache.org/docs/3.0.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.ndcgAt>`_.
+
+ Return:
+ float: nDCG at k (min=0, max=1).
+ """
+ return self._metrics.ndcgAt(self.k)
+
+[docs] def map(self):
+ """Get mean average precision.
+
+ Return:
+ float: MAP (min=0, max=1).
+ """
+ return self._metrics.meanAveragePrecision
+
+[docs] def map_at_k(self):
+ """Get mean average precision at k.
+
+ Note:
+ More details `on the meanAveragePrecision PySpark documentation <http://spark.apache.org/docs/3.0.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision>`_.
+
+ Return:
+ float: MAP at k (min=0, max=1).
+ """
+ return self._metrics.meanAveragePrecisionAt(self.k)
+
+
+def _get_top_k_items(
+ dataframe,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ k=DEFAULT_K,
+):
+ """Get the input customer-item-rating tuple in the format of Spark
+ DataFrame, output a Spark DataFrame in the dense format of top k items
+ for each user.
+
+ Note:
+ If the ratings are implicit, just append a column of constant values to serve as ratings.
+
+ Args:
+ dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of
+ customerID-itemID-rating tuple).
+ col_user (str): column name for user.
+ col_item (str): column name for item.
+ col_rating (str): column name for rating.
+ col_prediction (str): column name for prediction.
+ k (int): number of items for each user.
+
+ Return:
+ pyspark.sql.DataFrame: DataFrame of top k items for each user.
+ """
+ window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())
+
+ # Note: ties between equal rating values are broken arbitrarily by row_number().
+ items_for_user = (
+ dataframe.select(
+ col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
+ )
+ .where(col("rank") <= k)
+ .groupby(col_user)
+ .agg(F.collect_list(col_item).alias(col_prediction))
+ )
+
+ return items_for_user
+
+
+def _get_relevant_items_by_threshold(
+ dataframe,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ threshold=DEFAULT_THRESHOLD,
+):
+ """Get relevant items for each customer in the input rating data.
+
+ Relevant items are defined as those with ratings above a certain threshold.
+ The threshold is defined as a statistical measure of the ratings for a
+ user, e.g., median.
+
+ Args:
+ dataframe: Spark DataFrame of customerID-itemID-rating tuples.
+ col_user (str): column name for user.
+ col_item (str): column name for item.
+ col_rating (str): column name for rating.
+ col_prediction (str): column name for prediction.
+ threshold (float): threshold for determining the relevant recommended items.
+ This is used for the case that predicted ratings follow a known
+ distribution.
+
+ Return:
+ pyspark.sql.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
+ items.
+ """
+ items_for_user = (
+ dataframe.orderBy(col_rating, ascending=False)
+ .where(col_rating + " >= " + str(threshold))
+ .select(col_user, col_item, col_rating)
+ .withColumn(
+ col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user))
+ )
+ .select(col_user, col_prediction)
+ .dropDuplicates()
+ )
+
+ return items_for_user
+
+
+def _get_relevant_items_by_timestamp(
+ dataframe,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ k=DEFAULT_K,
+):
+ """Get relevant items for each customer defined by timestamp.
+
+ Relevant items are defined as the k items that appear most recently according to
+ their timestamps.
+
+ Args:
+ dataframe (pyspark.sql.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
+ tuples.
+ col_user (str): column name for user.
+ col_item (str): column name for item.
+ col_rating (str): column name for rating.
+ col_timestamp (str): column name for timestamp.
+ col_prediction (str): column name for prediction.
+ k: number of relevant items to be filtered by the function.
+
+ Return:
+ pyspark.sql.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
+ """
+ window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())
+
+ items_for_user = (
+ dataframe.select(
+ col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
+ )
+ .where(col("rank") <= k)
+ .withColumn(
+ col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user))
+ )
+ .select(col_user, col_prediction)
+ .dropDuplicates([col_user, col_prediction])
+ )
+
+ return items_for_user
+
+
+[docs]class SparkDiversityEvaluation:
+ """Spark Evaluator for diversity, coverage, novelty, serendipity"""
+
+ def __init__(
+ self,
+ train_df,
+ reco_df,
+ item_feature_df=None,
+ item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_relevance=None,
+ ):
+ """Initializer.
+
+ This is the Spark version of diversity metrics evaluator.
+ The methods of this class calculate the following diversity metrics:
+
+ * Coverage - it includes two metrics:
+ 1. catalog_coverage, which measures the proportion of items that get recommended from the item catalog;
+ 2. distributional_coverage, which measures how unequally different items are recommended in the
+ recommendations to all users.
+ * Novelty - A more novel item is a less popular one, i.e. one that appears less frequently in users' historical interactions.
+ * Diversity - The dissimilarity of items being recommended.
+ * Serendipity - The "unusualness" or "surprise" of recommendations to a user. When 'col_relevance' is used,
+ it indicates how "pleasant surprise" of recommendations is to a user.
+
+ The metric definitions/formulations are based on the following references with modification:
+
+ :Citation:
+
+ G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
+ Recommender Systems Handbook pp. 257-297, 2010.
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing
+ serendipity into music recommendation, WSDM 2012
+
+ P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
+ choice, discovery and relevance, ECIR 2011
+
+ Eugene Yan, Serendipity: Accuracy's unpopular best friend in Recommender Systems,
+ eugeneyan.com, April 2020
+
+ Args:
+ train_df (pyspark.sql.DataFrame): Data set with historical data for users and items they
+ have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
+ Interaction here follows the *item choice model* from Castells et al.
+ reco_df (pyspark.sql.DataFrame): Recommender's prediction output, containing col_user, col_item,
+ col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
+ item_feature_df (pyspark.sql.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
+ It contains two columns: col_item and features (a feature vector).
+ item_sim_measure (str): (Optional) Which item similarity measure to use.
+ Available measures include item_cooccurrence_count (default) and item_feature_vector.
+ col_user (str): User id column name.
+ col_item (str): Item id column name.
+ col_relevance (str): Optional. Name of the column indicating whether the recommended item is
+ actually relevant to the user or not.
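+
+ Examples:
+ A minimal sketch, assuming an active SparkSession named `spark` (toy data,
+ illustrative only; none of the recommended pairs appear in train_df):
+
+ >>> train_df = spark.createDataFrame([(1, 1), (1, 2), (2, 1), (2, 3)], ["userID", "itemID"])
+ >>> reco_df = spark.createDataFrame([(1, 3), (2, 2)], ["userID", "itemID"])
+ >>> diversity_eval = SparkDiversityEvaluation(train_df, reco_df, col_user="userID", col_item="itemID")
+ >>> coverage = diversity_eval.catalog_coverage()  # 2 distinct recommended items / 3 catalog items
+ >>> avg_novelty = diversity_eval.novelty()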
+ """
+
+ self.train_df = train_df.select(col_user, col_item)
+ self.col_user = col_user
+ self.col_item = col_item
+ self.sim_col = DEFAULT_SIMILARITY_COL
+ self.df_cosine_similarity = None
+ self.df_user_item_serendipity = None
+ self.df_user_serendipity = None
+ self.avg_serendipity = None
+ self.df_item_novelty = None
+ self.avg_novelty = None
+ self.df_intralist_similarity = None
+ self.df_user_diversity = None
+ self.avg_diversity = None
+ self.item_feature_df = item_feature_df
+ self.item_sim_measure = item_sim_measure
+
+ if col_relevance is None:
+ self.col_relevance = DEFAULT_RELEVANCE_COL
+ # relevance term, default is 1 (relevant) for all
+ self.reco_df = reco_df.select(
+ col_user, col_item, F.lit(1.0).alias(self.col_relevance)
+ )
+ else:
+ self.col_relevance = col_relevance
+ self.reco_df = reco_df.select(
+ col_user, col_item, F.col(self.col_relevance).cast(DoubleType())
+ )
+
+ if self.item_sim_measure == "item_feature_vector":
+ self.col_item_features = DEFAULT_ITEM_FEATURES_COL
+ required_schema = StructType(
+ (
+ StructField(self.col_item, IntegerType()),
+ StructField(self.col_item_features, VectorUDT()),
+ )
+ )
+ if self.item_feature_df is not None:
+ if str(required_schema) != str(item_feature_df.schema):
+ raise Exception(
+ "Incorrect schema! item_feature_df should have schema "
+ f"{str(required_schema)} but have {str(item_feature_df.schema)}"
+ )
+ else:
+ raise Exception(
+ "item_feature_df not specified! item_feature_df must be provided "
+ "if choosing to use item_feature_vector to calculate item similarity. "
+ f"item_feature_df should have schema {str(required_schema)}"
+ )
+
+ # check if reco_df contains any user_item pairs that are already shown in train_df
+ count_intersection = (
+ self.train_df.select(self.col_user, self.col_item)
+ .intersect(self.reco_df.select(self.col_user, self.col_item))
+ .count()
+ )
+
+ if count_intersection != 0:
+ raise Exception(
+ "reco_df should not contain any user_item pairs that are already shown in train_df"
+ )
+
+ def _get_pairwise_items(self, df):
+ """Get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1])"""
+ return (
+ df.select(self.col_user, F.col(self.col_item).alias("i1"))
+ .join(
+ df.select(
+ F.col(self.col_user).alias("_user"),
+ F.col(self.col_item).alias("i2"),
+ ),
+ (F.col(self.col_user) == F.col("_user")) & (F.col("i1") <= F.col("i2")),
+ )
+ .select(self.col_user, "i1", "i2")
+ )
+
+ def _get_cosine_similarity(self, n_partitions=200):
+ if self.item_sim_measure == "item_cooccurrence_count":
+ # calculate item-item similarity based on item co-occurrence count
+ self._get_cooccurrence_similarity(n_partitions)
+ elif self.item_sim_measure == "item_feature_vector":
+ # calculate item-item similarity based on item feature vectors
+ self._get_item_feature_similarity(n_partitions)
+ else:
+ raise Exception(
+ "item_sim_measure not recognized! The available options include 'item_cooccurrence_count' and 'item_feature_vector'."
+ )
+ return self.df_cosine_similarity
+
+ def _get_cooccurrence_similarity(self, n_partitions):
+ """Cosine similarity metric from
+
+ :Citation:
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
+ introducing serendipity into music recommendation, WSDM 2012
+
+ The item indexes in the result are such that i1 <= i2.
+ """
+ if self.df_cosine_similarity is None:
+ pairs = self._get_pairwise_items(df=self.train_df)
+ item_count = self.train_df.groupBy(self.col_item).count()
+
+ self.df_cosine_similarity = (
+ pairs.groupBy("i1", "i2")
+ .count()
+ .join(
+ item_count.select(
+ F.col(self.col_item).alias("i1"),
+ F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"),
+ ),
+ on="i1",
+ )
+ .join(
+ item_count.select(
+ F.col(self.col_item).alias("i2"),
+ F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"),
+ ),
+ on="i2",
+ )
+ .select(
+ "i1",
+ "i2",
+ (
+ F.col("count")
+ / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count"))
+ ).alias(self.sim_col),
+ )
+ .repartition(n_partitions, "i1", "i2")
+ )
+ return self.df_cosine_similarity
+
+ @staticmethod
+ @udf(returnType=DoubleType())
+ def sim_cos(v1, v2):
+ p = 2
+ return float(v1.dot(v2)) / float(v1.norm(p) * v2.norm(p))
+
+ def _get_item_feature_similarity(self, n_partitions):
+ """Cosine similarity metric based on item feature vectors
+
+ The item indexes in the result are such that i1 <= i2.
+ """
+ if self.df_cosine_similarity is None:
+ self.df_cosine_similarity = (
+ self.item_feature_df.select(
+ F.col(self.col_item).alias("i1"),
+ F.col(self.col_item_features).alias("f1"),
+ )
+ .join(
+ self.item_feature_df.select(
+ F.col(self.col_item).alias("i2"),
+ F.col(self.col_item_features).alias("f2"),
+ ),
+ (F.col("i1") <= F.col("i2")),
+ )
+ .select("i1", "i2", self.sim_cos("f1", "f2").alias("sim"))
+ .sort("i1", "i2")
+ .repartition(n_partitions, "i1", "i2")
+ )
+ return self.df_cosine_similarity
+
+ # Diversity metrics
+ def _get_intralist_similarity(self, df):
+ """Intra-list similarity from
+
+ :Citation:
+
+ "Improving Recommendation Lists Through Topic Diversification",
+ Ziegler, McNee, Konstan and Lausen, 2005.
+ """
+ if self.df_intralist_similarity is None:
+ pairs = self._get_pairwise_items(df=df)
+ similarity_df = self._get_cosine_similarity()
+ # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items.
+ # e.g. i1 and i2 have never occurred together.
+ self.df_intralist_similarity = (
+ pairs.join(similarity_df, on=["i1", "i2"], how="left")
+ .fillna(0)
+ .filter(F.col("i1") != F.col("i2"))
+ .groupBy(self.col_user)
+ .agg(F.mean(self.sim_col).alias("avg_il_sim"))
+ .select(self.col_user, "avg_il_sim")
+ )
+ return self.df_intralist_similarity
+
+[docs] def user_diversity(self):
+ """Calculate average diversity of recommendations for each user.
+ The metric definition is based on formula (3) in the following reference:
+
+ :Citation:
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
+ introducing serendipity into music recommendation, WSDM 2012
+
+ Returns:
+ pyspark.sql.dataframe.DataFrame: A dataframe with the following columns: col_user, user_diversity.
+ """
+ if self.df_user_diversity is None:
+ self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df)
+ self.df_user_diversity = (
+ self.df_intralist_similarity.withColumn(
+ "user_diversity", 1 - F.col("avg_il_sim")
+ )
+ .select(self.col_user, "user_diversity")
+ .orderBy(self.col_user)
+ )
+ return self.df_user_diversity
+
+[docs] def diversity(self):
+ """Calculate average diversity of recommendations across all users.
+
+ Returns:
+ float: diversity.
+ """
+ if self.avg_diversity is None:
+ self.df_user_diversity = self.user_diversity()
+ self.avg_diversity = self.df_user_diversity.agg(
+ {"user_diversity": "mean"}
+ ).first()[0]
+ return self.avg_diversity
+
+ # Novelty metrics
+[docs] def historical_item_novelty(self):
+ """Calculate novelty for each item. Novelty is computed as the minus logarithm of
+ (number of interactions with item / total number of interactions). The definition of the metric
+ is based on the following reference using the choice model (eqs. 1 and 6):
+
+ :Citation:
+
+ P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
+ choice, discovery and relevance, ECIR 2011
+
+ The novelty of an item can be defined relative to a set of observed events on the set of all items.
+ These can be events of user choice (item "is picked" by a random user) or user discovery
+ (item "is known" to a random user). The above definition of novelty reflects a factor of item popularity.
+ High novelty values correspond to long-tail items in the density function that few users have interacted
+ with, while low novelty values correspond to popular head items.
+
+ Returns:
+ pyspark.sql.dataframe.DataFrame: A dataframe with the following columns: col_item, item_novelty.
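+
+ Examples:
+ As an illustration of the formula, an item accounting for 1 of 4 training interactions has
+ novelty -log2(1/4) = 2 bits, while an item accounting for half of all interactions has novelty 1 bit.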
+ """
+ if self.df_item_novelty is None:
+ n_records = self.train_df.count()
+ self.df_item_novelty = (
+ self.train_df.groupBy(self.col_item)
+ .count()
+ .withColumn("item_novelty", -F.log2(F.col("count") / n_records))
+ .select(self.col_item, "item_novelty")
+ .orderBy(self.col_item)
+ )
+ return self.df_item_novelty
+
+[docs] def novelty(self):
+ """Calculate the average novelty in a list of recommended items (this assumes that the recommendation list
+ is already computed). Follows section 5 from
+
+ :Citation:
+
+ P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
+ choice, discovery and relevance, ECIR 2011
+
+ Returns:
+ pyspark.sql.dataframe.DataFrame: A dataframe with following columns: novelty.
+ """
+ if self.avg_novelty is None:
+ self.df_item_novelty = self.historical_item_novelty()
+ n_recommendations = self.reco_df.count()
+ self.avg_novelty = (
+ self.reco_df.groupBy(self.col_item)
+ .count()
+ .join(self.df_item_novelty, self.col_item)
+ .selectExpr("sum(count * item_novelty)")
+ .first()[0]
+ / n_recommendations
+ )
+ return self.avg_novelty
+
+ # Serendipity metrics
+[docs] def user_item_serendipity(self):
+ """Calculate serendipity of each item in the recommendations for each user.
+ The metric definition is based on the following references:
+
+ :Citation:
+
+ Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
+ introducing serendipity into music recommendation, WSDM 2012
+
+ Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems,
+ eugeneyan.com, April 2020
+
+ Returns:
+ pyspark.sql.dataframe.DataFrame: A dataframe with columns: col_user, col_item, user_item_serendipity.
+ """
+ # for every col_user, col_item in reco_df, join all interacted items from train_df.
+ # These interacted items are repeated for each item in reco_df for a specific user.
+ if self.df_user_item_serendipity is None:
+ self.df_cosine_similarity = self._get_cosine_similarity()
+ self.df_user_item_serendipity = (
+ self.reco_df.select(
+ self.col_user,
+ self.col_item,
+ F.col(self.col_item).alias(
+ "reco_item_tmp"
+ ), # duplicate col_item so it is preserved through the joins below
+ )
+ .join(
+ self.train_df.select(
+ self.col_user, F.col(self.col_item).alias("train_item_tmp")
+ ),
+ on=[self.col_user],
+ )
+ .select(
+ self.col_user,
+ self.col_item,
+ F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
+ "i1"
+ ),
+ F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
+ "i2"
+ ),
+ )
+ .join(self.df_cosine_similarity, on=["i1", "i2"], how="left")
+ .fillna(0)
+ .groupBy(self.col_user, self.col_item)
+ .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim"))
+ .join(self.reco_df, on=[self.col_user, self.col_item])
+ .withColumn(
+ "user_item_serendipity",
+ (1 - F.col("avg_item2interactedHistory_sim"))
+ * F.col(self.col_relevance),
+ )
+ .select(self.col_user, self.col_item, "user_item_serendipity")
+ .orderBy(self.col_user, self.col_item)
+ )
+ return self.df_user_item_serendipity
+
+[docs] def user_serendipity(self):
+ """Calculate average serendipity for each user's recommendations.
+
+ Returns:
+ pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, user_serendipity.
+ """
+ if self.df_user_serendipity is None:
+ self.df_user_item_serendipity = self.user_item_serendipity()
+ self.df_user_serendipity = (
+ self.df_user_item_serendipity.groupBy(self.col_user)
+ .agg(F.mean("user_item_serendipity").alias("user_serendipity"))
+ .orderBy(self.col_user)
+ )
+ return self.df_user_serendipity
+
+[docs] def serendipity(self):
+ """Calculate average serendipity for recommendations across all users.
+
+ Returns:
+ float: serendipity.
+ """
+ if self.avg_serendipity is None:
+ self.df_user_serendipity = self.user_serendipity()
+ self.avg_serendipity = self.df_user_serendipity.agg(
+ {"user_serendipity": "mean"}
+ ).first()[0]
+ return self.avg_serendipity
+
+ # Coverage metrics
+[docs] def catalog_coverage(self):
+ """Calculate catalog coverage for recommendations across all users.
+ The metric definition is based on the "catalog coverage" definition in the following reference:
+
+ :Citation:
+
+ G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
+ Recommender Systems Handbook pp. 257-297, 2010.
+
+ Returns:
+ float: catalog coverage
+ """
+ # distinct item count in reco_df
+ count_distinct_item_reco = self.reco_df.select(self.col_item).distinct().count()
+ # distinct item count in train_df
+ count_distinct_item_train = (
+ self.train_df.select(self.col_item).distinct().count()
+ )
+
+ # catalog coverage
+ c_coverage = count_distinct_item_reco / count_distinct_item_train
+ return c_coverage
+
+[docs] def distributional_coverage(self):
+ """Calculate distributional coverage for recommendations across all users.
+ The metric definition is based on formula (21) in the following reference:
+
+ :Citation:
+
+ G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
+ Recommender Systems Handbook pp. 257-297, 2010.
+
+ Returns:
+ float: distributional coverage
+ """
+ # In reco_df, how many times each col_item is being recommended
+ df_itemcnt_reco = self.reco_df.groupBy(self.col_item).count()
+
+ # the number of total recommendations
+ count_row_reco = self.reco_df.count()
+ df_entropy = df_itemcnt_reco.withColumn(
+ "p(i)", F.col("count") / count_row_reco
+ ).withColumn("entropy(i)", F.col("p(i)") * F.log2(F.col("p(i)")))
+ # distributional coverage
+ d_coverage = -df_entropy.agg(F.sum("entropy(i)")).collect()[0][0]
+
+ return d_coverage
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import pandas as pd
+import numpy as np
+
+from recommenders.utils.constants import (
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_PREDICTION_COL,
+)
+
+
+[docs]def predict(
+ model,
+ data,
+ usercol=DEFAULT_USER_COL,
+ itemcol=DEFAULT_ITEM_COL,
+ predcol=DEFAULT_PREDICTION_COL,
+):
+ """Computes predictions of a recommender model from Cornac on the data.
+ Can be used for computing rating metrics like RMSE.
+
+ Args:
+ model (cornac.models.Recommender): A recommender model from Cornac
+ data (pandas.DataFrame): The data on which to predict
+ usercol (str): Name of the user column
+ itemcol (str): Name of the item column
+ predcol (str): Name of the prediction column
+
+ Returns:
+ pandas.DataFrame: Dataframe with usercol, itemcol, predcol
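+
+ Examples:
+ A minimal sketch, assuming `bpr` is a Cornac recommender that has already been fit on a
+ training set whose user and item ids also appear in the pandas DataFrame `test`:
+
+ >>> predictions = predict(bpr, test, usercol="userID", itemcol="itemID")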
+ """
+ uid_map = model.train_set.uid_map
+ iid_map = model.train_set.iid_map
+ predictions = [
+ [
+ getattr(row, usercol),
+ getattr(row, itemcol),
+ model.rate(
+ user_idx=uid_map.get(getattr(row, usercol), len(uid_map)),
+ item_idx=iid_map.get(getattr(row, itemcol), len(iid_map)),
+ ),
+ ]
+ for row in data.itertuples()
+ ]
+ predictions = pd.DataFrame(data=predictions, columns=[usercol, itemcol, predcol])
+ return predictions
+
+
+[docs]def predict_ranking(
+ model,
+ data,
+ usercol=DEFAULT_USER_COL,
+ itemcol=DEFAULT_ITEM_COL,
+ predcol=DEFAULT_PREDICTION_COL,
+ remove_seen=False,
+):
+ """Computes predictions of recommender model from Cornac on all users and items in data.
+ It can be used for computing ranking metrics like NDCG.
+
+ Args:
+ model (cornac.models.Recommender): A recommender model from Cornac
+ data (pandas.DataFrame): The data from which to get the users and items
+ usercol (str): Name of the user column
+ itemcol (str): Name of the item column
+ predcol (str): Name of the prediction column
+ remove_seen (bool): Flag to remove (user, item) pairs seen in the training data
+
+ Returns:
+ pandas.DataFrame: Dataframe with usercol, itemcol, predcol
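+
+ Examples:
+ A minimal sketch under the same assumptions as `predict` above (`bpr` is an already
+ fitted Cornac model and `train` is the pandas DataFrame it was fit on):
+
+ >>> all_predictions = predict_ranking(bpr, train, usercol="userID", itemcol="itemID", remove_seen=True)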
+ """
+ users, items, preds = [], [], []
+ item = list(model.train_set.iid_map.keys())
+ for uid, user_idx in model.train_set.uid_map.items():
+ user = [uid] * len(item)
+ users.extend(user)
+ items.extend(item)
+ preds.extend(model.score(user_idx).tolist())
+
+ all_predictions = pd.DataFrame(
+ data={usercol: users, itemcol: items, predcol: preds}
+ )
+
+ if remove_seen:
+ tempdf = pd.concat(
+ [
+ data[[usercol, itemcol]],
+ pd.DataFrame(
+ data=np.ones(data.shape[0]), columns=["dummycol"], index=data.index
+ ),
+ ],
+ axis=1,
+ )
+ merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer")
+ return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)
+ else:
+ return all_predictions
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import random
+import numpy as np
+import pandas as pd
+import scipy.sparse as sp
+
+from recommenders.utils.constants import (
+ DEFAULT_ITEM_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_PREDICTION_COL,
+)
+
+
+[docs]class ImplicitCF(object):
+ """Data processing class for GCN models which use implicit feedback.
+
+ Initialize train and test set, create normalized adjacency matrix and sample data for training epochs.
+
+ """
+
+ def __init__(
+ self,
+ train,
+ test=None,
+ adj_dir=None,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ seed=None,
+ ):
+ """Constructor
+
+ Args:
+ adj_dir (str): Directory to save / load adjacency matrices. If it is None, adjacency
+ matrices will be created and will not be saved.
+ train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
+ test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating).
+ test can be None, if so, we only process the training data.
+ col_user (str): User column name.
+ col_item (str): Item column name.
+ col_rating (str): Rating column name.
+ seed (int): Seed.
+
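+ Examples:
+ A minimal sketch with a toy implicit-feedback DataFrame (illustrative only; at least one
+ non-interacted item must exist per user so that negative sampling can succeed):
+
+ >>> import pandas as pd
+ >>> train = pd.DataFrame({"userID": [1, 1, 2, 2], "itemID": [1, 2, 2, 3], "rating": [1.0, 1.0, 1.0, 1.0]})
+ >>> data = ImplicitCF(train=train, col_user="userID", col_item="itemID", col_rating="rating", seed=42)
+ >>> norm_adj = data.get_norm_adj_mat()
+ >>> users, pos_items, neg_items = data.train_loader(batch_size=2)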
+ """
+ self.user_idx = None
+ self.item_idx = None
+ self.adj_dir = adj_dir
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.col_prediction = col_prediction
+ self.train, self.test = self._data_processing(train, test)
+ self._init_train_data()
+
+ random.seed(seed)
+
+ def _data_processing(self, train, test):
+ """Process the dataset to reindex userID and itemID and only keep records with ratings greater than 0.
+
+ Args:
+ train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
+ test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating).
+ test can be None, if so, we only process the training data.
+
+ Returns:
+ list: train and test pandas.DataFrame Dataset, which have been reindexed and filtered.
+
+ """
+ df = (
+ train
+ if test is None
+ else pd.concat([train, test], axis=0, ignore_index=True)
+ )
+
+ if self.user_idx is None:
+ user_idx = df[[self.col_user]].drop_duplicates()
+ user_idx[self.col_user + "_idx"] = np.arange(len(user_idx))
+ self.n_users = len(user_idx)
+ self.user_idx = user_idx
+
+ self.user2id = dict(
+ zip(user_idx[self.col_user], user_idx[self.col_user + "_idx"])
+ )
+ self.id2user = dict(
+ zip(user_idx[self.col_user + "_idx"], user_idx[self.col_user])
+ )
+
+ if self.item_idx is None:
+ item_idx = df[[self.col_item]].drop_duplicates()
+ item_idx[self.col_item + "_idx"] = np.arange(len(item_idx))
+ self.n_items = len(item_idx)
+ self.item_idx = item_idx
+
+ self.item2id = dict(
+ zip(item_idx[self.col_item], item_idx[self.col_item + "_idx"])
+ )
+ self.id2item = dict(
+ zip(item_idx[self.col_item + "_idx"], item_idx[self.col_item])
+ )
+
+ return self._reindex(train), self._reindex(test)
+
+ def _reindex(self, df):
+ """Process the dataset to reindex userID and itemID and only keep records with ratings greater than 0.
+
+ Args:
+ df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating).
+
+ Returns:
+ list: train and test pandas.DataFrame Dataset, which have been reindexed and filtered.
+
+ """
+
+ if df is None:
+ return None
+
+ df = pd.merge(df, self.user_idx, on=self.col_user, how="left")
+ df = pd.merge(df, self.item_idx, on=self.col_item, how="left")
+
+ df = df[df[self.col_rating] > 0]
+
+ df_reindex = df[
+ [self.col_user + "_idx", self.col_item + "_idx", self.col_rating]
+ ]
+ df_reindex.columns = [self.col_user, self.col_item, self.col_rating]
+
+ return df_reindex
+
+ def _init_train_data(self):
+ """Record items interated with each user in a dataframe self.interact_status, and create adjacency
+ matrix self.R.
+
+ """
+ self.interact_status = (
+ self.train.groupby(self.col_user)[self.col_item]
+ .apply(set)
+ .reset_index()
+ .rename(columns={self.col_item: self.col_item + "_interacted"})
+ )
+ self.R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32)
+ self.R[self.train[self.col_user], self.train[self.col_item]] = 1.0
+
+[docs] def get_norm_adj_mat(self):
+ """Load normalized adjacency matrix if it exists, otherwise create (and save) it.
+
+ Returns:
+ scipy.sparse.csr_matrix: Normalized adjacency matrix.
+
+ """
+ try:
+ if self.adj_dir is None:
+ raise FileNotFoundError
+ norm_adj_mat = sp.load_npz(self.adj_dir + "/norm_adj_mat.npz")
+ print("Already load norm adj matrix.")
+
+ except FileNotFoundError:
+ norm_adj_mat = self.create_norm_adj_mat()
+ if self.adj_dir is not None:
+ sp.save_npz(self.adj_dir + "/norm_adj_mat.npz", norm_adj_mat)
+ return norm_adj_mat
+
+[docs] def create_norm_adj_mat(self):
+ """Create normalized adjacency matrix.
+
+ Returns:
+ scipy.sparse.csr_matrix: Normalized adjacency matrix.
+
+ """
+ adj_mat = sp.dok_matrix(
+ (self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32
+ )
+ adj_mat = adj_mat.tolil()
+ R = self.R.tolil()
+
+ adj_mat[: self.n_users, self.n_users :] = R
+ adj_mat[self.n_users :, : self.n_users] = R.T
+ adj_mat = adj_mat.todok()
+ print("Already create adjacency matrix.")
+
+ rowsum = np.array(adj_mat.sum(1))
+ d_inv = np.power(rowsum + 1e-9, -0.5).flatten()
+ d_inv[np.isinf(d_inv)] = 0.0
+ d_mat_inv = sp.diags(d_inv)
+ norm_adj_mat = d_mat_inv.dot(adj_mat)
+ norm_adj_mat = norm_adj_mat.dot(d_mat_inv)
+ print("Already normalize adjacency matrix.")
+
+ return norm_adj_mat.tocsr()
+
+[docs] def train_loader(self, batch_size):
+ """Sample train data every batch. One positive item and one negative item sampled for each user.
+
+ Args:
+ batch_size (int): Batch size of users.
+
+ Returns:
+ numpy.ndarray, numpy.ndarray, numpy.ndarray:
+ - Sampled users.
+ - Sampled positive items.
+ - Sampled negative items.
+ """
+
+ def sample_neg(x):
+ while True:
+ neg_id = random.randint(0, self.n_items - 1)
+ if neg_id not in x:
+ return neg_id
+
+ indices = range(self.n_users)
+ if self.n_users < batch_size:
+ users = [random.choice(indices) for _ in range(batch_size)]
+ else:
+ users = random.sample(indices, batch_size)
+
+ interact = self.interact_status.iloc[users]
+ pos_items = interact[self.col_item + "_interacted"].apply(
+ lambda x: random.choice(list(x))
+ )
+ neg_items = interact[self.col_item + "_interacted"].apply(
+ lambda x: sample_neg(x)
+ )
+
+ return np.array(users), np.array(pos_items), np.array(neg_items)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+import os
+from sklearn.metrics import (
+ roc_auc_score,
+ log_loss,
+ mean_squared_error,
+ accuracy_score,
+ f1_score,
+)
+import numpy as np
+import yaml
+import zipfile
+import pickle as pkl
+
+from recommenders.datasets.download_utils import maybe_download
+
+
+[docs]def flat_config(config):
+ """Flat config loaded from a yaml file to a flat dict.
+
+ Args:
+ config (dict): Configuration loaded from a yaml file.
+
+ Returns:
+ dict: Configuration dictionary.
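+
+ Examples:
+ A small illustration with hypothetical config categories and keys:
+
+ >>> flat_config({"data": {"batch_size": 400}, "model": {"dim": 32}})
+ {'batch_size': 400, 'dim': 32}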
+ """
+ f_config = {}
+ category = config.keys()
+ for cate in category:
+ for key, val in config[cate].items():
+ f_config[key] = val
+ return f_config
+
+
+[docs]def check_type(config):
+ """Check that the config parameters are the correct type
+
+ Args:
+ config (dict): Configuration dictionary.
+
+ Raises:
+ TypeError: If the parameters are not the correct type.
+ """
+
+ int_parameters = [
+ "word_size",
+ "entity_size",
+ "doc_size",
+ "history_size",
+ "FEATURE_COUNT",
+ "FIELD_COUNT",
+ "dim",
+ "epochs",
+ "batch_size",
+ "show_step",
+ "save_epoch",
+ "PAIR_NUM",
+ "DNN_FIELD_NUM",
+ "attention_layer_sizes",
+ "n_user",
+ "n_item",
+ "n_user_attr",
+ "n_item_attr",
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "user_embedding_dim",
+ "max_seq_length",
+ "hidden_size",
+ "T",
+ "L",
+ "n_v",
+ "n_h",
+ "kernel_size",
+ "min_seq_length",
+ "attention_size",
+ "epochs",
+ "batch_size",
+ "show_step",
+ "save_epoch",
+ "train_num_ngs",
+ ]
+ for param in int_parameters:
+ if param in config and not isinstance(config[param], int):
+ raise TypeError("Parameters {0} must be int".format(param))
+
+ float_parameters = [
+ "init_value",
+ "learning_rate",
+ "embed_l2",
+ "embed_l1",
+ "layer_l2",
+ "layer_l1",
+ "mu",
+ ]
+ for param in float_parameters:
+ if param in config and not isinstance(config[param], float):
+ raise TypeError("Parameters {0} must be float".format(param))
+
+ str_parameters = [
+ "train_file",
+ "eval_file",
+ "test_file",
+ "infer_file",
+ "method",
+ "load_model_name",
+ "infer_model_name",
+ "loss",
+ "optimizer",
+ "init_method",
+ "attention_activation",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ ]
+ for param in str_parameters:
+ if param in config and not isinstance(config[param], str):
+ raise TypeError("Parameters {0} must be str".format(param))
+
+ list_parameters = [
+ "layer_sizes",
+ "activation",
+ "dropout",
+ "att_fcn_layer_sizes",
+ "dilations",
+ ]
+ for param in list_parameters:
+ if param in config and not isinstance(config[param], list):
+ raise TypeError("Parameters {0} must be list".format(param))
+
+
+[docs]def check_nn_config(f_config):
+ """Check neural networks configuration.
+
+ Args:
+ f_config (dict): Neural network configuration.
+
+ Raises:
+ ValueError: If the parameters are not correct.
+ """
+ if f_config["model_type"] in ["fm", "FM"]:
+ required_parameters = ["FEATURE_COUNT", "dim", "loss", "data_format", "method"]
+ elif f_config["model_type"] in ["lr", "LR"]:
+ required_parameters = ["FEATURE_COUNT", "loss", "data_format", "method"]
+ elif f_config["model_type"] in ["dkn", "DKN"]:
+ required_parameters = [
+ "doc_size",
+ "history_size",
+ "wordEmb_file",
+ "entityEmb_file",
+ "contextEmb_file",
+ "news_feature_file",
+ "user_history_file",
+ "word_size",
+ "entity_size",
+ "use_entity",
+ "use_context",
+ "data_format",
+ "dim",
+ "layer_sizes",
+ "activation",
+ "attention_activation",
+ "attention_activation",
+ "attention_dropout",
+ "loss",
+ "data_format",
+ "dropout",
+ "method",
+ "num_filters",
+ "filter_sizes",
+ ]
+ elif f_config["model_type"] in ["exDeepFM", "xDeepFM"]:
+ required_parameters = [
+ "FIELD_COUNT",
+ "FEATURE_COUNT",
+ "method",
+ "dim",
+ "layer_sizes",
+ "cross_layer_sizes",
+ "activation",
+ "loss",
+ "data_format",
+ "dropout",
+ ]
+ if f_config["model_type"] in ["gru", "GRU"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "hidden_size",
+ ]
+ elif f_config["model_type"] in ["caser", "CASER", "Caser"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "user_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "T",
+ "L",
+ "n_v",
+ "n_h",
+ "min_seq_length",
+ ]
+ elif f_config["model_type"] in ["asvd", "ASVD", "a2svd", "A2SVD"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ ]
+ elif f_config["model_type"] in ["slirec", "sli_rec", "SLI_REC", "Sli_rec"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "attention_size",
+ "hidden_size",
+ "att_fcn_layer_sizes",
+ ]
+ elif f_config["model_type"] in [
+ "nextitnet",
+ "next_it_net",
+ "NextItNet",
+ "NEXT_IT_NET",
+ ]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "user_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "dilations",
+ "kernel_size",
+ "min_seq_length",
+ ]
+ else:
+ required_parameters = []
+
+ # check required parameters
+ for param in required_parameters:
+ if param not in f_config:
+ raise ValueError("Parameters {0} must be set".format(param))
+
+ if f_config["model_type"] in ["exDeepFM", "xDeepFM"]:
+ if f_config["data_format"] != "ffm":
+ raise ValueError(
+ "For xDeepFM model, data format must be 'ffm', but your set is {0}".format(
+ f_config["data_format"]
+ )
+ )
+ elif f_config["model_type"] in ["dkn", "DKN"]:
+ if f_config["data_format"] != "dkn":
+ raise ValueError(
+ "For dkn model, data format must be 'dkn', but your set is {0}".format(
+ f_config["data_format"]
+ )
+ )
+ check_type(f_config)
+
+
+[docs]def load_yaml(filename):
+ """Load a yaml file.
+
+ Args:
+ filename (str): Filename.
+
+ Returns:
+ dict: Dictionary.
+ """
+ try:
+ with open(filename, "r") as f:
+ config = yaml.load(f, yaml.SafeLoader)
+ return config
+ except FileNotFoundError: # for file not found
+ raise
+ except Exception: # for other exceptions
+ raise IOError("load {0} error!".format(filename))
+
+
+[docs]class HParams:
+ """Class for holding hyperparameters for DeepRec algorithms."""
+
+ def __init__(self, hparams_dict):
+ """Create an HParams object from a dictionary of hyperparameter values.
+
+ Args:
+ hparams_dict (dict): Dictionary with the model hyperparameters.
+ """
+ for val in hparams_dict.values():
+ if not (
+ isinstance(val, int)
+ or isinstance(val, float)
+ or isinstance(val, str)
+ or isinstance(val, list)
+ ):
+ raise ValueError(
+ "Hyperparameter value {} should be integer, float, string or list.".format(
+ val
+ )
+ )
+ self._values = hparams_dict
+ for hparam in hparams_dict:
+ setattr(self, hparam, hparams_dict[hparam])
+
+ def __repr__(self):
+ return "HParams object with values {}".format(self._values.__repr__())
+
+[docs] def values(self):
+ """Return the hyperparameter values as a dictionary.
+
+ Returns:
+ dict: Dictionary with the hyperparameter values.
+ """
+ return self._values
+
+
+[docs]def create_hparams(flags):
+ """Create the model hyperparameters.
+
+ Args:
+ flags (dict): Dictionary with the model requirements.
+
+ Returns:
+ HParams: Hyperparameter object.
+ """
+ init_dict = {
+ # dkn
+ "use_entity": True,
+ "use_context": True,
+ # model
+ "cross_activation": "identity",
+ "user_dropout": False,
+ "dropout": [0.0],
+ "attention_dropout": 0.0,
+ "load_saved_model": False,
+ "fast_CIN_d": 0,
+ "use_Linear_part": False,
+ "use_FM_part": False,
+ "use_CIN_part": False,
+ "use_DNN_part": False,
+ # train
+ "init_method": "tnormal",
+ "init_value": 0.01,
+ "embed_l2": 0.0,
+ "embed_l1": 0.0,
+ "layer_l2": 0.0,
+ "layer_l1": 0.0,
+ "cross_l2": 0.0,
+ "cross_l1": 0.0,
+ "reg_kg": 0.0,
+ "learning_rate": 0.001,
+ "lr_rs": 1,
+ "lr_kg": 0.5,
+ "kg_training_interval": 5,
+ "max_grad_norm": 2,
+ "is_clip_norm": 0,
+ "dtype": 32,
+ "optimizer": "adam",
+ "epochs": 10,
+ "batch_size": 1,
+ "enable_BN": False,
+ # show info
+ "show_step": 1,
+ "save_model": True,
+ "save_epoch": 5,
+ "write_tfevents": False,
+ # sequential
+ "train_num_ngs": 4,
+ "need_sample": True,
+ "embedding_dropout": 0.0,
+ "EARLY_STOP": 100,
+ # caser,
+ "min_seq_length": 1,
+ # sum
+ "slots": 5,
+ "cell": "SUM",
+ }
+ init_dict.update(flags)
+ return HParams(init_dict)
+
+
+[docs]def prepare_hparams(yaml_file=None, **kwargs):
+ """Prepare the model hyperparameters and check that all have the correct value.
+
+ Args:
+ yaml_file (str): YAML file as configuration.
+
+ Returns:
+ HParams: Hyperparameter object.
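+
+ Examples:
+ A minimal sketch that builds hyperparameters for an `fm`-type model purely from keyword
+ arguments (the values below are illustrative):
+
+ >>> hparams = prepare_hparams(
+ ... model_type="fm", FEATURE_COUNT=100, dim=8, loss="log_loss", data_format="fm",
+ ... method="classification", learning_rate=0.001
+ ... )
+ >>> hparams.dim
+ 8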
+ """
+ if yaml_file is not None:
+ config = load_yaml(yaml_file)
+ config = flat_config(config)
+ else:
+ config = {}
+
+ if kwargs:
+ for name, value in kwargs.items():
+ config[name] = value
+
+ check_nn_config(config)
+ return create_hparams(config)
+
+
+[docs]def download_deeprec_resources(azure_container_url, data_path, remote_resource_name):
+ """Download resources.
+
+ Args:
+ azure_container_url (str): URL of Azure container.
+ data_path (str): Path to download the resources.
+ remote_resource_name (str): Name of the resource.
+ """
+ os.makedirs(data_path, exist_ok=True)
+ remote_path = azure_container_url + remote_resource_name
+ maybe_download(remote_path, remote_resource_name, data_path)
+ zip_ref = zipfile.ZipFile(os.path.join(data_path, remote_resource_name), "r")
+ zip_ref.extractall(data_path)
+ zip_ref.close()
+ os.remove(os.path.join(data_path, remote_resource_name))
+
+
+[docs]def mrr_score(y_true, y_score):
+ """Computing mrr score metric.
+
+ Args:
+ y_true (np.ndarray): Ground-truth labels.
+ y_score (np.ndarray): Predicted labels.
+
+ Returns:
+ float: MRR score.
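+
+ Examples:
+ A small illustration with toy labels and scores:
+
+ >>> import numpy as np
+ >>> mrr = mrr_score(np.array([0, 1, 0]), np.array([0.1, 0.9, 0.2]))  # 1.0: the only positive item is ranked first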
+ """
+ order = np.argsort(y_score)[::-1]
+ y_true = np.take(y_true, order)
+ rr_score = y_true / (np.arange(len(y_true)) + 1)
+ return np.sum(rr_score) / np.sum(y_true)
+
+
+[docs]def ndcg_score(y_true, y_score, k=10):
+ """Computing ndcg score metric at k.
+
+ Args:
+ y_true (np.ndarray): Ground-truth labels.
+ y_score (np.ndarray): Predicted labels.
+
+ Returns:
+ numpy.ndarray: ndcg scores.
+ """
+ best = dcg_score(y_true, y_true, k)
+ actual = dcg_score(y_true, y_score, k)
+ return actual / best
+
+
+[docs]def hit_score(y_true, y_score, k=10):
+ """Computing hit score metric at k.
+
+ Args:
+ y_true (np.ndarray): ground-truth labels.
+ y_score (np.ndarray): predicted labels.
+
+ Returns:
+ np.ndarray: hit score.
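+
+ Examples:
+ A small illustration with toy labels and scores:
+
+ >>> import numpy as np
+ >>> hit = hit_score(np.array([1, 0, 0, 0]), np.array([0.2, 0.9, 0.3, 0.1]), k=2)  # 0: the relevant item is ranked third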
+ """
+ ground_truth = np.where(y_true == 1)[0]
+ argsort = np.argsort(y_score)[::-1][:k]
+ for idx in argsort:
+ if idx in ground_truth:
+ return 1
+ return 0
+
+
+[docs]def dcg_score(y_true, y_score, k=10):
+ """Computing dcg score metric at k.
+
+ Args:
+ y_true (np.ndarray): Ground-truth labels.
+ y_score (np.ndarray): Predicted labels.
+
+ Returns:
+ np.ndarray: dcg scores.
+ """
+ k = min(np.shape(y_true)[-1], k)
+ order = np.argsort(y_score)[::-1]
+ y_true = np.take(y_true, order[:k])
+ gains = 2**y_true - 1
+ discounts = np.log2(np.arange(len(y_true)) + 2)
+ return np.sum(gains / discounts)
+
+
+[docs]def cal_metric(labels, preds, metrics):
+ """Calculate metrics.
+
+ Available options are: `auc`, `rmse`, `logloss`, `acc` (accuracy), `f1`, `mean_mrr`,
+ `ndcg` (format like: ndcg@2;4;6;8), `hit` (format like: hit@2;4;6;8), `group_auc`.
+
+ Args:
+ labels (array-like): Labels.
+ preds (array-like): Predictions.
+ metrics (list): List of metric names.
+
+ Return:
+ dict: Metrics.
+
+ Examples:
+ >>> cal_metric(labels, preds, ["ndcg@2;4;6", "group_auc"])
+ {'ndcg@2': 0.4026, 'ndcg@4': 0.4953, 'ndcg@6': 0.5346, 'group_auc': 0.8096}
+
+ """
+ res = {}
+ for metric in metrics:
+ if metric == "auc":
+ auc = roc_auc_score(np.asarray(labels), np.asarray(preds))
+ res["auc"] = round(auc, 4)
+ elif metric == "rmse":
+ rmse = mean_squared_error(np.asarray(labels), np.asarray(preds))
+ res["rmse"] = np.sqrt(round(rmse, 4))
+ elif metric == "logloss":
+ # avoid logloss nan
+ preds = [max(min(p, 1.0 - 10e-12), 10e-12) for p in preds]
+ logloss = log_loss(np.asarray(labels), np.asarray(preds))
+ res["logloss"] = round(logloss, 4)
+ elif metric == "acc":
+ pred = np.asarray(preds)
+ pred[pred >= 0.5] = 1
+ pred[pred < 0.5] = 0
+ acc = accuracy_score(np.asarray(labels), pred)
+ res["acc"] = round(acc, 4)
+ elif metric == "f1":
+ pred = np.asarray(preds)
+ pred[pred >= 0.5] = 1
+ pred[pred < 0.5] = 0
+ f1 = f1_score(np.asarray(labels), pred)
+ res["f1"] = round(f1, 4)
+ elif metric == "mean_mrr":
+ mean_mrr = np.mean(
+ [
+ mrr_score(each_labels, each_preds)
+ for each_labels, each_preds in zip(labels, preds)
+ ]
+ )
+ res["mean_mrr"] = round(mean_mrr, 4)
+ elif metric.startswith("ndcg"): # format like: ndcg@2;4;6;8
+ ndcg_list = [1, 2]
+ ks = metric.split("@")
+ if len(ks) > 1:
+ ndcg_list = [int(token) for token in ks[1].split(";")]
+ for k in ndcg_list:
+ ndcg_temp = np.mean(
+ [
+ ndcg_score(each_labels, each_preds, k)
+ for each_labels, each_preds in zip(labels, preds)
+ ]
+ )
+ res["ndcg@{0}".format(k)] = round(ndcg_temp, 4)
+ elif metric.startswith("hit"): # format like: hit@2;4;6;8
+ hit_list = [1, 2]
+ ks = metric.split("@")
+ if len(ks) > 1:
+ hit_list = [int(token) for token in ks[1].split(";")]
+ for k in hit_list:
+ hit_temp = np.mean(
+ [
+ hit_score(each_labels, each_preds, k)
+ for each_labels, each_preds in zip(labels, preds)
+ ]
+ )
+ res["hit@{0}".format(k)] = round(hit_temp, 4)
+ elif metric == "group_auc":
+ group_auc = np.mean(
+ [
+ roc_auc_score(each_labels, each_preds)
+ for each_labels, each_preds in zip(labels, preds)
+ ]
+ )
+ res["group_auc"] = round(group_auc, 4)
+ else:
+ raise ValueError("Metric {0} not defined".format(metric))
+ return res
+
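+# Minimal usage sketch for cal_metric (hypothetical data, not part of the library).
+# Point-wise metrics such as "auc" take flat label/prediction arrays, while group-wise
+# metrics such as "mean_mrr", "ndcg@...", "hit@..." and "group_auc" expect `labels` and
+# `preds` to be lists of per-impression arrays:
+#
+#     flat_labels, flat_preds = [1, 0, 0, 1], [0.9, 0.3, 0.4, 0.7]
+#     grouped_labels = [[1, 0, 0], [0, 1]]
+#     grouped_preds = [[0.9, 0.3, 0.4], [0.2, 0.8]]
+#     cal_metric(flat_labels, flat_preds, ["auc", "logloss"])
+#     cal_metric(grouped_labels, grouped_preds, ["mean_mrr", "ndcg@2;3", "group_auc"])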
+
+[docs]def load_dict(filename):
+ """Load the vocabularies.
+
+ Args:
+ filename (str): Filename of user, item or category vocabulary.
+
+ Returns:
+ dict: A saved vocabulary.
+ """
+ with open(filename, "rb") as f:
+ f_pkl = pkl.load(f)
+ return f_pkl
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+import tensorflow as tf
+from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator
+
+
+[docs]class DKNItem2itemTextIterator(DKNTextIterator):
+ def __init__(self, hparams, graph):
+        """This iterator is for the item-to-item recommendation version of DKN.
+ The tutorial can be found `on this notebook <https://github.com/microsoft/recommenders/blob/main/examples/07_tutorials/KDD2020-tutorial/step4_run_dkn_item2item.ipynb>`_.
+
+        Compared with user-to-item recommendations, the user behavior module is not needed,
+        so the placeholders are simplified compared with the original DKNTextIterator.
+
+ Args:
+ hparams (object): Global hyper-parameters.
+ graph (object): The running graph.
+ """
+ self.hparams = hparams
+ self.graph = graph
+ self.neg_num = hparams.neg_num
+ self.batch_size = hparams.batch_size * (self.neg_num + 2)
+ self.doc_size = hparams.doc_size
+ with self.graph.as_default():
+ self.candidate_news_index_batch = tf.compat.v1.placeholder(
+ tf.int64, [self.batch_size, self.doc_size], name="candidate_news_index"
+ )
+ self.candidate_news_entity_index_batch = tf.compat.v1.placeholder(
+ tf.int64,
+ [self.batch_size, self.doc_size],
+ name="candidate_news_entity_index",
+ )
+
+        self._load_necessary_files()
+
+    def _load_necessary_files(self):
+        """Only one feature file is needed: `news_feature_file`.
+        This function loads the news articles' features into two dictionaries: `self.news_word_index` and `self.news_entity_index`.
+ """
+ hparams = self.hparams
+ self.news_word_index = {}
+ self.news_entity_index = {}
+ with open(hparams.news_feature_file, "r") as rd:
+ while True:
+ line = rd.readline()
+ if not line:
+ break
+ newsid, word_index, entity_index = line.strip().split(" ")
+ self.news_word_index[newsid] = [
+ int(item) for item in word_index.split(",")
+ ]
+ self.news_entity_index[newsid] = [
+ int(item) for item in entity_index.split(",")
+ ]
+
+[docs] def load_data_from_file(self, infile):
+        """Yield mini-batches of features by looking up the `news_word_index` and
+        `news_entity_index` dictionaries according to the news article's ID.
+
+        Args:
+            infile (str): File path. Each line of `infile` is a news article's ID.
+
+        Yields:
+            dict, list, int:
+            - A dictionary that maps graph elements to numpy arrays.
+            - A list of news article IDs.
+            - Size of the data in the batch.
+ """
+ newsid_list = []
+ candidate_news_index_batch = []
+ candidate_news_entity_index_batch = []
+ cnt = 0
+ with open(infile, "r") as rd:
+ while True:
+ line = rd.readline()
+ if not line:
+ break
+ newsid = line.strip()
+ word_index, entity_index = (
+ self.news_word_index[newsid],
+ self.news_entity_index[newsid],
+ )
+ newsid_list.append(newsid)
+
+ candidate_news_index_batch.append(word_index)
+ candidate_news_entity_index_batch.append(entity_index)
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ res = self._convert_infer_data(
+ candidate_news_index_batch,
+ candidate_news_entity_index_batch,
+ )
+ data_size = self.batch_size
+ yield self.gen_infer_feed_dict(res), newsid_list, data_size
+ candidate_news_index_batch = []
+ candidate_news_entity_index_batch = []
+ newsid_list = []
+ cnt = 0
+
+ if cnt > 0:
+ data_size = cnt
+ while cnt < self.batch_size:
+ candidate_news_index_batch.append(
+ candidate_news_index_batch[cnt % data_size]
+ )
+ candidate_news_entity_index_batch.append(
+ candidate_news_entity_index_batch[cnt % data_size]
+ )
+ cnt += 1
+ res = self._convert_infer_data(
+ candidate_news_index_batch,
+ candidate_news_entity_index_batch,
+ )
+ yield self.gen_infer_feed_dict(res), newsid_list, data_size
+
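+# Minimal usage sketch (hypothetical hparams and file name, not part of the library): the
+# iterator batches `hparams.batch_size * (hparams.neg_num + 2)` documents at a time and
+# yields a feed_dict, the corresponding news IDs and the effective batch size.
+#
+#     graph = tf.Graph()
+#     iterator = DKNItem2itemTextIterator(hparams, graph)  # hparams assumed to be built elsewhere
+#     for feed_dict, news_ids, data_size in iterator.load_data_from_file("news_ids.txt"):
+#         pass  # run a DKN item2item inference step with feed_dict here
+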
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import numpy as np
+
+from recommenders.models.deeprec.io.iterator import BaseIterator
+
+
+__all__ = ["DKNTextIterator"]
+
+
+[docs]class DKNTextIterator(BaseIterator):
+ """Data loader for the DKN model.
+    DKN requires a special data format, where each instance contains a label, the candidate news article,
+    and the user's clicked news articles. Articles are represented by title words and title entities, and the
+    words and entities are aligned.
+
+    The iterator does not load the whole dataset into memory. Instead, it loads data
+    per mini-batch, so that large files can be used as input.
+ """
+
+ def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"):
+ """Initialize an iterator. Create necessary placeholders for the model.
+
+ Args:
+            hparams (object): Global hyper-parameters. Some key settings, such as #_feature and #_field, are there.
+            graph (object): The running graph. All created placeholders will be added to this graph.
+            col_spliter (str): Column splitter in one line.
+            ID_spliter (str): ID splitter in one line.
+ """
+ self.col_spliter = col_spliter
+ self.ID_spliter = ID_spliter
+ self.batch_size = hparams.batch_size
+ self.doc_size = hparams.doc_size
+ self.history_size = hparams.history_size
+
+ self.graph = graph
+ with self.graph.as_default():
+ self.labels = tf.compat.v1.placeholder(tf.float32, [None, 1], name="label")
+ self.candidate_news_index_batch = tf.compat.v1.placeholder(
+ tf.int64, [self.batch_size, self.doc_size], name="candidate_news_index"
+ )
+ self.click_news_index_batch = tf.compat.v1.placeholder(
+ tf.int64,
+ [self.batch_size, self.history_size, self.doc_size],
+ name="click_news_index",
+ )
+ self.candidate_news_entity_index_batch = tf.compat.v1.placeholder(
+ tf.int64,
+ [self.batch_size, self.doc_size],
+ name="candidate_news_entity_index",
+ )
+ self.click_news_entity_index_batch = tf.compat.v1.placeholder(
+ tf.int64,
+ [self.batch_size, self.history_size, self.doc_size],
+ name="click_news_entity_index",
+ )
+ self.news_word_index = {}
+ self.news_entity_index = {}
+ with tf.io.gfile.GFile(hparams.news_feature_file, "r") as rd:
+ for line in rd:
+ newsid, word_index, entity_index = line.strip().split(col_spliter)
+ self.news_word_index[newsid] = [
+ int(item) for item in word_index.split(",")
+ ]
+ self.news_entity_index[newsid] = [
+ int(item) for item in entity_index.split(",")
+ ]
+ self.user_history = {}
+ with tf.io.gfile.GFile(hparams.user_history_file, "r") as rd:
+ for line in rd:
+ if len(line.strip().split(col_spliter)) == 1:
+ userid = line.strip()
+ user_history = []
+ else:
+ userid, user_history_string = line.strip().split(col_spliter)
+ user_history = user_history_string.split(",")
+ click_news_index = []
+ click_news_entity_index = []
+ if len(user_history) > self.history_size:
+ user_history = user_history[-self.history_size :]
+ for newsid in user_history:
+ click_news_index.append(self.news_word_index[newsid])
+ click_news_entity_index.append(self.news_entity_index[newsid])
+ for i in range(self.history_size - len(user_history)):
+ click_news_index.append(np.zeros(self.doc_size))
+ click_news_entity_index.append(np.zeros(self.doc_size))
+ self.user_history[userid] = (click_news_index, click_news_entity_index)
+
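+    # Expected input formats (illustrative lines, assuming the default col_spliter " " and
+    # doc_size = 3):
+    #
+    #   news_feature_file, one news article per line:
+    #       N1 12,7,0 3,0,0
+    #   user_history_file, one user per line (a line with only a user id means an empty history):
+    #       U1 N1,N5,N9
+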
+[docs] def parser_one_line(self, line):
+ """Parse one string line into feature values.
+
+ Args:
+ line (str): a string indicating one instance
+
+ Returns:
+ list: Parsed results including `label`, `candidate_news_index`, `click_news_index`,
+ `candidate_news_entity_index`, `click_news_entity_index`, `impression_id`.
+
+ """
+ impression_id = 0
+ words = line.strip().split(self.ID_spliter)
+ if len(words) == 2:
+ impression_id = words[1].strip()
+
+ cols = words[0].strip().split(self.col_spliter)
+ label = float(cols[0])
+
+ userid = cols[1]
+ candidate_news = cols[2]
+
+ candidate_news_index = self.news_word_index[candidate_news]
+ candidate_news_entity_index = self.news_entity_index[candidate_news]
+ click_news_index = self.user_history[userid][0]
+ click_news_entity_index = self.user_history[userid][1]
+
+ return (
+ label,
+ candidate_news_index,
+ click_news_index,
+ candidate_news_entity_index,
+ click_news_entity_index,
+ impression_id,
+ )
+
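+    # Illustrative instance line parsed by parser_one_line (assuming the default
+    # col_spliter " " and ID_spliter "%"); the trailing impression id is optional:
+    #
+    #       1 U1 N3 %10001
+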
+[docs] def load_data_from_file(self, infile):
+ """Read and parse data from a file.
+
+ Args:
+ infile (str): text input file. Each line in this file is an instance.
+
+ Yields:
+            dict, list, int:
+            - A dictionary of parsed results, in the format of a graph `feed_dict`.
+            - Impression ID list.
+            - Size of the data in the batch.
+ """
+ candidate_news_index_batch = []
+ click_news_index_batch = []
+ candidate_news_entity_index_batch = []
+ click_news_entity_index_batch = []
+ label_list = []
+ impression_id_list = []
+ cnt = 0
+
+ with tf.io.gfile.GFile(infile, "r") as rd:
+ for line in rd:
+ (
+ label,
+ candidate_news_index,
+ click_news_index,
+ candidate_news_entity_index,
+ click_news_entity_index,
+ impression_id,
+ ) = self.parser_one_line(line)
+
+ candidate_news_index_batch.append(candidate_news_index)
+ click_news_index_batch.append(click_news_index)
+ candidate_news_entity_index_batch.append(candidate_news_entity_index)
+ click_news_entity_index_batch.append(click_news_entity_index)
+ label_list.append(label)
+ impression_id_list.append(impression_id)
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ res = self._convert_data(
+ label_list,
+ candidate_news_index_batch,
+ click_news_index_batch,
+ candidate_news_entity_index_batch,
+ click_news_entity_index_batch,
+ impression_id_list,
+ )
+ data_size = self.batch_size
+ yield self.gen_feed_dict(res), impression_id_list, data_size
+ candidate_news_index_batch = []
+ click_news_index_batch = []
+ candidate_news_entity_index_batch = []
+ click_news_entity_index_batch = []
+ label_list = []
+ impression_id_list = []
+ cnt = 0
+ if cnt > 0:
+ data_size = cnt
+ while cnt < self.batch_size:
+ candidate_news_index_batch.append(
+ candidate_news_index_batch[cnt % data_size]
+ )
+ click_news_index_batch.append(
+ click_news_index_batch[cnt % data_size]
+ )
+ candidate_news_entity_index_batch.append(
+ candidate_news_entity_index_batch[cnt % data_size]
+ )
+ click_news_entity_index_batch.append(
+ click_news_entity_index_batch[cnt % data_size]
+ )
+ label_list.append(label_list[cnt % data_size])
+ impression_id_list.append(impression_id_list[cnt % data_size])
+ cnt += 1
+ res = self._convert_data(
+ label_list,
+ candidate_news_index_batch,
+ click_news_index_batch,
+ candidate_news_entity_index_batch,
+ click_news_entity_index_batch,
+ impression_id_list,
+ )
+ yield self.gen_feed_dict(res), impression_id_list, data_size
+
+[docs] def load_infer_data_from_file(self, infile):
+        """Read and parse data from a file for inferring document embeddings.
+
+ Args:
+ infile (str): text input file. Each line in this file is an instance.
+
+ Yields:
+            dict, list, int:
+            - A dictionary of parsed results, in the format of a graph `feed_dict`.
+            - News ID list.
+            - Size of the data in the batch.
+ """
+ newsid_list = []
+ candidate_news_index_batch = []
+ candidate_news_entity_index_batch = []
+ cnt = 0
+ with tf.io.gfile.GFile(infile, "r") as rd:
+ for line in rd:
+ newsid, word_index, entity_index = line.strip().split(" ")
+ newsid_list.append(newsid)
+ candidate_news_index = []
+ candidate_news_entity_index = []
+ for item in word_index.split(","):
+ candidate_news_index.append(int(item))
+ for item in entity_index.split(","):
+ candidate_news_entity_index.append(int(item))
+
+ candidate_news_index_batch.append(candidate_news_index)
+ candidate_news_entity_index_batch.append(candidate_news_entity_index)
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ res = self._convert_infer_data(
+ candidate_news_index_batch, candidate_news_entity_index_batch
+ )
+ data_size = self.batch_size
+ yield self.gen_infer_feed_dict(res), newsid_list, data_size
+ candidate_news_index_batch = []
+ candidate_news_entity_index_batch = []
+ newsid_list = []
+ cnt = 0
+
+ if cnt > 0:
+ data_size = cnt
+ while cnt < self.batch_size:
+ candidate_news_index_batch.append(
+ candidate_news_index_batch[cnt % data_size]
+ )
+ candidate_news_entity_index_batch.append(
+ candidate_news_entity_index_batch[cnt % data_size]
+ )
+ cnt += 1
+ res = self._convert_infer_data(
+ candidate_news_index_batch, candidate_news_entity_index_batch
+ )
+ yield self.gen_infer_feed_dict(res), newsid_list, data_size
+
+ def _convert_data(
+ self,
+ label_list,
+ candidate_news_index_batch,
+ click_news_index_batch,
+ candidate_news_entity_index_batch,
+ click_news_entity_index_batch,
+ impression_id_list,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+            label_list (list): A list of ground-truth labels.
+            candidate_news_index_batch (list): Word indices of the candidate news articles.
+            click_news_index_batch (list): Word indices of the user's clicked news articles.
+            candidate_news_entity_index_batch (list): Entity indices of the candidate news articles.
+            click_news_entity_index_batch (list): Entity indices of the user's clicked news articles.
+            impression_id_list (list): The session's impression indices.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+ res = {}
+ res["labels"] = np.asarray([[label] for label in label_list], dtype=np.float32)
+ res["candidate_news_index_batch"] = np.asarray(
+ candidate_news_index_batch, dtype=np.int64
+ )
+ res["click_news_index_batch"] = np.asarray(
+ click_news_index_batch, dtype=np.int64
+ )
+ res["candidate_news_entity_index_batch"] = np.asarray(
+ candidate_news_entity_index_batch, dtype=np.int64
+ )
+ res["click_news_entity_index_batch"] = np.asarray(
+ click_news_entity_index_batch, dtype=np.int64
+ )
+ res["impression_id"] = np.asarray(impression_id_list, dtype=np.int64)
+ return res
+
+ def _convert_infer_data(
+ self, candidate_news_index_batch, candidate_news_entity_index_batch
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+            candidate_news_index_batch (list): Word indices of the candidate news articles.
+            candidate_news_entity_index_batch (list): Entity indices of the candidate news articles.
+
+        Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+ res = {}
+ res["candidate_news_index_batch"] = np.asarray(
+ candidate_news_index_batch, dtype=np.int64
+ )
+ res["candidate_news_entity_index_batch"] = np.asarray(
+ candidate_news_entity_index_batch, dtype=np.int64
+ )
+ return res
+
+[docs] def gen_feed_dict(self, data_dict):
+ """Construct a dictionary that maps graph elements to values.
+
+ Args:
+ data_dict (dict): a dictionary that maps string name to numpy arrays.
+
+ Returns:
+ dict: A dictionary that maps graph elements to numpy arrays.
+
+ """
+ feed_dict = {
+ self.labels: data_dict["labels"].reshape([-1, 1]),
+ self.candidate_news_index_batch: data_dict[
+ "candidate_news_index_batch"
+ ].reshape([self.batch_size, self.doc_size]),
+ self.click_news_index_batch: data_dict["click_news_index_batch"].reshape(
+ [self.batch_size, self.history_size, self.doc_size]
+ ),
+ self.candidate_news_entity_index_batch: data_dict[
+ "candidate_news_entity_index_batch"
+ ].reshape([-1, self.doc_size]),
+ self.click_news_entity_index_batch: data_dict[
+ "click_news_entity_index_batch"
+ ].reshape([-1, self.history_size, self.doc_size]),
+ }
+ return feed_dict
+
+[docs] def gen_infer_feed_dict(self, data_dict):
+ """Construct a dictionary that maps graph elements to values.
+
+ Args:
+ data_dict (dict): a dictionary that maps string name to numpy arrays.
+
+ Returns:
+ dict: A dictionary that maps graph elements to numpy arrays.
+
+ """
+ feed_dict = {
+ self.candidate_news_index_batch: data_dict[
+ "candidate_news_index_batch"
+ ].reshape([self.batch_size, self.doc_size]),
+ self.candidate_news_entity_index_batch: data_dict[
+ "candidate_news_entity_index_batch"
+ ].reshape([-1, self.doc_size]),
+ }
+ return feed_dict
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow as tf
+import abc
+
+
+[docs]class BaseIterator(object):
+ """Abstract base iterator class"""
+
+[docs] @abc.abstractmethod
+ def parser_one_line(self, line):
+ """Abstract method. Parse one string line into feature values.
+
+ Args:
+ line (str): A string indicating one instance.
+ """
+ pass
+
+[docs] @abc.abstractmethod
+ def load_data_from_file(self, infile):
+ """Abstract method. Read and parse data from a file.
+
+ Args:
+ infile (str): Text input file. Each line in this file is an instance.
+ """
+ pass
+
+ @abc.abstractmethod
+ def _convert_data(self, labels, features):
+ pass
+
+[docs] @abc.abstractmethod
+ def gen_feed_dict(self, data_dict):
+ """Abstract method. Construct a dictionary that maps graph elements to values.
+
+ Args:
+ data_dict (dict): A dictionary that maps string name to numpy arrays.
+ """
+ pass
+
+
+[docs]class FFMTextIterator(BaseIterator):
+    """Data loader for FFM-format-based models, such as xDeepFM.
+    The iterator does not load the whole dataset into memory. Instead, it loads data
+    per mini-batch, so that large files can be used as input.
+ """
+
+ def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"):
+ """Initialize an iterator. Create the necessary placeholders for the model.
+
+ Args:
+ hparams (object): Global hyper-parameters. Some key settings such as #_feature and #_field are there.
+            graph (object): The running graph. All created placeholders will be added to this graph.
+ col_spliter (str): column splitter in one line.
+ ID_spliter (str): ID splitter in one line.
+ """
+ self.feature_cnt = hparams.FEATURE_COUNT
+ self.field_cnt = hparams.FIELD_COUNT
+ self.col_spliter = col_spliter
+ self.ID_spliter = ID_spliter
+ self.batch_size = hparams.batch_size
+
+ self.graph = graph
+ with self.graph.as_default():
+ self.labels = tf.compat.v1.placeholder(tf.float32, [None, 1], name="label")
+ self.fm_feat_indices = tf.compat.v1.placeholder(
+ tf.int64, [None, 2], name="fm_feat_indices"
+ )
+ self.fm_feat_values = tf.compat.v1.placeholder(
+ tf.float32, [None], name="fm_feat_values"
+ )
+ self.fm_feat_shape = tf.compat.v1.placeholder(
+ tf.int64, [None], name="fm_feat_shape"
+ )
+ self.dnn_feat_indices = tf.compat.v1.placeholder(
+ tf.int64, [None, 2], name="dnn_feat_indices"
+ )
+ self.dnn_feat_values = tf.compat.v1.placeholder(
+ tf.int64, [None], name="dnn_feat_values"
+ )
+ self.dnn_feat_weights = tf.compat.v1.placeholder(
+ tf.float32, [None], name="dnn_feat_weights"
+ )
+ self.dnn_feat_shape = tf.compat.v1.placeholder(
+ tf.int64, [None], name="dnn_feat_shape"
+ )
+
+[docs] def parser_one_line(self, line):
+ """Parse one string line into feature values.
+
+ Args:
+ line (str): A string indicating one instance.
+
+ Returns:
+ list: Parsed results, including `label`, `features` and `impression_id`.
+
+ """
+ impression_id = 0
+ words = line.strip().split(self.ID_spliter)
+ if len(words) == 2:
+ impression_id = words[1].strip()
+
+ cols = words[0].strip().split(self.col_spliter)
+
+ label = float(cols[0])
+
+ features = []
+ for word in cols[1:]:
+ if not word.strip():
+ continue
+ tokens = word.split(":")
+ features.append([int(tokens[0]) - 1, int(tokens[1]) - 1, float(tokens[2])])
+
+ return label, features, impression_id
+
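+    # Illustrative FFM-format line (assuming the default col_spliter " " and ID_spliter "%"):
+    # a label followed by `field:feature:value` tokens with 1-based field and feature ids
+    # (the parser shifts them to 0-based); the trailing impression id is optional.
+    #
+    #       1 1:4:1.0 2:8:0.5 %10001
+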
+[docs] def load_data_from_file(self, infile):
+ """Read and parse data from a file.
+
+ Args:
+ infile (str): Text input file. Each line in this file is an instance.
+
+ Returns:
+ object: An iterator that yields parsed results, in the format of graph `feed_dict`.
+ """
+ label_list = []
+ features_list = []
+ impression_id_list = []
+ cnt = 0
+
+ with tf.io.gfile.GFile(infile, "r") as rd:
+ for line in rd:
+ label, features, impression_id = self.parser_one_line(line)
+
+ features_list.append(features)
+ label_list.append(label)
+ impression_id_list.append(impression_id)
+
+ cnt += 1
+ if cnt == self.batch_size:
+ res = self._convert_data(label_list, features_list)
+ yield self.gen_feed_dict(res), impression_id_list, self.batch_size
+ label_list = []
+ features_list = []
+ impression_id_list = []
+ cnt = 0
+ if cnt > 0:
+ res = self._convert_data(label_list, features_list)
+ yield self.gen_feed_dict(res), impression_id_list, cnt
+
+ def _convert_data(self, labels, features):
+ """Convert data into numpy arrays that are good for further operation.
+
+ Args:
+            labels (list): A list of ground-truth labels.
+            features (list): A 3-dimensional list: a batch (of size batch_size) of feature arrays,
+                where each feature array is a list of `[field_idx, feature_idx, feature_value]` triples.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+ dim = self.feature_cnt
+ FIELD_COUNT = self.field_cnt
+ instance_cnt = len(labels)
+
+ fm_feat_indices = []
+ fm_feat_values = []
+ fm_feat_shape = [instance_cnt, dim]
+
+ dnn_feat_indices = []
+ dnn_feat_values = []
+ dnn_feat_weights = []
+ dnn_feat_shape = [instance_cnt * FIELD_COUNT, -1]
+
+ for i in range(instance_cnt):
+ m = len(features[i])
+ dnn_feat_dic = {}
+ for j in range(m):
+ fm_feat_indices.append([i, features[i][j][1]])
+ fm_feat_values.append(features[i][j][2])
+ if features[i][j][0] not in dnn_feat_dic:
+ dnn_feat_dic[features[i][j][0]] = 0
+ else:
+ dnn_feat_dic[features[i][j][0]] += 1
+ dnn_feat_indices.append(
+ [
+ i * FIELD_COUNT + features[i][j][0],
+ dnn_feat_dic[features[i][j][0]],
+ ]
+ )
+ dnn_feat_values.append(features[i][j][1])
+ dnn_feat_weights.append(features[i][j][2])
+ if dnn_feat_shape[1] < dnn_feat_dic[features[i][j][0]]:
+ dnn_feat_shape[1] = dnn_feat_dic[features[i][j][0]]
+ dnn_feat_shape[1] += 1
+
+ sorted_index = sorted(
+ range(len(dnn_feat_indices)),
+ key=lambda k: (dnn_feat_indices[k][0], dnn_feat_indices[k][1]),
+ )
+
+ res = {}
+ res["fm_feat_indices"] = np.asarray(fm_feat_indices, dtype=np.int64)
+ res["fm_feat_values"] = np.asarray(fm_feat_values, dtype=np.float32)
+ res["fm_feat_shape"] = np.asarray(fm_feat_shape, dtype=np.int64)
+ res["labels"] = np.asarray([[label] for label in labels], dtype=np.float32)
+
+ res["dnn_feat_indices"] = np.asarray(dnn_feat_indices, dtype=np.int64)[
+ sorted_index
+ ]
+ res["dnn_feat_values"] = np.asarray(dnn_feat_values, dtype=np.int64)[
+ sorted_index
+ ]
+ res["dnn_feat_weights"] = np.asarray(dnn_feat_weights, dtype=np.float32)[
+ sorted_index
+ ]
+ res["dnn_feat_shape"] = np.asarray(dnn_feat_shape, dtype=np.int64)
+ return res
+
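+    # Illustrative output of _convert_data (hypothetical numbers): with FIELD_COUNT = 2 and a
+    # single parsed instance [[0, 3, 1.0], [1, 7, 0.5]] (already 0-based), the sparse FM inputs
+    # are fm_feat_indices = [[0, 3], [0, 7]], fm_feat_values = [1.0, 0.5] and
+    # fm_feat_shape = [1, FEATURE_COUNT], while the DNN inputs are
+    # dnn_feat_indices = [[0, 0], [1, 0]] (row = instance * FIELD_COUNT + field,
+    # column = occurrence index of that field), dnn_feat_values = [3, 7],
+    # dnn_feat_weights = [1.0, 0.5] and dnn_feat_shape = [2, 1].
+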
+[docs] def gen_feed_dict(self, data_dict):
+ """Construct a dictionary that maps graph elements to values.
+
+ Args:
+ data_dict (dict): A dictionary that maps string name to numpy arrays.
+
+ Returns:
+ dict: A dictionary that maps graph elements to numpy arrays.
+
+ """
+ feed_dict = {
+ self.labels: data_dict["labels"],
+ self.fm_feat_indices: data_dict["fm_feat_indices"],
+ self.fm_feat_values: data_dict["fm_feat_values"],
+ self.fm_feat_shape: data_dict["fm_feat_shape"],
+ self.dnn_feat_indices: data_dict["dnn_feat_indices"],
+ self.dnn_feat_values: data_dict["dnn_feat_values"],
+ self.dnn_feat_weights: data_dict["dnn_feat_weights"],
+ self.dnn_feat_shape: data_dict["dnn_feat_shape"],
+ }
+ return feed_dict
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import numpy as np
+import random
+
+from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
+from recommenders.models.deeprec.deeprec_utils import load_dict
+
+
+__all__ = ["NextItNetIterator"]
+
+
+[docs]class NextItNetIterator(SequentialIterator):
+ """Data loader for the NextItNet model.
+
+    NextItNet requires a special data format. In the training stage, each instance produces
+    `(sequence_length * train_num_ngs)` target items and labels, so that NextItNet outputs
+    predictions for every item in the sequence rather than only the last one.
+ """
+
+ def __init__(self, hparams, graph, col_spliter="\t"):
+ """Initialize an iterator. Create necessary placeholders for the model.
+        Different from `SequentialIterator`, the `labels`, `items` and `cates` placeholders here are two-dimensional, with one entry per sequence position.
+
+ Args:
+ hparams (object): Global hyper-parameters. Some key settings such as #_feature and #_field are there.
+            graph (object): The running graph. All created placeholders will be added to this graph.
+ col_spliter (str): Column splitter in one line.
+ """
+ self.col_spliter = col_spliter
+
+ self.userdict, self.itemdict, self.catedict = (
+ load_dict(hparams.user_vocab),
+ load_dict(hparams.item_vocab),
+ load_dict(hparams.cate_vocab),
+ )
+
+ self.max_seq_length = hparams.max_seq_length
+ self.batch_size = hparams.batch_size
+ self.iter_data = dict()
+
+ self.graph = graph
+ with self.graph.as_default():
+ self.labels = tf.compat.v1.placeholder(
+ tf.float32, [None, None], name="label"
+ )
+ self.users = tf.compat.v1.placeholder(tf.int32, [None], name="users")
+ self.items = tf.compat.v1.placeholder(tf.int32, [None, None], name="items")
+ self.cates = tf.compat.v1.placeholder(tf.int32, [None, None], name="cates")
+ self.item_history = tf.compat.v1.placeholder(
+ tf.int32, [None, self.max_seq_length], name="item_history"
+ )
+ self.item_cate_history = tf.compat.v1.placeholder(
+ tf.int32, [None, self.max_seq_length], name="item_cate_history"
+ )
+ self.mask = tf.compat.v1.placeholder(
+ tf.int32, [None, self.max_seq_length], name="mask"
+ )
+ self.time = tf.compat.v1.placeholder(tf.float32, [None], name="time")
+ self.time_diff = tf.compat.v1.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_diff"
+ )
+ self.time_from_first_action = tf.compat.v1.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_from_first_action"
+ )
+ self.time_to_now = tf.compat.v1.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_to_now"
+ )
+
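+    # Note on the training layout produced by _convert_data below: when batch_num_ngs > 0,
+    # every original instance expands into (1 + batch_num_ngs) rows of length max_seq_length.
+    # The positive row is the clicked history shifted left by one position with the target
+    # item appended (labels all 1), while each negative row fills every position with an item
+    # sampled from other instances in the batch (labels all 0).
+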
+ def _convert_data(
+ self,
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+ Note: This is different from `sequential_iterator`.
+
+ Args:
+ label_list (list): A list of ground-truth labels.
+ user_list (list): A list of user indexes.
+ item_list (list): A list of item indexes.
+ item_cate_list (list): A list of category indexes.
+ item_history_batch (list): A list of item history indexes.
+ item_cate_history_batch (list): A list of category history indexes.
+            time_list (list): A list of current timestamps.
+            time_diff_list (list): A list of time intervals between consecutive operations.
+            time_from_first_action_list (list): A list of time intervals since the first operation.
+            time_to_now_list (list): A list of time intervals from each operation to the current time.
+            batch_num_ngs (int): The number of negative samples per positive instance in mini-batch training.
+
+ Returns:
+            dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+ if batch_num_ngs:
+ instance_cnt = len(label_list)
+ if instance_cnt < 5:
+ return
+
+ label_list_all = []
+ item_list_all = []
+ item_cate_list_all = []
+ user_list_all = np.asarray(
+ [[user] * (batch_num_ngs + 1) for user in user_list], dtype=np.int32
+ ).flatten()
+ time_list_all = np.asarray(
+ [[t] * (batch_num_ngs + 1) for t in time_list], dtype=np.float32
+ ).flatten()
+
+ history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
+ max_seq_length_batch = self.max_seq_length
+ item_history_batch_all = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
+ dtype=np.int32,
+ )
+ item_cate_history_batch_all = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
+ dtype=np.int32,
+ )
+ time_diff_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
+ dtype=np.float32,
+ )
+ time_from_first_action_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
+ dtype=np.float32,
+ )
+ time_to_now_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
+ dtype=np.float32,
+ )
+ mask = np.zeros(
+ (instance_cnt * (1 + batch_num_ngs), max_seq_length_batch),
+ dtype=np.float32,
+ )
+
+ for i in range(instance_cnt):
+ this_length = min(history_lengths[i], max_seq_length_batch)
+ for index in range(batch_num_ngs + 1):
+ item_history_batch_all[
+ i * (batch_num_ngs + 1) + index, -this_length:
+ ] = np.asarray(item_history_batch[i][-this_length:], dtype=np.int32)
+ item_cate_history_batch_all[
+ i * (batch_num_ngs + 1) + index, -this_length:
+ ] = np.asarray(
+ item_cate_history_batch[i][-this_length:], dtype=np.int32
+ )
+ mask[i * (batch_num_ngs + 1) + index, -this_length:] = 1.0
+ time_diff_batch[
+ i * (batch_num_ngs + 1) + index, -this_length:
+ ] = np.asarray(time_diff_list[i][-this_length:], dtype=np.float32)
+ time_from_first_action_batch[
+ i * (batch_num_ngs + 1) + index, -this_length:
+ ] = np.asarray(
+ time_from_first_action_list[i][-this_length:], dtype=np.float32
+ )
+ time_to_now_batch[
+ i * (batch_num_ngs + 1) + index, -this_length:
+ ] = np.asarray(time_to_now_list[i][-this_length:], dtype=np.float32)
+
+ for i in range(instance_cnt):
+ positive_item = [
+ *item_history_batch_all[i * (batch_num_ngs + 1)][1:],
+ item_list[i],
+ ]
+ positive_item_cate = [
+ *item_cate_history_batch_all[i * (batch_num_ngs + 1)][1:],
+ item_cate_list[i],
+ ]
+ label_list_all.append([1] * max_seq_length_batch)
+ item_list_all.append(positive_item)
+ item_cate_list_all.append(positive_item_cate)
+
+ count = 0
+ while count < batch_num_ngs:
+ negative_item_list = []
+ negative_item_cate_list = []
+ count_inner = 1
+ while count_inner <= max_seq_length_batch:
+ random_value = random.randint(0, instance_cnt - 1)
+ negative_item = item_list[random_value]
+ if negative_item == positive_item[count_inner - 1]:
+ continue
+ negative_item_list.append(negative_item)
+ negative_item_cate_list.append(item_cate_list[random_value])
+ count_inner += 1
+
+ label_list_all.append([0] * max_seq_length_batch)
+ item_list_all.append(negative_item_list)
+ item_cate_list_all.append(negative_item_cate_list)
+ count += 1
+
+ res = {}
+ res["labels"] = np.asarray(
+ label_list_all, dtype=np.float32
+ ) # .reshape(-1,1)
+ res["users"] = user_list_all
+ res["items"] = np.asarray(item_list_all, dtype=np.int32)
+ res["cates"] = np.asarray(item_cate_list_all, dtype=np.int32)
+ res["item_history"] = item_history_batch_all
+ res["item_cate_history"] = item_cate_history_batch_all
+ res["mask"] = mask
+ res["time"] = time_list_all
+ res["time_diff"] = time_diff_batch
+ res["time_from_first_action"] = time_from_first_action_batch
+ res["time_to_now"] = time_to_now_batch
+
+ return res
+
+ else:
+ instance_cnt = len(label_list)
+ history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
+ max_seq_length_batch = self.max_seq_length
+ item_history_batch_all = np.zeros(
+ (instance_cnt, max_seq_length_batch), dtype=np.int32
+ )
+ item_cate_history_batch_all = np.zeros(
+ (instance_cnt, max_seq_length_batch), dtype=np.int32
+ )
+ time_diff_batch = np.zeros(
+ (instance_cnt, max_seq_length_batch), dtype=np.float32
+ )
+ time_from_first_action_batch = np.zeros(
+ (instance_cnt, max_seq_length_batch), dtype=np.float32
+ )
+ time_to_now_batch = np.zeros(
+ (instance_cnt, max_seq_length_batch), dtype=np.float32
+ )
+ mask = np.zeros((instance_cnt, max_seq_length_batch), dtype=np.float32)
+
+ for i in range(instance_cnt):
+ this_length = min(history_lengths[i], max_seq_length_batch)
+ item_history_batch_all[i, -this_length:] = item_history_batch[i][
+ -this_length:
+ ]
+ item_cate_history_batch_all[i, -this_length:] = item_cate_history_batch[
+ i
+ ][-this_length:]
+ mask[i, -this_length:] = 1.0
+ time_diff_batch[i, -this_length:] = time_diff_list[i][-this_length:]
+ time_from_first_action_batch[
+ i, -this_length:
+ ] = time_from_first_action_list[i][-this_length:]
+ time_to_now_batch[i, -this_length:] = time_to_now_list[i][-this_length:]
+
+ res = {}
+ res["labels"] = np.asarray(label_list, dtype=np.float32).reshape([-1, 1])
+ res["users"] = np.asarray(user_list, dtype=np.float32)
+ res["items"] = np.asarray(item_list, dtype=np.int32).reshape([-1, 1])
+ res["cates"] = np.asarray(item_cate_list, dtype=np.int32).reshape([-1, 1])
+ res["item_history"] = item_history_batch_all
+ res["item_cate_history"] = item_cate_history_batch_all
+ res["mask"] = mask
+ res["time"] = np.asarray(time_list, dtype=np.float32)
+ res["time_diff"] = time_diff_batch
+ res["time_from_first_action"] = time_from_first_action_batch
+ res["time_to_now"] = time_to_now_batch
+ return res
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import numpy as np
+import random
+
+from recommenders.models.deeprec.io.iterator import BaseIterator
+from recommenders.models.deeprec.deeprec_utils import load_dict
+
+
+__all__ = ["SequentialIterator"]
+
+
+[docs]class SequentialIterator(BaseIterator):
+ def __init__(self, hparams, graph, col_spliter="\t"):
+ """Initialize an iterator. Create necessary placeholders for the model.
+
+ Args:
+ hparams (object): Global hyper-parameters. Some key settings such as #_feature and #_field are there.
+            graph (object): The running graph. All created placeholders will be added to this graph.
+ col_spliter (str): Column splitter in one line.
+ """
+ self.col_spliter = col_spliter
+ user_vocab, item_vocab, cate_vocab = (
+ hparams.user_vocab,
+ hparams.item_vocab,
+ hparams.cate_vocab,
+ )
+ self.userdict, self.itemdict, self.catedict = (
+ load_dict(user_vocab),
+ load_dict(item_vocab),
+ load_dict(cate_vocab),
+ )
+
+ self.max_seq_length = hparams.max_seq_length
+ self.batch_size = hparams.batch_size
+ self.iter_data = dict()
+
+ self.graph = graph
+ with self.graph.as_default():
+ self.labels = tf.compat.v1.placeholder(tf.float32, [None, 1], name="label")
+ self.users = tf.compat.v1.placeholder(tf.int32, [None], name="users")
+ self.items = tf.compat.v1.placeholder(tf.int32, [None], name="items")
+ self.cates = tf.compat.v1.placeholder(tf.int32, [None], name="cates")
+ self.item_history = tf.compat.v1.placeholder(
+ tf.int32, [None, self.max_seq_length], name="item_history"
+ )
+ self.item_cate_history = tf.compat.v1.placeholder(
+ tf.int32, [None, self.max_seq_length], name="item_cate_history"
+ )
+ self.mask = tf.compat.v1.placeholder(
+ tf.int32, [None, self.max_seq_length], name="mask"
+ )
+ self.time = tf.compat.v1.placeholder(tf.float32, [None], name="time")
+ self.time_diff = tf.compat.v1.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_diff"
+ )
+ self.time_from_first_action = tf.compat.v1.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_from_first_action"
+ )
+ self.time_to_now = tf.compat.v1.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_to_now"
+ )
+
+[docs] def parse_file(self, input_file):
+        """Parse the file into a list ready to be used for downstream tasks.
+
+        Args:
+            input_file (str): One of the train, valid or test files, which has not been parsed yet.
+
+        Returns:
+            list: A list with the parsing results.
+ """
+ with open(input_file, "r") as f:
+ lines = f.readlines()
+ res = []
+ for line in lines:
+ if not line:
+ continue
+ res.append(self.parser_one_line(line))
+ return res
+
+[docs] def parser_one_line(self, line):
+ """Parse one string line into feature values.
+
+ Args:
+ line (str): a string indicating one instance.
+ This string contains tab-separated values including:
+ label, user_hash, item_hash, item_cate, operation_time, item_history_sequence,
+ item_cate_history_sequence, and time_history_sequence.
+
+ Returns:
+ list: Parsed results including `label`, `user_id`, `item_id`, `item_cate`, `item_history_sequence`, `cate_history_sequence`,
+ `current_time`, `time_diff`, `time_from_first_action`, `time_to_now`.
+
+ """
+ words = line.strip().split(self.col_spliter)
+ label = int(words[0])
+ user_id = self.userdict[words[1]] if words[1] in self.userdict else 0
+ item_id = self.itemdict[words[2]] if words[2] in self.itemdict else 0
+ item_cate = self.catedict[words[3]] if words[3] in self.catedict else 0
+ current_time = float(words[4])
+
+ item_history_sequence = []
+ cate_history_sequence = []
+ time_history_sequence = []
+
+ item_history_words = words[5].strip().split(",")
+ for item in item_history_words:
+ item_history_sequence.append(
+ self.itemdict[item] if item in self.itemdict else 0
+ )
+
+ cate_history_words = words[6].strip().split(",")
+ for cate in cate_history_words:
+ cate_history_sequence.append(
+ self.catedict[cate] if cate in self.catedict else 0
+ )
+
+ time_history_words = words[7].strip().split(",")
+ time_history_sequence = [float(i) for i in time_history_words]
+
+ time_range = 3600 * 24
+
+ time_diff = []
+ for i in range(len(time_history_sequence) - 1):
+ diff = (
+ time_history_sequence[i + 1] - time_history_sequence[i]
+ ) / time_range
+ diff = max(diff, 0.5)
+ time_diff.append(diff)
+ last_diff = (current_time - time_history_sequence[-1]) / time_range
+ last_diff = max(last_diff, 0.5)
+ time_diff.append(last_diff)
+ time_diff = np.log(time_diff)
+
+ time_from_first_action = []
+ first_time = time_history_sequence[0]
+ time_from_first_action = [
+ (t - first_time) / time_range for t in time_history_sequence[1:]
+ ]
+ time_from_first_action = [max(t, 0.5) for t in time_from_first_action]
+ last_diff = (current_time - first_time) / time_range
+ last_diff = max(last_diff, 0.5)
+ time_from_first_action.append(last_diff)
+ time_from_first_action = np.log(time_from_first_action)
+
+ time_to_now = []
+ time_to_now = [(current_time - t) / time_range for t in time_history_sequence]
+ time_to_now = [max(t, 0.5) for t in time_to_now]
+ time_to_now = np.log(time_to_now)
+
+ return (
+ label,
+ user_id,
+ item_id,
+ item_cate,
+ item_history_sequence,
+ cate_history_sequence,
+ current_time,
+ time_diff,
+ time_from_first_action,
+ time_to_now,
+ )
+
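+    # Illustrative tab-separated instance line (hypothetical ids; "\t" marks a tab and the
+    # three history fields are comma-separated):
+    #
+    #       1\tU1\tI42\tcat3\t1512000000\tI7,I9\tcat1,cat1\t1511000000,1511500000
+    #
+    # The derived time features divide each interval by time_range = 3600 * 24 seconds,
+    # floor it at 0.5 and then take the natural log, e.g.
+    # time_diff[i] = log(max((t[i + 1] - t[i]) / 86400, 0.5)).
+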
+[docs] def load_data_from_file(self, infile, batch_num_ngs=0, min_seq_length=1):
+ """Read and parse data from a file.
+
+ Args:
+ infile (str): Text input file. Each line in this file is an instance.
+            batch_num_ngs (int): The number of negative samples drawn within each batch.
+                0 means no in-batch negative sampling is needed.
+            min_seq_length (int): The minimum sequence length.
+                Sequences shorter than min_seq_length will be ignored.
+
+ Yields:
+ object: An iterator that yields parsed results, in the format of graph `feed_dict`.
+ """
+ label_list = []
+ user_list = []
+ item_list = []
+ item_cate_list = []
+ item_history_batch = []
+ item_cate_history_batch = []
+ time_list = []
+ time_diff_list = []
+ time_from_first_action_list = []
+ time_to_now_list = []
+
+ cnt = 0
+
+ if infile not in self.iter_data:
+ lines = self.parse_file(infile)
+ self.iter_data[infile] = lines
+ else:
+ lines = self.iter_data[infile]
+
+ if batch_num_ngs > 0:
+ random.shuffle(lines)
+
+ for line in lines:
+ if not line:
+ continue
+
+ (
+ label,
+ user_id,
+ item_id,
+ item_cate,
+ item_history_sequence,
+ item_cate_history_sequence,
+ current_time,
+ time_diff,
+ time_from_first_action,
+ time_to_now,
+ ) = line
+ if len(item_history_sequence) < min_seq_length:
+ continue
+
+ label_list.append(label)
+ user_list.append(user_id)
+ item_list.append(item_id)
+ item_cate_list.append(item_cate)
+ item_history_batch.append(item_history_sequence)
+ item_cate_history_batch.append(item_cate_history_sequence)
+ time_list.append(current_time)
+ time_diff_list.append(time_diff)
+ time_from_first_action_list.append(time_from_first_action)
+ time_to_now_list.append(time_to_now)
+
+ cnt += 1
+ if cnt == self.batch_size:
+ res = self._convert_data(
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ )
+ batch_input = self.gen_feed_dict(res)
+ yield batch_input if batch_input else None
+ label_list = []
+ user_list = []
+ item_list = []
+ item_cate_list = []
+ item_history_batch = []
+ item_cate_history_batch = []
+ time_list = []
+ time_diff_list = []
+ time_from_first_action_list = []
+ time_to_now_list = []
+ cnt = 0
+ if cnt > 0:
+ res = self._convert_data(
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ )
+ batch_input = self.gen_feed_dict(res)
+ yield batch_input if batch_input else None
+
+ def _convert_data(
+ self,
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+ label_list (list): A list of ground-truth labels.
+ user_list (list): A list of user indexes.
+ item_list (list): A list of item indexes.
+ item_cate_list (list): A list of category indexes.
+ item_history_batch (list): A list of item history indexes.
+ item_cate_history_batch (list): A list of category history indexes.
+            time_list (list): A list of current timestamps.
+            time_diff_list (list): A list of time intervals between consecutive operations.
+            time_from_first_action_list (list): A list of time intervals since the first operation.
+            time_to_now_list (list): A list of time intervals from each operation to the current time.
+            batch_num_ngs (int): The number of negative samples per positive instance in mini-batch training.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+ if batch_num_ngs:
+ instance_cnt = len(label_list)
+ if instance_cnt < 5:
+ return
+
+ label_list_all = []
+ item_list_all = []
+ item_cate_list_all = []
+ user_list_all = np.asarray(
+ [[user] * (batch_num_ngs + 1) for user in user_list], dtype=np.int32
+ ).flatten()
+ time_list_all = np.asarray(
+ [[t] * (batch_num_ngs + 1) for t in time_list], dtype=np.float32
+ ).flatten()
+
+ history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
+ max_seq_length_batch = self.max_seq_length
+ item_history_batch_all = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("int32")
+ item_cate_history_batch_all = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("int32")
+ time_diff_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("float32")
+ time_from_first_action_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("float32")
+ time_to_now_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("float32")
+ mask = np.zeros(
+ (instance_cnt * (1 + batch_num_ngs), max_seq_length_batch)
+ ).astype("float32")
+
+ for i in range(instance_cnt):
+ this_length = min(history_lengths[i], max_seq_length_batch)
+ for index in range(batch_num_ngs + 1):
+ item_history_batch_all[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(item_history_batch[i][-this_length:], dtype=np.int32)
+ item_cate_history_batch_all[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(
+ item_cate_history_batch[i][-this_length:], dtype=np.int32
+ )
+ mask[i * (batch_num_ngs + 1) + index, :this_length] = 1.0
+ time_diff_batch[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(time_diff_list[i][-this_length:], dtype=np.float32)
+ time_from_first_action_batch[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(
+ time_from_first_action_list[i][-this_length:], dtype=np.float32
+ )
+ time_to_now_batch[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(time_to_now_list[i][-this_length:], dtype=np.float32)
+
+ for i in range(instance_cnt):
+ positive_item = item_list[i]
+ label_list_all.append(1)
+ item_list_all.append(positive_item)
+ item_cate_list_all.append(item_cate_list[i])
+ count = 0
+ while batch_num_ngs:
+ random_value = random.randint(0, instance_cnt - 1)
+ negative_item = item_list[random_value]
+ if negative_item == positive_item:
+ continue
+ label_list_all.append(0)
+ item_list_all.append(negative_item)
+ item_cate_list_all.append(item_cate_list[random_value])
+ count += 1
+ if count == batch_num_ngs:
+ break
+
+ res = {}
+ res["labels"] = np.asarray(label_list_all, dtype=np.float32).reshape(-1, 1)
+ res["users"] = user_list_all
+ res["items"] = np.asarray(item_list_all, dtype=np.int32)
+ res["cates"] = np.asarray(item_cate_list_all, dtype=np.int32)
+ res["item_history"] = item_history_batch_all
+ res["item_cate_history"] = item_cate_history_batch_all
+ res["mask"] = mask
+ res["time"] = time_list_all
+ res["time_diff"] = time_diff_batch
+ res["time_from_first_action"] = time_from_first_action_batch
+ res["time_to_now"] = time_to_now_batch
+ return res
+
+ else:
+ instance_cnt = len(label_list)
+ history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
+ max_seq_length_batch = self.max_seq_length
+ item_history_batch_all = np.zeros(
+ (instance_cnt, max_seq_length_batch)
+ ).astype("int32")
+ item_cate_history_batch_all = np.zeros(
+ (instance_cnt, max_seq_length_batch)
+ ).astype("int32")
+ time_diff_batch = np.zeros((instance_cnt, max_seq_length_batch)).astype(
+ "float32"
+ )
+ time_from_first_action_batch = np.zeros(
+ (instance_cnt, max_seq_length_batch)
+ ).astype("float32")
+ time_to_now_batch = np.zeros((instance_cnt, max_seq_length_batch)).astype(
+ "float32"
+ )
+ mask = np.zeros((instance_cnt, max_seq_length_batch)).astype("float32")
+
+ for i in range(instance_cnt):
+ this_length = min(history_lengths[i], max_seq_length_batch)
+ item_history_batch_all[i, :this_length] = item_history_batch[i][
+ -this_length:
+ ]
+ item_cate_history_batch_all[i, :this_length] = item_cate_history_batch[
+ i
+ ][-this_length:]
+ mask[i, :this_length] = 1.0
+ time_diff_batch[i, :this_length] = time_diff_list[i][-this_length:]
+ time_from_first_action_batch[
+ i, :this_length
+ ] = time_from_first_action_list[i][-this_length:]
+ time_to_now_batch[i, :this_length] = time_to_now_list[i][-this_length:]
+
+ res = {}
+ res["labels"] = np.asarray(label_list, dtype=np.float32).reshape(-1, 1)
+ res["users"] = np.asarray(user_list, dtype=np.float32)
+ res["items"] = np.asarray(item_list, dtype=np.int32)
+ res["cates"] = np.asarray(item_cate_list, dtype=np.int32)
+ res["item_history"] = item_history_batch_all
+ res["item_cate_history"] = item_cate_history_batch_all
+ res["mask"] = mask
+ res["time"] = np.asarray(time_list, dtype=np.float32)
+ res["time_diff"] = time_diff_batch
+ res["time_from_first_action"] = time_from_first_action_batch
+ res["time_to_now"] = time_to_now_batch
+ return res
+
+[docs] def gen_feed_dict(self, data_dict):
+ """Construct a dictionary that maps graph elements to values.
+
+ Args:
+ data_dict (dict): A dictionary that maps string name to numpy arrays.
+
+ Returns:
+ dict: A dictionary that maps graph elements to numpy arrays.
+
+ """
+ if not data_dict:
+ return dict()
+ feed_dict = {
+ self.labels: data_dict["labels"],
+ self.users: data_dict["users"],
+ self.items: data_dict["items"],
+ self.cates: data_dict["cates"],
+ self.item_history: data_dict["item_history"],
+ self.item_cate_history: data_dict["item_cate_history"],
+ self.mask: data_dict["mask"],
+ self.time: data_dict["time"],
+ self.time_diff: data_dict["time_diff"],
+ self.time_from_first_action: data_dict["time_from_first_action"],
+ self.time_to_now: data_dict["time_to_now"],
+ }
+ return feed_dict
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+from os.path import join
+import abc
+import time
+import os
+import numpy as np
+import tensorflow as tf
+from recommenders.models.deeprec.deeprec_utils import cal_metric
+
+
+tf.compat.v1.disable_eager_execution()
+__all__ = ["BaseModel"]
+
+
+[docs]class BaseModel:
+ """Base class for models"""
+
+ def __init__(self, hparams, iterator_creator, graph=None, seed=None):
+        """Initialize the model. Create common logic needed by all deeprec models, such as the loss function
+        and the parameter set.
+
+ Args:
+ hparams (object): An `HParams` object, holds the entire set of hyperparameters.
+ iterator_creator (object): An iterator to load the data.
+ graph (object): An optional graph.
+ seed (int): Random seed.
+ """
+ self.seed = seed
+ tf.compat.v1.set_random_seed(seed)
+ np.random.seed(seed)
+
+ self.graph = graph if graph is not None else tf.Graph()
+ self.iterator = iterator_creator(hparams, self.graph)
+ self.train_num_ngs = (
+ hparams.train_num_ngs if "train_num_ngs" in hparams.values() else None
+ )
+
+ with self.graph.as_default():
+ self.hparams = hparams
+
+ self.layer_params = []
+ self.embed_params = []
+ self.cross_params = []
+ self.layer_keeps = tf.compat.v1.placeholder(tf.float32, name="layer_keeps")
+ self.keep_prob_train = None
+ self.keep_prob_test = None
+ self.is_train_stage = tf.compat.v1.placeholder(
+ tf.bool, shape=(), name="is_training"
+ )
+ self.group = tf.compat.v1.placeholder(tf.int32, shape=(), name="group")
+
+ self.initializer = self._get_initializer()
+
+ self.logit = self._build_graph()
+ self.pred = self._get_pred(self.logit, self.hparams.method)
+
+ self.loss = self._get_loss()
+ self.saver = tf.compat.v1.train.Saver(max_to_keep=self.hparams.epochs)
+ self.update = self._build_train_opt()
+ self.extra_update_ops = tf.compat.v1.get_collection(
+ tf.compat.v1.GraphKeys.UPDATE_OPS
+ )
+ self.init_op = tf.compat.v1.global_variables_initializer()
+ self.merged = self._add_summaries()
+
+ # set GPU use with on demand growth
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ self.sess = tf.compat.v1.Session(
+ graph=self.graph, config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
+ )
+ self.sess.run(self.init_op)
+
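+    # Construction order inside __init__: create the data iterator, build the model graph
+    # (_build_graph), turn logits into predictions (_get_pred), assemble the data and
+    # regularization losses (_get_loss), create the optimizer and update op (_build_train_opt),
+    # then start a session with on-demand GPU memory growth and run the variable initializer.
+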
+ @abc.abstractmethod
+ def _build_graph(self):
+ """Subclass will implement this."""
+ pass
+
+ def _get_loss(self):
+        """Build the loss function, which consists of the data loss and the regularization loss.
+
+ Returns:
+ object: Loss value.
+ """
+ self.data_loss = self._compute_data_loss()
+ self.regular_loss = self._compute_regular_loss()
+ self.loss = tf.add(self.data_loss, self.regular_loss)
+ return self.loss
+
+ def _get_pred(self, logit, task):
+        """Produce the final prediction score, according to the task type.
+
+ Args:
+ logit (object): Base prediction value.
+ task (str): A task (values: regression/classification)
+
+ Returns:
+ object: Transformed score.
+ """
+ if task == "regression":
+ pred = tf.identity(logit)
+ elif task == "classification":
+ pred = tf.sigmoid(logit)
+ else:
+ raise ValueError(
+ "method must be regression or classification, but now is {0}".format(
+ task
+ )
+ )
+ pred = tf.identity(pred, name="pred")
+ return pred
+
+ def _add_summaries(self):
+ tf.compat.v1.summary.scalar("data_loss", self.data_loss)
+ tf.compat.v1.summary.scalar("regular_loss", self.regular_loss)
+ tf.compat.v1.summary.scalar("loss", self.loss)
+ merged = tf.compat.v1.summary.merge_all()
+ return merged
+
+ def _l2_loss(self):
+ l2_loss = tf.zeros([1], dtype=tf.float32)
+ # embedding_layer l2 loss
+ for param in self.embed_params:
+ l2_loss = tf.add(
+ l2_loss, tf.multiply(self.hparams.embed_l2, tf.nn.l2_loss(param))
+ )
+ params = self.layer_params
+ for param in params:
+ l2_loss = tf.add(
+ l2_loss, tf.multiply(self.hparams.layer_l2, tf.nn.l2_loss(param))
+ )
+ return l2_loss
+
+ def _l1_loss(self):
+ l1_loss = tf.zeros([1], dtype=tf.float32)
+        # embedding_layer l1 loss
+ for param in self.embed_params:
+ l1_loss = tf.add(
+ l1_loss,
+ tf.multiply(self.hparams.embed_l1, tf.norm(tensor=param, ord=1)),
+ )
+ params = self.layer_params
+ for param in params:
+ l1_loss = tf.add(
+ l1_loss,
+ tf.multiply(self.hparams.layer_l1, tf.norm(tensor=param, ord=1)),
+ )
+ return l1_loss
+
+ def _cross_l_loss(self):
+ """Construct L1-norm and L2-norm on cross network parameters for loss function.
+
+ Returns:
+ object: Regular loss value on cross network parameters.
+ """
+ cross_l_loss = tf.zeros([1], dtype=tf.float32)
+ for param in self.cross_params:
+ cross_l_loss = tf.add(
+ cross_l_loss,
+ tf.multiply(self.hparams.cross_l1, tf.norm(tensor=param, ord=1)),
+ )
+ cross_l_loss = tf.add(
+ cross_l_loss,
+ tf.multiply(self.hparams.cross_l2, tf.norm(tensor=param, ord=2)),
+ )
+ return cross_l_loss
+
+ def _get_initializer(self):
+ if self.hparams.init_method == "tnormal":
+ return tf.compat.v1.truncated_normal_initializer(
+ stddev=self.hparams.init_value, seed=self.seed
+ )
+ elif self.hparams.init_method == "uniform":
+ return tf.compat.v1.random_uniform_initializer(
+ -self.hparams.init_value, self.hparams.init_value, seed=self.seed
+ )
+ elif self.hparams.init_method == "normal":
+ return tf.compat.v1.random_normal_initializer(
+ stddev=self.hparams.init_value, seed=self.seed
+ )
+        elif self.hparams.init_method == "xavier_normal":
+            return tf.compat.v1.keras.initializers.VarianceScaling(
+                scale=1.0,
+                mode="fan_avg",
+                distribution="truncated_normal",
+                seed=self.seed,
+            )
+        elif self.hparams.init_method == "xavier_uniform":
+            return tf.compat.v1.keras.initializers.VarianceScaling(
+                scale=1.0,
+                mode="fan_avg",
+                distribution="uniform",
+                seed=self.seed,
+            )
+        elif self.hparams.init_method == "he_normal":
+            return tf.compat.v1.keras.initializers.VarianceScaling(
+                scale=2.0,
+                mode="fan_in",
+                distribution="truncated_normal",
+                seed=self.seed,
+            )
+        elif self.hparams.init_method == "he_uniform":
+            return tf.compat.v1.keras.initializers.VarianceScaling(
+                scale=2.0,
+                mode="fan_in",
+                distribution="uniform",
+                seed=self.seed,
+            )
+ else:
+ return tf.compat.v1.truncated_normal_initializer(
+ stddev=self.hparams.init_value, seed=self.seed
+ )
+
+ def _compute_data_loss(self):
+ if self.hparams.loss == "cross_entropy_loss":
+ data_loss = tf.reduce_mean(
+ input_tensor=tf.nn.sigmoid_cross_entropy_with_logits(
+ logits=tf.reshape(self.logit, [-1]),
+ labels=tf.reshape(self.iterator.labels, [-1]),
+ )
+ )
+ elif self.hparams.loss == "square_loss":
+ data_loss = tf.sqrt(
+ tf.reduce_mean(
+ input_tensor=tf.math.squared_difference(
+ tf.reshape(self.pred, [-1]),
+ tf.reshape(self.iterator.labels, [-1]),
+ )
+ )
+ )
+ elif self.hparams.loss == "log_loss":
+ data_loss = tf.reduce_mean(
+ input_tensor=tf.compat.v1.losses.log_loss(
+ predictions=tf.reshape(self.pred, [-1]),
+ labels=tf.reshape(self.iterator.labels, [-1]),
+ )
+ )
+ elif self.hparams.loss == "softmax":
+ group = self.train_num_ngs + 1
+ logits = tf.reshape(self.logit, (-1, group))
+ if self.hparams.model_type == "NextItNet":
+ labels = (
+ tf.transpose(
+ a=tf.reshape(
+ self.iterator.labels,
+ (-1, group, self.hparams.max_seq_length),
+ ),
+ perm=[0, 2, 1],
+ ),
+ )
+ labels = tf.reshape(labels, (-1, group))
+ else:
+ labels = tf.reshape(self.iterator.labels, (-1, group))
+ softmax_pred = tf.nn.softmax(logits, axis=-1)
+ boolean_mask = tf.equal(labels, tf.ones_like(labels))
+ mask_paddings = tf.ones_like(softmax_pred)
+ pos_softmax = tf.compat.v1.where(boolean_mask, softmax_pred, mask_paddings)
+ data_loss = -group * tf.reduce_mean(input_tensor=tf.math.log(pos_softmax))
+ else:
+ raise ValueError("this loss not defined {0}".format(self.hparams.loss))
+ return data_loss
+
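+    # Note on the "softmax" loss above: logits are reshaped into groups of (train_num_ngs + 1)
+    # candidates, a softmax is taken within each group, and the loss is
+    # -group * mean(log(softmax)) evaluated only at the positive positions (all other positions
+    # are replaced by 1, so log(1) = 0). This reduces to the average negative log-likelihood of
+    # the positive item within its group.
+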
+ def _compute_regular_loss(self):
+        """Construct the regularization loss, usually composed of L1 and L2 norms.
+        Users can designate which norms to include via the config file.
+
+ Returns:
+ object: Regular loss.
+ """
+ regular_loss = self._l2_loss() + self._l1_loss() + self._cross_l_loss()
+ return tf.reduce_sum(input_tensor=regular_loss)
+
+ def _train_opt(self):
+ """Get the optimizer according to configuration. Usually we will use Adam.
+
+ Returns:
+ object: An optimizer.
+ """
+ lr = self.hparams.learning_rate
+ optimizer = self.hparams.optimizer
+
+ if optimizer == "adadelta":
+ train_step = tf.compat.v1.train.AdadeltaOptimizer(lr)
+ elif optimizer == "adagrad":
+ train_step = tf.compat.v1.train.AdagradOptimizer(lr)
+ elif optimizer == "sgd":
+ train_step = tf.compat.v1.train.GradientDescentOptimizer(lr)
+ elif optimizer == "adam":
+ train_step = tf.compat.v1.train.AdamOptimizer(lr)
+ elif optimizer == "ftrl":
+ train_step = tf.compat.v1.train.FtrlOptimizer(lr)
+ elif optimizer == "gd":
+ train_step = tf.compat.v1.train.GradientDescentOptimizer(lr)
+ elif optimizer == "padagrad":
+ train_step = tf.compat.v1.train.ProximalAdagradOptimizer(lr)
+ elif optimizer == "pgd":
+ train_step = tf.compat.v1.train.ProximalGradientDescentOptimizer(lr)
+ elif optimizer == "rmsprop":
+ train_step = tf.compat.v1.train.RMSPropOptimizer(lr)
+ else:
+ train_step = tf.compat.v1.train.GradientDescentOptimizer(lr)
+ return train_step
+
+ def _build_train_opt(self):
+        """Construct the gradient-descent-based optimization step.
+        This step provides a gradient clipping option: sometimes we want to clip the gradients
+        when their norms are too large, to avoid gradient explosion.
+
+ Returns:
+ object: An operation that applies the specified optimization step.
+ """
+ train_step = self._train_opt()
+ gradients, variables = zip(*train_step.compute_gradients(self.loss))
+ if self.hparams.is_clip_norm:
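+            # Note: tf.clip_by_norm clips each gradient tensor independently (cf. tf.clip_by_global_norm).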
+ gradients = [
+ None
+ if gradient is None
+ else tf.clip_by_norm(gradient, self.hparams.max_grad_norm)
+ for gradient in gradients
+ ]
+ return train_step.apply_gradients(zip(gradients, variables))
+
+ def _active_layer(self, logit, activation, layer_idx=-1):
+ """Transform the input value with an activation. May use dropout.
+
+ Args:
+ logit (object): Input value.
+ activation (str): A string indicating the type of activation function.
+ layer_idx (int): Index of current layer. Used to retrieve corresponding parameters
+
+ Returns:
+ object: A tensor after applying activation function on logit.
+ """
+ if layer_idx >= 0 and self.hparams.user_dropout:
+ logit = self._dropout(logit, self.layer_keeps[layer_idx])
+ return self._activate(logit, activation)
+
+ def _activate(self, logit, activation):
+ if activation == "sigmoid":
+ return tf.nn.sigmoid(logit)
+ elif activation == "softmax":
+ return tf.nn.softmax(logit)
+ elif activation == "relu":
+ return tf.nn.relu(logit)
+ elif activation == "tanh":
+ return tf.nn.tanh(logit)
+ elif activation == "elu":
+ return tf.nn.elu(logit)
+ elif activation == "identity":
+ return tf.identity(logit)
+ else:
+            raise ValueError("Activation {0} is not defined.".format(activation))
+
+ def _dropout(self, logit, keep_prob):
+        """Apply dropout to the input value.
+
+ Args:
+ logit (object): The input value.
+ keep_prob (float): The probability of keeping each element.
+
+ Returns:
+ object: A tensor of the same shape of logit.
+ """
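+        # tf.nn.dropout in TF2 takes a drop *rate*, hence 1 - keep_prob.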
+ return tf.nn.dropout(x=logit, rate=1 - (keep_prob))
+
+[docs] def train(self, sess, feed_dict):
+ """Go through the optimization step once with training data in `feed_dict`.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Feed values to train the model. This is a dictionary that maps graph elements to values.
+
+ Returns:
+            list: A list of values, including the update operation, extra update ops, total loss, data loss, and merged summary.
+ """
+ feed_dict[self.layer_keeps] = self.keep_prob_train
+ feed_dict[self.is_train_stage] = True
+ return sess.run(
+ [
+ self.update,
+ self.extra_update_ops,
+ self.loss,
+ self.data_loss,
+ self.merged,
+ ],
+ feed_dict=feed_dict,
+ )
+
+[docs] def eval(self, sess, feed_dict):
+ """Evaluate the data in `feed_dict` with current model.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Feed values for evaluation. This is a dictionary that maps graph elements to values.
+
+ Returns:
+            list: A list of evaluated results, including predicted scores and ground-truth labels.
+ """
+ feed_dict[self.layer_keeps] = self.keep_prob_test
+ feed_dict[self.is_train_stage] = False
+ return sess.run([self.pred, self.iterator.labels], feed_dict=feed_dict)
+
+[docs] def infer(self, sess, feed_dict):
+ """Given feature data (in `feed_dict`), get predicted scores with current model.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Instances to predict. This is a dictionary that maps graph elements to values.
+
+ Returns:
+ list: Predicted scores for the given instances.
+ """
+ feed_dict[self.layer_keeps] = self.keep_prob_test
+ feed_dict[self.is_train_stage] = False
+ return sess.run([self.pred], feed_dict=feed_dict)
+
+[docs] def load_model(self, model_path=None):
+ """Load an existing model.
+
+ Args:
+            model_path (str): Path of the model to load. Defaults to `hparams.load_saved_model` when not provided.
+
+ Raises:
+ IOError: if the restore operation failed.
+ """
+ act_path = self.hparams.load_saved_model
+ if model_path is not None:
+ act_path = model_path
+
+ try:
+ self.saver.restore(self.sess, act_path)
+ except Exception:
+ raise IOError("Failed to find any matching files for {0}".format(act_path))
+
+[docs] def fit(self, train_file, valid_file, test_file=None):
+ """Fit the model with `train_file`. Evaluate the model on valid_file per epoch to observe the training status.
+ If `test_file` is not None, evaluate it too.
+
+ Args:
+ train_file (str): training data set.
+ valid_file (str): validation set.
+ test_file (str): test set.
+
+ Returns:
+ object: An instance of self.
+ """
+ if self.hparams.write_tfevents:
+ self.writer = tf.compat.v1.summary.FileWriter(
+ self.hparams.SUMMARIES_DIR, self.sess.graph
+ )
+
+ train_sess = self.sess
+ for epoch in range(1, self.hparams.epochs + 1):
+ step = 0
+ self.hparams.current_epoch = epoch
+
+ epoch_loss = 0
+ train_start = time.time()
+ for (
+ batch_data_input,
+ impression,
+ data_size,
+ ) in self.iterator.load_data_from_file(train_file):
+ step_result = self.train(train_sess, batch_data_input)
+ (_, _, step_loss, step_data_loss, summary) = step_result
+ if self.hparams.write_tfevents:
+ self.writer.add_summary(summary, step)
+ epoch_loss += step_loss
+ step += 1
+ if step % self.hparams.show_step == 0:
+ print(
+ "step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}".format(
+ step, step_loss, step_data_loss
+ )
+ )
+
+ train_end = time.time()
+ train_time = train_end - train_start
+
+ if self.hparams.save_model:
+ if not os.path.exists(self.hparams.MODEL_DIR):
+ os.makedirs(self.hparams.MODEL_DIR)
+ if epoch % self.hparams.save_epoch == 0:
+ save_path_str = join(self.hparams.MODEL_DIR, "epoch_" + str(epoch))
+ self.saver.save(sess=train_sess, save_path=save_path_str)
+
+ eval_start = time.time()
+ eval_res = self.run_eval(valid_file)
+ train_info = ",".join(
+ [
+ str(item[0]) + ":" + str(item[1])
+ for item in [("logloss loss", epoch_loss / step)]
+ ]
+ )
+ eval_info = ", ".join(
+ [
+ str(item[0]) + ":" + str(item[1])
+ for item in sorted(eval_res.items(), key=lambda x: x[0])
+ ]
+ )
+ if test_file is not None:
+ test_res = self.run_eval(test_file)
+ test_info = ", ".join(
+ [
+ str(item[0]) + ":" + str(item[1])
+ for item in sorted(test_res.items(), key=lambda x: x[0])
+ ]
+ )
+ eval_end = time.time()
+ eval_time = eval_end - eval_start
+
+ if test_file is not None:
+ print(
+ "at epoch {0:d}".format(epoch)
+ + "\ntrain info: "
+ + train_info
+ + "\neval info: "
+ + eval_info
+ + "\ntest info: "
+ + test_info
+ )
+ else:
+ print(
+ "at epoch {0:d}".format(epoch)
+ + "\ntrain info: "
+ + train_info
+ + "\neval info: "
+ + eval_info
+ )
+ print(
+ "at epoch {0:d} , train time: {1:.1f} eval time: {2:.1f}".format(
+ epoch, train_time, eval_time
+ )
+ )
+
+ if self.hparams.write_tfevents:
+ self.writer.close()
+
+ return self
+
+[docs] def group_labels(self, labels, preds, group_keys):
+        """Divide `labels` and `preds` into several groups according to the values in `group_keys`.
+
+ Args:
+ labels (list): ground truth label list.
+ preds (list): prediction score list.
+ group_keys (list): group key list.
+
+ Returns:
+ list, list:
+            - Labels after grouping.
+            - Predictions after grouping.
+ """
+ all_keys = list(set(group_keys))
+ group_labels = {k: [] for k in all_keys}
+ group_preds = {k: [] for k in all_keys}
+ for label, p, k in zip(labels, preds, group_keys):
+ group_labels[k].append(label)
+ group_preds[k].append(p)
+ all_labels = []
+ all_preds = []
+ for k in all_keys:
+ all_labels.append(group_labels[k])
+ all_preds.append(group_preds[k])
+ return all_labels, all_preds
+
+[docs] def run_eval(self, filename):
+        """Evaluate the given file and return evaluation metrics.
+
+ Args:
+ filename (str): A file name that will be evaluated.
+
+ Returns:
+ dict: A dictionary that contains evaluation metrics.
+ """
+ load_sess = self.sess
+ preds = []
+ labels = []
+ imp_indexs = []
+ for batch_data_input, imp_index, data_size in self.iterator.load_data_from_file(
+ filename
+ ):
+ step_pred, step_labels = self.eval(load_sess, batch_data_input)
+ preds.extend(np.reshape(step_pred, -1))
+ labels.extend(np.reshape(step_labels, -1))
+ imp_indexs.extend(np.reshape(imp_index, -1))
+ res = cal_metric(labels, preds, self.hparams.metrics)
+ if "pairwise_metrics" in self.hparams.values():
+ group_labels, group_preds = self.group_labels(labels, preds, imp_indexs)
+ res_pairwise = cal_metric(
+ group_labels, group_preds, self.hparams.pairwise_metrics
+ )
+ res.update(res_pairwise)
+ return res
+
+[docs] def predict(self, infile_name, outfile_name):
+ """Make predictions on the given data, and output predicted scores to a file.
+
+ Args:
+            infile_name (str): Input file name; the format is the same as the train/valid/test files.
+            outfile_name (str): Output file name; each line is a predicted score.
+
+ Returns:
+ object: An instance of self.
+ """
+ load_sess = self.sess
+ with tf.io.gfile.GFile(outfile_name, "w") as wt:
+ for batch_data_input, _, data_size in self.iterator.load_data_from_file(
+ infile_name
+ ):
+ step_pred = self.infer(load_sess, batch_data_input)
+ step_pred = step_pred[0][:data_size]
+ step_pred = np.reshape(step_pred, -1)
+ wt.write("\n".join(map(str, step_pred)))
+ # line break after each batch.
+ wt.write("\n")
+ return self
+
+ def _attention(self, inputs, attention_size):
+        """Soft alignment attention implementation.
+
+ Args:
+ inputs (object): Sequences ready to apply attention.
+ attention_size (int): The dimension of attention operation.
+
+ Returns:
+ object: Weighted sum after attention.
+ """
+ hidden_size = inputs.shape[2]
+ if not attention_size:
+ attention_size = hidden_size
+
+ attention_mat = tf.compat.v1.get_variable(
+ name="attention_mat",
+ shape=[inputs.shape[-1], hidden_size],
+ initializer=self.initializer,
+ )
+ att_inputs = tf.tensordot(inputs, attention_mat, [[2], [0]])
+
+ query = tf.compat.v1.get_variable(
+ name="query",
+ shape=[attention_size],
+ dtype=tf.float32,
+ initializer=self.initializer,
+ )
+ att_logits = tf.tensordot(att_inputs, query, axes=1, name="att_logits")
+ att_weights = tf.nn.softmax(att_logits, name="att_weights")
+ output = inputs * tf.expand_dims(att_weights, -1)
+ return output
+
+ def _fcn_net(self, model_output, layer_sizes, scope):
+ """Construct the MLP part for the model.
+
+ Args:
+            model_output (object): The output of the upper layers, which is the input of the MLP part.
+            layer_sizes (list): The size of each layer of the MLP part.
+            scope (object): The variable scope of the MLP part.
+
+ Returns:
+ object: Prediction logit after fully connected layer.
+ """
+ hparams = self.hparams
+ with tf.compat.v1.variable_scope(scope):
+ last_layer_size = model_output.shape[-1]
+ layer_idx = 0
+ hidden_nn_layers = []
+ hidden_nn_layers.append(model_output)
+ with tf.compat.v1.variable_scope(
+ "nn_part", initializer=self.initializer
+ ) as scope:
+ for idx, layer_size in enumerate(layer_sizes):
+ curr_w_nn_layer = tf.compat.v1.get_variable(
+ name="w_nn_layer" + str(layer_idx),
+ shape=[last_layer_size, layer_size],
+ dtype=tf.float32,
+ )
+ curr_b_nn_layer = tf.compat.v1.get_variable(
+ name="b_nn_layer" + str(layer_idx),
+ shape=[layer_size],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "w_nn_layer" + str(layer_idx), curr_w_nn_layer
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "b_nn_layer" + str(layer_idx), curr_b_nn_layer
+ )
+ curr_hidden_nn_layer = (
+ tf.tensordot(
+ hidden_nn_layers[layer_idx], curr_w_nn_layer, axes=1
+ )
+ + curr_b_nn_layer
+ )
+
+ scope = "nn_part" + str(idx)
+ activation = hparams.activation[idx]
+
+ if hparams.enable_BN is True:
+ curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
+ curr_hidden_nn_layer,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ curr_hidden_nn_layer = self._active_layer(
+ logit=curr_hidden_nn_layer, activation=activation, layer_idx=idx
+ )
+ hidden_nn_layers.append(curr_hidden_nn_layer)
+ layer_idx += 1
+ last_layer_size = layer_size
+
+ w_nn_output = tf.compat.v1.get_variable(
+ name="w_nn_output", shape=[last_layer_size, 1], dtype=tf.float32
+ )
+ b_nn_output = tf.compat.v1.get_variable(
+ name="b_nn_output",
+ shape=[1],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "w_nn_output" + str(layer_idx), w_nn_output
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "b_nn_output" + str(layer_idx), b_nn_output
+ )
+ nn_output = (
+ tf.tensordot(hidden_nn_layers[-1], w_nn_output, axes=1)
+ + b_nn_output
+ )
+ self.logit = nn_output
+ return nn_output
+
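+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of the softmax data loss used in `_compute_data_loss` above.  Logits are
+# grouped into one positive followed by `train_num_ngs` negatives, and the
+# loss maximizes the softmax probability of the positive slot.  All sizes
+# below are hypothetical.
+if __name__ == "__main__":
+    import numpy as np
+
+    train_num_ngs = 3                                    # negatives per positive (assumed)
+    group = train_num_ngs + 1
+    rng = np.random.default_rng(0)
+    logits = rng.normal(size=(8, group))                 # 8 toy groups
+    labels = np.tile([1.0] + [0.0] * train_num_ngs, (8, 1))
+
+    softmax = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
+    pos_softmax = np.where(labels == 1.0, softmax, 1.0)  # log(1) = 0 drops the negatives
+    print("toy softmax data loss:", -group * np.log(pos_softmax).mean())
+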
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow as tf
+
+from recommenders.models.deeprec.models.base_model import BaseModel
+
+__all__ = ["DKN"]
+
+
+[docs]class DKN(BaseModel):
+ """DKN model (Deep Knowledge-Aware Network)
+
+ :Citation:
+
+ H. Wang, F. Zhang, X. Xie and M. Guo, "DKN: Deep Knowledge-Aware Network for News
+ Recommendation", in Proceedings of the 2018 World Wide Web Conference on World
+ Wide Web, 2018.
+ """
+
+ def __init__(self, hparams, iterator_creator):
+ """Initialization steps for DKN.
+ Compared with the BaseModel, DKN requires two different pre-computed embeddings,
+ i.e. word embedding and entity embedding.
+ After creating these two embedding variables, BaseModel's `__init__` method will be called.
+
+ Args:
+ hparams (object): Global hyper-parameters.
+ iterator_creator (object): DKN data loader class.
+ """
+ self.graph = tf.Graph()
+ with self.graph.as_default():
+ with tf.compat.v1.name_scope("embedding"):
+ word2vec_embedding = self._init_embedding(hparams.wordEmb_file)
+ self.embedding = tf.Variable(
+ word2vec_embedding, trainable=True, name="word"
+ )
+
+ if hparams.use_entity:
+ e_embedding = self._init_embedding(hparams.entityEmb_file)
+ W = tf.Variable(
+ tf.random.uniform([hparams.entity_dim, hparams.dim], -1, 1),
+ trainable=True,
+ )
+ b = tf.Variable(tf.zeros([hparams.dim]), trainable=True)
+ self.entity_embedding = tf.nn.tanh(tf.matmul(e_embedding, W) + b)
+ else:
+ self.entity_embedding = tf.Variable(
+ tf.constant(
+ 0.0,
+ shape=[hparams.entity_size, hparams.dim],
+ dtype=tf.float32,
+ ),
+ trainable=True,
+ name="entity",
+ )
+
+ if hparams.use_context:
+ c_embedding = self._init_embedding(hparams.contextEmb_file)
+ W = tf.Variable(
+ tf.random.uniform([hparams.entity_dim, hparams.dim], -1, 1),
+ trainable=True,
+ )
+ b = tf.Variable(tf.zeros([hparams.dim]), trainable=True)
+ self.context_embedding = tf.nn.tanh(tf.matmul(c_embedding, W) + b)
+ else:
+ self.context_embedding = tf.Variable(
+ tf.constant(
+ 0.0,
+ shape=[hparams.entity_size, hparams.dim],
+ dtype=tf.float32,
+ ),
+ trainable=True,
+ name="context",
+ )
+
+ super().__init__(hparams, iterator_creator, graph=self.graph)
+
+ def _init_embedding(self, file_path):
+ """Load pre-trained embeddings as a constant tensor.
+
+ Args:
+ file_path (str): the pre-trained embeddings filename.
+
+ Returns:
+ object: A constant tensor.
+ """
+ return tf.constant(np.load(file_path).astype(np.float32))
+
+ def _l2_loss(self):
+ hparams = self.hparams
+ l2_loss = tf.zeros([1], dtype=tf.float32)
+ # embedding_layer l2 loss
+ l2_loss = tf.add(
+ l2_loss, tf.multiply(hparams.embed_l2, tf.nn.l2_loss(self.embedding))
+ )
+ if hparams.use_entity:
+ l2_loss = tf.add(
+ l2_loss,
+ tf.multiply(hparams.embed_l2, tf.nn.l2_loss(self.entity_embedding)),
+ )
+ if hparams.use_entity and hparams.use_context:
+ l2_loss = tf.add(
+ l2_loss,
+ tf.multiply(hparams.embed_l2, tf.nn.l2_loss(self.context_embedding)),
+ )
+ params = self.layer_params
+ for param in params:
+ l2_loss = tf.add(
+ l2_loss, tf.multiply(hparams.layer_l2, tf.nn.l2_loss(param))
+ )
+ return l2_loss
+
+ def _l1_loss(self):
+ hparams = self.hparams
+ l1_loss = tf.zeros([1], dtype=tf.float32)
+        # embedding_layer l1 loss
+ l1_loss = tf.add(
+ l1_loss,
+ tf.multiply(hparams.embed_l1, tf.norm(tensor=self.embedding, ord=1)),
+ )
+ if hparams.use_entity:
+ l1_loss = tf.add(
+ l1_loss,
+ tf.multiply(
+ hparams.embed_l1, tf.norm(tensor=self.entity_embedding, ord=1)
+ ),
+ )
+ if hparams.use_entity and hparams.use_context:
+ l1_loss = tf.add(
+ l1_loss,
+ tf.multiply(
+ hparams.embed_l1, tf.norm(tensor=self.context_embedding, ord=1)
+ ),
+ )
+ params = self.layer_params
+ for param in params:
+ l1_loss = tf.add(
+ l1_loss, tf.multiply(hparams.layer_l1, tf.norm(tensor=param, ord=1))
+ )
+ return l1_loss
+
+ def _build_graph(self):
+ hparams = self.hparams
+ self.keep_prob_train = 1 - np.array(hparams.dropout)
+ self.keep_prob_test = np.ones_like(hparams.dropout)
+ with tf.compat.v1.variable_scope("DKN"):
+ logit = self._build_dkn()
+ return logit
+
+ def _build_dkn(self):
+ """The main function to create DKN's logic.
+
+ Returns:
+ object: Prediction score made by the DKN model.
+ """
+ hparams = self.hparams
+ # build attention model for clicked news and candidate news
+ click_news_embed_batch, candidate_news_embed_batch = self._build_pair_attention(
+ self.iterator.candidate_news_index_batch,
+ self.iterator.candidate_news_entity_index_batch,
+ self.iterator.click_news_index_batch,
+ self.iterator.click_news_entity_index_batch,
+ hparams,
+ )
+
+ nn_input = tf.concat(
+ [click_news_embed_batch, candidate_news_embed_batch], axis=1
+ )
+
+ dnn_channel_part = 2
+ last_layer_size = dnn_channel_part * self.num_filters_total
+ layer_idx = 0
+ hidden_nn_layers = []
+ hidden_nn_layers.append(nn_input)
+ with tf.compat.v1.variable_scope("nn_part", initializer=self.initializer):
+ for idx, layer_size in enumerate(hparams.layer_sizes):
+ curr_w_nn_layer = tf.compat.v1.get_variable(
+ name="w_nn_layer" + str(layer_idx),
+ shape=[last_layer_size, layer_size],
+ dtype=tf.float32,
+ )
+ curr_b_nn_layer = tf.compat.v1.get_variable(
+ name="b_nn_layer" + str(layer_idx),
+ shape=[layer_size],
+ dtype=tf.float32,
+ )
+ curr_hidden_nn_layer = tf.compat.v1.nn.xw_plus_b(
+ hidden_nn_layers[layer_idx], curr_w_nn_layer, curr_b_nn_layer
+ )
+ if hparams.enable_BN is True:
+ curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
+ curr_hidden_nn_layer,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ activation = hparams.activation[idx]
+ curr_hidden_nn_layer = self._active_layer(
+ logit=curr_hidden_nn_layer, activation=activation
+ )
+ hidden_nn_layers.append(curr_hidden_nn_layer)
+ layer_idx += 1
+ last_layer_size = layer_size
+ self.layer_params.append(curr_w_nn_layer)
+ self.layer_params.append(curr_b_nn_layer)
+
+ w_nn_output = tf.compat.v1.get_variable(
+ name="w_nn_output", shape=[last_layer_size, 1], dtype=tf.float32
+ )
+ b_nn_output = tf.compat.v1.get_variable(
+ name="b_nn_output", shape=[1], dtype=tf.float32
+ )
+ self.layer_params.append(w_nn_output)
+ self.layer_params.append(b_nn_output)
+ nn_output = tf.compat.v1.nn.xw_plus_b(
+ hidden_nn_layers[-1], w_nn_output, b_nn_output
+ )
+ return nn_output
+
+ def _build_pair_attention(
+ self,
+ candidate_word_batch,
+ candidate_entity_batch,
+ click_word_batch,
+ click_entity_batch,
+ hparams,
+ ):
+        """This function learns the candidate news article embedding and the user embedding.
+        The user embedding is generated from the click history and also depends on the candidate news article via an attention mechanism.
+        The article embedding is generated by the KCNN module.
+
+        Args:
+            candidate_word_batch (object): Tensor of word indices of the candidate news articles.
+            candidate_entity_batch (object): Tensor of entity indices of the candidate news articles.
+            click_word_batch (object): Tensor of word indices of the user's clicked history.
+            click_entity_batch (object): Tensor of entity indices of the user's clicked history.
+            hparams (object): Global hyper-parameters.
+
+        Returns:
+            click_field_embed_final_batch: User embedding.
+            news_field_embed_final_batch: Candidate news article embedding.
+
+ """
+ doc_size = hparams.doc_size
+ attention_hidden_sizes = hparams.attention_layer_sizes
+
+ clicked_words = tf.reshape(click_word_batch, shape=[-1, doc_size])
+ clicked_entities = tf.reshape(click_entity_batch, shape=[-1, doc_size])
+
+ with tf.compat.v1.variable_scope(
+ "attention_net", initializer=self.initializer
+ ) as scope: # noqa: F841
+
+ # use kims cnn to get conv embedding
+ with tf.compat.v1.variable_scope(
+ "kcnn", initializer=self.initializer, reuse=tf.compat.v1.AUTO_REUSE
+ ) as cnn_scope: # noqa: F841
+ news_field_embed = self._kims_cnn(
+ candidate_word_batch, candidate_entity_batch, hparams
+ )
+ click_field_embed = self._kims_cnn(
+ clicked_words, clicked_entities, hparams
+ )
+ click_field_embed = tf.reshape(
+ click_field_embed,
+ shape=[
+ -1,
+ hparams.history_size,
+ hparams.num_filters * len(hparams.filter_sizes),
+ ],
+ )
+
+ avg_strategy = False
+ if avg_strategy:
+ click_field_embed_final = tf.reduce_mean(
+ input_tensor=click_field_embed, axis=1, keepdims=True
+ )
+ else:
+ news_field_embed = tf.expand_dims(news_field_embed, 1)
+ news_field_embed_repeat = tf.add(
+ tf.zeros_like(click_field_embed), news_field_embed
+ )
+ attention_x = tf.concat(
+ axis=-1, values=[click_field_embed, news_field_embed_repeat]
+ )
+ attention_x = tf.reshape(
+ attention_x, shape=[-1, self.num_filters_total * 2]
+ )
+ attention_w = tf.compat.v1.get_variable(
+ name="attention_hidden_w",
+ shape=[self.num_filters_total * 2, attention_hidden_sizes],
+ dtype=tf.float32,
+ )
+ attention_b = tf.compat.v1.get_variable(
+ name="attention_hidden_b",
+ shape=[attention_hidden_sizes],
+ dtype=tf.float32,
+ )
+ curr_attention_layer = tf.compat.v1.nn.xw_plus_b(
+ attention_x, attention_w, attention_b
+ )
+
+ if hparams.enable_BN is True:
+ curr_attention_layer = tf.compat.v1.layers.batch_normalization(
+ curr_attention_layer,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ activation = hparams.attention_activation
+ curr_attention_layer = self._active_layer(
+ logit=curr_attention_layer, activation=activation
+ )
+ attention_output_w = tf.compat.v1.get_variable(
+ name="attention_output_w",
+ shape=[attention_hidden_sizes, 1],
+ dtype=tf.float32,
+ )
+ attention_output_b = tf.compat.v1.get_variable(
+ name="attention_output_b", shape=[1], dtype=tf.float32
+ )
+ attention_weight = tf.compat.v1.nn.xw_plus_b(
+ curr_attention_layer, attention_output_w, attention_output_b
+ )
+ attention_weight = tf.reshape(
+ attention_weight, shape=[-1, hparams.history_size, 1]
+ )
+ norm_attention_weight = tf.nn.softmax(attention_weight, axis=1)
+ click_field_embed_final = tf.reduce_sum(
+ input_tensor=tf.multiply(click_field_embed, norm_attention_weight),
+ axis=1,
+ keepdims=True,
+ )
+ if attention_w not in self.layer_params:
+ self.layer_params.append(attention_w)
+ if attention_b not in self.layer_params:
+ self.layer_params.append(attention_b)
+ if attention_output_w not in self.layer_params:
+ self.layer_params.append(attention_output_w)
+ if attention_output_b not in self.layer_params:
+ self.layer_params.append(attention_output_b)
+ self.news_field_embed_final_batch = tf.squeeze(news_field_embed)
+ click_field_embed_final_batch = tf.squeeze(click_field_embed_final)
+
+ return click_field_embed_final_batch, self.news_field_embed_final_batch
+
+ def _kims_cnn(self, word, entity, hparams):
+ """The KCNN module. KCNN is an extension of traditional CNN that incorporates symbolic knowledge from
+ a knowledge graph into sentence representation learning.
+ Args:
+ word (object): word indices for the sentence.
+ entity (object): entity indices for the sentence. Entities are aligned with words in the sentence.
+ hparams (object): global hyper-parameters.
+
+ Returns:
+ object: Sentence representation.
+ """
+ # kims cnn parameter
+ filter_sizes = hparams.filter_sizes
+ num_filters = hparams.num_filters
+
+ dim = hparams.dim
+ embedded_chars = tf.nn.embedding_lookup(params=self.embedding, ids=word)
+ if hparams.use_entity and hparams.use_context:
+ entity_embedded_chars = tf.nn.embedding_lookup(
+ params=self.entity_embedding, ids=entity
+ )
+ context_embedded_chars = tf.nn.embedding_lookup(
+ params=self.context_embedding, ids=entity
+ )
+ concat = tf.concat(
+ [embedded_chars, entity_embedded_chars, context_embedded_chars], axis=-1
+ )
+ elif hparams.use_entity:
+ entity_embedded_chars = tf.nn.embedding_lookup(
+ params=self.entity_embedding, ids=entity
+ )
+ concat = tf.concat([embedded_chars, entity_embedded_chars], axis=-1)
+ else:
+ concat = embedded_chars
+ concat_expanded = tf.expand_dims(concat, -1)
+
+ # Create a convolution + maxpool layer for each filter size
+ pooled_outputs = []
+ for i, filter_size in enumerate(filter_sizes):
+ with tf.compat.v1.variable_scope(
+ "conv-maxpool-%s" % filter_size, initializer=self.initializer
+ ):
+ # Convolution Layer
+ if hparams.use_entity and hparams.use_context:
+ filter_shape = [filter_size, dim * 3, 1, num_filters]
+ elif hparams.use_entity:
+ filter_shape = [filter_size, dim * 2, 1, num_filters]
+ else:
+ filter_shape = [filter_size, dim, 1, num_filters]
+ W = tf.compat.v1.get_variable(
+ name="W" + "_filter_size_" + str(filter_size),
+ shape=filter_shape,
+ dtype=tf.float32,
+ initializer=tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0,
+ mode="fan_avg",
+                        distribution="truncated_normal",
+ ),
+ )
+ b = tf.compat.v1.get_variable(
+ name="b" + "_filter_size_" + str(filter_size),
+ shape=[num_filters],
+ dtype=tf.float32,
+ )
+ if W not in self.layer_params:
+ self.layer_params.append(W)
+ if b not in self.layer_params:
+ self.layer_params.append(b)
+ conv = tf.nn.conv2d(
+ input=concat_expanded,
+ filters=W,
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="conv",
+ )
+ # Apply nonlinearity
+ h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
+ # Maxpooling over the outputs
+ pooled = tf.nn.max_pool2d(
+ h,
+ ksize=[1, hparams.doc_size - filter_size + 1, 1, 1],
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="pool",
+ )
+ pooled_outputs.append(pooled)
+ # Combine all the pooled features
+ # self.num_filters_total is the kims cnn output dimension
+ self.num_filters_total = num_filters * len(filter_sizes)
+ h_pool = tf.concat(pooled_outputs, axis=-1)
+ h_pool_flat = tf.reshape(h_pool, [-1, self.num_filters_total])
+ return h_pool_flat
+
+[docs] def infer_embedding(self, sess, feed_dict):
+ """Infer document embedding in feed_dict with current model.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Feed values for evaluation. This is a dictionary that maps graph elements to values.
+
+ Returns:
+ list: News embedding in a batch.
+ """
+ feed_dict[self.layer_keeps] = self.keep_prob_test
+ feed_dict[self.is_train_stage] = False
+ return sess.run([self.news_field_embed_final_batch], feed_dict=feed_dict)
+
+[docs] def run_get_embedding(self, infile_name, outfile_name):
+        """Infer document embeddings with the current model.
+
+ Args:
+ infile_name (str): Input file name, format is [Newsid] [w1,w2,w3...] [e1,e2,e3...]
+ outfile_name (str): Output file name, format is [Newsid] [embedding]
+
+ Returns:
+ object: An instance of self.
+ """
+ load_sess = self.sess
+ with tf.io.gfile.GFile(outfile_name, "w") as wt:
+ for (
+ batch_data_input,
+ newsid_list,
+ data_size,
+ ) in self.iterator.load_infer_data_from_file(infile_name):
+ news_embedding = self.infer_embedding(load_sess, batch_data_input)[0]
+ for i in range(data_size):
+ wt.write(
+ newsid_list[i]
+ + " "
+ + ",".join(
+ [
+ str(embedding_value)
+ for embedding_value in news_embedding[i]
+ ]
+ )
+ + "\n"
+ )
+ return self
+
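+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of the shape bookkeeping in `_kims_cnn` above.  Word, entity and
+# (optionally) context embeddings are concatenated along the channel axis,
+# each filter spans the full channel width, and max pooling over the
+# remaining positions keeps one value per filter.  All numbers are
+# hypothetical.
+if __name__ == "__main__":
+    doc_size, dim = 10, 32                               # words per document, embedding dim
+    filter_sizes, num_filters = [1, 2, 3], 50
+    use_entity = use_context = True
+
+    channel_width = dim * (1 + use_entity + use_context)
+    num_filters_total = num_filters * len(filter_sizes)
+    print("KCNN input channel width:", channel_width)
+    for filter_size in filter_sizes:
+        positions = doc_size - filter_size + 1           # "VALID" convolution output length
+        print(f"filter_size={filter_size}: {positions} positions pooled to 1 x {num_filters}")
+    print("sentence representation size:", num_filters_total)
+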
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow as tf
+from recommenders.models.deeprec.models.dkn import DKN
+from recommenders.models.deeprec.deeprec_utils import cal_metric
+
+"""
+This new model adapts DKN's structure for item-to-item recommendations.
+The tutorial can be found at: https://github.com/microsoft/recommenders/blob/main/examples/07_tutorials/KDD2020-tutorial/step4_run_dkn_item2item.ipynb
+"""
+
+
+[docs]class DKNItem2Item(DKN):
+ """Class for item-to-item recommendations using DKN.
+ See https://github.com/microsoft/recommenders/blob/main/examples/07_tutorials/KDD2020-tutorial/step4_run_dkn_item2item.ipynb"""
+
+ def _compute_data_loss(self):
+ logits = self.pred
+ data_loss = -1 * tf.reduce_sum(input_tensor=tf.math.log(logits[:, 0] + 1e-10))
+ return data_loss
+
+ def _build_dkn(self):
+ """The main function to create DKN's logic.
+
+ Returns:
+ object: Prediction of item2item relation scores made by the DKN model, in the shape of (`batch_size`, `num_negative` + 1).
+ """
+ news_field_embed_final_batch = self._build_doc_embedding(
+ self.iterator.candidate_news_index_batch,
+ self.iterator.candidate_news_entity_index_batch,
+ )
+
+ self.news_field_embed_final_batch = tf.math.l2_normalize(
+ news_field_embed_final_batch, axis=-1, epsilon=1e-12
+ )
+
+ item_embs_train = tf.reshape(
+ self.news_field_embed_final_batch,
+ [
+ -1,
+ self.iterator.neg_num + 2,
+ self.news_field_embed_final_batch.shape[-1],
+ ],
+ ) # (B, group, D)
+
+ item_embs_source = item_embs_train[:, 0, :] # get the source item
+ item_embs_source = tf.expand_dims(item_embs_source, 1)
+
+ item_embs_target = item_embs_train[:, 1:, :]
+
+ item_relation = tf.math.multiply(item_embs_target, item_embs_source)
+ item_relation = tf.reduce_sum(
+ input_tensor=item_relation, axis=-1
+ ) # (B, neg_num + 1)
+
+ self.pred_logits = item_relation
+
+ return self.pred_logits
+
+ def _get_pred(self, logit, task):
+ return tf.nn.softmax(logit, axis=-1)
+
+ def _build_doc_embedding(self, candidate_word_batch, candidate_entity_batch):
+ """
+        To make the document embedding dense, we add a tanh layer on top of the `kims_cnn` module.
+ """
+ with tf.compat.v1.variable_scope("kcnn", initializer=self.initializer):
+ news_field_embed = self._kims_cnn(
+ candidate_word_batch, candidate_entity_batch, self.hparams
+ )
+ W = tf.compat.v1.get_variable(
+ name="W_doc_trans",
+ shape=(news_field_embed.shape[-1], self.num_filters_total),
+ dtype=tf.float32,
+ initializer=tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0,
+ mode="fan_avg",
+                    distribution="truncated_normal",
+ ),
+ )
+ if W not in self.layer_params:
+ self.layer_params.append(W)
+ news_field_embed = tf.tanh(tf.matmul(news_field_embed, W))
+ return news_field_embed
+
+[docs] def eval(self, sess, feed_dict):
+ """Evaluate the data in `feed_dict` with current model.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Feed values for evaluation. This is a dictionary that maps graph elements to values.
+
+ Returns:
+ numpy.ndarray, numpy.ndarray: A tuple with predictions and labels arrays.
+ """
+ feed_dict[self.layer_keeps] = self.keep_prob_test
+ feed_dict[self.is_train_stage] = False
+ preds = sess.run(self.pred, feed_dict=feed_dict)
+ labels = np.zeros_like(preds, dtype=np.int32)
+ labels[:, 0] = 1
+ return (preds, labels)
+
+[docs] def run_eval(self, filename):
+ """Evaluate the given file and returns some evaluation metrics.
+
+ Args:
+ filename (str): A file name that will be evaluated.
+
+ Returns:
+ dict: A dictionary containing evaluation metrics.
+ """
+ load_sess = self.sess
+ group_preds = []
+ group_labels = []
+
+ for (
+ batch_data_input,
+ newsid_list,
+ data_size,
+ ) in self.iterator.load_data_from_file(filename):
+ if batch_data_input:
+ step_pred, step_labels = self.eval(load_sess, batch_data_input)
+ group_preds.extend(step_pred)
+ group_labels.extend(step_labels)
+
+ res = cal_metric(group_labels, group_preds, self.hparams.pairwise_metrics)
+ return res
+
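+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of the item-to-item scoring in `_build_dkn` above: document embeddings are
+# l2-normalized, the source item (slot 0 of each group) is compared against
+# `neg_num + 1` targets by inner product, and a softmax turns the
+# similarities into the prediction used by `_compute_data_loss`.  All sizes
+# are hypothetical.
+if __name__ == "__main__":
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    batch, neg_num, dim = 4, 3, 8
+    group = neg_num + 2                                   # source + positive + negatives
+    embs = rng.normal(size=(batch * group, dim))
+    embs /= np.linalg.norm(embs, axis=-1, keepdims=True)  # l2-normalize
+    embs = embs.reshape(batch, group, dim)
+    source, targets = embs[:, :1, :], embs[:, 1:, :]
+    relation = (targets * source).sum(-1)                 # (batch, neg_num + 1)
+    probs = np.exp(relation) / np.exp(relation).sum(-1, keepdims=True)
+    print("probability assigned to the positive slot:", probs[:, 0])
+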
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import time
+import os
+import sys
+import numpy as np
+import pandas as pd
+from recommenders.evaluation.python_evaluation import (
+ map_at_k,
+ ndcg_at_k,
+ precision_at_k,
+ recall_at_k,
+)
+from recommenders.utils.python_utils import get_top_k_scored_items
+
+tf.compat.v1.disable_eager_execution() # need to disable eager in TF2.x
+
+
+[docs]class LightGCN(object):
+ """LightGCN model
+
+ :Citation:
+
+ He, Xiangnan, Kuan Deng, Xiang Wang, Yan Li, Yongdong Zhang, and Meng Wang.
+ "LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation." arXiv
+ preprint arXiv:2002.02126, 2020.
+ """
+
+ def __init__(self, hparams, data, seed=None):
+        """Initialize the model. Create parameters, placeholders, embeddings and the loss function.
+
+ Args:
+ hparams (HParams): A HParams object, hold the entire set of hyperparameters.
+            data (object): A recommenders.models.deeprec.DataModel.ImplicitCF object that loads and processes the data.
+ seed (int): Seed.
+
+ """
+
+ tf.compat.v1.set_random_seed(seed)
+ np.random.seed(seed)
+
+ self.data = data
+ self.epochs = hparams.epochs
+ self.lr = hparams.learning_rate
+ self.emb_dim = hparams.embed_size
+ self.batch_size = hparams.batch_size
+ self.n_layers = hparams.n_layers
+ self.decay = hparams.decay
+ self.eval_epoch = hparams.eval_epoch
+ self.top_k = hparams.top_k
+ self.save_model = hparams.save_model
+ self.save_epoch = hparams.save_epoch
+ self.metrics = hparams.metrics
+ self.model_dir = hparams.MODEL_DIR
+
+ metric_options = ["map", "ndcg", "precision", "recall"]
+ for metric in self.metrics:
+ if metric not in metric_options:
+ raise ValueError(
+                    "Wrong metric(s). Please select one from this list: {}".format(
+ metric_options
+ )
+ )
+
+ self.norm_adj = data.get_norm_adj_mat()
+
+ self.n_users = data.n_users
+ self.n_items = data.n_items
+
+ self.users = tf.compat.v1.placeholder(tf.int32, shape=(None,))
+ self.pos_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))
+ self.neg_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))
+
+ self.weights = self._init_weights()
+ self.ua_embeddings, self.ia_embeddings = self._create_lightgcn_embed()
+
+ self.u_g_embeddings = tf.nn.embedding_lookup(
+ params=self.ua_embeddings, ids=self.users
+ )
+ self.pos_i_g_embeddings = tf.nn.embedding_lookup(
+ params=self.ia_embeddings, ids=self.pos_items
+ )
+ self.neg_i_g_embeddings = tf.nn.embedding_lookup(
+ params=self.ia_embeddings, ids=self.neg_items
+ )
+ self.u_g_embeddings_pre = tf.nn.embedding_lookup(
+ params=self.weights["user_embedding"], ids=self.users
+ )
+ self.pos_i_g_embeddings_pre = tf.nn.embedding_lookup(
+ params=self.weights["item_embedding"], ids=self.pos_items
+ )
+ self.neg_i_g_embeddings_pre = tf.nn.embedding_lookup(
+ params=self.weights["item_embedding"], ids=self.neg_items
+ )
+
+ self.batch_ratings = tf.matmul(
+ self.u_g_embeddings,
+ self.pos_i_g_embeddings,
+ transpose_a=False,
+ transpose_b=True,
+ )
+
+ self.mf_loss, self.emb_loss = self._create_bpr_loss(
+ self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings
+ )
+ self.loss = self.mf_loss + self.emb_loss
+
+ self.opt = tf.compat.v1.train.AdamOptimizer(learning_rate=self.lr).minimize(
+ self.loss
+ )
+ self.saver = tf.compat.v1.train.Saver(max_to_keep=1)
+
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ self.sess = tf.compat.v1.Session(
+ config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
+ )
+ self.sess.run(tf.compat.v1.global_variables_initializer())
+
+ def _init_weights(self):
+ """Initialize user and item embeddings.
+
+ Returns:
+ dict: With keys `user_embedding` and `item_embedding`, embeddings of all users and items.
+
+ """
+ all_weights = dict()
+ initializer = tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0, mode="fan_avg", distribution="uniform"
+ )
+
+ all_weights["user_embedding"] = tf.Variable(
+ initializer([self.n_users, self.emb_dim]), name="user_embedding"
+ )
+ all_weights["item_embedding"] = tf.Variable(
+ initializer([self.n_items, self.emb_dim]), name="item_embedding"
+ )
+ print("Using xavier initialization.")
+
+ return all_weights
+
+ def _create_lightgcn_embed(self):
+ """Calculate the average embeddings of users and items after every layer of the model.
+
+ Returns:
+ tf.Tensor, tf.Tensor: Average user embeddings. Average item embeddings.
+
+ """
+ A_hat = self._convert_sp_mat_to_sp_tensor(self.norm_adj)
+
+ ego_embeddings = tf.concat(
+ [self.weights["user_embedding"], self.weights["item_embedding"]], axis=0
+ )
+ all_embeddings = [ego_embeddings]
+
+ for k in range(0, self.n_layers):
+ ego_embeddings = tf.sparse.sparse_dense_matmul(A_hat, ego_embeddings)
+ all_embeddings += [ego_embeddings]
+
+ all_embeddings = tf.stack(all_embeddings, 1)
+ all_embeddings = tf.reduce_mean(
+ input_tensor=all_embeddings, axis=1, keepdims=False
+ )
+ u_g_embeddings, i_g_embeddings = tf.split(
+ all_embeddings, [self.n_users, self.n_items], 0
+ )
+ return u_g_embeddings, i_g_embeddings
+
+ def _create_bpr_loss(self, users, pos_items, neg_items):
+ """Calculate BPR loss.
+
+ Args:
+ users (tf.Tensor): User embeddings to calculate loss.
+ pos_items (tf.Tensor): Positive item embeddings to calculate loss.
+ neg_items (tf.Tensor): Negative item embeddings to calculate loss.
+
+ Returns:
+ tf.Tensor, tf.Tensor: Matrix factorization loss. Embedding regularization loss.
+
+ """
+ pos_scores = tf.reduce_sum(input_tensor=tf.multiply(users, pos_items), axis=1)
+ neg_scores = tf.reduce_sum(input_tensor=tf.multiply(users, neg_items), axis=1)
+
+ regularizer = (
+ tf.nn.l2_loss(self.u_g_embeddings_pre)
+ + tf.nn.l2_loss(self.pos_i_g_embeddings_pre)
+ + tf.nn.l2_loss(self.neg_i_g_embeddings_pre)
+ )
+ regularizer = regularizer / self.batch_size
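+        # softplus(-(pos - neg)) equals -log(sigmoid(pos - neg)), i.e. the standard BPR objective.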
+ mf_loss = tf.reduce_mean(
+ input_tensor=tf.nn.softplus(-(pos_scores - neg_scores))
+ )
+ emb_loss = self.decay * regularizer
+ return mf_loss, emb_loss
+
+ def _convert_sp_mat_to_sp_tensor(self, X):
+        """Convert a scipy sparse matrix to tf.SparseTensor.
+
+        Args:
+            X (scipy.sparse matrix): The sparse matrix to convert.
+
+        Returns:
+            tf.SparseTensor: SparseTensor after conversion.
+
+ """
+ coo = X.tocoo().astype(np.float32)
+ indices = np.mat([coo.row, coo.col]).transpose()
+ return tf.SparseTensor(indices, coo.data, coo.shape)
+
+[docs] def fit(self):
+ """Fit the model on self.data.train. If eval_epoch is not -1, evaluate the model on `self.data.test`
+ every `eval_epoch` epoch to observe the training status.
+
+ """
+ for epoch in range(1, self.epochs + 1):
+ train_start = time.time()
+ loss, mf_loss, emb_loss = 0.0, 0.0, 0.0
+ n_batch = self.data.train.shape[0] // self.batch_size + 1
+ for idx in range(n_batch):
+ users, pos_items, neg_items = self.data.train_loader(self.batch_size)
+ _, batch_loss, batch_mf_loss, batch_emb_loss = self.sess.run(
+ [self.opt, self.loss, self.mf_loss, self.emb_loss],
+ feed_dict={
+ self.users: users,
+ self.pos_items: pos_items,
+ self.neg_items: neg_items,
+ },
+ )
+ loss += batch_loss / n_batch
+ mf_loss += batch_mf_loss / n_batch
+ emb_loss += batch_emb_loss / n_batch
+
+ if np.isnan(loss):
+ print("ERROR: loss is nan.")
+ sys.exit()
+ train_end = time.time()
+ train_time = train_end - train_start
+
+ if self.save_model and epoch % self.save_epoch == 0:
+ save_path_str = os.path.join(self.model_dir, "epoch_" + str(epoch))
+ if not os.path.exists(save_path_str):
+ os.makedirs(save_path_str)
+ checkpoint_path = self.saver.save( # noqa: F841
+ sess=self.sess, save_path=save_path_str
+ )
+ print("Save model to path {0}".format(os.path.abspath(save_path_str)))
+
+ if self.eval_epoch == -1 or epoch % self.eval_epoch != 0:
+ print(
+ "Epoch %d (train)%.1fs: train loss = %.5f = (mf)%.5f + (embed)%.5f"
+ % (epoch, train_time, loss, mf_loss, emb_loss)
+ )
+ else:
+ eval_start = time.time()
+ ret = self.run_eval()
+ eval_end = time.time()
+ eval_time = eval_end - eval_start
+
+ print(
+ "Epoch %d (train)%.1fs + (eval)%.1fs: train loss = %.5f = (mf)%.5f + (embed)%.5f, %s"
+ % (
+ epoch,
+ train_time,
+ eval_time,
+ loss,
+ mf_loss,
+ emb_loss,
+ ", ".join(
+ metric + " = %.5f" % (r)
+ for metric, r in zip(self.metrics, ret)
+ ),
+ )
+ )
+
+[docs] def load(self, model_path=None):
+ """Load an existing model.
+
+ Args:
+ model_path: Model path.
+
+ Raises:
+ IOError: if the restore operation failed.
+
+ """
+ try:
+ self.saver.restore(self.sess, model_path)
+ except Exception:
+ raise IOError(
+ "Failed to find any matching files for {0}".format(model_path)
+ )
+
+[docs] def run_eval(self):
+ """Run evaluation on self.data.test.
+
+ Returns:
+            list: Results of all metrics in `self.metrics`.
+ """
+ topk_scores = self.recommend_k_items(
+ self.data.test, top_k=self.top_k, use_id=True
+ )
+ ret = []
+ for metric in self.metrics:
+ if metric == "map":
+ ret.append(map_at_k(self.data.test, topk_scores, k=self.top_k))
+ elif metric == "ndcg":
+ ret.append(ndcg_at_k(self.data.test, topk_scores, k=self.top_k))
+ elif metric == "precision":
+ ret.append(precision_at_k(self.data.test, topk_scores, k=self.top_k))
+ elif metric == "recall":
+ ret.append(recall_at_k(self.data.test, topk_scores, k=self.top_k))
+ return ret
+
+[docs] def score(self, user_ids, remove_seen=True):
+ """Score all items for test users.
+
+ Args:
+ user_ids (np.array): Users to test.
+ remove_seen (bool): Flag to remove items seen in training from recommendation.
+
+ Returns:
+ numpy.ndarray: Value of interest of all items for the users.
+
+ """
+ if any(np.isnan(user_ids)):
+ raise ValueError(
+ "LightGCN cannot score users that are not in the training set"
+ )
+ u_batch_size = self.batch_size
+ n_user_batchs = len(user_ids) // u_batch_size + 1
+ test_scores = []
+ for u_batch_id in range(n_user_batchs):
+ start = u_batch_id * u_batch_size
+ end = (u_batch_id + 1) * u_batch_size
+ user_batch = user_ids[start:end]
+ item_batch = range(self.data.n_items)
+ rate_batch = self.sess.run(
+ self.batch_ratings, {self.users: user_batch, self.pos_items: item_batch}
+ )
+ test_scores.append(np.array(rate_batch))
+ test_scores = np.concatenate(test_scores, axis=0)
+ if remove_seen:
+ test_scores += self.data.R.tocsr()[user_ids, :] * -np.inf
+ return test_scores
+
+[docs] def recommend_k_items(
+ self, test, top_k=10, sort_top_k=True, remove_seen=True, use_id=False
+ ):
+ """Recommend top K items for all users in the test set.
+
+ Args:
+ test (pandas.DataFrame): Test data.
+ top_k (int): Number of top items to recommend.
+ sort_top_k (bool): Flag to sort top k results.
+            remove_seen (bool): Flag to remove items seen in training from recommendation.
+            use_id (bool): Flag to use internal user and item IDs directly; if False,
+                users in `test` are mapped through `data.user2id` and recommended items
+                are mapped back through `data.id2item`.
+
+ Returns:
+ pandas.DataFrame: Top k recommendation items for each user.
+
+ """
+ data = self.data
+ if not use_id:
+ user_ids = np.array([data.user2id[x] for x in test[data.col_user].unique()])
+ else:
+ user_ids = np.array(test[data.col_user].unique())
+
+ test_scores = self.score(user_ids, remove_seen=remove_seen)
+
+ top_items, top_scores = get_top_k_scored_items(
+ scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
+ )
+
+ df = pd.DataFrame(
+ {
+ data.col_user: np.repeat(
+ test[data.col_user].drop_duplicates().values, top_items.shape[1]
+ ),
+ data.col_item: top_items.flatten()
+ if use_id
+ else [data.id2item[item] for item in top_items.flatten()],
+ data.col_prediction: top_scores.flatten(),
+ }
+ )
+
+ return df.replace(-np.inf, np.nan).dropna()
+
+    def output_embeddings(self, idmapper, n, target, user_file):
+        """Write the first `n` rows of the embedding tensor `target` to `user_file`,
+        mapping row indices back to original IDs with `idmapper`.
+        """
+        embeddings = list(target.eval(session=self.sess))
+ with open(user_file, "w") as wt:
+ for i in range(n):
+ wt.write(
+ "{0}\t{1}\n".format(
+ idmapper[i], " ".join([str(a) for a in embeddings[i]])
+ )
+ )
+
+[docs] def infer_embedding(self, user_file, item_file):
+ """Export user and item embeddings to csv files.
+
+ Args:
+ user_file (str): Path of file to save user embeddings.
+ item_file (str): Path of file to save item embeddings.
+
+ """
+ # create output directories if they do not exist
+ dirs, _ = os.path.split(user_file)
+ if not os.path.exists(dirs):
+ os.makedirs(dirs)
+ dirs, _ = os.path.split(item_file)
+ if not os.path.exists(dirs):
+ os.makedirs(dirs)
+
+ data = self.data
+
+ self.output_embeddings(
+ data.id2user, self.n_users, self.ua_embeddings, user_file
+ )
+ self.output_embeddings(
+ data.id2item, self.n_items, self.ia_embeddings, item_file
+ )
+
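+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of LightGCN propagation as implemented in `_create_lightgcn_embed` above:
+# embeddings are repeatedly multiplied by the normalized adjacency matrix and
+# the layer outputs are averaged.  A dense toy adjacency is used for brevity;
+# all sizes are hypothetical.
+if __name__ == "__main__":
+    import numpy as np
+
+    rng = np.random.default_rng(42)
+    n_users, n_items, emb_dim, n_layers = 3, 4, 8, 2
+    n_nodes = n_users + n_items
+
+    R = rng.integers(0, 2, size=(n_users, n_items)).astype(float)    # toy interactions
+    A = np.zeros((n_nodes, n_nodes))
+    A[:n_users, n_users:], A[n_users:, :n_users] = R, R.T
+    deg = np.maximum(A.sum(1), 1.0)
+    A_hat = A / np.sqrt(deg)[:, None] / np.sqrt(deg)[None, :]        # D^-1/2 A D^-1/2
+
+    ego = rng.normal(size=(n_nodes, emb_dim))
+    layers = [ego]
+    for _ in range(n_layers):
+        layers.append(A_hat @ layers[-1])                            # one propagation step
+    final = np.mean(layers, axis=0)                                  # layer-wise average
+    users, items = final[:n_users], final[n_users:]
+    print("batch ratings shape:", (users @ items.T).shape)           # (n_users, n_items)
+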
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from recommenders.models.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+
+__all__ = ["A2SVDModel"]
+
+
+[docs]class A2SVDModel(SequentialBaseModel):
+ """A2SVD Model (Attentive Asynchronous Singular Value Decomposition)
+
+ It extends ASVD with an attention module.
+
+ :Citation:
+
+ ASVD: Y. Koren, "Factorization Meets the Neighborhood: a Multifaceted Collaborative
+ Filtering Model", in Proceedings of the 14th ACM SIGKDD international conference on
+ Knowledge discovery and data mining, pages 426–434, ACM, 2008.
+
+ A2SVD: Z. Yu, J. Lian, A. Mahmoody, G. Liu and X. Xie, "Adaptive User Modeling with
+        Long and Short-Term Preferences for Personalized Recommendation", in Proceedings of
+ the 28th International Joint Conferences on Artificial Intelligence, IJCAI’19,
+ Pages 4213-4219, AAAI Press, 2019.
+ """
+
+ def _build_seq_graph(self):
+ """The main function to create A2SVD model.
+
+ Returns:
+ object: The output of A2SVD section.
+ """
+ hparams = self.hparams
+ with tf.compat.v1.variable_scope("a2svd"):
+ hist_input = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ with tf.compat.v1.variable_scope("Attention_layer"):
+ att_outputs1 = self._attention(hist_input, hparams.attention_size)
+ asvd_output = tf.reduce_sum(input_tensor=att_outputs1, axis=1)
+ tf.compat.v1.summary.histogram("a2svd_output", asvd_output)
+ model_output = tf.concat([asvd_output, self.target_item_embedding], 1)
+ self.model_output = model_output
+ tf.compat.v1.summary.histogram("model_output", model_output)
+ return model_output
+
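+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of the attention-weighted history sum that A2SVD builds on top of
+# `_attention` (the projection matrix used there is omitted for brevity).
+# All sizes are hypothetical.
+if __name__ == "__main__":
+    import numpy as np
+
+    rng = np.random.default_rng(1)
+    batch, seq_len, hidden = 2, 5, 6
+    hist = rng.normal(size=(batch, seq_len, hidden))       # item + category history embeddings
+    query = rng.normal(size=(hidden,))
+
+    logits = hist @ query                                   # (batch, seq_len)
+    weights = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
+    asvd_output = (hist * weights[..., None]).sum(axis=1)   # weighted sum over the history
+    target = rng.normal(size=(batch, hidden))
+    model_output = np.concatenate([asvd_output, target], axis=1)
+    print("model_output shape:", model_output.shape)        # (batch, 2 * hidden)
+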
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from recommenders.models.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+
+__all__ = ["CaserModel"]
+
+
+[docs]class CaserModel(SequentialBaseModel):
+ """Caser Model
+
+ :Citation:
+
+ J. Tang and K. Wang, "Personalized top-n sequential recommendation via convolutional
+ sequence embedding", in Proceedings of the Eleventh ACM International Conference on
+ Web Search and Data Mining, ACM, 2018.
+ """
+
+ def __init__(self, hparams, iterator_creator, seed=None):
+ """Initialization of variables for caser
+
+ Args:
+ hparams (HParams): A HParams object, hold the entire set of hyperparameters.
+ iterator_creator (object): An iterator to load the data.
+ """
+ self.hparams = hparams
+        self.L = hparams.L  # length of the history sequence used in convolution
+        self.T = hparams.T  # number of targets to predict
+        self.n_v = hparams.n_v  # number of vertical convolution filters
+        self.n_h = hparams.n_h  # number of horizontal convolution filters
+        self.lengths = [
+            i + 1 for i in range(self.L)
+        ]  # horizontal convolution filter heights
+ super().__init__(hparams, iterator_creator, seed=seed)
+
+ def _build_seq_graph(self):
+ """The main function to create caser model.
+
+ Returns:
+ object: The output of caser section.
+ """
+ with tf.compat.v1.variable_scope("caser"):
+ cnn_output = self._caser_cnn()
+ model_output = tf.concat([cnn_output, self.target_item_embedding], 1)
+ tf.compat.v1.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _add_cnn(self, hist_matrix, vertical_dim, scope):
+        """Apply CNNs along both the vertical and horizontal directions.
+
+        Args:
+            hist_matrix (object): The history sequence embeddings.
+            vertical_dim (int): The embedding dimension of the input.
+            scope (object): The variable scope of the CNN part.
+
+ Returns:
+ object: The output of CNN layers.
+ """
+ with tf.compat.v1.variable_scope(scope):
+ with tf.compat.v1.variable_scope("vertical"):
+ embedding_T = tf.transpose(a=hist_matrix, perm=[0, 2, 1])
+ out_v = self._build_cnn(embedding_T, self.n_v, vertical_dim)
+ out_v = tf.compat.v1.layers.flatten(out_v)
+ with tf.compat.v1.variable_scope("horizonal"):
+ out_hs = []
+ for h in self.lengths:
+ conv_out = self._build_cnn(hist_matrix, self.n_h, h)
+ max_pool_out = tf.reduce_max(
+ input_tensor=conv_out, axis=[1], name="max_pool_{0}".format(h)
+ )
+ out_hs.append(max_pool_out)
+ out_h = tf.concat(out_hs, 1)
+ return tf.concat([out_v, out_h], 1)
+
+ def _caser_cnn(self):
+        """Apply the CNN to both the item and the category history embeddings.
+
+ Returns:
+ object: The concatenated output of two parts of item and category.
+ """
+ item_out = self._add_cnn(
+ self.item_history_embedding, self.item_embedding_dim, "item"
+ )
+ tf.compat.v1.summary.histogram("item_out", item_out)
+ cate_out = self._add_cnn(
+ self.cate_history_embedding, self.cate_embedding_dim, "cate"
+ )
+ tf.compat.v1.summary.histogram("cate_out", cate_out)
+ cnn_output = tf.concat([item_out, cate_out], 1)
+ tf.compat.v1.summary.histogram("cnn_output", cnn_output)
+ return cnn_output
+
+ def _build_cnn(self, history_matrix, nums, shape):
+ """Call a CNN layer.
+
+ Returns:
+ object: The output of cnn section.
+ """
+ return tf.compat.v1.layers.conv1d(
+ history_matrix,
+ nums,
+ shape,
+ activation=tf.nn.relu,
+ name="conv_" + str(shape),
+ )
+
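+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of the horizontal branch of `_add_cnn` above: for every filter height h in
+# 1..L a "valid" convolution is applied over the history, max-pooled over
+# time, and the results are concatenated (the ReLU and the vertical branch
+# are omitted for brevity).  All sizes are hypothetical.
+if __name__ == "__main__":
+    import numpy as np
+
+    rng = np.random.default_rng(7)
+    batch, L, emb_dim, n_h = 2, 5, 8, 4
+    hist = rng.normal(size=(batch, L, emb_dim))
+    out_hs = []
+    for h in range(1, L + 1):                               # as in `self.lengths`
+        filt = rng.normal(size=(h, emb_dim, n_h))
+        conv = np.stack(
+            [np.einsum("bhe,hef->bf", hist[:, t : t + h, :], filt) for t in range(L - h + 1)],
+            axis=1,
+        )                                                   # (batch, L - h + 1, n_h)
+        out_hs.append(conv.max(axis=1))                     # max-pool over time
+    out_h = np.concatenate(out_hs, axis=1)
+    print("horizontal CNN output shape:", out_h.shape)      # (batch, n_h * L)
+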
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from keras.layers.legacy_rnn.rnn_cell_impl import GRUCell, LSTMCell
+from recommenders.models.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+from tensorflow.compat.v1.nn import dynamic_rnn
+
+__all__ = ["GRUModel"]
+
+
+[docs]class GRUModel(SequentialBaseModel):
+ """GRU Model
+
+ :Citation:
+
+ Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau,
+ Fethi Bougares, Holger Schwenk, and Yoshua Bengio. Learning Phrase
+ Representations using RNN Encoder-Decoder for Statistical Machine Translation.
+ arXiv preprint arXiv:1406.1078. 2014.
+ """
+
+ def _build_seq_graph(self):
+ """The main function to create GRU model.
+
+ Returns:
+            object: The output of the GRU section.
+ """
+ with tf.compat.v1.variable_scope("gru"):
+ # final_state = self._build_lstm()
+ final_state = self._build_gru()
+ model_output = tf.concat([final_state, self.target_item_embedding], 1)
+ tf.compat.v1.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _build_lstm(self):
+ """Apply an LSTM for modeling.
+
+ Returns:
+ object: The output of LSTM section.
+ """
+ with tf.compat.v1.name_scope("lstm"):
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(input_tensor=self.mask, axis=1)
+ self.history_embedding = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ rnn_outputs, final_state = dynamic_rnn(
+ LSTMCell(self.hidden_size),
+ inputs=self.history_embedding,
+ sequence_length=self.sequence_length,
+ dtype=tf.float32,
+ scope="lstm",
+ )
+ tf.compat.v1.summary.histogram("LSTM_outputs", rnn_outputs)
+ return final_state[1]
+
+ def _build_gru(self):
+ """Apply a GRU for modeling.
+
+ Returns:
+ object: The output of GRU section.
+ """
+ with tf.compat.v1.name_scope("gru"):
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(input_tensor=self.mask, axis=1)
+ self.history_embedding = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ rnn_outputs, final_state = dynamic_rnn(
+ GRUCell(self.hidden_size),
+ inputs=self.history_embedding,
+ sequence_length=self.sequence_length,
+ dtype=tf.float32,
+ scope="gru",
+ )
+ tf.compat.v1.summary.histogram("GRU_outputs", rnn_outputs)
+ return final_state
+
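+# ---------------------------------------------------------------------------
+# Editor's note: an illustrative, standalone sketch (not part of the library)
+# of what `_build_gru` does conceptually: run a recurrent cell over the
+# padded history and keep the state at the last valid step given by the mask.
+# The GRU below is schematic (shared weight shapes, no biases); all sizes are
+# hypothetical.
+if __name__ == "__main__":
+    import numpy as np
+
+    rng = np.random.default_rng(3)
+    batch, max_len, emb_dim, hidden = 2, 4, 3, 5
+    x = rng.normal(size=(batch, max_len, emb_dim))
+    mask = np.array([[1, 1, 0, 0], [1, 1, 1, 1]], dtype=float)
+    seq_len = mask.sum(axis=1).astype(int)                  # as in `_build_gru`
+
+    def sigmoid(a):
+        return 1.0 / (1.0 + np.exp(-a))
+
+    Wz, Wr, Wh = [rng.normal(size=(emb_dim + hidden, hidden)) for _ in range(3)]
+    h = np.zeros((batch, hidden))
+    final_state = np.zeros((batch, hidden))
+    for t in range(max_len):
+        xh = np.concatenate([x[:, t, :], h], axis=1)
+        z, r = sigmoid(xh @ Wz), sigmoid(xh @ Wr)
+        h_tilde = np.tanh(np.concatenate([x[:, t, :], r * h], axis=1) @ Wh)
+        h = (1 - z) * h + z * h_tilde
+        final_state = np.where((seq_len - 1 == t)[:, None], h, final_state)
+    print("final GRU state shape:", final_state.shape)      # (batch, hidden)
+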
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from recommenders.models.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+
+__all__ = ["NextItNetModel"]
+
+
+[docs]class NextItNetModel(SequentialBaseModel):
+ """NextItNet Model
+
+ :Citation:
+ Yuan, Fajie, et al. "A Simple Convolutional Generative Network
+ for Next Item Recommendation", in Web Search and Data Mining, 2019.
+
+ Note:
+        It requires the dataset to have strong sequential dependency.
+ """
+
+ def _build_seq_graph(self):
+ """The main function to create nextitnet model.
+
+ Returns:
+ object: The output of nextitnet section.
+ """
+ hparams = self.hparams
+ is_training = tf.equal(self.is_train_stage, True)
+ item_history_embedding = tf.cond(
+ pred=is_training,
+ true_fn=lambda: self.item_history_embedding[
+ :: self.hparams.train_num_ngs + 1
+ ],
+ false_fn=lambda: self.item_history_embedding,
+ )
+ cate_history_embedding = tf.cond(
+ pred=is_training,
+ true_fn=lambda: self.cate_history_embedding[
+ :: self.hparams.train_num_ngs + 1
+ ],
+ false_fn=lambda: self.cate_history_embedding,
+ )
+
+ with tf.compat.v1.variable_scope("nextitnet", reuse=tf.compat.v1.AUTO_REUSE):
+ dilate_input = tf.concat(
+ [item_history_embedding, cate_history_embedding], 2
+ )
+
+ for layer_id, dilation in enumerate(hparams.dilations):
+ dilate_input = tf.cond(
+ pred=is_training,
+ true_fn=lambda: self._nextitnet_residual_block_one(
+ dilate_input,
+ dilation,
+ layer_id,
+ dilate_input.get_shape()[-1],
+ hparams.kernel_size,
+ causal=True,
+ train=True,
+ ),
+ false_fn=lambda: self._nextitnet_residual_block_one(
+ dilate_input,
+ dilation,
+ layer_id,
+ dilate_input.get_shape()[-1],
+ hparams.kernel_size,
+ causal=True,
+ train=False,
+ ),
+ )
+
+ self.dilate_input = dilate_input
+ model_output = tf.cond(
+ pred=is_training,
+ true_fn=self._training_output,
+ false_fn=self._normal_output,
+ )
+
+ return model_output
+
+ def _training_output(self):
+ model_output = tf.repeat(
+ self.dilate_input, self.hparams.train_num_ngs + 1, axis=0
+ )
+ model_output = tf.concat([model_output, self.target_item_embedding], -1)
+ model_output = tf.reshape(
+ model_output,
+ (
+ -1,
+ self.hparams.train_num_ngs + 1,
+ self.hparams.max_seq_length,
+ model_output.get_shape()[-1],
+ ),
+ )
+ model_output = tf.transpose(a=model_output, perm=[0, 2, 1, 3])
+ model_output = tf.reshape(model_output, (-1, model_output.get_shape()[-1]))
+ return model_output
+
+ def _normal_output(self):
+ model_output = self.dilate_input[:, -1, :]
+ model_output = tf.concat(
+ [model_output, self.target_item_embedding[:, -1, :]], -1
+ )
+ return model_output
+
+ def _nextitnet_residual_block_one(
+ self,
+ input_,
+ dilation,
+ layer_id,
+ residual_channels,
+ kernel_size,
+ causal=True,
+ train=True,
+ ):
+        """Apply a dilated CNN with a residual connection to the sequence data.
+
+        Args:
+            input_ (object): The history sequence embeddings.
+            dilation (int): The dilation rate of the CNN layer.
+            layer_id (str): String value of the layer ID: 0, 1, 2, ...
+            residual_channels (int): Embedding size of the input sequence.
+            kernel_size (int): Kernel size of the CNN filter.
+            causal (bool): Whether to pad only at the front of the sequence (causal) or on both sides.
+            train (bool): Whether the model is in the training stage.
+
+ Returns:
+ object: The output of residual layers.
+ """
+ resblock_type = "decoder"
+ resblock_name = "nextitnet_residual_block_one_{}_layer_{}_{}".format(
+ resblock_type, layer_id, dilation
+ )
+ with tf.compat.v1.variable_scope(resblock_name):
+ input_ln = self._layer_norm(input_, name="layer_norm1", trainable=train)
+ relu1 = tf.nn.relu(input_ln)
+ conv1 = self._conv1d(
+ relu1, int(0.5 * int(residual_channels)), name="conv1d_1"
+ )
+ conv1 = self._layer_norm(conv1, name="layer_norm2", trainable=train)
+ relu2 = tf.nn.relu(conv1)
+
+ dilated_conv = self._conv1d(
+ relu2,
+ int(0.5 * int(residual_channels)),
+ dilation,
+ kernel_size,
+ causal=causal,
+ name="dilated_conv",
+ )
+
+ dilated_conv = self._layer_norm(
+ dilated_conv, name="layer_norm3", trainable=train
+ )
+ relu3 = tf.nn.relu(dilated_conv)
+ conv2 = self._conv1d(relu3, residual_channels, name="conv1d_2")
+ return input_ + conv2
+
+ def _conv1d(
+ self,
+ input_,
+ output_channels,
+ dilation=1,
+ kernel_size=1,
+ causal=False,
+ name="dilated_conv",
+ ):
+ """Call a dilated CNN layer
+
+ Returns:
+ object: The output of dilated CNN layers.
+ """
+ with tf.compat.v1.variable_scope(name):
+ weight = tf.compat.v1.get_variable(
+ "weight",
+ [1, kernel_size, input_.get_shape()[-1], output_channels],
+ initializer=tf.compat.v1.truncated_normal_initializer(
+ stddev=0.02, seed=1
+ ),
+ )
+ bias = tf.compat.v1.get_variable(
+ "bias",
+ [output_channels],
+ initializer=tf.compat.v1.constant_initializer(0.0),
+ )
+
+ if causal:
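+                # Pad (kernel_size - 1) * dilation zeros on the left only, so an output at
+                # position t never sees inputs after t and the sequence length is preserved
+                # (e.g. kernel_size=3, dilation=2 pads 4 leading zeros).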
+ padding = [[0, 0], [(kernel_size - 1) * dilation, 0], [0, 0]]
+ padded = tf.pad(tensor=input_, paddings=padding)
+ input_expanded = tf.expand_dims(padded, axis=1)
+ out = (
+ tf.nn.atrous_conv2d(
+ input_expanded, weight, rate=dilation, padding="VALID"
+ )
+ + bias
+ )
+ else:
+ input_expanded = tf.expand_dims(input_, axis=1)
+ out = (
+ tf.nn.conv2d(
+ input=input_expanded,
+ filters=weight,
+ strides=[1, 1, 1, 1],
+ padding="SAME",
+ )
+ + bias
+ )
+
+ return tf.squeeze(out, [1])
+
+ def _layer_norm(self, x, name, epsilon=1e-8, trainable=True):
+ """Call a layer normalization
+
+ Returns:
+ object: Normalized data
+ """
+ with tf.compat.v1.variable_scope(name):
+ shape = x.get_shape()
+ beta = tf.compat.v1.get_variable(
+ "beta",
+ [int(shape[-1])],
+ initializer=tf.compat.v1.constant_initializer(0),
+ trainable=trainable,
+ )
+ gamma = tf.compat.v1.get_variable(
+ "gamma",
+ [int(shape[-1])],
+ initializer=tf.compat.v1.constant_initializer(1),
+ trainable=trainable,
+ )
+
+ mean, variance = tf.nn.moments(x=x, axes=[len(shape) - 1], keepdims=True)
+
+ x = (x - mean) / tf.sqrt(variance + epsilon)
+
+ return gamma * x + beta
+
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module implementing RNN Cells.
+
+This module provides a number of basic commonly used RNN cells, such as LSTM
+(Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of
+operators that allow adding dropouts, projections, or embeddings for inputs.
+Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by
+calling the `rnn` ops several times.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+from tensorflow.python.ops.rnn_cell_impl import (
+ RNNCell,
+ LSTMStateTuple,
+ _BIAS_VARIABLE_NAME,
+ _WEIGHTS_VARIABLE_NAME,
+)
+
+
+class Time4LSTMCell(RNNCell):
+ def __init__(
+ self,
+ num_units,
+ use_peepholes=False,
+ cell_clip=None,
+ initializer=None,
+ num_proj=None,
+ proj_clip=None,
+ num_unit_shards=None,
+ num_proj_shards=None,
+ forget_bias=1.0,
+ state_is_tuple=True,
+ activation=None,
+ reuse=None,
+ ):
+ super(Time4LSTMCell, self).__init__(_reuse=reuse)
+ if not state_is_tuple:
+ logging.warn(
+ "%s: Using a concatenated state is slower and will soon be "
+ "deprecated. Use state_is_tuple=True.",
+ self,
+ )
+ if num_unit_shards is not None or num_proj_shards is not None:
+ logging.warn(
+ "%s: The num_unit_shards and proj_unit_shards parameters are "
+ "deprecated and will be removed in Jan 2017. "
+ "Use a variable scope with a partitioner instead.",
+ self,
+ )
+
+ self._num_units = num_units
+ self._use_peepholes = use_peepholes
+ self._cell_clip = cell_clip
+ self._initializer = initializer
+ self._num_proj = num_proj
+ self._proj_clip = proj_clip
+ self._num_unit_shards = num_unit_shards
+ self._num_proj_shards = num_proj_shards
+ self._forget_bias = forget_bias
+ self._state_is_tuple = state_is_tuple
+ self._activation = activation or math_ops.tanh
+
+ if num_proj:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_proj)
+ if state_is_tuple
+ else num_units + num_proj
+ )
+ self._output_size = num_proj
+ else:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_units)
+ if state_is_tuple
+ else 2 * num_units
+ )
+ self._output_size = num_units
+ self._linear1 = None
+ self._linear2 = None
+ self._time_input_w1 = None
+ self._time_input_w2 = None
+ self._time_kernel_w1 = None
+ self._time_kernel_t1 = None
+ self._time_bias1 = None
+ self._time_kernel_w2 = None
+ self._time_kernel_t2 = None
+ self._time_bias2 = None
+ self._o_kernel_t1 = None
+ self._o_kernel_t2 = None
+ if self._use_peepholes:
+ self._w_f_diag = None
+ self._w_i_diag = None
+ self._w_o_diag = None
+
+ @property
+ def state_size(self):
+ return self._state_size
+
+ @property
+ def output_size(self):
+ return self._output_size
+
+    def call(self, inputs, state):
+ """Call method for the Time4LSTMCell.
+
+ Args:
+ inputs: A 2D Tensor of shape [batch_size, input_size].
+ state: A 2D Tensor of shape [batch_size, state_size].
+
+ Returns:
+ A tuple containing:
+ - A 2D Tensor of shape [batch_size, output_size].
+ - A 2D Tensor of shape [batch_size, state_size].
+ """
+ time_now_score = tf.expand_dims(inputs[:, -1], -1)
+ time_last_score = tf.expand_dims(inputs[:, -2], -1)
+ inputs = inputs[:, :-2]
+ num_proj = self._num_units if self._num_proj is None else self._num_proj
+ sigmoid = math_ops.sigmoid
+
+ if self._state_is_tuple:
+ (c_prev, m_prev) = state
+ else:
+ c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
+ m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
+
+ dtype = inputs.dtype
+ input_size = inputs.get_shape().with_rank(2)[1]
+ if input_size is None:
+ raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+ if self._time_kernel_w1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._time_input_w1 = vs.get_variable(
+ "_time_input_w1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias1 = vs.get_variable(
+ "_time_input_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_w2 = vs.get_variable(
+ "_time_input_w2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias2 = vs.get_variable(
+ "_time_input_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w1 = vs.get_variable(
+ "_time_kernel_w1",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t1 = vs.get_variable(
+ "_time_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias1 = vs.get_variable(
+ "_time_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w2 = vs.get_variable(
+ "_time_kernel_w2",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t2 = vs.get_variable(
+ "_time_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias2 = vs.get_variable(
+ "_time_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._o_kernel_t1 = vs.get_variable(
+ "_o_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._o_kernel_t2 = vs.get_variable(
+ "_o_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+
+ time_now_input = tf.nn.tanh(
+ time_now_score * self._time_input_w1 + self._time_input_bias1
+ )
+ time_last_input = tf.nn.tanh(
+ time_last_score * self._time_input_w2 + self._time_input_bias2
+ )
+
+ time_now_state = (
+ math_ops.matmul(inputs, self._time_kernel_w1)
+ + math_ops.matmul(time_now_input, self._time_kernel_t1)
+ + self._time_bias1
+ )
+ time_last_state = (
+ math_ops.matmul(inputs, self._time_kernel_w2)
+ + math_ops.matmul(time_last_input, self._time_kernel_t2)
+ + self._time_bias2
+ )
+
+ if self._linear1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ if self._num_unit_shards is not None:
+ unit_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_unit_shards
+ )
+ )
+ self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units, True)
+
+ # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+ lstm_matrix = self._linear1([inputs, m_prev])
+ i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
+ o = (
+ o
+ + math_ops.matmul(time_now_input, self._o_kernel_t1)
+ + math_ops.matmul(time_last_input, self._o_kernel_t2)
+ )
+ # Diagonal connections
+ if self._use_peepholes and not self._w_f_diag:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._w_f_diag = vs.get_variable(
+ "w_f_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_i_diag = vs.get_variable(
+ "w_i_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_o_diag = vs.get_variable(
+ "w_o_diag", shape=[self._num_units], dtype=dtype
+ )
+
+ if self._use_peepholes:
+ c = sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i + self._w_i_diag * c_prev) * sigmoid(
+ time_now_state
+ ) * self._activation(
+ j
+ )
+ else:
+ c = sigmoid(f + self._forget_bias) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i) * sigmoid(time_now_state) * self._activation(j)
+
+ if self._cell_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+ # pylint: enable=invalid-unary-operand-type
+ if self._use_peepholes:
+ m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
+ else:
+ m = sigmoid(o) * self._activation(c)
+
+ if self._num_proj is not None:
+ if self._linear2 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer):
+ with vs.variable_scope("projection") as proj_scope:
+ if self._num_proj_shards is not None:
+ proj_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_proj_shards
+ )
+ )
+ self._linear2 = _Linear(m, self._num_proj, False)
+ m = self._linear2(m)
+
+ if self._proj_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+ # pylint: enable=invalid-unary-operand-type
+
+ new_state = (
+ LSTMStateTuple(c, m)
+ if self._state_is_tuple
+ else array_ops.concat([c, m], 1)
+ )
+ return m, new_state
+
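+# Usage sketch (hedged, not part of the library API; shapes below are hypothetical):
+# Time4LSTMCell expects the last two columns of every input step to be time features,
+# with "time to now" appended last, as done by SLI_RECModel later in this diff;
+# everything before them is the regular item/category embedding.
+#
+#   import tensorflow as tf
+#   from tensorflow.compat.v1.nn import dynamic_rnn
+#
+#   seq_emb = tf.random.normal([32, 50, 40])            # (batch, steps, embedding dim)
+#   t_first = tf.random.normal([32, 50, 1])             # time from first action
+#   t_now = tf.random.normal([32, 50, 1])               # time to now
+#   rnn_in = tf.concat([seq_emb, t_first, t_now], -1)   # time features are the last 2 columns
+#   outputs, _ = dynamic_rnn(Time4LSTMCell(40), rnn_in, dtype=tf.float32)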
+
+class Time4ALSTMCell(RNNCell):
+ def __init__(
+ self,
+ num_units,
+ use_peepholes=False,
+ cell_clip=None,
+ initializer=None,
+ num_proj=None,
+ proj_clip=None,
+ num_unit_shards=None,
+ num_proj_shards=None,
+ forget_bias=1.0,
+ state_is_tuple=True,
+ activation=None,
+ reuse=None,
+ ):
+ super(Time4ALSTMCell, self).__init__(_reuse=reuse)
+ if not state_is_tuple:
+ logging.warn(
+ "%s: Using a concatenated state is slower and will soon be "
+ "deprecated. Use state_is_tuple=True.",
+ self,
+ )
+ if num_unit_shards is not None or num_proj_shards is not None:
+ logging.warn(
+ "%s: The num_unit_shards and proj_unit_shards parameters are "
+ "deprecated and will be removed in Jan 2017. "
+ "Use a variable scope with a partitioner instead.",
+ self,
+ )
+
+ self._num_units = num_units
+ self._use_peepholes = use_peepholes
+ self._cell_clip = cell_clip
+ self._initializer = initializer
+ self._num_proj = num_proj
+ self._proj_clip = proj_clip
+ self._num_unit_shards = num_unit_shards
+ self._num_proj_shards = num_proj_shards
+ self._forget_bias = forget_bias
+ self._state_is_tuple = state_is_tuple
+ self._activation = activation or math_ops.tanh
+
+ if num_proj:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_proj)
+ if state_is_tuple
+ else num_units + num_proj
+ )
+ self._output_size = num_proj
+ else:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_units)
+ if state_is_tuple
+ else 2 * num_units
+ )
+ self._output_size = num_units
+ self._linear1 = None
+ self._linear2 = None
+ self._time_input_w1 = None
+ self._time_input_w2 = None
+ self._time_kernel_w1 = None
+ self._time_kernel_t1 = None
+ self._time_bias1 = None
+ self._time_kernel_w2 = None
+ self._time_kernel_t2 = None
+ self._time_bias2 = None
+ self._o_kernel_t1 = None
+ self._o_kernel_t2 = None
+ if self._use_peepholes:
+ self._w_f_diag = None
+ self._w_i_diag = None
+ self._w_o_diag = None
+
+ @property
+ def state_size(self):
+ return self._state_size
+
+ @property
+ def output_size(self):
+ return self._output_size
+
+    def call(self, inputs, state):
+ """Call method for the Time4ALSTMCell.
+
+ Args:
+ inputs: A 2D Tensor of shape [batch_size, input_size].
+ state: A 2D Tensor of shape [batch_size, state_size].
+
+ Returns:
+ A tuple containing:
+ - A 2D Tensor of shape [batch_size, output_size].
+ - A 2D Tensor of shape [batch_size, state_size].
+ """
+ att_score = tf.expand_dims(inputs[:, -1], -1)
+ time_now_score = tf.expand_dims(inputs[:, -2], -1)
+ time_last_score = tf.expand_dims(inputs[:, -3], -1)
+ inputs = inputs[:, :-3]
+ num_proj = self._num_units if self._num_proj is None else self._num_proj
+ sigmoid = math_ops.sigmoid
+
+ if self._state_is_tuple:
+ (c_prev, m_prev) = state
+ else:
+ c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
+ m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
+
+ dtype = inputs.dtype
+ input_size = inputs.get_shape().with_rank(2)[1]
+ if input_size is None:
+ raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+ if self._time_kernel_w1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._time_input_w1 = vs.get_variable(
+ "_time_input_w1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias1 = vs.get_variable(
+ "_time_input_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_w2 = vs.get_variable(
+ "_time_input_w2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias2 = vs.get_variable(
+ "_time_input_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w1 = vs.get_variable(
+ "_time_kernel_w1",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t1 = vs.get_variable(
+ "_time_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias1 = vs.get_variable(
+ "_time_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w2 = vs.get_variable(
+ "_time_kernel_w2",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t2 = vs.get_variable(
+ "_time_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias2 = vs.get_variable(
+ "_time_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._o_kernel_t1 = vs.get_variable(
+ "_o_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._o_kernel_t2 = vs.get_variable(
+ "_o_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+
+ time_now_input = tf.nn.tanh(
+ time_now_score * self._time_input_w1 + self._time_input_bias1
+ )
+ time_last_input = tf.nn.tanh(
+ time_last_score * self._time_input_w2 + self._time_input_bias2
+ )
+
+ time_now_state = (
+ math_ops.matmul(inputs, self._time_kernel_w1)
+ + math_ops.matmul(time_now_input, self._time_kernel_t1)
+ + self._time_bias1
+ )
+ time_last_state = (
+ math_ops.matmul(inputs, self._time_kernel_w2)
+ + math_ops.matmul(time_last_input, self._time_kernel_t2)
+ + self._time_bias2
+ )
+
+ if self._linear1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ if self._num_unit_shards is not None:
+ unit_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_unit_shards
+ )
+ )
+ self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units, True)
+
+ # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+ lstm_matrix = self._linear1([inputs, m_prev])
+ i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
+ o = (
+ o
+ + math_ops.matmul(time_now_input, self._o_kernel_t1)
+ + math_ops.matmul(time_last_input, self._o_kernel_t2)
+ )
+ # Diagonal connections
+ if self._use_peepholes and not self._w_f_diag:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._w_f_diag = vs.get_variable(
+ "w_f_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_i_diag = vs.get_variable(
+ "w_i_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_o_diag = vs.get_variable(
+ "w_o_diag", shape=[self._num_units], dtype=dtype
+ )
+
+ if self._use_peepholes:
+ c = sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i + self._w_i_diag * c_prev) * sigmoid(
+ time_now_state
+ ) * self._activation(
+ j
+ )
+ else:
+ c = sigmoid(f + self._forget_bias) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i) * sigmoid(time_now_state) * self._activation(j)
+
+ if self._cell_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+ # pylint: enable=invalid-unary-operand-type
+ if self._use_peepholes:
+ m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
+ else:
+ m = sigmoid(o) * self._activation(c)
+
+ if self._num_proj is not None:
+ if self._linear2 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer):
+ with vs.variable_scope("projection") as proj_scope:
+ if self._num_proj_shards is not None:
+ proj_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_proj_shards
+ )
+ )
+ self._linear2 = _Linear(m, self._num_proj, False)
+ m = self._linear2(m)
+
+ if self._proj_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+ # pylint: enable=invalid-unary-operand-type
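+        # Note: as written, the next two lines are identities
+        # (att_score * x + (1.0 - att_score) * x == x), so att_score leaves c and m unchanged here.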
+ c = att_score * c + (1.0 - att_score) * c
+ m = att_score * m + (1.0 - att_score) * m
+ new_state = (
+ LSTMStateTuple(c, m)
+ if self._state_is_tuple
+ else array_ops.concat([c, m], 1)
+ )
+ return m, new_state
+
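+# Input layout note (informal): Time4ALSTMCell is the attention-aware variant; it expects the
+# last three columns of every input step to be [..., time_last, time_now, att_score], i.e. the
+# attention score is appended last, preceded by the two time features.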
+
+class _Linear(object):
+ """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
+
+ Args:
+ args: a 2D Tensor or a list of 2D, batch x n, Tensors.
+ output_size: int, second dimension of weight variable.
+ dtype: data type for variables.
+ build_bias: boolean, whether to build a bias variable.
+ bias_initializer: starting value to initialize the bias
+ (default is all zeros).
+ kernel_initializer: starting value to initialize the weight.
+
+ Raises:
+ ValueError: if inputs_shape is wrong.
+ """
+
+ def __init__(
+ self,
+ args,
+ output_size,
+ build_bias,
+ bias_initializer=None,
+ kernel_initializer=None,
+ ):
+ self._build_bias = build_bias
+
+ if args is None or (nest.is_sequence(args) and not args):
+ raise ValueError("`args` must be specified")
+ if not nest.is_sequence(args):
+ args = [args]
+ self._is_sequence = False
+ else:
+ self._is_sequence = True
+
+ # Calculate the total size of arguments on dimension 1.
+ total_arg_size = 0
+ shapes = [a.get_shape() for a in args]
+ for shape in shapes:
+ if shape.ndims != 2:
+ raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+ if shape[1] is None:
+ raise ValueError(
+ "linear expects shape[1] to be provided for shape %s, "
+ "but saw %s" % (shape, shape[1])
+ )
+ else:
+ total_arg_size += shape[1]
+
+ dtype = [a.dtype for a in args][0]
+
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope) as outer_scope:
+ self._weights = vs.get_variable(
+ _WEIGHTS_VARIABLE_NAME,
+ [total_arg_size, output_size],
+ dtype=dtype,
+ initializer=kernel_initializer,
+ )
+ if build_bias:
+ with vs.variable_scope(outer_scope) as inner_scope:
+ inner_scope.set_partitioner(None)
+ if bias_initializer is None:
+ bias_initializer = init_ops.constant_initializer(
+ 0.0, dtype=dtype
+ )
+ self._biases = vs.get_variable(
+ _BIAS_VARIABLE_NAME,
+ [output_size],
+ dtype=dtype,
+ initializer=bias_initializer,
+ )
+
+ def __call__(self, args):
+ if not self._is_sequence:
+ args = [args]
+
+ if len(args) == 1:
+ res = math_ops.matmul(args[0], self._weights)
+ else:
+ res = math_ops.matmul(array_ops.concat(args, 1), self._weights)
+ if self._build_bias:
+ res = nn_ops.bias_add(res, self._biases)
+ return res
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+import os
+import abc
+import numpy as np
+import tensorflow as tf
+
+from recommenders.models.deeprec.models.base_model import BaseModel
+from recommenders.models.deeprec.deeprec_utils import cal_metric, load_dict
+
+
+__all__ = ["SequentialBaseModel"]
+
+
+class SequentialBaseModel(BaseModel):
+ """Base class for sequential models"""
+
+ def __init__(self, hparams, iterator_creator, graph=None, seed=None):
+ """Initializing the model. Create common logics which are needed by all sequential models, such as loss function,
+ parameter set.
+
+ Args:
+ hparams (HParams): A `HParams` object, hold the entire set of hyperparameters.
+ iterator_creator (object): An iterator to load the data.
+ graph (object): An optional graph.
+ seed (int): Random seed.
+ """
+ self.hparams = hparams
+
+ self.need_sample = hparams.need_sample
+ self.train_num_ngs = hparams.train_num_ngs
+ if self.train_num_ngs is None:
+ raise ValueError(
+ "Please confirm the number of negative samples for each positive instance."
+ )
+ self.min_seq_length = (
+ hparams.min_seq_length if "min_seq_length" in hparams.values() else 1
+ )
+ self.hidden_size = (
+ hparams.hidden_size if "hidden_size" in hparams.values() else None
+ )
+ self.graph = tf.Graph() if not graph else graph
+
+ with self.graph.as_default():
+ self.sequence_length = tf.compat.v1.placeholder(
+ tf.int32, [None], name="sequence_length"
+ )
+
+ super().__init__(hparams, iterator_creator, graph=self.graph, seed=seed)
+
+ @abc.abstractmethod
+ def _build_seq_graph(self):
+ """Subclass will implement this."""
+ pass
+
+ def _build_graph(self):
+ """The main function to create sequential models.
+
+ Returns:
+            object: The prediction score made by the model.
+ """
+ hparams = self.hparams
+ self.keep_prob_train = 1 - np.array(hparams.dropout)
+ self.keep_prob_test = np.ones_like(hparams.dropout)
+
+ with tf.compat.v1.variable_scope("sequential") as self.sequential_scope:
+ self._build_embedding()
+ self._lookup_from_embedding()
+ model_output = self._build_seq_graph()
+ logit = self._fcn_net(model_output, hparams.layer_sizes, scope="logit_fcn")
+ self._add_norm()
+ return logit
+
+    def fit(
+        self,
+        train_file,
+        valid_file,
+        valid_num_ngs,
+        eval_metric="group_auc",
+    ):
+        """Fit the model with `train_file`. Evaluate the model on `valid_file` after each epoch
+        to monitor the training status.
+
+        Args:
+            train_file (str): Training data set.
+            valid_file (str): Validation data set.
+            valid_num_ngs (int): The number of negative instances paired with each positive instance in the validation data.
+            eval_metric (str): The metric that controls early stopping, e.g. "auc", "group_auc".
+
+ Returns:
+ object: An instance of self.
+ """
+
+ # check bad input.
+ if not self.need_sample and self.train_num_ngs < 1:
+ raise ValueError(
+ "Please specify a positive integer of negative numbers for training without sampling needed."
+ )
+ if valid_num_ngs < 1:
+ raise ValueError(
+ "Please specify a positive integer of negative numbers for validation."
+ )
+
+ if self.need_sample and self.train_num_ngs < 1:
+ self.train_num_ngs = 1
+
+ if self.hparams.write_tfevents and self.hparams.SUMMARIES_DIR:
+ if not os.path.exists(self.hparams.SUMMARIES_DIR):
+ os.makedirs(self.hparams.SUMMARIES_DIR)
+
+ self.writer = tf.compat.v1.summary.FileWriter(
+ self.hparams.SUMMARIES_DIR, self.sess.graph
+ )
+
+ train_sess = self.sess
+ eval_info = list()
+
+ best_metric, self.best_epoch = 0, 0
+
+ for epoch in range(1, self.hparams.epochs + 1):
+ step = 0
+ self.hparams.current_epoch = epoch
+ epoch_loss = 0
+ file_iterator = self.iterator.load_data_from_file(
+ train_file,
+ min_seq_length=self.min_seq_length,
+ batch_num_ngs=self.train_num_ngs,
+ )
+
+ for batch_data_input in file_iterator:
+ if batch_data_input:
+ step_result = self.train(train_sess, batch_data_input)
+ (_, _, step_loss, step_data_loss, summary) = step_result
+ if self.hparams.write_tfevents and self.hparams.SUMMARIES_DIR:
+ self.writer.add_summary(summary, step)
+ epoch_loss += step_loss
+ step += 1
+ if step % self.hparams.show_step == 0:
+ print(
+ "step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}".format(
+ step, step_loss, step_data_loss
+ )
+ )
+
+ valid_res = self.run_eval(valid_file, valid_num_ngs)
+ print(
+ "eval valid at epoch {0}: {1}".format(
+ epoch,
+ ",".join(
+ [
+ "" + str(key) + ":" + str(value)
+ for key, value in valid_res.items()
+ ]
+ ),
+ )
+ )
+ eval_info.append((epoch, valid_res))
+
+ progress = False
+ early_stop = self.hparams.EARLY_STOP
+ if valid_res[eval_metric] > best_metric:
+ best_metric = valid_res[eval_metric]
+ self.best_epoch = epoch
+ progress = True
+ else:
+ if early_stop > 0 and epoch - self.best_epoch >= early_stop:
+ print("early stop at epoch {0}!".format(epoch))
+ break
+
+ if self.hparams.save_model and self.hparams.MODEL_DIR:
+ if not os.path.exists(self.hparams.MODEL_DIR):
+ os.makedirs(self.hparams.MODEL_DIR)
+ if progress:
+ checkpoint_path = self.saver.save(
+ sess=train_sess,
+ save_path=self.hparams.MODEL_DIR + "epoch_" + str(epoch),
+ )
+ checkpoint_path = self.saver.save( # noqa: F841
+ sess=train_sess,
+ save_path=os.path.join(self.hparams.MODEL_DIR, "best_model"),
+ )
+
+ if self.hparams.write_tfevents:
+ self.writer.close()
+
+ print(eval_info)
+ print("best epoch: {0}".format(self.best_epoch))
+ return self
+
+    def run_eval(self, filename, num_ngs):
+        """Evaluate the given file and return evaluation metrics.
+
+        Args:
+            filename (str): A file name that will be evaluated.
+            num_ngs (int): The number of negative samples per positive instance.
+
+        Returns:
+            dict: A dictionary that contains evaluation metrics.
+        """
+
+ load_sess = self.sess
+ preds = []
+ labels = []
+ group_preds = []
+ group_labels = []
+ group = num_ngs + 1
+
+ for batch_data_input in self.iterator.load_data_from_file(
+ filename, min_seq_length=self.min_seq_length, batch_num_ngs=0
+ ):
+ if batch_data_input:
+ step_pred, step_labels = self.eval(load_sess, batch_data_input)
+ preds.extend(np.reshape(step_pred, -1))
+ labels.extend(np.reshape(step_labels, -1))
+ group_preds.extend(np.reshape(step_pred, (-1, group)))
+ group_labels.extend(np.reshape(step_labels, (-1, group)))
+
+ res = cal_metric(labels, preds, self.hparams.metrics)
+ res_pairwise = cal_metric(
+ group_labels, group_preds, self.hparams.pairwise_metrics
+ )
+ res.update(res_pairwise)
+ return res
+
+    def predict(self, infile_name, outfile_name):
+ """Make predictions on the given data, and output predicted scores to a file.
+
+ Args:
+ infile_name (str): Input file name.
+ outfile_name (str): Output file name.
+
+ Returns:
+ object: An instance of self.
+ """
+
+ load_sess = self.sess
+ with tf.io.gfile.GFile(outfile_name, "w") as wt:
+ for batch_data_input in self.iterator.load_data_from_file(
+ infile_name, batch_num_ngs=0
+ ):
+ if batch_data_input:
+ step_pred = self.infer(load_sess, batch_data_input)
+ step_pred = np.reshape(step_pred, -1)
+ wt.write("\n".join(map(str, step_pred)))
+ wt.write("\n")
+ return self
+
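+    # Workflow sketch (hedged; the file names and numbers below are hypothetical):
+    #
+    #   model = SomeSequentialModel(hparams, iterator_creator, seed=42)
+    #   model.fit("train.tsv", "valid.tsv", valid_num_ngs=4)   # early-stops on group_auc
+    #   metrics = model.run_eval("test.tsv", num_ngs=9)        # dict of metric -> value
+    #   model.predict("test.tsv", "predictions.txt")           # one score per line
+    #
+    # where SomeSequentialModel stands for any concrete subclass implementing
+    # `_build_seq_graph` (e.g. the SLI_RECModel defined later in this diff).
+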
+ def _build_embedding(self):
+ """The field embedding layer. Initialization of embedding variables."""
+ hparams = self.hparams
+ self.user_vocab_length = len(load_dict(hparams.user_vocab))
+ self.item_vocab_length = len(load_dict(hparams.item_vocab))
+ self.cate_vocab_length = len(load_dict(hparams.cate_vocab))
+ self.user_embedding_dim = hparams.user_embedding_dim
+ self.item_embedding_dim = hparams.item_embedding_dim
+ self.cate_embedding_dim = hparams.cate_embedding_dim
+
+ with tf.compat.v1.variable_scope("embedding", initializer=self.initializer):
+ self.user_lookup = tf.compat.v1.get_variable(
+ name="user_embedding",
+ shape=[self.user_vocab_length, self.user_embedding_dim],
+ dtype=tf.float32,
+ )
+ self.item_lookup = tf.compat.v1.get_variable(
+ name="item_embedding",
+ shape=[self.item_vocab_length, self.item_embedding_dim],
+ dtype=tf.float32,
+ )
+ self.cate_lookup = tf.compat.v1.get_variable(
+ name="cate_embedding",
+ shape=[self.cate_vocab_length, self.cate_embedding_dim],
+ dtype=tf.float32,
+ )
+
+ def _lookup_from_embedding(self):
+ """Lookup from embedding variables. A dropout layer follows lookup operations."""
+ self.user_embedding = tf.nn.embedding_lookup(
+ params=self.user_lookup, ids=self.iterator.users
+ )
+ tf.compat.v1.summary.histogram("user_embedding_output", self.user_embedding)
+
+ self.item_embedding = tf.compat.v1.nn.embedding_lookup(
+ params=self.item_lookup, ids=self.iterator.items
+ )
+ self.item_history_embedding = tf.compat.v1.nn.embedding_lookup(
+ params=self.item_lookup, ids=self.iterator.item_history
+ )
+ tf.compat.v1.summary.histogram(
+ "item_history_embedding_output", self.item_history_embedding
+ )
+
+ self.cate_embedding = tf.compat.v1.nn.embedding_lookup(
+ params=self.cate_lookup, ids=self.iterator.cates
+ )
+ self.cate_history_embedding = tf.compat.v1.nn.embedding_lookup(
+ params=self.cate_lookup, ids=self.iterator.item_cate_history
+ )
+ tf.compat.v1.summary.histogram(
+ "cate_history_embedding_output", self.cate_history_embedding
+ )
+
+ involved_items = tf.concat(
+ [
+ tf.reshape(self.iterator.item_history, [-1]),
+ tf.reshape(self.iterator.items, [-1]),
+ ],
+ -1,
+ )
+ self.involved_items, _ = tf.unique(involved_items)
+ involved_item_embedding = tf.nn.embedding_lookup(
+ params=self.item_lookup, ids=self.involved_items
+ )
+ self.embed_params.append(involved_item_embedding)
+
+ involved_cates = tf.concat(
+ [
+ tf.reshape(self.iterator.item_cate_history, [-1]),
+ tf.reshape(self.iterator.cates, [-1]),
+ ],
+ -1,
+ )
+ self.involved_cates, _ = tf.unique(involved_cates)
+ involved_cate_embedding = tf.nn.embedding_lookup(
+ params=self.cate_lookup, ids=self.involved_cates
+ )
+ self.embed_params.append(involved_cate_embedding)
+
+ self.target_item_embedding = tf.concat(
+ [self.item_embedding, self.cate_embedding], -1
+ )
+ tf.compat.v1.summary.histogram(
+ "target_item_embedding_output", self.target_item_embedding
+ )
+
+ def _add_norm(self):
+ """Regularization for embedding variables and other variables."""
+ all_variables, embed_variables = (
+ tf.compat.v1.trainable_variables(),
+ tf.compat.v1.trainable_variables(
+ self.sequential_scope._name + "/embedding"
+ ),
+ )
+ layer_params = list(set(all_variables) - set(embed_variables))
+ layer_params = [a for a in layer_params if "_no_reg" not in a.name]
+ self.layer_params.extend(layer_params)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from recommenders.models.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+from tensorflow.compat.v1.nn import dynamic_rnn
+from recommenders.models.deeprec.models.sequential.rnn_cell_implement import (
+ Time4LSTMCell,
+)
+
+__all__ = ["SLI_RECModel"]
+
+
+class SLI_RECModel(SequentialBaseModel):
+ """SLI Rec model
+
+ :Citation:
+
+ Z. Yu, J. Lian, A. Mahmoody, G. Liu and X. Xie, "Adaptive User Modeling with
+        Long and Short-Term Preferences for Personalized Recommendation", in Proceedings of
+        the 28th International Joint Conference on Artificial Intelligence, IJCAI’19,
+ Pages 4213-4219, AAAI Press, 2019.
+ """
+
+ def _build_seq_graph(self):
+ """The main function to create sli_rec model.
+
+ Returns:
+ object: the output of sli_rec section.
+ """
+ hparams = self.hparams
+ with tf.compat.v1.variable_scope("sli_rec"):
+ hist_input = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(input_tensor=self.mask, axis=1)
+
+ with tf.compat.v1.variable_scope("long_term_asvd"):
+ att_outputs1 = self._attention(hist_input, hparams.attention_size)
+ att_fea1 = tf.reduce_sum(input_tensor=att_outputs1, axis=1)
+ tf.compat.v1.summary.histogram("att_fea1", att_fea1)
+
+ item_history_embedding_new = tf.concat(
+ [
+ self.item_history_embedding,
+ tf.expand_dims(self.iterator.time_from_first_action, -1),
+ ],
+ -1,
+ )
+ item_history_embedding_new = tf.concat(
+ [
+ item_history_embedding_new,
+ tf.expand_dims(self.iterator.time_to_now, -1),
+ ],
+ -1,
+ )
+ with tf.compat.v1.variable_scope("rnn"):
+ rnn_outputs, _ = dynamic_rnn(
+ Time4LSTMCell(hparams.hidden_size),
+ inputs=item_history_embedding_new,
+ sequence_length=self.sequence_length,
+ dtype=tf.float32,
+ scope="time4lstm",
+ )
+ tf.compat.v1.summary.histogram("LSTM_outputs", rnn_outputs)
+
+ with tf.compat.v1.variable_scope("attention_fcn"):
+ att_outputs2 = self._attention_fcn(
+ self.target_item_embedding, rnn_outputs
+ )
+ att_fea2 = tf.reduce_sum(input_tensor=att_outputs2, axis=1)
+ tf.compat.v1.summary.histogram("att_fea2", att_fea2)
+
+ # ensemble
+ with tf.compat.v1.name_scope("alpha"):
+ concat_all = tf.concat(
+ [
+ self.target_item_embedding,
+ att_fea1,
+ att_fea2,
+ tf.expand_dims(self.iterator.time_to_now[:, -1], -1),
+ ],
+ 1,
+ )
+ last_hidden_nn_layer = concat_all
+ alpha_logit = self._fcn_net(
+ last_hidden_nn_layer, hparams.att_fcn_layer_sizes, scope="fcn_alpha"
+ )
+ alpha_output = tf.sigmoid(alpha_logit)
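+                # alpha in (0, 1) adaptively mixes the long-term interest (att_fea1, from the
+                # ASVD-style attention) with the short-term interest (att_fea2, from Time4LSTM).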
+ user_embed = att_fea1 * alpha_output + att_fea2 * (1.0 - alpha_output)
+ model_output = tf.concat([user_embed, self.target_item_embedding], 1)
+ tf.compat.v1.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _attention_fcn(self, query, user_embedding):
+ """Apply attention by fully connected layers.
+
+ Args:
+            query (object): The embedding of the target item, which is regarded as the query in the attention operations.
+            user_embedding (object): The output of the RNN layers, which is regarded as the user modeling.
+
+        Returns:
+            object: Weighted sum of the user modeling.
+ """
+ hparams = self.hparams
+ with tf.compat.v1.variable_scope("attention_fcn"):
+ query_size = query.shape[1]
+ boolean_mask = tf.equal(self.mask, tf.ones_like(self.mask))
+
+ attention_mat = tf.compat.v1.get_variable(
+ name="attention_mat",
+ shape=[user_embedding.shape.as_list()[-1], query_size],
+ initializer=self.initializer,
+ )
+ att_inputs = tf.tensordot(user_embedding, attention_mat, [[2], [0]])
+
+ queries = tf.reshape(
+ tf.tile(query, [1, att_inputs.shape[1]]), tf.shape(input=att_inputs)
+ )
+ last_hidden_nn_layer = tf.concat(
+ [att_inputs, queries, att_inputs - queries, att_inputs * queries], -1
+ )
+ att_fnc_output = self._fcn_net(
+ last_hidden_nn_layer, hparams.att_fcn_layer_sizes, scope="att_fcn"
+ )
+ att_fnc_output = tf.squeeze(att_fnc_output, -1)
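+            # Padded positions receive a very large negative logit, so their attention
+            # weights become effectively zero after the softmax.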
+ mask_paddings = tf.ones_like(att_fnc_output) * (-(2**32) + 1)
+ att_weights = tf.nn.softmax(
+ tf.compat.v1.where(boolean_mask, att_fnc_output, mask_paddings),
+ name="att_weights",
+ )
+ output = user_embedding * tf.expand_dims(att_weights, -1)
+ return output
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from tensorflow.compat.v1.nn import dynamic_rnn
+from recommenders.models.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+from recommenders.models.deeprec.models.sequential.sum_cells import (
+ SUMCell,
+ SUMV2Cell,
+)
+
+
+class SUMModel(SequentialBaseModel):
+ """Sequential User Matrix Model
+
+ :Citation:
+
+ Lian, J., Batal, I., Liu, Z., Soni, A., Kang, E. Y., Wang, Y., & Xie, X.,
+ "Multi-Interest-Aware User Modeling for Large-Scale Sequential Recommendations", arXiv preprint arXiv:2102.09211, 2021.
+ """
+
+ def _build_seq_graph(self):
+ """The main function to create SUM model.
+
+ Returns:
+ object: The output of SUM section, which is a concatenation of user vector and target item vector.
+ """
+ hparams = self.hparams # noqa: F841
+ with tf.compat.v1.variable_scope("sum"):
+ self.history_embedding = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ cell = self._create_sumcell()
+ self.cell = cell
+ cell.model = self
+ final_state = self._build_sum(cell)
+
+ for _p in cell.parameter_set:
+ tf.compat.v1.summary.histogram(_p.name, _p)
+ if hasattr(cell, "_alpha") and hasattr(cell._alpha, "name"):
+ tf.compat.v1.summary.histogram(cell._alpha.name, cell._alpha)
+ if hasattr(cell, "_beta") and hasattr(cell._beta, "name"):
+ tf.compat.v1.summary.histogram(cell._beta.name, cell._beta)
+
+ final_state, att_weights = self._attention_query_by_state(
+ final_state, self.target_item_embedding
+ )
+ model_output = tf.concat([final_state, self.target_item_embedding], 1)
+ tf.compat.v1.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _attention_query_by_state(self, seq_output, query):
+ """Merge a user's memory states conditioned by a query item.
+
+ Params:
+ seq_output: A flatten representation of SUM memory states for (a batch of) users
+ query: (a batch of) target item candidates
+
+ Returns:
+ tf.Tensor, tf.Tensor: Merged user representation. Attention weights of each memory channel.
+ """
+ dim_q = query.shape[-1]
+ att_weights = tf.constant(1.0, dtype=tf.float32)
+ with tf.compat.v1.variable_scope("query_att"):
+ if self.hparams.slots > 1:
+ query_att_W = tf.compat.v1.get_variable(
+ name="query_att_W",
+ shape=[self.hidden_size, dim_q],
+ initializer=self.initializer,
+ )
+
+ # reshape the memory states to (BatchSize, Slots, HiddenSize)
+ memory_state = tf.reshape(
+ seq_output, [-1, self.hparams.slots, self.hidden_size]
+ )
+
+ att_weights = tf.nn.softmax(
+ tf.squeeze(
+ tf.matmul(
+ tf.tensordot(memory_state, query_att_W, axes=1),
+ tf.expand_dims(query, -1),
+ ),
+ -1,
+ ),
+ -1,
+ )
+ # merge the memory states, the final shape is (BatchSize, HiddenSize)
+ att_res = tf.reduce_sum(
+ input_tensor=memory_state * tf.expand_dims(att_weights, -1), axis=1
+ )
+
+ else:
+ att_res = seq_output
+
+ return att_res, att_weights
+
+ def _create_sumcell(self):
+ """Create a SUM cell
+
+ Returns:
+ object: An initialized SUM cell
+ """
+ hparams = self.hparams
+ input_embedding_dim = self.history_embedding.shape[-1]
+ input_params = [
+ hparams.hidden_size * hparams.slots + input_embedding_dim,
+ hparams.slots,
+ hparams.attention_size,
+ input_embedding_dim,
+ ]
+ sumcells = {"SUM": SUMCell, "SUMV2": SUMV2Cell}
+ sumCell = sumcells[hparams.cell]
+ res = None
+ if hparams.cell in ["SUM", "SUMV2"]:
+ res = sumCell(*input_params)
+ else:
+ raise ValueError("ERROR! Cell type not support: {0}".format(hparams.cell))
+ return res
+
+ def _build_sum(self, cell):
+ """Generate user memory states from behavior sequence
+
+ Args:
+ object: An initialied SUM cell.
+
+ Returns:
+ object: A flatten representation of user memory states, in the shape of (BatchSize, SlotsNum x HiddenSize)
+ """
+ hparams = self.hparams
+ with tf.compat.v1.variable_scope("sum"):
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(input_tensor=self.mask, axis=1)
+
+ rum_outputs, final_state = dynamic_rnn(
+ cell,
+ inputs=self.history_embedding,
+ dtype=tf.float32,
+ sequence_length=self.sequence_length,
+ scope="sum",
+ initial_state=cell.zero_state(
+ tf.shape(input=self.history_embedding)[0], tf.float32
+ ),
+ )
+
+ final_state = final_state[:, : hparams.slots * hparams.hidden_size]
+
+ self.heads = cell.heads
+ self.alpha = cell._alpha
+ self.beta = cell._beta
+ tf.compat.v1.summary.histogram("SUM_outputs", rum_outputs)
+
+ return final_state
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow as tf
+from keras.layers.legacy_rnn.rnn_cell_impl import LayerRNNCell
+from tensorflow.python.eager import context
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops import init_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.util import nest
+
+
+_BIAS_VARIABLE_NAME = "bias"
+_WEIGHTS_VARIABLE_NAME = "kernel"
+
+
+class SUMCell(LayerRNNCell):
+ """Cell for Sequential User Matrix"""
+
+ def __init__(
+ self,
+ num_units,
+ slots,
+ attention_size,
+ input_size,
+ activation=None,
+ reuse=None,
+ kernel_initializer=None,
+ bias_initializer=None,
+ name=None,
+ dtype=None,
+ **kwargs
+ ):
+ super(SUMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype, **kwargs)
+ _check_supported_dtypes(self.dtype)
+
+ if context.executing_eagerly() and context.num_gpus() > 0:
+ logging.warn(
+ "%s: Note that this cell is not optimized for performance. "
+ "Please use keras.layers.cudnn_recurrent.CuDNNGRU for better "
+ "performance on GPU.",
+ self,
+ )
+
+ self._input_size = input_size
+ self._slots = slots - 1 # the last channel is reserved for the highway slot
+ self._num_units = num_units
+ self._real_units = (self._num_units - input_size) // slots
+ if activation:
+ self._activation = activations.get(activation)
+ else:
+ self._activation = math_ops.tanh
+ self._kernel_initializer = initializers.get(kernel_initializer)
+ self._bias_initializer = initializers.get(bias_initializer)
+
+ @property
+ def state_size(self):
+ return self._num_units
+
+ @property
+ def output_size(self):
+ return self._num_units
+
+ def _basic_build(self, inputs_shape):
+ """Common initialization operations for SUM cell and its variants.
+ This function creates parameters for the cell.
+ """
+
+ d = inputs_shape[-1]
+ h = self._real_units
+ s = self._slots
+
+ self._erase_W = self.add_variable(
+ name="_erase_W", shape=[d + h, h], initializer=self._kernel_initializer
+ )
+ self._erase_b = self.add_variable(
+ name="_erase_b",
+ shape=[h],
+ initializer=(
+ self._bias_initializer
+ if self._bias_initializer is not None
+ else init_ops.constant_initializer(1.0, dtype=self.dtype)
+ ),
+ )
+
+ self._reset_W = self.add_variable(
+ name="_reset_W", shape=[d + h, 1], initializer=self._kernel_initializer
+ )
+ self._reset_b = self.add_variable(
+ name="_reset_b",
+ shape=[1],
+ initializer=(
+ self._bias_initializer
+ if self._bias_initializer is not None
+ else init_ops.constant_initializer(1.0, dtype=self.dtype)
+ ),
+ )
+
+ self._add_W = self.add_variable(
+ name="_add_W", shape=[d + h, h], initializer=self._kernel_initializer
+ )
+ self._add_b = self.add_variable(
+ name="_add_b",
+ shape=[h],
+ initializer=(
+ self._bias_initializer
+ if self._bias_initializer is not None
+ else init_ops.constant_initializer(1.0, dtype=self.dtype)
+ ),
+ )
+ self.heads = self.add_variable(
+ name="_heads", shape=[s, d], initializer=self._kernel_initializer
+ )
+
+ self._beta = self.add_variable(
+ name="_beta_no_reg",
+ shape=(),
+ initializer=tf.compat.v1.constant_initializer(
+ np.array([1.02]), dtype=np.float32
+ ),
+ )
+ self._alpha = self.add_variable(
+ name="_alpha_no_reg",
+ shape=(),
+ initializer=tf.compat.v1.constant_initializer(
+ np.array([0.98]), dtype=np.float32
+ ),
+ )
+
+ @tf_utils.shape_type_conversion
+ def build(self, inputs_shape):
+ """Initialization operations for SUM cell.
+ this function creates all the parameters for the cell.
+ """
+ if inputs_shape[-1] is None:
+ raise ValueError(
+ "Expected inputs.shape[-1] to be known, saw shape: %s"
+ % str(inputs_shape)
+ )
+ _check_supported_dtypes(self.dtype)
+ d = inputs_shape[-1] # noqa: F841
+ h = self._real_units # noqa: F841
+ s = self._slots # noqa: F841
+
+ self._basic_build(inputs_shape)
+
+ self.parameter_set = [
+ self._erase_W,
+ self._erase_b,
+ self._reset_W,
+ self._reset_b,
+ self._add_W,
+ self._add_b,
+ self.heads,
+ ]
+
+ self.built = True
+
+    def call(self, inputs, state):
+        """The real operations for the SUM cell to process user behaviors.
+
+        Args:
+            inputs: (A batch of) user behaviors at time T.
+            state: (A batch of) user states at time T-1.
+
+        Returns:
+            state, state: The new (batch of) user states at time T, returned twice because
+            the cell output and the next state are identical.
+        """
+ _check_rnn_cell_input_dtypes([inputs, state])
+
+ h = self._real_units
+ s = self._slots + 1
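+        # The flat state packs s memory slots of size h (the last one being the highway slot)
+        # followed by the previous step's raw input (`last`), which is compared against the
+        # current input below to compute a cosine-similarity-based scaling factor.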
+ state, last = state[:, : s * h], state[:, s * h :]
+ state = tf.reshape(state, [-1, s, h])
+
+ att_logit_mat = tf.matmul(inputs, self.heads, transpose_b=True)
+
+ att_weights = tf.nn.softmax(self._beta * att_logit_mat, axis=-1)
+ att_weights = tf.expand_dims(att_weights, 2)
+
+ h_hat = tf.reduce_sum(
+ input_tensor=tf.multiply(state[:, : self._slots, :], att_weights), axis=1
+ )
+ h_hat = (h_hat + state[:, self._slots, :]) / 2
+
+ n_a, n_b = tf.nn.l2_normalize(last, 1), tf.nn.l2_normalize(inputs, 1)
+ dist = tf.expand_dims(tf.reduce_sum(input_tensor=n_a * n_b, axis=1), 1)
+ dist = tf.math.pow(self._alpha, dist)
+
+ att_weights = att_weights * tf.expand_dims(dist, 1)
+
+ reset = tf.sigmoid(
+ tf.compat.v1.nn.xw_plus_b(
+ tf.concat([inputs, h_hat], axis=-1), self._reset_W, self._reset_b
+ )
+ )
+ erase = tf.sigmoid(
+ tf.compat.v1.nn.xw_plus_b(
+ tf.concat([inputs, h_hat], axis=-1), self._erase_W, self._erase_b
+ )
+ )
+ add = tf.tanh(
+ tf.compat.v1.nn.xw_plus_b(
+ tf.concat([inputs, reset * h_hat], axis=-1), self._add_W, self._add_b
+ )
+ )
+
+ start_part01 = state[:, : self._slots, :]
+ state01 = start_part01 * (
+ tf.ones_like(start_part01) - att_weights * tf.expand_dims(erase, 1)
+ )
+ state01 = state01 + att_weights * tf.expand_dims(erase, 1) * tf.expand_dims(
+ add, 1
+ )
+ state01 = tf.reshape(state01, [-1, self._slots * self._real_units])
+
+ start_part02 = state[:, self._slots, :]
+ state02 = start_part02 * (tf.ones_like(start_part02) - dist * erase)
+ state02 = state02 + dist * erase * add
+ state = tf.concat([state01, state02, inputs], axis=-1)
+ return state, state
+
+    def get_config(self):
+ config = {
+ "num_units": self._num_units,
+ "kernel_initializer": initializers.serialize(self._kernel_initializer),
+ "bias_initializer": initializers.serialize(self._bias_initializer),
+ "activation": activations.serialize(self._activation),
+ "reuse": self._reuse,
+ }
+ base_config = super(SUMCell, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
+
+
+class SUMV2Cell(SUMCell):
+ """A variant of SUM cell, which upgrades the writing attention"""
+
+ @tf_utils.shape_type_conversion
+ def build(self, inputs_shape):
+ """Initialization operations for SUMV2 cell.
+ this function creates all the parameters for the cell.
+ """
+ if inputs_shape[-1] is None:
+ raise ValueError(
+ "Expected inputs.shape[-1] to be known, saw shape: %s"
+ % str(inputs_shape)
+ )
+ _check_supported_dtypes(self.dtype)
+ d = inputs_shape[-1]
+ h = self._real_units
+ s = self._slots
+
+ self._basic_build(inputs_shape)
+
+ self._writing_W = self.add_variable(
+ name="_writing_W", shape=[d + h, h], initializer=self._kernel_initializer
+ )
+ self._writing_b = self.add_variable(
+ name="_writing_b",
+ shape=[h],
+ initializer=(
+ self._bias_initializer
+ if self._bias_initializer is not None
+ else init_ops.constant_initializer(1.0, dtype=self.dtype)
+ ),
+ )
+ self._writing_W02 = self.add_variable(
+ name="_writing_W02", shape=[h, s], initializer=self._kernel_initializer
+ )
+
+ self.parameter_set = [
+ self._erase_W,
+ self._erase_b,
+ self._reset_W,
+ self._reset_b,
+ self._add_W,
+ self._add_b,
+ self.heads,
+ self._writing_W,
+ self._writing_W02,
+ self._writing_b,
+ ]
+
+ self.built = True
+
+    def call(self, inputs, state):
+        """The real operations for the SUMV2 cell to process user behaviors.
+
+        Args:
+            inputs: (A batch of) user behaviors at time T.
+            state: (A batch of) user states at time T-1.
+
+        Returns:
+            state, state: The new (batch of) user states at time T, returned twice because
+            the cell output and the next state are identical.
+        """
+ _check_rnn_cell_input_dtypes([inputs, state])
+
+ h = self._real_units
+ s = self._slots + 1
+ state, last = state[:, : s * h], state[:, s * h :]
+ state = tf.reshape(state, [-1, s, h])
+
+ att_logit_mat = tf.matmul(inputs, self.heads, transpose_b=True)
+
+ att_weights = tf.nn.softmax(self._beta * att_logit_mat, axis=-1)
+ att_weights = tf.expand_dims(att_weights, 2)
+
+ h_hat = tf.reduce_sum(
+ input_tensor=tf.multiply(state[:, : self._slots, :], att_weights), axis=1
+ )
+ h_hat = (h_hat + state[:, self._slots, :]) / 2
+
+ # get the true writing attentions
+ writing_input = tf.concat([inputs, h_hat], axis=1)
+ att_weights = tf.compat.v1.nn.xw_plus_b(
+ writing_input, self._writing_W, self._writing_b
+ )
+ att_weights = tf.nn.relu(att_weights)
+ att_weights = tf.matmul(att_weights, self._writing_W02)
+ att_weights = tf.nn.softmax(att_weights, axis=-1)
+ att_weights = tf.expand_dims(att_weights, 2)
+
+ n_a, n_b = tf.nn.l2_normalize(last, 1), tf.nn.l2_normalize(inputs, 1)
+ dist = tf.expand_dims(tf.reduce_sum(input_tensor=n_a * n_b, axis=1), 1)
+ dist = tf.math.pow(self._alpha, dist)
+
+ att_weights = att_weights * tf.expand_dims(dist, 1)
+
+ reset = tf.sigmoid(
+ tf.compat.v1.nn.xw_plus_b(
+ tf.concat([inputs, h_hat], axis=-1), self._reset_W, self._reset_b
+ )
+ )
+ erase = tf.sigmoid(
+ tf.compat.v1.nn.xw_plus_b(
+ tf.concat([inputs, h_hat], axis=-1), self._erase_W, self._erase_b
+ )
+ )
+ add = tf.tanh(
+ tf.compat.v1.nn.xw_plus_b(
+ tf.concat([inputs, reset * h_hat], axis=-1), self._add_W, self._add_b
+ )
+ )
+
+ start_part01 = state[:, : self._slots, :]
+ state01 = start_part01 * (
+ tf.ones_like(start_part01) - att_weights * tf.expand_dims(erase, 1)
+ )
+ state01 = state01 + att_weights * tf.expand_dims(erase, 1) * tf.expand_dims(
+ add, 1
+ )
+ state01 = tf.reshape(state01, [-1, self._slots * self._real_units])
+
+ start_part02 = state[:, self._slots, :]
+ state02 = start_part02 * (tf.ones_like(start_part02) - dist * erase)
+ state02 = state02 + dist * erase * add
+ state = tf.concat([state01, state02, inputs], axis=-1)
+ return state, state
+
+
+def _check_rnn_cell_input_dtypes(inputs):
+ for t in nest.flatten(inputs):
+ _check_supported_dtypes(t.dtype)
+
+
+def _check_supported_dtypes(dtype):
+ if dtype is None:
+ return
+ dtype = dtypes.as_dtype(dtype)
+ if not (dtype.is_floating or dtype.is_complex):
+ raise ValueError(
+ "RNN cell only supports floating point inputs, " "but saw dtype: %s" % dtype
+ )
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow as tf
+
+from recommenders.models.deeprec.models.base_model import BaseModel
+
+
+__all__ = ["XDeepFMModel"]
+
+
+class XDeepFMModel(BaseModel):
+ """xDeepFM model
+
+ :Citation:
+
+ J. Lian, X. Zhou, F. Zhang, Z. Chen, X. Xie, G. Sun, "xDeepFM: Combining Explicit
+ and Implicit Feature Interactions for Recommender Systems", in Proceedings of the
+ 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining,
+ KDD 2018, London, 2018.
+ """
+
+ def _build_graph(self):
+ """The main function to create xdeepfm's logic.
+
+ Returns:
+ object: The prediction score made by the model.
+ """
+ hparams = self.hparams
+ self.keep_prob_train = 1 - np.array(hparams.dropout)
+ self.keep_prob_test = np.ones_like(hparams.dropout)
+
+ with tf.compat.v1.variable_scope("XDeepFM") as scope: # noqa: F841
+ with tf.compat.v1.variable_scope(
+ "embedding", initializer=self.initializer
+ ) as escope: # noqa: F841
+ self.embedding = tf.compat.v1.get_variable(
+ name="embedding_layer",
+ shape=[hparams.FEATURE_COUNT, hparams.dim],
+ dtype=tf.float32,
+ )
+ self.embed_params.append(self.embedding)
+ embed_out, embed_layer_size = self._build_embedding()
+
+ logit = 0
+
+ if hparams.use_Linear_part:
+ print("Add linear part.")
+ logit = logit + self._build_linear()
+
+ if hparams.use_FM_part:
+ print("Add FM part.")
+ logit = logit + self._build_fm()
+
+ if hparams.use_CIN_part:
+ print("Add CIN part.")
+ if hparams.fast_CIN_d <= 0:
+ logit = logit + self._build_CIN(
+ embed_out, res=True, direct=False, bias=False, is_masked=True
+ )
+ else:
+ logit = logit + self._build_fast_CIN(
+ embed_out, res=True, direct=False, bias=False
+ )
+
+ if hparams.use_DNN_part:
+ print("Add DNN part.")
+ logit = logit + self._build_dnn(embed_out, embed_layer_size)
+
+ return logit
+
+ def _build_embedding(self):
+ """The field embedding layer. MLP requires fixed-length vectors as input.
+        This function applies sum pooling to the feature embeddings of each field.
+
+ Returns:
+ embedding: The result of field embedding layer, with size of #_fields * #_dim.
+ embedding_size: #_fields * #_dim
+ """
+ hparams = self.hparams
+ fm_sparse_index = tf.SparseTensor(
+ self.iterator.dnn_feat_indices,
+ self.iterator.dnn_feat_values,
+ self.iterator.dnn_feat_shape,
+ )
+ fm_sparse_weight = tf.SparseTensor(
+ self.iterator.dnn_feat_indices,
+ self.iterator.dnn_feat_weights,
+ self.iterator.dnn_feat_shape,
+ )
+        w_fm_nn_input_origin = tf.nn.embedding_lookup_sparse(
+            params=self.embedding,
+            sp_ids=fm_sparse_index,
+            sp_weights=fm_sparse_weight,
+            combiner="sum",
+        )
+        embedding = tf.reshape(
+            w_fm_nn_input_origin, [-1, hparams.dim * hparams.FIELD_COUNT]
+        )
+ embedding_size = hparams.FIELD_COUNT * hparams.dim
+ return embedding, embedding_size
+
+ def _build_linear(self):
+ """Construct the linear part for the model.
+ This is a linear regression.
+
+ Returns:
+ object: Prediction score made by linear regression.
+ """
+ with tf.compat.v1.variable_scope(
+ "linear_part", initializer=self.initializer
+ ) as scope: # noqa: F841
+ w = tf.compat.v1.get_variable(
+ name="w", shape=[self.hparams.FEATURE_COUNT, 1], dtype=tf.float32
+ )
+ b = tf.compat.v1.get_variable(
+ name="b",
+ shape=[1],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ x = tf.SparseTensor(
+ self.iterator.fm_feat_indices,
+ self.iterator.fm_feat_values,
+ self.iterator.fm_feat_shape,
+ )
+ linear_output = tf.add(tf.sparse.sparse_dense_matmul(x, w), b)
+ self.layer_params.append(w)
+ self.layer_params.append(b)
+ tf.compat.v1.summary.histogram("linear_part/w", w)
+ tf.compat.v1.summary.histogram("linear_part/b", b)
+ return linear_output
+
+ def _build_fm(self):
+ """Construct the factorization machine part for the model.
+ This is a traditional 2-order FM module.
+
+ Returns:
+ object: Prediction score made by factorization machine.
+ """
+ with tf.compat.v1.variable_scope("fm_part") as scope: # noqa: F841
+ x = tf.SparseTensor(
+ self.iterator.fm_feat_indices,
+ self.iterator.fm_feat_values,
+ self.iterator.fm_feat_shape,
+ )
+ xx = tf.SparseTensor(
+ self.iterator.fm_feat_indices,
+ tf.pow(self.iterator.fm_feat_values, 2),
+ self.iterator.fm_feat_shape,
+ )
+ fm_output = 0.5 * tf.reduce_sum(
+ input_tensor=tf.pow(tf.sparse.sparse_dense_matmul(x, self.embedding), 2)
+ - tf.sparse.sparse_dense_matmul(xx, tf.pow(self.embedding, 2)),
+ axis=1,
+ keepdims=True,
+ )
+ return fm_output
+
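+    # FM second-order identity used above (informal):
+    #   sum_{i<j} x_i * x_j * <v_i, v_j> = 0.5 * [ (sum_i x_i * v_i)^2 - sum_i x_i^2 * v_i^2 ]
+    # summed over the embedding dimension, which is exactly the (xE)^2 - (x^2)(E^2) computation.
+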
+ def _build_CIN(
+ self, nn_input, res=False, direct=False, bias=False, is_masked=False
+ ):
+ """Construct the compressed interaction network.
+ This component provides explicit and vector-wise higher-order feature interactions.
+
+ Args:
+ nn_input (object): The output of field-embedding layer. This is the input for CIN.
+ res (bool): Whether use residual structure to fuse the results from each layer of CIN.
+            direct (bool): If true, all hidden units are connected to both the next layer and the output layer;
+                otherwise, half of the hidden units feed the next layer and the other half feed the output layer.
+ bias (bool): Whether to add bias term when calculating the feature maps.
+ is_masked (bool): Controls whether to remove self-interaction in the first layer of CIN.
+
+ Returns:
+ object: Prediction score made by CIN.
+ """
+ hparams = self.hparams
+ hidden_nn_layers = []
+ field_nums = []
+ final_len = 0
+ field_num = hparams.FIELD_COUNT
+ nn_input = tf.reshape(nn_input, shape=[-1, int(field_num), hparams.dim])
+ field_nums.append(int(field_num))
+ hidden_nn_layers.append(nn_input)
+ final_result = []
+ split_tensor0 = tf.split(hidden_nn_layers[0], hparams.dim * [1], 2)
+ with tf.compat.v1.variable_scope(
+ "exfm_part", initializer=self.initializer
+ ) as scope: # noqa: F841
+ for idx, layer_size in enumerate(hparams.cross_layer_sizes):
+ split_tensor = tf.split(hidden_nn_layers[-1], hparams.dim * [1], 2)
+ dot_result_m = tf.matmul(
+ split_tensor0, split_tensor, transpose_b=True
+ ) # shape : (Dim, Batch, FieldNum, HiddenNum), a.k.a (D,B,F,H)
+ dot_result_o = tf.reshape(
+ dot_result_m,
+ shape=[hparams.dim, -1, field_nums[0] * field_nums[-1]],
+ ) # shape: (D,B,FH)
+ dot_result = tf.transpose(a=dot_result_o, perm=[1, 0, 2]) # (B,D,FH)
+
+ filters = tf.compat.v1.get_variable(
+ name="f_" + str(idx),
+ shape=[1, field_nums[-1] * field_nums[0], layer_size],
+ dtype=tf.float32,
+ )
+
+ if is_masked and idx == 0:
+ ones = tf.ones([field_nums[0], field_nums[0]], dtype=tf.float32)
+ mask_matrix = tf.linalg.band_part(
+ ones, 0, -1
+ ) - tf.linalg.tensor_diag(tf.ones(field_nums[0]))
+ mask_matrix = tf.reshape(
+ mask_matrix, shape=[1, field_nums[0] * field_nums[0]]
+ )
+
+ dot_result = tf.multiply(dot_result, mask_matrix) * 2
+ self.dot_result = dot_result
+
+ curr_out = tf.nn.conv1d(
+ input=dot_result, filters=filters, stride=1, padding="VALID"
+ ) # shape : (B,D,H`)
+
+ if bias:
+ b = tf.compat.v1.get_variable(
+ name="f_b" + str(idx),
+ shape=[layer_size],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ curr_out = tf.nn.bias_add(curr_out, b)
+ self.cross_params.append(b)
+
+ if hparams.enable_BN is True:
+ curr_out = tf.compat.v1.layers.batch_normalization(
+ curr_out,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ curr_out = self._activate(curr_out, hparams.cross_activation)
+
+ curr_out = tf.transpose(a=curr_out, perm=[0, 2, 1]) # shape : (B,H,D)
+
+ if direct:
+ direct_connect = curr_out
+ next_hidden = curr_out
+ final_len += layer_size
+ field_nums.append(int(layer_size))
+
+ else:
+ if idx != len(hparams.cross_layer_sizes) - 1:
+ next_hidden, direct_connect = tf.split(
+ curr_out, 2 * [int(layer_size / 2)], 1
+ )
+ final_len += int(layer_size / 2)
+ else:
+ direct_connect = curr_out
+ next_hidden = 0
+ final_len += layer_size
+ field_nums.append(int(layer_size / 2))
+
+ final_result.append(direct_connect)
+ hidden_nn_layers.append(next_hidden)
+
+ self.cross_params.append(filters)
+
+ result = tf.concat(final_result, axis=1)
+ result = tf.reduce_sum(input_tensor=result, axis=-1) # shape : (B,H)
+
+ if res:
+ base_score = tf.reduce_sum(
+ input_tensor=result, axis=1, keepdims=True
+ ) # (B,1)
+ else:
+ base_score = 0
+
+ w_nn_output = tf.compat.v1.get_variable(
+ name="w_nn_output", shape=[final_len, 1], dtype=tf.float32
+ )
+ b_nn_output = tf.compat.v1.get_variable(
+ name="b_nn_output",
+ shape=[1],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ self.layer_params.append(w_nn_output)
+ self.layer_params.append(b_nn_output)
+ exFM_out = base_score + tf.compat.v1.nn.xw_plus_b(
+ result, w_nn_output, b_nn_output
+ )
+ return exFM_out
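+
+ # Shape walk-through of a single CIN layer as a NumPy sketch (illustration
+ # only, toy sizes assumed). With x0: (F0, D) field embeddings, xk: (Fk, D)
+ # the previous CIN layer and W: (F0 * Fk, H) the flattened filter, the
+ # conv1d above computes, per embedding dimension d:
+ #   out[d, h] = sum_{i, j} x0[i, d] * xk[j, d] * W[i * Fk + j, h]
+ #
+ #   import numpy as np
+ #   F0, Fk, D, H = 4, 3, 2, 5
+ #   rng = np.random.default_rng(0)
+ #   x0 = rng.normal(size=(F0, D))
+ #   xk = rng.normal(size=(Fk, D))
+ #   W = rng.normal(size=(F0 * Fk, H))
+ #   z = np.einsum("id,jd->dij", x0, xk).reshape(D, F0 * Fk)  # pairwise products per dim
+ #   out = z @ W                                              # (D, H), one batch element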
+
+ def _build_fast_CIN(self, nn_input, res=False, direct=False, bias=False):
+ """Construct the compressed interaction network with reduced parameters.
+ This component provides explicit and vector-wise higher-order feature interactions.
+ Parameters from the filters are reduced via a matrix decomposition method.
+ Fast CIN is more space and time efficient than CIN.
+
+ Args:
+ nn_input (object): The output of field-embedding layer. This is the input for CIN.
+ res (bool): Whether to use a residual structure to fuse the results from each layer of CIN.
+ direct (bool): If true, all hidden units are connected to both the next layer and the output layer;
+ otherwise, half of the hidden units are connected to the next layer and the other half to the output layer.
+ bias (bool): Whether to add a bias term when calculating the feature maps.
+
+ Returns:
+ object: Prediction score made by fast CIN.
+ """
+ hparams = self.hparams
+ hidden_nn_layers = []
+ field_nums = []
+ final_len = 0
+ field_num = hparams.FIELD_COUNT
+ fast_CIN_d = hparams.fast_CIN_d
+ nn_input = tf.reshape(
+ nn_input, shape=[-1, int(field_num), hparams.dim]
+ ) # (B,F,D)
+ nn_input = tf.transpose(a=nn_input, perm=[0, 2, 1]) # (B,D,F)
+ field_nums.append(int(field_num))
+ hidden_nn_layers.append(nn_input)
+ final_result = []
+ with tf.compat.v1.variable_scope(
+ "exfm_part", initializer=self.initializer
+ ) as scope: # noqa: F841
+ for idx, layer_size in enumerate(hparams.cross_layer_sizes):
+ if idx == 0:
+ fast_w = tf.compat.v1.get_variable(
+ "fast_CIN_w_" + str(idx),
+ shape=[1, field_nums[0], fast_CIN_d * layer_size],
+ dtype=tf.float32,
+ )
+
+ self.cross_params.append(fast_w)
+ dot_result_1 = tf.nn.conv1d(
+ input=nn_input, filters=fast_w, stride=1, padding="VALID"
+ ) # shape: (B,D,d*H)
+ dot_result_2 = tf.nn.conv1d(
+ input=tf.pow(nn_input, 2),
+ filters=tf.pow(fast_w, 2),
+ stride=1,
+ padding="VALID",
+ ) # shape: (B,D,d*H)
+ dot_result = tf.reshape(
+ 0.5 * (dot_result_1 - dot_result_2),
+ shape=[-1, hparams.dim, layer_size, fast_CIN_d],
+ )
+ curr_out = tf.reduce_sum(
+ input_tensor=dot_result, axis=3, keepdims=False
+ ) # shape: (B,D,H)
+ else:
+ fast_w = tf.compat.v1.get_variable(
+ "fast_CIN_w_" + str(idx),
+ shape=[1, field_nums[0], fast_CIN_d * layer_size],
+ dtype=tf.float32,
+ )
+ fast_v = tf.compat.v1.get_variable(
+ "fast_CIN_v_" + str(idx),
+ shape=[1, field_nums[-1], fast_CIN_d * layer_size],
+ dtype=tf.float32,
+ )
+
+ self.cross_params.append(fast_w)
+ self.cross_params.append(fast_v)
+
+ dot_result_1 = tf.nn.conv1d(
+ input=nn_input, filters=fast_w, stride=1, padding="VALID"
+ ) # shape: (B,D,d*H)
+ dot_result_2 = tf.nn.conv1d(
+ input=hidden_nn_layers[-1],
+ filters=fast_v,
+ stride=1,
+ padding="VALID",
+ ) # shape: (B,D,d*H)
+ dot_result = tf.reshape(
+ tf.multiply(dot_result_1, dot_result_2),
+ shape=[-1, hparams.dim, layer_size, fast_CIN_d],
+ )
+ curr_out = tf.reduce_sum(
+ input_tensor=dot_result, axis=3, keepdims=False
+ ) # shape: (B,D,H)
+
+ if bias:
+ b = tf.compat.v1.get_variable(
+ name="f_b" + str(idx),
+ shape=[1, 1, layer_size],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ curr_out = tf.nn.bias_add(curr_out, b)
+ self.cross_params.append(b)
+
+ if hparams.enable_BN is True:
+ curr_out = tf.compat.v1.layers.batch_normalization(
+ curr_out,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ curr_out = self._activate(curr_out, hparams.cross_activation)
+
+ if direct:
+ direct_connect = curr_out
+ next_hidden = curr_out
+ final_len += layer_size
+ field_nums.append(int(layer_size))
+
+ else:
+ if idx != len(hparams.cross_layer_sizes) - 1:
+ next_hidden, direct_connect = tf.split(
+ curr_out, 2 * [int(layer_size / 2)], 2
+ )
+ final_len += int(layer_size / 2)
+ field_nums.append(int(layer_size / 2))
+ else:
+ direct_connect = curr_out
+ next_hidden = 0
+ final_len += layer_size
+ field_nums.append(int(layer_size))
+
+ final_result.append(direct_connect)
+ hidden_nn_layers.append(next_hidden)
+
+ result = tf.concat(final_result, axis=2)
+ result = tf.reduce_sum(input_tensor=result, axis=1, keepdims=False) # (B,H)
+
+ if res:
+ base_score = tf.reduce_sum(
+ input_tensor=result, axis=1, keepdims=True
+ ) # (B,1)
+ else:
+ base_score = 0
+
+ w_nn_output = tf.compat.v1.get_variable(
+ name="w_nn_output", shape=[final_len, 1], dtype=tf.float32
+ )
+ b_nn_output = tf.compat.v1.get_variable(
+ name="b_nn_output",
+ shape=[1],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ self.layer_params.append(w_nn_output)
+ self.layer_params.append(b_nn_output)
+ exFM_out = (
+ tf.compat.v1.nn.xw_plus_b(result, w_nn_output, b_nn_output) + base_score
+ )
+
+ return exFM_out
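+
+ # Note on the decomposition (illustration only): the full CIN filter
+ # W[(i, j), h] of shape (F0 * Fk, H) is approximated here by a rank-d product
+ #   W[(i, j), h] ~= sum_{m=1..d} fast_w[i, (m, h)] * fast_v[j, (m, h)]
+ # so each layer needs (F0 + Fk) * d * H parameters instead of F0 * Fk * H,
+ # and the interaction reduces to the two conv1d outputs multiplied
+ # element-wise and summed over the d axis, as done above.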
+
+ def _build_dnn(self, embed_out, embed_layer_size):
+ """Construct the MLP part for the model.
+ This component provides implicit higher-order feature interactions.
+
+ Args:
+ embed_out (object): The output of field-embedding layer. This is the input for DNN.
+ embed_layer_size (int): Dimension of embed_out.
+
+ Returns:
+ object: Prediction score made by the MLP.
+ """
+ hparams = self.hparams
+ w_fm_nn_input = embed_out
+ last_layer_size = embed_layer_size
+ layer_idx = 0
+ hidden_nn_layers = []
+ hidden_nn_layers.append(w_fm_nn_input)
+ with tf.compat.v1.variable_scope(
+ "nn_part", initializer=self.initializer
+ ) as scope:
+ for idx, layer_size in enumerate(hparams.layer_sizes):
+ curr_w_nn_layer = tf.compat.v1.get_variable(
+ name="w_nn_layer" + str(layer_idx),
+ shape=[last_layer_size, layer_size],
+ dtype=tf.float32,
+ )
+ curr_b_nn_layer = tf.compat.v1.get_variable(
+ name="b_nn_layer" + str(layer_idx),
+ shape=[layer_size],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "w_nn_layer" + str(layer_idx), curr_w_nn_layer
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "b_nn_layer" + str(layer_idx), curr_b_nn_layer
+ )
+ curr_hidden_nn_layer = tf.compat.v1.nn.xw_plus_b(
+ hidden_nn_layers[layer_idx], curr_w_nn_layer, curr_b_nn_layer
+ )
+ scope = "nn_part" + str(idx) # noqa: F841
+ activation = hparams.activation[idx]
+
+ if hparams.enable_BN is True:
+ curr_hidden_nn_layer = tf.compat.v1.layers.batch_normalization(
+ curr_hidden_nn_layer,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ curr_hidden_nn_layer = self._active_layer(
+ logit=curr_hidden_nn_layer, activation=activation, layer_idx=idx
+ )
+ hidden_nn_layers.append(curr_hidden_nn_layer)
+ layer_idx += 1
+ last_layer_size = layer_size
+ self.layer_params.append(curr_w_nn_layer)
+ self.layer_params.append(curr_b_nn_layer)
+
+ w_nn_output = tf.compat.v1.get_variable(
+ name="w_nn_output", shape=[last_layer_size, 1], dtype=tf.float32
+ )
+ b_nn_output = tf.compat.v1.get_variable(
+ name="b_nn_output",
+ shape=[1],
+ dtype=tf.float32,
+ initializer=tf.compat.v1.zeros_initializer(),
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "w_nn_output" + str(layer_idx), w_nn_output
+ )
+ tf.compat.v1.summary.histogram(
+ "nn_part/" + "b_nn_output" + str(layer_idx), b_nn_output
+ )
+ self.layer_params.append(w_nn_output)
+ self.layer_params.append(b_nn_output)
+ nn_output = tf.compat.v1.nn.xw_plus_b(
+ hidden_nn_layers[-1], w_nn_output, b_nn_output
+ )
+ return nn_output
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+import numpy as np
+import pandas as pd
+import fastai
+import fastprogress
+from fastprogress.fastprogress import force_console_behavior
+
+from recommenders.utils import constants as cc
+
+
+[docs]def cartesian_product(*arrays):
+ """Compute the Cartesian product in fastai algo. This is a helper function.
+
+ Args:
+ arrays (tuple of numpy.ndarray): Input arrays
+
+ Returns:
+ numpy.ndarray: product
+
+ """
+ la = len(arrays)
+ dtype = np.result_type(*arrays)
+ arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype)
+ for i, a in enumerate(np.ix_(*arrays)):
+ arr[..., i] = a
+ return arr.reshape(-1, la)
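+
+ # Example (illustration only):
+ #   cartesian_product(np.array([1, 2]), np.array([10, 20]))
+ #   -> array([[ 1, 10], [ 1, 20], [ 2, 10], [ 2, 20]])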
+
+
+[docs]def score(
+ learner,
+ test_df,
+ user_col=cc.DEFAULT_USER_COL,
+ item_col=cc.DEFAULT_ITEM_COL,
+ prediction_col=cc.DEFAULT_PREDICTION_COL,
+ top_k=None,
+):
+ """Score all users+items provided and reduce to top_k items per user if top_k>0
+
+ Args:
+ learner (object): Model.
+ test_df (pandas.DataFrame): Test dataframe.
+ user_col (str): User column name.
+ item_col (str): Item column name.
+ prediction_col (str): Prediction column name.
+ top_k (int): Number of top items to recommend.
+
+ Returns:
+ pandas.DataFrame: Result of recommendation
+ """
+ # replace values not known to the model with NaN
+ total_users, total_items = learner.data.train_ds.x.classes.values()
+ test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan
+ test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan
+
+ # map ids to embedding ids
+ u = learner.get_idx(test_df[user_col], is_item=False)
+ m = learner.get_idx(test_df[item_col], is_item=True)
+
+ # score the pytorch model
+ pred = learner.model.forward(u, m)
+ scores = pd.DataFrame(
+ {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred}
+ )
+ scores = scores.sort_values([user_col, prediction_col], ascending=[True, False])
+ if top_k is not None:
+ top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True)
+ else:
+ top_scores = scores
+ return top_scores
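+
+ # Usage sketch (illustration only; `learn` is a fitted fastai collaborative
+ # filtering learner and `test_df` uses the same user/item columns as training):
+ #
+ #   top10 = score(learn, test_df, top_k=10)
+ #   top10.head()  # columns: userID, itemID, prediction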
+
+
+[docs]def hide_fastai_progress_bar():
+ """Hide fastai progress bar"""
+ fastprogress.fastprogress.NO_BAR = True
+ fastprogress.fastprogress.WRITER_FN = str
+ master_bar, progress_bar = force_console_behavior()
+ fastai.basic_train.master_bar, fastai.basic_train.progress_bar = (
+ master_bar,
+ progress_bar,
+ )
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import logging
+import pandas as pd
+import numpy as np
+from scipy.sparse import coo_matrix, isspmatrix_csr
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import normalize
+
+from recommenders.utils.python_utils import binarize
+from .geoimc_utils import length_normalize, reduce_dims
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("geoimc")
+
+
+[docs]class DataPtr:
+ """
+ Holds data and its respective indices
+ """
+
+ def __init__(self, data, entities):
+ """Initialize a data pointer
+
+ Args:
+ data (csr_matrix): The target data matrix.
+ entities (Iterator): An iterator (of 2 elements (ndarray)) containing
+ the features of row, col entities.
+ """
+ assert isspmatrix_csr(data)
+
+ self.data = data
+ self.entities = entities
+ self.data_indices = None
+ self.entity_indices = [None, None]
+
+[docs] def get_data(self):
+ """
+ Returns:
+ csr_matrix: Target matrix (based on the data_indices filter)
+ """
+ if self.data_indices is None:
+ return self.data
+ return self.data[self.data_indices]
+
+[docs] def get_entity(self, of="row"):
+ """Get entity
+
+ Args:
+ of (str): The entity, either 'row' or 'col'
+ Returns:
+ numpy.ndarray: Entity matrix (based on the entity_indices filter)
+ """
+ idx = 0 if of == "row" else 1
+ if self.entity_indices[idx] is None:
+ return self.entities[idx]
+ return self.entities[idx][self.entity_indices[idx]]
+
+
+[docs]class Dataset:
+ """
+ Base class that holds necessary (minimal) information needed
+ """
+
+ def __init__(self, name, features_dim=0, normalize=False, target_transform=""):
+ """Initialize parameters
+
+ Args:
+ name (str): Name of the dataset
+ features_dim (uint): Dimension of the features. If not 0, PCA is performed
+ on the features as the dimensionality reduction technique
+ normalize (bool): Normalize the features
+ target_transform (str): Transform the target values. Current options are
+ 'normalize' (Normalize the values), '' (Do nothing), 'binarize' (convert
+ the values using a threshold defined per dataset)
+
+ """
+ self.name = name
+ self.training_data = None
+ self.test_data = None
+ self.entities = None
+
+ self.features_dim = features_dim
+ self.feat_normalize = normalize
+ self.target_transform = target_transform
+
+[docs] def normalize(self):
+ """Normalizes the entity features"""
+ if self.feat_normalize:
+ for i in range(len(self.entities)):
+ if isspmatrix_csr(self.entities[i]):
+ logger.info("Normalizing CSR matrix")
+ self.entities[i] = normalize(self.entities[i])
+ else:
+ self.entities[i] = length_normalize(self.entities[i])
+
+[docs] def generate_train_test_data(self, data, test_ratio=0.3):
+ """Generate train, test split. The split is performed on the row
+ entities. So, this essentially becomes a cold start row entity test.
+
+ Args:
+ data (csr_matrix): The entire target matrix.
+ test_ratio (float): Ratio of test split.
+
+ """
+ self.training_data = DataPtr(data, self.entities)
+ self.test_data = DataPtr(data, self.entities)
+
+ self.training_data.data_indices, self.test_data.data_indices = train_test_split(
+ np.array(range(0, data.shape[0])),
+ test_size=test_ratio,
+ shuffle=True,
+ random_state=0,
+ )
+ self.training_data.entity_indices[0] = self.training_data.data_indices
+ self.test_data.entity_indices[0] = self.test_data.data_indices
+
+[docs] def reduce_dims(self):
+ """Reduces the dimensionality of entity features."""
+ if self.features_dim != 0:
+ self.entities[0] = reduce_dims(self.entities[0], self.features_dim)
+ self.entities[1] = reduce_dims(self.entities[1], self.features_dim)
+ logger.info("Dimensionality reduced ...")
+
+
+[docs]class ML_100K(Dataset):
+ """
+ Handles MovieLens-100K
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(self.__class__.__name__, **kwargs)
+ self.min_rating = 1
+ self.max_rating = 5
+
+[docs] def df2coo(self, df):
+ """Convert the input dataframe into a coo matrix
+
+ Args:
+ df (pandas.DataFrame): DataFrame containing the target matrix information.
+ """
+ data = []
+ row = list(df["user id"] - 1)
+ col = list(df["item id"] - 1)
+ for idx in range(0, len(df)):
+ val = df["rating"].iloc[idx]
+ data += [val]
+
+ if self.target_transform == "normalize":
+ data = data / np.sqrt(
+ np.sum(np.arange(self.min_rating, self.max_rating + 1) ** 2)
+ )
+ elif self.target_transform == "binarize":
+ data = binarize(np.array(data), 3)
+
+ # TODO: Get this from `u.info`
+ return coo_matrix((data, (row, col)), shape=(943, 1682))
+
+ def _read_from_file(self, path):
+ """Read the traget matrix from file at path.
+
+ Args:
+ path (str): Path to the target matrix
+ """
+ df = pd.read_csv(
+ path,
+ delimiter="\t",
+ names=["user id", "item id", "rating", "timestamp"],
+ encoding="ISO-8859-1",
+ )
+ df.drop(["timestamp"], axis=1, inplace=True)
+ return self.df2coo(df)
+
+[docs] def load_data(self, path):
+ """Load dataset
+
+ Args:
+ path (str): Path to the directory containing the ML-100K dataset
+ (including `u.user`, `u.item`, `u1.base` and `u1.test`).
+ """
+ self.entities = [
+ self._load_user_features(f"{path}/u.user"),
+ self._load_item_features(f"{path}/u.item"),
+ ]
+ self.normalize()
+ self.reduce_dims()
+ self.training_data = DataPtr(
+ self._read_from_file(f"{path}/u1.base").tocsr(), self.entities
+ )
+ self.test_data = DataPtr(
+ self._read_from_file(f"{path}/u1.test").tocsr(), self.entities
+ )
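+
+ # Usage sketch (illustration only; "ml-100k" is an extracted MovieLens-100K folder):
+ #
+ #   ml = ML_100K(normalize=True, target_transform="binarize")
+ #   ml.load_data("ml-100k")
+ #   R_train = ml.training_data.get_data()  # csr_matrix of shape (943, 1682)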
+
+ def _load_user_features(self, path):
+ """Load user features
+
+ Args:
+ path (str): Path to the file containing user features information
+
+ """
+ data = pd.read_csv(
+ path,
+ delimiter="|",
+ names=["user_id", "age", "gender", "occupation", "zip_code"],
+ )
+ features_df = pd.concat(
+ [
+ data["user_id"],
+ pd.get_dummies(data["user_id"]),
+ pd.get_dummies(data["age"]),
+ pd.get_dummies(data["gender"]),
+ pd.get_dummies(data["occupation"]),
+ pd.get_dummies(data["zip_code"]),
+ ],
+ axis=1,
+ )
+ features_df.drop(["user_id"], axis=1, inplace=True)
+ user_features = np.nan_to_num(features_df.to_numpy())
+ return user_features
+
+ def _load_item_features(self, path):
+ """Load item features
+
+ Args:
+ path (str): Path to the file containing item features information
+
+ """
+ header = [
+ "movie_id",
+ "movie_title",
+ "release_date",
+ "video_release_date",
+ "IMDb_URL",
+ "unknown",
+ "Action",
+ "Adventure",
+ "Animation",
+ "Childrens",
+ "Comedy",
+ "Crime",
+ "Documentary",
+ "Drama",
+ "Fantasy",
+ "Film-Noir",
+ "Horror",
+ "Musical",
+ "Mystery",
+ "Romance",
+ "Sci-Fi",
+ "Thriller",
+ "War",
+ "Western",
+ ]
+ data = pd.read_csv(path, delimiter="|", names=header, encoding="ISO-8859-1")
+
+ features_df = pd.concat(
+ [
+ pd.get_dummies(data["movie_title"]),
+ pd.get_dummies(data["release_date"]),
+ pd.get_dummies("video_release_date"),
+ pd.get_dummies("IMDb_URL"),
+ data[header[5:]],
+ ],
+ axis=1,
+ )
+ item_features = np.nan_to_num(features_df.to_numpy())
+ return item_features
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+from scipy.linalg import sqrtm
+
+from recommenders.utils.python_utils import binarize as conv_binary
+
+
+[docs]class PlainScalarProduct(object):
+ """
+ Module that implements plain scalar product
+ as the retrieval criterion
+ """
+
+ def __init__(self, X, Y, **kwargs):
+ """
+ Args:
+ X: numpy matrix of shape (users, features)
+ Y: numpy matrix of shape (items, features)
+ """
+ self.X = X
+ self.Y = Y
+
+[docs] def sim(self, **kwargs):
+ """Calculate the similarity score"""
+ sim = self.X.dot(self.Y.T)
+ return sim
+
+
+[docs]class Inferer:
+ """
+ Holds necessary (minimal) information needed for inference
+ """
+
+ def __init__(self, method="dot", k=10, transformation=""):
+ """Initialize parameters
+
+ Args:
+ method (str): The inference method. Currently 'dot'
+ (Dot product) is supported.
+ k (uint): `k` for 'topk' transformation.
+ transformation (str): Transform the inferred values into a
+ different scale. Currently 'mean' (Binarize the values
+ using mean of inferred matrix as the threshold), 'topk'
+ (Pick Top-K inferred values per row and assign them 1,
+ setting rest of them to 0), '' (No transformation) are
+ supported.
+ """
+ self.method = self._get_method(method)
+ self.k = k
+ self.transformation = transformation
+
+ def _get_method(self, k):
+ """Get the inferer method
+
+ Args:
+ k (str): The inferer name
+
+ Returns:
+ class: A class object implementing the inferer 'k'
+ """
+ if k == "dot":
+ method = PlainScalarProduct
+ else:
+ raise ValueError(f"{k} is unknown.")
+ return method
+
+[docs] def infer(self, dataPtr, W, **kwargs):
+ """Main inference method
+
+ Args:
+ dataPtr (DataPtr): An object containing the X, Z features needed for inference
+ W (iterable): An iterable containing the U, B, V parametrized matrices.
+ """
+
+ if isinstance(dataPtr, list):
+ a = dataPtr[0]
+ b = dataPtr[1]
+ else:
+ a = dataPtr.get_entity("row").dot(W[0]).dot(sqrtm(W[1]))
+ b = dataPtr.get_entity("col").dot(W[2]).dot(sqrtm(W[1]))
+
+ sim_score = self.method(a, b).sim(**kwargs)
+
+ if self.transformation == "mean":
+ prediction = conv_binary(sim_score, sim_score.mean())
+ elif self.transformation == "topk":
+ masked_sim_score = sim_score.copy()
+
+ for i in range(sim_score.shape[0]):
+ topKidx = np.argpartition(masked_sim_score[i], -self.k)[-self.k :]
+ mask = np.ones(sim_score[i].size, dtype=bool)
+ mask[topKidx] = False
+
+ masked_sim_score[i][topKidx] = 1
+ masked_sim_score[i][mask] = 0
+ prediction = masked_sim_score
+ else:
+ prediction = sim_score
+
+ return prediction
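+
+ # Minimal usage sketch (toy factors, illustration only):
+ #
+ #   import numpy as np
+ #   a = np.random.rand(4, 3)  # row-entity factors
+ #   b = np.random.rand(5, 3)  # col-entity factors
+ #   inferer = Inferer(method="dot", k=2, transformation="topk")
+ #   pred = inferer.infer([a, b], W=None)  # each row of pred has exactly k ones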
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+from sklearn.decomposition import PCA
+
+
+[docs]def length_normalize(matrix):
+ """Length normalize the matrix
+
+ Args:
+ matrix (np.ndarray): Input matrix that needs to be normalized
+
+ Returns:
+ numpy.ndarray: Normalized matrix
+ """
+ norms = np.sqrt(np.sum(matrix**2, axis=1))
+ norms[norms == 0] = 1
+ return matrix / norms[:, np.newaxis]
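+
+ # Example (illustration only):
+ #   length_normalize(np.array([[3.0, 4.0]]))  # -> array([[0.6, 0.8]])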
+
+
+[docs]def mean_center(matrix):
+ """Performs mean centering across axis 0
+
+ Args:
+ matrix (np.ndarray): Input matrix that needs to be mean centered
+ """
+ avg = np.mean(matrix, axis=0)
+ matrix -= avg
+
+
+[docs]def reduce_dims(matrix, target_dim):
+ """Reduce dimensionality of the data using PCA.
+
+ Args:
+ matrix (np.ndarray): Matrix of the form (n_samples, n_features)
+ target_dim (uint): Dimension to which n_features should be reduced.
+
+ Returns:
+ np.ndarray: Matrix of the form (n_samples, target_dim)
+
+ """
+ model = PCA(n_components=target_dim)
+ model.fit(matrix)
+ return model.transform(matrix)
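+
+ # Example (illustration only):
+ #   X = np.random.rand(100, 20)
+ #   reduce_dims(X, target_dim=5).shape  # -> (100, 5)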
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+
+from lightfm.evaluation import precision_at_k, recall_at_k
+
+
+[docs]def model_perf_plots(df):
+ """Function to plot model performance metrics.
+
+ Args:
+ df (pandas.DataFrame): Dataframe in tidy format, with ['epoch','level','value'] columns
+
+ Returns:
+ object: seaborn FacetGrid with the metric plots
+ """
+ g = sns.FacetGrid(df, col="metric", hue="stage", col_wrap=2, sharey=False)
+ g = g.map(sns.scatterplot, "epoch", "value").add_legend()
+ return g
+
+
+[docs]def compare_metric(df_list, metric="prec", stage="test"):
+ """Function to combine and prepare list of dataframes into tidy format.
+
+ Args:
+ df_list (list): List of dataframes
+ metric (str): name of metric to be extracted, optional
+ stage (str): name of model fitting stage to be extracted, optional
+
+ Returns:
+ pandas.DataFrame: Metrics
+ """
+ colnames = ["model" + str(x) for x in list(range(1, len(df_list) + 1))]
+ models = [
+ df[(df["stage"] == stage) & (df["metric"] == metric)]["value"]
+ .reset_index(drop=True)
+ .values
+ for df in df_list
+ ]
+
+ output = pd.DataFrame(zip(*models), columns=colnames).stack().reset_index()
+ output.columns = ["epoch", "data", "value"]
+ return output
+
+
+[docs]def track_model_metrics(
+ model,
+ train_interactions,
+ test_interactions,
+ k=10,
+ no_epochs=100,
+ no_threads=8,
+ show_plot=True,
+ **kwargs
+):
+ """Function to record model's performance at each epoch, formats the performance into tidy format,
+ plots the performance and outputs the performance data.
+
+ Args:
+ model (LightFM instance): fitted LightFM model
+ train_interactions (scipy sparse COO matrix): train interactions set
+ test_interactions (scipy sparse COO matrix): test interaction set
+ k (int): number of recommendations, optional
+ no_epochs (int): Number of epochs to run, optional
+ no_threads (int): Number of parallel threads to use, optional
+ show_plot (bool): If true, plot the performance traces after fitting, optional
+ **kwargs: other keyword arguments to be passed down
+
+ Returns:
+ pandas.DataFrame, LightFM model:
+ - Performance traces of the fitted model
+ - Fitted model
+ """
+ # initialising temp data storage
+ model_prec_train = [0] * no_epochs
+ model_prec_test = [0] * no_epochs
+
+ model_rec_train = [0] * no_epochs
+ model_rec_test = [0] * no_epochs
+
+ # fit model and store train/test metrics at each epoch
+ for epoch in range(no_epochs):
+ model.fit_partial(
+ interactions=train_interactions, epochs=1, num_threads=no_threads, **kwargs
+ )
+ model_prec_train[epoch] = precision_at_k(
+ model, train_interactions, k=k, **kwargs
+ ).mean()
+ model_prec_test[epoch] = precision_at_k(
+ model, test_interactions, k=k, **kwargs
+ ).mean()
+
+ model_rec_train[epoch] = recall_at_k(
+ model, train_interactions, k=k, **kwargs
+ ).mean()
+ model_rec_test[epoch] = recall_at_k(
+ model, test_interactions, k=k, **kwargs
+ ).mean()
+
+ # collect the performance metrics into a dataframe
+ fitting_metrics = pd.DataFrame(
+ zip(model_prec_train, model_prec_test, model_rec_train, model_rec_test),
+ columns=[
+ "model_prec_train",
+ "model_prec_test",
+ "model_rec_train",
+ "model_rec_test",
+ ],
+ )
+ # convert into tidy format
+ fitting_metrics = fitting_metrics.stack().reset_index()
+ fitting_metrics.columns = ["epoch", "level", "value"]
+ # extract the labels for each observation
+ fitting_metrics["stage"] = fitting_metrics.level.str.split("_").str[-1]
+ fitting_metrics["metric"] = fitting_metrics.level.str.split("_").str[1]
+ fitting_metrics.drop(["level"], axis=1, inplace=True)
+ # replace the metric keys to improve visualisation
+ metric_keys = {"prec": "Precision", "rec": "Recall"}
+ fitting_metrics.metric.replace(metric_keys, inplace=True)
+ # plots the performance data
+ if show_plot:
+ model_perf_plots(fitting_metrics)
+ return fitting_metrics, model
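+
+ # Usage sketch (illustration only; `train` and `test` are scipy sparse COO
+ # interaction matrices built with lightfm.data.Dataset):
+ #
+ #   from lightfm import LightFM
+ #   model = LightFM(loss="warp", random_state=42)
+ #   metrics, model = track_model_metrics(
+ #       model, train, test, k=10, no_epochs=5, show_plot=False
+ #   )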
+
+
+[docs]def similar_users(user_id, user_features, model, N=10):
+ """Function to return top N similar users based on https://github.com/lyst/lightfm/issues/244#issuecomment-355305681
+
+ Args:
+ user_id (int): id of user to be used as reference
+ user_features (scipy sparse CSR matrix): user feature matrix
+ model (LightFM instance): fitted LightFM model
+ N (int): Number of top similar users to return
+
+ Returns:
+ pandas.DataFrame: top N most similar users with score
+ """
+ _, user_representations = model.get_user_representations(features=user_features)
+
+ # Cosine similarity
+ scores = user_representations.dot(user_representations[user_id, :])
+ user_norms = np.linalg.norm(user_representations, axis=1)
+ user_norms[user_norms == 0] = 1e-10
+ scores /= user_norms
+
+ best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
+ return pd.DataFrame(
+ sorted(zip(best, scores[best] / user_norms[user_id]), key=lambda x: -x[1])[1:],
+ columns=["userID", "score"],
+ )
+
+
+[docs]def similar_items(item_id, item_features, model, N=10):
+ """Function to return top N similar items
+ based on https://github.com/lyst/lightfm/issues/244#issuecomment-355305681
+
+ Args:
+ item_id (int): id of item to be used as reference
+ item_features (scipy sparse CSR matrix): item feature matrix
+ model (LightFM instance): fitted LightFM model
+ N (int): Number of top similar items to return
+
+ Returns:
+ pandas.DataFrame: top N most similar items with score
+ """
+ _, item_representations = model.get_item_representations(features=item_features)
+
+ # Cosine similarity
+ scores = item_representations.dot(item_representations[item_id, :])
+ item_norms = np.linalg.norm(item_representations, axis=1)
+ item_norms[item_norms == 0] = 1e-10
+ scores /= item_norms
+
+ best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
+ return pd.DataFrame(
+ sorted(zip(best, scores[best] / item_norms[item_id]), key=lambda x: -x[1])[1:],
+ columns=["itemID", "score"],
+ )
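+
+ # Usage sketch (illustration only; `model` is a fitted LightFM model and
+ # `item_features` is the CSR feature matrix used during fitting):
+ #
+ #   top10 = similar_items(item_id=0, item_features=item_features, model=model, N=10)
+ #   top10.head()  # columns: itemID, score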
+
+
+[docs]def prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights):
+ """Function to prepare test df for evaluation
+
+ Args:
+ test_idx (slice): slice of test indices
+ uids (numpy.ndarray): Array of internal user indices
+ iids (numpy.ndarray): Array of internal item indices
+ uid_map (dict): Keys to map internal user indices to external ids.
+ iid_map (dict): Keys to map internal item indices to external ids.
+ weights (numpy.float32 coo_matrix): user-item interaction
+
+ Returns:
+ pandas.DataFrame: user-item selected for testing
+ """
+ test_df = pd.DataFrame(
+ zip(
+ uids[test_idx],
+ iids[test_idx],
+ [list(uid_map.keys())[x] for x in uids[test_idx]],
+ [list(iid_map.keys())[x] for x in iids[test_idx]],
+ ),
+ columns=["uid", "iid", "userID", "itemID"],
+ )
+
+ dok_weights = weights.todok()
+ test_df["rating"] = test_df.apply(lambda x: dok_weights[x.uid, x.iid], axis=1)
+
+ return test_df[["userID", "itemID", "rating"]]
+
+
+[docs]def prepare_all_predictions(
+ data,
+ uid_map,
+ iid_map,
+ interactions,
+ model,
+ num_threads,
+ user_features=None,
+ item_features=None,
+):
+ """Function to prepare all predictions for evaluation.
+ Args:
+ data (pandas.DataFrame): dataframe of all users, items and ratings as loaded
+ uid_map (dict): Keys to map internal user indices to external ids.
+ iid_map (dict): Keys to map internal item indices to external ids.
+ interactions (np.float32 coo_matrix): user-item interaction
+ model (LightFM instance): fitted LightFM model
+ num_threads (int): number of parallel computation threads
+ user_features (np.float32 csr_matrix): User weights over features
+ item_features (np.float32 csr_matrix): Item weights over features
+ Returns:
+ pandas.DataFrame: all predictions
+ """
+ users, items, preds = [], [], [] # noqa: F841
+ item = list(data.itemID.unique())
+ for user in data.userID.unique():
+ user = [user] * len(item)
+ users.extend(user)
+ items.extend(item)
+ all_predictions = pd.DataFrame(data={"userID": users, "itemID": items})
+ all_predictions["uid"] = all_predictions.userID.map(uid_map)
+ all_predictions["iid"] = all_predictions.itemID.map(iid_map)
+
+ dok_weights = interactions.todok()
+ all_predictions["rating"] = all_predictions.apply(
+ lambda x: dok_weights[x.uid, x.iid], axis=1
+ )
+
+ all_predictions = all_predictions[all_predictions.rating < 1].reset_index(drop=True)
+ all_predictions = all_predictions.drop("rating", axis=1)
+
+ all_predictions["prediction"] = all_predictions.apply(
+ lambda x: model.predict(
+ user_ids=np.array([x["uid"]], dtype=np.int32),
+ item_ids=np.array([x["iid"]], dtype=np.int32),
+ user_features=user_features,
+ item_features=item_features,
+ num_threads=num_threads,
+ )[0],
+ axis=1,
+ )
+
+ return all_predictions[["userID", "itemID", "prediction"]]
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import logging
+import numpy as np
+import category_encoders as ce
+from tqdm import tqdm
+import collections
+import gc
+
+
+[docs]def unpackbits(x, num_bits):
+ """Convert a decimal value numpy.ndarray into multi-binary value numpy.ndarray ([1,2]->[[0,1],[1,0]])
+
+ Args:
+ x (numpy.ndarray): Decimal array.
+ num_bits (int): The max length of the converted binary value.
+
+ Returns:
+ numpy.ndarray: Array of shape x.shape + (num_bits,) containing the binary digits.
+ """
+ xshape = list(x.shape)
+ x = x.reshape([-1, 1])
+ to_and = 2 ** np.arange(num_bits).reshape([1, num_bits])
+ return (x & to_and).astype(bool).astype(int).reshape(xshape + [num_bits])
+
+
+[docs]class NumEncoder(object):
+ """Encode all the categorical features into numerical ones by sequential label encoding, sequential count encoding,
+ and binary encoding. Additionally, it also filters the low-frequency categories and fills the missing values.
+ """
+
+ def __init__(self, cate_cols, nume_cols, label_col, threshold=10, thresrate=0.99):
+ """Constructor.
+
+ Args:
+ cate_cols (list): The columns of categorical features.
+ nume_cols (list): The columns of numerical features.
+ label_col (object): The column of Label.
+ threshold (int): The categories whose frequency is lower than the threshold will be filtered (be treated
+ as "<LESS>").
+ thresrate (float): The (1.0 - thresrate, default 1%) lowest-frequency categories will also be filtered.
+ """
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [INFO] %(message)s")
+ self.label_name = label_col
+ self.cate_cols = cate_cols
+ self.dtype_dict = {}
+ for item in cate_cols:
+ self.dtype_dict[item] = "str"
+ for item in nume_cols:
+ self.dtype_dict[item] = "float"
+ self.nume_cols = nume_cols
+ self.tgt_nume_cols = []
+ self.encoder = ce.ordinal.OrdinalEncoder(cols=cate_cols)
+ self.threshold = threshold
+ self.thresrate = thresrate
+
+ self.save_cate_avgs = {}
+ self.save_value_filter = {}
+ self.save_num_embs = {}
+ self.Max_len = {}
+ self.samples = 0
+
+[docs] def fit_transform(self, df):
+ """Input a training set (pandas.DataFrame) and return the converted 2 numpy.ndarray (x,y).
+
+ Args:
+ df (pandas.DataFrame): Input dataframe
+
+ Returns:
+ numpy.ndarray, numpy.ndarray: New features and labels.
+ """
+ df = df.astype(dtype=self.dtype_dict)
+ self.samples = df.shape[0]
+ logging.info("Filtering and fillna features")
+ for item in tqdm(self.cate_cols):
+ value_counts = df[item].value_counts()
+ num = value_counts.shape[0]
+ self.save_value_filter[item] = list(
+ value_counts[: int(num * self.thresrate)][
+ value_counts > self.threshold
+ ].index
+ )
+ rm_values = set(value_counts.index) - set(self.save_value_filter[item])
+ df[item] = df[item].map(lambda x: "<LESS>" if x in rm_values else x)
+ df[item] = df[item].fillna("<UNK>")
+ del value_counts
+ gc.collect()
+
+ for item in tqdm(self.nume_cols):
+ df[item] = df[item].fillna(df[item].mean())
+ self.save_num_embs[item] = {"sum": df[item].sum(), "cnt": df[item].shape[0]}
+
+ logging.info("Ordinal encoding cate features")
+ # ordinal_encoding
+ df = self.encoder.fit_transform(df)
+
+ logging.info("Target encoding cate features")
+ # dynamic_targeting_encoding
+ for item in tqdm(self.cate_cols):
+ feats = df[item].values
+ labels = df[self.label_name].values
+ feat_encoding = {"mean": [], "count": []}
+ self.save_cate_avgs[item] = collections.defaultdict(lambda: [0, 0])
+ for idx in range(self.samples):
+ cur_feat = feats[idx]
+ if cur_feat in self.save_cate_avgs[item]:
+ feat_encoding["mean"].append(
+ self.save_cate_avgs[item][cur_feat][0]
+ / self.save_cate_avgs[item][cur_feat][1]
+ )
+ feat_encoding["count"].append(
+ self.save_cate_avgs[item][cur_feat][1] / idx
+ )
+ else:
+ feat_encoding["mean"].append(0)
+ feat_encoding["count"].append(0)
+ self.save_cate_avgs[item][cur_feat][0] += labels[idx]
+ self.save_cate_avgs[item][cur_feat][1] += 1
+ df[item + "_t_mean"] = feat_encoding["mean"]
+ df[item + "_t_count"] = feat_encoding["count"]
+ self.tgt_nume_cols.append(item + "_t_mean")
+ self.tgt_nume_cols.append(item + "_t_count")
+
+ logging.info("Start manual binary encoding")
+ rows = None
+ for item in tqdm(self.nume_cols + self.tgt_nume_cols):
+ feats = df[item].values
+ if rows is None:
+ rows = feats.reshape((-1, 1))
+ else:
+ rows = np.concatenate([rows, feats.reshape((-1, 1))], axis=1)
+ del feats
+ gc.collect()
+ for item in tqdm(self.cate_cols):
+ feats = df[item].values
+ Max = df[item].max()
+ bit_len = len(bin(Max)) - 2
+ samples = self.samples
+ self.Max_len[item] = bit_len
+ res = unpackbits(feats, bit_len).reshape((samples, -1))
+ rows = np.concatenate([rows, res], axis=1)
+ del feats
+ gc.collect()
+ trn_y = np.array(df[self.label_name].values).reshape((-1, 1))
+ del df
+ gc.collect()
+ trn_x = np.array(rows)
+ return trn_x, trn_y
+
+ # for test dataset
+[docs] def transform(self, df):
+ """Input a testing / validation set (pandas.DataFrame) and return the converted 2 numpy.ndarray (x,y).
+
+ Args:
+ df (pandas.DataFrame): Input dataframe
+
+ Returns:
+ numpy.ndarray, numpy.ndarray: New features and labels.
+ """
+ df = df.astype(dtype=self.dtype_dict)
+ samples = df.shape[0]
+ logging.info("Filtering and fillna features")
+ for item in tqdm(self.cate_cols):
+ value_counts = df[item].value_counts()
+ rm_values = set(value_counts.index) - set(self.save_value_filter[item])
+ df[item] = df[item].map(lambda x: "<LESS>" if x in rm_values else x)
+ df[item] = df[item].fillna("<UNK>")
+
+ for item in tqdm(self.nume_cols):
+ mean = self.save_num_embs[item]["sum"] / self.save_num_embs[item]["cnt"]
+ df[item] = df[item].fillna(mean)
+
+ logging.info("Ordinal encoding cate features")
+ # ordinal_encoding
+ df = self.encoder.transform(df)
+
+ logging.info("Target encoding cate features")
+ # dynamic_targeting_encoding
+ for item in tqdm(self.cate_cols):
+ avgs = self.save_cate_avgs[item]
+ df[item + "_t_mean"] = df[item].map(
+ lambda x: avgs[x][0] / avgs[x][1] if x in avgs else 0
+ )
+ df[item + "_t_count"] = df[item].map(
+ lambda x: avgs[x][1] / self.samples if x in avgs else 0
+ )
+
+ logging.info("Start manual binary encoding")
+ rows = None
+ for item in tqdm(self.nume_cols + self.tgt_nume_cols):
+ feats = df[item].values
+ if rows is None:
+ rows = feats.reshape((-1, 1))
+ else:
+ rows = np.concatenate([rows, feats.reshape((-1, 1))], axis=1)
+ del feats
+ gc.collect()
+ for item in tqdm(self.cate_cols):
+ feats = df[item].values
+ bit_len = self.Max_len[item]
+ res = unpackbits(feats, bit_len).reshape((samples, -1))
+ rows = np.concatenate([rows, res], axis=1)
+ del feats
+ gc.collect()
+ vld_y = np.array(df[self.label_name].values).reshape((-1, 1))
+ del df
+ gc.collect()
+ vld_x = np.array(rows)
+ return vld_x, vld_y
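+
+ # Minimal usage sketch (hypothetical column names, illustration only):
+ #
+ #   enc = NumEncoder(cate_cols=["site", "device"], nume_cols=["price"], label_col="label")
+ #   train_x, train_y = enc.fit_transform(train_df)
+ #   valid_x, valid_y = enc.transform(valid_df)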
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+from collections import OrderedDict
+import random
+import numpy as np
+import pandas as pd
+import csv
+import logging
+from tqdm import tqdm
+
+from recommenders.utils.constants import (
+ DEFAULT_ITEM_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_RATING_COL,
+)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+[docs]class EmptyFileException(Exception):
+ """Exception raised if file is empty"""
+
+
+[docs]class MissingFieldsException(Exception):
+ """Exception raised if file is missing expected fields"""
+
+
+[docs]class FileNotSortedException(Exception):
+ """Exception raised if file is not sorted correctly"""
+
+
+[docs]class MissingUserException(Exception):
+ """Exception raised if user is not in file"""
+
+
+[docs]class DataFile:
+ """
+ DataFile class for NCF. Iterator to read data from a csv file.
+ Data must be sorted by user. Includes utilities for loading user data from
+ file, formatting it and returning a Pandas dataframe.
+ """
+
+ def __init__(
+ self, filename, col_user, col_item, col_rating, col_test_batch=None, binary=True
+ ):
+ """Constructor
+
+ Args:
+ filename (str): Path to file to be processed.
+ col_user (str): User column name.
+ col_item (str): Item column name.
+ col_rating (str): Rating column name.
+ col_test_batch (str): Test batch column name.
+ binary (bool): If true, set rating > 0 to rating = 1.
+ """
+ self.filename = filename
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.col_test_batch = col_test_batch
+ self.expected_fields = [self.col_user, self.col_item, self.col_rating]
+ if self.col_test_batch is not None:
+ self.expected_fields.append(self.col_test_batch)
+ self.binary = binary
+ self._init_data()
+ self.id2user = {self.user2id[k]: k for k in self.user2id}
+ self.id2item = {self.item2id[k]: k for k in self.item2id}
+
+ @property
+ def users(self):
+ return self.user2id.keys()
+
+ @property
+ def items(self):
+ return self.item2id.keys()
+
+ @property
+ def end_of_file(self):
+ return (self.line_num > 0) and self.next_row is None
+
+ def __iter__(self):
+ return self
+
+ def __enter__(self, *args):
+ self.file = open(self.filename, "r", encoding="UTF8")
+ self.reader = csv.DictReader(self.file)
+ self._check_for_missing_fields(self.expected_fields)
+ self.line_num = 0
+ self.row, self.next_row = None, None
+ return self
+
+ def __exit__(self, *args):
+ self.file.close()
+ self.reader = None
+ self.line_num = 0
+ self.row, self.next_row = None, None
+
+ def __next__(self):
+ if self.next_row:
+ self.row = self.next_row
+ elif self.line_num == 0:
+ self.row = self._extract_row_data(next(self.reader, None))
+ if self.row is None:
+ raise EmptyFileException("{} is empty.".format(self.filename))
+ else:
+ raise StopIteration # end of file
+ self.next_row = self._extract_row_data(next(self.reader, None))
+ self.line_num += 1
+
+ return self.row
+
+ def _check_for_missing_fields(self, fields_to_check):
+ missing_fields = set(fields_to_check).difference(set(self.reader.fieldnames))
+ if len(missing_fields):
+ raise MissingFieldsException(
+ "Columns {} not in header of file {}".format(
+ missing_fields, self.filename
+ )
+ )
+
+ def _extract_row_data(self, row):
+ if row is None:
+ return row
+ user = int(row[self.col_user])
+ item = int(row[self.col_item])
+ rating = float(row[self.col_rating])
+ if self.binary:
+ rating = float(rating > 0)
+ test_batch = None
+ if self.col_test_batch:
+ test_batch = int(row[self.col_test_batch])
+ return {
+ self.col_user: user,
+ self.col_item: item,
+ self.col_rating: rating,
+ self.col_test_batch: test_batch,
+ }
+
+ def _init_data(self):
+ # Compile lists of unique users and items, assign IDs to users and items,
+ # and ensure file is sorted by user (and batch index if test set)
+ logger.info("Indexing {} ...".format(self.filename))
+ with self:
+ user_items = []
+ self.item2id, self.user2id = OrderedDict(), OrderedDict()
+ batch_index = 0
+ for _ in self:
+ item = self.row[self.col_item]
+ user = self.row[self.col_user]
+ test_batch = self.row[self.col_test_batch]
+ if not self.end_of_file:
+ next_user = self.next_row[self.col_user]
+ next_test_batch = self.next_row[self.col_test_batch]
+ if item not in self.items:
+ self.item2id[item] = len(self.item2id)
+ user_items.append(item)
+
+ if (next_user != user) or self.next_row is None:
+ if not self.end_of_file:
+ if next_user in self.users:
+ raise FileNotSortedException(
+ "File {} is not sorted by user".format(self.filename)
+ )
+ self.user2id[user] = len(self.user2id)
+ if self.col_test_batch:
+ if (next_test_batch != test_batch) or self.next_row is None:
+ if not self.end_of_file:
+ if next_test_batch < batch_index:
+ raise FileNotSortedException(
+ "File {} is not sorted by {}".format(
+ self.filename, self.col_test_batch
+ )
+ )
+ batch_index += 1
+ self.batch_indices_range = range(0, batch_index)
+ self.data_len = self.line_num
+
+[docs] def load_data(self, key, by_user=True):
+ """Load data for a specified user or test batch
+
+ Args:
+ key (int): user or test batch index
+ by_user (bool): load data by user if True, else by test batch
+
+ Returns:
+ pandas.DataFrame
+ """
+ records = []
+ key_col = self.col_user if by_user else self.col_test_batch
+
+ # fast forward in file to user/test batch
+ while (self.line_num == 0) or (self.row[key_col] != key):
+ if self.end_of_file:
+ raise MissingUserException(
+ "User {} not in file {}".format(key, self.filename)
+ )
+ next(self)
+ # collect user/test batch data
+ while self.row[key_col] == key:
+ row = self.row
+ if self.col_test_batch in row:
+ del row[self.col_test_batch]
+ records.append(row)
+ if not self.end_of_file:
+ next(self)
+ else:
+ break
+ return pd.DataFrame.from_records(records)
+
+
+[docs]class NegativeSampler:
+ """NegativeSampler class for NCF. Samples a subset of negative items from a given population of items."""
+
+ def __init__(
+ self,
+ user,
+ n_samples,
+ user_positive_item_pool,
+ item_pool,
+ sample_with_replacement,
+ print_warnings=True,
+ training=True,
+ ):
+ """Constructor
+
+ Args:
+ user (str or int): User to be sampled for.
+ n_samples (int): Number of required samples.
+ user_positive_item_pool (set): Set of items with which user has previously interacted.
+ item_pool (set): Set of all items in population.
+ sample_with_replacement (bool): If true, sample negative examples with replacement,
+ otherwise without replacement.
+ print_warnings (bool): If true, prints warnings if sampling without replacement and
+ there are not enough items to sample from to satisfy n_neg or n_neg_test.
+ training (bool): Set to true if sampling for the training set or false if for the test set.
+ """
+ self.user = user
+ self.n_samples = n_samples
+ self.user_positive_item_pool = user_positive_item_pool
+ self.item_pool = item_pool
+ self.sample_with_replacement = sample_with_replacement
+ self.print_warnings = print_warnings
+ self.training = training
+
+ self.user_negative_item_pool = self._get_user_negatives_pool()
+ self.population_size = len(self.user_negative_item_pool)
+ self._sample = (
+ self._sample_negatives_with_replacement
+ if self.sample_with_replacement
+ else self._sample_negatives_without_replacement
+ )
+ if not self.sample_with_replacement:
+ self._check_sample_size()
+
+[docs] def sample(self):
+ """Method for sampling uniformly from a population of negative items
+
+ Returns: list
+ """
+ return self._sample()
+
+ def _get_user_negatives_pool(self):
+ # get list of items user has not interacted with
+ return list(set(self.item_pool) - self.user_positive_item_pool)
+
+ def _sample_negatives_with_replacement(self):
+ return random.choices(self.user_negative_item_pool, k=self.n_samples)
+
+ def _sample_negatives_without_replacement(self):
+ return random.sample(self.user_negative_item_pool, k=self.n_samples)
+
+ def _check_sample_size(self):
+ # if sampling without replacement, check sample population is sufficient and reduce
+ # n_samples if not.
+ n_neg_var = "n_neg" if self.training else "n_neg_test"
+ dataset_name = "training" if self.training else "test"
+
+ k = min(self.n_samples, self.population_size)
+ if k < self.n_samples and self.print_warnings:
+ warning_string = (
+ "The population of negative items to sample from is too small for user {}. "
+ "Samples needed = {}, negative items = {}. "
+ "Reducing samples to {} for this user."
+ "If an equal number of negative samples for each user is required in the {} set, sample with replacement or reduce {}. "
+ "This warning can be turned off by setting print_warnings=False".format(
+ self.user,
+ self.n_samples,
+ self.population_size,
+ self.population_size,
+ dataset_name,
+ n_neg_var,
+ )
+ )
+ logging.warning(warning_string)
+ self.n_samples = k
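+
+ # Minimal usage sketch (toy values, illustration only):
+ #
+ #   sampler = NegativeSampler(
+ #       user=1,
+ #       n_samples=2,
+ #       user_positive_item_pool={10, 11},
+ #       item_pool={10, 11, 12, 13, 14},
+ #       sample_with_replacement=False,
+ #   )
+ #   negatives = sampler.sample()  # e.g. [13, 12]; never contains 10 or 11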
+
+
+[docs]class Dataset(object):
+ """Dataset class for NCF"""
+
+ def __init__(
+ self,
+ train_file,
+ test_file=None,
+ test_file_full=None,
+ overwrite_test_file_full=False,
+ n_neg=4,
+ n_neg_test=100,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ binary=True,
+ seed=None,
+ sample_with_replacement=False,
+ print_warnings=False,
+ ):
+ """Constructor
+
+ Args:
+ train_file (str): Path to training dataset file.
+ test_file (str): Path to test dataset file for leave-one-out evaluation.
+ test_file_full (str): Path to full test dataset file including negative samples.
+ overwrite_test_file_full (bool): If true, recreate and overwrite test_file_full.
+ n_neg (int): Number of negative samples per positive example for training set.
+ n_neg_test (int): Number of negative samples per positive example for test set.
+ col_user (str): User column name.
+ col_item (str): Item column name.
+ col_rating (str): Rating column name.
+ binary (bool): If true, set rating > 0 to rating = 1.
+ seed (int): Seed.
+ sample_with_replacement (bool): If true, sample negative examples with replacement,
+ otherwise without replacement.
+ print_warnings (bool): If true, prints warnings if sampling without replacement and
+ there are not enough items to sample from to satisfy n_neg or n_neg_test.
+ """
+ self.train_file = train_file
+ self.test_file = test_file
+ self.test_file_full = test_file_full
+ self.overwrite_test_file_full = overwrite_test_file_full
+ self.n_neg = n_neg
+ self.n_neg_test = n_neg_test
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.binary = binary
+ self.sample_with_replacement = sample_with_replacement
+ self.print_warnings = print_warnings
+
+ self.col_test_batch = "test_batch"
+
+ self.train_datafile = DataFile(
+ filename=self.train_file,
+ col_user=self.col_user,
+ col_item=self.col_item,
+ col_rating=self.col_rating,
+ binary=self.binary,
+ )
+
+ self.n_users = len(self.train_datafile.users)
+ self.n_items = len(self.train_datafile.items)
+ self.user2id = self.train_datafile.user2id
+ self.item2id = self.train_datafile.item2id
+ self.id2user = self.train_datafile.id2user
+ self.id2item = self.train_datafile.id2item
+ self.train_len = self.train_datafile.data_len
+
+ if self.test_file is not None:
+ self.test_datafile = DataFile(
+ filename=self.test_file,
+ col_user=self.col_user,
+ col_item=self.col_item,
+ col_rating=self.col_rating,
+ binary=self.binary,
+ )
+ if self.test_file_full is None:
+ self.test_file_full = os.path.splitext(self.test_file)[0] + "_full.csv"
+ if self.overwrite_test_file_full or not os.path.isfile(self.test_file_full):
+ self._create_test_file()
+ self.test_full_datafile = DataFile(
+ filename=self.test_file_full,
+ col_user=self.col_user,
+ col_item=self.col_item,
+ col_rating=self.col_rating,
+ col_test_batch=self.col_test_batch,
+ binary=self.binary,
+ )
+ # set random seed
+ random.seed(seed)
+
+ def _create_negative_examples_df(self, user, user_negative_samples):
+ # create dataframe containing negative examples for user assigned zero rating
+ n_samples = len(user_negative_samples)
+ return pd.DataFrame(
+ {
+ self.col_user: [user] * n_samples,
+ self.col_item: user_negative_samples,
+ self.col_rating: [0.0] * n_samples,
+ }
+ )
+
+ def _create_test_file(self):
+
+ logger.info(
+ "Creating full leave-one-out test file {} ...".format(self.test_file_full)
+ )
+
+ # create empty csv
+ pd.DataFrame(
+ columns=[self.col_user, self.col_item, self.col_rating, self.col_test_batch]
+ ).to_csv(self.test_file_full, index=False)
+
+ batch_idx = 0
+
+ with self.train_datafile as train_datafile:
+ with self.test_datafile as test_datafile:
+ for user in tqdm(test_datafile.users):
+ if user in train_datafile.users:
+ user_test_data = test_datafile.load_data(user)
+ user_train_data = train_datafile.load_data(user)
+ # for leave-one-out evaluation, exclude items seen in both training and test sets
+ # when sampling negatives
+ user_positive_item_pool = set(
+ user_test_data[self.col_item].unique()
+ ).union(user_train_data[self.col_item].unique())
+ sampler = NegativeSampler(
+ user,
+ self.n_neg_test,
+ user_positive_item_pool,
+ self.train_datafile.items,
+ self.sample_with_replacement,
+ self.print_warnings,
+ training=False,
+ )
+
+ user_examples_dfs = []
+ # sample n_neg_test negatives for each positive example and assign a batch index
+ for positive_example in np.array_split(
+ user_test_data, user_test_data.shape[0]
+ ):
+ negative_examples = self._create_negative_examples_df(
+ user, sampler.sample()
+ )
+ examples = pd.concat([positive_example, negative_examples])
+ examples[self.col_test_batch] = batch_idx
+ user_examples_dfs.append(examples)
+ batch_idx += 1
+ # append user test data to file
+ user_examples = pd.concat(user_examples_dfs)
+ user_examples.to_csv(
+ self.test_file_full, mode="a", index=False, header=False
+ )
+
+ def _split_into_batches(self, shuffle_buffer, batch_size):
+ for i in range(0, len(shuffle_buffer), batch_size):
+ yield shuffle_buffer[i : i + batch_size]
+
+ def _prepare_batch_with_id(self, batch):
+ return [
+ [self.user2id[user] for user in batch[self.col_user].values],
+ [self.item2id[item] for item in batch[self.col_item].values],
+ batch[self.col_rating].values.tolist(),
+ ]
+
+ def _prepare_batch_without_id(self, batch):
+ return [
+ batch[self.col_user].values.tolist(),
+ batch[self.col_item].values.tolist(),
+ batch[self.col_rating].values.tolist(),
+ ]
+
+ def _release_shuffle_buffer(
+ self, shuffle_buffer, batch_size, yield_id, write_to=None
+ ):
+ prepare_batch = (
+ self._prepare_batch_with_id if yield_id else self._prepare_batch_without_id
+ )
+ shuffle_buffer_df = pd.concat(shuffle_buffer)
+ shuffle_buffer_df = shuffle_buffer_df.sample(
+ shuffle_buffer_df.shape[0]
+ ) # shuffle the buffer
+ for batch in self._split_into_batches(shuffle_buffer_df, batch_size):
+ if batch.shape[0] == batch_size:
+ if write_to:
+ batch.to_csv(write_to, mode="a", header=False, index=False)
+ yield prepare_batch(batch)
+ else:
+ return batch
+
+[docs] def train_loader(
+ self, batch_size, shuffle_size=None, yield_id=False, write_to=None
+ ):
+ """
+ Generator for serving batches of training data. Positive examples are loaded from the
+ original training file, to which negative samples are added. Data is loaded in memory into a
+ shuffle buffer up to a maximum of shuffle_size rows, before the data is shuffled and released.
+ If out-of-memory errors are encountered, try reducing shuffle_size.
+
+ Args:
+ batch_size (int): Number of examples in each batch.
+ shuffle_size (int): Maximum number of examples in shuffle buffer.
+ yield_id (bool): If true, return assigned user and item IDs, else return original values.
+ write_to (str): Path of file to write full dataset (including negative examples).
+
+ Returns:
+ list
+ """
+
+ # if shuffle_size not supplied, use (estimated) full data size i.e. complete in-memory shuffle
+ if shuffle_size is None:
+ shuffle_size = self.train_len * (self.n_neg + 1)
+ if write_to:
+ pd.DataFrame(
+ columns=[self.col_user, self.col_item, self.col_rating]
+ ).to_csv(write_to, header=True, index=False)
+ shuffle_buffer = []
+
+ with self.train_datafile as train_datafile:
+ for user in train_datafile.users:
+ user_positive_examples = train_datafile.load_data(user)
+ user_positive_item_pool = set(
+ user_positive_examples[self.col_item].unique()
+ )
+ n_samples = self.n_neg * user_positive_examples.shape[0]
+ sampler = NegativeSampler(
+ user,
+ n_samples,
+ user_positive_item_pool,
+ self.train_datafile.items,
+ self.sample_with_replacement,
+ self.print_warnings,
+ )
+ user_negative_examples = self._create_negative_examples_df(
+ user, sampler.sample()
+ )
+ user_examples = pd.concat(
+ [user_positive_examples, user_negative_examples]
+ )
+ shuffle_buffer.append(user_examples)
+ shuffle_buffer_len = sum([df.shape[0] for df in shuffle_buffer])
+ if shuffle_buffer_len >= shuffle_size:
+ buffer_remainder = yield from self._release_shuffle_buffer(
+ shuffle_buffer, batch_size, yield_id, write_to
+ )
+ shuffle_buffer = (
+ [buffer_remainder] if buffer_remainder is not None else []
+ )
+ # yield remaining buffer
+ yield from self._release_shuffle_buffer(
+ shuffle_buffer, batch_size, yield_id, write_to
+ )
+
+[docs] def test_loader(self, yield_id=False):
+ """Generator for serving batches of test data for leave-one-out evaluation. Data is loaded from test_file_full.
+
+ Args:
+ yield_id (bool): If true, return assigned user and item IDs, else return original values.
+
+ Returns:
+ list
+ """
+ prepare_batch = (
+ self._prepare_batch_with_id if yield_id else self._prepare_batch_without_id
+ )
+
+ with self.test_full_datafile as test_full_datafile:
+ for test_batch_idx in test_full_datafile.batch_indices_range:
+ test_batch_data = test_full_datafile.load_data(
+ test_batch_idx, by_user=False
+ )
+ yield prepare_batch(test_batch_data)
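+
+ # Minimal usage sketch (illustration only; "train.csv" and "test.csv" are
+ # hypothetical files with userID,itemID,rating columns, sorted by user):
+ #
+ #   data = Dataset(train_file="train.csv", test_file="test.csv", seed=42)
+ #   for users, items, ratings in data.train_loader(batch_size=256, yield_id=True):
+ #       ...  # train on one batch
+ #   for users, items, ratings in data.test_loader(yield_id=True):
+ #       ...  # one leave-one-out test batch per positive example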
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import numpy as np
+import tensorflow as tf
+import tf_slim as slim
+from time import time
+import logging
+
+
+tf.compat.v1.disable_eager_execution()
+logger = logging.getLogger(__name__)
+MODEL_CHECKPOINT = "model.ckpt"
+
+
+[docs]class NCF:
+ """Neural Collaborative Filtering (NCF) implementation
+
+ :Citation:
+
+ He, Xiangnan, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu, and Tat-Seng Chua. "Neural collaborative filtering."
+ In Proceedings of the 26th International Conference on World Wide Web, pp. 173-182. International World Wide Web
+ Conferences Steering Committee, 2017. Link: https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
+ """
+
+ def __init__(
+ self,
+ n_users,
+ n_items,
+ model_type="NeuMF",
+ n_factors=8,
+ layer_sizes=[16, 8, 4],
+ n_epochs=50,
+ batch_size=64,
+ learning_rate=5e-3,
+ verbose=1,
+ seed=None,
+ ):
+ """Constructor
+
+ Args:
+ n_users (int): Number of users in the dataset.
+ n_items (int): Number of items in the dataset.
+            model_type (str): Model type, one of "GMF", "MLP" or "NeuMF".
+ n_factors (int): Dimension of latent space.
+            layer_sizes (list): Sizes of the MLP layers.
+ n_epochs (int): Number of epochs for training.
+ batch_size (int): Batch size.
+ learning_rate (float): Learning rate.
+            verbose (int): Interval (in epochs) for showing the training output; 0 disables logging.
+ seed (int): Seed.
+
+ """
+
+ # seed
+ tf.compat.v1.set_random_seed(seed)
+ np.random.seed(seed)
+ self.seed = seed
+
+ self.n_users = n_users
+ self.n_items = n_items
+ self.model_type = model_type.lower()
+ self.n_factors = n_factors
+ self.layer_sizes = layer_sizes
+ self.n_epochs = n_epochs
+ self.verbose = verbose
+ self.batch_size = batch_size
+ self.learning_rate = learning_rate
+
+ # check model type
+ model_options = ["gmf", "mlp", "neumf"]
+ if self.model_type not in model_options:
+ raise ValueError(
+ "Wrong model type, please select one of this list: {}".format(
+ model_options
+ )
+ )
+
+ # ncf layer input size
+ self.ncf_layer_size = n_factors + layer_sizes[-1]
+ # create ncf model
+ self._create_model()
+ # set GPU use with demand growth
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ # set TF Session
+ self.sess = tf.compat.v1.Session(
+ config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
+ )
+ # parameters initialization
+ self.sess.run(tf.compat.v1.global_variables_initializer())
+
+ def _create_model(
+ self,
+ ):
+ # reset graph
+ tf.compat.v1.reset_default_graph()
+
+ with tf.compat.v1.variable_scope("input_data", reuse=tf.compat.v1.AUTO_REUSE):
+
+ # input: index of users, items and ground truth
+ self.user_input = tf.compat.v1.placeholder(tf.int32, shape=[None, 1])
+ self.item_input = tf.compat.v1.placeholder(tf.int32, shape=[None, 1])
+ self.labels = tf.compat.v1.placeholder(tf.float32, shape=[None, 1])
+
+ with tf.compat.v1.variable_scope("embedding", reuse=tf.compat.v1.AUTO_REUSE):
+
+ # set embedding table
+ self.embedding_gmf_P = tf.Variable(
+ tf.random.truncated_normal(
+ shape=[self.n_users, self.n_factors],
+ mean=0.0,
+ stddev=0.01,
+ seed=self.seed,
+ ),
+ name="embedding_gmf_P",
+ dtype=tf.float32,
+ )
+
+ self.embedding_gmf_Q = tf.Variable(
+ tf.random.truncated_normal(
+ shape=[self.n_items, self.n_factors],
+ mean=0.0,
+ stddev=0.01,
+ seed=self.seed,
+ ),
+ name="embedding_gmf_Q",
+ dtype=tf.float32,
+ )
+
+ # set embedding table
+ self.embedding_mlp_P = tf.Variable(
+ tf.random.truncated_normal(
+ shape=[self.n_users, int(self.layer_sizes[0] / 2)],
+ mean=0.0,
+ stddev=0.01,
+ seed=self.seed,
+ ),
+ name="embedding_mlp_P",
+ dtype=tf.float32,
+ )
+
+ self.embedding_mlp_Q = tf.Variable(
+ tf.random.truncated_normal(
+ shape=[self.n_items, int(self.layer_sizes[0] / 2)],
+ mean=0.0,
+ stddev=0.01,
+ seed=self.seed,
+ ),
+ name="embedding_mlp_Q",
+ dtype=tf.float32,
+ )
+
+ with tf.compat.v1.variable_scope("gmf", reuse=tf.compat.v1.AUTO_REUSE):
+
+ # get user embedding p and item embedding q
+ self.gmf_p = tf.reduce_sum(
+ input_tensor=tf.nn.embedding_lookup(
+ params=self.embedding_gmf_P, ids=self.user_input
+ ),
+ axis=1,
+ )
+ self.gmf_q = tf.reduce_sum(
+ input_tensor=tf.nn.embedding_lookup(
+ params=self.embedding_gmf_Q, ids=self.item_input
+ ),
+ axis=1,
+ )
+
+ # get gmf vector
+ self.gmf_vector = self.gmf_p * self.gmf_q
+
+ with tf.compat.v1.variable_scope("mlp", reuse=tf.compat.v1.AUTO_REUSE):
+
+ # get user embedding p and item embedding q
+ self.mlp_p = tf.reduce_sum(
+ input_tensor=tf.nn.embedding_lookup(
+ params=self.embedding_mlp_P, ids=self.user_input
+ ),
+ axis=1,
+ )
+ self.mlp_q = tf.reduce_sum(
+ input_tensor=tf.nn.embedding_lookup(
+ params=self.embedding_mlp_Q, ids=self.item_input
+ ),
+ axis=1,
+ )
+
+ # concatenate user and item vector
+ output = tf.concat([self.mlp_p, self.mlp_q], 1)
+
+ # MLP Layers
+ for layer_size in self.layer_sizes[1:]:
+ output = slim.layers.fully_connected(
+ output,
+ num_outputs=layer_size,
+ activation_fn=tf.nn.relu,
+ weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0,
+ mode="fan_avg",
+ distribution="uniform",
+ seed=self.seed,
+ ),
+ )
+ self.mlp_vector = output
+
+ # self.output = tf.sigmoid(tf.reduce_sum(self.mlp_vector, axis=1, keepdims=True))
+
+ with tf.compat.v1.variable_scope("ncf", reuse=tf.compat.v1.AUTO_REUSE):
+
+ if self.model_type == "gmf":
+ # GMF only
+ output = slim.layers.fully_connected(
+ self.gmf_vector,
+ num_outputs=1,
+ activation_fn=None,
+ biases_initializer=None,
+ weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0,
+ mode="fan_avg",
+ distribution="uniform",
+ seed=self.seed,
+ ),
+ )
+ self.output = tf.sigmoid(output)
+
+ elif self.model_type == "mlp":
+ # MLP only
+ output = slim.layers.fully_connected(
+ self.mlp_vector,
+ num_outputs=1,
+ activation_fn=None,
+ biases_initializer=None,
+ weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0,
+ mode="fan_avg",
+ distribution="uniform",
+ seed=self.seed,
+ ),
+ )
+ self.output = tf.sigmoid(output)
+
+ elif self.model_type == "neumf":
+ # concatenate GMF and MLP vector
+ self.ncf_vector = tf.concat([self.gmf_vector, self.mlp_vector], 1)
+ # get predicted rating score
+ output = slim.layers.fully_connected(
+ self.ncf_vector,
+ num_outputs=1,
+ activation_fn=None,
+ biases_initializer=None,
+ weights_initializer=tf.compat.v1.keras.initializers.VarianceScaling(
+ scale=1.0,
+ mode="fan_avg",
+ distribution="uniform",
+ seed=self.seed,
+ ),
+ )
+ self.output = tf.sigmoid(output)
+
+ with tf.compat.v1.variable_scope("loss", reuse=tf.compat.v1.AUTO_REUSE):
+
+ # set loss function
+ self.loss = tf.compat.v1.losses.log_loss(self.labels, self.output)
+
+ with tf.compat.v1.variable_scope("optimizer", reuse=tf.compat.v1.AUTO_REUSE):
+
+ # set optimizer
+ self.optimizer = tf.compat.v1.train.AdamOptimizer(
+ learning_rate=self.learning_rate
+ ).minimize(self.loss)
+
+[docs] def save(self, dir_name):
+ """Save model parameters in `dir_name`
+
+ Args:
+            dir_name (str): Directory name (a folder path, not a file path).
+                The directory is created if it does not exist.
+ """
+ # save trained model
+ if not os.path.exists(dir_name):
+ os.makedirs(dir_name)
+ saver = tf.compat.v1.train.Saver()
+ saver.save(self.sess, os.path.join(dir_name, MODEL_CHECKPOINT))
+
+[docs] def load(self, gmf_dir=None, mlp_dir=None, neumf_dir=None, alpha=0.5):
+ """Load model parameters for further use.
+
+ GMF model --> load parameters in `gmf_dir`
+
+ MLP model --> load parameters in `mlp_dir`
+
+ NeuMF model --> load parameters in `neumf_dir` or in `gmf_dir` and `mlp_dir`
+
+ Args:
+ gmf_dir (str): Directory name for GMF model.
+ mlp_dir (str): Directory name for MLP model.
+            neumf_dir (str): Directory name for the NeuMF model.
+            alpha (float): Concatenation hyperparameter for the GMF and MLP output layers.
+
+        Raises:
+            NotImplementedError: If the model type does not match the provided directories.
+ """
+
+ # load pre-trained model
+ if self.model_type == "gmf" and gmf_dir is not None:
+ saver = tf.compat.v1.train.Saver()
+ saver.restore(self.sess, os.path.join(gmf_dir, MODEL_CHECKPOINT))
+
+ elif self.model_type == "mlp" and mlp_dir is not None:
+ saver = tf.compat.v1.train.Saver()
+ saver.restore(self.sess, os.path.join(mlp_dir, MODEL_CHECKPOINT))
+
+ elif self.model_type == "neumf" and neumf_dir is not None:
+ saver = tf.compat.v1.train.Saver()
+ saver.restore(self.sess, os.path.join(neumf_dir, MODEL_CHECKPOINT))
+
+ elif self.model_type == "neumf" and gmf_dir is not None and mlp_dir is not None:
+ # load neumf using gmf and mlp
+ self._load_neumf(gmf_dir, mlp_dir, alpha)
+
+ else:
+ raise NotImplementedError
+
+ def _load_neumf(self, gmf_dir, mlp_dir, alpha):
+ """Load gmf and mlp model parameters for further use in NeuMF.
+ NeuMF model --> load parameters in `gmf_dir` and `mlp_dir`
+ """
+ # load gmf part
+ variables = tf.compat.v1.global_variables()
+ # get variables with 'gmf'
+ var_flow_restore = [
+ val for val in variables if "gmf" in val.name and "ncf" not in val.name
+ ]
+ # load 'gmf' variable
+ saver = tf.compat.v1.train.Saver(var_flow_restore)
+ # restore
+ saver.restore(self.sess, os.path.join(gmf_dir, MODEL_CHECKPOINT))
+
+ # load mlp part
+ variables = tf.compat.v1.global_variables()
+        # get variables with 'mlp'
+ var_flow_restore = [
+ val for val in variables if "mlp" in val.name and "ncf" not in val.name
+ ]
+        # load 'mlp' variables
+ saver = tf.compat.v1.train.Saver(var_flow_restore)
+ # restore
+ saver.restore(self.sess, os.path.join(mlp_dir, MODEL_CHECKPOINT))
+
+ # concat pretrain h_from_gmf and h_from_mlp
+ vars_list = tf.compat.v1.get_collection(
+ tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="ncf"
+ )
+
+ assert len(vars_list) == 1
+ ncf_fc = vars_list[0]
+
+ # get weight from gmf and mlp
+ gmf_fc = tf.train.load_variable(gmf_dir, ncf_fc.name)
+ mlp_fc = tf.train.load_variable(mlp_dir, ncf_fc.name)
+
+ # load fc layer by tf.concat
+ assign_op = tf.compat.v1.assign(
+ ncf_fc, tf.concat([alpha * gmf_fc, (1 - alpha) * mlp_fc], axis=0)
+ )
+ self.sess.run(assign_op)
+
+[docs] def fit(self, data):
+ """Fit model with training data
+
+ Args:
+            data (NCFDataset): Initialized Dataset from ./dataset.py.
+ """
+
+ # get user and item mapping dict
+ self.user2id = data.user2id
+ self.item2id = data.item2id
+ self.id2user = data.id2user
+ self.id2item = data.id2item
+
+ # loop for n_epochs
+ for epoch_count in range(1, self.n_epochs + 1):
+
+ # negative sampling for training
+ train_begin = time()
+
+ # initialize
+ train_loss = []
+
+ # calculate loss and update NCF parameters
+ for user_input, item_input, labels in data.train_loader(self.batch_size):
+
+ user_input = np.array([self.user2id[x] for x in user_input])
+ item_input = np.array([self.item2id[x] for x in item_input])
+ labels = np.array(labels)
+
+ feed_dict = {
+ self.user_input: user_input[..., None],
+ self.item_input: item_input[..., None],
+ self.labels: labels[..., None],
+ }
+
+ # get loss and execute optimization
+ loss, _ = self.sess.run([self.loss, self.optimizer], feed_dict)
+ train_loss.append(loss)
+ train_time = time() - train_begin
+
+ # output every self.verbose
+ if self.verbose and epoch_count % self.verbose == 0:
+ logger.info(
+ "Epoch %d [%.2fs]: train_loss = %.6f "
+ % (epoch_count, train_time, sum(train_loss) / len(train_loss))
+ )
+
+[docs] def predict(self, user_input, item_input, is_list=False):
+ """Predict function of this trained model
+
+ Args:
+ user_input (list or element of list): userID or userID list
+ item_input (list or element of list): itemID or itemID list
+ is_list (bool): if true, the input is list type
+ noting that list-wise type prediction is faster than element-wise's.
+
+ Returns:
+ list or float: A list of predicted rating or predicted rating score.
+ """
+
+ if is_list:
+ output = self._predict(user_input, item_input)
+ return list(output.reshape(-1))
+
+ else:
+ output = self._predict(np.array([user_input]), np.array([item_input]))
+ return float(output.reshape(-1)[0])
+
+ def _predict(self, user_input, item_input):
+
+ # index converting
+ user_input = np.array([self.user2id[x] for x in user_input])
+ item_input = np.array([self.item2id[x] for x in item_input])
+
+ # get feed dict
+ feed_dict = {
+ self.user_input: user_input[..., None],
+ self.item_input: item_input[..., None],
+ }
+
+ # calculate predicted score
+ return self.sess.run(self.output, feed_dict)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import numpy as np
+import pickle
+
+from recommenders.models.deeprec.io.iterator import BaseIterator
+from recommenders.models.newsrec.newsrec_utils import word_tokenize, newsample
+
+__all__ = ["MINDAllIterator"]
+
+
+[docs]class MINDAllIterator(BaseIterator):
+ """Train data loader for NAML model.
+ The model require a special type of data format, where each instance contains a label, impresion id, user id,
+ the candidate news articles and user's clicked news article. Articles are represented by title words,
+ body words, verts and subverts.
+
+ Iterator will not load the whole data into memory. Instead, it loads data into memory
+ per mini-batch, so that large files can be used as input data.
+
+ Attributes:
+ col_spliter (str): column spliter in one line.
+ ID_spliter (str): ID spliter in one line.
+ batch_size (int): the samples num in one batch.
+ title_size (int): max word num in news title.
+ body_size (int): max word num in news body (abstract used in MIND).
+ his_size (int): max clicked news num in user click history.
+ npratio (int): negaive and positive ratio used in negative sampling. -1 means no need of negtive sampling.
+ """
+
+ def __init__(
+ self,
+ hparams,
+ npratio=-1,
+ col_spliter="\t",
+ ID_spliter="%",
+ ):
+ """Initialize an iterator. Create necessary placeholders for the model.
+
+ Args:
+            hparams (object): Global hyper-parameters. Some key settings such as head_num and head_dim are there.
+            npratio (int): negative to positive sample ratio used in negative sampling; -1 means no negative sampling.
+            col_spliter (str): column splitter in one line.
+            ID_spliter (str): ID splitter in one line.
+ """
+ self.col_spliter = col_spliter
+ self.ID_spliter = ID_spliter
+ self.batch_size = hparams.batch_size
+ self.title_size = hparams.title_size
+ self.body_size = hparams.body_size
+ self.his_size = hparams.his_size
+ self.npratio = npratio
+
+ self.word_dict = self.load_dict(hparams.wordDict_file)
+ self.vert_dict = self.load_dict(hparams.vertDict_file)
+ self.subvert_dict = self.load_dict(hparams.subvertDict_file)
+ self.uid2index = self.load_dict(hparams.userDict_file)
+
+[docs] def load_dict(self, file_path):
+ """Load pickled file
+
+ Args:
+ file path (str): File path
+
+ Returns:
+ object: pickle load obj
+ """
+ with open(file_path, "rb") as f:
+ return pickle.load(f)
+
+[docs] def init_news(self, news_file):
+ """Init news information given news file, such as `news_title_index`, `news_abstract_index`.
+
+ Args:
+ news_file: path of news file
+ """
+ self.nid2index = {}
+ news_title = [""]
+ news_ab = [""]
+ news_vert = [""]
+ news_subvert = [""]
+
+ with tf.io.gfile.GFile(news_file, "r") as rd:
+ for line in rd:
+ nid, vert, subvert, title, ab, url, _, _ = line.strip("\n").split(
+ self.col_spliter
+ )
+
+ if nid in self.nid2index:
+ continue
+
+ self.nid2index[nid] = len(self.nid2index) + 1
+ title = word_tokenize(title)
+ ab = word_tokenize(ab)
+ news_title.append(title)
+ news_ab.append(ab)
+ news_vert.append(vert)
+ news_subvert.append(subvert)
+
+ self.news_title_index = np.zeros(
+ (len(news_title), self.title_size), dtype="int32"
+ )
+
+ self.news_ab_index = np.zeros((len(news_ab), self.body_size), dtype="int32")
+ self.news_vert_index = np.zeros((len(news_vert), 1), dtype="int32")
+ self.news_subvert_index = np.zeros((len(news_subvert), 1), dtype="int32")
+
+ for news_index in range(len(news_title)):
+ title = news_title[news_index]
+ ab = news_ab[news_index]
+ vert = news_vert[news_index]
+ subvert = news_subvert[news_index]
+ for word_index in range(min(self.title_size, len(title))):
+ if title[word_index] in self.word_dict:
+ self.news_title_index[news_index, word_index] = self.word_dict[
+ title[word_index].lower()
+ ]
+ for word_index_ab in range(min(self.body_size, len(ab))):
+ if ab[word_index_ab] in self.word_dict:
+ self.news_ab_index[news_index, word_index_ab] = self.word_dict[
+ ab[word_index_ab].lower()
+ ]
+ if vert in self.vert_dict:
+ self.news_vert_index[news_index, 0] = self.vert_dict[vert]
+ if subvert in self.subvert_dict:
+ self.news_subvert_index[news_index, 0] = self.subvert_dict[subvert]
+
+[docs] def init_behaviors(self, behaviors_file):
+ """Init behavior logs given behaviors file.
+
+ Args:
+ behaviors_file (str): path of behaviors file
+ """
+ self.histories = []
+ self.imprs = []
+ self.labels = []
+ self.impr_indexes = []
+ self.uindexes = []
+
+ with tf.io.gfile.GFile(behaviors_file, "r") as rd:
+ impr_index = 0
+ for line in rd:
+ uid, time, history, impr = line.strip("\n").split(self.col_spliter)[-4:]
+
+ history = [self.nid2index[i] for i in history.split()]
+ history = [0] * (self.his_size - len(history)) + history[
+ : self.his_size
+ ]
+
+ impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
+ label = [int(i.split("-")[1]) for i in impr.split()]
+ uindex = self.uid2index[uid] if uid in self.uid2index else 0
+
+ self.histories.append(history)
+ self.imprs.append(impr_news)
+ self.labels.append(label)
+ self.impr_indexes.append(impr_index)
+ self.uindexes.append(uindex)
+ impr_index += 1
+
+[docs] def parser_one_line(self, line):
+ """Parse one string line into feature values.
+
+ Args:
+ line (str): a string indicating one instance.
+
+ Yields:
+ list: Parsed results including label, impression id , user id,
+ candidate_title_index, clicked_title_index,
+ candidate_ab_index, clicked_ab_index,
+ candidate_vert_index, clicked_vert_index,
+ candidate_subvert_index, clicked_subvert_index,
+ """
+ if self.npratio > 0:
+ impr_label = self.labels[line]
+ impr = self.imprs[line]
+
+ poss = []
+ negs = []
+
+ for news, click in zip(impr, impr_label):
+ if click == 1:
+ poss.append(news)
+ else:
+ negs.append(news)
+
+ for p in poss:
+ candidate_title_index = []
+ impr_index = []
+ user_index = []
+ label = [1] + [0] * self.npratio
+
+ n = newsample(negs, self.npratio)
+ candidate_title_index = self.news_title_index[[p] + n]
+ candidate_ab_index = self.news_ab_index[[p] + n]
+ candidate_vert_index = self.news_vert_index[[p] + n]
+ candidate_subvert_index = self.news_subvert_index[[p] + n]
+ click_title_index = self.news_title_index[self.histories[line]]
+ click_ab_index = self.news_ab_index[self.histories[line]]
+ click_vert_index = self.news_vert_index[self.histories[line]]
+ click_subvert_index = self.news_subvert_index[self.histories[line]]
+ impr_index.append(self.impr_indexes[line])
+ user_index.append(self.uindexes[line])
+
+ yield (
+ label,
+ impr_index,
+ user_index,
+ candidate_title_index,
+ candidate_ab_index,
+ candidate_vert_index,
+ candidate_subvert_index,
+ click_title_index,
+ click_ab_index,
+ click_vert_index,
+ click_subvert_index,
+ )
+
+ else:
+ impr_label = self.labels[line]
+ impr = self.imprs[line]
+
+ for news, label in zip(impr, impr_label):
+ candidate_title_index = []
+ impr_index = []
+ user_index = []
+ label = [label]
+
+                candidate_title_index = self.news_title_index[news]
+ candidate_ab_index = self.news_ab_index[news]
+ candidate_vert_index = self.news_vert_index[news]
+ candidate_subvert_index = self.news_subvert_index[news]
+ click_title_index = self.news_title_index[self.histories[line]]
+ click_ab_index = self.news_ab_index[self.histories[line]]
+ click_vert_index = self.news_vert_index[self.histories[line]]
+ click_subvert_index = self.news_subvert_index[self.histories[line]]
+ impr_index.append(self.impr_indexes[line])
+ user_index.append(self.uindexes[line])
+
+ yield (
+ label,
+ impr_index,
+ user_index,
+ candidate_title_index,
+ candidate_ab_index,
+ candidate_vert_index,
+ candidate_subvert_index,
+ click_title_index,
+ click_ab_index,
+ click_vert_index,
+ click_subvert_index,
+ )
+
+[docs] def load_data_from_file(self, news_file, behavior_file):
+ """Read and parse data from a file.
+
+ Args:
+ news_file (str): A file contains several informations of news.
+ beahaviros_file (str): A file contains information of user impressions.
+
+ Yields:
+ object: An iterator that yields parsed results, in the format of graph feed_dict.
+ """
+
+ if not hasattr(self, "news_title_index"):
+ self.init_news(news_file)
+
+ if not hasattr(self, "impr_indexes"):
+ self.init_behaviors(behavior_file)
+
+ label_list = []
+ imp_indexes = []
+ user_indexes = []
+ candidate_title_indexes = []
+ candidate_ab_indexes = []
+ candidate_vert_indexes = []
+ candidate_subvert_indexes = []
+ click_title_indexes = []
+ click_ab_indexes = []
+ click_vert_indexes = []
+ click_subvert_indexes = []
+ cnt = 0
+
+ indexes = np.arange(len(self.labels))
+
+ if self.npratio > 0:
+ np.random.shuffle(indexes)
+
+ for index in indexes:
+ for (
+ label,
+ impr_index,
+ user_index,
+ candidate_title_index,
+ candidate_ab_index,
+ candidate_vert_index,
+ candidate_subvert_index,
+ click_title_index,
+ click_ab_index,
+ click_vert_index,
+ click_subvert_index,
+ ) in self.parser_one_line(index):
+ candidate_title_indexes.append(candidate_title_index)
+ candidate_ab_indexes.append(candidate_ab_index)
+ candidate_vert_indexes.append(candidate_vert_index)
+ candidate_subvert_indexes.append(candidate_subvert_index)
+ click_title_indexes.append(click_title_index)
+ click_ab_indexes.append(click_ab_index)
+ click_vert_indexes.append(click_vert_index)
+ click_subvert_indexes.append(click_subvert_index)
+ imp_indexes.append(impr_index)
+ user_indexes.append(user_index)
+ label_list.append(label)
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ yield self._convert_data(
+ label_list,
+ imp_indexes,
+ user_indexes,
+ candidate_title_indexes,
+ candidate_ab_indexes,
+ candidate_vert_indexes,
+ candidate_subvert_indexes,
+ click_title_indexes,
+ click_ab_indexes,
+ click_vert_indexes,
+ click_subvert_indexes,
+ )
+ label_list = []
+ imp_indexes = []
+ user_indexes = []
+ candidate_title_indexes = []
+ candidate_ab_indexes = []
+ candidate_vert_indexes = []
+ candidate_subvert_indexes = []
+ click_title_indexes = []
+ click_ab_indexes = []
+ click_vert_indexes = []
+ click_subvert_indexes = []
+ cnt = 0
+
+ def _convert_data(
+ self,
+ label_list,
+ imp_indexes,
+ user_indexes,
+ candidate_title_indexes,
+ candidate_ab_indexes,
+ candidate_vert_indexes,
+ candidate_subvert_indexes,
+ click_title_indexes,
+ click_ab_indexes,
+ click_vert_indexes,
+ click_subvert_indexes,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+ label_list (list): a list of ground-truth labels.
+ imp_indexes (list): a list of impression indexes.
+ user_indexes (list): a list of user indexes.
+            candidate_title_indexes (list): the candidate news titles' word indices.
+            candidate_ab_indexes (list): the candidate news abstracts' word indices.
+            candidate_vert_indexes (list): the candidate news verts' indices.
+            candidate_subvert_indexes (list): the candidate news subverts' indices.
+            click_title_indexes (list): word indices of the user's clicked news titles.
+            click_ab_indexes (list): word indices of the user's clicked news abstracts.
+            click_vert_indexes (list): indices of the user's clicked news verts.
+            click_subvert_indexes (list): indices of the user's clicked news subverts.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+
+ labels = np.asarray(label_list, dtype=np.float32)
+ imp_indexes = np.asarray(imp_indexes, dtype=np.int32)
+ user_indexes = np.asarray(user_indexes, dtype=np.int32)
+ candidate_title_index_batch = np.asarray(
+ candidate_title_indexes, dtype=np.int64
+ )
+ candidate_ab_index_batch = np.asarray(candidate_ab_indexes, dtype=np.int64)
+ candidate_vert_index_batch = np.asarray(candidate_vert_indexes, dtype=np.int64)
+ candidate_subvert_index_batch = np.asarray(
+ candidate_subvert_indexes, dtype=np.int64
+ )
+ click_title_index_batch = np.asarray(click_title_indexes, dtype=np.int64)
+ click_ab_index_batch = np.asarray(click_ab_indexes, dtype=np.int64)
+ click_vert_index_batch = np.asarray(click_vert_indexes, dtype=np.int64)
+ click_subvert_index_batch = np.asarray(click_subvert_indexes, dtype=np.int64)
+ return {
+ "impression_index_batch": imp_indexes,
+ "user_index_batch": user_indexes,
+ "clicked_title_batch": click_title_index_batch,
+ "clicked_ab_batch": click_ab_index_batch,
+ "clicked_vert_batch": click_vert_index_batch,
+ "clicked_subvert_batch": click_subvert_index_batch,
+ "candidate_title_batch": candidate_title_index_batch,
+ "candidate_ab_batch": candidate_ab_index_batch,
+ "candidate_vert_batch": candidate_vert_index_batch,
+ "candidate_subvert_batch": candidate_subvert_index_batch,
+ "labels": labels,
+ }
+
+[docs] def load_user_from_file(self, news_file, behavior_file):
+ """Read and parse user data from news file and behavior file.
+
+ Args:
+            news_file (str): A file containing information about news articles.
+            behavior_file (str): A file containing information about user impressions.
+
+ Yields:
+ object: An iterator that yields parsed user feature, in the format of dict.
+ """
+
+ if not hasattr(self, "news_title_index"):
+ self.init_news(news_file)
+
+ if not hasattr(self, "impr_indexes"):
+ self.init_behaviors(behavior_file)
+
+ user_indexes = []
+ impr_indexes = []
+ click_title_indexes = []
+ click_ab_indexes = []
+ click_vert_indexes = []
+ click_subvert_indexes = []
+ cnt = 0
+
+ for index in range(len(self.impr_indexes)):
+ click_title_indexes.append(self.news_title_index[self.histories[index]])
+ click_ab_indexes.append(self.news_ab_index[self.histories[index]])
+ click_vert_indexes.append(self.news_vert_index[self.histories[index]])
+ click_subvert_indexes.append(self.news_subvert_index[self.histories[index]])
+ user_indexes.append(self.uindexes[index])
+ impr_indexes.append(self.impr_indexes[index])
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ yield self._convert_user_data(
+ user_indexes,
+ impr_indexes,
+ click_title_indexes,
+ click_ab_indexes,
+ click_vert_indexes,
+ click_subvert_indexes,
+ )
+ user_indexes = []
+ impr_indexes = []
+ click_title_indexes = []
+ click_ab_indexes = []
+ click_vert_indexes = []
+ click_subvert_indexes = []
+
+ def _convert_user_data(
+ self,
+ user_indexes,
+ impr_indexes,
+ click_title_indexes,
+ click_ab_indexes,
+ click_vert_indexes,
+ click_subvert_indexes,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+            user_indexes (list): a list of user indexes.
+            impr_indexes (list): a list of impression indexes.
+            click_title_indexes (list): word indices of the user's clicked news titles.
+            click_ab_indexes (list): word indices of the user's clicked news abstracts.
+            click_vert_indexes (list): indices of the user's clicked news verts.
+            click_subvert_indexes (list): indices of the user's clicked news subverts.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+
+ user_indexes = np.asarray(user_indexes, dtype=np.int32)
+ impr_indexes = np.asarray(impr_indexes, dtype=np.int32)
+ click_title_index_batch = np.asarray(click_title_indexes, dtype=np.int64)
+ click_ab_index_batch = np.asarray(click_ab_indexes, dtype=np.int64)
+ click_vert_index_batch = np.asarray(click_vert_indexes, dtype=np.int64)
+ click_subvert_index_batch = np.asarray(click_subvert_indexes, dtype=np.int64)
+
+ return {
+ "user_index_batch": user_indexes,
+ "impr_index_batch": impr_indexes,
+ "clicked_title_batch": click_title_index_batch,
+ "clicked_ab_batch": click_ab_index_batch,
+ "clicked_vert_batch": click_vert_index_batch,
+ "clicked_subvert_batch": click_subvert_index_batch,
+ }
+
+[docs] def load_news_from_file(self, news_file):
+ """Read and parse user data from news file.
+
+ Args:
+ news_file (str): A file contains several informations of news.
+
+ Yields:
+ object: An iterator that yields parsed news feature, in the format of dict.
+ """
+ if not hasattr(self, "news_title_index"):
+ self.init_news(news_file)
+
+ news_indexes = []
+ candidate_title_indexes = []
+ candidate_ab_indexes = []
+ candidate_vert_indexes = []
+ candidate_subvert_indexes = []
+ cnt = 0
+
+ for index in range(len(self.news_title_index)):
+ news_indexes.append(index)
+ candidate_title_indexes.append(self.news_title_index[index])
+ candidate_ab_indexes.append(self.news_ab_index[index])
+ candidate_vert_indexes.append(self.news_vert_index[index])
+ candidate_subvert_indexes.append(self.news_subvert_index[index])
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ yield self._convert_news_data(
+ news_indexes,
+ candidate_title_indexes,
+ candidate_ab_indexes,
+ candidate_vert_indexes,
+ candidate_subvert_indexes,
+ )
+ news_indexes = []
+ candidate_title_indexes = []
+ candidate_ab_indexes = []
+ candidate_vert_indexes = []
+ candidate_subvert_indexes = []
+
+ def _convert_news_data(
+ self,
+ news_indexes,
+ candidate_title_indexes,
+ candidate_ab_indexes,
+ candidate_vert_indexes,
+ candidate_subvert_indexes,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+ news_indexes (list): a list of news indexes.
+            candidate_title_indexes (list): the candidate news titles' word indices.
+            candidate_ab_indexes (list): the candidate news abstracts' word indices.
+            candidate_vert_indexes (list): the candidate news verts' indices.
+            candidate_subvert_indexes (list): the candidate news subverts' indices.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+
+ news_indexes_batch = np.asarray(news_indexes, dtype=np.int32)
+ candidate_title_index_batch = np.asarray(
+ candidate_title_indexes, dtype=np.int32
+ )
+ candidate_ab_index_batch = np.asarray(candidate_ab_indexes, dtype=np.int32)
+ candidate_vert_index_batch = np.asarray(candidate_vert_indexes, dtype=np.int32)
+ candidate_subvert_index_batch = np.asarray(
+ candidate_subvert_indexes, dtype=np.int32
+ )
+
+ return {
+ "news_index_batch": news_indexes_batch,
+ "candidate_title_batch": candidate_title_index_batch,
+ "candidate_ab_batch": candidate_ab_index_batch,
+ "candidate_vert_batch": candidate_vert_index_batch,
+ "candidate_subvert_batch": candidate_subvert_index_batch,
+ }
+
+[docs] def load_impression_from_file(self, behaivors_file):
+ """Read and parse impression data from behaivors file.
+
+ Args:
+ behaivors_file (str): A file contains several informations of behaviros.
+
+ Yields:
+ object: An iterator that yields parsed impression data, in the format of dict.
+ """
+
+ if not hasattr(self, "histories"):
+ self.init_behaviors(behaivors_file)
+
+ indexes = np.arange(len(self.labels))
+
+ for index in indexes:
+ impr_label = np.array(self.labels[index], dtype="int32")
+ impr_news = np.array(self.imprs[index], dtype="int32")
+
+ yield (
+ self.impr_indexes[index],
+ impr_news,
+ self.uindexes[index],
+ impr_label,
+ )
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import numpy as np
+import pickle
+
+from recommenders.models.deeprec.io.iterator import BaseIterator
+from recommenders.models.newsrec.newsrec_utils import word_tokenize, newsample
+
+__all__ = ["MINDIterator"]
+
+
+[docs]class MINDIterator(BaseIterator):
+ """Train data loader for NAML model.
+ The model require a special type of data format, where each instance contains a label, impresion id, user id,
+ the candidate news articles and user's clicked news article. Articles are represented by title words,
+ body words, verts and subverts.
+
+ Iterator will not load the whole data into memory. Instead, it loads data into memory
+ per mini-batch, so that large files can be used as input data.
+
+ Attributes:
+ col_spliter (str): column spliter in one line.
+ ID_spliter (str): ID spliter in one line.
+ batch_size (int): the samples num in one batch.
+ title_size (int): max word num in news title.
+ his_size (int): max clicked news num in user click history.
+ npratio (int): negaive and positive ratio used in negative sampling. -1 means no need of negtive sampling.
+ """
+
+ def __init__(
+ self,
+ hparams,
+ npratio=-1,
+ col_spliter="\t",
+ ID_spliter="%",
+ ):
+ """Initialize an iterator. Create necessary placeholders for the model.
+
+ Args:
+            hparams (object): Global hyper-parameters. Some key settings such as head_num and head_dim are there.
+            npratio (int): negative to positive sample ratio used in negative sampling; -1 means no negative sampling.
+            col_spliter (str): column splitter in one line.
+            ID_spliter (str): ID splitter in one line.
+ """
+ self.col_spliter = col_spliter
+ self.ID_spliter = ID_spliter
+ self.batch_size = hparams.batch_size
+ self.title_size = hparams.title_size
+ self.his_size = hparams.his_size
+ self.npratio = npratio
+
+ self.word_dict = self.load_dict(hparams.wordDict_file)
+ self.uid2index = self.load_dict(hparams.userDict_file)
+
+[docs] def load_dict(self, file_path):
+ """load pickle file
+
+ Args:
+ file path (str): file path
+
+ Returns:
+ object: pickle loaded object
+ """
+ with open(file_path, "rb") as f:
+ return pickle.load(f)
+
+[docs] def init_news(self, news_file):
+ """init news information given news file, such as news_title_index and nid2index.
+ Args:
+ news_file: path of news file
+ """
+
+ self.nid2index = {}
+ news_title = [""]
+
+ with tf.io.gfile.GFile(news_file, "r") as rd:
+ for line in rd:
+ nid, vert, subvert, title, ab, url, _, _ = line.strip("\n").split(
+ self.col_spliter
+ )
+
+ if nid in self.nid2index:
+ continue
+
+ self.nid2index[nid] = len(self.nid2index) + 1
+ title = word_tokenize(title)
+ news_title.append(title)
+
+ self.news_title_index = np.zeros(
+ (len(news_title), self.title_size), dtype="int32"
+ )
+
+ for news_index in range(len(news_title)):
+ title = news_title[news_index]
+ for word_index in range(min(self.title_size, len(title))):
+ if title[word_index] in self.word_dict:
+ self.news_title_index[news_index, word_index] = self.word_dict[
+ title[word_index].lower()
+ ]
+
+[docs] def init_behaviors(self, behaviors_file):
+ """init behavior logs given behaviors file.
+
+ Args:
+ behaviors_file: path of behaviors file
+ """
+ self.histories = []
+ self.imprs = []
+ self.labels = []
+ self.impr_indexes = []
+ self.uindexes = []
+
+ with tf.io.gfile.GFile(behaviors_file, "r") as rd:
+ impr_index = 0
+ for line in rd:
+ uid, time, history, impr = line.strip("\n").split(self.col_spliter)[-4:]
+
+ history = [self.nid2index[i] for i in history.split()]
+ history = [0] * (self.his_size - len(history)) + history[
+ : self.his_size
+ ]
+
+ impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
+ label = [int(i.split("-")[1]) for i in impr.split()]
+ uindex = self.uid2index[uid] if uid in self.uid2index else 0
+
+ self.histories.append(history)
+ self.imprs.append(impr_news)
+ self.labels.append(label)
+ self.impr_indexes.append(impr_index)
+ self.uindexes.append(uindex)
+ impr_index += 1
+
+[docs] def parser_one_line(self, line):
+ """Parse one behavior sample into feature values.
+ if npratio is larger than 0, return negtive sampled result.
+
+ Args:
+ line (int): sample index.
+
+ Yields:
+ list: Parsed results including label, impression id , user id,
+ candidate_title_index, clicked_title_index.
+ """
+ if self.npratio > 0:
+ impr_label = self.labels[line]
+ impr = self.imprs[line]
+
+ poss = []
+ negs = []
+
+ for news, click in zip(impr, impr_label):
+ if click == 1:
+ poss.append(news)
+ else:
+ negs.append(news)
+
+ for p in poss:
+ candidate_title_index = []
+ impr_index = []
+ user_index = []
+ label = [1] + [0] * self.npratio
+
+ n = newsample(negs, self.npratio)
+ candidate_title_index = self.news_title_index[[p] + n]
+ click_title_index = self.news_title_index[self.histories[line]]
+ impr_index.append(self.impr_indexes[line])
+ user_index.append(self.uindexes[line])
+
+ yield (
+ label,
+ impr_index,
+ user_index,
+ candidate_title_index,
+ click_title_index,
+ )
+
+ else:
+ impr_label = self.labels[line]
+ impr = self.imprs[line]
+
+ for news, label in zip(impr, impr_label):
+ candidate_title_index = []
+ impr_index = []
+ user_index = []
+ label = [label]
+
+ candidate_title_index.append(self.news_title_index[news])
+ click_title_index = self.news_title_index[self.histories[line]]
+ impr_index.append(self.impr_indexes[line])
+ user_index.append(self.uindexes[line])
+
+ yield (
+ label,
+ impr_index,
+ user_index,
+ candidate_title_index,
+ click_title_index,
+ )
+
+[docs] def load_data_from_file(self, news_file, behavior_file):
+ """Read and parse data from news file and behavior file.
+
+ Args:
+            news_file (str): A file containing information about news articles.
+            behavior_file (str): A file containing information about user impressions.
+
+ Yields:
+ object: An iterator that yields parsed results, in the format of dict.
+ """
+
+ if not hasattr(self, "news_title_index"):
+ self.init_news(news_file)
+
+ if not hasattr(self, "impr_indexes"):
+ self.init_behaviors(behavior_file)
+
+ label_list = []
+ imp_indexes = []
+ user_indexes = []
+ candidate_title_indexes = []
+ click_title_indexes = []
+ cnt = 0
+
+ indexes = np.arange(len(self.labels))
+
+ if self.npratio > 0:
+ np.random.shuffle(indexes)
+
+ for index in indexes:
+ for (
+ label,
+ imp_index,
+ user_index,
+ candidate_title_index,
+ click_title_index,
+ ) in self.parser_one_line(index):
+ candidate_title_indexes.append(candidate_title_index)
+ click_title_indexes.append(click_title_index)
+ imp_indexes.append(imp_index)
+ user_indexes.append(user_index)
+ label_list.append(label)
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ yield self._convert_data(
+ label_list,
+ imp_indexes,
+ user_indexes,
+ candidate_title_indexes,
+ click_title_indexes,
+ )
+ label_list = []
+ imp_indexes = []
+ user_indexes = []
+ candidate_title_indexes = []
+ click_title_indexes = []
+ cnt = 0
+
+ if cnt > 0:
+ yield self._convert_data(
+ label_list,
+ imp_indexes,
+ user_indexes,
+ candidate_title_indexes,
+ click_title_indexes,
+ )
+
+ def _convert_data(
+ self,
+ label_list,
+ imp_indexes,
+ user_indexes,
+ candidate_title_indexes,
+ click_title_indexes,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+ label_list (list): a list of ground-truth labels.
+ imp_indexes (list): a list of impression indexes.
+ user_indexes (list): a list of user indexes.
+            candidate_title_indexes (list): the candidate news titles' word indices.
+            click_title_indexes (list): word indices of the user's clicked news titles.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+
+ labels = np.asarray(label_list, dtype=np.float32)
+ imp_indexes = np.asarray(imp_indexes, dtype=np.int32)
+ user_indexes = np.asarray(user_indexes, dtype=np.int32)
+ candidate_title_index_batch = np.asarray(
+ candidate_title_indexes, dtype=np.int64
+ )
+ click_title_index_batch = np.asarray(click_title_indexes, dtype=np.int64)
+ return {
+ "impression_index_batch": imp_indexes,
+ "user_index_batch": user_indexes,
+ "clicked_title_batch": click_title_index_batch,
+ "candidate_title_batch": candidate_title_index_batch,
+ "labels": labels,
+ }
+
+[docs] def load_user_from_file(self, news_file, behavior_file):
+ """Read and parse user data from news file and behavior file.
+
+ Args:
+            news_file (str): A file containing information about news articles.
+            behavior_file (str): A file containing information about user impressions.
+
+ Yields:
+ object: An iterator that yields parsed user feature, in the format of dict.
+ """
+
+ if not hasattr(self, "news_title_index"):
+ self.init_news(news_file)
+
+ if not hasattr(self, "impr_indexes"):
+ self.init_behaviors(behavior_file)
+
+ user_indexes = []
+ impr_indexes = []
+ click_title_indexes = []
+ cnt = 0
+
+ for index in range(len(self.impr_indexes)):
+ click_title_indexes.append(self.news_title_index[self.histories[index]])
+ user_indexes.append(self.uindexes[index])
+ impr_indexes.append(self.impr_indexes[index])
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ yield self._convert_user_data(
+ user_indexes,
+ impr_indexes,
+ click_title_indexes,
+ )
+ user_indexes = []
+ impr_indexes = []
+ click_title_indexes = []
+ cnt = 0
+
+ if cnt > 0:
+ yield self._convert_user_data(
+ user_indexes,
+ impr_indexes,
+ click_title_indexes,
+ )
+
+ def _convert_user_data(
+ self,
+ user_indexes,
+ impr_indexes,
+ click_title_indexes,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+            user_indexes (list): a list of user indexes.
+            impr_indexes (list): a list of impression indexes.
+            click_title_indexes (list): word indices of the user's clicked news titles.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+
+ user_indexes = np.asarray(user_indexes, dtype=np.int32)
+ impr_indexes = np.asarray(impr_indexes, dtype=np.int32)
+ click_title_index_batch = np.asarray(click_title_indexes, dtype=np.int64)
+
+ return {
+ "user_index_batch": user_indexes,
+ "impr_index_batch": impr_indexes,
+ "clicked_title_batch": click_title_index_batch,
+ }
+
+[docs] def load_news_from_file(self, news_file):
+ """Read and parse user data from news file.
+
+ Args:
+ news_file (str): A file contains several informations of news.
+
+ Yields:
+ object: An iterator that yields parsed news feature, in the format of dict.
+ """
+ if not hasattr(self, "news_title_index"):
+ self.init_news(news_file)
+
+ news_indexes = []
+ candidate_title_indexes = []
+ cnt = 0
+
+ for index in range(len(self.news_title_index)):
+ news_indexes.append(index)
+ candidate_title_indexes.append(self.news_title_index[index])
+
+ cnt += 1
+ if cnt >= self.batch_size:
+ yield self._convert_news_data(
+ news_indexes,
+ candidate_title_indexes,
+ )
+ news_indexes = []
+ candidate_title_indexes = []
+ cnt = 0
+
+ if cnt > 0:
+ yield self._convert_news_data(
+ news_indexes,
+ candidate_title_indexes,
+ )
+
+ def _convert_news_data(
+ self,
+ news_indexes,
+ candidate_title_indexes,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+ news_indexes (list): a list of news indexes.
+            candidate_title_indexes (list): the candidate news titles' word indices.
+
+ Returns:
+ dict: A dictionary, containing multiple numpy arrays that are convenient for further operation.
+ """
+
+ news_indexes_batch = np.asarray(news_indexes, dtype=np.int32)
+ candidate_title_index_batch = np.asarray(
+ candidate_title_indexes, dtype=np.int32
+ )
+
+ return {
+ "news_index_batch": news_indexes_batch,
+ "candidate_title_batch": candidate_title_index_batch,
+ }
+
+[docs] def load_impression_from_file(self, behaivors_file):
+ """Read and parse impression data from behaivors file.
+
+ Args:
+ behaivors_file (str): A file contains several informations of behaviros.
+
+ Yields:
+ object: An iterator that yields parsed impression data, in the format of dict.
+ """
+
+ if not hasattr(self, "histories"):
+ self.init_behaviors(behaivors_file)
+
+ indexes = np.arange(len(self.labels))
+
+ for index in indexes:
+ impr_label = np.array(self.labels[index], dtype="int32")
+ impr_news = np.array(self.imprs[index], dtype="int32")
+
+ yield (
+ self.impr_indexes[index],
+ impr_news,
+ self.uindexes[index],
+ impr_label,
+ )
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import abc
+import time
+import numpy as np
+from tqdm import tqdm
+import tensorflow as tf
+from tensorflow.compat.v1 import keras
+
+from recommenders.models.deeprec.deeprec_utils import cal_metric
+
+tf.compat.v1.disable_eager_execution()
+tf.compat.v1.experimental.output_all_intermediates(True)
+__all__ = ["BaseModel"]
+
+
+[docs]class BaseModel:
+ """Basic class of models
+
+ Attributes:
+ hparams (HParams): A HParams object, holds the entire set of hyperparameters.
+ train_iterator (object): An iterator to load the data in training steps.
+ test_iterator (object): An iterator to load the data in testing steps.
+ graph (object): An optional graph.
+ seed (int): Random seed.
+ """
+
+ def __init__(
+ self,
+ hparams,
+ iterator_creator,
+ seed=None,
+ ):
+ """Initializing the model. Create common logics which are needed by all deeprec models, such as loss function,
+ parameter set.
+
+ Args:
+ hparams (HParams): A HParams object, holds the entire set of hyperparameters.
+ iterator_creator (object): An iterator to load the data.
+ graph (object): An optional graph.
+ seed (int): Random seed.
+ """
+ self.seed = seed
+ tf.compat.v1.set_random_seed(seed)
+ np.random.seed(seed)
+
+ self.train_iterator = iterator_creator(
+ hparams,
+ hparams.npratio,
+ col_spliter="\t",
+ )
+ self.test_iterator = iterator_creator(
+ hparams,
+ col_spliter="\t",
+ )
+
+ self.hparams = hparams
+ self.support_quick_scoring = hparams.support_quick_scoring
+
+ # set GPU use with on demand growth
+ gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
+ sess = tf.compat.v1.Session(
+ config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
+ )
+
+ # set this TensorFlow session as the default session for Keras
+ tf.compat.v1.keras.backend.set_session(sess)
+
+        # IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
+        # Otherwise, their weights will be unavailable in the threads once the session has been set
+ self.model, self.scorer = self._build_graph()
+
+ self.loss = self._get_loss()
+ self.train_optimizer = self._get_opt()
+
+ self.model.compile(loss=self.loss, optimizer=self.train_optimizer)
+
+ def _init_embedding(self, file_path):
+ """Load pre-trained embeddings as a constant tensor.
+
+ Args:
+ file_path (str): the pre-trained glove embeddings file path.
+
+ Returns:
+ numpy.ndarray: A constant numpy array.
+ """
+
+ return np.load(file_path)
+
+ @abc.abstractmethod
+ def _build_graph(self):
+ """Subclass will implement this."""
+ pass
+
+ @abc.abstractmethod
+ def _get_input_label_from_iter(self, batch_data):
+ """Subclass will implement this"""
+ pass
+
+ def _get_loss(self):
+ """Make loss function, consists of data loss and regularization loss
+
+ Returns:
+ object: Loss function or loss function name
+ """
+ if self.hparams.loss == "cross_entropy_loss":
+ data_loss = "categorical_crossentropy"
+ elif self.hparams.loss == "log_loss":
+ data_loss = "binary_crossentropy"
+ else:
+ raise ValueError("this loss not defined {0}".format(self.hparams.loss))
+ return data_loss
+
+ def _get_opt(self):
+ """Get the optimizer according to configuration. Usually we will use Adam.
+ Returns:
+ object: An optimizer.
+ """
+ lr = self.hparams.learning_rate
+ optimizer = self.hparams.optimizer
+
+ if optimizer == "adam":
+ train_opt = keras.optimizers.Adam(lr=lr)
+
+ return train_opt
+
+ def _get_pred(self, logit, task):
+ """Make final output as prediction score, according to different tasks.
+
+ Args:
+ logit (object): Base prediction value.
+ task (str): A task (values: regression/classification)
+
+ Returns:
+ object: Transformed score
+ """
+ if task == "regression":
+ pred = tf.identity(logit)
+ elif task == "classification":
+ pred = tf.sigmoid(logit)
+ else:
+ raise ValueError(
+ "method must be regression or classification, but now is {0}".format(
+ task
+ )
+ )
+ return pred
+
+[docs] def train(self, train_batch_data):
+ """Go through the optimization step once with training data in feed_dict.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Feed values to train the model. This is a dictionary that maps graph elements to values.
+
+ Returns:
+ list: A list of values, including update operation, total loss, data loss, and merged summary.
+ """
+ train_input, train_label = self._get_input_label_from_iter(train_batch_data)
+ rslt = self.model.train_on_batch(train_input, train_label)
+ return rslt
+
+[docs] def eval(self, eval_batch_data):
+ """Evaluate the data in feed_dict with current model.
+
+ Args:
+ sess (object): The model session object.
+ feed_dict (dict): Feed values for evaluation. This is a dictionary that maps graph elements to values.
+
+ Returns:
+ list: A list of evaluated results, including total loss value, data loss value, predicted scores, and ground-truth labels.
+ """
+ eval_input, eval_label = self._get_input_label_from_iter(eval_batch_data)
+ imp_index = eval_batch_data["impression_index_batch"]
+
+ pred_rslt = self.scorer.predict_on_batch(eval_input)
+
+ return pred_rslt, eval_label, imp_index
+
+[docs] def fit(
+ self,
+ train_news_file,
+ train_behaviors_file,
+ valid_news_file,
+ valid_behaviors_file,
+ test_news_file=None,
+ test_behaviors_file=None,
+ ):
+ """Fit the model with train_file. Evaluate the model on valid_file per epoch to observe the training status.
+ If test_news_file is not None, evaluate it too.
+
+ Args:
+ train_file (str): training data set.
+ valid_file (str): validation set.
+ test_news_file (str): test set.
+
+ Returns:
+ object: An instance of self.
+ """
+
+ for epoch in range(1, self.hparams.epochs + 1):
+ step = 0
+ self.hparams.current_epoch = epoch
+ epoch_loss = 0
+ train_start = time.time()
+
+ tqdm_util = tqdm(
+ self.train_iterator.load_data_from_file(
+ train_news_file, train_behaviors_file
+ )
+ )
+
+ for batch_data_input in tqdm_util:
+
+ step_result = self.train(batch_data_input)
+ step_data_loss = step_result
+
+ epoch_loss += step_data_loss
+ step += 1
+ if step % self.hparams.show_step == 0:
+ tqdm_util.set_description(
+ "step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}".format(
+ step, epoch_loss / step, step_data_loss
+ )
+ )
+
+ train_end = time.time()
+ train_time = train_end - train_start
+
+ eval_start = time.time()
+
+ train_info = ",".join(
+ [
+ str(item[0]) + ":" + str(item[1])
+ for item in [("logloss loss", epoch_loss / step)]
+ ]
+ )
+
+ eval_res = self.run_eval(valid_news_file, valid_behaviors_file)
+ eval_info = ", ".join(
+ [
+ str(item[0]) + ":" + str(item[1])
+ for item in sorted(eval_res.items(), key=lambda x: x[0])
+ ]
+ )
+ if test_news_file is not None:
+ test_res = self.run_eval(test_news_file, test_behaviors_file)
+ test_info = ", ".join(
+ [
+ str(item[0]) + ":" + str(item[1])
+ for item in sorted(test_res.items(), key=lambda x: x[0])
+ ]
+ )
+ eval_end = time.time()
+ eval_time = eval_end - eval_start
+
+ if test_news_file is not None:
+ print(
+ "at epoch {0:d}".format(epoch)
+ + "\ntrain info: "
+ + train_info
+ + "\neval info: "
+ + eval_info
+ + "\ntest info: "
+ + test_info
+ )
+ else:
+ print(
+ "at epoch {0:d}".format(epoch)
+ + "\ntrain info: "
+ + train_info
+ + "\neval info: "
+ + eval_info
+ )
+ print(
+ "at epoch {0:d} , train time: {1:.1f} eval time: {2:.1f}".format(
+ epoch, train_time, eval_time
+ )
+ )
+
+ return self
+
+[docs] def group_labels(self, labels, preds, group_keys):
+ """Devide labels and preds into several group according to values in group keys.
+
+ Args:
+ labels (list): ground truth label list.
+ preds (list): prediction score list.
+ group_keys (list): group key list.
+
+ Returns:
+ list, list, list:
+            - Keys after grouping.
+            - Labels after grouping.
+            - Preds after grouping.
+
+ """
+
+ all_keys = list(set(group_keys))
+ all_keys.sort()
+ group_labels = {k: [] for k in all_keys}
+ group_preds = {k: [] for k in all_keys}
+
+ for label, p, k in zip(labels, preds, group_keys):
+ group_labels[k].append(label)
+ group_preds[k].append(p)
+
+ all_labels = []
+ all_preds = []
+ for k in all_keys:
+ all_labels.append(group_labels[k])
+ all_preds.append(group_preds[k])
+
+ return all_keys, all_labels, all_preds
+
+[docs] def run_eval(self, news_filename, behaviors_file):
+ """Evaluate the given file and returns some evaluation metrics.
+
+ Args:
+ filename (str): A file name that will be evaluated.
+
+ Returns:
+ dict: A dictionary that contains evaluation metrics.
+ """
+
+ if self.support_quick_scoring:
+ _, group_labels, group_preds = self.run_fast_eval(
+ news_filename, behaviors_file
+ )
+ else:
+ _, group_labels, group_preds = self.run_slow_eval(
+ news_filename, behaviors_file
+ )
+ res = cal_metric(group_labels, group_preds, self.hparams.metrics)
+ return res
+
+ def user(self, batch_user_input):
+ user_input = self._get_user_feature_from_iter(batch_user_input)
+ user_vec = self.userencoder.predict_on_batch(user_input)
+ user_index = batch_user_input["impr_index_batch"]
+
+ return user_index, user_vec
+
+ def news(self, batch_news_input):
+ news_input = self._get_news_feature_from_iter(batch_news_input)
+ news_vec = self.newsencoder.predict_on_batch(news_input)
+ news_index = batch_news_input["news_index_batch"]
+
+ return news_index, news_vec
+
+ def run_user(self, news_filename, behaviors_file):
+ if not hasattr(self, "userencoder"):
+ raise ValueError("model must have attribute userencoder")
+
+ user_indexes = []
+ user_vecs = []
+ for batch_data_input in tqdm(
+ self.test_iterator.load_user_from_file(news_filename, behaviors_file)
+ ):
+ user_index, user_vec = self.user(batch_data_input)
+ user_indexes.extend(np.reshape(user_index, -1))
+ user_vecs.extend(user_vec)
+
+ return dict(zip(user_indexes, user_vecs))
+
+ def run_news(self, news_filename):
+ if not hasattr(self, "newsencoder"):
+ raise ValueError("model must have attribute newsencoder")
+
+ news_indexes = []
+ news_vecs = []
+ for batch_data_input in tqdm(
+ self.test_iterator.load_news_from_file(news_filename)
+ ):
+ news_index, news_vec = self.news(batch_data_input)
+ news_indexes.extend(np.reshape(news_index, -1))
+ news_vecs.extend(news_vec)
+
+ return dict(zip(news_indexes, news_vecs))
+
+ def run_slow_eval(self, news_filename, behaviors_file):
+ preds = []
+ labels = []
+ imp_indexes = []
+
+ for batch_data_input in tqdm(
+ self.test_iterator.load_data_from_file(news_filename, behaviors_file)
+ ):
+ step_pred, step_labels, step_imp_index = self.eval(batch_data_input)
+ preds.extend(np.reshape(step_pred, -1))
+ labels.extend(np.reshape(step_labels, -1))
+ imp_indexes.extend(np.reshape(step_imp_index, -1))
+
+ group_impr_indexes, group_labels, group_preds = self.group_labels(
+ labels, preds, imp_indexes
+ )
+ return group_impr_indexes, group_labels, group_preds
+
+ def run_fast_eval(self, news_filename, behaviors_file):
+ news_vecs = self.run_news(news_filename)
+ user_vecs = self.run_user(news_filename, behaviors_file)
+
+ self.news_vecs = news_vecs
+ self.user_vecs = user_vecs
+
+ group_impr_indexes = []
+ group_labels = []
+ group_preds = []
+
+ for (
+ impr_index,
+ news_index,
+ user_index,
+ label,
+ ) in tqdm(self.test_iterator.load_impression_from_file(behaviors_file)):
+ pred = np.dot(
+ np.stack([news_vecs[i] for i in news_index], axis=0),
+ user_vecs[impr_index],
+ )
+ group_impr_indexes.append(impr_index)
+ group_labels.append(label)
+ group_preds.append(pred)
+
+ return group_impr_indexes, group_labels, group_preds
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow.compat.v1.keras as keras
+from tensorflow.compat.v1.linalg import einsum
+from tensorflow.compat.v1.keras import layers
+from tensorflow.compat.v1.keras import backend as K
+
+
+[docs]class AttLayer2(layers.Layer):
+ """Soft alignment attention implement.
+
+ Attributes:
+ dim (int): attention hidden dim
+ """
+
+ def __init__(self, dim=200, seed=0, **kwargs):
+ """Initialization steps for AttLayer2.
+
+ Args:
+            dim (int): attention hidden dim
+            seed (int): random seed
+ """
+
+ self.dim = dim
+ self.seed = seed
+ super(AttLayer2, self).__init__(**kwargs)
+
+[docs] def build(self, input_shape):
+ """Initialization for variables in AttLayer2
+ There are there variables in AttLayer2, i.e. W, b and q.
+
+ Args:
+ input_shape (object): shape of input tensor.
+ """
+
+ assert len(input_shape) == 3
+ dim = self.dim
+ self.W = self.add_weight(
+ name="W",
+ shape=(int(input_shape[-1]), dim),
+ initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ trainable=True,
+ )
+ self.b = self.add_weight(
+ name="b",
+ shape=(dim,),
+ initializer=keras.initializers.Zeros(),
+ trainable=True,
+ )
+ self.q = self.add_weight(
+ name="q",
+ shape=(dim, 1),
+ initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ trainable=True,
+ )
+ super(AttLayer2, self).build(input_shape) # be sure you call this somewhere!
+
+[docs] def call(self, inputs, mask=None, **kwargs):
+ """Core implementation of soft attention.
+
+ Args:
+ inputs (object): input tensor.
+
+ Returns:
+ object: weighted sum of input tensors.
+ """
+
+ attention = K.tanh(K.dot(inputs, self.W) + self.b)
+ attention = K.dot(attention, self.q)
+
+ attention = K.squeeze(attention, axis=2)
+
+ if mask is None:
+ attention = K.exp(attention)
+ else:
+ attention = K.exp(attention) * K.cast(mask, dtype="float32")
+
+ attention_weight = attention / (
+ K.sum(attention, axis=-1, keepdims=True) + K.epsilon()
+ )
+
+ attention_weight = K.expand_dims(attention_weight)
+ weighted_input = inputs * attention_weight
+ return K.sum(weighted_input, axis=1)
+
+[docs] def compute_mask(self, input, input_mask=None):
+ """Compte output mask value.
+
+ Args:
+ input (object): input tensor.
+ input_mask: input mask
+
+ Returns:
+ object: output mask.
+ """
+ return None
+
+[docs] def compute_output_shape(self, input_shape):
+ """Compute shape of output tensor.
+
+ Args:
+ input_shape (tuple): shape of input tensor.
+
+ Returns:
+ tuple: shape of output tensor.
+ """
+ return input_shape[0], input_shape[-1]
+
+
+[docs]class SelfAttention(layers.Layer):
+    """Multi-head self attention implementation.
+
+ Args:
+ multiheads (int): The number of heads.
+ head_dim (object): Dimension of each head.
+ mask_right (boolean): whether to mask right words.
+
+ Returns:
+ object: Weighted sum after attention.
+ """
+
+ def __init__(self, multiheads, head_dim, seed=0, mask_right=False, **kwargs):
+        """Initialization steps for SelfAttention.
+
+ Args:
+ multiheads (int): The number of heads.
+ head_dim (object): Dimension of each head.
+ mask_right (boolean): Whether to mask right words.
+ """
+
+ self.multiheads = multiheads
+ self.head_dim = head_dim
+ self.output_dim = multiheads * head_dim
+ self.mask_right = mask_right
+ self.seed = seed
+ super(SelfAttention, self).__init__(**kwargs)
+
+[docs] def compute_output_shape(self, input_shape):
+ """Compute shape of output tensor.
+
+ Returns:
+ tuple: output shape tuple.
+ """
+
+ return (input_shape[0][0], input_shape[0][1], self.output_dim)
+
+[docs] def build(self, input_shape):
+ """Initialization for variables in SelfAttention.
+        There are three variables in SelfAttention, i.e. WQ, WK and WV.
+ WQ is used for linear transformation of query.
+ WK is used for linear transformation of key.
+ WV is used for linear transformation of value.
+
+ Args:
+ input_shape (object): shape of input tensor.
+ """
+
+ self.WQ = self.add_weight(
+ name="WQ",
+ shape=(int(input_shape[0][-1]), self.output_dim),
+ initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ trainable=True,
+ )
+ self.WK = self.add_weight(
+ name="WK",
+ shape=(int(input_shape[1][-1]), self.output_dim),
+ initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ trainable=True,
+ )
+ self.WV = self.add_weight(
+ name="WV",
+ shape=(int(input_shape[2][-1]), self.output_dim),
+ initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ trainable=True,
+ )
+ super(SelfAttention, self).build(input_shape)
+
+[docs] def Mask(self, inputs, seq_len, mode="add"):
+        """Mask operation used in multi-head self attention.
+
+        Args:
+            inputs (object): input tensor to mask.
+            seq_len (object): sequence length of inputs.
+            mode (str): mode of mask, either "add" or "mul".
+
+ Returns:
+ object: tensors after masking.
+ """
+
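+        # Build a (batch, time) mask that is 1 up to each sequence's true length
+        # and 0 afterwards. "mul" zeroes out the padded positions, while "add"
+        # pushes them towards -inf (-1e12) so they vanish after the softmax.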
+ if seq_len is None:
+ return inputs
+ else:
+ mask = K.one_hot(indices=seq_len[:, 0], num_classes=K.shape(inputs)[1])
+ mask = 1 - K.cumsum(mask, axis=1)
+
+ for _ in range(len(inputs.shape) - 2):
+ mask = K.expand_dims(mask, 2)
+
+ if mode == "mul":
+ return inputs * mask
+ elif mode == "add":
+ return inputs - (1 - mask) * 1e12
+
+[docs] def call(self, QKVs):
+ """Core logic of multi-head self attention.
+
+ Args:
+ QKVs (list): inputs of multi-head self attention i.e. query, key and value.
+
+ Returns:
+            object: output tensors.
+ """
+ if len(QKVs) == 3:
+ Q_seq, K_seq, V_seq = QKVs
+ Q_len, V_len = None, None
+ elif len(QKVs) == 5:
+ Q_seq, K_seq, V_seq, Q_len, V_len = QKVs
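+        # Project Q, K and V to multiheads * head_dim, then reshape and permute
+        # to (batch, heads, time, head_dim) so each head attends independently.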
+ Q_seq = K.dot(Q_seq, self.WQ)
+ Q_seq = K.reshape(
+ Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim)
+ )
+ Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3))
+
+ K_seq = K.dot(K_seq, self.WK)
+ K_seq = K.reshape(
+ K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim)
+ )
+ K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3))
+
+ V_seq = K.dot(V_seq, self.WV)
+ V_seq = K.reshape(
+ V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim)
+ )
+ V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3))
+
+ A = einsum("abij, abkj -> abik", Q_seq, K_seq) / K.sqrt(
+ K.cast(self.head_dim, dtype="float32")
+ )
+ A = K.permute_dimensions(
+ A, pattern=(0, 3, 2, 1)
+ ) # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads]
+
+ A = self.Mask(A, V_len, "add")
+ A = K.permute_dimensions(A, pattern=(0, 3, 2, 1))
+
+ if self.mask_right:
+ ones = K.ones_like(A[:1, :1])
+ lower_triangular = K.tf.matrix_band_part(ones, num_lower=-1, num_upper=0)
+ mask = (ones - lower_triangular) * 1e12
+ A = A - mask
+ A = K.softmax(A)
+
+ O_seq = einsum("abij, abjk -> abik", A, V_seq)
+ O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3))
+
+ O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim))
+ O_seq = self.Mask(O_seq, Q_len, "mul")
+ return O_seq
+
+[docs] def get_config(self):
+        """Add multiheads, head_dim and mask_right into the layer config.
+
+ Returns:
+ dict: config of SelfAttention layer.
+ """
+ config = super(SelfAttention, self).get_config()
+ config.update(
+ {
+ "multiheads": self.multiheads,
+ "head_dim": self.head_dim,
+ "mask_right": self.mask_right,
+ }
+ )
+ return config
+
+
+[docs]def PersonalizedAttentivePooling(dim1, dim2, dim3, seed=0):
+    """Soft alignment attention implementation.
+
+    Attributes:
+        dim1 (int): first dimension of value shape.
+        dim2 (int): second dimension of value shape.
+        dim3 (int): dimension of the query.
+
+    Returns:
+        object: weighted sum of the input values.
+    """
+ vecs_input = keras.Input(shape=(dim1, dim2), dtype="float32")
+ query_input = keras.Input(shape=(dim3,), dtype="float32")
+
+ user_vecs = layers.Dropout(0.2)(vecs_input)
+ user_att = layers.Dense(
+ dim3,
+ activation="tanh",
+ kernel_initializer=keras.initializers.glorot_uniform(seed=seed),
+ bias_initializer=keras.initializers.Zeros(),
+ )(user_vecs)
+ user_att2 = layers.Dot(axes=-1)([query_input, user_att])
+ user_att2 = layers.Activation("softmax")(user_att2)
+ user_vec = layers.Dot((1, 1))([user_vecs, user_att2])
+
+ model = keras.Model([vecs_input, query_input], user_vec)
+ return model
+
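+# Illustrative sketch (not part of the library): PersonalizedAttentivePooling
+# returns a small Keras model that pools dim1 value vectors of size dim2 with
+# attention weights conditioned on a query of size dim3, e.g.
+#
+#     pooler = PersonalizedAttentivePooling(dim1=50, dim2=400, dim3=200)
+#     pooled = pooler([value_tensor, query_tensor])  # -> shape (batch, dim2)
+#
+# The dimensions above are hypothetical; NPA below wires this layer up with
+# hparams.his_size / hparams.filter_num / hparams.attention_hidden_dim.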
+
+[docs]class ComputeMasking(layers.Layer):
+    """Compute a mask indicating which input values are non-zero.
+
+ Returns:
+ bool tensor: True for values not equal to zero.
+ """
+
+ def __init__(self, **kwargs):
+ super(ComputeMasking, self).__init__(**kwargs)
+
+[docs] def call(self, inputs, **kwargs):
+ """Call method for ComputeMasking.
+
+ Args:
+ inputs (object): input tensor.
+
+ Returns:
+ bool tensor: True for values not equal to zero.
+ """
+ mask = K.not_equal(inputs, 0)
+ return K.cast(mask, K.floatx())
+
+
+
+
+[docs]class OverwriteMasking(layers.Layer):
+ """Set values at specific positions to zero.
+
+ Args:
+ inputs (list): value tensor and mask tensor.
+
+ Returns:
+ object: tensor after setting values to zero.
+ """
+
+ def __init__(self, **kwargs):
+ super(OverwriteMasking, self).__init__(**kwargs)
+
+
+
+[docs] def call(self, inputs, **kwargs):
+ """Call method for OverwriteMasking.
+
+ Args:
+ inputs (list): value tensor and mask tensor.
+
+ Returns:
+ object: tensor after setting values to zero.
+ """
+ return inputs[0] * K.expand_dims(inputs[1])
+
+
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow.compat.v1.keras as keras
+from tensorflow.compat.v1.keras import layers
+
+
+from recommenders.models.newsrec.models.base_model import BaseModel
+from recommenders.models.newsrec.models.layers import (
+ AttLayer2,
+ ComputeMasking,
+ OverwriteMasking,
+)
+
+__all__ = ["LSTURModel"]
+
+
+[docs]class LSTURModel(BaseModel):
+    """LSTUR model (Neural News Recommendation with Long- and Short-term User Representations)
+
+ Mingxiao An, Fangzhao Wu, Chuhan Wu, Kun Zhang, Zheng Liu and Xing Xie:
+ Neural News Recommendation with Long- and Short-term User Representations, ACL 2019
+
+ Attributes:
+ word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
+ hparam (object): Global hyper-parameters.
+ """
+
+ def __init__(self, hparams, iterator_creator, seed=None):
+ """Initialization steps for LSTUR.
+        Compared with the BaseModel, LSTUR needs a word embedding.
+        After creating the word embedding matrix, BaseModel's __init__ method will be called.
+
+        Args:
+            hparams (object): Global hyper-parameters. Some key settings, such as type and gru_unit, are set there.
+            iterator_creator (object): LSTUR data loader class for train, validation and test data.
+            seed (int, optional): Random seed.
+ """
+
+ self.word2vec_embedding = self._init_embedding(hparams.wordEmb_file)
+ self.hparam = hparams
+
+ super().__init__(hparams, iterator_creator, seed=seed)
+
+ def _get_input_label_from_iter(self, batch_data):
+ input_feat = [
+ batch_data["user_index_batch"],
+ batch_data["clicked_title_batch"],
+ batch_data["candidate_title_batch"],
+ ]
+ input_label = batch_data["labels"]
+ return input_feat, input_label
+
+ def _get_user_feature_from_iter(self, batch_data):
+ return [batch_data["clicked_title_batch"], batch_data["user_index_batch"]]
+
+ def _get_news_feature_from_iter(self, batch_data):
+ return batch_data["candidate_title_batch"]
+
+ def _build_graph(self):
+ """Build LSTUR model and scorer.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and inference.
+ """
+
+ model, scorer = self._build_lstur()
+ return model, scorer
+
+ def _build_userencoder(self, titleencoder, type="ini"):
+ """The main function to create user encoder of LSTUR.
+
+ Args:
+            titleencoder (object): the news encoder of LSTUR.
+            type (str): type of the user encoder, either "ini" or "con".
+
+ Return:
+ object: the user encoder of LSTUR.
+ """
+ hparams = self.hparams
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+ user_indexes = keras.Input(shape=(1,), dtype="int32")
+
+ user_embedding_layer = layers.Embedding(
+ len(self.train_iterator.uid2index),
+ hparams.gru_unit,
+ trainable=True,
+ embeddings_initializer="zeros",
+ )
+
+ long_u_emb = layers.Reshape((hparams.gru_unit,))(
+ user_embedding_layer(user_indexes)
+ )
+ click_title_presents = layers.TimeDistributed(titleencoder)(his_input_title)
+
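+        # "ini": the long-term user embedding initializes the GRU's hidden state
+        # over the clicked-title sequence. "con": the GRU output (short-term
+        # representation) is concatenated with the long-term embedding and
+        # projected back to gru_unit dimensions.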
+ if type == "ini":
+ user_present = layers.GRU(
+ hparams.gru_unit,
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ bias_initializer=keras.initializers.Zeros(),
+ )(
+ layers.Masking(mask_value=0.0)(click_title_presents),
+ initial_state=[long_u_emb],
+ )
+ elif type == "con":
+ short_uemb = layers.GRU(
+ hparams.gru_unit,
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ bias_initializer=keras.initializers.Zeros(),
+ )(layers.Masking(mask_value=0.0)(click_title_presents))
+
+ user_present = layers.Concatenate()([short_uemb, long_u_emb])
+ user_present = layers.Dense(
+ hparams.gru_unit,
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(user_present)
+
+ model = keras.Model(
+ [his_input_title, user_indexes], user_present, name="user_encoder"
+ )
+ return model
+
+ def _build_newsencoder(self, embedding_layer):
+ """The main function to create news encoder of LSTUR.
+
+ Args:
+ embedding_layer (object): a word embedding layer.
+
+ Return:
+ object: the news encoder of LSTUR.
+ """
+ hparams = self.hparams
+ sequences_input_title = keras.Input(shape=(hparams.title_size,), dtype="int32")
+ embedded_sequences_title = embedding_layer(sequences_input_title)
+
+ y = layers.Dropout(hparams.dropout)(embedded_sequences_title)
+ y = layers.Conv1D(
+ hparams.filter_num,
+ hparams.window_size,
+ activation=hparams.cnn_activation,
+ padding="same",
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(y)
+ print(y)
+ y = layers.Dropout(hparams.dropout)(y)
+ y = layers.Masking()(
+ OverwriteMasking()([y, ComputeMasking()(sequences_input_title)])
+ )
+ pred_title = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
+ print(pred_title)
+ model = keras.Model(sequences_input_title, pred_title, name="news_encoder")
+ return model
+
+ def _build_lstur(self):
+ """The main function to create LSTUR's logic. The core of LSTUR
+ is a user encoder and a news encoder.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and inference.
+ """
+ hparams = self.hparams
+
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+ pred_input_title = keras.Input(
+ shape=(hparams.npratio + 1, hparams.title_size), dtype="int32"
+ )
+ pred_input_title_one = keras.Input(
+ shape=(
+ 1,
+ hparams.title_size,
+ ),
+ dtype="int32",
+ )
+ pred_title_reshape = layers.Reshape((hparams.title_size,))(pred_input_title_one)
+ user_indexes = keras.Input(shape=(1,), dtype="int32")
+
+ embedding_layer = layers.Embedding(
+ self.word2vec_embedding.shape[0],
+ hparams.word_emb_dim,
+ weights=[self.word2vec_embedding],
+ trainable=True,
+ )
+
+ titleencoder = self._build_newsencoder(embedding_layer)
+ self.userencoder = self._build_userencoder(titleencoder, type=hparams.type)
+ self.newsencoder = titleencoder
+
+ user_present = self.userencoder([his_input_title, user_indexes])
+ news_present = layers.TimeDistributed(self.newsencoder)(pred_input_title)
+ news_present_one = self.newsencoder(pred_title_reshape)
+
+ preds = layers.Dot(axes=-1)([news_present, user_present])
+ preds = layers.Activation(activation="softmax")(preds)
+
+ pred_one = layers.Dot(axes=-1)([news_present_one, user_present])
+ pred_one = layers.Activation(activation="sigmoid")(pred_one)
+
+ model = keras.Model([user_indexes, his_input_title, pred_input_title], preds)
+ scorer = keras.Model(
+ [user_indexes, his_input_title, pred_input_title_one], pred_one
+ )
+
+ return model, scorer
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow.keras as keras
+from tensorflow.keras import layers
+
+
+from recommenders.models.newsrec.models.base_model import BaseModel
+from recommenders.models.newsrec.models.layers import AttLayer2
+
+__all__ = ["NAMLModel"]
+
+
+[docs]class NAMLModel(BaseModel):
+    """NAML model (Neural News Recommendation with Attentive Multi-View Learning)
+
+ Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie,
+ Neural News Recommendation with Attentive Multi-View Learning, IJCAI 2019
+
+ Attributes:
+ word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
+ hparam (object): Global hyper-parameters.
+ """
+
+ def __init__(self, hparams, iterator_creator, seed=None):
+ """Initialization steps for NAML.
+        Compared with the BaseModel, NAML needs a word embedding.
+        After creating the word embedding matrix, BaseModel's __init__ method will be called.
+
+        Args:
+            hparams (object): Global hyper-parameters. Some key settings, such as filter_num, are set there.
+            iterator_creator (object): NAML data loader class for train, validation and test data.
+            seed (int, optional): Random seed.
+ """
+
+ self.word2vec_embedding = self._init_embedding(hparams.wordEmb_file)
+ self.hparam = hparams
+
+ super().__init__(hparams, iterator_creator, seed=seed)
+
+ def _get_input_label_from_iter(self, batch_data):
+ input_feat = [
+ batch_data["clicked_title_batch"],
+ batch_data["clicked_ab_batch"],
+ batch_data["clicked_vert_batch"],
+ batch_data["clicked_subvert_batch"],
+ batch_data["candidate_title_batch"],
+ batch_data["candidate_ab_batch"],
+ batch_data["candidate_vert_batch"],
+ batch_data["candidate_subvert_batch"],
+ ]
+ input_label = batch_data["labels"]
+ return input_feat, input_label
+
+ def _get_user_feature_from_iter(self, batch_data):
+ """get input of user encoder
+ Args:
+ batch_data: input batch data from user iterator
+
+ Returns:
+            numpy.ndarray: input user features (concatenated clicked title, abstract, vert and subvert batches)
+ """
+ input_feature = [
+ batch_data["clicked_title_batch"],
+ batch_data["clicked_ab_batch"],
+ batch_data["clicked_vert_batch"],
+ batch_data["clicked_subvert_batch"],
+ ]
+ input_feature = np.concatenate(input_feature, axis=-1)
+ return input_feature
+
+ def _get_news_feature_from_iter(self, batch_data):
+ """get input of news encoder
+ Args:
+ batch_data: input batch data from news iterator
+
+ Returns:
+            numpy.ndarray: input news features (concatenated candidate title, abstract, vert and subvert batches)
+ """
+ input_feature = [
+ batch_data["candidate_title_batch"],
+ batch_data["candidate_ab_batch"],
+ batch_data["candidate_vert_batch"],
+ batch_data["candidate_subvert_batch"],
+ ]
+ input_feature = np.concatenate(input_feature, axis=-1)
+ return input_feature
+
+ def _build_graph(self):
+ """Build NAML model and scorer.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and inference.
+ """
+
+ model, scorer = self._build_naml()
+ return model, scorer
+
+ def _build_userencoder(self, newsencoder):
+ """The main function to create user encoder of NAML.
+
+ Args:
+ newsencoder (object): the news encoder of NAML.
+
+ Return:
+ object: the user encoder of NAML.
+ """
+ hparams = self.hparams
+ his_input_title_body_verts = keras.Input(
+ shape=(hparams.his_size, hparams.title_size + hparams.body_size + 2),
+ dtype="int32",
+ )
+
+ click_news_presents = layers.TimeDistributed(newsencoder)(
+ his_input_title_body_verts
+ )
+ user_present = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(
+ click_news_presents
+ )
+
+ model = keras.Model(
+ his_input_title_body_verts, user_present, name="user_encoder"
+ )
+ return model
+
+ def _build_newsencoder(self, embedding_layer):
+ """The main function to create news encoder of NAML.
+        The news encoder is composed of a title encoder, a body encoder, a vert encoder and a subvert encoder.
+
+ Args:
+ embedding_layer (object): a word embedding layer.
+
+ Return:
+ object: the news encoder of NAML.
+ """
+ hparams = self.hparams
+ input_title_body_verts = keras.Input(
+ shape=(hparams.title_size + hparams.body_size + 2,), dtype="int32"
+ )
+
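+        # The packed news input is laid out as
+        # [title tokens | body tokens | vert id | subvert id], i.e.
+        # title_size + body_size + 1 + 1 columns, and is sliced apart below.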
+ sequences_input_title = layers.Lambda(lambda x: x[:, : hparams.title_size])(
+ input_title_body_verts
+ )
+ sequences_input_body = layers.Lambda(
+ lambda x: x[:, hparams.title_size : hparams.title_size + hparams.body_size]
+ )(input_title_body_verts)
+ input_vert = layers.Lambda(
+ lambda x: x[
+ :,
+ hparams.title_size
+ + hparams.body_size : hparams.title_size
+ + hparams.body_size
+ + 1,
+ ]
+ )(input_title_body_verts)
+ input_subvert = layers.Lambda(
+ lambda x: x[:, hparams.title_size + hparams.body_size + 1 :]
+ )(input_title_body_verts)
+
+ title_repr = self._build_titleencoder(embedding_layer)(sequences_input_title)
+ body_repr = self._build_bodyencoder(embedding_layer)(sequences_input_body)
+ vert_repr = self._build_vertencoder()(input_vert)
+ subvert_repr = self._build_subvertencoder()(input_subvert)
+
+ concate_repr = layers.Concatenate(axis=-2)(
+ [title_repr, body_repr, vert_repr, subvert_repr]
+ )
+ news_repr = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(
+ concate_repr
+ )
+
+ model = keras.Model(input_title_body_verts, news_repr, name="news_encoder")
+ return model
+
+ def _build_titleencoder(self, embedding_layer):
+ """build title encoder of NAML news encoder.
+
+ Args:
+ embedding_layer (object): a word embedding layer.
+
+ Return:
+ object: the title encoder of NAML.
+ """
+ hparams = self.hparams
+ sequences_input_title = keras.Input(shape=(hparams.title_size,), dtype="int32")
+ embedded_sequences_title = embedding_layer(sequences_input_title)
+
+ y = layers.Dropout(hparams.dropout)(embedded_sequences_title)
+ y = layers.Conv1D(
+ hparams.filter_num,
+ hparams.window_size,
+ activation=hparams.cnn_activation,
+ padding="same",
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(y)
+ y = layers.Dropout(hparams.dropout)(y)
+ pred_title = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
+ pred_title = layers.Reshape((1, hparams.filter_num))(pred_title)
+
+ model = keras.Model(sequences_input_title, pred_title, name="title_encoder")
+ return model
+
+ def _build_bodyencoder(self, embedding_layer):
+ """build body encoder of NAML news encoder.
+
+ Args:
+ embedding_layer (object): a word embedding layer.
+
+ Return:
+ object: the body encoder of NAML.
+ """
+ hparams = self.hparams
+ sequences_input_body = keras.Input(shape=(hparams.body_size,), dtype="int32")
+ embedded_sequences_body = embedding_layer(sequences_input_body)
+
+ y = layers.Dropout(hparams.dropout)(embedded_sequences_body)
+ y = layers.Conv1D(
+ hparams.filter_num,
+ hparams.window_size,
+ activation=hparams.cnn_activation,
+ padding="same",
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(y)
+ y = layers.Dropout(hparams.dropout)(y)
+ pred_body = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
+ pred_body = layers.Reshape((1, hparams.filter_num))(pred_body)
+
+ model = keras.Model(sequences_input_body, pred_body, name="body_encoder")
+ return model
+
+ def _build_vertencoder(self):
+ """build vert encoder of NAML news encoder.
+
+ Return:
+ object: the vert encoder of NAML.
+ """
+ hparams = self.hparams
+ input_vert = keras.Input(shape=(1,), dtype="int32")
+
+ vert_embedding = layers.Embedding(
+ hparams.vert_num, hparams.vert_emb_dim, trainable=True
+ )
+
+ vert_emb = vert_embedding(input_vert)
+ pred_vert = layers.Dense(
+ hparams.filter_num,
+ activation=hparams.dense_activation,
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(vert_emb)
+ pred_vert = layers.Reshape((1, hparams.filter_num))(pred_vert)
+
+ model = keras.Model(input_vert, pred_vert, name="vert_encoder")
+ return model
+
+ def _build_subvertencoder(self):
+ """build subvert encoder of NAML news encoder.
+
+ Return:
+ object: the subvert encoder of NAML.
+ """
+ hparams = self.hparams
+ input_subvert = keras.Input(shape=(1,), dtype="int32")
+
+ subvert_embedding = layers.Embedding(
+ hparams.subvert_num, hparams.subvert_emb_dim, trainable=True
+ )
+
+ subvert_emb = subvert_embedding(input_subvert)
+ pred_subvert = layers.Dense(
+ hparams.filter_num,
+ activation=hparams.dense_activation,
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(subvert_emb)
+ pred_subvert = layers.Reshape((1, hparams.filter_num))(pred_subvert)
+
+ model = keras.Model(input_subvert, pred_subvert, name="subvert_encoder")
+ return model
+
+ def _build_naml(self):
+ """The main function to create NAML's logic. The core of NAML
+ is a user encoder and a news encoder.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and predict.
+ """
+ hparams = self.hparams
+
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+ his_input_body = keras.Input(
+ shape=(hparams.his_size, hparams.body_size), dtype="int32"
+ )
+ his_input_vert = keras.Input(shape=(hparams.his_size, 1), dtype="int32")
+ his_input_subvert = keras.Input(shape=(hparams.his_size, 1), dtype="int32")
+
+ pred_input_title = keras.Input(
+ shape=(hparams.npratio + 1, hparams.title_size), dtype="int32"
+ )
+ pred_input_body = keras.Input(
+ shape=(hparams.npratio + 1, hparams.body_size), dtype="int32"
+ )
+ pred_input_vert = keras.Input(shape=(hparams.npratio + 1, 1), dtype="int32")
+ pred_input_subvert = keras.Input(shape=(hparams.npratio + 1, 1), dtype="int32")
+
+ pred_input_title_one = keras.Input(
+ shape=(
+ 1,
+ hparams.title_size,
+ ),
+ dtype="int32",
+ )
+ pred_input_body_one = keras.Input(
+ shape=(
+ 1,
+ hparams.body_size,
+ ),
+ dtype="int32",
+ )
+ pred_input_vert_one = keras.Input(shape=(1, 1), dtype="int32")
+ pred_input_subvert_one = keras.Input(shape=(1, 1), dtype="int32")
+
+ his_title_body_verts = layers.Concatenate(axis=-1)(
+ [his_input_title, his_input_body, his_input_vert, his_input_subvert]
+ )
+
+ pred_title_body_verts = layers.Concatenate(axis=-1)(
+ [pred_input_title, pred_input_body, pred_input_vert, pred_input_subvert]
+ )
+
+ pred_title_body_verts_one = layers.Concatenate(axis=-1)(
+ [
+ pred_input_title_one,
+ pred_input_body_one,
+ pred_input_vert_one,
+ pred_input_subvert_one,
+ ]
+ )
+ pred_title_body_verts_one = layers.Reshape((-1,))(pred_title_body_verts_one)
+
+ embedding_layer = layers.Embedding(
+ self.word2vec_embedding.shape[0],
+ hparams.word_emb_dim,
+ weights=[self.word2vec_embedding],
+ trainable=True,
+ )
+
+ self.newsencoder = self._build_newsencoder(embedding_layer)
+ self.userencoder = self._build_userencoder(self.newsencoder)
+
+ user_present = self.userencoder(his_title_body_verts)
+ news_present = layers.TimeDistributed(self.newsencoder)(pred_title_body_verts)
+ news_present_one = self.newsencoder(pred_title_body_verts_one)
+
+ preds = layers.Dot(axes=-1)([news_present, user_present])
+ preds = layers.Activation(activation="softmax")(preds)
+
+ pred_one = layers.Dot(axes=-1)([news_present_one, user_present])
+ pred_one = layers.Activation(activation="sigmoid")(pred_one)
+
+ model = keras.Model(
+ [
+ his_input_title,
+ his_input_body,
+ his_input_vert,
+ his_input_subvert,
+ pred_input_title,
+ pred_input_body,
+ pred_input_vert,
+ pred_input_subvert,
+ ],
+ preds,
+ )
+
+ scorer = keras.Model(
+ [
+ his_input_title,
+ his_input_body,
+ his_input_vert,
+ his_input_subvert,
+ pred_input_title_one,
+ pred_input_body_one,
+ pred_input_vert_one,
+ pred_input_subvert_one,
+ ],
+ pred_one,
+ )
+
+ return model, scorer
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow.keras as keras
+from tensorflow.keras import layers
+
+
+from recommenders.models.newsrec.models.base_model import BaseModel
+from recommenders.models.newsrec.models.layers import PersonalizedAttentivePooling
+
+__all__ = ["NPAModel"]
+
+
+[docs]class NPAModel(BaseModel):
+    """NPA model (Neural News Recommendation with Personalized Attention)
+
+ Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie:
+ NPA: Neural News Recommendation with Personalized Attention, KDD 2019, ADS track.
+
+ Attributes:
+ word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
+ hparam (object): Global hyper-parameters.
+ """
+
+ def __init__(self, hparams, iterator_creator, seed=None):
+        """Initialization steps for NPA.
+        Compared with the BaseModel, NPA needs a word embedding.
+        After creating the word embedding matrix, BaseModel's __init__ method will be called.
+
+        Args:
+            hparams (object): Global hyper-parameters. Some key settings, such as filter_num, are set there.
+            iterator_creator (object): NPA data loader class for train, validation and test data.
+            seed (int, optional): Random seed.
+ """
+
+ self.word2vec_embedding = self._init_embedding(hparams.wordEmb_file)
+ self.hparam = hparams
+
+ super().__init__(hparams, iterator_creator, seed=seed)
+
+ def _get_input_label_from_iter(self, batch_data):
+ input_feat = [
+ batch_data["user_index_batch"],
+ batch_data["clicked_title_batch"],
+ batch_data["candidate_title_batch"],
+ ]
+ input_label = batch_data["labels"]
+ return input_feat, input_label
+
+ def _build_graph(self):
+ """Build NPA model and scorer.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and inference.
+ """
+
+ model, scorer = self._build_npa()
+ return model, scorer
+
+ def _build_userencoder(self, titleencoder, user_embedding_layer):
+ """The main function to create user encoder of NPA.
+
+ Args:
+            titleencoder (object): the news encoder of NPA.
+            user_embedding_layer (object): the user embedding layer.
+
+ Return:
+ object: the user encoder of NPA.
+ """
+ hparams = self.hparams
+
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+ user_indexes = keras.Input(shape=(1,), dtype="int32")
+
+ nuser_id = layers.Reshape((1, 1))(user_indexes)
+ repeat_uids = layers.Concatenate(axis=-2)([nuser_id] * hparams.his_size)
+ his_title_uid = layers.Concatenate(axis=-1)([his_input_title, repeat_uids])
+
+ click_title_presents = layers.TimeDistributed(titleencoder)(his_title_uid)
+
+ u_emb = layers.Reshape((hparams.user_emb_dim,))(
+ user_embedding_layer(user_indexes)
+ )
+ user_present = PersonalizedAttentivePooling(
+ hparams.his_size,
+ hparams.filter_num,
+ hparams.attention_hidden_dim,
+ seed=self.seed,
+ )([click_title_presents, layers.Dense(hparams.attention_hidden_dim)(u_emb)])
+
+ model = keras.Model(
+ [his_input_title, user_indexes], user_present, name="user_encoder"
+ )
+ return model
+
+ def _build_newsencoder(self, embedding_layer, user_embedding_layer):
+ """The main function to create news encoder of NPA.
+
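+        # Additive (soft) attention: score each timestep with
+        # e_t = q^T tanh(W x_t + b), normalize the scores with a (masked)
+        # softmax, and return the attention-weighted sum over timesteps.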
+ Args:
+            embedding_layer (object): a word embedding layer.
+            user_embedding_layer (object): the user embedding layer.
+
+ Return:
+ object: the news encoder of NPA.
+ """
+ hparams = self.hparams
+ sequence_title_uindex = keras.Input(
+ shape=(hparams.title_size + 1,), dtype="int32"
+ )
+
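+        # The packed input is [title tokens | user index]: the first title_size
+        # columns are the title and the final column carries the user id used
+        # for personalized attention.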
+ sequences_input_title = layers.Lambda(lambda x: x[:, : hparams.title_size])(
+ sequence_title_uindex
+ )
+ user_index = layers.Lambda(lambda x: x[:, hparams.title_size :])(
+ sequence_title_uindex
+ )
+
+ u_emb = layers.Reshape((hparams.user_emb_dim,))(
+ user_embedding_layer(user_index)
+ )
+ embedded_sequences_title = embedding_layer(sequences_input_title)
+
+ y = layers.Dropout(hparams.dropout)(embedded_sequences_title)
+ y = layers.Conv1D(
+ hparams.filter_num,
+ hparams.window_size,
+ activation=hparams.cnn_activation,
+ padding="same",
+ bias_initializer=keras.initializers.Zeros(),
+ kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
+ )(y)
+ y = layers.Dropout(hparams.dropout)(y)
+
+ pred_title = PersonalizedAttentivePooling(
+ hparams.title_size,
+ hparams.filter_num,
+ hparams.attention_hidden_dim,
+ seed=self.seed,
+ )([y, layers.Dense(hparams.attention_hidden_dim)(u_emb)])
+
+ # pred_title = Reshape((1, feature_size))(pred_title)
+ model = keras.Model(sequence_title_uindex, pred_title, name="news_encoder")
+ return model
+
+ def _build_npa(self):
+ """The main function to create NPA's logic. The core of NPA
+ is a user encoder and a news encoder.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and predict.
+ """
+ hparams = self.hparams
+
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+ pred_input_title = keras.Input(
+ shape=(hparams.npratio + 1, hparams.title_size), dtype="int32"
+ )
+ pred_input_title_one = keras.Input(
+ shape=(
+ 1,
+ hparams.title_size,
+ ),
+ dtype="int32",
+ )
+ pred_title_one_reshape = layers.Reshape((hparams.title_size,))(
+ pred_input_title_one
+ )
+ user_indexes = keras.Input(shape=(1,), dtype="int32")
+
+ nuser_index = layers.Reshape((1, 1))(user_indexes)
+ repeat_uindex = layers.Concatenate(axis=-2)(
+ [nuser_index] * (hparams.npratio + 1)
+ )
+ pred_title_uindex = layers.Concatenate(axis=-1)(
+ [pred_input_title, repeat_uindex]
+ )
+ pred_title_uindex_one = layers.Concatenate()(
+ [pred_title_one_reshape, user_indexes]
+ )
+
+ embedding_layer = layers.Embedding(
+ self.word2vec_embedding.shape[0],
+ hparams.word_emb_dim,
+ weights=[self.word2vec_embedding],
+ trainable=True,
+ )
+
+ user_embedding_layer = layers.Embedding(
+ len(self.train_iterator.uid2index),
+ hparams.user_emb_dim,
+ trainable=True,
+ embeddings_initializer="zeros",
+ )
+
+ titleencoder = self._build_newsencoder(embedding_layer, user_embedding_layer)
+ userencoder = self._build_userencoder(titleencoder, user_embedding_layer)
+ newsencoder = titleencoder
+
+ user_present = userencoder([his_input_title, user_indexes])
+
+ news_present = layers.TimeDistributed(newsencoder)(pred_title_uindex)
+ news_present_one = newsencoder(pred_title_uindex_one)
+
+ preds = layers.Dot(axes=-1)([news_present, user_present])
+ preds = layers.Activation(activation="softmax")(preds)
+
+ pred_one = layers.Dot(axes=-1)([news_present_one, user_present])
+ pred_one = layers.Activation(activation="sigmoid")(pred_one)
+
+ model = keras.Model([user_indexes, his_input_title, pred_input_title], preds)
+ scorer = keras.Model(
+ [user_indexes, his_input_title, pred_input_title_one], pred_one
+ )
+
+ return model, scorer
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow.keras as keras
+from tensorflow.keras import layers
+
+
+from recommenders.models.newsrec.models.base_model import BaseModel
+from recommenders.models.newsrec.models.layers import AttLayer2, SelfAttention
+
+__all__ = ["NRMSModel"]
+
+
+[docs]class NRMSModel(BaseModel):
+    """NRMS model (Neural News Recommendation with Multi-Head Self-Attention)
+
+ Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News
+ Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference
+ on Empirical Methods in Natural Language Processing and the 9th International Joint Conference
+ on Natural Language Processing (EMNLP-IJCNLP)
+
+ Attributes:
+ word2vec_embedding (numpy.ndarray): Pretrained word embedding matrix.
+ hparam (object): Global hyper-parameters.
+ """
+
+ def __init__(
+ self,
+ hparams,
+ iterator_creator,
+ seed=None,
+ ):
+ """Initialization steps for NRMS.
+        Compared with the BaseModel, NRMS needs a word embedding.
+        After creating the word embedding matrix, BaseModel's __init__ method will be called.
+
+        Args:
+            hparams (object): Global hyper-parameters. Some key settings, such as head_num and head_dim, are set there.
+            iterator_creator (object): NRMS data loader class for train, validation and test data.
+            seed (int, optional): Random seed.
+ """
+ self.word2vec_embedding = self._init_embedding(hparams.wordEmb_file)
+
+ super().__init__(
+ hparams,
+ iterator_creator,
+ seed=seed,
+ )
+
+ def _get_input_label_from_iter(self, batch_data):
+        """Get input and labels for training from the iterator.
+
+        Args:
+            batch_data: input batch data from iterator
+
+ Returns:
+ list: input feature fed into model (clicked_title_batch & candidate_title_batch)
+ numpy.ndarray: labels
+ """
+ input_feat = [
+ batch_data["clicked_title_batch"],
+ batch_data["candidate_title_batch"],
+ ]
+ input_label = batch_data["labels"]
+ return input_feat, input_label
+
+ def _get_user_feature_from_iter(self, batch_data):
+ """get input of user encoder
+ Args:
+ batch_data: input batch data from user iterator
+
+ Returns:
+ numpy.ndarray: input user feature (clicked title batch)
+ """
+ return batch_data["clicked_title_batch"]
+
+ def _get_news_feature_from_iter(self, batch_data):
+ """get input of news encoder
+ Args:
+ batch_data: input batch data from news iterator
+
+ Returns:
+ numpy.ndarray: input news feature (candidate title batch)
+ """
+ return batch_data["candidate_title_batch"]
+
+ def _build_graph(self):
+ """Build NRMS model and scorer.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and inference.
+ """
+ model, scorer = self._build_nrms()
+ return model, scorer
+
+ def _build_userencoder(self, titleencoder):
+ """The main function to create user encoder of NRMS.
+
+ Args:
+ titleencoder (object): the news encoder of NRMS.
+
+ Return:
+ object: the user encoder of NRMS.
+ """
+ hparams = self.hparams
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+
+ click_title_presents = layers.TimeDistributed(titleencoder)(his_input_title)
+ y = SelfAttention(hparams.head_num, hparams.head_dim, seed=self.seed)(
+ [click_title_presents] * 3
+ )
+ user_present = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
+
+ model = keras.Model(his_input_title, user_present, name="user_encoder")
+ return model
+
+ def _build_newsencoder(self, embedding_layer):
+ """The main function to create news encoder of NRMS.
+
+ Args:
+ embedding_layer (object): a word embedding layer.
+
+ Return:
+ object: the news encoder of NRMS.
+ """
+ hparams = self.hparams
+ sequences_input_title = keras.Input(shape=(hparams.title_size,), dtype="int32")
+
+ embedded_sequences_title = embedding_layer(sequences_input_title)
+
+ y = layers.Dropout(hparams.dropout)(embedded_sequences_title)
+ y = SelfAttention(hparams.head_num, hparams.head_dim, seed=self.seed)([y, y, y])
+ y = layers.Dropout(hparams.dropout)(y)
+ pred_title = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
+
+ model = keras.Model(sequences_input_title, pred_title, name="news_encoder")
+ return model
+
+ def _build_nrms(self):
+ """The main function to create NRMS's logic. The core of NRMS
+ is a user encoder and a news encoder.
+
+ Returns:
+ object: a model used to train.
+ object: a model used to evaluate and inference.
+ """
+ hparams = self.hparams
+
+ his_input_title = keras.Input(
+ shape=(hparams.his_size, hparams.title_size), dtype="int32"
+ )
+ pred_input_title = keras.Input(
+ shape=(hparams.npratio + 1, hparams.title_size), dtype="int32"
+ )
+ pred_input_title_one = keras.Input(
+ shape=(
+ 1,
+ hparams.title_size,
+ ),
+ dtype="int32",
+ )
+ pred_title_one_reshape = layers.Reshape((hparams.title_size,))(
+ pred_input_title_one
+ )
+
+ embedding_layer = layers.Embedding(
+ self.word2vec_embedding.shape[0],
+ hparams.word_emb_dim,
+ weights=[self.word2vec_embedding],
+ trainable=True,
+ )
+
+ titleencoder = self._build_newsencoder(embedding_layer)
+ self.userencoder = self._build_userencoder(titleencoder)
+ self.newsencoder = titleencoder
+
+ user_present = self.userencoder(his_input_title)
+ news_present = layers.TimeDistributed(self.newsencoder)(pred_input_title)
+ news_present_one = self.newsencoder(pred_title_one_reshape)
+
+ preds = layers.Dot(axes=-1)([news_present, user_present])
+ preds = layers.Activation(activation="softmax")(preds)
+
+ pred_one = layers.Dot(axes=-1)([news_present_one, user_present])
+ pred_one = layers.Activation(activation="sigmoid")(pred_one)
+
+ model = keras.Model([his_input_title, pred_input_title], preds)
+ scorer = keras.Model([his_input_title, pred_input_title_one], pred_one)
+
+ return model, scorer
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+from recommenders.models.deeprec.deeprec_utils import (
+ flat_config,
+ HParams,
+ load_yaml,
+)
+import random
+import re
+
+
+[docs]def check_type(config):
+ """Check that the config parameters are the correct type
+
+ Args:
+ config (dict): Configuration dictionary.
+
+ Raises:
+ TypeError: If the parameters are not the correct type.
+ """
+
+ int_parameters = [
+ "word_size",
+ "his_size",
+ "title_size",
+ "body_size",
+ "npratio",
+ "word_emb_dim",
+ "attention_hidden_dim",
+ "epochs",
+ "batch_size",
+ "show_step",
+ "save_epoch",
+ "head_num",
+ "head_dim",
+ "user_num",
+ "filter_num",
+ "window_size",
+ "gru_unit",
+ "user_emb_dim",
+ "vert_emb_dim",
+ "subvert_emb_dim",
+ ]
+ for param in int_parameters:
+ if param in config and not isinstance(config[param], int):
+ raise TypeError("Parameters {0} must be int".format(param))
+
+ float_parameters = ["learning_rate", "dropout"]
+ for param in float_parameters:
+ if param in config and not isinstance(config[param], float):
+ raise TypeError("Parameters {0} must be float".format(param))
+
+ str_parameters = [
+ "wordEmb_file",
+ "wordDict_file",
+ "userDict_file",
+ "vertDict_file",
+ "subvertDict_file",
+ "method",
+ "loss",
+ "optimizer",
+ "cnn_activation",
+        "dense_activation",
+        "type",
+ ]
+ for param in str_parameters:
+ if param in config and not isinstance(config[param], str):
+ raise TypeError("Parameters {0} must be str".format(param))
+
+ list_parameters = ["layer_sizes", "activation"]
+ for param in list_parameters:
+ if param in config and not isinstance(config[param], list):
+ raise TypeError("Parameters {0} must be list".format(param))
+
+ bool_parameters = ["support_quick_scoring"]
+ for param in bool_parameters:
+ if param in config and not isinstance(config[param], bool):
+ raise TypeError("Parameters {0} must be bool".format(param))
+
+
+[docs]def check_nn_config(f_config):
+ """Check neural networks configuration.
+
+ Args:
+ f_config (dict): Neural network configuration.
+
+ Raises:
+ ValueError: If the parameters are not correct.
+ """
+
+ if f_config["model_type"] in ["nrms", "NRMS"]:
+ required_parameters = [
+ "title_size",
+ "his_size",
+ "wordEmb_file",
+ "wordDict_file",
+ "userDict_file",
+ "npratio",
+ "data_format",
+ "word_emb_dim",
+ # nrms
+ "head_num",
+ "head_dim",
+ # attention
+ "attention_hidden_dim",
+ "loss",
+ "data_format",
+ "dropout",
+ ]
+
+ elif f_config["model_type"] in ["naml", "NAML"]:
+ required_parameters = [
+ "title_size",
+ "body_size",
+ "his_size",
+ "wordEmb_file",
+ "subvertDict_file",
+ "vertDict_file",
+ "wordDict_file",
+ "userDict_file",
+ "npratio",
+ "data_format",
+ "word_emb_dim",
+ "vert_emb_dim",
+ "subvert_emb_dim",
+ # naml
+ "filter_num",
+ "cnn_activation",
+ "window_size",
+ "dense_activation",
+ # attention
+ "attention_hidden_dim",
+ "loss",
+ "data_format",
+ "dropout",
+ ]
+ elif f_config["model_type"] in ["lstur", "LSTUR"]:
+ required_parameters = [
+ "title_size",
+ "his_size",
+ "wordEmb_file",
+ "wordDict_file",
+ "userDict_file",
+ "npratio",
+ "data_format",
+ "word_emb_dim",
+ # lstur
+ "gru_unit",
+ "type",
+ "filter_num",
+ "cnn_activation",
+ "window_size",
+ # attention
+ "attention_hidden_dim",
+ "loss",
+ "data_format",
+ "dropout",
+ ]
+ elif f_config["model_type"] in ["npa", "NPA"]:
+ required_parameters = [
+ "title_size",
+ "his_size",
+ "wordEmb_file",
+ "wordDict_file",
+ "userDict_file",
+ "npratio",
+ "data_format",
+ "word_emb_dim",
+ # npa
+ "user_emb_dim",
+ "filter_num",
+ "cnn_activation",
+ "window_size",
+ # attention
+ "attention_hidden_dim",
+ "loss",
+ "data_format",
+ "dropout",
+ ]
+ else:
+ required_parameters = []
+
+ # check required parameters
+ for param in required_parameters:
+ if param not in f_config:
+ raise ValueError("Parameters {0} must be set".format(param))
+
+ if f_config["model_type"] in ["nrms", "NRMS", "lstur", "LSTUR"]:
+ if f_config["data_format"] != "news":
+ raise ValueError(
+                "For nrms and lstur models, data format must be 'news', but yours is set to {0}".format(
+ f_config["data_format"]
+ )
+ )
+ elif f_config["model_type"] in ["naml", "NAML"]:
+ if f_config["data_format"] != "naml":
+ raise ValueError(
+                "For naml models, data format must be 'naml', but yours is set to {0}".format(
+ f_config["data_format"]
+ )
+ )
+
+ check_type(f_config)
+
+
+[docs]def create_hparams(flags):
+ """Create the model hyperparameters.
+
+ Args:
+ flags (dict): Dictionary with the model requirements.
+
+ Returns:
+ HParams: Hyperparameter object.
+ """
+ init_dict = {
+ # data
+ "support_quick_scoring": False,
+ # models
+ "dropout": 0.0,
+ "attention_hidden_dim": 200,
+ # nrms
+ "head_num": 4,
+ "head_dim": 100,
+ # naml
+ "filter_num": 200,
+ "window_size": 3,
+ "vert_emb_dim": 100,
+ "subvert_emb_dim": 100,
+ # lstur
+ "gru_unit": 400,
+ "type": "ini",
+ # npa
+ "user_emb_dim": 50,
+ # train
+ "learning_rate": 0.001,
+ "optimizer": "adam",
+ "epochs": 10,
+ "batch_size": 1,
+ # show info
+ "show_step": 1,
+ }
+ init_dict.update(flags)
+ return HParams(init_dict)
+
+
+[docs]def prepare_hparams(yaml_file=None, **kwargs):
+ """Prepare the model hyperparameters and check that all have the correct value.
+
+ Args:
+ yaml_file (str): YAML file as configuration.
+
+ Returns:
+ HParams: Hyperparameter object.
+ """
+ if yaml_file is not None:
+ config = load_yaml(yaml_file)
+ config = flat_config(config)
+ else:
+ config = {}
+
+ config.update(kwargs)
+
+ check_nn_config(config)
+ return create_hparams(config)
+
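+# Illustrative sketch (not part of the library): a typical call site loads a
+# model yaml and overrides a few values, e.g.
+#
+#     hparams = prepare_hparams("nrms.yaml",
+#                               wordEmb_file="embedding.npy",
+#                               wordDict_file="word_dict.pkl",
+#                               userDict_file="uid2index.pkl",
+#                               epochs=5)
+#
+# File names above are placeholders; any keyword argument overrides the value
+# read from the yaml before check_nn_config validates the final configuration.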
+
+[docs]def word_tokenize(sent):
+ """Split sentence into word list using regex.
+ Args:
+ sent (str): Input sentence
+
+ Return:
+ list: word list
+ """
+ pat = re.compile(r"[\w]+|[.,!?;|]")
+ if isinstance(sent, str):
+ return pat.findall(sent.lower())
+ else:
+ return []
+
+
+[docs]def newsample(news, ratio):
+ """Sample ratio samples from news list.
+    If the length of news is less than ratio, pad with zeros.
+
+ Args:
+ news (list): input news list
+ ratio (int): sample number
+
+ Returns:
+ list: output of sample list.
+ """
+ if ratio > len(news):
+ return news + [0] * (ratio - len(news))
+ else:
+ return random.sample(news, ratio)
+
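+# Illustrative sketch (not part of the library): these helpers are typically
+# used when building training samples from MIND behavior logs, e.g.
+#
+#     word_tokenize("Breaking News: markets rally!")  # -> ['breaking', 'news', 'markets', 'rally', '!']
+#     newsample([12, 7, 33], ratio=5)                 # -> the 3 ids padded with two 0s
+#
+# The inputs above are made up; newsample pads with 0 when fewer candidates
+# than `ratio` are available, otherwise it samples without replacement.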
+
+[docs]def get_mind_data_set(type):
+ """Get MIND dataset address
+
+ Args:
+ type (str): type of mind dataset, must be in ['large', 'small', 'demo']
+
+ Returns:
+ list: data url and train valid dataset name
+ """
+ assert type in ["large", "small", "demo"]
+
+ if type == "large":
+ return (
+ "https://mind201910small.blob.core.windows.net/release/",
+ "MINDlarge_train.zip",
+ "MINDlarge_dev.zip",
+ "MINDlarge_utils.zip",
+ )
+
+ elif type == "small":
+ return (
+ "https://mind201910small.blob.core.windows.net/release/",
+ "MINDsmall_train.zip",
+ "MINDsmall_dev.zip",
+ "MINDsmall_utils.zip",
+ )
+
+ elif type == "demo":
+ return (
+ "https://recodatasets.z20.web.core.windows.net/newsrec/",
+ "MINDdemo_train.zip",
+ "MINDdemo_dev.zip",
+ "MINDdemo_utils.zip",
+ )
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import tensorflow as tf
+import logging
+import os
+from pathlib import Path
+
+tf.compat.v1.disable_eager_execution()
+log = logging.getLogger(__name__)
+
+
+[docs]class RBM:
+ """Restricted Boltzmann Machine"""
+
+ def __init__(
+ self,
+ possible_ratings,
+ visible_units,
+ hidden_units=500,
+ keep_prob=0.7,
+ init_stdv=0.1,
+ learning_rate=0.004,
+ minibatch_size=100,
+ training_epoch=20,
+ display_epoch=10,
+ sampling_protocol=[50, 70, 80, 90, 100],
+ debug=False,
+ with_metrics=False,
+ seed=42,
+ ):
+ """Implementation of a multinomial Restricted Boltzmann Machine for collaborative filtering
+ in numpy/pandas/tensorflow
+
+ Based on the article by Ruslan Salakhutdinov, Andriy Mnih and Geoffrey Hinton
+ https://www.cs.toronto.edu/~rsalakhu/papers/rbmcf.pdf
+
+ In this implementation we use multinomial units instead of the one-hot-encoded used in
+ the paper. This means that the weights are rank 2 (matrices) instead of rank 3 tensors.
+
+ Basic mechanics:
+
+ 1) A computational graph is created when the RBM class is instantiated.
+ For an item based recommender this consists of:
+ visible units: The number n_visible of visible units equals the number of items
+ hidden units : hyperparameter to fix during training
+
+ 2) Gibbs Sampling:
+
+ 2.1) for each training epoch, the visible units are first clamped on the data
+
+ 2.2) The activation probability of the hidden units, given a linear combination of
+            the visible units, is evaluated as P(h=1|phi_v). The latter is then used to sample the
+ value of the hidden units.
+
+ 2.3) The probability P(v=l|phi_h) is evaluated, where l=1,..,r are the ratings (e.g.
+ r=5 for the movielens dataset). In general, this is a multinomial distribution,
+ from which we sample the value of v.
+
+ 2.4) This step is repeated k times, where k increases as optimization converges. It is
+            essential to keep the originally unrated items fixed at zero during the whole learning process.
+
+ 3) Optimization:
+ The free energy of the visible units given the hidden is evaluated at the beginning (F_0)
+ and after k steps of Bernoulli sampling (F_k). The weights and biases are updated by
+            minimizing the difference F_0 - F_k.
+
+ 4) Inference:
+ Once the joint probability distribution P(v,h) is learned, this is used to generate ratings
+ for unrated items for all users
+ """
+
+ # RBM parameters
+ self.n_hidden = hidden_units # number of hidden units
+ self.keep = keep_prob # keep probability for dropout regularization
+
+ # standard deviation used to initialize the weights matrices
+ self.stdv = init_stdv
+
+ # learning rate used in the update method of the optimizer
+ self.learning_rate = learning_rate
+
+ # size of the minibatch used in the random minibatches training; setting to 1 corresponds to
+ # stochastic gradient descent, and it is considerably slower. Good performance is achieved
+ # for a size of ~100.
+ self.minibatch = minibatch_size
+ self.epochs = training_epoch + 1 # number of epochs used to train the model
+
+ # number of epochs to show the mse error during training
+ self.display_epoch = display_epoch
+
+ # protocol to increase Gibbs sampling's step. Array containing the
+ # percentage of the total training epoch when the step increases by 1
+ self.sampling_protocol = sampling_protocol
+
+        # if true, functions print their control parameters and/or outputs
+ self.debug = debug
+
+ # if true, compute msre and accuracy during training
+ self.with_metrics = with_metrics
+
+ # Seed
+ self.seed = seed
+ np.random.seed(self.seed)
+ tf.compat.v1.set_random_seed(self.seed)
+
+ self.n_visible = visible_units # number of items
+
+ tf.compat.v1.reset_default_graph()
+
+ # ----------------------Initializers-------------------------------------
+
+ # create a sorted list of all the unique ratings (of float type)
+ self.possible_ratings = possible_ratings
+
+ # create a lookup table to map integer indices to float ratings
+ self.ratings_lookup_table = tf.lookup.StaticHashTable(
+ tf.lookup.KeyValueTensorInitializer(
+ tf.constant(list(range(len(self.possible_ratings))), dtype=tf.int32),
+ tf.constant(list(self.possible_ratings), dtype=tf.float32),
+ ),
+ default_value=0,
+ )
+
+ self.generate_graph()
+ self.init_metrics()
+ self.init_gpu()
+ init_graph = tf.compat.v1.global_variables_initializer()
+
+ # Start TF training session on default graph
+ self.sess = tf.compat.v1.Session(config=self.config_gpu)
+ self.sess.run(init_graph)
+
+[docs] def binomial_sampling(self, pr):
+ """Binomial sampling of hidden units activations using a rejection method.
+
+ Basic mechanics:
+
+ 1) Extract a random number from a uniform distribution (g) and compare it with
+ the unit's probability (pr)
+
+        2) Choose 0 if pr<g, 1 otherwise. It is convenient to implement this condition using
+ the relu function.
+
+ Args:
+ pr (tf.Tensor, float32): Input conditional probability.
+ g (numpy.ndarray, float32): Uniform probability used for comparison.
+
+ Returns:
+ tf.Tensor: Float32 tensor of sampled units. The value is 1 if pr>g and 0 otherwise.
+ """
+
+ # sample from a Bernoulli distribution with same dimensions as input distribution
+ g = tf.convert_to_tensor(
+ value=np.random.uniform(size=pr.shape[1]), dtype=tf.float32
+ )
+
+ # sample the value of the hidden units
+ h_sampled = tf.nn.relu(tf.sign(pr - g))
+
+ return h_sampled
+
+[docs] def multinomial_sampling(self, pr):
+ """Multinomial Sampling of ratings
+
+ Basic mechanics:
+ For r classes, we sample r binomial distributions using the rejection method. This is possible
+ since each class is statistically independent from the other. Note that this is the same method
+ used in numpy's random.multinomial() function.
+
+ 1) extract a size r array of random numbers from a uniform distribution (g). As pr is normalized,
+ we need to normalize g as well.
+
+ 2) For each user and item, compare pr with the reference distribution. Note that the latter needs
+ to be the same for ALL the user/item pairs in the dataset, as by assumptions they are sampled
+ from a common distribution.
+
+ Args:
+            pr (tf.Tensor, float32): A distribution of shape (m, n, r), where m is the number of examples, n the number
+ of features and r the number of classes. pr needs to be normalized, i.e. sum_k p(k) = 1 for all m, at fixed n.
+ f (tf.Tensor, float32): Normalized, uniform probability used for comparison.
+
+ Returns:
+            tf.Tensor: An (m,n) float32 tensor of sampled ratings from 1 to r.
+ """
+ g = np.random.uniform(size=pr.shape[2]) # sample from a uniform distribution
+ f = tf.convert_to_tensor(
+ value=g / g.sum(), dtype=tf.float32
+ ) # normalize and convert to tensor
+
+ samp = tf.nn.relu(tf.sign(pr - f)) # apply rejection method
+
+ # get integer index of the rating to be sampled
+ v_argmax = tf.cast(tf.argmax(input=samp, axis=2), "int32")
+
+ # lookup the rating using integer index
+ v_samp = tf.cast(self.ratings_lookup_table.lookup(v_argmax), "float32")
+
+ return v_samp
+
+[docs] def multinomial_distribution(self, phi):
+ """Probability that unit v has value l given phi: P(v=l|phi)
+
+ Args:
+ phi (tf.Tensor): linear combination of values of the previous layer
+ r (float): rating scale, corresponding to the number of classes
+
+ Returns:
+ tf.Tensor:
+ - A tensor of shape (r, m, Nv): This needs to be reshaped as (m, Nv, r) in the last step to allow for faster sampling when used in the multinomial function.
+
+ """
+
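+        # Softmax over the possible rating values:
+        # P(v = l | phi) = exp(l * phi) / sum_k exp(k * phi), evaluated for
+        # every rating l in self.possible_ratings.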
+ numerator = [
+ tf.exp(tf.multiply(tf.constant(k, dtype="float32"), phi))
+ for k in self.possible_ratings
+ ]
+
+ denominator = tf.reduce_sum(input_tensor=numerator, axis=0)
+
+ prob = tf.compat.v1.div(numerator, denominator)
+
+ return tf.transpose(a=prob, perm=[1, 2, 0])
+
+[docs] def free_energy(self, x):
+ """Free energy of the visible units given the hidden units. Since the sum is over the hidden units'
+        states, the functional form of the visible units' free energy is the same as the one for the binary model.
+
+ Args:
+ x (tf.Tensor): This can be either the sampled value of the visible units (v_k) or the input data
+
+ Returns:
+ tf.Tensor: Free energy of the model.
+ """
+
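+        # F(v) = - v . bv^T  -  sum_j softplus(v . W + bh)_j
+        # (the hidden units have been summed out analytically, hence the softplus).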
+ bias = -tf.reduce_sum(input_tensor=tf.matmul(x, tf.transpose(a=self.bv)))
+
+ phi_x = tf.matmul(x, self.w) + self.bh
+ f = -tf.reduce_sum(input_tensor=tf.nn.softplus(phi_x))
+
+ F = bias + f # free energy density per training example
+
+ return F
+
+[docs] def placeholder(self):
+ """Initialize the placeholders for the visible units"""
+ self.vu = tf.compat.v1.placeholder(
+ shape=[None, self.n_visible], dtype="float32"
+ )
+
+[docs] def init_parameters(self):
+ """Initialize the parameters of the model.
+
+ This is a single layer model with two biases. So we have a rectangular matrix w_{ij} and
+ two bias vectors to initialize.
+
+ Args:
+ n_visible (int): number of visible units (input layer)
+ n_hidden (int): number of hidden units (latent variables of the model)
+
+ Returns:
+ tf.Tensor, tf.Tensor, tf.Tensor:
+ - `w` of size (n_visible, n_hidden): correlation matrix initialized by sampling from a normal distribution with zero mean and given variance init_stdv.
+ - `bv` of size (1, n_visible): visible units' bias, initialized to zero.
+            - `bh` of size (1, n_hidden): hidden units' bias, initialized to zero.
+ """
+ with tf.compat.v1.variable_scope("Network_parameters"):
+
+ self.w = tf.compat.v1.get_variable(
+ "weight",
+ [self.n_visible, self.n_hidden],
+ initializer=tf.compat.v1.random_normal_initializer(
+ stddev=self.stdv, seed=self.seed
+ ),
+ dtype="float32",
+ )
+
+ self.bv = tf.compat.v1.get_variable(
+ "v_bias",
+ [1, self.n_visible],
+ initializer=tf.compat.v1.zeros_initializer(),
+ dtype="float32",
+ )
+
+ self.bh = tf.compat.v1.get_variable(
+ "h_bias",
+ [1, self.n_hidden],
+ initializer=tf.compat.v1.zeros_initializer(),
+ dtype="float32",
+ )
+
+
+
+[docs] def sample_visible_units(self, h):
+ """Sample the visible units given the hiddens. This can be thought of as a Backward pass in a FFN
+        (negative phase). Each visible unit can take values in [1, rating], while zero is reserved
+        for missing data; as such, the value of the visible unit is sampled from a multinomial distribution.
+
+ Basic mechanics:
+
+        1) For every training example we first sample Nv multinomial distributions. The result is of the
+        form [0,1,0,0,0,...,0], where the index of the 1 element corresponds to the rth rating. The index
+        is extracted using the argmax function and we need to add 1 at the end since array indices start
+        from 0.
+
+        2) Select only those units that have been sampled. During the training phase it is important not to
+        use the reconstructed inputs, so we need to enforce a zero value in the reconstructed ratings in
+        the same position as the original input.
+
+ Args:
+            h (tf.Tensor, float32): hidden units.
+
+ Returns:
+ tf.Tensor, tf.Tensor:
+ - `pvh`: The activation probability of the visible unit given the hidden.
+ - `v_`: The sampled value of the visible unit from a Multinomial distributions having success probability `pvh`.
+ """
+
+ with tf.compat.v1.name_scope("sample_visible_units"):
+
+ phi_h = tf.matmul(h, tf.transpose(a=self.w)) + self.bv # linear combination
+ pvh = self.multinomial_distribution(
+ phi_h
+ ) # conditional probability of v given h
+
+ # Sampling (modify here )
+ v_tmp = self.multinomial_sampling(
+ pvh
+ ) # sample the value of the visible units
+
+ mask = tf.equal(self.v, 0) # selects the inactive units in the input vector
+
+ v_ = tf.compat.v1.where(
+ mask, x=self.v, y=v_tmp
+ ) # enforce inactive units in the reconstructed vector
+
+ return pvh, v_
+
+[docs] def gibbs_sampling(self):
+ """Gibbs sampling: Determines an estimate of the model configuration via sampling. In the binary
+ RBM we need to impose that unseen movies stay as such, i.e. the sampling phase should not modify
+ the elements where v=0.
+
+ Args:
+ k (scalar, integer): iterator. Number of sampling steps.
+ v (tf.Tensor, float32): visible units.
+
+ Returns:
+ tf.Tensor, tf.Tensor:
+ - `h_k`: The sampled value of the hidden unit at step k, float32.
+ - `v_k`: The sampled value of the visible unit at step k, float32.
+ """
+
+ with tf.compat.v1.name_scope("gibbs_sampling"):
+
+ self.v_k = (
+ self.v
+ ) # initialize the value of the visible units at step k=0 on the data
+
+ if self.debug:
+ print("CD step", self.k)
+
+ for i in range(self.k): # k_sampling
+ _, h_k = self.sample_hidden_units(self.v_k)
+ _, self.v_k = self.sample_visible_units(h_k)
+
+[docs] def losses(self, vv):
+ """Calculate contrastive divergence, which is the difference between
+ the free energy clamped on the data (v) and the model free energy (v_k).
+
+ Args:
+ vv (tf.Tensor, float32): empirical input
+
+ Returns:
+ obj: contrastive divergence
+ """
+
+ with tf.compat.v1.variable_scope("losses"):
+ obj = self.free_energy(vv) - self.free_energy(self.v_k)
+
+ return obj
+
+[docs] def gibbs_protocol(self, i):
+ """Gibbs protocol.
+
+ Basic mechanics:
+
+ If the current epoch i is in the interval specified in the training protocol,
+ the number of steps in Gibbs sampling (k) is incremented by one and gibbs_sampling is updated
+ accordingly.
+
+ Args:
+ i (int): Current epoch in the loop
+ """
+
+ with tf.compat.v1.name_scope("gibbs_protocol"):
+
+ epoch_percentage = (
+ i / self.epochs
+ ) * 100 # current percentage of the total #epochs
+
+ if epoch_percentage != 0:
+ if (
+ epoch_percentage >= self.sampling_protocol[self.l]
+ and epoch_percentage <= self.sampling_protocol[self.l + 1]
+ ):
+ self.k += 1
+ self.l += 1 # noqa: E741 ambiguous variable name 'l'
+ self.gibbs_sampling()
+
+ if self.debug:
+ log.info("percentage of epochs covered so far %f2" % (epoch_percentage))
+
+[docs] def data_pipeline(self):
+ """Define the data pipeline"""
+
+ # placeholder for the batch_size
+ self.batch_size = tf.compat.v1.placeholder(tf.int64)
+
+ # Create the data pipeline for faster training
+ self.dataset = tf.data.Dataset.from_tensor_slices(self.vu)
+
+ self.dataset = self.dataset.shuffle(
+ buffer_size=50, reshuffle_each_iteration=True, seed=self.seed
+ ) # randomize the batch
+
+ self.dataset = self.dataset.batch(batch_size=self.batch_size).repeat()
+
+ # define iterator
+ self.iter = tf.compat.v1.data.make_initializable_iterator(self.dataset)
+ self.v = self.iter.get_next()
+
+[docs] def init_metrics(self):
+ """Initialize metrics"""
+
+ if self.with_metrics: # if true (default) returns evaluation metrics
+ self.rmse = tf.sqrt(
+ tf.compat.v1.losses.mean_squared_error(
+ self.v, self.v_k, weights=tf.where(self.v > 0, 1, 0)
+ )
+ )
+
+[docs] def generate_graph(self):
+ """Call the different RBM modules to generate the computational graph"""
+
+ log.info("Creating the computational graph")
+
+ self.placeholder() # create the visible units placeholder
+ self.data_pipeline() # data_pipeline
+ self.init_parameters() # initialize Network parameters
+
+ # --------------Initialize protocol for Gibbs sampling------------------
+ log.info("Initialize Gibbs protocol")
+ self.k = 1 # initialize the G_sampling step
+ # initialize epoch_sample index
+ self.l = 0 # noqa: E741 ambiguous variable name 'l'
+ self.gibbs_sampling() # returns the sampled value of the visible units
+
+ # ---Instantiate loss function and optimizer----------------------------
+ obj = self.losses(self.v) # objective function
+
+ rate = (
+ self.learning_rate / self.minibatch
+ ) # learning rate rescaled by the batch size
+
+ self.opt = tf.compat.v1.train.AdamOptimizer(learning_rate=rate).minimize(
+ loss=obj
+ ) # Instantiate the optimizer
+
+[docs] def init_gpu(self):
+ """Config GPU memory"""
+
+ self.config_gpu = tf.compat.v1.ConfigProto(
+ log_device_placement=False, allow_soft_placement=True
+ )
+ self.config_gpu.gpu_options.allow_growth = True # dynamic memory allocation
+
+[docs] def init_training_session(self, xtr):
+ """Initialize the TF session on training data
+
+ Args:
+ xtr (numpy.ndarray, int32): The user/affinity matrix for the train set.
+ """
+
+ self.sess.run(
+ self.iter.initializer,
+ feed_dict={self.vu: xtr, self.batch_size: self.minibatch},
+ )
+
+ self.sess.run(tf.compat.v1.tables_initializer())
+
+[docs] def batch_training(self, num_minibatches):
+ """Perform training over input minibatches. If `self.with_metrics` is False,
+ no online metrics are evaluated.
+
+ Args:
+ num_minibatches (scalar, int32): Number of training minibatches.
+
+ Returns:
+ float: Training error per single epoch. If `self.with_metrics` is False, this is zero.
+ """
+
+ epoch_tr_err = 0 # initialize the training error for each epoch to zero
+
+ # minibatch loop
+ for _ in range(num_minibatches):
+
+ if self.with_metrics:
+ _, batch_err = self.sess.run([self.opt, self.rmse])
+
+ # accumulate the average rmse per minibatch
+ epoch_tr_err += batch_err / num_minibatches
+
+ else:
+ _ = self.sess.run(self.opt)
+
+ return epoch_tr_err
+
+[docs] def fit(self, xtr):
+ """Fit method
+
+ Training in generative models takes place in two steps:
+
+ 1) Gibbs sampling
+ 2) Gradient evaluation and parameters update
+
+ The configuration estimated in step 1 is then used in the weight update step, which minimizes the
+ distance between the model free energy and the empirical free energy. Note that while the unit's
+ configuration space is sampled, the weights are determined via maximum likelihood (saddle point).
+
+ This is the main method of the algorithm: it generates the computational graph and performs model training.
+
+ Args:
+ xtr (numpy.ndarray, integers): the user/affinity matrix for the train set
+ """
+
+ # keep the position of the items in the train set so that they can be optionally excluded from recommendation
+ self.seen_mask = np.not_equal(xtr, 0)
+
+ n_users = xtr.shape[0]
+ num_minibatches = int(n_users / self.minibatch) # number of minibatches
+
+ self.init_training_session(xtr)
+
+ rmse_train = [] # List to collect the metrics across epochs
+
+ # start loop over training epochs
+ for i in range(self.epochs):
+
+ self.gibbs_protocol(i) # Gibbs sampling update
+ epoch_tr_err = self.batch_training(num_minibatches) # model train
+
+ if self.with_metrics and i % self.display_epoch == 0:
+ log.info("training epoch %i rmse %f" % (i, epoch_tr_err))
+
+ rmse_train.append(epoch_tr_err) # rmse training error per training epoch
+
+ self.rmse_train = rmse_train
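+
+ # Illustrative usage sketch (assumptions: the surrounding class is the RBM model exposed by the
+ # recommenders package and `Xtr` is an int32 user/item affinity matrix; constructor arguments are
+ # omitted here because they are defined elsewhere in this class):
+ #
+ #   model = RBM(...)                                 # see the class __init__ for the parameters
+ #   model.fit(Xtr)                                   # per-epoch rmse is kept in model.rmse_train
+ #   scores = model.recommend_k_items(Xtr, top_k=10)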
+
+[docs] def eval_out(self):
+ """Implement multinomial sampling from a trained model"""
+
+ # Sampling
+ _, h = self.sample_hidden_units(self.vu) # sample h
+
+ # sample v
+ phi_h = (
+ tf.transpose(a=tf.matmul(self.w, tf.transpose(a=h))) + self.bv
+ ) # linear combination
+ pvh = self.multinomial_distribution(
+ phi_h
+ ) # conditional probability of v given h
+
+ v = self.multinomial_sampling(pvh) # sample the value of the visible units
+
+ return v, pvh
+
+[docs] def recommend_k_items(self, x, top_k=10, remove_seen=True):
+ """Returns the top-k items ordered by a relevancy score.
+
+ Basic mechanics:
+
+ The method samples new ratings from the learned joint distribution, together with their
+ probabilities. The input x must have the same number of columns as the one used for training
+ the model (i.e. the same number of items) but it can have an arbitrary number of rows (users).
+
+ A recommendation score is evaluated by taking the element-wise product between the ratings and
+ the associated probabilities. For example, we could have the following situation:
+
+ .. code-block:: python
+
+ rating probability score
+ item1 5 0.5 2.5
+ item2 4 0.8 3.2
+
+ then item2 will be recommended.
+
+ Args:
+ x (numpy.ndarray, int32): input user/affinity matrix. Note that this can be a single vector, i.e. the ratings
+ of a single user.
+ top_k (scalar, int32): the number of items to recommend.
+ remove_seen (bool): if True, items seen by the user in the train set are excluded from the recommendations.
+
+ Returns:
+ numpy.ndarray, float32: A matrix of the same shape as `x` holding the scores of the top_k recommended items, with zeros everywhere else.
+ """
+
+ # evaluate the ratings and the associated probabilities
+ v_, pvh_ = self.eval_out()
+
+ # evaluate v_ and pvh_ on the input data
+ vp, pvh = self.sess.run([v_, pvh_], feed_dict={self.vu: x})
+ # returns only the probabilities for the predicted ratings in vp
+ pv = np.max(pvh, axis=2)
+
+ # evaluate the score
+ score = np.multiply(vp, pv)
+ # ----------------------Return the results as a P dataframe------------------------------------
+
+ log.info("Extracting top %i elements" % top_k)
+
+ if remove_seen:
+ # if true, it removes items from the train set by setting them to zero
+ vp[self.seen_mask] = 0
+ pv[self.seen_mask] = 0
+ score[self.seen_mask] = 0
+
+ top_items = np.argpartition(-score, range(top_k), axis=1)[
+ :, :top_k
+ ] # get the top k items
+
+ score_c = score.copy() # get a copy of the score matrix
+
+ score_c[
+ np.arange(score_c.shape[0])[:, None], top_items
+ ] = 0 # set to zero the top_k elements
+
+ top_scores = score - score_c # zero out all elements other than the top_k
+
+ return top_scores
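+
+ # Illustrative sketch (not part of the model): the scoring and top-k extraction above with plain
+ # numpy, using the made-up numbers from the docstring example (one user, two items, top_k=1):
+ #
+ #   import numpy as np
+ #   vp = np.array([[5, 4]])            # sampled ratings
+ #   pv = np.array([[0.5, 0.8]])        # associated probabilities
+ #   score = vp * pv                    # -> [[2.5, 3.2]]
+ #   top_items = np.argpartition(-score, range(1), axis=1)[:, :1]   # -> [[1]], i.e. item2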
+
+[docs] def predict(self, x):
+ """Returns the inferred ratings. This method is similar to recommend_k_items() with the
+ exceptions that it returns all the inferred ratings
+
+ Basic mechanics:
+
+ The method samples new ratings from the learned joint distribution, together with
+ their probabilities. The input x must have the same number of columns as the one used
+ for training the model, i.e. the same number of items, but it can have an arbitrary number
+ of rows (users).
+
+ Args:
+ x (numpy.ndarray, int32): Input user/affinity matrix. Note that this can be a single vector, i.e.
+ the ratings of a single user.
+
+ Returns:
+ numpy.ndarray, float32: A matrix with the inferred ratings.
+ """
+
+ v_, _ = self.eval_out() # evaluate the ratings and the associated probabilities
+ vp = self.sess.run(v_, feed_dict={self.vu: x})
+
+ return vp
+
+[docs] def save(self, file_path="./rbm_model.ckpt"):
+ """Save model parameters to `file_path`
+
+ This function saves the current tensorflow session to a specified path.
+
+ Args:
+ file_path (str): output file path for the RBM model checkpoint;
+ the parent directory is created if it does not exist.
+ """
+
+ f_path = Path(file_path)
+ dir_name, file_name = f_path.parent, f_path.name
+
+ # create the directory if it does not exist
+ os.makedirs(dir_name, exist_ok=True)
+
+ # save trained model
+ saver = tf.compat.v1.train.Saver()
+ saver.save(self.sess, os.path.join(dir_name, file_name))
+
+[docs] def load(self, file_path="./rbm_model.ckpt"):
+ """Load model parameters for further use.
+
+ This function loads a saved tensorflow session.
+
+ Args:
+ file_path (str): file path for RBM model checkpoint
+ """
+
+ f_path = Path(file_path)
+ dir_name, file_name = f_path.parent, f_path.name
+
+ # load pre-trained model
+ saver = tf.compat.v1.train.Saver()
+ saver.restore(self.sess, os.path.join(dir_name, file_name))
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+from scipy.sparse import csr_matrix
+
+from recommenders.utils.constants import (
+ DEFAULT_ITEM_COL,
+ DEFAULT_USER_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_TIMESTAMP_COL,
+)
+
+
+[docs]class RLRMCdataset(object):
+ """RLRMC dataset implementation. Creates sparse data structures for RLRMC algorithm."""
+
+ def __init__(
+ self,
+ train,
+ validation=None,
+ test=None,
+ mean_center=True,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+ # seed=42,
+ ):
+ """Initialize parameters.
+
+ Args:
+ train (pandas.DataFrame): training data with at least columns (col_user, col_item, col_rating)
+ validation (pandas.DataFrame): validation data with at least columns (col_user, col_item, col_rating). validation can be None, if so, we only process the training data
+ test (pandas.DataFrame): test data with at least columns (col_user, col_item, col_rating). test can be None
+ mean_center (bool): flag to mean center the ratings in train (and validation) data
+ col_user (str): user column name
+ col_item (str): item column name
+ col_rating (str): rating column name
+ col_timestamp (str): timestamp column name
+ """
+ # initialize user and item index
+ self.user_idx = None
+ self.item_idx = None
+
+ # get col name of user, item and rating
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.col_timestamp = col_timestamp
+ # set random seed
+ # random.seed(seed)
+
+ # data preprocessing for training and validation data
+ self._data_processing(train, validation, test, mean_center)
+
+ def _data_processing(self, train, validation=None, test=None, mean_center=True):
+ """Process the dataset to reindex userID and itemID
+
+ Args:
+ train (pandas.DataFrame): training data with at least columns (col_user, col_item, col_rating)
+ validation (pandas.DataFrame): validation data with at least columns (col_user, col_item, col_rating). validation can be None, if so, we only process the training data
+ test (pandas.DataFrame): test data with at least columns (col_user, col_item, col_rating). test can be None
+ mean_center (bool): flag to mean center the ratings in train (and validation) data
+
+ Nothing is returned: the reindexed, (optionally) mean-centered train and validation data are stored
+ as `scipy.sparse.csr_matrix` attributes `self.train` and `self.validation`.
+
+ """
+ # Data processing and reindexing code is adopted from https://github.com/Microsoft/Recommenders/blob/main/recommenders/models/ncf/dataset.py
+ # Concatenate train with the validation/test data if they are provided
+ df = train if validation is None else pd.concat([train, validation])
+ df = df if test is None else pd.concat([df, test])
+
+ # Reindex user and item index
+ if self.user_idx is None:
+ # Map user id
+ user_idx = df[[self.col_user]].drop_duplicates().reindex()
+ user_idx[self.col_user + "_idx"] = np.arange(len(user_idx))
+ self.n_users = len(user_idx)
+ self.user_idx = user_idx
+
+ self.user2id = dict(
+ zip(user_idx[self.col_user], user_idx[self.col_user + "_idx"])
+ )
+ self.id2user = {self.user2id[k]: k for k in self.user2id}
+
+ if self.item_idx is None:
+ # Map item id
+ item_idx = df[[self.col_item]].drop_duplicates()
+ item_idx[self.col_item + "_idx"] = np.arange(len(item_idx))
+ self.n_items = len(item_idx)
+ self.item_idx = item_idx
+
+ self.item2id = dict(
+ zip(item_idx[self.col_item], item_idx[self.col_item + "_idx"])
+ )
+ self.id2item = {self.item2id[k]: k for k in self.item2id}
+
+ df_train = self._reindex(train)
+
+ d = len(user_idx) # number of rows
+ T = len(item_idx) # number of columns
+
+ rows_train = df_train["userID"].values
+ cols_train = df_train["itemID"].values
+ entries_omega = df_train["rating"].values
+ if mean_center:
+ train_mean = np.mean(entries_omega)
+ else:
+ train_mean = 0.0
+ entries_train = entries_omega - train_mean
+ self.model_param = {"num_row": d, "num_col": T, "train_mean": train_mean}
+
+ self.train = csr_matrix(
+ (entries_train.T.ravel(), (rows_train, cols_train)), shape=(d, T)
+ )
+
+ if validation is not None:
+ df_validation = self._reindex(validation)
+ rows_validation = df_validation["userID"].values
+ cols_validation = df_validation["itemID"].values
+ entries_validation = df_validation["rating"].values - train_mean
+ self.validation = csr_matrix(
+ (entries_validation.T.ravel(), (rows_validation, cols_validation)),
+ shape=(d, T),
+ )
+ else:
+ self.validation = None
+
+ def _reindex(self, df):
+ """Process dataset to reindex userID and itemID
+
+ Args:
+ df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating)
+
+ Returns:
+ pandas.DataFrame: dataframe with the user and item ids mapped to their continuous indices.
+
+ """
+
+ # Return None if the input dataframe is None
+ if df is None:
+ return None
+
+ # Map user_idx and item_idx
+ df = pd.merge(df, self.user_idx, on=self.col_user, how="left")
+ df = pd.merge(df, self.item_idx, on=self.col_item, how="left")
+
+ # Select relevant columns
+ df_reindex = df[
+ [self.col_user + "_idx", self.col_item + "_idx", self.col_rating]
+ ]
+ df_reindex.columns = [self.col_user, self.col_item, self.col_rating]
+
+ return df_reindex
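+
+# Illustrative usage sketch (assumption: the ratings dataframe uses the default column names from
+# recommenders.utils.constants, i.e. "userID", "itemID", "rating"; the data below is made up):
+#
+#   import pandas as pd
+#   train_df = pd.DataFrame({"userID": [1, 1, 2], "itemID": [10, 20, 10], "rating": [4.0, 3.0, 5.0]})
+#   data = RLRMCdataset(train=train_df)
+#   data.train          # scipy.sparse.csr_matrix of mean-centered ratings
+#   data.model_param    # {"num_row": 2, "num_col": 2, "train_mean": 4.0}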
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import pandas as pd
+import logging
+from scipy import sparse
+
+from recommenders.utils.python_utils import (
+ cosine_similarity,
+ inclusion_index,
+ jaccard,
+ lexicographers_mutual_information,
+ lift,
+ mutual_information,
+ exponential_decay,
+ get_top_k_scored_items,
+ rescale,
+)
+from recommenders.utils import constants
+
+
+SIM_COOCCUR = "cooccurrence"
+SIM_COSINE = "cosine"
+SIM_INCLUSION_INDEX = "inclusion index"
+SIM_JACCARD = "jaccard"
+SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION = "lexicographers mutual information"
+SIM_LIFT = "lift"
+SIM_MUTUAL_INFORMATION = "mutual information"
+
+logger = logging.getLogger()
+
+
+[docs]class SARSingleNode:
+ """Simple Algorithm for Recommendations (SAR) implementation
+
+ SAR is a fast, scalable, adaptive algorithm for personalized recommendations based on user transaction history
+ and item descriptions. The core idea behind SAR is to recommend items similar to those a user has already
+ demonstrated an affinity for. It does this by 1) estimating the affinity of users for items, 2) estimating
+ similarity across items, and then 3) combining the estimates to generate a set of recommendations for a given user.
+ """
+
+ def __init__(
+ self,
+ col_user=constants.DEFAULT_USER_COL,
+ col_item=constants.DEFAULT_ITEM_COL,
+ col_rating=constants.DEFAULT_RATING_COL,
+ col_timestamp=constants.DEFAULT_TIMESTAMP_COL,
+ col_prediction=constants.DEFAULT_PREDICTION_COL,
+ similarity_type=SIM_JACCARD,
+ time_decay_coefficient=30,
+ time_now=None,
+ timedecay_formula=False,
+ threshold=1,
+ normalize=False,
+ ):
+ """Initialize model parameters
+
+ Args:
+ col_user (str): user column name
+ col_item (str): item column name
+ col_rating (str): rating column name
+ col_timestamp (str): timestamp column name
+ col_prediction (str): prediction column name
+ similarity_type (str): ['cooccurrence', 'cosine', 'inclusion index', 'jaccard',
+ 'lexicographers mutual information', 'lift', 'mutual information'] option for
+ computing item-item similarity
+ time_decay_coefficient (float): number of days until a rating is decayed by 1/2 (half-life of the exponential time decay)
+ time_now (int | None): current time for time decay calculation
+ timedecay_formula (bool): flag to apply time decay
+ threshold (int): item-item co-occurrences below this threshold will be removed
+ normalize (bool): option for normalizing predictions to scale of original ratings
+ """
+ self.col_rating = col_rating
+ self.col_item = col_item
+ self.col_user = col_user
+ self.col_timestamp = col_timestamp
+ self.col_prediction = col_prediction
+
+ available_similarity_types = [
+ SIM_COOCCUR,
+ SIM_COSINE,
+ SIM_INCLUSION_INDEX,
+ SIM_JACCARD,
+ SIM_LIFT,
+ SIM_MUTUAL_INFORMATION,
+ SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION,
+ ]
+ if similarity_type not in available_similarity_types:
+ raise ValueError(
+ 'Similarity type must be one of ["'
+ + '" | "'.join(available_similarity_types)
+ + '"]'
+ )
+ self.similarity_type = similarity_type
+ self.time_decay_half_life = (
+ time_decay_coefficient * 24 * 60 * 60
+ ) # convert to seconds
+ self.time_decay_flag = timedecay_formula
+ self.time_now = time_now
+ self.threshold = threshold
+ self.user_affinity = None
+ self.item_similarity = None
+ self.item_frequencies = None
+ self.user_frequencies = None
+
+ # threshold - co-occurrence counts below this number are set to zero
+ if self.threshold <= 0:
+ raise ValueError("Threshold cannot be < 1")
+
+ # set flag to capture unity-rating user-affinity matrix for scaling scores
+ self.normalize = normalize
+ self.col_unity_rating = "_unity_rating"
+ self.unity_user_affinity = None
+
+ # column for mapping user / item ids to internal indices
+ self.col_item_id = "_indexed_items"
+ self.col_user_id = "_indexed_users"
+
+ # obtain all the users and items from both training and test data
+ self.n_users = None
+ self.n_items = None
+
+ # The min and max of the rating scale, obtained from the training data.
+ self.rating_min = None
+ self.rating_max = None
+
+ # mapping for item to matrix element
+ self.user2index = None
+ self.item2index = None
+
+ # the opposite of the above maps - map array index to actual string ID
+ self.index2item = None
+ self.index2user = None
+
+[docs] def compute_affinity_matrix(self, df, rating_col):
+ """Affinity matrix.
+
+ The user-affinity matrix can be constructed by treating the users and items as
+ indices in a sparse matrix, and the events as the data. Here, we're treating
+ the ratings as the event weights. We convert between different sparse-matrix
+ formats to de-duplicate user-item pairs, otherwise they will get added up.
+
+ Args:
+ df (pandas.DataFrame): Indexed df of users and items
+ rating_col (str): Name of column to use for ratings
+
+ Returns:
+ scipy.sparse.csr_matrix: Affinity matrix in Compressed Sparse Row (CSR) format.
+ """
+
+ return sparse.coo_matrix(
+ (df[rating_col], (df[self.col_user_id], df[self.col_item_id])),
+ shape=(self.n_users, self.n_items),
+ ).tocsr()
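+
+ # Illustrative sketch (not part of the model): the COO -> CSR conversion sums duplicate (user, item)
+ # pairs, which is exactly the de-duplication mentioned above. Made-up example:
+ #
+ #   from scipy import sparse
+ #   rows, cols, vals = [0, 0, 1], [2, 2, 0], [1.0, 4.0, 5.0]       # user 0 rated item 2 twice
+ #   m = sparse.coo_matrix((vals, (rows, cols)), shape=(2, 3)).tocsr()
+ #   m[0, 2]                                                         # -> 5.0 (1.0 + 4.0 summed)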
+
+[docs] def compute_time_decay(self, df, decay_column):
+ """Compute time decay on provided column.
+
+ Args:
+ df (pandas.DataFrame): DataFrame of users and items
+ decay_column (str): column to decay
+
+ Returns:
+ pandas.DataFrame: with column decayed
+ """
+
+ # if time_now is None use the latest time
+ if self.time_now is None:
+ self.time_now = df[self.col_timestamp].max()
+
+ # apply time decay to each rating
+ df[decay_column] *= exponential_decay(
+ value=df[self.col_timestamp],
+ max_val=self.time_now,
+ half_life=self.time_decay_half_life,
+ )
+
+ # group time decayed ratings by user-item and take the sum as the user-item affinity
+ return df.groupby([self.col_user, self.col_item]).sum().reset_index()
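+
+ # Illustrative sketch (not part of the model): the decay weight applied above behaves like
+ # 0.5 ** ((time_now - t) / half_life) (see recommenders.utils.python_utils.exponential_decay),
+ # so an event exactly one half-life old is weighted by 1/2. With the default 30-day coefficient:
+ #
+ #   half_life = 30 * 24 * 60 * 60        # seconds
+ #   age = half_life                      # a 30-day-old event
+ #   weight = 0.5 ** (age / half_life)    # -> 0.5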
+
+[docs] def compute_cooccurrence_matrix(self, df):
+ """Co-occurrence matrix.
+
+ The co-occurrence matrix is defined as :math:`C = U^T * U`
+
+ where U is the user_affinity matrix with 1's as values (instead of ratings).
+
+ Args:
+ df (pandas.DataFrame): DataFrame of users and items
+
+ Returns:
+ scipy.sparse.spmatrix: Co-occurrence matrix
+ """
+ user_item_hits = sparse.coo_matrix(
+ (np.repeat(1, df.shape[0]), (df[self.col_user_id], df[self.col_item_id])),
+ shape=(self.n_users, self.n_items),
+ ).tocsr()
+
+ item_cooccurrence = user_item_hits.transpose().dot(user_item_hits)
+ item_cooccurrence = item_cooccurrence.multiply(
+ item_cooccurrence >= self.threshold
+ )
+
+ return item_cooccurrence.astype(df[self.col_rating].dtype)
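+
+ # Illustrative sketch (not part of the model): with two users and three items, the binarized
+ # user-item matrix U gives the co-occurrence counts C = U^T * U. Made-up example:
+ #
+ #   import numpy as np
+ #   U = np.array([[1, 1, 0],
+ #                 [1, 0, 1]])
+ #   C = U.T.dot(U)    # diagonal = item frequencies [2, 1, 1]; C[0, 1] = 1 (items 0 and 1 co-occur for one user)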
+
+[docs] def set_index(self, df):
+ """Generate continuous indices for users and items to reduce memory usage.
+
+ Args:
+ df (pandas.DataFrame): dataframe with user and item ids
+ """
+
+ # generate a map of continuous index values to items
+ self.index2item = dict(enumerate(df[self.col_item].unique()))
+ self.index2user = dict(enumerate(df[self.col_user].unique()))
+
+ # invert the mappings from above
+ self.item2index = {v: k for k, v in self.index2item.items()}
+ self.user2index = {v: k for k, v in self.index2user.items()}
+
+ # set values for the total count of users and items
+ self.n_users = len(self.user2index)
+ self.n_items = len(self.index2item)
+
+[docs] def fit(self, df):
+ """Main fit method for SAR.
+
+ Note:
+ Please make sure that `df` has no duplicates.
+
+ Args:
+ df (pandas.DataFrame): User item rating dataframe (without duplicates).
+ """
+ select_columns = [self.col_user, self.col_item, self.col_rating]
+ if self.time_decay_flag:
+ select_columns += [self.col_timestamp]
+
+ if df[select_columns].duplicated().any():
+ raise ValueError("There should not be duplicates in the dataframe")
+
+ # generate continuous indices if this hasn't been done
+ if self.index2item is None:
+ self.set_index(df)
+
+ logger.info("Collecting user affinity matrix")
+ if not np.issubdtype(df[self.col_rating].dtype, np.number):
+ raise TypeError("Rating column data type must be numeric")
+
+ # copy the DataFrame to avoid modification of the input
+ temp_df = df[select_columns].copy()
+
+ if self.time_decay_flag:
+ logger.info("Calculating time-decayed affinities")
+ temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
+
+ logger.info("Creating index columns")
+ # add mapping of user and item ids to indices
+ temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
+ lambda item: self.item2index.get(item, np.NaN)
+ )
+ temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
+ lambda user: self.user2index.get(user, np.NaN)
+ )
+
+ if self.normalize:
+ self.rating_min = temp_df[self.col_rating].min()
+ self.rating_max = temp_df[self.col_rating].max()
+ logger.info("Calculating normalization factors")
+ temp_df[self.col_unity_rating] = 1.0
+ if self.time_decay_flag:
+ temp_df = self.compute_time_decay(
+ df=temp_df, decay_column=self.col_unity_rating
+ )
+ self.unity_user_affinity = self.compute_affinity_matrix(
+ df=temp_df, rating_col=self.col_unity_rating
+ )
+
+ # affinity matrix
+ logger.info("Building user affinity sparse matrix")
+ self.user_affinity = self.compute_affinity_matrix(
+ df=temp_df, rating_col=self.col_rating
+ )
+
+ # calculate item co-occurrence
+ logger.info("Calculating item co-occurrence")
+ item_cooccurrence = self.compute_cooccurrence_matrix(df=temp_df)
+
+ # free up some space
+ del temp_df
+
+ # creates an array with the frequency of every unique item
+ self.item_frequencies = item_cooccurrence.diagonal()
+
+ logger.info("Calculating item similarity")
+ if self.similarity_type == SIM_COOCCUR:
+ logger.info("Using co-occurrence based similarity")
+ self.item_similarity = item_cooccurrence
+ elif self.similarity_type == SIM_COSINE:
+ logger.info("Using cosine similarity")
+ self.item_similarity = cosine_similarity(item_cooccurrence)
+ elif self.similarity_type == SIM_INCLUSION_INDEX:
+ logger.info("Using inclusion index")
+ self.item_similarity = inclusion_index(item_cooccurrence)
+ elif self.similarity_type == SIM_JACCARD:
+ logger.info("Using jaccard based similarity")
+ self.item_similarity = jaccard(item_cooccurrence)
+ elif self.similarity_type == SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION:
+ logger.info("Using lexicographers mutual information similarity")
+ self.item_similarity = lexicographers_mutual_information(item_cooccurrence)
+ elif self.similarity_type == SIM_LIFT:
+ logger.info("Using lift based similarity")
+ self.item_similarity = lift(item_cooccurrence)
+ elif self.similarity_type == SIM_MUTUAL_INFORMATION:
+ logger.info("Using mutual information similarity")
+ self.item_similarity = mutual_information(item_cooccurrence)
+ else:
+ raise ValueError("Unknown similarity type: {}".format(self.similarity_type))
+
+ # free up some space
+ del item_cooccurrence
+
+ logger.info("Done training")
+
+[docs] def score(self, test, remove_seen=False):
+ """Score all items for test users.
+
+ Args:
+ test (pandas.DataFrame): users to test
+ remove_seen (bool): flag to remove items seen in training from recommendation
+
+ Returns:
+ numpy.ndarray: Scores of all items for the test users.
+ """
+
+ # get user / item indices from test set
+ user_ids = list(
+ map(
+ lambda user: self.user2index.get(user, np.NaN),
+ test[self.col_user].unique(),
+ )
+ )
+ if any(np.isnan(user_ids)):
+ raise ValueError("SAR cannot score users that are not in the training set")
+
+ # calculate raw scores with a matrix multiplication
+ logger.info("Calculating recommendation scores")
+ test_scores = self.user_affinity[user_ids, :].dot(self.item_similarity)
+
+ # ensure we're working with a dense ndarray
+ if isinstance(test_scores, sparse.spmatrix):
+ test_scores = test_scores.toarray()
+
+ if self.normalize:
+ counts = self.unity_user_affinity[user_ids, :].dot(self.item_similarity)
+ user_min_scores = (
+ np.tile(counts.min(axis=1)[:, np.newaxis], test_scores.shape[1])
+ * self.rating_min
+ )
+ user_max_scores = (
+ np.tile(counts.max(axis=1)[:, np.newaxis], test_scores.shape[1])
+ * self.rating_max
+ )
+ test_scores = rescale(
+ test_scores,
+ self.rating_min,
+ self.rating_max,
+ user_min_scores,
+ user_max_scores,
+ )
+
+ # remove items in the train set so recommended items are always novel
+ if remove_seen:
+ logger.info("Removing seen items")
+ test_scores += self.user_affinity[user_ids, :] * -np.inf
+
+ return test_scores
+
+[docs] def get_popularity_based_topk(self, top_k=10, sort_top_k=True, items=True):
+ """Get top K most frequently occurring items across all users.
+
+ Args:
+ top_k (int): number of top items to recommend.
+ sort_top_k (bool): flag to sort top k results.
+ items (bool): if false, return most frequent users instead
+
+ Returns:
+ pandas.DataFrame: top k most popular items.
+ """
+ if items:
+ frequencies = self.item_frequencies
+ col = self.col_item
+ idx = self.index2item
+ else:
+ if self.user_frequencies is None:
+ self.user_frequencies = self.user_affinity.getnnz(axis=1).astype(
+ "int64"
+ )
+ frequencies = self.user_frequencies
+ col = self.col_user
+ idx = self.index2user
+
+ test_scores = np.array([frequencies])
+
+ logger.info("Getting top K")
+ top_components, top_scores = get_top_k_scored_items(
+ scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
+ )
+
+ return pd.DataFrame(
+ {
+ col: [idx[item] for item in top_components.flatten()],
+ self.col_prediction: top_scores.flatten(),
+ }
+ )
+
+[docs] def get_item_based_topk(self, items, top_k=10, sort_top_k=True):
+ """Get top K similar items to provided seed items based on similarity metric defined.
+ This method will take a set of items and use them to recommend the most similar items to that set
+ based on the similarity matrix fit during training.
+ This allows recommendations for cold users (users not seen during training); note that the model is not updated.
+
+ The following options are possible based on information provided in the items input:
+ 1. Single user or seed of items: only item column (ratings are assumed to be 1)
+ 2. Single user or seed of items w/ ratings: item column and rating column
+ 3. Separate users or seeds of items: item and user column (user ids are only used to separate item sets)
+ 4. Separate users or seeds of items with ratings: item, user and rating columns provided
+
+ Args:
+ items (pandas.DataFrame): DataFrame with item, user (optional), and rating (optional) columns
+ top_k (int): number of top items to recommend
+ sort_top_k (bool): flag to sort top k results
+
+ Returns:
+ pandas.DataFrame: sorted top k recommendation items
+ """
+
+ # convert item ids to indices
+ item_ids = np.asarray(
+ list(
+ map(
+ lambda item: self.item2index.get(item, np.NaN),
+ items[self.col_item].values,
+ )
+ )
+ )
+
+ # if no ratings were provided assume they are all 1
+ if self.col_rating in items.columns:
+ ratings = items[self.col_rating]
+ else:
+ ratings = pd.Series(np.ones_like(item_ids))
+
+ # create local map of user ids
+ if self.col_user in items.columns:
+ test_users = items[self.col_user]
+ user2index = {x[1]: x[0] for x in enumerate(items[self.col_user].unique())}
+ user_ids = test_users.map(user2index)
+ else:
+ # if no user column exists assume all entries are for a single user
+ test_users = pd.Series(np.zeros_like(item_ids))
+ user_ids = test_users
+ n_users = user_ids.drop_duplicates().shape[0]
+
+ # generate pseudo user affinity using seed items
+ pseudo_affinity = sparse.coo_matrix(
+ (ratings, (user_ids, item_ids)), shape=(n_users, self.n_items)
+ ).tocsr()
+
+ # calculate raw scores with a matrix multiplication
+ test_scores = pseudo_affinity.dot(self.item_similarity)
+
+ # remove items in the seed set so recommended items are novel
+ test_scores[user_ids, item_ids] = -np.inf
+
+ top_items, top_scores = get_top_k_scored_items(
+ scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
+ )
+
+ df = pd.DataFrame(
+ {
+ self.col_user: np.repeat(
+ test_users.drop_duplicates().values, top_items.shape[1]
+ ),
+ self.col_item: [self.index2item[item] for item in top_items.flatten()],
+ self.col_prediction: top_scores.flatten(),
+ }
+ )
+
+ # drop invalid items
+ return df.replace(-np.inf, np.nan).dropna()
+
+[docs] def get_topk_most_similar_users(self, user, top_k, sort_top_k=True):
+ """Based on user affinity towards items, calculate the most similar users to the given user.
+
+ Args:
+ user (int): user to retrieve most similar users for
+ top_k (int): number of top items to recommend
+ sort_top_k (bool): flag to sort top k results
+
+ Returns:
+ pandas.DataFrame: top k most similar users and their scores
+ """
+ user_idx = self.user2index[user]
+ similarities = self.user_affinity[user_idx].dot(self.user_affinity.T).toarray()
+ similarities[0, user_idx] = -np.inf
+
+ top_items, top_scores = get_top_k_scored_items(
+ scores=similarities, top_k=top_k, sort_top_k=sort_top_k
+ )
+
+ df = pd.DataFrame(
+ {
+ self.col_user: [self.index2user[user] for user in top_items.flatten()],
+ self.col_prediction: top_scores.flatten(),
+ }
+ )
+
+ # drop invalid items
+ return df.replace(-np.inf, np.nan).dropna()
+
+[docs] def recommend_k_items(self, test, top_k=10, sort_top_k=True, remove_seen=False):
+ """Recommend top K items for all users which are in the test set
+
+ Args:
+ test (pandas.DataFrame): users to test
+ top_k (int): number of top items to recommend
+ sort_top_k (bool): flag to sort top k results
+ remove_seen (bool): flag to remove items seen in training from recommendation
+
+ Returns:
+ pandas.DataFrame: top k recommendation items for each user
+ """
+
+ test_scores = self.score(test, remove_seen=remove_seen)
+
+ top_items, top_scores = get_top_k_scored_items(
+ scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
+ )
+
+ df = pd.DataFrame(
+ {
+ self.col_user: np.repeat(
+ test[self.col_user].drop_duplicates().values, top_items.shape[1]
+ ),
+ self.col_item: [self.index2item[item] for item in top_items.flatten()],
+ self.col_prediction: top_scores.flatten(),
+ }
+ )
+
+ # drop invalid items
+ return df.replace(-np.inf, np.nan).dropna()
+
+[docs] def predict(self, test):
+ """Output SAR scores for only the users-items pairs which are in the test set
+
+ Args:
+ test (pandas.DataFrame): DataFrame that contains users and items to test
+
+ Returns:
+ pandas.DataFrame: DataFrame contains the prediction results
+ """
+
+ test_scores = self.score(test)
+ user_ids = np.asarray(
+ list(
+ map(
+ lambda user: self.user2index.get(user, np.NaN),
+ test[self.col_user].values,
+ )
+ )
+ )
+
+ # create mapping of new items to zeros
+ item_ids = np.asarray(
+ list(
+ map(
+ lambda item: self.item2index.get(item, np.NaN),
+ test[self.col_item].values,
+ )
+ )
+ )
+ nans = np.isnan(item_ids)
+ if any(nans):
+ logger.warning(
+ "Items found in test not seen during training, new items will have score of 0"
+ )
+ test_scores = np.append(test_scores, np.zeros((self.n_users, 1)), axis=1)
+ item_ids[nans] = self.n_items
+ item_ids = item_ids.astype("int64")
+
+ df = pd.DataFrame(
+ {
+ self.col_user: test[self.col_user].values,
+ self.col_item: test[self.col_item].values,
+ self.col_prediction: test_scores[user_ids, item_ids],
+ }
+ )
+ return df
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import random
+import numpy as np
+from tqdm import tqdm
+import tensorflow as tf
+
+from recommenders.utils.timer import Timer
+
+
+[docs]class MultiHeadAttention(tf.keras.layers.Layer):
+ """
+ - Q (query), K (key) and V (value) are split into multiple heads (num_heads)
+ - each tuple (q, k, v) is fed to scaled_dot_product_attention
+ - all attention outputs are concatenated
+ """
+
+ def __init__(self, attention_dim, num_heads, dropout_rate):
+ """Initialize parameters.
+
+ Args:
+ attention_dim (int): Dimension of the attention embeddings.
+ num_heads (int): Number of heads in the multi-head self-attention module.
+ dropout_rate (float): Dropout probability.
+ """
+ super(MultiHeadAttention, self).__init__()
+ self.num_heads = num_heads
+ self.attention_dim = attention_dim
+ assert attention_dim % self.num_heads == 0
+ self.dropout_rate = dropout_rate
+
+ self.depth = attention_dim // self.num_heads
+
+ self.Q = tf.keras.layers.Dense(self.attention_dim, activation=None)
+ self.K = tf.keras.layers.Dense(self.attention_dim, activation=None)
+ self.V = tf.keras.layers.Dense(self.attention_dim, activation=None)
+ self.dropout = tf.keras.layers.Dropout(self.dropout_rate)
+
+[docs] def call(self, queries, keys):
+ """Model forward pass.
+
+ Args:
+ queries (tf.Tensor): Tensor of queries.
+ keys (tf.Tensor): Tensor of keys
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+
+ # Linear projections
+ Q = self.Q(queries) # (N, T_q, C)
+ K = self.K(keys) # (N, T_k, C)
+ V = self.V(keys) # (N, T_k, C)
+
+ # --- MULTI HEAD ---
+ # Split and concat, Q_, K_ and V_ are all (h*N, T_q, C/h)
+ Q_ = tf.concat(tf.split(Q, self.num_heads, axis=2), axis=0)
+ K_ = tf.concat(tf.split(K, self.num_heads, axis=2), axis=0)
+ V_ = tf.concat(tf.split(V, self.num_heads, axis=2), axis=0)
+
+ # --- SCALED DOT PRODUCT ---
+ # Multiplication
+ outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)
+
+ # Scale
+ outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
+
+ # Key Masking
+ key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k)
+ key_masks = tf.tile(key_masks, [self.num_heads, 1]) # (h*N, T_k)
+ key_masks = tf.tile(
+ tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]
+ ) # (h*N, T_q, T_k)
+
+ paddings = tf.ones_like(outputs) * (-(2**32) + 1)
+ # outputs, (h*N, T_q, T_k)
+ outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)
+
+ # Future blinding (Causality)
+ diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k)
+ tril = tf.linalg.LinearOperatorLowerTriangular(
+ diag_vals
+ ).to_dense() # (T_q, T_k)
+ masks = tf.tile(
+ tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]
+ ) # (h*N, T_q, T_k)
+
+ paddings = tf.ones_like(masks) * (-(2**32) + 1)
+ # outputs, (h*N, T_q, T_k)
+ outputs = tf.where(tf.equal(masks, 0), paddings, outputs)
+
+ # Activation
+ outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k)
+
+ # Query Masking, query_masks (N, T_q)
+ query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))
+ query_masks = tf.tile(query_masks, [self.num_heads, 1]) # (h*N, T_q)
+ query_masks = tf.tile(
+ tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]
+ ) # (h*N, T_q, T_k)
+ outputs *= query_masks # broadcasting. (N, T_q, C)
+
+ # Dropouts
+ outputs = self.dropout(outputs)
+
+ # Weighted sum
+ outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h)
+
+ # --- MULTI HEAD ---
+ # concat heads
+ outputs = tf.concat(
+ tf.split(outputs, self.num_heads, axis=0), axis=2
+ ) # (N, T_q, C)
+
+ # Residual connection
+ outputs += queries
+
+ return outputs
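+
+ # Illustrative sketch (not part of the model): the head split/concat trick above turns a (N, T, C)
+ # tensor into (h*N, T, C/h) and back, e.g. with N=2, T=5, C=8 and h=4 heads:
+ #
+ #   import tensorflow as tf
+ #   Q = tf.zeros((2, 5, 8))
+ #   Q_ = tf.concat(tf.split(Q, 4, axis=2), axis=0)     # shape (8, 5, 2)
+ #   back = tf.concat(tf.split(Q_, 4, axis=0), axis=2)  # shape (2, 5, 8)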
+
+
+[docs]class PointWiseFeedForward(tf.keras.layers.Layer):
+ """
+ Convolution layers with residual connection
+ """
+
+ def __init__(self, conv_dims, dropout_rate):
+ """Initialize parameters.
+
+ Args:
+ conv_dims (list): List of the dimensions of the Feedforward layer.
+ dropout_rate (float): Dropout probability.
+ """
+ super(PointWiseFeedForward, self).__init__()
+ self.conv_dims = conv_dims
+ self.dropout_rate = dropout_rate
+ self.conv_layer1 = tf.keras.layers.Conv1D(
+ filters=self.conv_dims[0], kernel_size=1, activation="relu", use_bias=True
+ )
+ self.conv_layer2 = tf.keras.layers.Conv1D(
+ filters=self.conv_dims[1], kernel_size=1, activation=None, use_bias=True
+ )
+ self.dropout_layer = tf.keras.layers.Dropout(self.dropout_rate)
+
+[docs] def call(self, x):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+
+ output = self.conv_layer1(x)
+ output = self.dropout_layer(output)
+
+ output = self.conv_layer2(output)
+ output = self.dropout_layer(output)
+
+ # Residual connection
+ output += x
+
+ return output
+
+
+[docs]class EncoderLayer(tf.keras.layers.Layer):
+ """
+ Transformer based encoder layer
+
+ """
+
+ def __init__(
+ self,
+ seq_max_len,
+ embedding_dim,
+ attention_dim,
+ num_heads,
+ conv_dims,
+ dropout_rate,
+ ):
+ """Initialize parameters.
+
+ Args:
+ seq_max_len (int): Maximum sequence length.
+ embedding_dim (int): Embedding dimension.
+ attention_dim (int): Dimension of the attention embeddings.
+ num_heads (int): Number of heads in the multi-head self-attention module.
+ conv_dims (list): List of the dimensions of the Feedforward layer.
+ dropout_rate (float): Dropout probability.
+ """
+ super(EncoderLayer, self).__init__()
+
+ self.seq_max_len = seq_max_len
+ self.embedding_dim = embedding_dim
+
+ self.mha = MultiHeadAttention(attention_dim, num_heads, dropout_rate)
+ self.ffn = PointWiseFeedForward(conv_dims, dropout_rate)
+
+ self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+ self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+
+ self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
+ self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
+
+ self.layer_normalization = LayerNormalization(
+ self.seq_max_len, self.embedding_dim, 1e-08
+ )
+
+[docs] def call_(self, x, training, mask):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+ training (tf.Tensor): Training tensor.
+ mask (tf.Tensor): Mask tensor.
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+
+ attn_output = self.mha(queries=self.layer_normalization(x), keys=x)
+ attn_output = self.dropout1(attn_output, training=training)
+ out1 = self.layernorm1(x + attn_output)
+
+ # feed forward network
+ ffn_output = self.ffn(out1) # (batch_size, input_seq_len, d_model)
+ ffn_output = self.dropout2(ffn_output, training=training)
+ out2 = self.layernorm2(
+ out1 + ffn_output
+ ) # (batch_size, input_seq_len, d_model)
+
+ # masking
+ out2 *= mask
+
+ return out2
+
+[docs] def call(self, x, training, mask):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+ training (tf.Tensor): Training tensor.
+ mask (tf.Tensor): Mask tensor.
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+
+ x_norm = self.layer_normalization(x)
+ attn_output = self.mha(queries=x_norm, keys=x)
+ attn_output = self.ffn(attn_output)
+ out = attn_output * mask
+
+ return out
+
+
+[docs]class Encoder(tf.keras.layers.Layer):
+ """
+ Invokes Transformer based encoder with user defined number of layers
+
+ """
+
+ def __init__(
+ self,
+ num_layers,
+ seq_max_len,
+ embedding_dim,
+ attention_dim,
+ num_heads,
+ conv_dims,
+ dropout_rate,
+ ):
+ """Initialize parameters.
+
+ Args:
+ num_layers (int): Number of layers.
+ seq_max_len (int): Maximum sequence length.
+ embedding_dim (int): Embedding dimension.
+ attention_dim (int): Dimension of the attention embeddings.
+ num_heads (int): Number of heads in the multi-head self-attention module.
+ conv_dims (list): List of the dimensions of the Feedforward layer.
+ dropout_rate (float): Dropout probability.
+ """
+ super(Encoder, self).__init__()
+
+ self.num_layers = num_layers
+
+ self.enc_layers = [
+ EncoderLayer(
+ seq_max_len,
+ embedding_dim,
+ attention_dim,
+ num_heads,
+ conv_dims,
+ dropout_rate,
+ )
+ for _ in range(num_layers)
+ ]
+
+ self.dropout = tf.keras.layers.Dropout(dropout_rate)
+
+[docs] def call(self, x, training, mask):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+ training (tf.Tensor): Training tensor.
+ mask (tf.Tensor): Mask tensor.
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+
+ for i in range(self.num_layers):
+ x = self.enc_layers[i](x, training, mask)
+
+ return x # (batch_size, input_seq_len, d_model)
+
+
+[docs]class LayerNormalization(tf.keras.layers.Layer):
+ """
+ Layer normalization using mean and variance
+ gamma and beta are the learnable parameters
+ """
+
+ def __init__(self, seq_max_len, embedding_dim, epsilon):
+ """Initialize parameters.
+
+ Args:
+ seq_max_len (int): Maximum sequence length.
+ embedding_dim (int): Embedding dimension.
+ epsilon (float): Epsilon value.
+ """
+ super(LayerNormalization, self).__init__()
+ self.seq_max_len = seq_max_len
+ self.embedding_dim = embedding_dim
+ self.epsilon = epsilon
+ self.params_shape = (self.seq_max_len, self.embedding_dim)
+ g_init = tf.ones_initializer()
+ self.gamma = tf.Variable(
+ initial_value=g_init(shape=self.params_shape, dtype="float32"),
+ trainable=True,
+ )
+ b_init = tf.zeros_initializer()
+ self.beta = tf.Variable(
+ initial_value=b_init(shape=self.params_shape, dtype="float32"),
+ trainable=True,
+ )
+
+[docs] def call(self, x):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+ mean, variance = tf.nn.moments(x, [-1], keepdims=True)
+ normalized = (x - mean) / ((variance + self.epsilon) ** 0.5)
+ output = self.gamma * normalized + self.beta
+ return output
+
+
+[docs]class SASREC(tf.keras.Model):
+ """SAS Rec model
+ Self-Attentive Sequential Recommendation Using Transformer
+
+ :Citation:
+
+ Wang-Cheng Kang, Julian McAuley (2018), Self-Attentive Sequential
+ Recommendation. Proceedings of IEEE International Conference on
+ Data Mining (ICDM'18)
+
+ Original source code from nnkkmto/SASRec-tf2,
+ https://github.com/nnkkmto/SASRec-tf2
+
+ """
+
+ def __init__(self, **kwargs):
+ """Model initialization.
+
+ Args:
+ item_num (int): Number of items in the dataset.
+ seq_max_len (int): Maximum number of items in user history.
+ num_blocks (int): Number of Transformer blocks to be used.
+ embedding_dim (int): Item embedding dimension.
+ attention_dim (int): Transformer attention dimension.
+ conv_dims (list): List of the dimensions of the Feedforward layer.
+ dropout_rate (float): Dropout rate.
+ l2_reg (float): Coefficient of the L2 regularization.
+ num_neg_test (int): Number of negative examples used in testing.
+ """
+ super(SASREC, self).__init__()
+
+ self.item_num = kwargs.get("item_num", None)
+ self.seq_max_len = kwargs.get("seq_max_len", 100)
+ self.num_blocks = kwargs.get("num_blocks", 2)
+ self.embedding_dim = kwargs.get("embedding_dim", 100)
+ self.attention_dim = kwargs.get("attention_dim", 100)
+ self.attention_num_heads = kwargs.get("attention_num_heads", 1)
+ self.conv_dims = kwargs.get("conv_dims", [100, 100])
+ self.dropout_rate = kwargs.get("dropout_rate", 0.5)
+ self.l2_reg = kwargs.get("l2_reg", 0.0)
+ self.num_neg_test = kwargs.get("num_neg_test", 100)
+
+ self.item_embedding_layer = tf.keras.layers.Embedding(
+ self.item_num + 1,
+ self.embedding_dim,
+ name="item_embeddings",
+ mask_zero=True,
+ embeddings_regularizer=tf.keras.regularizers.L2(self.l2_reg),
+ )
+
+ self.positional_embedding_layer = tf.keras.layers.Embedding(
+ self.seq_max_len,
+ self.embedding_dim,
+ name="positional_embeddings",
+ mask_zero=False,
+ embeddings_regularizer=tf.keras.regularizers.L2(self.l2_reg),
+ )
+ self.dropout_layer = tf.keras.layers.Dropout(self.dropout_rate)
+ self.encoder = Encoder(
+ self.num_blocks,
+ self.seq_max_len,
+ self.embedding_dim,
+ self.attention_dim,
+ self.attention_num_heads,
+ self.conv_dims,
+ self.dropout_rate,
+ )
+ self.mask_layer = tf.keras.layers.Masking(mask_value=0)
+ self.layer_normalization = LayerNormalization(
+ self.seq_max_len, self.embedding_dim, 1e-08
+ )
+
+[docs] def embedding(self, input_seq):
+ """Compute the sequence and positional embeddings.
+
+ Args:
+ input_seq (tf.Tensor): Input sequence
+
+ Returns:
+ tf.Tensor, tf.Tensor:
+ - Sequence embeddings.
+ - Positional embeddings.
+ """
+
+ seq_embeddings = self.item_embedding_layer(input_seq)
+ seq_embeddings = seq_embeddings * (self.embedding_dim**0.5)
+
+ # FIXME
+ positional_seq = tf.expand_dims(tf.range(tf.shape(input_seq)[1]), 0)
+ positional_seq = tf.tile(positional_seq, [tf.shape(input_seq)[0], 1])
+ positional_embeddings = self.positional_embedding_layer(positional_seq)
+
+ return seq_embeddings, positional_embeddings
+
+[docs] def call(self, x, training):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+ training (tf.Tensor): Training tensor.
+
+ Returns:
+ tf.Tensor, tf.Tensor, tf.Tensor:
+ - Logits of the positive examples.
+ - Logits of the negative examples.
+ - Mask for nonzero targets
+ """
+
+ input_seq = x["input_seq"]
+ pos = x["positive"]
+ neg = x["negative"]
+
+ mask = tf.expand_dims(tf.cast(tf.not_equal(input_seq, 0), tf.float32), -1)
+ seq_embeddings, positional_embeddings = self.embedding(input_seq)
+
+ # add positional embeddings
+ seq_embeddings += positional_embeddings
+
+ # dropout
+ seq_embeddings = self.dropout_layer(seq_embeddings)
+
+ # masking
+ seq_embeddings *= mask
+
+ # --- ATTENTION BLOCKS ---
+ seq_attention = seq_embeddings
+ seq_attention = self.encoder(seq_attention, training, mask)
+ seq_attention = self.layer_normalization(seq_attention) # (b, s, d)
+
+ # --- PREDICTION LAYER ---
+ # user's sequence embedding
+ pos = self.mask_layer(pos)
+ neg = self.mask_layer(neg)
+
+ pos = tf.reshape(pos, [tf.shape(input_seq)[0] * self.seq_max_len])
+ neg = tf.reshape(neg, [tf.shape(input_seq)[0] * self.seq_max_len])
+ pos_emb = self.item_embedding_layer(pos)
+ neg_emb = self.item_embedding_layer(neg)
+ seq_emb = tf.reshape(
+ seq_attention,
+ [tf.shape(input_seq)[0] * self.seq_max_len, self.embedding_dim],
+ ) # (b*s, d)
+
+ pos_logits = tf.reduce_sum(pos_emb * seq_emb, -1)
+ neg_logits = tf.reduce_sum(neg_emb * seq_emb, -1)
+
+ pos_logits = tf.expand_dims(pos_logits, axis=-1) # (bs, 1)
+ # pos_prob = tf.keras.layers.Dense(1, activation='sigmoid')(pos_logits) # (bs, 1)
+
+ neg_logits = tf.expand_dims(neg_logits, axis=-1) # (bs, 1)
+ # neg_prob = tf.keras.layers.Dense(1, activation='sigmoid')(neg_logits) # (bs, 1)
+
+ # output = tf.concat([pos_logits, neg_logits], axis=0)
+
+ # masking for loss calculation
+ istarget = tf.reshape(
+ tf.cast(tf.not_equal(pos, 0), dtype=tf.float32),
+ [tf.shape(input_seq)[0] * self.seq_max_len],
+ )
+
+ return pos_logits, neg_logits, istarget
+
+[docs] def predict(self, inputs):
+ """Returns the logits for the test items.
+
+ Args:
+ inputs (tf.Tensor): Input tensor.
+
+ Returns:
+ tf.Tensor: Output tensor.
+ """
+ training = False
+ input_seq = inputs["input_seq"]
+ candidate = inputs["candidate"]
+
+ mask = tf.expand_dims(tf.cast(tf.not_equal(input_seq, 0), tf.float32), -1)
+ seq_embeddings, positional_embeddings = self.embedding(input_seq)
+ seq_embeddings += positional_embeddings
+ # seq_embeddings = self.dropout_layer(seq_embeddings)
+ seq_embeddings *= mask
+ seq_attention = seq_embeddings
+ seq_attention = self.encoder(seq_attention, training, mask)
+ seq_attention = self.layer_normalization(seq_attention) # (b, s, d)
+ seq_emb = tf.reshape(
+ seq_attention,
+ [tf.shape(input_seq)[0] * self.seq_max_len, self.embedding_dim],
+ ) # (b*s, d)
+ candidate_emb = self.item_embedding_layer(candidate) # (b, s, d)
+ candidate_emb = tf.transpose(candidate_emb, perm=[0, 2, 1]) # (b, d, s)
+
+ test_logits = tf.matmul(seq_emb, candidate_emb)
+ # (200, 100) * (1, 101, 100)'
+
+ test_logits = tf.reshape(
+ test_logits,
+ [tf.shape(input_seq)[0], self.seq_max_len, 1 + self.num_neg_test],
+ ) # (1, 200, 101)
+ test_logits = test_logits[:, -1, :] # (1, 101)
+ return test_logits
+
+[docs] def loss_function(self, pos_logits, neg_logits, istarget):
+ """Losses are calculated separately for the positive and negative
+ items based on the corresponding logits. A mask is included to
+ take care of the zero items (added for padding).
+
+ Args:
+ pos_logits (tf.Tensor): Logits of the positive examples.
+ neg_logits (tf.Tensor): Logits of the negative examples.
+ istarget (tf.Tensor): Mask for nonzero targets.
+
+ Returns:
+ float: Loss.
+ """
+
+ pos_logits = pos_logits[:, 0]
+ neg_logits = neg_logits[:, 0]
+
+ # ignore padding items (0)
+ # istarget = tf.reshape(
+ # tf.cast(tf.not_equal(self.pos, 0), dtype=tf.float32),
+ # [tf.shape(self.input_seq)[0] * self.seq_max_len],
+ # )
+ # for logits
+ loss = tf.reduce_sum(
+ -tf.math.log(tf.math.sigmoid(pos_logits) + 1e-24) * istarget
+ - tf.math.log(1 - tf.math.sigmoid(neg_logits) + 1e-24) * istarget
+ ) / tf.reduce_sum(istarget)
+
+ # for probabilities
+ # loss = tf.reduce_sum(
+ # - tf.math.log(pos_logits + 1e-24) * istarget -
+ # tf.math.log(1 - neg_logits + 1e-24) * istarget
+ # ) / tf.reduce_sum(istarget)
+ reg_loss = tf.compat.v1.losses.get_regularization_loss()
+ # reg_losses = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
+ # loss += sum(reg_losses)
+ loss += reg_loss
+
+ return loss
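+
+ # Illustrative sketch (not part of the model): the masked sigmoid cross-entropy above, evaluated on
+ # made-up logits where the last position is padding (istarget = 0), so only the first two positions
+ # contribute to the loss:
+ #
+ #   import tensorflow as tf
+ #   pos_logits = tf.constant([2.0, 1.0, 0.0])
+ #   neg_logits = tf.constant([-1.0, 0.0, 0.0])
+ #   istarget = tf.constant([1.0, 1.0, 0.0])
+ #   loss = tf.reduce_sum(
+ #       -tf.math.log(tf.math.sigmoid(pos_logits) + 1e-24) * istarget
+ #       - tf.math.log(1 - tf.math.sigmoid(neg_logits) + 1e-24) * istarget
+ #   ) / tf.reduce_sum(istarget)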
+
+[docs] def create_combined_dataset(self, u, seq, pos, neg):
+ """
+ Create model inputs from sampled batch data. This function is used only during training.
+ """
+ inputs = {}
+ seq = tf.keras.preprocessing.sequence.pad_sequences(
+ seq, padding="pre", truncating="pre", maxlen=self.seq_max_len
+ )
+ pos = tf.keras.preprocessing.sequence.pad_sequences(
+ pos, padding="pre", truncating="pre", maxlen=self.seq_max_len
+ )
+ neg = tf.keras.preprocessing.sequence.pad_sequences(
+ neg, padding="pre", truncating="pre", maxlen=self.seq_max_len
+ )
+
+ inputs["users"] = np.expand_dims(np.array(u), axis=-1)
+ inputs["input_seq"] = seq
+ inputs["positive"] = pos
+ inputs["negative"] = neg
+
+ target = np.concatenate(
+ [
+ np.repeat(1, seq.shape[0] * seq.shape[1]),
+ np.repeat(0, seq.shape[0] * seq.shape[1]),
+ ],
+ axis=0,
+ )
+ target = np.expand_dims(target, axis=-1)
+ return inputs, target
+
+[docs] def train(self, dataset, sampler, **kwargs):
+ """
+ High-level function for model training, with periodic
+ evaluation on the validation and test datasets.
+ """
+ num_epochs = kwargs.get("num_epochs", 10)
+ batch_size = kwargs.get("batch_size", 128)
+ lr = kwargs.get("learning_rate", 0.001)
+ val_epoch = kwargs.get("val_epoch", 5)
+
+ num_steps = int(len(dataset.user_train) / batch_size)
+
+ optimizer = tf.keras.optimizers.Adam(
+ learning_rate=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-7
+ )
+
+ loss_function = self.loss_function
+
+ train_loss = tf.keras.metrics.Mean(name="train_loss")
+
+ train_step_signature = [
+ {
+ "users": tf.TensorSpec(shape=(None, 1), dtype=tf.int64),
+ "input_seq": tf.TensorSpec(
+ shape=(None, self.seq_max_len), dtype=tf.int64
+ ),
+ "positive": tf.TensorSpec(
+ shape=(None, self.seq_max_len), dtype=tf.int64
+ ),
+ "negative": tf.TensorSpec(
+ shape=(None, self.seq_max_len), dtype=tf.int64
+ ),
+ },
+ tf.TensorSpec(shape=(None, 1), dtype=tf.int64),
+ ]
+
+ @tf.function(input_signature=train_step_signature)
+ def train_step(inp, tar):
+ with tf.GradientTape() as tape:
+ pos_logits, neg_logits, loss_mask = self(inp, training=True)
+ loss = loss_function(pos_logits, neg_logits, loss_mask)
+
+ gradients = tape.gradient(loss, self.trainable_variables)
+ optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+
+ train_loss(loss)
+ return loss
+
+ T = 0.0
+ t0 = Timer()
+ t0.start()
+
+ for epoch in range(1, num_epochs + 1):
+
+ step_loss = []
+ train_loss.reset_states()
+ for step in tqdm(
+ range(num_steps), total=num_steps, ncols=70, leave=False, unit="b"
+ ):
+
+ u, seq, pos, neg = sampler.next_batch()
+
+ inputs, target = self.create_combined_dataset(u, seq, pos, neg)
+
+ loss = train_step(inputs, target)
+ step_loss.append(loss)
+
+ if epoch % val_epoch == 0:
+ t0.stop()
+ t1 = t0.interval
+ T += t1
+ print("Evaluating...")
+ t_test = self.evaluate(dataset)
+ t_valid = self.evaluate_valid(dataset)
+ print(
+ f"\nepoch: {epoch}, time: {T}, valid (NDCG@10: {t_valid[0]}, HR@10: {t_valid[1]})"
+ )
+ print(
+ f"epoch: {epoch}, time: {T}, test (NDCG@10: {t_test[0]}, HR@10: {t_test[1]})"
+ )
+ t0.start()
+
+ t_test = self.evaluate(dataset)
+ print(f"\nepoch: {epoch}, test (NDCG@10: {t_test[0]}, HR@10: {t_test[1]})")
+
+ return t_test
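+
+ # Illustrative usage sketch (assumptions: `data` is a SASRecDataSet-like object exposing
+ # user_train/user_valid/user_test/usernum/itemnum, and `sampler` is the WarpSampler defined further
+ # below; names and parameter values are indicative only):
+ #
+ #   model = SASREC(item_num=data.itemnum, seq_max_len=50, num_blocks=2, embedding_dim=100,
+ #                  attention_dim=100, attention_num_heads=1, dropout_rate=0.5, num_neg_test=100)
+ #   sampler = WarpSampler(data.user_train, data.usernum, data.itemnum, batch_size=128, maxlen=50, n_workers=1)
+ #   ndcg10, hr10 = model.train(data, sampler, num_epochs=5, batch_size=128, learning_rate=0.001, val_epoch=5)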
+
+[docs] def evaluate(self, dataset):
+ """
+ Evaluation on the test users (users with at least 3 items)
+ """
+ usernum = dataset.usernum
+ itemnum = dataset.itemnum
+ train = dataset.user_train # removing deepcopy
+ valid = dataset.user_valid
+ test = dataset.user_test
+
+ NDCG = 0.0
+ HT = 0.0
+ valid_user = 0.0
+
+ if usernum > 10000:
+ users = random.sample(range(1, usernum + 1), 10000)
+ else:
+ users = range(1, usernum + 1)
+
+ for u in tqdm(users, ncols=70, leave=False, unit="b"):
+
+ if len(train[u]) < 1 or len(test[u]) < 1:
+ continue
+
+ seq = np.zeros([self.seq_max_len], dtype=np.int32)
+ idx = self.seq_max_len - 1
+ seq[idx] = valid[u][0]
+ idx -= 1
+ for i in reversed(train[u]):
+ seq[idx] = i
+ idx -= 1
+ if idx == -1:
+ break
+ rated = set(train[u])
+ rated.add(0)
+ item_idx = [test[u][0]]
+ for _ in range(self.num_neg_test):
+ t = np.random.randint(1, itemnum + 1)
+ while t in rated:
+ t = np.random.randint(1, itemnum + 1)
+ item_idx.append(t)
+
+ inputs = {}
+ inputs["user"] = np.expand_dims(np.array([u]), axis=-1)
+ inputs["input_seq"] = np.array([seq])
+ inputs["candidate"] = np.array([item_idx])
+
+ # negate the scores so that argsort gives a descending ranking
+ predictions = -1.0 * self.predict(inputs)
+ predictions = np.array(predictions)
+ predictions = predictions[0]
+
+ rank = predictions.argsort().argsort()[0]
+
+ valid_user += 1
+
+ if rank < 10:
+ NDCG += 1 / np.log2(rank + 2)
+ HT += 1
+
+ return NDCG / valid_user, HT / valid_user
+
+[docs] def evaluate_valid(self, dataset):
+ """
+ Evaluation on the validation users
+ """
+ usernum = dataset.usernum
+ itemnum = dataset.itemnum
+ train = dataset.user_train # removing deepcopy
+ valid = dataset.user_valid
+
+ NDCG = 0.0
+ valid_user = 0.0
+ HT = 0.0
+ if usernum > 10000:
+ users = random.sample(range(1, usernum + 1), 10000)
+ else:
+ users = range(1, usernum + 1)
+
+ for u in tqdm(users, ncols=70, leave=False, unit="b"):
+ if len(train[u]) < 1 or len(valid[u]) < 1:
+ continue
+
+ seq = np.zeros([self.seq_max_len], dtype=np.int32)
+ idx = self.seq_max_len - 1
+ for i in reversed(train[u]):
+ seq[idx] = i
+ idx -= 1
+ if idx == -1:
+ break
+
+ rated = set(train[u])
+ rated.add(0)
+ item_idx = [valid[u][0]]
+ for _ in range(self.num_neg_test):
+ t = np.random.randint(1, itemnum + 1)
+ while t in rated:
+ t = np.random.randint(1, itemnum + 1)
+ item_idx.append(t)
+
+ inputs = {}
+ inputs["user"] = np.expand_dims(np.array([u]), axis=-1)
+ inputs["input_seq"] = np.array([seq])
+ inputs["candidate"] = np.array([item_idx])
+
+ # predictions = -model.predict(sess, [u], [seq], item_idx)
+ predictions = -1.0 * self.predict(inputs)
+ predictions = np.array(predictions)
+ predictions = predictions[0]
+
+ rank = predictions.argsort().argsort()[0]
+
+ valid_user += 1
+
+ if rank < 10:
+ NDCG += 1 / np.log2(rank + 2)
+ HT += 1
+
+ return NDCG / valid_user, HT / valid_user
+
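+# Illustrative sketch: the leave-one-out ranking protocol used in `evaluate`
+# and `evaluate_valid`, reduced to plain NumPy. The scores below are made up;
+# index 0 plays the role of the held-out test item scored against negatives.
+if __name__ == "__main__":
+    scores = np.array([2.3, 0.1, 1.7, 3.0, 0.4])  # toy scores, test item first
+    predictions = -1.0 * scores  # negate so argsort ranks best-to-worst
+    rank = predictions.argsort().argsort()[0]  # rank of the test item (here 1)
+    ndcg_at_10 = 1 / np.log2(rank + 2) if rank < 10 else 0.0
+    hr_at_10 = 1.0 if rank < 10 else 0.0
+    print(rank, ndcg_at_10, hr_at_10)
+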
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+# Original codes are from
+# https://github.com/kang205/SASRec/blob/master/sampler.py
+
+import numpy as np
+from multiprocessing import Process, Queue
+
+
+def random_neq(left, right, s):
+ t = np.random.randint(left, right)
+ while t in s:
+ t = np.random.randint(left, right)
+ return t
+
+
+[docs]def sample_function(
+ user_train, usernum, itemnum, batch_size, maxlen, result_queue, seed
+):
+ """Batch sampler that creates a sequence of negative items based on the
+ original sequence of items (positive) that the user has interacted with.
+
+ Args:
+        user_train (dict): dictionary of training examples for each user
+ usernum (int): number of users
+ itemnum (int): number of items
+ batch_size (int): batch size
+ maxlen (int): maximum input sequence length
+ result_queue (multiprocessing.Queue): queue for storing sample results
+ seed (int): seed for random generator
+ """
+
+ def sample():
+
+ user = np.random.randint(1, usernum + 1)
+ while len(user_train[user]) <= 1:
+ user = np.random.randint(1, usernum + 1)
+
+ seq = np.zeros([maxlen], dtype=np.int32)
+ pos = np.zeros([maxlen], dtype=np.int32)
+ neg = np.zeros([maxlen], dtype=np.int32)
+ nxt = user_train[user][-1]
+ idx = maxlen - 1
+
+ ts = set(user_train[user])
+ for i in reversed(user_train[user][:-1]):
+ seq[idx] = i
+ pos[idx] = nxt
+ if nxt != 0:
+ neg[idx] = random_neq(1, itemnum + 1, ts)
+ nxt = i
+ idx -= 1
+ if idx == -1:
+ break
+
+ return (user, seq, pos, neg)
+
+ np.random.seed(seed)
+ while True:
+ one_batch = []
+ for i in range(batch_size):
+ one_batch.append(sample())
+
+ result_queue.put(zip(*one_batch))
+
+
+[docs]class WarpSampler(object):
+ """Sampler object that creates an iterator for feeding batch data while training.
+
+ Attributes:
+ User: dict, all the users (keys) with items as values
+ usernum: integer, total number of users
+ itemnum: integer, total number of items
+ batch_size (int): batch size
+ maxlen (int): maximum input sequence length
+ n_workers (int): number of workers for parallel execution
+ """
+
+ def __init__(self, User, usernum, itemnum, batch_size=64, maxlen=10, n_workers=1):
+ self.result_queue = Queue(maxsize=n_workers * 10)
+ self.processors = []
+ for i in range(n_workers):
+ self.processors.append(
+ Process(
+ target=sample_function,
+ args=(
+ User,
+ usernum,
+ itemnum,
+ batch_size,
+ maxlen,
+ self.result_queue,
+ np.random.randint(2e9),
+ ),
+ )
+ )
+ self.processors[-1].daemon = True
+ self.processors[-1].start()
+
+ def next_batch(self):
+ return self.result_queue.get()
+
+ def close(self):
+ for p in self.processors:
+ p.terminate()
+ p.join()
+
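+# Illustrative usage sketch: pulling one batch from WarpSampler by hand.
+# The `user_train` dictionary is a made-up toy dataset; in practice it comes
+# from SASRecDataSet.user_train, and the batch feeds the SASREC training loop.
+if __name__ == "__main__":
+    user_train = {1: [10, 11, 12, 13], 2: [20, 21, 22]}
+    sampler = WarpSampler(
+        user_train, usernum=2, itemnum=22, batch_size=4, maxlen=5, n_workers=1
+    )
+    users, seq, pos, neg = sampler.next_batch()  # 4 tuples, each of length batch_size
+    sampler.close()
+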
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from recommenders.models.sasrec.model import SASREC, Encoder, LayerNormalization
+
+
+[docs]class SSEPT(SASREC):
+ """
+ SSE-PT Model
+
+ :Citation:
+
+ Wu L., Li S., Hsieh C-J., Sharpnack J., SSE-PT: Sequential Recommendation
+ Via Personalized Transformer, RecSys, 2020.
+ TF 1.x codebase: https://github.com/SSE-PT/SSE-PT
+        TF 2.x codebase (SASRec): https://github.com/nnkkmto/SASRec-tf2
+ """
+
+ def __init__(self, **kwargs):
+ """Model initialization.
+
+ Args:
+ item_num (int): Number of items in the dataset.
+ seq_max_len (int): Maximum number of items in user history.
+ num_blocks (int): Number of Transformer blocks to be used.
+ embedding_dim (int): Item embedding dimension.
+ attention_dim (int): Transformer attention dimension.
+ conv_dims (list): List of the dimensions of the Feedforward layer.
+ dropout_rate (float): Dropout rate.
+ l2_reg (float): Coefficient of the L2 regularization.
+ num_neg_test (int): Number of negative examples used in testing.
+ user_num (int): Number of users in the dataset.
+ user_embedding_dim (int): User embedding dimension.
+ item_embedding_dim (int): Item embedding dimension.
+ """
+ super().__init__(**kwargs)
+
+ self.user_num = kwargs.get("user_num", None) # New
+ self.conv_dims = kwargs.get("conv_dims", [200, 200]) # modified
+ self.user_embedding_dim = kwargs.get(
+ "user_embedding_dim", self.embedding_dim
+ ) # extra
+ self.item_embedding_dim = kwargs.get("item_embedding_dim", self.embedding_dim)
+ self.hidden_units = self.item_embedding_dim + self.user_embedding_dim
+
+ # New, user embedding
+ self.user_embedding_layer = tf.keras.layers.Embedding(
+ input_dim=self.user_num + 1,
+ output_dim=self.user_embedding_dim,
+ name="user_embeddings",
+ mask_zero=True,
+ input_length=1,
+ embeddings_regularizer=tf.keras.regularizers.L2(self.l2_reg),
+ )
+ self.positional_embedding_layer = tf.keras.layers.Embedding(
+ self.seq_max_len,
+ self.user_embedding_dim + self.item_embedding_dim, # difference
+ name="positional_embeddings",
+ mask_zero=False,
+ embeddings_regularizer=tf.keras.regularizers.L2(self.l2_reg),
+ )
+ self.dropout_layer = tf.keras.layers.Dropout(self.dropout_rate)
+ self.encoder = Encoder(
+ self.num_blocks,
+ self.seq_max_len,
+ self.hidden_units,
+ self.hidden_units,
+ self.attention_num_heads,
+ self.conv_dims,
+ self.dropout_rate,
+ )
+ self.mask_layer = tf.keras.layers.Masking(mask_value=0)
+ self.layer_normalization = LayerNormalization(
+ self.seq_max_len, self.hidden_units, 1e-08
+ )
+
+[docs] def call(self, x, training):
+ """Model forward pass.
+
+ Args:
+ x (tf.Tensor): Input tensor.
+            training (bool): Training flag; enables dropout when True.
+
+ Returns:
+ tf.Tensor, tf.Tensor, tf.Tensor:
+ - Logits of the positive examples.
+ - Logits of the negative examples.
+ - Mask for nonzero targets
+ """
+
+ users = x["users"]
+ input_seq = x["input_seq"]
+ pos = x["positive"]
+ neg = x["negative"]
+
+ mask = tf.expand_dims(tf.cast(tf.not_equal(input_seq, 0), tf.float32), -1)
+ seq_embeddings, positional_embeddings = self.embedding(input_seq)
+
+ # User Encoding
+ # u0_latent = self.user_embedding_layer(users[0])
+ # u0_latent = u0_latent * (self.embedding_dim ** 0.5)
+ u_latent = self.user_embedding_layer(users)
+ u_latent = u_latent * (self.user_embedding_dim**0.5) # (b, 1, h)
+ # return users
+
+ # replicate the user embedding for all the items
+ u_latent = tf.tile(u_latent, [1, tf.shape(input_seq)[1], 1]) # (b, s, h)
+
+ seq_embeddings = tf.reshape(
+ tf.concat([seq_embeddings, u_latent], 2),
+ [tf.shape(input_seq)[0], -1, self.hidden_units],
+ )
+ seq_embeddings += positional_embeddings
+
+ # dropout
+ seq_embeddings = self.dropout_layer(seq_embeddings, training=training)
+
+ # masking
+ seq_embeddings *= mask
+
+ # --- ATTENTION BLOCKS ---
+ seq_attention = seq_embeddings # (b, s, h1 + h2)
+
+ seq_attention = self.encoder(seq_attention, training, mask)
+ seq_attention = self.layer_normalization(seq_attention) # (b, s, h1+h2)
+
+ # --- PREDICTION LAYER ---
+ # user's sequence embedding
+ pos = self.mask_layer(pos)
+ neg = self.mask_layer(neg)
+
+ user_emb = tf.reshape(
+ u_latent,
+ [tf.shape(input_seq)[0] * self.seq_max_len, self.user_embedding_dim],
+ )
+ pos = tf.reshape(pos, [tf.shape(input_seq)[0] * self.seq_max_len])
+ neg = tf.reshape(neg, [tf.shape(input_seq)[0] * self.seq_max_len])
+ pos_emb = self.item_embedding_layer(pos)
+ neg_emb = self.item_embedding_layer(neg)
+
+ # Add user embeddings
+ pos_emb = tf.reshape(tf.concat([pos_emb, user_emb], 1), [-1, self.hidden_units])
+ neg_emb = tf.reshape(tf.concat([neg_emb, user_emb], 1), [-1, self.hidden_units])
+
+ seq_emb = tf.reshape(
+ seq_attention,
+ [tf.shape(input_seq)[0] * self.seq_max_len, self.hidden_units],
+ ) # (b*s, d)
+
+ pos_logits = tf.reduce_sum(pos_emb * seq_emb, -1)
+ neg_logits = tf.reduce_sum(neg_emb * seq_emb, -1)
+
+ pos_logits = tf.expand_dims(pos_logits, axis=-1) # (bs, 1)
+ # pos_prob = tf.keras.layers.Dense(1, activation='sigmoid')(pos_logits) # (bs, 1)
+
+ neg_logits = tf.expand_dims(neg_logits, axis=-1) # (bs, 1)
+ # neg_prob = tf.keras.layers.Dense(1, activation='sigmoid')(neg_logits) # (bs, 1)
+
+ # output = tf.concat([pos_logits, neg_logits], axis=0)
+
+ # masking for loss calculation
+ istarget = tf.reshape(
+ tf.cast(tf.not_equal(pos, 0), dtype=tf.float32),
+ [tf.shape(input_seq)[0] * self.seq_max_len],
+ )
+
+ return pos_logits, neg_logits, istarget
+
+[docs] def predict(self, inputs):
+ """
+ Model prediction for candidate (negative) items
+
+ """
+ training = False
+ user = inputs["user"]
+ input_seq = inputs["input_seq"]
+ candidate = inputs["candidate"]
+
+ mask = tf.expand_dims(tf.cast(tf.not_equal(input_seq, 0), tf.float32), -1)
+ seq_embeddings, positional_embeddings = self.embedding(input_seq) # (1, s, h)
+
+ u0_latent = self.user_embedding_layer(user)
+ u0_latent = u0_latent * (self.user_embedding_dim**0.5) # (1, 1, h)
+ u0_latent = tf.squeeze(u0_latent, axis=0) # (1, h)
+ test_user_emb = tf.tile(u0_latent, [1 + self.num_neg_test, 1]) # (101, h)
+
+ u_latent = self.user_embedding_layer(user)
+ u_latent = u_latent * (self.user_embedding_dim**0.5) # (b, 1, h)
+ u_latent = tf.tile(u_latent, [1, tf.shape(input_seq)[1], 1]) # (b, s, h)
+
+ seq_embeddings = tf.reshape(
+ tf.concat([seq_embeddings, u_latent], 2),
+ [tf.shape(input_seq)[0], -1, self.hidden_units],
+ )
+ seq_embeddings += positional_embeddings # (b, s, h1 + h2)
+
+ seq_embeddings *= mask
+ seq_attention = seq_embeddings
+ seq_attention = self.encoder(seq_attention, training, mask)
+ seq_attention = self.layer_normalization(seq_attention) # (b, s, h1+h2)
+ seq_emb = tf.reshape(
+ seq_attention,
+ [tf.shape(input_seq)[0] * self.seq_max_len, self.hidden_units],
+ ) # (b*s1, h1+h2)
+
+ candidate_emb = self.item_embedding_layer(candidate) # (b, s2, h2)
+ candidate_emb = tf.squeeze(candidate_emb, axis=0) # (s2, h2)
+ candidate_emb = tf.reshape(
+ tf.concat([candidate_emb, test_user_emb], 1), [-1, self.hidden_units]
+ ) # (b*s2, h1+h2)
+
+ candidate_emb = tf.transpose(candidate_emb, perm=[1, 0]) # (h1+h2, b*s2)
+ test_logits = tf.matmul(seq_emb, candidate_emb) # (b*s1, b*s2)
+
+ test_logits = tf.reshape(
+ test_logits,
+ [tf.shape(input_seq)[0], self.seq_max_len, 1 + self.num_neg_test],
+ ) # (1, s, 101)
+ test_logits = test_logits[:, -1, :] # (1, 101)
+ return test_logits
+
+[docs] def loss_function(self, pos_logits, neg_logits, istarget):
+ """Losses are calculated separately for the positive and negative
+ items based on the corresponding logits. A mask is included to
+ take care of the zero items (added for padding).
+
+ Args:
+ pos_logits (tf.Tensor): Logits of the positive examples.
+ neg_logits (tf.Tensor): Logits of the negative examples.
+ istarget (tf.Tensor): Mask for nonzero targets.
+
+ Returns:
+ float: Loss.
+ """
+
+ pos_logits = pos_logits[:, 0]
+ neg_logits = neg_logits[:, 0]
+
+ # ignore padding items (0)
+ # istarget = tf.reshape(
+ # tf.cast(tf.not_equal(self.pos, 0), dtype=tf.float32),
+ # [tf.shape(self.input_seq)[0] * self.seq_max_len],
+ # )
+ # for logits
+ loss = tf.reduce_sum(
+ -tf.math.log(tf.math.sigmoid(pos_logits) + 1e-24) * istarget
+ - tf.math.log(1 - tf.math.sigmoid(neg_logits) + 1e-24) * istarget
+ ) / tf.reduce_sum(istarget)
+
+ # for probabilities
+ # loss = tf.reduce_sum(
+ # - tf.math.log(pos_logits + 1e-24) * istarget -
+ # tf.math.log(1 - neg_logits + 1e-24) * istarget
+ # ) / tf.reduce_sum(istarget)
+ reg_loss = tf.compat.v1.losses.get_regularization_loss()
+ # reg_losses = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.REGULARIZATION_LOSSES)
+ # loss += sum(reg_losses)
+ loss += reg_loss
+
+ return loss
+
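+# Illustrative construction sketch for SSEPT. The hyperparameter values below
+# are arbitrary; `attention_num_heads` comes from the SASREC parent class, and
+# training/evaluation reuse the SASREC routines shown earlier together with a
+# SASRecDataSet and a WarpSampler.
+if __name__ == "__main__":
+    model = SSEPT(
+        item_num=1000,
+        user_num=500,
+        seq_max_len=50,
+        num_blocks=2,
+        embedding_dim=100,
+        attention_dim=100,
+        attention_num_heads=1,
+        conv_dims=[200, 200],
+        dropout_rate=0.2,
+        l2_reg=0.00001,
+        num_neg_test=100,
+    )
+    print(model.hidden_units)  # item_embedding_dim + user_embedding_dim = 200
+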
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+from collections import defaultdict
+
+
+[docs]class SASRecDataSet:
+ """
+ A class for creating SASRec specific dataset used during
+ train, validation and testing.
+
+ Attributes:
+ usernum: integer, total number of users
+ itemnum: integer, total number of items
+ User: dict, all the users (keys) with items as values
+ Items: set of all the items
+ user_train: dict, subset of User that are used for training
+ user_valid: dict, subset of User that are used for validation
+ user_test: dict, subset of User that are used for testing
+ col_sep: column separator in the data file
+ filename: data filename
+ """
+
+ def __init__(self, **kwargs):
+ self.usernum = 0
+ self.itemnum = 0
+ self.User = defaultdict(list)
+ self.Items = set()
+ self.user_train = {}
+ self.user_valid = {}
+ self.user_test = {}
+ self.col_sep = kwargs.get("col_sep", " ")
+ self.filename = kwargs.get("filename", None)
+
+ if self.filename:
+ with open(self.filename, "r") as fr:
+ sample = fr.readline()
+            ncols = len(sample.strip().split(self.col_sep))
+ if ncols == 3:
+ self.with_time = True
+ else:
+ self.with_time = False
+
+ def split(self, **kwargs):
+ self.filename = kwargs.get("filename", self.filename)
+ if not self.filename:
+ raise ValueError("Filename is required")
+
+ if self.with_time:
+ self.data_partition_with_time()
+ else:
+ self.data_partition()
+
+ def data_partition(self):
+ # assume user/item index starting from 1
+        with open(self.filename, "r") as f:
+            for line in f:
+                u, i = line.rstrip().split(self.col_sep)
+                u = int(u)
+                i = int(i)
+                self.usernum = max(u, self.usernum)
+                self.itemnum = max(i, self.itemnum)
+                self.User[u].append(i)
+
+ for user in self.User:
+ nfeedback = len(self.User[user])
+ if nfeedback < 3:
+ self.user_train[user] = self.User[user]
+ self.user_valid[user] = []
+ self.user_test[user] = []
+ else:
+ self.user_train[user] = self.User[user][:-2]
+ self.user_valid[user] = []
+ self.user_valid[user].append(self.User[user][-2])
+ self.user_test[user] = []
+ self.user_test[user].append(self.User[user][-1])
+
+ def data_partition_with_time(self):
+ # assume user/item index starting from 1
+        with open(self.filename, "r") as f:
+            for line in f:
+                u, i, t = line.rstrip().split(self.col_sep)
+                u = int(u)
+                i = int(i)
+                t = float(t)
+                self.usernum = max(u, self.usernum)
+                self.itemnum = max(i, self.itemnum)
+                self.User[u].append((i, t))
+                self.Items.add(i)
+
+ for user in self.User.keys():
+ # sort by time
+ items = sorted(self.User[user], key=lambda x: x[1])
+ # keep only the items
+ items = [x[0] for x in items]
+ self.User[user] = items
+ nfeedback = len(self.User[user])
+ if nfeedback < 3:
+ self.user_train[user] = self.User[user]
+ self.user_valid[user] = []
+ self.user_test[user] = []
+ else:
+ self.user_train[user] = self.User[user][:-2]
+ self.user_valid[user] = []
+ self.user_valid[user].append(self.User[user][-2])
+ self.user_test[user] = []
+ self.user_test[user].append(self.User[user][-1])
+
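+# Illustrative usage sketch: build a SASRecDataSet from a whitespace-separated
+# "user item" file and split it per user into train/validation/test items.
+# The toy file written below is made up.
+if __name__ == "__main__":
+    with open("toy_interactions.txt", "w") as fw:
+        for user, items in {1: [10, 11, 12, 13], 2: [20, 21, 22]}.items():
+            for item in items:
+                fw.write(f"{user} {item}\n")
+
+    data = SASRecDataSet(filename="toy_interactions.txt", col_sep=" ")
+    data.split()
+    print(data.usernum, data.itemnum)  # 2 22
+    print(data.user_train[1])  # [10, 11]
+    print(data.user_valid[1], data.user_test[1])  # [12] [13]
+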
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import pandas as pd
+import numpy as np
+
+from recommenders.utils.constants import (
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_PREDICTION_COL,
+)
+from recommenders.utils.general_utils import invert_dictionary
+
+
+[docs]def surprise_trainset_to_df(
+ trainset, col_user="uid", col_item="iid", col_rating="rating"
+):
+ """Converts a `surprise.Trainset` object to `pandas.DataFrame`
+
+ More info: https://surprise.readthedocs.io/en/stable/trainset.html
+
+ Args:
+ trainset (object): A surprise.Trainset object.
+ col_user (str): User column name.
+ col_item (str): Item column name.
+ col_rating (str): Rating column name.
+
+ Returns:
+ pandas.DataFrame: A dataframe with user column (str), item column (str), and rating column (float).
+ """
+ df = pd.DataFrame(trainset.all_ratings(), columns=[col_user, col_item, col_rating])
+ map_user = (
+ trainset._inner2raw_id_users
+ if trainset._inner2raw_id_users is not None
+ else invert_dictionary(trainset._raw2inner_id_users)
+ )
+ map_item = (
+ trainset._inner2raw_id_items
+ if trainset._inner2raw_id_items is not None
+ else invert_dictionary(trainset._raw2inner_id_items)
+ )
+ df[col_user] = df[col_user].map(map_user)
+ df[col_item] = df[col_item].map(map_item)
+ return df
+
+
+[docs]def predict(
+ algo,
+ data,
+ usercol=DEFAULT_USER_COL,
+ itemcol=DEFAULT_ITEM_COL,
+ predcol=DEFAULT_PREDICTION_COL,
+):
+ """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.
+
+ Args:
+ algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
+ data (pandas.DataFrame): the data on which to predict
+ usercol (str): name of the user column
+ itemcol (str): name of the item column
+
+ Returns:
+ pandas.DataFrame: Dataframe with usercol, itemcol, predcol
+ """
+ predictions = [
+ algo.predict(getattr(row, usercol), getattr(row, itemcol))
+ for row in data.itertuples()
+ ]
+ predictions = pd.DataFrame(predictions)
+ predictions = predictions.rename(
+ index=str, columns={"uid": usercol, "iid": itemcol, "est": predcol}
+ )
+ return predictions.drop(["details", "r_ui"], axis="columns")
+
+
+[docs]def compute_ranking_predictions(
+ algo,
+ data,
+ usercol=DEFAULT_USER_COL,
+ itemcol=DEFAULT_ITEM_COL,
+ predcol=DEFAULT_PREDICTION_COL,
+ remove_seen=False,
+):
+ """Computes predictions of an algorithm from Surprise on all users and items in data. It can be used for computing
+ ranking metrics like NDCG.
+
+ Args:
+ algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
+ data (pandas.DataFrame): the data from which to get the users and items
+ usercol (str): name of the user column
+ itemcol (str): name of the item column
+ remove_seen (bool): flag to remove (user, item) pairs seen in the training data
+
+ Returns:
+ pandas.DataFrame: Dataframe with usercol, itemcol, predcol
+ """
+ preds_lst = []
+ users = data[usercol].unique()
+ items = data[itemcol].unique()
+
+ for user in users:
+ for item in items:
+ preds_lst.append([user, item, algo.predict(user, item).est])
+
+ all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol])
+
+ if remove_seen:
+ tempdf = pd.concat(
+ [
+ data[[usercol, itemcol]],
+ pd.DataFrame(
+ data=np.ones(data.shape[0]), columns=["dummycol"], index=data.index
+ ),
+ ],
+ axis=1,
+ )
+ merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer")
+ return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)
+ else:
+ return all_predictions
+
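+# Illustrative usage sketch for the Surprise helpers above. It requires the
+# scikit-surprise package and uses a made-up toy ratings dataframe: fit an SVD
+# model, convert the Trainset back to a dataframe, then compute rating and
+# ranking predictions.
+if __name__ == "__main__":
+    import surprise
+
+    ratings = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: [1, 1, 2, 2, 3],
+            DEFAULT_ITEM_COL: [10, 11, 10, 12, 11],
+            "rating": [4.0, 3.0, 5.0, 2.0, 4.0],
+        }
+    )
+    reader = surprise.Reader(rating_scale=(1, 5))
+    trainset = surprise.Dataset.load_from_df(
+        ratings[[DEFAULT_USER_COL, DEFAULT_ITEM_COL, "rating"]], reader
+    ).build_full_trainset()
+    algo = surprise.SVD(random_state=0)
+    algo.fit(trainset)
+
+    print(surprise_trainset_to_df(trainset).head())
+    print(predict(algo, ratings).head())
+    print(compute_ranking_predictions(algo, ratings, remove_seen=True).head())
+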
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import linear_kernel
+from transformers import BertTokenizer
+import re
+import unicodedata
+import pandas as pd
+import numpy as np
+
+import nltk
+from nltk.stem.porter import PorterStemmer
+
+
+[docs]class TfidfRecommender:
+ """Term Frequency - Inverse Document Frequency (TF-IDF) Recommender
+
+ This class provides content-based recommendations using TF-IDF vectorization in combination with cosine similarity.
+ """
+
+ def __init__(self, id_col, tokenization_method="scibert"):
+ """Initialize model parameters
+
+ Args:
+ id_col (str): Name of column containing item IDs.
+ tokenization_method (str): ['none','nltk','bert','scibert'] option for tokenization method.
+ """
+ self.id_col = id_col
+ if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]:
+ raise ValueError(
+ 'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]'
+ )
+ self.tokenization_method = tokenization_method.lower()
+
+ # Initialize other variables used in this class
+ self.tf = TfidfVectorizer()
+ self.tfidf_matrix = dict()
+ self.tokens = dict()
+ self.stop_words = frozenset()
+ self.recommendations = dict()
+ self.top_k_recommendations = pd.DataFrame()
+
+ def __clean_text(self, text, for_BERT=False, verbose=False):
+ """Clean text by removing HTML tags, symbols, and punctuation.
+
+ Args:
+ text (str): Text to clean.
+ for_BERT (boolean): True or False for if this text is being cleaned for a BERT word tokenization method.
+ verbose (boolean): True or False for whether to print.
+
+ Returns:
+ str: Cleaned version of text.
+ """
+
+ try:
+ # Normalize unicode
+ text_norm = unicodedata.normalize("NFC", text)
+
+ # Remove HTML tags
+ clean = re.sub("<.*?>", "", text_norm)
+
+ # Remove new line and tabs
+ clean = clean.replace("\n", " ")
+ clean = clean.replace("\t", " ")
+ clean = clean.replace("\r", " ")
+ clean = clean.replace("Â\xa0", "") # non-breaking space
+
+ # Remove all punctuation and special characters
+ clean = re.sub(
+ r"([^\s\w]|_)+", "", clean
+ ) # noqa W695 invalid escape sequence '\s'
+
+ # If you want to keep some punctuation, see below commented out example
+ # clean = re.sub(r'([^\s\w\-\_\(\)]|_)+','', clean)
+
+ # Skip further processing if the text will be used in BERT tokenization
+ if for_BERT is False:
+ # Lower case
+ clean = clean.lower()
+ except Exception:
+ if verbose is True:
+ print("Cannot clean non-existent text")
+ clean = ""
+
+ return clean
+
+[docs] def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"):
+ """Clean the text within the columns of interest and return a dataframe with cleaned and combined text.
+
+ Args:
+ df (pandas.DataFrame): Dataframe containing the text content to clean.
+ cols_to_clean (list of str): List of columns to clean by name (e.g., ['abstract','full_text']).
+ new_col_name (str): Name of the new column that will contain the cleaned text.
+
+ Returns:
+ pandas.DataFrame: Dataframe with cleaned text in the new column.
+ """
+ # Collapse the table such that all descriptive text is just in a single column
+ df = df.replace(np.nan, "", regex=True)
+ df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1)
+
+ # Check if for BERT tokenization
+ if self.tokenization_method in ["bert", "scibert"]:
+ for_BERT = True
+ else:
+ for_BERT = False
+
+ # Clean the text in the dataframe
+ df[new_col_name] = df[new_col_name].map(
+ lambda x: self.__clean_text(x, for_BERT)
+ )
+
+ return df
+
+[docs] def tokenize_text(
+ self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0
+ ):
+ """Tokenize the input text.
+ For more details on the TfidfVectorizer, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
+
+ Args:
+ df_clean (pandas.DataFrame): Dataframe with cleaned text in the new column.
+ text_col (str): Name of column containing the cleaned text.
+ ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
+ min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
+
+ Returns:
+ TfidfVectorizer, pandas.Series:
+ - Scikit-learn TfidfVectorizer object defined in `.tokenize_text()`.
+ - Each row contains tokens for respective documents separated by spaces.
+ """
+ vectors = df_clean[text_col]
+
+ # If a HuggingFace BERT word tokenization method
+ if self.tokenization_method in ["bert", "scibert"]:
+ # Set vectorizer
+ tf = TfidfVectorizer(
+ analyzer="word",
+ ngram_range=ngram_range,
+ min_df=min_df,
+ stop_words="english",
+ )
+
+ # Get appropriate transformer name
+ if self.tokenization_method == "bert":
+ bert_method = "bert-base-cased"
+ elif self.tokenization_method == "scibert":
+ bert_method = "allenai/scibert_scivocab_cased"
+
+ # Load pre-trained model tokenizer (vocabulary)
+ tokenizer = BertTokenizer.from_pretrained(bert_method)
+
+ # Loop through each item
+ vectors_tokenized = vectors.copy()
+ for i in range(0, len(vectors)):
+ vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i]))
+
+ elif self.tokenization_method == "nltk":
+ # NLTK Stemming
+ token_dict = {} # noqa: F841
+ stemmer = PorterStemmer()
+
+ def stem_tokens(tokens, stemmer):
+ stemmed = []
+ for item in tokens:
+ stemmed.append(stemmer.stem(item))
+ return stemmed
+
+ def tokenize(text):
+ tokens = nltk.word_tokenize(text)
+ stems = stem_tokens(tokens, stemmer)
+ return stems
+
+            # When defining a custom tokenizer with TfidfVectorizer, the tokenization is applied in the fit function
+ tf = TfidfVectorizer(
+ tokenizer=tokenize,
+ analyzer="word",
+ ngram_range=ngram_range,
+ min_df=min_df,
+ stop_words="english",
+ )
+ vectors_tokenized = vectors
+
+ elif self.tokenization_method == "none":
+ # No tokenization applied
+ tf = TfidfVectorizer(
+ analyzer="word",
+ ngram_range=ngram_range,
+ min_df=min_df,
+ stop_words="english",
+ )
+ vectors_tokenized = vectors
+
+ # Save to class variable
+ self.tf = tf
+
+ return tf, vectors_tokenized
+
+[docs] def fit(self, tf, vectors_tokenized):
+ """Fit TF-IDF vectorizer to the cleaned and tokenized text.
+
+ Args:
+ tf (TfidfVectorizer): sklearn.feature_extraction.text.TfidfVectorizer object defined in .tokenize_text().
+ vectors_tokenized (pandas.Series): Each row contains tokens for respective documents separated by spaces.
+ """
+ self.tfidf_matrix = tf.fit_transform(vectors_tokenized)
+
+[docs] def get_tokens(self):
+ """Return the tokens generated by the TF-IDF vectorizer.
+
+ Returns:
+ dict: Dictionary of tokens generated by the TF-IDF vectorizer.
+ """
+ try:
+ self.tokens = self.tf.vocabulary_
+ except Exception:
+ self.tokens = "Run .tokenize_text() and .fit_tfidf() first"
+ return self.tokens
+
+[docs] def get_stop_words(self):
+ """Return the stop words excluded in the TF-IDF vectorizer.
+
+ Returns:
+ list: Frozenset of stop words used by the TF-IDF vectorizer (can be converted to list).
+ """
+ try:
+ self.stop_words = self.tf.get_stop_words()
+ except Exception:
+ self.stop_words = "Run .tokenize_text() and .fit_tfidf() first"
+ return self.stop_words
+
+ def __create_full_recommendation_dictionary(self, df_clean):
+ """Create the full recommendation dictionary containing all recommendations for all items.
+
+ Args:
+            df_clean (pandas.DataFrame): Dataframe with cleaned text.
+ """
+
+ # Similarity measure
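+        # (TfidfVectorizer L2-normalizes rows by default, so the linear kernel
+        # of the TF-IDF matrix is equivalent to cosine similarity.)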
+ cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)
+
+ # sorted_idx has the indices that would sort the array.
+ sorted_idx = np.argsort(cosine_sim, axis=1)
+
+ data = list(df_clean[self.id_col].values)
+ len_df_clean = len(df_clean)
+
+ results = {}
+ for idx, row in zip(range(0, len_df_clean), data):
+ similar_indices = sorted_idx[idx][: -(len_df_clean + 1) : -1]
+ similar_items = [(cosine_sim[idx][i], data[i]) for i in similar_indices]
+ results[row] = similar_items[1:]
+
+ # Save to class
+ self.recommendations = results
+
+ def __organize_results_as_tabular(self, df_clean, k):
+ """Restructures results dictionary into a table containing only the top k recommendations per item.
+
+ Args:
+ df_clean (pandas.DataFrame): Dataframe with cleaned text.
+ k (int): Number of recommendations to return.
+ """
+ # Initialize new dataframe to hold recommendation output
+ item_id = list()
+ rec_rank = list()
+ rec_score = list()
+ rec_item_id = list()
+
+ # For each item
+ for _item_id in self.recommendations:
+ # Information about the item we are basing recommendations off of
+ rec_based_on = tmp_item_id = _item_id
+
+ # Get all scores and IDs for items recommended for this current item
+ rec_array = self.recommendations.get(rec_based_on)
+ tmp_rec_score = list(map(lambda x: x[0], rec_array))
+ tmp_rec_id = list(map(lambda x: x[1], rec_array))
+
+ # Append multiple values at a time to list
+ item_id.extend([tmp_item_id] * k)
+ rec_rank.extend(list(range(1, k + 1)))
+ rec_score.extend(tmp_rec_score[:k])
+ rec_item_id.extend(tmp_rec_id[:k])
+
+ # Save the output
+ output_dict = {
+ self.id_col: item_id,
+ "rec_rank": rec_rank,
+ "rec_score": rec_score,
+ "rec_" + self.id_col: rec_item_id,
+ }
+
+ # Convert to dataframe
+ self.top_k_recommendations = pd.DataFrame(output_dict)
+
+[docs] def recommend_top_k_items(self, df_clean, k=5):
+ """Recommend k number of items similar to the item of interest.
+
+ Args:
+ df_clean (pandas.DataFrame): Dataframe with cleaned text.
+ k (int): Number of recommendations to return.
+
+ Returns:
+ pandas.DataFrame: Dataframe containing id of top k recommendations for all items.
+ """
+ if k > len(df_clean) - 1:
+ raise ValueError(
+ "Cannot get more recommendations than there are items. Set k lower."
+ )
+ self.__create_full_recommendation_dictionary(df_clean)
+ self.__organize_results_as_tabular(df_clean, k)
+
+ return self.top_k_recommendations
+
+ def __get_single_item_info(self, metadata, rec_id):
+ """Get full information for a single recommended item.
+
+ Args:
+ metadata (pandas.DataFrame): Dataframe containing item info.
+ rec_id (str): Identifier for recommended item.
+
+ Returns:
+ pandas.Series: Single row from dataframe containing recommended item info.
+ """
+
+ # Return row
+ rec_info = metadata.iloc[int(np.where(metadata[self.id_col] == rec_id)[0])]
+
+ return rec_info
+
+ def __make_clickable(self, address):
+ """Make URL clickable.
+
+ Args:
+ address (str): URL address to make clickable.
+ """
+ return '<a href="{0}">{0}</a>'.format(address)
+
+[docs] def get_top_k_recommendations(
+ self, metadata, query_id, cols_to_keep=[], verbose=True
+ ):
+ """Return the top k recommendations with useful metadata for each recommendation.
+
+ Args:
+ metadata (pandas.DataFrame): Dataframe holding metadata for all public domain papers.
+ query_id (str): ID of item of interest.
+ cols_to_keep (list of str): List of columns from the metadata dataframe to include
+ (e.g., ['title','authors','journal','publish_time','url']).
+ By default, all columns are kept.
+ verbose (boolean): Set to True if you want to print the table.
+
+ Returns:
+ pandas.Styler: Stylized dataframe holding recommendations and associated metadata just for the item of interest (can access as normal dataframe by using df.data).
+ """
+
+ # Create subset of dataframe with just recommendations for the item of interest
+ df = self.top_k_recommendations.loc[
+ self.top_k_recommendations[self.id_col] == query_id
+ ].reset_index()
+
+ # Remove id_col of query item
+ df.drop([self.id_col], axis=1, inplace=True)
+
+ # Add metadata for each recommended item (rec_<id_col>)
+ metadata_cols = metadata.columns.values
+ df[metadata_cols] = df.apply(
+ lambda row: self.__get_single_item_info(
+ metadata, row["rec_" + self.id_col]
+ ),
+ axis=1,
+ )
+
+ # Remove id col added from metadata (already present from self.top_k_recommendations)
+ df.drop([self.id_col], axis=1, inplace=True)
+
+ # Rename columns such that rec_ is no longer appended, for simplicity
+ df = df.rename(columns={"rec_rank": "rank", "rec_score": "similarity_score"})
+
+ # Only keep columns of interest
+ if len(cols_to_keep) > 0:
+ # Insert our recommendation scoring/ranking columns
+ cols_to_keep.insert(0, "similarity_score")
+ cols_to_keep.insert(0, "rank")
+ df = df[cols_to_keep]
+
+ # Make URLs clickable if they exist
+ if "url" in list(map(lambda x: x.lower(), metadata_cols)):
+ format_ = {"url": self.__make_clickable}
+ df = df.head().style.format(format_)
+
+ if verbose:
+            print(df)
+
+ return df
+
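+# Illustrative end-to-end sketch for TfidfRecommender on a made-up toy corpus,
+# using the "none" tokenization method so that no BERT or NLTK assets are
+# required: clean, tokenize, fit, then recommend similar items.
+if __name__ == "__main__":
+    toy = pd.DataFrame(
+        {
+            "paper_id": ["a", "b", "c"],
+            "abstract": [
+                "deep learning for recommender systems",
+                "collaborative filtering with matrix factorization",
+                "transformers for sequential recommendation",
+            ],
+        }
+    )
+    recommender = TfidfRecommender(id_col="paper_id", tokenization_method="none")
+    clean = recommender.clean_dataframe(toy, cols_to_clean=["abstract"])
+    vectorizer, tokens = recommender.tokenize_text(clean)
+    recommender.fit(vectorizer, tokens)
+    print(recommender.recommend_top_k_items(clean, k=2))
+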
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from recommenders.evaluation.python_evaluation import ndcg_at_k
+
+import tensorflow as tf
+from tensorflow.keras.layers import *
+from tensorflow.keras.models import Model
+from tensorflow.keras import backend as K
+from tensorflow.keras.callbacks import ReduceLROnPlateau, Callback
+
+
+[docs]class LossHistory(Callback):
+ """This class is used for saving the validation loss and the training loss per epoch."""
+
+[docs] def on_train_begin(self, logs={}):
+ """Initialise the lists where the loss of training and validation will be saved."""
+ self.losses = []
+ self.val_losses = []
+
+[docs] def on_epoch_end(self, epoch, logs={}):
+ """Save the loss of training and validation set at the end of each epoch."""
+ self.losses.append(logs.get("loss"))
+ self.val_losses.append(logs.get("val_loss"))
+
+
+[docs]class Metrics(Callback):
+ """Callback function used to calculate the NDCG@k metric of validation set at the end of each epoch.
+    Weights of the model with the highest NDCG@k value are saved."""
+
+ def __init__(self, model, val_tr, val_te, mapper, k, save_path=None):
+
+ """Initialize the class parameters.
+
+ Args:
+ model: trained model for validation.
+ val_tr (numpy.ndarray, float): the click matrix for the validation set training part.
+ val_te (numpy.ndarray, float): the click matrix for the validation set testing part.
+ mapper (AffinityMatrix): the mapper for converting click matrix to dataframe.
+ k (int): number of top k items per user (optional).
+ save_path (str): Default path to save weights.
+ """
+ # Model
+ self.model = model
+
+ # Initial value of NDCG
+ self.best_ndcg = 0.0
+
+ # Validation data: training and testing parts
+ self.val_tr = val_tr
+ self.val_te = val_te
+
+ # Mapper for converting from sparse matrix to dataframe
+ self.mapper = mapper
+
+ # Top k items to recommend
+ self.k = k
+
+ # Options to save the weights of the model for future use
+ self.save_path = save_path
+
+[docs] def on_train_begin(self, logs={}):
+ """Initialise the list for validation NDCG@k."""
+ self._data = []
+
+[docs] def recommend_k_items(self, x, k, remove_seen=True):
+ """Returns the top-k items ordered by a relevancy score.
+ Obtained probabilities are used as recommendation score.
+
+ Args:
+ x (numpy.ndarray, int32): input click matrix.
+ k (scalar, int32): the number of items to recommend.
+
+ Returns:
+ numpy.ndarray: A sparse matrix containing the top_k elements ordered by their score.
+
+ """
+ # obtain scores
+ score = self.model.predict(x)
+
+ if remove_seen:
+ # if true, it removes items from the train set by setting them to zero
+ seen_mask = np.not_equal(x, 0)
+ score[seen_mask] = 0
+
+ # get the top k items
+ top_items = np.argpartition(-score, range(k), axis=1)[:, :k]
+
+ # get a copy of the score matrix
+ score_c = score.copy()
+
+ # set to zero the k elements
+ score_c[np.arange(score_c.shape[0])[:, None], top_items] = 0
+
+        # set to zero all elements other than the top k
+ top_scores = score - score_c
+
+ return top_scores
+
+[docs] def on_epoch_end(self, batch, logs={}):
+ """At the end of each epoch calculate NDCG@k of the validation set.
+
+ If the model performance is improved, the model weights are saved.
+        Update the list of validation NDCG@k by adding the obtained value.
+
+ """
+ # recommend top k items based on training part of validation set
+ top_k = self.recommend_k_items(x=self.val_tr, k=self.k, remove_seen=True)
+
+ # convert recommendations from sparse matrix to dataframe
+ top_k_df = self.mapper.map_back_sparse(top_k, kind="prediction")
+ test_df = self.mapper.map_back_sparse(self.val_te, kind="ratings")
+
+ # calculate NDCG@k
+ NDCG = ndcg_at_k(test_df, top_k_df, col_prediction="prediction", k=self.k)
+
+ # check if there is an improvement in NDCG, if so, update the weights of the saved model
+ if NDCG > self.best_ndcg:
+ self.best_ndcg = NDCG
+
+ # save the weights of the optimal model
+ if self.save_path is not None:
+ self.model.save(self.save_path)
+
+ self._data.append(NDCG)
+
+[docs] def get_data(self):
+ """Returns a list of the NDCG@k of the validation set metrics calculated
+ at the end of each epoch."""
+ return self._data
+
+
+[docs]class AnnealingCallback(Callback):
+ """This class is used for updating the value of β during the annealing process.
+ When β reaches the value of anneal_cap, it stops increasing."""
+
+ def __init__(self, beta, anneal_cap, total_anneal_steps):
+
+ """Constructor
+
+ Args:
+ beta (float): current value of beta.
+ anneal_cap (float): maximum value that beta can reach.
+ total_anneal_steps (int): total number of annealing steps.
+ """
+ # maximum value that beta can take
+ self.anneal_cap = anneal_cap
+
+ # initial value of beta
+ self.beta = beta
+
+ # update_count used for calculating the updated value of beta
+ self.update_count = 0
+
+ # total annealing steps
+ self.total_anneal_steps = total_anneal_steps
+
+[docs] def on_train_begin(self, logs={}):
+ """Initialise a list in which the beta value will be saved at the end of each epoch."""
+ self._beta = []
+
+[docs] def on_batch_end(self, epoch, logs={}):
+ """At the end of each batch the beta should is updated until it reaches the values of anneal cap."""
+ self.update_count = self.update_count + 1
+
+ new_beta = min(
+ 1.0 * self.update_count / self.total_anneal_steps, self.anneal_cap
+ )
+
+ K.set_value(self.beta, new_beta)
+
+[docs] def on_epoch_end(self, epoch, logs={}):
+ """At the end of each epoch save the value of beta in _beta list."""
+ tmp = K.eval(self.beta)
+ self._beta.append(tmp)
+
+
+
+
+[docs]class Mult_VAE:
+ """Multinomial Variational Autoencoders (Multi-VAE) for Collaborative Filtering implementation
+
+ :Citation:
+
+ Liang, Dawen, et al. "Variational autoencoders for collaborative filtering."
+ Proceedings of the 2018 World Wide Web Conference. 2018.
+ https://arxiv.org/pdf/1802.05814.pdf
+ """
+
+ def __init__(
+ self,
+ n_users,
+ original_dim,
+ intermediate_dim=200,
+ latent_dim=70,
+ n_epochs=400,
+ batch_size=100,
+ k=100,
+ verbose=1,
+ drop_encoder=0.5,
+ drop_decoder=0.5,
+ beta=1.0,
+ annealing=False,
+ anneal_cap=1.0,
+ seed=None,
+ save_path=None,
+ ):
+
+ """Constructor
+
+ Args:
+ n_users (int): Number of unique users in the train set.
+ original_dim (int): Number of unique items in the train set.
+ intermediate_dim (int): Dimension of intermediate space.
+ latent_dim (int): Dimension of latent space.
+ n_epochs (int): Number of epochs for training.
+ batch_size (int): Batch size.
+ k (int): number of top k items per user.
+ verbose (int): Whether to show the training output or not.
+ drop_encoder (float): Dropout percentage of the encoder.
+ drop_decoder (float): Dropout percentage of the decoder.
+ beta (float): a constant parameter β in the ELBO function,
+ when you are not using annealing (annealing=False)
+ annealing (bool): option of using annealing method for training the model (True)
+ or not using annealing, keeping a constant beta (False)
+ anneal_cap (float): maximum value that beta can take during annealing process.
+ seed (int): Seed.
+ save_path (str): Default path to save weights.
+ """
+ # Seed
+ self.seed = seed
+ np.random.seed(self.seed)
+
+ # Parameters
+ self.n_users = n_users
+ self.original_dim = original_dim
+ self.intermediate_dim = intermediate_dim
+ self.latent_dim = latent_dim
+ self.n_epochs = n_epochs
+ self.batch_size = batch_size
+ self.k = k
+ self.verbose = verbose
+
+ # Compute samples per epoch
+ self.number_of_batches = self.n_users // self.batch_size
+
+ # Annealing parameters
+ self.anneal_cap = anneal_cap
+ self.annealing = annealing
+
+ if self.annealing:
+ self.beta = K.variable(0.0)
+ else:
+ self.beta = beta
+
+ # Compute total annealing steps
+ self.total_anneal_steps = (
+ self.number_of_batches
+ * (self.n_epochs - int(self.n_epochs * 0.2))
+ // self.anneal_cap
+ )
+
+ # Dropout parameters
+ self.drop_encoder = drop_encoder
+ self.drop_decoder = drop_decoder
+
+ # Path to save optimal model
+ self.save_path = save_path
+
+ # Create StandardVAE model
+ self._create_model()
+
+ def _create_model(self):
+ """Build and compile model."""
+ # Encoding
+ self.x = Input(shape=(self.original_dim,))
+ self.x_ = Lambda(lambda x: K.l2_normalize(x, axis=1))(self.x)
+ self.dropout_encoder = Dropout(self.drop_encoder)(self.x_)
+
+ self.h = Dense(
+ self.intermediate_dim,
+ activation="tanh",
+ kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(
+ seed=self.seed
+ ),
+ bias_initializer=tf.compat.v1.keras.initializers.truncated_normal(
+ stddev=0.001, seed=self.seed
+ ),
+ )(self.dropout_encoder)
+ self.z_mean = Dense(self.latent_dim)(self.h)
+ self.z_log_var = Dense(self.latent_dim)(self.h)
+
+ # Sampling
+ self.z = Lambda(self._take_sample, output_shape=(self.latent_dim,))(
+ [self.z_mean, self.z_log_var]
+ )
+
+ # Decoding
+ self.h_decoder = Dense(
+ self.intermediate_dim,
+ activation="tanh",
+ kernel_initializer=tf.compat.v1.keras.initializers.glorot_uniform(
+ seed=self.seed
+ ),
+ bias_initializer=tf.compat.v1.keras.initializers.truncated_normal(
+ stddev=0.001, seed=self.seed
+ ),
+ )
+ self.dropout_decoder = Dropout(self.drop_decoder)
+ self.x_bar = Dense(self.original_dim)
+ self.h_decoded = self.h_decoder(self.z)
+ self.h_decoded_ = self.dropout_decoder(self.h_decoded)
+ self.x_decoded = self.x_bar(self.h_decoded_)
+
+ # Training
+ self.model = Model(self.x, self.x_decoded)
+ self.model.compile(
+ optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
+ loss=self._get_vae_loss,
+ )
+
+ def _get_vae_loss(self, x, x_bar):
+ """Calculate negative ELBO (NELBO)."""
+ log_softmax_var = tf.nn.log_softmax(x_bar)
+ self.neg_ll = -tf.reduce_mean(
+ input_tensor=tf.reduce_sum(input_tensor=log_softmax_var * x, axis=-1)
+ )
+ a = tf.keras.backend.print_tensor(self.neg_ll) # noqa: F841
+        # calculate the positive Kullback–Leibler divergence term
+ kl_loss = K.mean(
+ 0.5
+ * K.sum(
+ -1 - self.z_log_var + K.square(self.z_mean) + K.exp(self.z_log_var),
+ axis=-1,
+ )
+ )
+
+ # obtain negative ELBO
+ neg_ELBO = self.neg_ll + self.beta * kl_loss
+
+ return neg_ELBO
+
+ def _take_sample(self, args):
+ """Sample epsilon ∼ N (0,I) and compute z via reparametrization trick."""
+
+ """Calculate latent vector using the reparametrization trick.
+ The idea is that sampling from N (_mean, _var) is s the same as sampling from _mean+ epsilon * _var
+ where epsilon ∼ N(0,I)."""
+ # _mean and _log_var calculated in encoder
+ _mean, _log_var = args
+
+ # epsilon
+ epsilon = K.random_normal(
+ shape=(K.shape(_mean)[0], self.latent_dim),
+ mean=0.0,
+ stddev=1.0,
+ seed=self.seed,
+ )
+
+ return _mean + K.exp(_log_var / 2) * epsilon
+
+[docs] def nn_batch_generator(self, x_train):
+ """Used for splitting dataset in batches.
+
+ Args:
+ x_train (numpy.ndarray): The click matrix for the train set, with float values.
+ """
+ # Shuffle the batch
+ np.random.seed(self.seed)
+ shuffle_index = np.arange(np.shape(x_train)[0])
+ np.random.shuffle(shuffle_index)
+ x = x_train[shuffle_index, :]
+ y = x_train[shuffle_index, :]
+
+ # Iterate until making a full epoch
+ counter = 0
+ while 1:
+ index_batch = shuffle_index[
+ self.batch_size * counter : self.batch_size * (counter + 1)
+ ]
+ # Decompress batch
+ x_batch = x[index_batch, :]
+ y_batch = y[index_batch, :]
+ counter += 1
+ yield (np.array(x_batch), np.array(y_batch))
+
+ # Stopping rule
+ if counter >= self.number_of_batches:
+ counter = 0
+
+[docs] def fit(self, x_train, x_valid, x_val_tr, x_val_te, mapper):
+ """Fit model with the train sets and validate on the validation set.
+
+ Args:
+ x_train (numpy.ndarray): the click matrix for the train set.
+ x_valid (numpy.ndarray): the click matrix for the validation set.
+ x_val_tr (numpy.ndarray): the click matrix for the validation set training part.
+ x_val_te (numpy.ndarray): the click matrix for the validation set testing part.
+ mapper (object): the mapper for converting click matrix to dataframe. It can be AffinityMatrix.
+ """
+ # initialise LossHistory used for saving loss of validation and train set per epoch
+ history = LossHistory()
+
+ # initialise Metrics used for calculating NDCG@k per epoch
+ # and saving the model weights with the highest NDCG@k value
+ metrics = Metrics(
+ model=self.model,
+ val_tr=x_val_tr,
+ val_te=x_val_te,
+ mapper=mapper,
+ k=self.k,
+ save_path=self.save_path,
+ )
+
+ self.reduce_lr = ReduceLROnPlateau(
+ monitor="val_loss", factor=0.2, patience=1, min_lr=0.0001
+ )
+
+ if self.annealing:
+ # initialise AnnealingCallback for annealing process
+ anneal = AnnealingCallback(
+ self.beta, self.anneal_cap, self.total_anneal_steps
+ )
+
+ # fit model
+ self.model.fit_generator(
+ generator=self.nn_batch_generator(x_train),
+ steps_per_epoch=self.number_of_batches,
+ epochs=self.n_epochs,
+ verbose=self.verbose,
+ callbacks=[metrics, history, self.reduce_lr, anneal],
+ validation_data=(x_valid, x_valid),
+ )
+
+ self.ls_beta = anneal.get_data()
+
+ else:
+ self.model.fit_generator(
+ generator=self.nn_batch_generator(x_train),
+ steps_per_epoch=self.number_of_batches,
+ epochs=self.n_epochs,
+ verbose=self.verbose,
+ callbacks=[metrics, history, self.reduce_lr],
+ validation_data=(x_valid, x_valid),
+ )
+
+ # save lists
+ self.train_loss = history.losses
+ self.val_loss = history.val_losses
+ self.val_ndcg = metrics.get_data()
+
+[docs] def get_optimal_beta(self):
+ """Returns the value of the optimal beta."""
+ if self.annealing:
+ # find the epoch/index that had the highest NDCG@k value
+ index_max_ndcg = np.argmax(self.val_ndcg)
+
+ # using this index find the value that beta had at this epoch
+ return self.ls_beta[index_max_ndcg]
+ else:
+ return self.beta
+
+[docs] def display_metrics(self):
+ """Plots:
+ 1) Loss per epoch both for validation and train set
+ 2) NDCG@k per epoch of the validation set
+ """
+ # Plot setup
+ plt.figure(figsize=(14, 5))
+ sns.set(style="whitegrid")
+
+ # Plot loss on the left graph
+ plt.subplot(1, 2, 1)
+ plt.plot(self.train_loss, color="b", linestyle="-", label="Train")
+ plt.plot(self.val_loss, color="r", linestyle="-", label="Val")
+ plt.title("\n")
+ plt.xlabel("Epochs", size=14)
+ plt.ylabel("Loss", size=14)
+ plt.legend(loc="upper left")
+
+ # Plot NDCG on the right graph
+ plt.subplot(1, 2, 2)
+ plt.plot(self.val_ndcg, color="r", linestyle="-", label="Val")
+ plt.title("\n")
+ plt.xlabel("Epochs", size=14)
+ plt.ylabel("NDCG@k", size=14)
+ plt.legend(loc="upper left")
+
+ # Add title
+ plt.suptitle("TRAINING AND VALIDATION METRICS HISTORY", size=16)
+ plt.tight_layout(pad=2)
+
+[docs] def recommend_k_items(self, x, k, remove_seen=True):
+ """Returns the top-k items ordered by a relevancy score.
+ Obtained probabilities are used as recommendation score.
+
+ Args:
+ x (numpy.ndarray, int32): input click matrix.
+ k (scalar, int32): the number of items to recommend.
+ Returns:
+ numpy.ndarray, float: A sparse matrix containing the top_k elements ordered by their score.
+ """
+ # return optimal model
+ self.model.load_weights(self.save_path)
+
+ # obtain scores
+ score = self.model.predict(x)
+
+ if remove_seen:
+ # if true, it removes items from the train set by setting them to zero
+ seen_mask = np.not_equal(x, 0)
+ score[seen_mask] = 0
+ # get the top k items
+ top_items = np.argpartition(-score, range(k), axis=1)[:, :k]
+ # get a copy of the score matrix
+ score_c = score.copy()
+ # set to zero the k elements
+ score_c[np.arange(score_c.shape[0])[:, None], top_items] = 0
+        # set to zero all elements other than the top k
+ top_scores = score - score_c
+ return top_scores
+
+[docs] def ndcg_per_epoch(self):
+ """Returns the list of NDCG@k at each epoch."""
+ return self.val_ndcg
+
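+# Illustrative construction sketch for Mult_VAE. Training additionally needs
+# binarized click matrices and an AffinityMatrix-style mapper (see `fit`),
+# which are omitted here; the dimensions and the weights path are arbitrary.
+if __name__ == "__main__":
+    vae = Mult_VAE(
+        n_users=1000,  # users in the train set
+        original_dim=500,  # items in the train set
+        intermediate_dim=200,
+        latent_dim=70,
+        n_epochs=10,
+        batch_size=100,
+        k=10,
+        annealing=True,
+        anneal_cap=0.5,
+        save_path="mult_vae_weights.hdf5",  # hypothetical path
+    )
+    vae.model.summary()  # underlying Keras model: encoder -> sampling -> decoder
+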
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import tensorflow as tf
+from tensorflow.keras.layers import *
+from tensorflow.keras.models import Model
+from tensorflow.keras.losses import binary_crossentropy
+from tensorflow.keras import backend as K
+from tensorflow.keras.callbacks import ReduceLROnPlateau, Callback
+
+from recommenders.evaluation.python_evaluation import ndcg_at_k
+
+
+[docs]class LossHistory(Callback):
+ """This class is used for saving the validation loss and the training loss per epoch."""
+
+[docs] def on_train_begin(self, logs={}):
+ """Initialise the lists where the loss of training and validation will be saved."""
+ self.losses = []
+ self.val_losses = []
+
+[docs] def on_epoch_end(self, epoch, logs={}):
+ """Save the loss of training and validation set at the end of each epoch."""
+ self.losses.append(logs.get("loss"))
+ self.val_losses.append(logs.get("val_loss"))
+
+
+[docs]class Metrics(Callback):
+ """Callback function used to calculate the NDCG@k metric of validation set at the end of each epoch.
+    Weights of the model with the highest NDCG@k value are saved."""
+
+ def __init__(self, model, val_tr, val_te, mapper, k, save_path=None):
+
+ """Initialize the class parameters.
+
+ Args:
+ model: trained model for validation.
+ val_tr (numpy.ndarray, float): the click matrix for the validation set training part.
+ val_te (numpy.ndarray, float): the click matrix for the validation set testing part.
+ mapper (AffinityMatrix): the mapper for converting click matrix to dataframe.
+ k (int): number of top k items per user (optional).
+ save_path (str): Default path to save weights.
+ """
+ # Model
+ self.model = model
+
+ # Initial value of NDCG
+ self.best_ndcg = 0.0
+
+ # Validation data: training and testing parts
+ self.val_tr = val_tr
+ self.val_te = val_te
+
+ # Mapper for converting from sparse matrix to dataframe
+ self.mapper = mapper
+
+ # Top k items to recommend
+ self.k = k
+
+ # Options to save the weights of the model for future use
+ self.save_path = save_path
+
+[docs] def on_train_begin(self, logs={}):
+ """Initialise the list for validation NDCG@k."""
+ self._data = []
+
+[docs] def recommend_k_items(self, x, k, remove_seen=True):
+ """Returns the top-k items ordered by a relevancy score.
+ Obtained probabilities are used as recommendation score.
+
+ Args:
+ x (numpy.ndarray, int32): input click matrix.
+ k (scalar, int32): the number of items to recommend.
+
+ Returns:
+ numpy.ndarray: A sparse matrix containing the top_k elements ordered by their score.
+
+ """
+ # obtain scores
+ score = self.model.predict(x)
+
+ if remove_seen:
+ # if true, it removes items from the train set by setting them to zero
+ seen_mask = np.not_equal(x, 0)
+ score[seen_mask] = 0
+
+ # get the top k items
+ top_items = np.argpartition(-score, range(k), axis=1)[:, :k]
+
+ # get a copy of the score matrix
+ score_c = score.copy()
+
+ # set to zero the k elements
+ score_c[np.arange(score_c.shape[0])[:, None], top_items] = 0
+
+        # set to zero all elements other than the top k
+ top_scores = score - score_c
+
+ return top_scores
+
+[docs] def on_epoch_end(self, batch, logs={}):
+ """At the end of each epoch calculate NDCG@k of the validation set.
+ If the model performance is improved, the model weights are saved.
+        Update the list of validation NDCG@k by adding the obtained value.
+ """
+ # recommend top k items based on training part of validation set
+ top_k = self.recommend_k_items(x=self.val_tr, k=self.k, remove_seen=True)
+
+ # convert recommendations from sparse matrix to dataframe
+ top_k_df = self.mapper.map_back_sparse(top_k, kind="prediction")
+ test_df = self.mapper.map_back_sparse(self.val_te, kind="ratings")
+
+ # calculate NDCG@k
+ NDCG = ndcg_at_k(test_df, top_k_df, col_prediction="prediction", k=self.k)
+
+ # check if there is an improvement in NDCG, if so, update the weights of the saved model
+ if NDCG > self.best_ndcg:
+ self.best_ndcg = NDCG
+
+ # save the weights of the optimal model
+ if self.save_path is not None:
+ self.model.save(self.save_path)
+
+ self._data.append(NDCG)
+
+[docs] def get_data(self):
+ """Returns a list of the NDCG@k of the validation set metrics calculated
+ at the end of each epoch."""
+ return self._data
+
+
+[docs]class AnnealingCallback(Callback):
+ """This class is used for updating the value of β during the annealing process.
+ When β reaches the value of anneal_cap, it stops increasing.
+ """
+
+ def __init__(self, beta, anneal_cap, total_anneal_steps):
+
+ """Constructor
+
+ Args:
+ beta (float): current value of beta.
+ anneal_cap (float): maximum value that beta can reach.
+ total_anneal_steps (int): total number of annealing steps.
+ """
+ # maximum value that beta can take
+ self.anneal_cap = anneal_cap
+
+ # initial value of beta
+ self.beta = beta
+
+ # update_count used for calculating the updated value of beta
+ self.update_count = 0
+
+ # total annealing steps
+ self.total_anneal_steps = total_anneal_steps
+
+[docs] def on_train_begin(self, logs={}):
+ """Initialise a list in which the beta value will be saved at the end of each epoch."""
+ self._beta = []
+
+[docs] def on_batch_end(self, epoch, logs={}):
+ """At the end of each batch the beta should is updated until it reaches the values of anneal cap."""
+ self.update_count = self.update_count + 1
+
+ new_beta = min(
+ 1.0 * self.update_count / self.total_anneal_steps, self.anneal_cap
+ )
+
+ K.set_value(self.beta, new_beta)
+
+[docs] def on_epoch_end(self, epoch, logs={}):
+ """At the end of each epoch save the value of beta in _beta list."""
+ tmp = K.eval(self.beta)
+ self._beta.append(tmp)
+
+
+
+
+[docs]class StandardVAE:
+ """Standard Variational Autoencoders (VAE) for Collaborative Filtering implementation."""
+
+ def __init__(
+ self,
+ n_users,
+ original_dim,
+ intermediate_dim=200,
+ latent_dim=70,
+ n_epochs=400,
+ batch_size=100,
+ k=100,
+ verbose=1,
+ drop_encoder=0.5,
+ drop_decoder=0.5,
+ beta=1.0,
+ annealing=False,
+ anneal_cap=1.0,
+ seed=None,
+ save_path=None,
+ ):
+
+ """Initialize class parameters.
+
+ Args:
+ n_users (int): Number of unique users in the train set.
+ original_dim (int): Number of unique items in the train set.
+ intermediate_dim (int): Dimension of intermediate space.
+ latent_dim (int): Dimension of latent space.
+ n_epochs (int): Number of epochs for training.
+ batch_size (int): Batch size.
+ k (int): number of top k items per user.
+ verbose (int): Whether to show the training output or not.
+ drop_encoder (float): Dropout percentage of the encoder.
+ drop_decoder (float): Dropout percentage of the decoder.
+ beta (float): a constant parameter β in the ELBO function,
+ when you are not using annealing (annealing=False)
+ annealing (bool): option of using annealing method for training the model (True)
+ or not using annealing, keeping a constant beta (False)
+ anneal_cap (float): maximum value that beta can take during annealing process.
+ seed (int): Seed.
+ save_path (str): Default path to save weights.
+ """
+ # Seed
+ self.seed = seed
+ np.random.seed(self.seed)
+
+ # Parameters
+ self.n_users = n_users
+ self.original_dim = original_dim
+ self.intermediate_dim = intermediate_dim
+ self.latent_dim = latent_dim
+ self.n_epochs = n_epochs
+ self.batch_size = batch_size
+ self.k = k
+ self.verbose = verbose
+
+ # Compute samples per epoch
+ self.number_of_batches = self.n_users // self.batch_size
+
+ # Annealing parameters
+ self.anneal_cap = anneal_cap
+ self.annealing = annealing
+
+ if self.annealing:
+ self.beta = K.variable(0.0)
+ else:
+ self.beta = beta
+
+ # Compute total annealing steps
+ self.total_anneal_steps = (
+ self.number_of_batches * (self.n_epochs - int(self.n_epochs * 0.2))
+ ) // self.anneal_cap
+
+ # Dropout parameters
+ self.drop_encoder = drop_encoder
+ self.drop_decoder = drop_decoder
+
+ # Path to save optimal model
+ self.save_path = save_path
+
+ # Create StandardVAE model
+ self._create_model()
+
+ def _create_model(self):
+ """Build and compile model."""
+ # Encoding
+ self.x = Input(shape=(self.original_dim,))
+ self.dropout_encoder = Dropout(self.drop_encoder)(self.x)
+ self.h = Dense(self.intermediate_dim, activation="tanh")(self.dropout_encoder)
+ self.z_mean = Dense(self.latent_dim)(self.h)
+ self.z_log_var = Dense(self.latent_dim)(self.h)
+
+ # Sampling
+ self.z = Lambda(self._take_sample, output_shape=(self.latent_dim,))(
+ [self.z_mean, self.z_log_var]
+ )
+
+ # Decoding
+ self.h_decoder = Dense(self.intermediate_dim, activation="tanh")
+ self.dropout_decoder = Dropout(self.drop_decoder)
+ self.x_bar = Dense(self.original_dim, activation="softmax")
+ self.h_decoded = self.h_decoder(self.z)
+ self.h_decoded_ = self.dropout_decoder(self.h_decoded)
+ self.x_decoded = self.x_bar(self.h_decoded_)
+
+ # Training
+ self.model = Model(self.x, self.x_decoded)
+ self.model.compile(
+ optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.001),
+ loss=self._get_vae_loss,
+ )
+
+ def _get_vae_loss(self, x, x_bar):
+ """Calculate negative ELBO (NELBO)."""
+ # Reconstruction error: logistic log likelihood
+ reconst_loss = self.original_dim * binary_crossentropy(x, x_bar)
+
+ # Kullback–Leibler divergence
+ kl_loss = 0.5 * K.sum(
+ -1 - self.z_log_var + K.square(self.z_mean) + K.exp(self.z_log_var), axis=-1
+ )
+
+ return reconst_loss + self.beta * kl_loss
+
+ def _take_sample(self, args):
+ """Sample epsilon ∼ N (0,I) and compute z via reparametrization trick."""
+ """Calculate latent vector using the reparametrization trick.
+ The idea is that sampling from N (_mean, _var) is s the same as sampling from _mean+ epsilon * _var
+ where epsilon ∼ N(0,I)."""
+ # sampling from latent dimension for decoder/generative part of network
+ _mean, _log_var = args
+ epsilon = K.random_normal(
+ shape=(K.shape(_mean)[0], self.latent_dim),
+ mean=0.0,
+ stddev=1.0,
+ seed=self.seed,
+ )
+
+ return _mean + K.exp(_log_var / 2) * epsilon
+
+[docs] def nn_batch_generator(self, x_train):
+ """Used for splitting dataset in batches.
+
+ Args:
+ x_train (numpy.ndarray): The click matrix for the train set with float values.
+ """
+ # Shuffle the batch
+ np.random.seed(self.seed)
+ shuffle_index = np.arange(np.shape(x_train)[0])
+ np.random.shuffle(shuffle_index)
+ x = x_train[shuffle_index, :]
+ y = x_train[shuffle_index, :]
+
+ # Iterate until making a full epoch
+ counter = 0
+ while 1:
+ index_batch = shuffle_index[
+ self.batch_size * counter : self.batch_size * (counter + 1)
+ ]
+ # Decompress batch
+ x_batch = x[index_batch, :]
+ y_batch = y[index_batch, :]
+ counter += 1
+ yield (np.array(x_batch), np.array(y_batch))
+
+ # Stopping rule
+ if counter >= self.number_of_batches:
+ counter = 0
+
+[docs] def fit(self, x_train, x_valid, x_val_tr, x_val_te, mapper):
+ """Fit model with the train sets and validate on the validation set.
+
+ Args:
+ x_train (numpy.ndarray): The click matrix for the train set.
+ x_valid (numpy.ndarray): The click matrix for the validation set.
+ x_val_tr (numpy.ndarray): The click matrix for the validation set training part.
+ x_val_te (numpy.ndarray): The click matrix for the validation set testing part.
+ mapper (object): The mapper for converting click matrix to dataframe. It can be AffinityMatrix.
+ """
+ # initialise LossHistory used for saving loss of validation and train set per epoch
+ history = LossHistory()
+
+ # initialise Metrics used for calculating NDCG@k per epoch
+ # and saving the model weights with the highest NDCG@k value
+ metrics = Metrics(
+ model=self.model,
+ val_tr=x_val_tr,
+ val_te=x_val_te,
+ mapper=mapper,
+ k=self.k,
+ save_path=self.save_path,
+ )
+
+ self.reduce_lr = ReduceLROnPlateau(
+ monitor="val_loss", factor=0.2, patience=1, min_lr=0.0001
+ )
+
+ if self.annealing:
+ # initialise AnnealingCallback for annealing process
+ anneal = AnnealingCallback(
+ self.beta, self.anneal_cap, self.total_anneal_steps
+ )
+
+ # fit model
+ self.model.fit_generator(
+ generator=self.nn_batch_generator(x_train),
+ steps_per_epoch=self.number_of_batches,
+ epochs=self.n_epochs,
+ verbose=self.verbose,
+ callbacks=[metrics, history, self.reduce_lr, anneal],
+ validation_data=(x_valid, x_valid),
+ )
+
+ self.ls_beta = anneal.get_data()
+
+ else:
+ self.model.fit_generator(
+ generator=self.nn_batch_generator(x_train),
+ steps_per_epoch=self.number_of_batches,
+ epochs=self.n_epochs,
+ verbose=self.verbose,
+ callbacks=[metrics, history, self.reduce_lr],
+ validation_data=(x_valid, x_valid),
+ )
+
+ # save lists
+ self.train_loss = history.losses
+ self.val_loss = history.val_losses
+ self.val_ndcg = metrics.get_data()
+
+[docs] def get_optimal_beta(self):
+ """Returns the value of the optimal beta."""
+ # find the epoch/index that had the highest NDCG@k value
+ index_max_ndcg = np.argmax(self.val_ndcg)
+
+ # using this index find the value that beta had at this epoch
+ optimal_beta = self.ls_beta[index_max_ndcg]
+
+ return optimal_beta
+
+[docs] def display_metrics(self):
+ """Plots:
+ 1) Loss per epoch both for validation and train sets
+ 2) NDCG@k per epoch of the validation set
+ """
+ # Plot setup
+ plt.figure(figsize=(14, 5))
+ sns.set(style="whitegrid")
+
+ # Plot loss on the left graph
+ plt.subplot(1, 2, 1)
+ plt.plot(self.train_loss, color="b", linestyle="-", label="Train")
+ plt.plot(self.val_loss, color="r", linestyle="-", label="Val")
+ plt.title("\n")
+ plt.xlabel("Epochs", size=14)
+ plt.ylabel("Loss", size=14)
+ plt.legend(loc="upper left")
+
+ # Plot NDCG on the right graph
+ plt.subplot(1, 2, 2)
+ plt.plot(self.val_ndcg, color="r", linestyle="-", label="Val")
+ plt.title("\n")
+ plt.xlabel("Epochs", size=14)
+ plt.ylabel("NDCG@k", size=14)
+ plt.legend(loc="upper left")
+
+ # Add title
+ plt.suptitle("TRAINING AND VALIDATION METRICS HISTORY", size=16)
+ plt.tight_layout(pad=2)
+
+[docs] def recommend_k_items(self, x, k, remove_seen=True):
+ """Returns the top-k items ordered by a relevancy score.
+
+ Obtained probabilities are used as recommendation score.
+
+ Args:
+ x (numpy.ndarray): Input click matrix, with `int32` values.
+ k (scalar): The number of items to recommend.
+ remove_seen (bool): If True, items already seen in `x` are removed from the recommendations.
+
+ Returns:
+ numpy.ndarray: A matrix containing the top_k elements ordered by their score, with all other elements set to zero.
+
+ """
+ # return optimal model
+ self.model.load_weights(self.save_path)
+
+ # obtain scores
+ score = self.model.predict(x)
+ if remove_seen:
+ # if true, it removes items from the train set by setting them to zero
+ seen_mask = np.not_equal(x, 0)
+ score[seen_mask] = 0
+ # get the top k items
+ top_items = np.argpartition(-score, range(k), axis=1)[:, :k]
+ # get a copy of the score matrix
+ score_c = score.copy()
+ # zero out the top k elements in the copy
+ score_c[np.arange(score_c.shape[0])[:, None], top_items] = 0
+ # keep only the top k scores; all other elements become zero
+ top_scores = score - score_c
+ return top_scores
+
+[docs] def ndcg_per_epoch(self):
+ """Returns the list of NDCG@k at each epoch."""
+
+ return self.val_ndcg
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+"""
+This file provides a wrapper to run Vowpal Wabbit from the command line through python.
+This approach is not recommended for production: the Python bindings (installable from the repository or via pip)
+or the command line itself should be used instead. This wrapper merely demonstrates VW usage in the example notebooks.
+"""
+
+import os
+from subprocess import run
+from tempfile import TemporaryDirectory
+import pandas as pd
+
+from recommenders.utils.constants import (
+ DEFAULT_USER_COL,
+ DEFAULT_ITEM_COL,
+ DEFAULT_RATING_COL,
+ DEFAULT_TIMESTAMP_COL,
+ DEFAULT_PREDICTION_COL,
+)
+
+
+[docs]class VW:
+ """Vowpal Wabbit Class"""
+
+ def __init__(
+ self,
+ col_user=DEFAULT_USER_COL,
+ col_item=DEFAULT_ITEM_COL,
+ col_rating=DEFAULT_RATING_COL,
+ col_timestamp=DEFAULT_TIMESTAMP_COL,
+ col_prediction=DEFAULT_PREDICTION_COL,
+ **kwargs,
+ ):
+ """Initialize model parameters
+
+ Args:
+ col_user (str): user column name
+ col_item (str): item column name
+ col_rating (str): rating column name
+ col_timestamp (str): timestamp column name
+ col_prediction (str): prediction column name
+ """
+
+ # create temporary files
+ self.tempdir = TemporaryDirectory()
+ self.train_file = os.path.join(self.tempdir.name, "train.dat")
+ self.test_file = os.path.join(self.tempdir.name, "test.dat")
+ self.model_file = os.path.join(self.tempdir.name, "vw.model")
+ self.prediction_file = os.path.join(self.tempdir.name, "prediction.dat")
+
+ # set DataFrame columns
+ self.col_user = col_user
+ self.col_item = col_item
+ self.col_rating = col_rating
+ self.col_timestamp = col_timestamp
+ self.col_prediction = col_prediction
+
+ self.logistic = "logistic" in kwargs.values()
+ self.train_cmd = self.parse_train_params(params=kwargs)
+ self.test_cmd = self.parse_test_params(params=kwargs)
+
+[docs] @staticmethod
+ def to_vw_cmd(params):
+ """Convert dictionary of parameters to vw command line.
+
+ Args:
+ params (dict): key = parameter, value = value (use True if parameter is just a flag)
+
+ Returns:
+ list[str]: vw command line parameters as list of strings
+ """
+
+ cmd = ["vw"]
+ for k, v in params.items():
+ if v is False:
+ # don't add parameters with a value == False
+ continue
+
+ # add the correct hyphen to the parameter
+ cmd.append(f"-{k}" if len(k) == 1 else f"--{k}")
+ if v is not True:
+ # don't add an argument for parameters with value == True
+ cmd.append("{}".format(v))
+
+ return cmd
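+
+ # Illustrative example (not part of the original module): flags with value True
+ # are emitted without an argument, False entries are dropped, and single-letter
+ # keys get a single hyphen, e.g.
+ # to_vw_cmd({"l": 0.1, "quiet": True, "loss_function": "logistic"})
+ # -> ["vw", "-l", "0.1", "--quiet", "--loss_function", "logistic"]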
+
+[docs] def parse_train_params(self, params):
+ """Parse input hyper-parameters to build vw train commands
+
+ Args:
+ params (dict): key = parameter, value = value (use True if parameter is just a flag)
+
+ Returns:
+ list[str]: vw command line parameters as list of strings
+ """
+
+ # make a copy of the original hyper parameters
+ train_params = params.copy()
+
+ # remove options that are handled internally, not supported, or are test-only parameters
+ invalid = [
+ "data",
+ "final_regressor",
+ "invert_hash",
+ "readable_model",
+ "t",
+ "testonly",
+ "i",
+ "initial_regressor",
+ "link",
+ ]
+
+ for option in invalid:
+ if option in train_params:
+ del train_params[option]
+
+ train_params.update(
+ {
+ "d": self.train_file,
+ "f": self.model_file,
+ "quiet": params.get("quiet", True),
+ }
+ )
+ return self.to_vw_cmd(params=train_params)
+
+[docs] def parse_test_params(self, params):
+ """Parse input hyper-parameters to build vw test commands
+
+ Args:
+ params (dict): key = parameter, value = value (use True if parameter is just a flag)
+
+ Returns:
+ list[str]: vw command line parameters as list of strings
+ """
+
+ # make a copy of the original hyper parameters
+ test_params = params.copy()
+
+ # remove options that are handled internally, not supported, or are train-only parameters
+ invalid = [
+ "data",
+ "f",
+ "final_regressor",
+ "initial_regressor",
+ "test_only",
+ "invert_hash",
+ "readable_model",
+ "b",
+ "bit_precision",
+ "holdout_off",
+ "c",
+ "cache",
+ "k",
+ "kill_cache",
+ "l",
+ "learning_rate",
+ "l1",
+ "l2",
+ "initial_t",
+ "power_t",
+ "decay_learning_rate",
+ "q",
+ "quadratic",
+ "cubic",
+ "i",
+ "interactions",
+ "rank",
+ "lrq",
+ "lrqdropout",
+ "oaa",
+ ]
+ for option in invalid:
+ if option in test_params:
+ del test_params[option]
+
+ test_params.update(
+ {
+ "d": self.test_file,
+ "i": self.model_file,
+ "quiet": params.get("quiet", True),
+ "p": self.prediction_file,
+ "t": True,
+ }
+ )
+ return self.to_vw_cmd(params=test_params)
+
+[docs] def to_vw_file(self, df, train=True):
+ """Convert Pandas DataFrame to vw input format file
+
+ Args:
+ df (pandas.DataFrame): input DataFrame
+ train (bool): flag for train mode (or test mode if False)
+ """
+
+ output = self.train_file if train else self.test_file
+ with open(output, "w") as f:
+ # extract columns and create a new dataframe
+ tmp = df[[self.col_rating, self.col_user, self.col_item]].reset_index()
+
+ if train:
+ # we need to reset the rating type to an integer to simplify the vw formatting
+ tmp[self.col_rating] = tmp[self.col_rating].astype("int64")
+
+ # convert rating to binary value
+ if self.logistic:
+ max_value = tmp[self.col_rating].max()
+ tmp[self.col_rating] = tmp[self.col_rating].apply(
+ lambda x: 2 * round(x / max_value) - 1
+ )
+ else:
+ tmp[self.col_rating] = ""
+
+ # convert each row to VW input format (https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format)
+ # [label] [tag]|[user namespace] [user id feature] |[item namespace] [movie id feature]
+ # label is the true rating, tag is a unique id for the example just used to link predictions to truth
+ # user and item namespaces separate features to support interaction features through command line options
+ for _, row in tmp.iterrows():
+ f.write(
+ "{rating} {index}|user {userID} |item {itemID}\n".format(
+ rating=row[self.col_rating],
+ index=row["index"],
+ userID=row[self.col_user],
+ itemID=row[self.col_item],
+ )
+ )
+
+[docs] def fit(self, df):
+ """Train model
+
+ Args:
+ df (pandas.DataFrame): input training data
+ """
+
+ # write dataframe to disk in vw format
+ self.to_vw_file(df=df)
+
+ # train model
+ run(self.train_cmd, check=True)
+
+[docs] def predict(self, df):
+ """Predict results
+
+ Args:
+ df (pandas.DataFrame): input test data
+ """
+
+ # write dataframe to disk in vw format
+ self.to_vw_file(df=df, train=False)
+
+ # generate predictions
+ run(self.test_cmd, check=True)
+
+ # read predictions
+ return df.join(
+ pd.read_csv(
+ self.prediction_file,
+ delim_whitespace=True,
+ names=[self.col_prediction],
+ index_col=1,
+ )
+ )
+
+ def __del__(self):
+ self.tempdir.cleanup()
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+
+from recommenders.utils.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL
+from recommenders.utils.tf_utils import MODEL_DIR
+
+
+[docs]def build_feature_columns(
+ users,
+ items,
+ user_col=DEFAULT_USER_COL,
+ item_col=DEFAULT_ITEM_COL,
+ item_feat_col=None,
+ crossed_feat_dim=1000,
+ user_dim=8,
+ item_dim=8,
+ item_feat_shape=None,
+ model_type="wide_deep",
+):
+ """Build wide and/or deep feature columns for TensorFlow high-level API Estimator.
+
+ Args:
+ users (iterable): Distinct user ids.
+ items (iterable): Distinct item ids.
+ user_col (str): User column name.
+ item_col (str): Item column name.
+ item_feat_col (str): Item feature column name for 'deep' or 'wide_deep' model.
+ crossed_feat_dim (int): Crossed feature dimension for 'wide' or 'wide_deep' model.
+ user_dim (int): User embedding dimension for 'deep' or 'wide_deep' model.
+ item_dim (int): Item embedding dimension for 'deep' or 'wide_deep' model.
+ item_feat_shape (int or an iterable of integers): Item feature array shape for 'deep' or 'wide_deep' model.
+ model_type (str): Model type, either
+ 'wide' for a linear model,
+ 'deep' for a deep neural network, or
+ 'wide_deep' for a combination of linear model and neural networks.
+
+ Returns:
+ list, list:
+ - The wide feature columns
+ - The deep feature columns. If only the wide model is selected, the deep column list is empty and vice versa.
+ """
+ if model_type not in ["wide", "deep", "wide_deep"]:
+ raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
+
+ user_ids = tf.feature_column.categorical_column_with_vocabulary_list(
+ user_col, users
+ )
+ item_ids = tf.feature_column.categorical_column_with_vocabulary_list(
+ item_col, items
+ )
+
+ if model_type == "wide":
+ return _build_wide_columns(user_ids, item_ids, crossed_feat_dim), []
+ elif model_type == "deep":
+ return (
+ [],
+ _build_deep_columns(
+ user_ids, item_ids, user_dim, item_dim, item_feat_col, item_feat_shape
+ ),
+ )
+ elif model_type == "wide_deep":
+ return (
+ _build_wide_columns(user_ids, item_ids, crossed_feat_dim),
+ _build_deep_columns(
+ user_ids, item_ids, user_dim, item_dim, item_feat_col, item_feat_shape
+ ),
+ )
+
+
+def _build_wide_columns(user_ids, item_ids, hash_bucket_size=1000):
+ """Build wide feature (crossed) columns. `user_ids` * `item_ids` are hashed into `hash_bucket_size`
+
+ Args:
+ user_ids (tf.feature_column.categorical_column_with_vocabulary_list): User ids.
+ item_ids (tf.feature_column.categorical_column_with_vocabulary_list): Item ids.
+ hash_bucket_size (int): Hash bucket size.
+
+ Returns:
+ list: Wide feature columns.
+ """
+ # Including the original features in addition to the crossed one is recommended to address the hash collision problem.
+ return [
+ user_ids,
+ item_ids,
+ tf.feature_column.crossed_column(
+ [user_ids, item_ids], hash_bucket_size=hash_bucket_size
+ ),
+ ]
+
+
+def _build_deep_columns(
+ user_ids, item_ids, user_dim, item_dim, item_feat_col=None, item_feat_shape=1
+):
+ """Build deep feature columns
+
+ Args:
+ user_ids (tf.feature_column.categorical_column_with_vocabulary_list): User ids.
+ item_ids (tf.feature_column.categorical_column_with_vocabulary_list): Item ids.
+ user_dim (int): User embedding dimension.
+ item_dim (int): Item embedding dimension.
+ item_feat_col (str): Item feature column name.
+ item_feat_shape (int or an iterable of integers): Item feature array shape.
+
+ Returns:
+ list: Deep feature columns.
+ """
+ deep_columns = [
+ # User embedding
+ tf.feature_column.embedding_column(
+ categorical_column=user_ids, dimension=user_dim, max_norm=user_dim**0.5
+ ),
+ # Item embedding
+ tf.feature_column.embedding_column(
+ categorical_column=item_ids, dimension=item_dim, max_norm=item_dim**0.5
+ ),
+ ]
+ # Item feature
+ if item_feat_col is not None:
+ deep_columns.append(
+ tf.feature_column.numeric_column(
+ item_feat_col, shape=item_feat_shape, dtype=tf.float32
+ )
+ )
+ return deep_columns
+
+
+[docs]def build_model(
+ model_dir=MODEL_DIR,
+ wide_columns=(),
+ deep_columns=(),
+ linear_optimizer="Ftrl",
+ dnn_optimizer="Adagrad",
+ dnn_hidden_units=(128, 128),
+ dnn_dropout=0.0,
+ dnn_batch_norm=True,
+ log_every_n_iter=1000,
+ save_checkpoints_steps=10000,
+ seed=None,
+):
+ """Build wide-deep model.
+
+ To generate wide model, pass wide_columns only.
+ To generate deep model, pass deep_columns only.
+ To generate wide_deep model, pass both wide_columns and deep_columns.
+
+ Args:
+ model_dir (str): Model checkpoint directory.
+ wide_columns (list of tf.feature_column): Wide model feature columns.
+ deep_columns (list of tf.feature_column): Deep model feature columns.
+ linear_optimizer (str or tf.train.Optimizer): Wide model optimizer name or object.
+ dnn_optimizer (str or tf.train.Optimizer): Deep model optimizer name or object.
+ dnn_hidden_units (list of int): Deep model hidden units. E.g., [10, 10, 10] is three layers of 10 nodes each.
+ dnn_dropout (float): Deep model's dropout rate.
+ dnn_batch_norm (bool): Deep model's batch normalization flag.
+ log_every_n_iter (int): Log the training loss for every n steps.
+ save_checkpoints_steps (int): Model checkpoint frequency.
+ seed (int): Random seed.
+
+ Returns:
+ tf.estimator.Estimator: Model
+ """
+ gpu_config = tf.compat.v1.ConfigProto()
+ gpu_config.gpu_options.allow_growth = True # dynamic memory allocation
+
+ # TensorFlow training setup
+ config = tf.estimator.RunConfig(
+ tf_random_seed=seed,
+ log_step_count_steps=log_every_n_iter,
+ save_checkpoints_steps=save_checkpoints_steps,
+ session_config=gpu_config,
+ )
+
+ if len(wide_columns) > 0 and len(deep_columns) == 0:
+ model = tf.compat.v1.estimator.LinearRegressor(
+ model_dir=model_dir,
+ config=config,
+ feature_columns=wide_columns,
+ optimizer=linear_optimizer,
+ )
+ elif len(wide_columns) == 0 and len(deep_columns) > 0:
+ model = tf.compat.v1.estimator.DNNRegressor(
+ model_dir=model_dir,
+ config=config,
+ feature_columns=deep_columns,
+ hidden_units=dnn_hidden_units,
+ optimizer=dnn_optimizer,
+ dropout=dnn_dropout,
+ batch_norm=dnn_batch_norm,
+ )
+ elif len(wide_columns) > 0 and len(deep_columns) > 0:
+ model = tf.compat.v1.estimator.DNNLinearCombinedRegressor(
+ model_dir=model_dir,
+ config=config,
+ # wide settings
+ linear_feature_columns=wide_columns,
+ linear_optimizer=linear_optimizer,
+ # deep settings
+ dnn_feature_columns=deep_columns,
+ dnn_hidden_units=dnn_hidden_units,
+ dnn_optimizer=dnn_optimizer,
+ dnn_dropout=dnn_dropout,
+ batch_norm=dnn_batch_norm,
+ )
+ else:
+ raise ValueError(
+ "To generate wide model, set wide_columns.\n"
+ "To generate deep model, set deep_columns.\n"
+ "To generate wide_deep model, set both wide_columns and deep_columns."
+ )
+
+ return model
+
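+# Usage sketch (illustrative, not part of the original module); the ids, column
+# names and directory below are made up for the example:
+#
+# wide_cols, deep_cols = build_feature_columns(
+# users=[1, 2, 3], items=[10, 11], user_col="userID", item_col="itemID",
+# model_type="wide_deep",
+# )
+# model = build_model(model_dir="./model", wide_columns=wide_cols, deep_columns=deep_cols)
+# model.train(input_fn=..., steps=1000)
+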
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+#
+# Utility functions for parameter sweep.
+
+from itertools import product
+
+
+[docs]def generate_param_grid(params):
+ """Generator of parameter grids.
+ Generate parameter lists from a parameter dictionary in the form of:
+
+ .. code-block:: python
+
+ {
+ "param1": [value1, value2],
+ "param2": [value1, value2]
+ }
+
+ to:
+
+ .. code-block:: python
+
+ [
+ {"param1": value1, "param2": value1},
+ {"param1": value2, "param2": value1},
+ {"param1": value1, "param2": value2},
+ {"param1": value2, "param2": value2}
+ ]
+
+ Args:
+ param_dict (dict): dictionary of parameters and values (in a list).
+
+ Return:
+ list: A list of parameter dictionary string that can be fed directly into
+ model builder as keyword arguments.
+ """
+ param_new = {}
+ param_fixed = {}
+
+ for key, value in params.items():
+ if isinstance(value, list):
+ param_new[key] = value
+ else:
+ param_fixed[key] = value
+
+ items = sorted(param_new.items())
+ keys, values = zip(*items)
+
+ params_exp = []
+ for v in product(*values):
+ param_exp = dict(zip(keys, v))
+ param_exp.update(param_fixed)
+ params_exp.append(param_exp)
+
+ return params_exp
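+
+# Illustrative example (not part of the original module): list-valued entries are
+# expanded into a grid while scalar entries are kept fixed in every combination.
+#
+# >>> generate_param_grid({"lr": [0.01, 0.1], "seed": 42})
+# [{'lr': 0.01, 'seed': 42}, {'lr': 0.1, 'seed': 42}]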
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import psutil
+
+
+[docs]def invert_dictionary(dictionary):
+ """Invert a dictionary
+
+ Note:
+
+ If the dictionary has unique keys and unique values, the inversion would be perfect. However, if there are
+ repeated values, the inversion can take different keys
+
+ Args:
+ dictionary (dict): A dictionary
+
+ Returns:
+ dict: inverted dictionary
+ """
+ return {v: k for k, v in dictionary.items()}
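+
+# Illustrative example (not part of the original module):
+# >>> invert_dictionary({"a": 1, "b": 2})
+# {1: 'a', 2: 'b'}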
+
+
+[docs]def get_physical_memory():
+ """Get the physical memory in GBs.
+
+ Returns:
+ float: Physical memory in GBs.
+ """
+ return psutil.virtual_memory()[0] / 1073741824
+
+
+[docs]def get_number_processors():
+ """Get the number of processors in a CPU.
+
+ Returns:
+ int: Number of processors.
+ """
+ try:
+ num = os.cpu_count()
+ except Exception:
+ import multiprocessing # force exception in case multiprocessing is not installed
+
+ num = multiprocessing.cpu_count()
+ return num
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import sys
+import os
+import glob
+import logging
+from numba import cuda
+from numba.cuda.cudadrv.error import CudaSupportError
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_CUDA_PATH_LINUX = "/usr/local/cuda/version.txt"
+
+
+[docs]def get_number_gpus():
+ """Get the number of GPUs in the system.
+ Returns:
+ int: Number of GPUs.
+ """
+ try:
+ import torch
+
+ return torch.cuda.device_count()
+ except (ImportError, ModuleNotFoundError):
+ pass
+ try:
+ import numba
+
+ return len(numba.cuda.gpus)
+ except Exception: # numba.cuda.cudadrv.error.CudaSupportError:
+ return 0
+
+
+[docs]def get_gpu_info():
+ """Get information of GPUs.
+
+ Returns:
+ list: List of GPU information dictionaries with `device_name`, `total_memory` (in MiB) and `free_memory` (in MiB).
+ Returns an empty list if there is no CUDA device available.
+ """
+ gpus = []
+ try:
+ for gpu in cuda.gpus:
+ with gpu:
+ meminfo = cuda.current_context().get_memory_info()
+ g = {
+ "device_name": gpu.name.decode("ASCII"),
+ "total_memory": meminfo[1] / 1048576, # Mb
+ "free_memory": meminfo[0] / 1048576, # Mb
+ }
+ gpus.append(g)
+ except CudaSupportError:
+ pass
+
+ return gpus
+
+
+[docs]def clear_memory_all_gpus():
+ """Clear memory of all GPUs."""
+ try:
+ for gpu in cuda.gpus:
+ with gpu:
+ cuda.current_context().deallocations.clear()
+ except CudaSupportError:
+ logger.info("No CUDA available")
+
+
+[docs]def get_cuda_version():
+ """Get CUDA version
+
+ Returns:
+ str: Version of the library.
+ """
+ try:
+ import torch
+
+ return torch.version.cuda
+ except (ImportError, ModuleNotFoundError):
+ path = ""
+ if sys.platform == "win32":
+ candidate = (
+ "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\version.txt"
+ )
+ path_list = glob.glob(candidate)
+ if path_list:
+ path = path_list[0]
+ elif sys.platform == "linux" or sys.platform == "darwin":
+ path = "/usr/local/cuda/version.txt"
+ else:
+ raise ValueError("Not in Windows, Linux or Mac")
+
+ if os.path.isfile(path):
+ with open(path, "r") as f:
+ data = f.read().replace("\n", "")
+ return data
+ else:
+ return None
+
+
+[docs]def get_cudnn_version():
+ """Get the CuDNN version
+
+ Returns:
+ str: Version of the library.
+ """
+
+ def find_cudnn_in_headers(candidates):
+ for c in candidates:
+ file = glob.glob(c)
+ if file:
+ break
+ if file:
+ with open(file[0], "r") as f:
+ version = ""
+ for line in f:
+ if "#define CUDNN_MAJOR" in line:
+ version = line.split()[-1]
+ if "#define CUDNN_MINOR" in line:
+ version += "." + line.split()[-1]
+ if "#define CUDNN_PATCHLEVEL" in line:
+ version += "." + line.split()[-1]
+ if version:
+ return version
+ else:
+ return None
+ else:
+ return None
+
+ try:
+ import torch
+
+ return str(torch.backends.cudnn.version())
+ except (ImportError, ModuleNotFoundError):
+ if sys.platform == "win32":
+ candidates = [r"C:\NVIDIA\cuda\include\cudnn.h"]
+ elif sys.platform == "linux":
+ candidates = [
+ "/usr/include/cudnn_version.h",
+ "/usr/include/x86_64-linux-gnu/cudnn_v[0-99].h",
+ "/usr/local/cuda/include/cudnn.h",
+ "/usr/include/cudnn.h",
+ ]
+ elif sys.platform == "darwin":
+ candidates = ["/usr/local/cuda/include/cudnn.h", "/usr/include/cudnn.h"]
+ else:
+ raise ValueError("Not in Windows, Linux or Mac")
+ return find_cudnn_in_headers(candidates)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+from math import ceil, floor
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+[docs]def qps_to_replicas(
+ target_qps, processing_time, max_qp_replica=1, target_utilization=0.7
+):
+ """Provide a rough estimate of the number of replicas to support a given
+ load (queries per second)
+
+ Args:
+ target_qps (int): target queries per second that you want to support
+ processing_time (float): the estimated amount of time (in seconds)
+ your service call takes
+ max_qp_replica (int): maximum number of concurrent queries per replica
+ target_utilization (float): proportion of CPU utilization you think is ideal
+
+ Returns:
+ int: Number of estimated replicas required to support a target number of queries per second.
+ """
+ concurrent_queries = target_qps * processing_time / target_utilization
+ replicas = ceil(concurrent_queries / max_qp_replica)
+ logger.info(
+ "Approximately {} replicas are estimated to support {} queries per second.".format(
+ replicas, target_qps
+ )
+ )
+ return replicas
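+
+# Illustrative example (not part of the original module): to serve 25 QPS with a
+# 100 ms call at 70% target utilization, 25 * 0.1 / 0.7 ≈ 3.57 concurrent queries
+# are expected, so ceil(3.57 / 1) = 4 replicas are suggested:
+# >>> qps_to_replicas(target_qps=25, processing_time=0.1)
+# 4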
+
+
+[docs]def replicas_to_qps(
+ num_replicas, processing_time, max_qp_replica=1, target_utilization=0.7
+):
+ """Provide a rough estimate of the queries per second supported by a number of replicas
+
+ Args:
+ num_replicas (int): number of replicas
+ processing_time (float): the estimated amount of time (in seconds) your service call takes
+ max_qp_replica (int): maximum number of concurrent queries per replica
+ target_utilization (float): proportion of CPU utilization you think is ideal
+
+ Returns:
+ int: queries per second supported by the number of replicas
+ """
+ qps = floor(num_replicas * max_qp_replica * target_utilization / processing_time)
+ logger.info(
+ "Approximately {} queries per second are supported by {} replicas.".format(
+ qps, num_replicas
+ )
+ )
+ return qps
+
+
+[docs]def nodes_to_replicas(n_cores_per_node, n_nodes=3, cpu_cores_per_replica=0.1):
+ """Provide a rough estimate of the number of replicas supported by a
+ given number of nodes with n_cores_per_node cores each
+
+ Args:
+ n_cores_per_node (int): Total number of cores per node within an AKS
+ cluster that you want to use
+ n_nodes (int): Number of nodes (i.e. VMs) used in the AKS cluster
+ cpu_cores_per_replica (float): Cores assigned to each replica. This
+ can be fractional and corresponds to the
+ cpu_cores argument passed to AksWebservice.deploy_configuration()
+
+ Returns:
+ int: Total number of replicas supported by the configuration
+ """
+ n_cores_avail = (n_cores_per_node - 0.5) * n_nodes - 4.45
+ replicas = floor(n_cores_avail / cpu_cores_per_replica)
+ logger.info(
+ "Approximately {} replicas are supported by {} nodes with {} cores each.".format(
+ replicas, n_nodes, n_cores_per_node
+ )
+ )
+ return replicas
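+
+# Illustrative example (not part of the original module): with 3 nodes of 8 cores
+# each, the heuristic reserves 0.5 core per node plus 4.45 cores overall, leaving
+# (8 - 0.5) * 3 - 4.45 = 18.05 cores, i.e. floor(18.05 / 0.1) = 180 replicas:
+# >>> nodes_to_replicas(n_cores_per_node=8)
+# 180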
+
+# Original code: https://raw.githubusercontent.com/miguelgfierro/codebase/master/python/system/notebook_memory_management.py
+#
+# Profile memory usage envelope of IPython commands and report interactively.
+# Usage (inside a python notebook):
+# from notebook_memory_management import start_watching_memory, stop_watching_memory
+# To start profile:
+# start_watching_memory()
+# To stop profile:
+# stop_watching_memory()
+#
+# Based on: https://github.com/ianozsvald/ipython_memory_usage
+#
+
+from __future__ import division # 1/2 == 0.5, as in Py3
+from __future__ import absolute_import # avoid hiding global modules with locals
+from __future__ import print_function # force use of print("hello")
+from __future__ import (
+ unicode_literals,
+) # force unadorned strings "" to be Unicode without prepending u""
+import time
+import memory_profiler
+from IPython import get_ipython
+import psutil
+import warnings
+
+
+# keep a global accounting for the last known memory usage
+# which is the reference point for the memory delta calculation
+previous_call_memory_usage = memory_profiler.memory_usage()[0]
+t1 = time.time() # will be set to current time later
+keep_watching = True
+watching_memory = True
+try:
+ input_cells = get_ipython().user_ns["In"]
+except Exception:
+ warnings.warn("Not running on notebook")
+
+
+[docs]def start_watching_memory():
+ """Register memory profiling tools to IPython instance."""
+ global watching_memory
+ watching_memory = True
+ ip = get_ipython()
+ ip.events.register("post_run_cell", watch_memory)
+ ip.events.register("pre_run_cell", pre_run_cell)
+
+
+[docs]def stop_watching_memory():
+ """Unregister memory profiling tools from IPython instance."""
+ global watching_memory
+ watching_memory = False
+ ip = get_ipython()
+ try:
+ ip.events.unregister("post_run_cell", watch_memory)
+ except ValueError:
+ print("ERROR: problem when unregistering the post_run_cell handler")
+ try:
+ ip.events.unregister("pre_run_cell", pre_run_cell)
+ except ValueError:
+ print("ERROR: problem when unregistering the pre_run_cell handler")
+
+
+[docs]def watch_memory():
+ """Bring in the global memory usage value from the previous iteration"""
+ global previous_call_memory_usage, keep_watching, watching_memory, input_cells
+ new_memory_usage = memory_profiler.memory_usage()[0]
+ memory_delta = new_memory_usage - previous_call_memory_usage
+ keep_watching = False
+ total_memory = psutil.virtual_memory()[0] / 1024 / 1024 # total system memory in MiB
+ # calculate time delta using global t1 (from the pre-run event) and current time
+ time_delta_secs = time.time() - t1
+ num_commands = len(input_cells) - 1
+ cmd = "In [{}]".format(num_commands)
+ # convert the results into a pretty string
+ output_template = (
+ "{cmd} used {memory_delta:0.4f} Mb RAM in "
+ "{time_delta:0.2f}s, total RAM usage "
+ "{memory_usage:0.2f} Mb, total RAM "
+ "memory {total_memory:0.2f} Mb"
+ )
+ output = output_template.format(
+ time_delta=time_delta_secs,
+ cmd=cmd,
+ memory_delta=memory_delta,
+ memory_usage=new_memory_usage,
+ total_memory=total_memory,
+ )
+ if watching_memory:
+ print(str(output))
+ previous_call_memory_usage = new_memory_usage
+
+
+[docs]def pre_run_cell():
+ """Capture current time before we execute the current command"""
+ global t1
+ t1 = time.time()
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+import re
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+from IPython.display import display
+
+
+NOTEBOOK_OUTPUT_CONTENT_TYPE = "application/notebook_utils.json+json"
+
+
+[docs]def is_jupyter():
+ """Check if the module is running on Jupyter notebook/console.
+
+ Returns:
+ bool: True if the module is running on Jupyter notebook or Jupyter console,
+ False otherwise.
+ """
+ try:
+ shell_name = get_ipython().__class__.__name__
+ if shell_name == "ZMQInteractiveShell":
+ return True
+ else:
+ return False
+ except NameError:
+ return False
+
+
+[docs]def is_databricks():
+ """Check if the module is running on Databricks.
+
+ Returns:
+ bool: True if the module is running on Databricks notebook,
+ False otherwise.
+ """
+ try:
+ if os.path.realpath(".") == "/databricks/driver":
+ return True
+ else:
+ return False
+ except NameError:
+ return False
+
+
+def _update_parameters(parameter_cell_source, new_parameters):
+ """Replace parameter values in the cell source code."""
+ modified_cell_source = parameter_cell_source
+ for param, new_value in new_parameters.items():
+ if (
+ isinstance(new_value, str)
+ and not (new_value.startswith('"') and new_value.endswith('"'))
+ and not (new_value.startswith("'") and new_value.endswith("'"))
+ ):
+ # Check if the new value is a string and surround it with quotes if necessary
+ new_value = f'"{new_value}"'
+
+ # Define a regular expression pattern to match parameter assignments and ignore comments
+ pattern = re.compile(rf"(\b{param})\s*=\s*([^#\n]+)(?:#.*$)?", re.MULTILINE)
+ modified_cell_source = pattern.sub(rf"\1 = {new_value}", modified_cell_source)
+
+ return modified_cell_source
+
+
+[docs]def execute_notebook(
+ input_notebook, output_notebook, parameters={}, kernel_name="python3", timeout=2200
+):
+ """Execute a notebook while passing parameters to it.
+
+ Note:
+ The parameters to override must live in a code cell tagged with
+ `parameters` in the input notebook; only cells carrying that tag
+ are rewritten before execution.
+
+ Args:
+ input_notebook (str): Path to the input notebook.
+ output_notebook (str): Path to the output notebook
+ parameters (dict): Dictionary of parameters to pass to the notebook.
+ kernel_name (str): Kernel name.
+ timeout (int): Timeout (in seconds) for each cell to execute.
+ """
+
+ # Load the Jupyter Notebook
+ with open(input_notebook, "r") as notebook_file:
+ notebook_content = nbformat.read(notebook_file, as_version=4)
+
+ # Search for and replace parameter values in code cells
+ for cell in notebook_content.cells:
+ if (
+ "tags" in cell.metadata
+ and "parameters" in cell.metadata["tags"]
+ and cell.cell_type == "code"
+ ):
+ # Update the cell's source within notebook_content
+ cell.source = _update_parameters(cell.source, parameters)
+
+ # Create an execution preprocessor
+ execute_preprocessor = ExecutePreprocessor(timeout=timeout, kernel_name=kernel_name)
+
+ # Execute the notebook
+ executed_notebook, _ = execute_preprocessor.preprocess(
+ notebook_content, {"metadata": {"path": "./"}}
+ )
+
+ # Save the executed notebook
+ with open(output_notebook, "w", encoding="utf-8") as executed_notebook_file:
+ nbformat.write(executed_notebook, executed_notebook_file)
+
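+# Usage sketch (illustrative, not part of the original module); the notebook paths
+# and the parameter names are made up for the example:
+#
+# execute_notebook(
+# "examples/my_notebook.ipynb",
+# "output.ipynb",
+# parameters={"TOP_K": 10, "MOVIELENS_DATA_SIZE": "100k"},
+# )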
+
+[docs]def store_metadata(name, value):
+ """Store data in the notebook's output source code.
+ This function is similar to scrapbook.glue().
+
+ Args:
+ name (str): Name of the data.
+ value (int,float,str): Value of the data.
+ """
+
+ metadata = {"notebook_utils": {"name": name, "data": True, "display": False}}
+ data_json = {
+ "application/notebook_utils.json+json": {
+ "name": name,
+ "data": value,
+ "encoder": "json",
+ }
+ }
+ display(data_json, metadata=metadata, raw=True)
+
+
+[docs]def read_notebook(path):
+ """Read the metadata stored in the notebook's output source code.
+ This function is similar to scrapbook.read_notebook().
+
+ Args:
+ path (str): Path to the notebook.
+
+ Returns:
+ dict: Dictionary of data stored in the notebook.
+ """
+ # Load the Jupyter Notebook
+ with open(path, "r") as notebook_file:
+ notebook_content = nbformat.read(notebook_file, as_version=4)
+
+ # Search for parameters and store them in a dictionary
+ results = {}
+ for cell in notebook_content.cells:
+ if cell.cell_type == "code" and "outputs" in cell:
+ for outputs in cell.outputs:
+ if "metadata" in outputs and "notebook_utils" in outputs.metadata:
+ name = outputs.data[NOTEBOOK_OUTPUT_CONTENT_TYPE]["name"]
+ data = outputs.data[NOTEBOOK_OUTPUT_CONTENT_TYPE]["data"]
+ results[name] = data
+ return results
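+
+# Illustrative round trip (not part of the original module): inside a notebook cell,
+# store_metadata("map", 0.21) records the value in that cell's output; after the
+# notebook is executed and saved, read_notebook("output.ipynb") would return
+# {"map": 0.21} (the notebook path here is made up for the example).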
+
+import matplotlib.pyplot as plt
+
+
+[docs]def line_graph(
+ values,
+ labels,
+ x_guides=None,
+ x_name=None,
+ y_name=None,
+ x_min_max=None,
+ y_min_max=None,
+ legend_loc=None,
+ subplot=None,
+ plot_size=(5, 5),
+):
+ """Plot line graph(s).
+
+ Args:
+ values (list(list(float or tuple)) or list(float or tuple)): List of graphs or a single graph to plot,
+ where a graph is either list(y) or list((y, x)).
+ labels (list(str) or str): List of labels, or a single label for one graph.
+ If labels is a string, this function assumes `values` is a single graph.
+ x_guides (list(int)): List of guidelines (a vertical dotted line)
+ x_name (str): x axis label
+ y_name (str): y axis label
+ x_min_max (list or tuple): Min and max value of the x axis
+ y_min_max (list or tuple): Min and max value of the y axis
+ legend_loc (str): legend location
+ subplot (list or tuple): `matplotlib.pyplot.subplot` format. E.g. to draw 1 x 2 subplot,
+ pass `(1,2,1)` for the first subplot and `(1,2,2)` for the second subplot.
+ plot_size (list or tuple): Plot size (width, height)
+ """
+ if subplot:
+ # Setup figure only once
+ if subplot[2] == 1:
+ if plot_size:
+ plt.figure(
+ figsize=(
+ plot_size[0]
+ * subplot[1], # fig width = plot width * num columns
+ plot_size[1]
+ * subplot[0], # fig height = plot height * num rows
+ )
+ )
+ plt.subplots_adjust(wspace=0.5)
+ plt.subplot(*subplot)
+ else:
+ if plot_size:
+ plt.figure(figsize=plot_size)
+
+ if isinstance(labels, str):
+ if isinstance(values[0], (int, float)):
+ y, x = values, range(len(values))
+ else:
+ y, x = zip(*values)
+ plt.plot(x, y, label=labels, lw=1)
+ else:
+ assert len(values) == len(labels)
+ for i, v in enumerate(values):
+ if isinstance(v[0], (int, float)):
+ y, x = v, range(len(v))
+ else:
+ y, x = zip(*v)
+ plt.plot(x, y, label=labels[i], lw=1)
+
+ if x_guides:
+ for x in x_guides:
+ plt.axvline(x=x, color="gray", lw=1, linestyle="--")
+
+ if x_name:
+ plt.xlabel(x_name)
+ if y_name:
+ plt.ylabel(y_name)
+ if x_min_max:
+ plt.xlim(*x_min_max)
+ if y_min_max:
+ plt.ylim(*y_min_max)
+ if legend_loc:
+ plt.legend(loc=legend_loc)
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import logging
+import numpy as np
+from scipy import sparse
+
+
+logger = logging.getLogger()
+
+
+[docs]def exponential_decay(value, max_val, half_life):
+ """Compute decay factor for a given value based on an exponential decay.
+
+ Values greater than `max_val` will be set to 1.
+
+ Args:
+ value (numeric): Value to calculate decay factor
+ max_val (numeric): Value at which decay factor will be 1
+ half_life (numeric): Value at which decay factor will be 0.5
+
+ Returns:
+ float: Decay factor
+ """
+ return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life))
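+
+# Illustrative example (not part of the original module):
+# exponential_decay(value=5, max_val=10, half_life=5) returns 0.5, since the value
+# sits one half-life below max_val; exponential_decay(12, 10, 5) is clipped to 1.0
+# because values greater than max_val map to a decay factor of 1.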
+
+
+def _get_row_and_column_matrix(array):
+ """Helper method to get the row and column matrix from an array.
+
+ Args:
+ array (numpy.ndarray): the array from which to get the row and column matrix.
+
+ Returns:
+ (numpy.ndarray, numpy.ndarray): (row matrix, column matrix)
+ """
+ row_matrix = np.expand_dims(array, axis=0)
+ column_matrix = np.expand_dims(array, axis=1)
+ return row_matrix, column_matrix
+
+
+[docs]def jaccard(cooccurrence):
+ """Helper method to calculate the Jaccard similarity of a matrix of
+ co-occurrences. When comparing Jaccard with count co-occurrence
+ and lift similarity, count favours predictability, meaning that
+ the most popular items will be recommended most of the time. Lift,
+ by contrast, favours discoverability/serendipity, meaning that an
+ item that is less popular overall but highly favoured by a small
+ subset of users is more likely to be recommended. Jaccard is a
+ compromise between the two.
+
+ Args:
+ cooccurrence (numpy.ndarray): the symmetric matrix of co-occurrences of items.
+
+ Returns:
+ numpy.ndarray: The matrix of Jaccard similarities between any two items.
+
+ """
+
+ diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())
+
+ with np.errstate(invalid="ignore", divide="ignore"):
+ result = cooccurrence / (diag_rows + diag_cols - cooccurrence)
+
+ return np.array(result)
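+
+# Illustrative example (not part of the original module): with the co-occurrence
+# matrix [[3, 1], [1, 2]] (item 0 seen by 3 users, item 1 by 2, co-occurring once),
+# the off-diagonal Jaccard similarity is 1 / (3 + 2 - 1) = 0.25 and the diagonal is 1.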
+
+
+[docs]def lift(cooccurrence):
+ """Helper method to calculate the Lift of a matrix of
+ co-occurrences. In comparison with basic co-occurrence and Jaccard
+ similarity, lift favours discoverability and serendipity, as
+ opposed to co-occurrence that favours the most popular items, and
+ Jaccard that is a compromise between the two.
+
+ Args:
+ cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.
+
+ Returns:
+ numpy.ndarray: The matrix of Lifts between any two items.
+
+ """
+
+ diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())
+
+ with np.errstate(invalid="ignore", divide="ignore"):
+ result = cooccurrence / (diag_rows * diag_cols)
+
+ return np.array(result)
+
+
+[docs]def mutual_information(cooccurrence):
+ """Helper method to calculate the Mutual Information of a matrix of
+ co-occurrences.
+
+ Mutual information is a measurement of the amount of information
+ explained by the i-th j-th item column vector.
+
+ Args:
+ cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.
+
+ Returns:
+ numpy.ndarray: The matrix of mutual information between any two items.
+
+ """
+
+ with np.errstate(invalid="ignore", divide="ignore"):
+ result = np.log2(cooccurrence.shape[0] * lift(cooccurrence))
+
+ return np.array(result)
+
+
+[docs]def lexicographers_mutual_information(cooccurrence):
+ """Helper method to calculate the Lexicographers Mutual Information of
+ a matrix of co-occurrences.
+
+ Due to the bias of mutual information for low frequency items,
+ lexicographers mutual information corrects the formula by
+ multiplying it by the co-occurrence frequency.
+
+ Args:
+ cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.
+
+ Returns:
+ numpy.ndarray: The matrix of lexicographers mutual information between any two items.
+
+ """
+
+ with np.errstate(invalid="ignore", divide="ignore"):
+ result = cooccurrence * mutual_information(cooccurrence)
+
+ return np.array(result)
+
+
+[docs]def cosine_similarity(cooccurrence):
+ """Helper method to calculate the Cosine similarity of a matrix of
+ co-occurrences.
+
+ Cosine similarity can be interpreted as the angle between the i-th
+ and j-th item.
+
+ Args:
+ cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.
+
+ Returns:
+ numpy.ndarray: The matrix of cosine similarity between any two items.
+
+ """
+
+ diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())
+
+ with np.errstate(invalid="ignore", divide="ignore"):
+ result = cooccurrence / np.sqrt(diag_rows * diag_cols)
+
+ return np.array(result)
+
+
+[docs]def inclusion_index(cooccurrence):
+ """Helper method to calculate the Inclusion Index of a matrix of
+ co-occurrences.
+
+ Inclusion index measures the overlap between items.
+
+ Args:
+ cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.
+
+ Returns:
+ numpy.ndarray: The matrix of inclusion index between any two items.
+
+ """
+
+ diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())
+
+ with np.errstate(invalid="ignore", divide="ignore"):
+ result = cooccurrence / np.minimum(diag_rows, diag_cols)
+
+ return np.array(result)
+
+
+[docs]def get_top_k_scored_items(scores, top_k, sort_top_k=False):
+ """Extract top K items from a matrix of scores for each user-item pair, optionally sort results per user.
+
+ Args:
+ scores (numpy.ndarray): Score matrix (users x items).
+ top_k (int): Number of top items to recommend.
+ sort_top_k (bool): Flag to sort top k results.
+
+ Returns:
+ numpy.ndarray, numpy.ndarray:
+ - Indices into score matrix for each user's top items.
+ - Scores corresponding to top items.
+
+ """
+
+ # ensure we're working with a dense ndarray
+ if isinstance(scores, sparse.spmatrix):
+ scores = scores.todense()
+
+ if scores.shape[1] < top_k:
+ logger.warning(
+ "Number of items is less than top_k, limiting top_k to number of items"
+ )
+ k = min(top_k, scores.shape[1])
+
+ test_user_idx = np.arange(scores.shape[0])[:, None]
+
+ # get top K items and scores
+ # this determines the un-ordered top-k item indices for each user
+ top_items = np.argpartition(scores, -k, axis=1)[:, -k:]
+ top_scores = scores[test_user_idx, top_items]
+
+ if sort_top_k:
+ sort_ind = np.argsort(-top_scores)
+ top_items = top_items[test_user_idx, sort_ind]
+ top_scores = top_scores[test_user_idx, sort_ind]
+
+ return np.array(top_items), np.array(top_scores)
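+
+# Illustrative example (not part of the original module):
+# scores = np.array([[1, 5, 3], [4, 2, 0]])
+# get_top_k_scored_items(scores, top_k=2, sort_top_k=True) returns item indices
+# [[1, 2], [0, 1]] with the matching scores [[5, 3], [4, 2]].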
+
+
+[docs]def binarize(a, threshold):
+ """Binarize the values.
+
+ Args:
+ a (numpy.ndarray): Input array that needs to be binarized.
+ threshold (float): Threshold below which all values are set to 0, else 1.
+
+ Returns:
+ numpy.ndarray: Binarized array.
+ """
+ return np.where(a > threshold, 1.0, 0.0)
+
+
+[docs]def rescale(data, new_min=0, new_max=1, data_min=None, data_max=None):
+ """Rescale/normalize the data to be within the range `[new_min, new_max]`
+ If data_min and data_max are explicitly provided, they will be used
+ as the old min/max values instead of taken from the data.
+
+ Note:
+ This is the same as `sklearn.preprocessing.MinMaxScaler`, with the exception that
+ the min/max of the old scale can be overridden.
+
+ Args:
+ data (numpy.ndarray): 1d scores vector or 2d score matrix (users x items).
+ new_min (int|float): The minimum of the newly scaled data.
+ new_max (int|float): The maximum of the newly scaled data.
+ data_min (None|number): The minimum of the passed data [if omitted it will be inferred].
+ data_max (None|number): The maximum of the passed data [if omitted it will be inferred].
+
+ Returns:
+ numpy.ndarray: The newly scaled/normalized data.
+ """
+ data_min = data.min() if data_min is None else data_min
+ data_max = data.max() if data_max is None else data_max
+ return (data - data_min) / (data_max - data_min) * (new_max - new_min) + new_min
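+
+# Illustrative example (not part of the original module):
+# rescale(np.array([1.0, 2.0, 3.0])) maps the data to [0, 1], giving [0.0, 0.5, 1.0];
+# passing data_min/data_max rescales relative to an explicit old range instead of the
+# observed one, e.g. rescale(np.array([1.0, 2.0, 3.0]), data_min=0, data_max=4)
+# gives [0.25, 0.5, 0.75].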
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import os
+
+
+try:
+ from pyspark.sql import SparkSession # noqa: F401
+except ImportError:
+ pass # skip this import if we are in pure python environment
+
+MMLSPARK_PACKAGE = "com.microsoft.azure:synapseml_2.12:0.9.5"
+MMLSPARK_REPO = "https://mmlspark.azureedge.net/maven"
+# We support Spark v3, but in case you wish to use v2, set
+# MMLSPARK_PACKAGE = "com.microsoft.ml.spark:mmlspark_2.11:0.18.1"
+# MMLSPARK_REPO = "https://mvnrepository.com/artifact"
+
+
+[docs]def start_or_get_spark(
+ app_name="Sample",
+ url="local[*]",
+ memory="10g",
+ config=None,
+ packages=None,
+ jars=None,
+ repositories=None,
+):
+ """Start Spark if not started
+
+ Args:
+ app_name (str): set name of the application
+ url (str): URL for spark master
+ memory (str): size of memory for spark driver. This will be ignored if spark.driver.memory is set in config.
+ config (dict): dictionary of configuration options
+ packages (list): list of packages to install
+ jars (list): list of jar files to add
+ repositories (list): list of maven repositories
+
+ Returns:
+ object: Spark context.
+ """
+
+ submit_args = ""
+ if packages is not None:
+ submit_args = "--packages {} ".format(",".join(packages))
+ if jars is not None:
+ submit_args += "--jars {} ".format(",".join(jars))
+ if repositories is not None:
+ submit_args += "--repositories {}".format(",".join(repositories))
+ if submit_args:
+ os.environ["PYSPARK_SUBMIT_ARGS"] = "{} pyspark-shell".format(submit_args)
+
+ spark_opts = [
+ 'SparkSession.builder.appName("{}")'.format(app_name),
+ 'master("{}")'.format(url),
+ ]
+
+ if config is not None:
+ for key, raw_value in config.items():
+ value = (
+ '"{}"'.format(raw_value) if isinstance(raw_value, str) else raw_value
+ )
+ spark_opts.append('config("{key}", {value})'.format(key=key, value=value))
+
+ if config is None or "spark.driver.memory" not in config:
+ spark_opts.append('config("spark.driver.memory", "{}")'.format(memory))
+
+ # Set larger stack size
+ spark_opts.append('config("spark.executor.extraJavaOptions", "-Xss4m")')
+ spark_opts.append('config("spark.driver.extraJavaOptions", "-Xss4m")')
+
+ spark_opts.append("getOrCreate()")
+ return eval(".".join(spark_opts))
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+import itertools
+import numpy as np
+import tensorflow as tf
+from tensorflow_estimator.python.estimator.export.export import (
+ build_supervised_input_receiver_fn_from_input_fn,
+)
+
+MODEL_DIR = "model_checkpoints"
+
+
+OPTIMIZERS = dict(
+ adadelta=tf.compat.v1.train.AdadeltaOptimizer,
+ adagrad=tf.compat.v1.train.AdagradOptimizer,
+ adam=tf.compat.v1.train.AdamOptimizer,
+ ftrl=tf.compat.v1.train.FtrlOptimizer,
+ momentum=tf.compat.v1.train.MomentumOptimizer,
+ rmsprop=tf.compat.v1.train.RMSPropOptimizer,
+ sgd=tf.compat.v1.train.GradientDescentOptimizer,
+)
+
+
+[docs]def pandas_input_fn_for_saved_model(df, feat_name_type):
+ """Pandas input function for TensorFlow SavedModel.
+
+ Args:
+ df (pandas.DataFrame): Data containing features.
+ feat_name_type (dict): Feature name and type spec. E.g.
+ `{'userID': int, 'itemID': int, 'rating': float}`
+
+ Returns:
+ func: Input function
+
+ """
+ for feat_type in feat_name_type.values():
+ assert feat_type in (int, float, list)
+
+ def input_fn():
+ examples = [None] * len(df)
+ for i, sample in df.iterrows():
+ ex = tf.train.Example()
+ for feat_name, feat_type in feat_name_type.items():
+ feat = ex.features.feature[feat_name]
+ if feat_type == int:
+ feat.int64_list.value.extend([sample[feat_name]])
+ elif feat_type == float:
+ feat.float_list.value.extend([sample[feat_name]])
+ elif feat_type == list:
+ feat.float_list.value.extend(sample[feat_name])
+ examples[i] = ex.SerializeToString()
+ return {"inputs": tf.constant(examples)}
+
+ return input_fn
+
+
+[docs]def pandas_input_fn(
+ df, y_col=None, batch_size=128, num_epochs=1, shuffle=False, seed=None
+):
+ """Pandas input function for TensorFlow high-level API Estimator.
+ This function returns a `tf.data.Dataset` function.
+
+ Note:
+ `tf.estimator.inputs.pandas_input_fn` cannot handle array/list column properly.
+ For more information, see https://www.tensorflow.org/api_docs/python/tf/estimator/inputs/numpy_input_fn
+
+ Args:
+ df (pandas.DataFrame): Data containing features.
+ y_col (str): Label column name if df has it.
+ batch_size (int): Batch size for the input function.
+ num_epochs (int): Number of epochs to iterate over data. If `None`, it will run forever.
+ shuffle (bool): If True, shuffles the data queue.
+ seed (int): Random seed for shuffle.
+
+ Returns:
+ tf.data.Dataset: Function.
+ """
+
+ X_df = df.copy()
+ if y_col is not None:
+ y = X_df.pop(y_col).values
+ else:
+ y = None
+
+ X = {}
+ for col in X_df.columns:
+ values = X_df[col].values
+ if isinstance(values[0], (list, np.ndarray)):
+ values = np.array(values.tolist(), dtype=np.float32)
+ X[col] = values
+
+ return lambda: _dataset(
+ x=X,
+ y=y,
+ batch_size=batch_size,
+ num_epochs=num_epochs,
+ shuffle=shuffle,
+ seed=seed,
+ )
+
+
+def _dataset(x, y=None, batch_size=128, num_epochs=1, shuffle=False, seed=None):
+ if y is None:
+ dataset = tf.data.Dataset.from_tensor_slices(x)
+ else:
+ dataset = tf.data.Dataset.from_tensor_slices((x, y))
+
+ if shuffle:
+ dataset = dataset.shuffle(
+ 1000, seed=seed, reshuffle_each_iteration=True # buffer size = 1000
+ )
+ elif seed is not None:
+ import warnings
+
+ warnings.warn("Seed was set but `shuffle=False`. Seed will be ignored.")
+
+ return dataset.repeat(num_epochs).batch(batch_size)
+
+
+[docs]def build_optimizer(name, lr=0.001, **kwargs):
+ """Get an optimizer for TensorFlow high-level API Estimator.
+
+ Available options are: `adadelta`, `adagrad`, `adam`, `ftrl`, `momentum`, `rmsprop` or `sgd`.
+
+ Args:
+ name (str): Optimizer name.
+ lr (float): Learning rate
+ kwargs: Optimizer arguments as key-value pairs
+
+ Returns:
+ tf.train.Optimizer: Tensorflow optimizer.
+ """
+ name = name.lower()
+
+ try:
+ optimizer_class = OPTIMIZERS[name]
+ except KeyError:
+ raise KeyError("Optimizer name should be one of: {}".format(list(OPTIMIZERS)))
+
+ # Set parameters
+ params = {}
+ if name == "ftrl":
+ params["l1_regularization_strength"] = kwargs.get(
+ "l1_regularization_strength", 0.0
+ )
+ params["l2_regularization_strength"] = kwargs.get(
+ "l2_regularization_strength", 0.0
+ )
+ elif name == "momentum" or name == "rmsprop":
+ params["momentum"] = kwargs.get("momentum", 0.0)
+
+ return optimizer_class(learning_rate=lr, **params)
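+
+# Illustrative example (not part of the original module):
+# build_optimizer("momentum", lr=0.01, momentum=0.9) returns a
+# tf.compat.v1.train.MomentumOptimizer configured with learning_rate=0.01 and
+# momentum=0.9; an unknown name raises a KeyError listing the supported optimizers.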
+
+
+[docs]def export_model(model, train_input_fn, eval_input_fn, tf_feat_cols, base_dir):
+ """Export TensorFlow estimator (model).
+
+ Args:
+ model (tf.estimator.Estimator): Model to export.
+ train_input_fn (function): Training input function to create data receiver spec.
+ eval_input_fn (function): Evaluation input function to create data receiver spec.
+ tf_feat_cols (list(tf.feature_column)): Feature columns.
+ base_dir (str): Base directory to export the model.
+
+ Returns:
+ str: Exported model path
+ """
+ tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+ train_rcvr_fn = build_supervised_input_receiver_fn_from_input_fn(train_input_fn)
+ eval_rcvr_fn = build_supervised_input_receiver_fn_from_input_fn(eval_input_fn)
+ serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
+ tf.feature_column.make_parse_example_spec(tf_feat_cols)
+ )
+ rcvr_fn_map = {
+ tf.estimator.ModeKeys.TRAIN: train_rcvr_fn,
+ tf.estimator.ModeKeys.EVAL: eval_rcvr_fn,
+ tf.estimator.ModeKeys.PREDICT: serve_rcvr_fn,
+ }
+ exported_path = model.experimental_export_all_saved_models(
+ export_dir_base=base_dir, input_receiver_fn_map=rcvr_fn_map
+ )
+
+ return exported_path.decode("utf-8")
+
+
+[docs]def evaluation_log_hook(
+ estimator,
+ logger,
+ true_df,
+ y_col,
+ eval_df,
+ every_n_iter=10000,
+ model_dir=None,
+ batch_size=256,
+ eval_fns=None,
+ **eval_kwargs
+):
+ """Evaluation log hook for TensorFlow high-level API Estimator.
+
+ Note:
+ TensorFlow Estimator model uses the last checkpoint weights for evaluation or prediction.
+ In order to get the most up-to-date evaluation results while training,
+ set model's `save_checkpoints_steps` to be equal or greater than hook's `every_n_iter`.
+
+ Args:
+ estimator (tf.estimator.Estimator): Model to evaluate.
+ logger (Logger): Custom logger to log the results.
+ E.g., define a subclass of Logger for AzureML logging.
+ true_df (pd.DataFrame): Ground-truth data.
+ y_col (str): Label column name in true_df
+ eval_df (pd.DataFrame): Evaluation data without label column.
+ every_n_iter (int): Evaluation frequency (steps).
+ model_dir (str): Model directory to save the summaries to. If None, does not record.
+ batch_size (int): Number of samples fed into the model at a time.
+ Note that the batch size does not affect the evaluation results.
+ eval_fns (iterable of functions): List of evaluation functions that have signature of
+ `(true_df, prediction_df, **eval_kwargs)`->`float`. If None, loss is calculated on `true_df`.
+ eval_kwargs: Evaluation function's keyword arguments.
+ Note that the prediction column name should be 'prediction'.
+
+ Returns:
+ tf.train.SessionRunHook: Session run hook to evaluate the model while training.
+ """
+
+ return _TrainLogHook(
+ estimator,
+ logger,
+ true_df,
+ y_col,
+ eval_df,
+ every_n_iter,
+ model_dir,
+ batch_size,
+ eval_fns,
+ **eval_kwargs
+ )
+
+
+class _TrainLogHook(tf.estimator.SessionRunHook):
+ def __init__(
+ self,
+ estimator,
+ logger,
+ true_df,
+ y_col,
+ eval_df,
+ every_n_iter=10000,
+ model_dir=None,
+ batch_size=256,
+ eval_fns=None,
+ **eval_kwargs
+ ):
+ """Evaluation log hook class"""
+ self.model = estimator
+ self.logger = logger
+ self.true_df = true_df
+ self.y_col = y_col
+ self.eval_df = eval_df
+ self.every_n_iter = every_n_iter
+ self.model_dir = model_dir
+ self.batch_size = batch_size
+ self.eval_fns = eval_fns
+ self.eval_kwargs = eval_kwargs
+
+ self.summary_writer = None
+ self.global_step_tensor = None
+ self.step = 0
+
+ def begin(self):
+ if self.model_dir is not None:
+ self.summary_writer = tf.compat.v1.summary.FileWriterCache.get(
+ self.model_dir
+ )
+ self.global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
+ else:
+ self.step = 0
+
+ def before_run(self, run_context):
+ if self.global_step_tensor is not None:
+ requests = {"global_step": self.global_step_tensor}
+ return tf.estimator.SessionRunArgs(requests)
+ else:
+ return None
+
+ def after_run(self, run_context, run_values):
+ if self.global_step_tensor is not None:
+ self.step = run_values.results["global_step"]
+ else:
+ self.step += 1
+
+ if self.step % self.every_n_iter == 0:
+ _prev_log_level = tf.compat.v1.logging.get_verbosity()
+ tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+ if self.eval_fns is None:
+ result = self.model.evaluate(
+ input_fn=pandas_input_fn(
+ df=self.true_df, y_col=self.y_col, batch_size=self.batch_size
+ )
+ )["average_loss"]
+ self._log("validation_loss", result)
+ else:
+ predictions = list(
+ itertools.islice(
+ self.model.predict(
+ input_fn=pandas_input_fn(
+ df=self.eval_df, batch_size=self.batch_size
+ )
+ ),
+ len(self.eval_df),
+ )
+ )
+ prediction_df = self.eval_df.copy()
+ prediction_df["prediction"] = [p["predictions"][0] for p in predictions]
+ for fn in self.eval_fns:
+ result = fn(self.true_df, prediction_df, **self.eval_kwargs)
+ self._log(fn.__name__, result)
+
+ tf.compat.v1.logging.set_verbosity(_prev_log_level)
+
+ def end(self, session):
+ if self.summary_writer is not None:
+ self.summary_writer.flush()
+
+ def _log(self, tag, value):
+ self.logger.log(tag, value)
+ if self.summary_writer is not None:
+ summary = tf.compat.v1.Summary(
+ value=[tf.compat.v1.Summary.Value(tag=tag, simple_value=value)]
+ )
+ self.summary_writer.add_summary(summary, self.step)
+
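The hook only ever calls `logger.log(tag, value)` (see `_log` above), so any object exposing that method can be passed as the logger. Below is a hedged sketch of an Azure ML-style logger; the `azureml.core.Run` usage is an assumption about that SDK, not something defined in this module.

# Sketch: a logger adapter that forwards hook metrics to an Azure ML run.
# Falls back to plain printing when the azureml-core SDK is unavailable.
class AzureMLLogger:
    def __init__(self):
        try:
            from azureml.core import Run  # optional dependency (assumed API)

            self.run = Run.get_context()
        except ImportError:
            self.run = None

    def log(self, metric, value):
        """Same signature the evaluation hook expects: log(tag, value)."""
        if self.run is not None:
            self.run.log(metric, value)
        else:
            print(f"{metric}: {value}")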
+
+[docs]class MetricsLogger:
+ """Metrics logger"""
+
+ def __init__(self):
+ """Initializer"""
+ self._log = {}
+
+[docs] def log(self, metric, value):
+ """Log metrics. Each metric's log will be stored in the corresponding list.
+
+ Args:
+ metric (str): Metric name.
+ value (float): Value.
+ """
+ if metric not in self._log:
+ self._log[metric] = []
+ self._log[metric].append(value)
+
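A short illustration of how `MetricsLogger` accumulates values: each metric name maps to the list of values logged under it, in order. Reading the internal `_log` dict directly is done here only for the sketch; the class may expose a getter elsewhere in the module.

ml = MetricsLogger()
for loss in (0.92, 0.61, 0.48):
    ml.log("validation_loss", loss)
ml.log("rmse", 1.05)

# Internal state after logging (accessed directly only for illustration):
# {"validation_loss": [0.92, 0.61, 0.48], "rmse": [1.05]}
print(ml._log)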
+
+
+# Copyright (c) Recommenders contributors.
+# Licensed under the MIT License.
+
+from timeit import default_timer
+
+
+[docs]class Timer(object):
+ """Timer class.
+
+ `Original code <https://github.com/miguelgfierro/pybase/blob/2298172a13fb4a243754acbc6029a4a2dcf72c20/log_base/timer.py>`_.
+
+ Examples:
+ >>> import time
+ >>> t = Timer()
+ >>> t.start()
+ >>> time.sleep(1)
+ >>> t.stop()
+        >>> t.interval >= 1
+        True
+ >>> with Timer() as t:
+ ... time.sleep(1)
+        >>> t.interval >= 1
+        True
+ >>> "Time elapsed {}".format(t) #doctest: +ELLIPSIS
+ 'Time elapsed 1...'
+ """
+
+ def __init__(self):
+ self._timer = default_timer
+ self._interval = 0
+ self.running = False
+
+ def __enter__(self):
+ self.start()
+ return self
+
+ def __exit__(self, *args):
+ self.stop()
+
+ def __str__(self):
+ return "{:0.4f}".format(self.interval)
+
+[docs] def start(self):
+        """Start the timer."""
+        self.init = self._timer()
+        self.running = True
+
+[docs] def stop(self):
+ """Stop the timer. Calculate the interval in seconds."""
+ self.end = self._timer()
+ try:
+ self._interval = self.end - self.init
+ self.running = False
+ except AttributeError:
+ raise ValueError(
+ "Timer has not been initialized: use start() or the contextual form with Timer() as t:"
+ )
+
+ @property
+ def interval(self):
+ """Get time interval in seconds.
+
+ Returns:
+ float: Seconds.
+ """
+ if self.running:
+ raise ValueError("Timer has not been stopped, please use stop().")
+ else:
+ return self._interval
+
+ */ + .o-tooltip--left { + position: relative; + } + + .o-tooltip--left:after { + opacity: 0; + visibility: hidden; + position: absolute; + content: attr(data-tooltip); + padding: .2em; + font-size: .8em; + left: -.2em; + background: grey; + color: white; + white-space: nowrap; + z-index: 2; + border-radius: 2px; + transform: translateX(-102%) translateY(0); + transition: opacity 0.2s cubic-bezier(0.64, 0.09, 0.08, 1), transform 0.2s cubic-bezier(0.64, 0.09, 0.08, 1); +} + +.o-tooltip--left:hover:after { + display: block; + opacity: 1; + visibility: visible; + transform: translateX(-100%) translateY(0); + transition: opacity 0.2s cubic-bezier(0.64, 0.09, 0.08, 1), transform 0.2s cubic-bezier(0.64, 0.09, 0.08, 1); + transition-delay: .5s; +} + +/* By default the copy button shouldn't show up when printing a page */ +@media print { + button.copybtn { + display: none; + } +} diff --git a/_static/copybutton.js b/_static/copybutton.js new file mode 100644 index 0000000000..2ea7ff3e21 --- /dev/null +++ b/_static/copybutton.js @@ -0,0 +1,248 @@ +// Localization support +const messages = { + 'en': { + 'copy': 'Copy', + 'copy_to_clipboard': 'Copy to clipboard', + 'copy_success': 'Copied!', + 'copy_failure': 'Failed to copy', + }, + 'es' : { + 'copy': 'Copiar', + 'copy_to_clipboard': 'Copiar al portapapeles', + 'copy_success': '¡Copiado!', + 'copy_failure': 'Error al copiar', + }, + 'de' : { + 'copy': 'Kopieren', + 'copy_to_clipboard': 'In die Zwischenablage kopieren', + 'copy_success': 'Kopiert!', + 'copy_failure': 'Fehler beim Kopieren', + }, + 'fr' : { + 'copy': 'Copier', + 'copy_to_clipboard': 'Copier dans le presse-papier', + 'copy_success': 'Copié !', + 'copy_failure': 'Échec de la copie', + }, + 'ru': { + 'copy': 'Скопировать', + 'copy_to_clipboard': 'Скопировать в буфер', + 'copy_success': 'Скопировано!', + 'copy_failure': 'Не удалось скопировать', + }, + 'zh-CN': { + 'copy': '复制', + 'copy_to_clipboard': '复制到剪贴板', + 'copy_success': '复制成功!', + 'copy_failure': '复制失败', + }, + 'it' : { + 'copy': 'Copiare', + 'copy_to_clipboard': 'Copiato negli appunti', + 'copy_success': 'Copiato!', + 'copy_failure': 'Errore durante la copia', + } +} + +let locale = 'en' +if( document.documentElement.lang !== undefined + && messages[document.documentElement.lang] !== undefined ) { + locale = document.documentElement.lang +} + +let doc_url_root = DOCUMENTATION_OPTIONS.URL_ROOT; +if (doc_url_root == '#') { + doc_url_root = ''; +} + +/** + * SVG files for our copy buttons + */ +let iconCheck = `` + +// If the user specified their own SVG use that, otherwise use the default +let iconCopy = ``; +if (!iconCopy) { + iconCopy = `` +} + +/** + * Set up copy/paste for code blocks + */ + +const runWhenDOMLoaded = cb => { + if (document.readyState != 'loading') { + cb() + } else if (document.addEventListener) { + document.addEventListener('DOMContentLoaded', cb) + } else { + document.attachEvent('onreadystatechange', function() { + if (document.readyState == 'complete') cb() + }) + } +} + +const codeCellId = index => `codecell${index}` + +// Clears selected text since ClipboardJS will select the text when copying +const clearSelection = () => { + if (window.getSelection) { + window.getSelection().removeAllRanges() + } else if (document.selection) { + document.selection.empty() + } +} + +// Changes tooltip text for a moment, then changes it back +// We want the timeout of our `success` class to be a bit shorter than the +// tooltip and icon change, so that we can hide the icon before changing back. 
+var timeoutIcon = 2000; +var timeoutSuccessClass = 1500; + +const temporarilyChangeTooltip = (el, oldText, newText) => { + el.setAttribute('data-tooltip', newText) + el.classList.add('success') + // Remove success a little bit sooner than we change the tooltip + // So that we can use CSS to hide the copybutton first + setTimeout(() => el.classList.remove('success'), timeoutSuccessClass) + setTimeout(() => el.setAttribute('data-tooltip', oldText), timeoutIcon) +} + +// Changes the copy button icon for two seconds, then changes it back +const temporarilyChangeIcon = (el) => { + el.innerHTML = iconCheck; + setTimeout(() => {el.innerHTML = iconCopy}, timeoutIcon) +} + +const addCopyButtonToCodeCells = () => { + // If ClipboardJS hasn't loaded, wait a bit and try again. This + // happens because we load ClipboardJS asynchronously. + if (window.ClipboardJS === undefined) { + setTimeout(addCopyButtonToCodeCells, 250) + return + } + + // Add copybuttons to all of our code cells + const COPYBUTTON_SELECTOR = 'div.highlight pre'; + const codeCells = document.querySelectorAll(COPYBUTTON_SELECTOR) + codeCells.forEach((codeCell, index) => { + const id = codeCellId(index) + codeCell.setAttribute('id', id) + + const clipboardButton = id => + `` + codeCell.insertAdjacentHTML('afterend', clipboardButton(id)) + }) + +function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string +} + +/** + * Removes excluded text from a Node. + * + * @param {Node} target Node to filter. + * @param {string} exclude CSS selector of nodes to exclude. + * @returns {DOMString} Text from `target` with text removed. + */ +function filterText(target, exclude) { + const clone = target.cloneNode(true); // clone as to not modify the live DOM + if (exclude) { + // remove excluded nodes + clone.querySelectorAll(exclude).forEach(node => node.remove()); + } + return clone.innerText; +} + +// Callback when a copy button is clicked. Will be passed the node that was clicked +// should then grab the text and replace pieces of text that shouldn't be used in output +function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onlyCopyPromptLines = true, removePrompts = true, copyEmptyLines = true, lineContinuationChar = "", hereDocDelim = "") { + var regexp; + var match; + + // Do we check for line continuation characters and "HERE-documents"? 
+ var useLineCont = !!lineContinuationChar + var useHereDoc = !!hereDocDelim + + // create regexp to capture prompt and remaining line + if (isRegexp) { + regexp = new RegExp('^(' + copybuttonPromptText + ')(.*)') + } else { + regexp = new RegExp('^(' + escapeRegExp(copybuttonPromptText) + ')(.*)') + } + + const outputLines = []; + var promptFound = false; + var gotLineCont = false; + var gotHereDoc = false; + const lineGotPrompt = []; + for (const line of textContent.split('\n')) { + match = line.match(regexp) + if (match || gotLineCont || gotHereDoc) { + promptFound = regexp.test(line) + lineGotPrompt.push(promptFound) + if (removePrompts && promptFound) { + outputLines.push(match[2]) + } else { + outputLines.push(line) + } + gotLineCont = line.endsWith(lineContinuationChar) & useLineCont + if (line.includes(hereDocDelim) & useHereDoc) + gotHereDoc = !gotHereDoc + } else if (!onlyCopyPromptLines) { + outputLines.push(line) + } else if (copyEmptyLines && line.trim() === '') { + outputLines.push(line) + } + } + + // If no lines with the prompt were found then just use original lines + if (lineGotPrompt.some(v => v === true)) { + textContent = outputLines.join('\n'); + } + + // Remove a trailing newline to avoid auto-running when pasting + if (textContent.endsWith("\n")) { + textContent = textContent.slice(0, -1) + } + return textContent +} + + +var copyTargetText = (trigger) => { + var target = document.querySelector(trigger.attributes['data-clipboard-target'].value); + + // get filtered text + let exclude = '.linenos'; + + let text = filterText(target, exclude); + return formatCopyText(text, '', false, true, true, true, '', '') +} + + // Initialize with a callback so we can modify the text before copy + const clipboard = new ClipboardJS('.copybtn', {text: copyTargetText}) + + // Update UI with error/success messages + clipboard.on('success', event => { + clearSelection() + temporarilyChangeTooltip(event.trigger, messages[locale]['copy'], messages[locale]['copy_success']) + temporarilyChangeIcon(event.trigger) + }) + + clipboard.on('error', event => { + temporarilyChangeTooltip(event.trigger, messages[locale]['copy'], messages[locale]['copy_failure']) + }) +} + +runWhenDOMLoaded(addCopyButtonToCodeCells) \ No newline at end of file diff --git a/_static/copybutton_funcs.js b/_static/copybutton_funcs.js new file mode 100644 index 0000000000..dbe1aaad79 --- /dev/null +++ b/_static/copybutton_funcs.js @@ -0,0 +1,73 @@ +function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string +} + +/** + * Removes excluded text from a Node. + * + * @param {Node} target Node to filter. + * @param {string} exclude CSS selector of nodes to exclude. + * @returns {DOMString} Text from `target` with text removed. + */ +export function filterText(target, exclude) { + const clone = target.cloneNode(true); // clone as to not modify the live DOM + if (exclude) { + // remove excluded nodes + clone.querySelectorAll(exclude).forEach(node => node.remove()); + } + return clone.innerText; +} + +// Callback when a copy button is clicked. 
Will be passed the node that was clicked +// should then grab the text and replace pieces of text that shouldn't be used in output +export function formatCopyText(textContent, copybuttonPromptText, isRegexp = false, onlyCopyPromptLines = true, removePrompts = true, copyEmptyLines = true, lineContinuationChar = "", hereDocDelim = "") { + var regexp; + var match; + + // Do we check for line continuation characters and "HERE-documents"? + var useLineCont = !!lineContinuationChar + var useHereDoc = !!hereDocDelim + + // create regexp to capture prompt and remaining line + if (isRegexp) { + regexp = new RegExp('^(' + copybuttonPromptText + ')(.*)') + } else { + regexp = new RegExp('^(' + escapeRegExp(copybuttonPromptText) + ')(.*)') + } + + const outputLines = []; + var promptFound = false; + var gotLineCont = false; + var gotHereDoc = false; + const lineGotPrompt = []; + for (const line of textContent.split('\n')) { + match = line.match(regexp) + if (match || gotLineCont || gotHereDoc) { + promptFound = regexp.test(line) + lineGotPrompt.push(promptFound) + if (removePrompts && promptFound) { + outputLines.push(match[2]) + } else { + outputLines.push(line) + } + gotLineCont = line.endsWith(lineContinuationChar) & useLineCont + if (line.includes(hereDocDelim) & useHereDoc) + gotHereDoc = !gotHereDoc + } else if (!onlyCopyPromptLines) { + outputLines.push(line) + } else if (copyEmptyLines && line.trim() === '') { + outputLines.push(line) + } + } + + // If no lines with the prompt were found then just use original lines + if (lineGotPrompt.some(v => v === true)) { + textContent = outputLines.join('\n'); + } + + // Remove a trailing newline to avoid auto-running when pasting + if (textContent.endsWith("\n")) { + textContent = textContent.slice(0, -1) + } + return textContent +} diff --git a/_static/design-style.4045f2051d55cab465a707391d5b2007.min.css b/_static/design-style.4045f2051d55cab465a707391d5b2007.min.css new file mode 100644 index 0000000000..3225661c25 --- /dev/null +++ b/_static/design-style.4045f2051d55cab465a707391d5b2007.min.css @@ -0,0 +1 @@ +.sd-bg-primary{background-color:var(--sd-color-primary) !important}.sd-bg-text-primary{color:var(--sd-color-primary-text) !important}button.sd-bg-primary:focus,button.sd-bg-primary:hover{background-color:var(--sd-color-primary-highlight) !important}a.sd-bg-primary:focus,a.sd-bg-primary:hover{background-color:var(--sd-color-primary-highlight) !important}.sd-bg-secondary{background-color:var(--sd-color-secondary) !important}.sd-bg-text-secondary{color:var(--sd-color-secondary-text) !important}button.sd-bg-secondary:focus,button.sd-bg-secondary:hover{background-color:var(--sd-color-secondary-highlight) !important}a.sd-bg-secondary:focus,a.sd-bg-secondary:hover{background-color:var(--sd-color-secondary-highlight) !important}.sd-bg-success{background-color:var(--sd-color-success) !important}.sd-bg-text-success{color:var(--sd-color-success-text) !important}button.sd-bg-success:focus,button.sd-bg-success:hover{background-color:var(--sd-color-success-highlight) !important}a.sd-bg-success:focus,a.sd-bg-success:hover{background-color:var(--sd-color-success-highlight) !important}.sd-bg-info{background-color:var(--sd-color-info) !important}.sd-bg-text-info{color:var(--sd-color-info-text) !important}button.sd-bg-info:focus,button.sd-bg-info:hover{background-color:var(--sd-color-info-highlight) !important}a.sd-bg-info:focus,a.sd-bg-info:hover{background-color:var(--sd-color-info-highlight) !important}.sd-bg-warning{background-color:var(--sd-color-warning) 
!important}.sd-bg-text-warning{color:var(--sd-color-warning-text) !important}button.sd-bg-warning:focus,button.sd-bg-warning:hover{background-color:var(--sd-color-warning-highlight) !important}a.sd-bg-warning:focus,a.sd-bg-warning:hover{background-color:var(--sd-color-warning-highlight) !important}.sd-bg-danger{background-color:var(--sd-color-danger) !important}.sd-bg-text-danger{color:var(--sd-color-danger-text) !important}button.sd-bg-danger:focus,button.sd-bg-danger:hover{background-color:var(--sd-color-danger-highlight) !important}a.sd-bg-danger:focus,a.sd-bg-danger:hover{background-color:var(--sd-color-danger-highlight) !important}.sd-bg-light{background-color:var(--sd-color-light) !important}.sd-bg-text-light{color:var(--sd-color-light-text) !important}button.sd-bg-light:focus,button.sd-bg-light:hover{background-color:var(--sd-color-light-highlight) !important}a.sd-bg-light:focus,a.sd-bg-light:hover{background-color:var(--sd-color-light-highlight) !important}.sd-bg-muted{background-color:var(--sd-color-muted) !important}.sd-bg-text-muted{color:var(--sd-color-muted-text) !important}button.sd-bg-muted:focus,button.sd-bg-muted:hover{background-color:var(--sd-color-muted-highlight) !important}a.sd-bg-muted:focus,a.sd-bg-muted:hover{background-color:var(--sd-color-muted-highlight) !important}.sd-bg-dark{background-color:var(--sd-color-dark) !important}.sd-bg-text-dark{color:var(--sd-color-dark-text) !important}button.sd-bg-dark:focus,button.sd-bg-dark:hover{background-color:var(--sd-color-dark-highlight) !important}a.sd-bg-dark:focus,a.sd-bg-dark:hover{background-color:var(--sd-color-dark-highlight) !important}.sd-bg-black{background-color:var(--sd-color-black) !important}.sd-bg-text-black{color:var(--sd-color-black-text) !important}button.sd-bg-black:focus,button.sd-bg-black:hover{background-color:var(--sd-color-black-highlight) !important}a.sd-bg-black:focus,a.sd-bg-black:hover{background-color:var(--sd-color-black-highlight) !important}.sd-bg-white{background-color:var(--sd-color-white) !important}.sd-bg-text-white{color:var(--sd-color-white-text) !important}button.sd-bg-white:focus,button.sd-bg-white:hover{background-color:var(--sd-color-white-highlight) !important}a.sd-bg-white:focus,a.sd-bg-white:hover{background-color:var(--sd-color-white-highlight) !important}.sd-text-primary,.sd-text-primary>p{color:var(--sd-color-primary) !important}a.sd-text-primary:focus,a.sd-text-primary:hover{color:var(--sd-color-primary-highlight) !important}.sd-text-secondary,.sd-text-secondary>p{color:var(--sd-color-secondary) !important}a.sd-text-secondary:focus,a.sd-text-secondary:hover{color:var(--sd-color-secondary-highlight) !important}.sd-text-success,.sd-text-success>p{color:var(--sd-color-success) !important}a.sd-text-success:focus,a.sd-text-success:hover{color:var(--sd-color-success-highlight) !important}.sd-text-info,.sd-text-info>p{color:var(--sd-color-info) !important}a.sd-text-info:focus,a.sd-text-info:hover{color:var(--sd-color-info-highlight) !important}.sd-text-warning,.sd-text-warning>p{color:var(--sd-color-warning) !important}a.sd-text-warning:focus,a.sd-text-warning:hover{color:var(--sd-color-warning-highlight) !important}.sd-text-danger,.sd-text-danger>p{color:var(--sd-color-danger) !important}a.sd-text-danger:focus,a.sd-text-danger:hover{color:var(--sd-color-danger-highlight) !important}.sd-text-light,.sd-text-light>p{color:var(--sd-color-light) !important}a.sd-text-light:focus,a.sd-text-light:hover{color:var(--sd-color-light-highlight) 
!important}.sd-text-muted,.sd-text-muted>p{color:var(--sd-color-muted) !important}a.sd-text-muted:focus,a.sd-text-muted:hover{color:var(--sd-color-muted-highlight) !important}.sd-text-dark,.sd-text-dark>p{color:var(--sd-color-dark) !important}a.sd-text-dark:focus,a.sd-text-dark:hover{color:var(--sd-color-dark-highlight) !important}.sd-text-black,.sd-text-black>p{color:var(--sd-color-black) !important}a.sd-text-black:focus,a.sd-text-black:hover{color:var(--sd-color-black-highlight) !important}.sd-text-white,.sd-text-white>p{color:var(--sd-color-white) !important}a.sd-text-white:focus,a.sd-text-white:hover{color:var(--sd-color-white-highlight) !important}.sd-outline-primary{border-color:var(--sd-color-primary) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-primary:focus,a.sd-outline-primary:hover{border-color:var(--sd-color-primary-highlight) !important}.sd-outline-secondary{border-color:var(--sd-color-secondary) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-secondary:focus,a.sd-outline-secondary:hover{border-color:var(--sd-color-secondary-highlight) !important}.sd-outline-success{border-color:var(--sd-color-success) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-success:focus,a.sd-outline-success:hover{border-color:var(--sd-color-success-highlight) !important}.sd-outline-info{border-color:var(--sd-color-info) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-info:focus,a.sd-outline-info:hover{border-color:var(--sd-color-info-highlight) !important}.sd-outline-warning{border-color:var(--sd-color-warning) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-warning:focus,a.sd-outline-warning:hover{border-color:var(--sd-color-warning-highlight) !important}.sd-outline-danger{border-color:var(--sd-color-danger) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-danger:focus,a.sd-outline-danger:hover{border-color:var(--sd-color-danger-highlight) !important}.sd-outline-light{border-color:var(--sd-color-light) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-light:focus,a.sd-outline-light:hover{border-color:var(--sd-color-light-highlight) !important}.sd-outline-muted{border-color:var(--sd-color-muted) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-muted:focus,a.sd-outline-muted:hover{border-color:var(--sd-color-muted-highlight) !important}.sd-outline-dark{border-color:var(--sd-color-dark) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-dark:focus,a.sd-outline-dark:hover{border-color:var(--sd-color-dark-highlight) !important}.sd-outline-black{border-color:var(--sd-color-black) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-black:focus,a.sd-outline-black:hover{border-color:var(--sd-color-black-highlight) !important}.sd-outline-white{border-color:var(--sd-color-white) !important;border-style:solid !important;border-width:1px !important}a.sd-outline-white:focus,a.sd-outline-white:hover{border-color:var(--sd-color-white-highlight) !important}.sd-bg-transparent{background-color:transparent !important}.sd-outline-transparent{border-color:transparent !important}.sd-text-transparent{color:transparent !important}.sd-p-0{padding:0 !important}.sd-pt-0,.sd-py-0{padding-top:0 !important}.sd-pr-0,.sd-px-0{padding-right:0 !important}.sd-pb-0,.sd-py-0{padding-bottom:0 
!important}.sd-pl-0,.sd-px-0{padding-left:0 !important}.sd-p-1{padding:.25rem !important}.sd-pt-1,.sd-py-1{padding-top:.25rem !important}.sd-pr-1,.sd-px-1{padding-right:.25rem !important}.sd-pb-1,.sd-py-1{padding-bottom:.25rem !important}.sd-pl-1,.sd-px-1{padding-left:.25rem !important}.sd-p-2{padding:.5rem !important}.sd-pt-2,.sd-py-2{padding-top:.5rem !important}.sd-pr-2,.sd-px-2{padding-right:.5rem !important}.sd-pb-2,.sd-py-2{padding-bottom:.5rem !important}.sd-pl-2,.sd-px-2{padding-left:.5rem !important}.sd-p-3{padding:1rem !important}.sd-pt-3,.sd-py-3{padding-top:1rem !important}.sd-pr-3,.sd-px-3{padding-right:1rem !important}.sd-pb-3,.sd-py-3{padding-bottom:1rem !important}.sd-pl-3,.sd-px-3{padding-left:1rem !important}.sd-p-4{padding:1.5rem !important}.sd-pt-4,.sd-py-4{padding-top:1.5rem !important}.sd-pr-4,.sd-px-4{padding-right:1.5rem !important}.sd-pb-4,.sd-py-4{padding-bottom:1.5rem !important}.sd-pl-4,.sd-px-4{padding-left:1.5rem !important}.sd-p-5{padding:3rem !important}.sd-pt-5,.sd-py-5{padding-top:3rem !important}.sd-pr-5,.sd-px-5{padding-right:3rem !important}.sd-pb-5,.sd-py-5{padding-bottom:3rem !important}.sd-pl-5,.sd-px-5{padding-left:3rem !important}.sd-m-auto{margin:auto !important}.sd-mt-auto,.sd-my-auto{margin-top:auto !important}.sd-mr-auto,.sd-mx-auto{margin-right:auto !important}.sd-mb-auto,.sd-my-auto{margin-bottom:auto !important}.sd-ml-auto,.sd-mx-auto{margin-left:auto !important}.sd-m-0{margin:0 !important}.sd-mt-0,.sd-my-0{margin-top:0 !important}.sd-mr-0,.sd-mx-0{margin-right:0 !important}.sd-mb-0,.sd-my-0{margin-bottom:0 !important}.sd-ml-0,.sd-mx-0{margin-left:0 !important}.sd-m-1{margin:.25rem !important}.sd-mt-1,.sd-my-1{margin-top:.25rem !important}.sd-mr-1,.sd-mx-1{margin-right:.25rem !important}.sd-mb-1,.sd-my-1{margin-bottom:.25rem !important}.sd-ml-1,.sd-mx-1{margin-left:.25rem !important}.sd-m-2{margin:.5rem !important}.sd-mt-2,.sd-my-2{margin-top:.5rem !important}.sd-mr-2,.sd-mx-2{margin-right:.5rem !important}.sd-mb-2,.sd-my-2{margin-bottom:.5rem !important}.sd-ml-2,.sd-mx-2{margin-left:.5rem !important}.sd-m-3{margin:1rem !important}.sd-mt-3,.sd-my-3{margin-top:1rem !important}.sd-mr-3,.sd-mx-3{margin-right:1rem !important}.sd-mb-3,.sd-my-3{margin-bottom:1rem !important}.sd-ml-3,.sd-mx-3{margin-left:1rem !important}.sd-m-4{margin:1.5rem !important}.sd-mt-4,.sd-my-4{margin-top:1.5rem !important}.sd-mr-4,.sd-mx-4{margin-right:1.5rem !important}.sd-mb-4,.sd-my-4{margin-bottom:1.5rem !important}.sd-ml-4,.sd-mx-4{margin-left:1.5rem !important}.sd-m-5{margin:3rem !important}.sd-mt-5,.sd-my-5{margin-top:3rem !important}.sd-mr-5,.sd-mx-5{margin-right:3rem !important}.sd-mb-5,.sd-my-5{margin-bottom:3rem !important}.sd-ml-5,.sd-mx-5{margin-left:3rem !important}.sd-w-25{width:25% !important}.sd-w-50{width:50% !important}.sd-w-75{width:75% !important}.sd-w-100{width:100% !important}.sd-w-auto{width:auto !important}.sd-h-25{height:25% !important}.sd-h-50{height:50% !important}.sd-h-75{height:75% !important}.sd-h-100{height:100% !important}.sd-h-auto{height:auto !important}.sd-d-none{display:none !important}.sd-d-inline{display:inline !important}.sd-d-inline-block{display:inline-block !important}.sd-d-block{display:block !important}.sd-d-grid{display:grid !important}.sd-d-flex-row{display:-ms-flexbox !important;display:flex !important;flex-direction:row !important}.sd-d-flex-column{display:-ms-flexbox !important;display:flex !important;flex-direction:column !important}.sd-d-inline-flex{display:-ms-inline-flexbox !important;display:inline-flex 
!important}@media(min-width: 576px){.sd-d-sm-none{display:none !important}.sd-d-sm-inline{display:inline !important}.sd-d-sm-inline-block{display:inline-block !important}.sd-d-sm-block{display:block !important}.sd-d-sm-grid{display:grid !important}.sd-d-sm-flex{display:-ms-flexbox !important;display:flex !important}.sd-d-sm-inline-flex{display:-ms-inline-flexbox !important;display:inline-flex !important}}@media(min-width: 768px){.sd-d-md-none{display:none !important}.sd-d-md-inline{display:inline !important}.sd-d-md-inline-block{display:inline-block !important}.sd-d-md-block{display:block !important}.sd-d-md-grid{display:grid !important}.sd-d-md-flex{display:-ms-flexbox !important;display:flex !important}.sd-d-md-inline-flex{display:-ms-inline-flexbox !important;display:inline-flex !important}}@media(min-width: 992px){.sd-d-lg-none{display:none !important}.sd-d-lg-inline{display:inline !important}.sd-d-lg-inline-block{display:inline-block !important}.sd-d-lg-block{display:block !important}.sd-d-lg-grid{display:grid !important}.sd-d-lg-flex{display:-ms-flexbox !important;display:flex !important}.sd-d-lg-inline-flex{display:-ms-inline-flexbox !important;display:inline-flex !important}}@media(min-width: 1200px){.sd-d-xl-none{display:none !important}.sd-d-xl-inline{display:inline !important}.sd-d-xl-inline-block{display:inline-block !important}.sd-d-xl-block{display:block !important}.sd-d-xl-grid{display:grid !important}.sd-d-xl-flex{display:-ms-flexbox !important;display:flex !important}.sd-d-xl-inline-flex{display:-ms-inline-flexbox !important;display:inline-flex !important}}.sd-align-major-start{justify-content:flex-start !important}.sd-align-major-end{justify-content:flex-end !important}.sd-align-major-center{justify-content:center !important}.sd-align-major-justify{justify-content:space-between !important}.sd-align-major-spaced{justify-content:space-evenly !important}.sd-align-minor-start{align-items:flex-start !important}.sd-align-minor-end{align-items:flex-end !important}.sd-align-minor-center{align-items:center !important}.sd-align-minor-stretch{align-items:stretch !important}.sd-text-justify{text-align:justify !important}.sd-text-left{text-align:left !important}.sd-text-right{text-align:right !important}.sd-text-center{text-align:center !important}.sd-font-weight-light{font-weight:300 !important}.sd-font-weight-lighter{font-weight:lighter !important}.sd-font-weight-normal{font-weight:400 !important}.sd-font-weight-bold{font-weight:700 !important}.sd-font-weight-bolder{font-weight:bolder !important}.sd-font-italic{font-style:italic !important}.sd-text-decoration-none{text-decoration:none !important}.sd-text-lowercase{text-transform:lowercase !important}.sd-text-uppercase{text-transform:uppercase !important}.sd-text-capitalize{text-transform:capitalize !important}.sd-text-wrap{white-space:normal !important}.sd-text-nowrap{white-space:nowrap !important}.sd-text-truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.sd-fs-1,.sd-fs-1>p{font-size:calc(1.375rem + 1.5vw) !important;line-height:unset !important}.sd-fs-2,.sd-fs-2>p{font-size:calc(1.325rem + 0.9vw) !important;line-height:unset !important}.sd-fs-3,.sd-fs-3>p{font-size:calc(1.3rem + 0.6vw) !important;line-height:unset !important}.sd-fs-4,.sd-fs-4>p{font-size:calc(1.275rem + 0.3vw) !important;line-height:unset !important}.sd-fs-5,.sd-fs-5>p{font-size:1.25rem !important;line-height:unset !important}.sd-fs-6,.sd-fs-6>p{font-size:1rem !important;line-height:unset !important}.sd-border-0{border:0 solid 
!important}.sd-border-top-0{border-top:0 solid !important}.sd-border-bottom-0{border-bottom:0 solid !important}.sd-border-right-0{border-right:0 solid !important}.sd-border-left-0{border-left:0 solid !important}.sd-border-1{border:1px solid !important}.sd-border-top-1{border-top:1px solid !important}.sd-border-bottom-1{border-bottom:1px solid !important}.sd-border-right-1{border-right:1px solid !important}.sd-border-left-1{border-left:1px solid !important}.sd-border-2{border:2px solid !important}.sd-border-top-2{border-top:2px solid !important}.sd-border-bottom-2{border-bottom:2px solid !important}.sd-border-right-2{border-right:2px solid !important}.sd-border-left-2{border-left:2px solid !important}.sd-border-3{border:3px solid !important}.sd-border-top-3{border-top:3px solid !important}.sd-border-bottom-3{border-bottom:3px solid !important}.sd-border-right-3{border-right:3px solid !important}.sd-border-left-3{border-left:3px solid !important}.sd-border-4{border:4px solid !important}.sd-border-top-4{border-top:4px solid !important}.sd-border-bottom-4{border-bottom:4px solid !important}.sd-border-right-4{border-right:4px solid !important}.sd-border-left-4{border-left:4px solid !important}.sd-border-5{border:5px solid !important}.sd-border-top-5{border-top:5px solid !important}.sd-border-bottom-5{border-bottom:5px solid !important}.sd-border-right-5{border-right:5px solid !important}.sd-border-left-5{border-left:5px solid !important}.sd-rounded-0{border-radius:0 !important}.sd-rounded-1{border-radius:.2rem !important}.sd-rounded-2{border-radius:.3rem !important}.sd-rounded-3{border-radius:.5rem !important}.sd-rounded-pill{border-radius:50rem !important}.sd-rounded-circle{border-radius:50% !important}.shadow-none{box-shadow:none !important}.sd-shadow-sm{box-shadow:0 .125rem .25rem var(--sd-color-shadow) !important}.sd-shadow-md{box-shadow:0 .5rem 1rem var(--sd-color-shadow) !important}.sd-shadow-lg{box-shadow:0 1rem 3rem var(--sd-color-shadow) !important}@keyframes sd-slide-from-left{0%{transform:translateX(-100%)}100%{transform:translateX(0)}}@keyframes sd-slide-from-right{0%{transform:translateX(200%)}100%{transform:translateX(0)}}@keyframes sd-grow100{0%{transform:scale(0);opacity:.5}100%{transform:scale(1);opacity:1}}@keyframes sd-grow50{0%{transform:scale(0.5);opacity:.5}100%{transform:scale(1);opacity:1}}@keyframes sd-grow50-rot20{0%{transform:scale(0.5) rotateZ(-20deg);opacity:.5}75%{transform:scale(1) rotateZ(5deg);opacity:1}95%{transform:scale(1) rotateZ(-1deg);opacity:1}100%{transform:scale(1) rotateZ(0);opacity:1}}.sd-animate-slide-from-left{animation:1s ease-out 0s 1 normal none running sd-slide-from-left}.sd-animate-slide-from-right{animation:1s ease-out 0s 1 normal none running sd-slide-from-right}.sd-animate-grow100{animation:1s ease-out 0s 1 normal none running sd-grow100}.sd-animate-grow50{animation:1s ease-out 0s 1 normal none running sd-grow50}.sd-animate-grow50-rot20{animation:1s ease-out 0s 1 normal none running sd-grow50-rot20}.sd-badge{display:inline-block;padding:.35em .65em;font-size:.75em;font-weight:700;line-height:1;text-align:center;white-space:nowrap;vertical-align:baseline;border-radius:.25rem}.sd-badge:empty{display:none}a.sd-badge{text-decoration:none}.sd-btn .sd-badge{position:relative;top:-1px}.sd-btn{background-color:transparent;border:1px solid transparent;border-radius:.25rem;cursor:pointer;display:inline-block;font-weight:400;font-size:1rem;line-height:1.5;padding:.375rem .75rem;text-align:center;text-decoration:none;transition:color .15s 
ease-in-out,background-color .15s ease-in-out,border-color .15s ease-in-out,box-shadow .15s ease-in-out;vertical-align:middle;user-select:none;-moz-user-select:none;-ms-user-select:none;-webkit-user-select:none}.sd-btn:hover{text-decoration:none}@media(prefers-reduced-motion: reduce){.sd-btn{transition:none}}.sd-btn-primary,.sd-btn-outline-primary:hover,.sd-btn-outline-primary:focus{color:var(--sd-color-primary-text) !important;background-color:var(--sd-color-primary) !important;border-color:var(--sd-color-primary) !important;border-width:1px !important;border-style:solid !important}.sd-btn-primary:hover,.sd-btn-primary:focus{color:var(--sd-color-primary-text) !important;background-color:var(--sd-color-primary-highlight) !important;border-color:var(--sd-color-primary-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-primary{color:var(--sd-color-primary) !important;border-color:var(--sd-color-primary) !important;border-width:1px !important;border-style:solid !important}.sd-btn-secondary,.sd-btn-outline-secondary:hover,.sd-btn-outline-secondary:focus{color:var(--sd-color-secondary-text) !important;background-color:var(--sd-color-secondary) !important;border-color:var(--sd-color-secondary) !important;border-width:1px !important;border-style:solid !important}.sd-btn-secondary:hover,.sd-btn-secondary:focus{color:var(--sd-color-secondary-text) !important;background-color:var(--sd-color-secondary-highlight) !important;border-color:var(--sd-color-secondary-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-secondary{color:var(--sd-color-secondary) !important;border-color:var(--sd-color-secondary) !important;border-width:1px !important;border-style:solid !important}.sd-btn-success,.sd-btn-outline-success:hover,.sd-btn-outline-success:focus{color:var(--sd-color-success-text) !important;background-color:var(--sd-color-success) !important;border-color:var(--sd-color-success) !important;border-width:1px !important;border-style:solid !important}.sd-btn-success:hover,.sd-btn-success:focus{color:var(--sd-color-success-text) !important;background-color:var(--sd-color-success-highlight) !important;border-color:var(--sd-color-success-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-success{color:var(--sd-color-success) !important;border-color:var(--sd-color-success) !important;border-width:1px !important;border-style:solid !important}.sd-btn-info,.sd-btn-outline-info:hover,.sd-btn-outline-info:focus{color:var(--sd-color-info-text) !important;background-color:var(--sd-color-info) !important;border-color:var(--sd-color-info) !important;border-width:1px !important;border-style:solid !important}.sd-btn-info:hover,.sd-btn-info:focus{color:var(--sd-color-info-text) !important;background-color:var(--sd-color-info-highlight) !important;border-color:var(--sd-color-info-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-info{color:var(--sd-color-info) !important;border-color:var(--sd-color-info) !important;border-width:1px !important;border-style:solid !important}.sd-btn-warning,.sd-btn-outline-warning:hover,.sd-btn-outline-warning:focus{color:var(--sd-color-warning-text) !important;background-color:var(--sd-color-warning) !important;border-color:var(--sd-color-warning) !important;border-width:1px !important;border-style:solid !important}.sd-btn-warning:hover,.sd-btn-warning:focus{color:var(--sd-color-warning-text) 
!important;background-color:var(--sd-color-warning-highlight) !important;border-color:var(--sd-color-warning-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-warning{color:var(--sd-color-warning) !important;border-color:var(--sd-color-warning) !important;border-width:1px !important;border-style:solid !important}.sd-btn-danger,.sd-btn-outline-danger:hover,.sd-btn-outline-danger:focus{color:var(--sd-color-danger-text) !important;background-color:var(--sd-color-danger) !important;border-color:var(--sd-color-danger) !important;border-width:1px !important;border-style:solid !important}.sd-btn-danger:hover,.sd-btn-danger:focus{color:var(--sd-color-danger-text) !important;background-color:var(--sd-color-danger-highlight) !important;border-color:var(--sd-color-danger-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-danger{color:var(--sd-color-danger) !important;border-color:var(--sd-color-danger) !important;border-width:1px !important;border-style:solid !important}.sd-btn-light,.sd-btn-outline-light:hover,.sd-btn-outline-light:focus{color:var(--sd-color-light-text) !important;background-color:var(--sd-color-light) !important;border-color:var(--sd-color-light) !important;border-width:1px !important;border-style:solid !important}.sd-btn-light:hover,.sd-btn-light:focus{color:var(--sd-color-light-text) !important;background-color:var(--sd-color-light-highlight) !important;border-color:var(--sd-color-light-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-light{color:var(--sd-color-light) !important;border-color:var(--sd-color-light) !important;border-width:1px !important;border-style:solid !important}.sd-btn-muted,.sd-btn-outline-muted:hover,.sd-btn-outline-muted:focus{color:var(--sd-color-muted-text) !important;background-color:var(--sd-color-muted) !important;border-color:var(--sd-color-muted) !important;border-width:1px !important;border-style:solid !important}.sd-btn-muted:hover,.sd-btn-muted:focus{color:var(--sd-color-muted-text) !important;background-color:var(--sd-color-muted-highlight) !important;border-color:var(--sd-color-muted-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-muted{color:var(--sd-color-muted) !important;border-color:var(--sd-color-muted) !important;border-width:1px !important;border-style:solid !important}.sd-btn-dark,.sd-btn-outline-dark:hover,.sd-btn-outline-dark:focus{color:var(--sd-color-dark-text) !important;background-color:var(--sd-color-dark) !important;border-color:var(--sd-color-dark) !important;border-width:1px !important;border-style:solid !important}.sd-btn-dark:hover,.sd-btn-dark:focus{color:var(--sd-color-dark-text) !important;background-color:var(--sd-color-dark-highlight) !important;border-color:var(--sd-color-dark-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-dark{color:var(--sd-color-dark) !important;border-color:var(--sd-color-dark) !important;border-width:1px !important;border-style:solid !important}.sd-btn-black,.sd-btn-outline-black:hover,.sd-btn-outline-black:focus{color:var(--sd-color-black-text) !important;background-color:var(--sd-color-black) !important;border-color:var(--sd-color-black) !important;border-width:1px !important;border-style:solid !important}.sd-btn-black:hover,.sd-btn-black:focus{color:var(--sd-color-black-text) !important;background-color:var(--sd-color-black-highlight) 
!important;border-color:var(--sd-color-black-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-black{color:var(--sd-color-black) !important;border-color:var(--sd-color-black) !important;border-width:1px !important;border-style:solid !important}.sd-btn-white,.sd-btn-outline-white:hover,.sd-btn-outline-white:focus{color:var(--sd-color-white-text) !important;background-color:var(--sd-color-white) !important;border-color:var(--sd-color-white) !important;border-width:1px !important;border-style:solid !important}.sd-btn-white:hover,.sd-btn-white:focus{color:var(--sd-color-white-text) !important;background-color:var(--sd-color-white-highlight) !important;border-color:var(--sd-color-white-highlight) !important;border-width:1px !important;border-style:solid !important}.sd-btn-outline-white{color:var(--sd-color-white) !important;border-color:var(--sd-color-white) !important;border-width:1px !important;border-style:solid !important}.sd-stretched-link::after{position:absolute;top:0;right:0;bottom:0;left:0;z-index:1;content:""}.sd-hide-link-text{font-size:0}.sd-octicon,.sd-material-icon{display:inline-block;fill:currentColor;vertical-align:middle}.sd-avatar-xs{border-radius:50%;object-fit:cover;object-position:center;width:1rem;height:1rem}.sd-avatar-sm{border-radius:50%;object-fit:cover;object-position:center;width:3rem;height:3rem}.sd-avatar-md{border-radius:50%;object-fit:cover;object-position:center;width:5rem;height:5rem}.sd-avatar-lg{border-radius:50%;object-fit:cover;object-position:center;width:7rem;height:7rem}.sd-avatar-xl{border-radius:50%;object-fit:cover;object-position:center;width:10rem;height:10rem}.sd-avatar-inherit{border-radius:50%;object-fit:cover;object-position:center;width:inherit;height:inherit}.sd-avatar-initial{border-radius:50%;object-fit:cover;object-position:center;width:initial;height:initial}.sd-card{background-clip:border-box;background-color:var(--sd-color-card-background);border:1px solid var(--sd-color-card-border);border-radius:.25rem;color:var(--sd-color-card-text);display:-ms-flexbox;display:flex;-ms-flex-direction:column;flex-direction:column;min-width:0;position:relative;word-wrap:break-word}.sd-card>hr{margin-left:0;margin-right:0}.sd-card-hover:hover{border-color:var(--sd-color-card-border-hover);transform:scale(1.01)}.sd-card-body{-ms-flex:1 1 auto;flex:1 1 auto;padding:1rem 1rem}.sd-card-title{margin-bottom:.5rem}.sd-card-subtitle{margin-top:-0.25rem;margin-bottom:0}.sd-card-text:last-child{margin-bottom:0}.sd-card-link:hover{text-decoration:none}.sd-card-link+.card-link{margin-left:1rem}.sd-card-header{padding:.5rem 1rem;margin-bottom:0;background-color:var(--sd-color-card-header);border-bottom:1px solid var(--sd-color-card-border)}.sd-card-header:first-child{border-radius:calc(0.25rem - 1px) calc(0.25rem - 1px) 0 0}.sd-card-footer{padding:.5rem 1rem;background-color:var(--sd-color-card-footer);border-top:1px solid var(--sd-color-card-border)}.sd-card-footer:last-child{border-radius:0 0 calc(0.25rem - 1px) calc(0.25rem - 1px)}.sd-card-header-tabs{margin-right:-0.5rem;margin-bottom:-0.5rem;margin-left:-0.5rem;border-bottom:0}.sd-card-header-pills{margin-right:-0.5rem;margin-left:-0.5rem}.sd-card-img-overlay{position:absolute;top:0;right:0;bottom:0;left:0;padding:1rem;border-radius:calc(0.25rem - 1px)}.sd-card-img,.sd-card-img-bottom,.sd-card-img-top{width:100%}.sd-card-img,.sd-card-img-top{border-top-left-radius:calc(0.25rem - 1px);border-top-right-radius:calc(0.25rem - 
1px)}.sd-card-img,.sd-card-img-bottom{border-bottom-left-radius:calc(0.25rem - 1px);border-bottom-right-radius:calc(0.25rem - 1px)}.sd-cards-carousel{width:100%;display:flex;flex-wrap:nowrap;-ms-flex-direction:row;flex-direction:row;overflow-x:hidden;scroll-snap-type:x mandatory}.sd-cards-carousel.sd-show-scrollbar{overflow-x:auto}.sd-cards-carousel:hover,.sd-cards-carousel:focus{overflow-x:auto}.sd-cards-carousel>.sd-card{flex-shrink:0;scroll-snap-align:start}.sd-cards-carousel>.sd-card:not(:last-child){margin-right:3px}.sd-card-cols-1>.sd-card{width:90%}.sd-card-cols-2>.sd-card{width:45%}.sd-card-cols-3>.sd-card{width:30%}.sd-card-cols-4>.sd-card{width:22.5%}.sd-card-cols-5>.sd-card{width:18%}.sd-card-cols-6>.sd-card{width:15%}.sd-card-cols-7>.sd-card{width:12.8571428571%}.sd-card-cols-8>.sd-card{width:11.25%}.sd-card-cols-9>.sd-card{width:10%}.sd-card-cols-10>.sd-card{width:9%}.sd-card-cols-11>.sd-card{width:8.1818181818%}.sd-card-cols-12>.sd-card{width:7.5%}.sd-container,.sd-container-fluid,.sd-container-lg,.sd-container-md,.sd-container-sm,.sd-container-xl{margin-left:auto;margin-right:auto;padding-left:var(--sd-gutter-x, 0.75rem);padding-right:var(--sd-gutter-x, 0.75rem);width:100%}@media(min-width: 576px){.sd-container-sm,.sd-container{max-width:540px}}@media(min-width: 768px){.sd-container-md,.sd-container-sm,.sd-container{max-width:720px}}@media(min-width: 992px){.sd-container-lg,.sd-container-md,.sd-container-sm,.sd-container{max-width:960px}}@media(min-width: 1200px){.sd-container-xl,.sd-container-lg,.sd-container-md,.sd-container-sm,.sd-container{max-width:1140px}}.sd-row{--sd-gutter-x: 1.5rem;--sd-gutter-y: 0;display:-ms-flexbox;display:flex;-ms-flex-wrap:wrap;flex-wrap:wrap;margin-top:calc(var(--sd-gutter-y) * -1);margin-right:calc(var(--sd-gutter-x) * -0.5);margin-left:calc(var(--sd-gutter-x) * -0.5)}.sd-row>*{box-sizing:border-box;flex-shrink:0;width:100%;max-width:100%;padding-right:calc(var(--sd-gutter-x) * 0.5);padding-left:calc(var(--sd-gutter-x) * 0.5);margin-top:var(--sd-gutter-y)}.sd-col{flex:1 0 0%;-ms-flex:1 0 0%}.sd-row-cols-auto>*{flex:0 0 auto;width:auto}.sd-row-cols-1>*{flex:0 0 auto;-ms-flex:0 0 auto;width:100%}.sd-row-cols-2>*{flex:0 0 auto;-ms-flex:0 0 auto;width:50%}.sd-row-cols-3>*{flex:0 0 auto;-ms-flex:0 0 auto;width:33.3333333333%}.sd-row-cols-4>*{flex:0 0 auto;-ms-flex:0 0 auto;width:25%}.sd-row-cols-5>*{flex:0 0 auto;-ms-flex:0 0 auto;width:20%}.sd-row-cols-6>*{flex:0 0 auto;-ms-flex:0 0 auto;width:16.6666666667%}.sd-row-cols-7>*{flex:0 0 auto;-ms-flex:0 0 auto;width:14.2857142857%}.sd-row-cols-8>*{flex:0 0 auto;-ms-flex:0 0 auto;width:12.5%}.sd-row-cols-9>*{flex:0 0 auto;-ms-flex:0 0 auto;width:11.1111111111%}.sd-row-cols-10>*{flex:0 0 auto;-ms-flex:0 0 auto;width:10%}.sd-row-cols-11>*{flex:0 0 auto;-ms-flex:0 0 auto;width:9.0909090909%}.sd-row-cols-12>*{flex:0 0 auto;-ms-flex:0 0 auto;width:8.3333333333%}@media(min-width: 576px){.sd-col-sm{flex:1 0 0%;-ms-flex:1 0 0%}.sd-row-cols-sm-auto{flex:1 0 auto;-ms-flex:1 0 auto;width:100%}.sd-row-cols-sm-1>*{flex:0 0 auto;-ms-flex:0 0 auto;width:100%}.sd-row-cols-sm-2>*{flex:0 0 auto;-ms-flex:0 0 auto;width:50%}.sd-row-cols-sm-3>*{flex:0 0 auto;-ms-flex:0 0 auto;width:33.3333333333%}.sd-row-cols-sm-4>*{flex:0 0 auto;-ms-flex:0 0 auto;width:25%}.sd-row-cols-sm-5>*{flex:0 0 auto;-ms-flex:0 0 auto;width:20%}.sd-row-cols-sm-6>*{flex:0 0 auto;-ms-flex:0 0 auto;width:16.6666666667%}.sd-row-cols-sm-7>*{flex:0 0 auto;-ms-flex:0 0 auto;width:14.2857142857%}.sd-row-cols-sm-8>*{flex:0 0 auto;-ms-flex:0 0 
auto;width:12.5%}.sd-row-cols-sm-9>*{flex:0 0 auto;-ms-flex:0 0 auto;width:11.1111111111%}.sd-row-cols-sm-10>*{flex:0 0 auto;-ms-flex:0 0 auto;width:10%}.sd-row-cols-sm-11>*{flex:0 0 auto;-ms-flex:0 0 auto;width:9.0909090909%}.sd-row-cols-sm-12>*{flex:0 0 auto;-ms-flex:0 0 auto;width:8.3333333333%}}@media(min-width: 768px){.sd-col-md{flex:1 0 0%;-ms-flex:1 0 0%}.sd-row-cols-md-auto{flex:1 0 auto;-ms-flex:1 0 auto;width:100%}.sd-row-cols-md-1>*{flex:0 0 auto;-ms-flex:0 0 auto;width:100%}.sd-row-cols-md-2>*{flex:0 0 auto;-ms-flex:0 0 auto;width:50%}.sd-row-cols-md-3>*{flex:0 0 auto;-ms-flex:0 0 auto;width:33.3333333333%}.sd-row-cols-md-4>*{flex:0 0 auto;-ms-flex:0 0 auto;width:25%}.sd-row-cols-md-5>*{flex:0 0 auto;-ms-flex:0 0 auto;width:20%}.sd-row-cols-md-6>*{flex:0 0 auto;-ms-flex:0 0 auto;width:16.6666666667%}.sd-row-cols-md-7>*{flex:0 0 auto;-ms-flex:0 0 auto;width:14.2857142857%}.sd-row-cols-md-8>*{flex:0 0 auto;-ms-flex:0 0 auto;width:12.5%}.sd-row-cols-md-9>*{flex:0 0 auto;-ms-flex:0 0 auto;width:11.1111111111%}.sd-row-cols-md-10>*{flex:0 0 auto;-ms-flex:0 0 auto;width:10%}.sd-row-cols-md-11>*{flex:0 0 auto;-ms-flex:0 0 auto;width:9.0909090909%}.sd-row-cols-md-12>*{flex:0 0 auto;-ms-flex:0 0 auto;width:8.3333333333%}}@media(min-width: 992px){.sd-col-lg{flex:1 0 0%;-ms-flex:1 0 0%}.sd-row-cols-lg-auto{flex:1 0 auto;-ms-flex:1 0 auto;width:100%}.sd-row-cols-lg-1>*{flex:0 0 auto;-ms-flex:0 0 auto;width:100%}.sd-row-cols-lg-2>*{flex:0 0 auto;-ms-flex:0 0 auto;width:50%}.sd-row-cols-lg-3>*{flex:0 0 auto;-ms-flex:0 0 auto;width:33.3333333333%}.sd-row-cols-lg-4>*{flex:0 0 auto;-ms-flex:0 0 auto;width:25%}.sd-row-cols-lg-5>*{flex:0 0 auto;-ms-flex:0 0 auto;width:20%}.sd-row-cols-lg-6>*{flex:0 0 auto;-ms-flex:0 0 auto;width:16.6666666667%}.sd-row-cols-lg-7>*{flex:0 0 auto;-ms-flex:0 0 auto;width:14.2857142857%}.sd-row-cols-lg-8>*{flex:0 0 auto;-ms-flex:0 0 auto;width:12.5%}.sd-row-cols-lg-9>*{flex:0 0 auto;-ms-flex:0 0 auto;width:11.1111111111%}.sd-row-cols-lg-10>*{flex:0 0 auto;-ms-flex:0 0 auto;width:10%}.sd-row-cols-lg-11>*{flex:0 0 auto;-ms-flex:0 0 auto;width:9.0909090909%}.sd-row-cols-lg-12>*{flex:0 0 auto;-ms-flex:0 0 auto;width:8.3333333333%}}@media(min-width: 1200px){.sd-col-xl{flex:1 0 0%;-ms-flex:1 0 0%}.sd-row-cols-xl-auto{flex:1 0 auto;-ms-flex:1 0 auto;width:100%}.sd-row-cols-xl-1>*{flex:0 0 auto;-ms-flex:0 0 auto;width:100%}.sd-row-cols-xl-2>*{flex:0 0 auto;-ms-flex:0 0 auto;width:50%}.sd-row-cols-xl-3>*{flex:0 0 auto;-ms-flex:0 0 auto;width:33.3333333333%}.sd-row-cols-xl-4>*{flex:0 0 auto;-ms-flex:0 0 auto;width:25%}.sd-row-cols-xl-5>*{flex:0 0 auto;-ms-flex:0 0 auto;width:20%}.sd-row-cols-xl-6>*{flex:0 0 auto;-ms-flex:0 0 auto;width:16.6666666667%}.sd-row-cols-xl-7>*{flex:0 0 auto;-ms-flex:0 0 auto;width:14.2857142857%}.sd-row-cols-xl-8>*{flex:0 0 auto;-ms-flex:0 0 auto;width:12.5%}.sd-row-cols-xl-9>*{flex:0 0 auto;-ms-flex:0 0 auto;width:11.1111111111%}.sd-row-cols-xl-10>*{flex:0 0 auto;-ms-flex:0 0 auto;width:10%}.sd-row-cols-xl-11>*{flex:0 0 auto;-ms-flex:0 0 auto;width:9.0909090909%}.sd-row-cols-xl-12>*{flex:0 0 auto;-ms-flex:0 0 auto;width:8.3333333333%}}.sd-col-auto{flex:0 0 auto;-ms-flex:0 0 auto;width:auto}.sd-col-1{flex:0 0 auto;-ms-flex:0 0 auto;width:8.3333333333%}.sd-col-2{flex:0 0 auto;-ms-flex:0 0 auto;width:16.6666666667%}.sd-col-3{flex:0 0 auto;-ms-flex:0 0 auto;width:25%}.sd-col-4{flex:0 0 auto;-ms-flex:0 0 auto;width:33.3333333333%}.sd-col-5{flex:0 0 auto;-ms-flex:0 0 auto;width:41.6666666667%}.sd-col-6{flex:0 0 auto;-ms-flex:0 0 
auto;width:50%}.sd-col-7{flex:0 0 auto;-ms-flex:0 0 auto;width:58.3333333333%}.sd-col-8{flex:0 0 auto;-ms-flex:0 0 auto;width:66.6666666667%}.sd-col-9{flex:0 0 auto;-ms-flex:0 0 auto;width:75%}.sd-col-10{flex:0 0 auto;-ms-flex:0 0 auto;width:83.3333333333%}.sd-col-11{flex:0 0 auto;-ms-flex:0 0 auto;width:91.6666666667%}.sd-col-12{flex:0 0 auto;-ms-flex:0 0 auto;width:100%}.sd-g-0,.sd-gy-0{--sd-gutter-y: 0}.sd-g-0,.sd-gx-0{--sd-gutter-x: 0}.sd-g-1,.sd-gy-1{--sd-gutter-y: 0.25rem}.sd-g-1,.sd-gx-1{--sd-gutter-x: 0.25rem}.sd-g-2,.sd-gy-2{--sd-gutter-y: 0.5rem}.sd-g-2,.sd-gx-2{--sd-gutter-x: 0.5rem}.sd-g-3,.sd-gy-3{--sd-gutter-y: 1rem}.sd-g-3,.sd-gx-3{--sd-gutter-x: 1rem}.sd-g-4,.sd-gy-4{--sd-gutter-y: 1.5rem}.sd-g-4,.sd-gx-4{--sd-gutter-x: 1.5rem}.sd-g-5,.sd-gy-5{--sd-gutter-y: 3rem}.sd-g-5,.sd-gx-5{--sd-gutter-x: 3rem}@media(min-width: 576px){.sd-col-sm-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.sd-col-sm-1{-ms-flex:0 0 auto;flex:0 0 auto;width:8.3333333333%}.sd-col-sm-2{-ms-flex:0 0 auto;flex:0 0 auto;width:16.6666666667%}.sd-col-sm-3{-ms-flex:0 0 auto;flex:0 0 auto;width:25%}.sd-col-sm-4{-ms-flex:0 0 auto;flex:0 0 auto;width:33.3333333333%}.sd-col-sm-5{-ms-flex:0 0 auto;flex:0 0 auto;width:41.6666666667%}.sd-col-sm-6{-ms-flex:0 0 auto;flex:0 0 auto;width:50%}.sd-col-sm-7{-ms-flex:0 0 auto;flex:0 0 auto;width:58.3333333333%}.sd-col-sm-8{-ms-flex:0 0 auto;flex:0 0 auto;width:66.6666666667%}.sd-col-sm-9{-ms-flex:0 0 auto;flex:0 0 auto;width:75%}.sd-col-sm-10{-ms-flex:0 0 auto;flex:0 0 auto;width:83.3333333333%}.sd-col-sm-11{-ms-flex:0 0 auto;flex:0 0 auto;width:91.6666666667%}.sd-col-sm-12{-ms-flex:0 0 auto;flex:0 0 auto;width:100%}.sd-g-sm-0,.sd-gy-sm-0{--sd-gutter-y: 0}.sd-g-sm-0,.sd-gx-sm-0{--sd-gutter-x: 0}.sd-g-sm-1,.sd-gy-sm-1{--sd-gutter-y: 0.25rem}.sd-g-sm-1,.sd-gx-sm-1{--sd-gutter-x: 0.25rem}.sd-g-sm-2,.sd-gy-sm-2{--sd-gutter-y: 0.5rem}.sd-g-sm-2,.sd-gx-sm-2{--sd-gutter-x: 0.5rem}.sd-g-sm-3,.sd-gy-sm-3{--sd-gutter-y: 1rem}.sd-g-sm-3,.sd-gx-sm-3{--sd-gutter-x: 1rem}.sd-g-sm-4,.sd-gy-sm-4{--sd-gutter-y: 1.5rem}.sd-g-sm-4,.sd-gx-sm-4{--sd-gutter-x: 1.5rem}.sd-g-sm-5,.sd-gy-sm-5{--sd-gutter-y: 3rem}.sd-g-sm-5,.sd-gx-sm-5{--sd-gutter-x: 3rem}}@media(min-width: 768px){.sd-col-md-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.sd-col-md-1{-ms-flex:0 0 auto;flex:0 0 auto;width:8.3333333333%}.sd-col-md-2{-ms-flex:0 0 auto;flex:0 0 auto;width:16.6666666667%}.sd-col-md-3{-ms-flex:0 0 auto;flex:0 0 auto;width:25%}.sd-col-md-4{-ms-flex:0 0 auto;flex:0 0 auto;width:33.3333333333%}.sd-col-md-5{-ms-flex:0 0 auto;flex:0 0 auto;width:41.6666666667%}.sd-col-md-6{-ms-flex:0 0 auto;flex:0 0 auto;width:50%}.sd-col-md-7{-ms-flex:0 0 auto;flex:0 0 auto;width:58.3333333333%}.sd-col-md-8{-ms-flex:0 0 auto;flex:0 0 auto;width:66.6666666667%}.sd-col-md-9{-ms-flex:0 0 auto;flex:0 0 auto;width:75%}.sd-col-md-10{-ms-flex:0 0 auto;flex:0 0 auto;width:83.3333333333%}.sd-col-md-11{-ms-flex:0 0 auto;flex:0 0 auto;width:91.6666666667%}.sd-col-md-12{-ms-flex:0 0 auto;flex:0 0 auto;width:100%}.sd-g-md-0,.sd-gy-md-0{--sd-gutter-y: 0}.sd-g-md-0,.sd-gx-md-0{--sd-gutter-x: 0}.sd-g-md-1,.sd-gy-md-1{--sd-gutter-y: 0.25rem}.sd-g-md-1,.sd-gx-md-1{--sd-gutter-x: 0.25rem}.sd-g-md-2,.sd-gy-md-2{--sd-gutter-y: 0.5rem}.sd-g-md-2,.sd-gx-md-2{--sd-gutter-x: 0.5rem}.sd-g-md-3,.sd-gy-md-3{--sd-gutter-y: 1rem}.sd-g-md-3,.sd-gx-md-3{--sd-gutter-x: 1rem}.sd-g-md-4,.sd-gy-md-4{--sd-gutter-y: 1.5rem}.sd-g-md-4,.sd-gx-md-4{--sd-gutter-x: 1.5rem}.sd-g-md-5,.sd-gy-md-5{--sd-gutter-y: 3rem}.sd-g-md-5,.sd-gx-md-5{--sd-gutter-x: 
3rem}}@media(min-width: 992px){.sd-col-lg-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.sd-col-lg-1{-ms-flex:0 0 auto;flex:0 0 auto;width:8.3333333333%}.sd-col-lg-2{-ms-flex:0 0 auto;flex:0 0 auto;width:16.6666666667%}.sd-col-lg-3{-ms-flex:0 0 auto;flex:0 0 auto;width:25%}.sd-col-lg-4{-ms-flex:0 0 auto;flex:0 0 auto;width:33.3333333333%}.sd-col-lg-5{-ms-flex:0 0 auto;flex:0 0 auto;width:41.6666666667%}.sd-col-lg-6{-ms-flex:0 0 auto;flex:0 0 auto;width:50%}.sd-col-lg-7{-ms-flex:0 0 auto;flex:0 0 auto;width:58.3333333333%}.sd-col-lg-8{-ms-flex:0 0 auto;flex:0 0 auto;width:66.6666666667%}.sd-col-lg-9{-ms-flex:0 0 auto;flex:0 0 auto;width:75%}.sd-col-lg-10{-ms-flex:0 0 auto;flex:0 0 auto;width:83.3333333333%}.sd-col-lg-11{-ms-flex:0 0 auto;flex:0 0 auto;width:91.6666666667%}.sd-col-lg-12{-ms-flex:0 0 auto;flex:0 0 auto;width:100%}.sd-g-lg-0,.sd-gy-lg-0{--sd-gutter-y: 0}.sd-g-lg-0,.sd-gx-lg-0{--sd-gutter-x: 0}.sd-g-lg-1,.sd-gy-lg-1{--sd-gutter-y: 0.25rem}.sd-g-lg-1,.sd-gx-lg-1{--sd-gutter-x: 0.25rem}.sd-g-lg-2,.sd-gy-lg-2{--sd-gutter-y: 0.5rem}.sd-g-lg-2,.sd-gx-lg-2{--sd-gutter-x: 0.5rem}.sd-g-lg-3,.sd-gy-lg-3{--sd-gutter-y: 1rem}.sd-g-lg-3,.sd-gx-lg-3{--sd-gutter-x: 1rem}.sd-g-lg-4,.sd-gy-lg-4{--sd-gutter-y: 1.5rem}.sd-g-lg-4,.sd-gx-lg-4{--sd-gutter-x: 1.5rem}.sd-g-lg-5,.sd-gy-lg-5{--sd-gutter-y: 3rem}.sd-g-lg-5,.sd-gx-lg-5{--sd-gutter-x: 3rem}}@media(min-width: 1200px){.sd-col-xl-auto{-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.sd-col-xl-1{-ms-flex:0 0 auto;flex:0 0 auto;width:8.3333333333%}.sd-col-xl-2{-ms-flex:0 0 auto;flex:0 0 auto;width:16.6666666667%}.sd-col-xl-3{-ms-flex:0 0 auto;flex:0 0 auto;width:25%}.sd-col-xl-4{-ms-flex:0 0 auto;flex:0 0 auto;width:33.3333333333%}.sd-col-xl-5{-ms-flex:0 0 auto;flex:0 0 auto;width:41.6666666667%}.sd-col-xl-6{-ms-flex:0 0 auto;flex:0 0 auto;width:50%}.sd-col-xl-7{-ms-flex:0 0 auto;flex:0 0 auto;width:58.3333333333%}.sd-col-xl-8{-ms-flex:0 0 auto;flex:0 0 auto;width:66.6666666667%}.sd-col-xl-9{-ms-flex:0 0 auto;flex:0 0 auto;width:75%}.sd-col-xl-10{-ms-flex:0 0 auto;flex:0 0 auto;width:83.3333333333%}.sd-col-xl-11{-ms-flex:0 0 auto;flex:0 0 auto;width:91.6666666667%}.sd-col-xl-12{-ms-flex:0 0 auto;flex:0 0 auto;width:100%}.sd-g-xl-0,.sd-gy-xl-0{--sd-gutter-y: 0}.sd-g-xl-0,.sd-gx-xl-0{--sd-gutter-x: 0}.sd-g-xl-1,.sd-gy-xl-1{--sd-gutter-y: 0.25rem}.sd-g-xl-1,.sd-gx-xl-1{--sd-gutter-x: 0.25rem}.sd-g-xl-2,.sd-gy-xl-2{--sd-gutter-y: 0.5rem}.sd-g-xl-2,.sd-gx-xl-2{--sd-gutter-x: 0.5rem}.sd-g-xl-3,.sd-gy-xl-3{--sd-gutter-y: 1rem}.sd-g-xl-3,.sd-gx-xl-3{--sd-gutter-x: 1rem}.sd-g-xl-4,.sd-gy-xl-4{--sd-gutter-y: 1.5rem}.sd-g-xl-4,.sd-gx-xl-4{--sd-gutter-x: 1.5rem}.sd-g-xl-5,.sd-gy-xl-5{--sd-gutter-y: 3rem}.sd-g-xl-5,.sd-gx-xl-5{--sd-gutter-x: 3rem}}.sd-flex-row-reverse{flex-direction:row-reverse !important}details.sd-dropdown{position:relative}details.sd-dropdown .sd-summary-title{font-weight:700;padding-right:3em !important;-moz-user-select:none;-ms-user-select:none;-webkit-user-select:none;user-select:none}details.sd-dropdown:hover{cursor:pointer}details.sd-dropdown .sd-summary-content{cursor:default}details.sd-dropdown summary{list-style:none;padding:1em}details.sd-dropdown summary .sd-octicon.no-title{vertical-align:middle}details.sd-dropdown[open] summary .sd-octicon.no-title{visibility:hidden}details.sd-dropdown summary::-webkit-details-marker{display:none}details.sd-dropdown summary:focus{outline:none}details.sd-dropdown .sd-summary-icon{margin-right:.5em}details.sd-dropdown .sd-summary-icon svg{opacity:.8}details.sd-dropdown summary:hover 
.sd-summary-up svg,details.sd-dropdown summary:hover .sd-summary-down svg{opacity:1;transform:scale(1.1)}details.sd-dropdown .sd-summary-up svg,details.sd-dropdown .sd-summary-down svg{display:block;opacity:.6}details.sd-dropdown .sd-summary-up,details.sd-dropdown .sd-summary-down{pointer-events:none;position:absolute;right:1em;top:1em}details.sd-dropdown[open]>.sd-summary-title .sd-summary-down{visibility:hidden}details.sd-dropdown:not([open])>.sd-summary-title .sd-summary-up{visibility:hidden}details.sd-dropdown:not([open]).sd-card{border:none}details.sd-dropdown:not([open])>.sd-card-header{border:1px solid var(--sd-color-card-border);border-radius:.25rem}details.sd-dropdown.sd-fade-in[open] summary~*{-moz-animation:sd-fade-in .5s ease-in-out;-webkit-animation:sd-fade-in .5s ease-in-out;animation:sd-fade-in .5s ease-in-out}details.sd-dropdown.sd-fade-in-slide-down[open] summary~*{-moz-animation:sd-fade-in .5s ease-in-out,sd-slide-down .5s ease-in-out;-webkit-animation:sd-fade-in .5s ease-in-out,sd-slide-down .5s ease-in-out;animation:sd-fade-in .5s ease-in-out,sd-slide-down .5s ease-in-out}.sd-col>.sd-dropdown{width:100%}.sd-summary-content>.sd-tab-set:first-child{margin-top:0}@keyframes sd-fade-in{0%{opacity:0}100%{opacity:1}}@keyframes sd-slide-down{0%{transform:translate(0, -10px)}100%{transform:translate(0, 0)}}.sd-tab-set{border-radius:.125rem;display:flex;flex-wrap:wrap;margin:1em 0;position:relative}.sd-tab-set>input{opacity:0;position:absolute}.sd-tab-set>input:checked+label{border-color:var(--sd-color-tabs-underline-active);color:var(--sd-color-tabs-label-active)}.sd-tab-set>input:checked+label+.sd-tab-content{display:block}.sd-tab-set>input:not(:checked)+label:hover{color:var(--sd-color-tabs-label-hover);border-color:var(--sd-color-tabs-underline-hover)}.sd-tab-set>input:focus+label{outline-style:auto}.sd-tab-set>input:not(.focus-visible)+label{outline:none;-webkit-tap-highlight-color:transparent}.sd-tab-set>label{border-bottom:.125rem solid transparent;margin-bottom:0;color:var(--sd-color-tabs-label-inactive);border-color:var(--sd-color-tabs-underline-inactive);cursor:pointer;font-size:var(--sd-fontsize-tabs-label);font-weight:700;padding:1em 1.25em .5em;transition:color 250ms;width:auto;z-index:1}html .sd-tab-set>label:hover{color:var(--sd-color-tabs-label-active)}.sd-col>.sd-tab-set{width:100%}.sd-tab-content{box-shadow:0 -0.0625rem var(--sd-color-tabs-overline),0 .0625rem var(--sd-color-tabs-underline);display:none;order:99;padding-bottom:.75rem;padding-top:.75rem;width:100%}.sd-tab-content>:first-child{margin-top:0 !important}.sd-tab-content>:last-child{margin-bottom:0 !important}.sd-tab-content>.sd-tab-set{margin:0}.sd-sphinx-override,.sd-sphinx-override *{-moz-box-sizing:border-box;-webkit-box-sizing:border-box;box-sizing:border-box}.sd-sphinx-override p{margin-top:0}:root{--sd-color-primary: #007bff;--sd-color-secondary: #6c757d;--sd-color-success: #28a745;--sd-color-info: #17a2b8;--sd-color-warning: #f0b37e;--sd-color-danger: #dc3545;--sd-color-light: #f8f9fa;--sd-color-muted: #6c757d;--sd-color-dark: #212529;--sd-color-black: black;--sd-color-white: white;--sd-color-primary-highlight: #0069d9;--sd-color-secondary-highlight: #5c636a;--sd-color-success-highlight: #228e3b;--sd-color-info-highlight: #148a9c;--sd-color-warning-highlight: #cc986b;--sd-color-danger-highlight: #bb2d3b;--sd-color-light-highlight: #d3d4d5;--sd-color-muted-highlight: #5c636a;--sd-color-dark-highlight: #1c1f23;--sd-color-black-highlight: black;--sd-color-white-highlight: 
#d9d9d9;--sd-color-primary-text: #fff;--sd-color-secondary-text: #fff;--sd-color-success-text: #fff;--sd-color-info-text: #fff;--sd-color-warning-text: #212529;--sd-color-danger-text: #fff;--sd-color-light-text: #212529;--sd-color-muted-text: #fff;--sd-color-dark-text: #fff;--sd-color-black-text: #fff;--sd-color-white-text: #212529;--sd-color-shadow: rgba(0, 0, 0, 0.15);--sd-color-card-border: rgba(0, 0, 0, 0.125);--sd-color-card-border-hover: hsla(231, 99%, 66%, 1);--sd-color-card-background: transparent;--sd-color-card-text: inherit;--sd-color-card-header: transparent;--sd-color-card-footer: transparent;--sd-color-tabs-label-active: hsla(231, 99%, 66%, 1);--sd-color-tabs-label-hover: hsla(231, 99%, 66%, 1);--sd-color-tabs-label-inactive: hsl(0, 0%, 66%);--sd-color-tabs-underline-active: hsla(231, 99%, 66%, 1);--sd-color-tabs-underline-hover: rgba(178, 206, 245, 0.62);--sd-color-tabs-underline-inactive: transparent;--sd-color-tabs-overline: rgb(222, 222, 222);--sd-color-tabs-underline: rgb(222, 222, 222);--sd-fontsize-tabs-label: 1rem} diff --git a/_static/design-tabs.js b/_static/design-tabs.js new file mode 100644 index 0000000000..36b38cf0d9 --- /dev/null +++ b/_static/design-tabs.js @@ -0,0 +1,27 @@ +var sd_labels_by_text = {}; + +function ready() { + const li = document.getElementsByClassName("sd-tab-label"); + for (const label of li) { + syncId = label.getAttribute("data-sync-id"); + if (syncId) { + label.onclick = onLabelClick; + if (!sd_labels_by_text[syncId]) { + sd_labels_by_text[syncId] = []; + } + sd_labels_by_text[syncId].push(label); + } + } +} + +function onLabelClick() { + // Activate other inputs with the same sync id. + syncId = this.getAttribute("data-sync-id"); + for (label of sd_labels_by_text[syncId]) { + if (label === this) continue; + label.previousElementSibling.checked = true; + } + window.localStorage.setItem("sphinx-design-last-tab", syncId); +} + +document.addEventListener("DOMContentLoaded", ready, false); diff --git a/_static/doctools.js b/_static/doctools.js new file mode 100644 index 0000000000..c3db08d1c3 --- /dev/null +++ b/_static/doctools.js @@ -0,0 +1,264 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Base JavaScript utilities for all Sphinx HTML documentation. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. 
+ */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + parent.insertBefore( + span, + parent.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const Documentation = { + init: () => { + Documentation.highlightSearchWords(); + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords: () => { + const highlight = + new URLSearchParams(window.location.search).get("highlight") || ""; + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? 
divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '<p class="highlight-link">' + + '<a href="javascript:Documentation.hideSearchWords()">' + + Documentation.gettext("Hide Search Matches") + + "</a></p>
" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + const url = new URL(window.location); + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + const blacklistedElements = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", + ]); + document.addEventListener("keydown", (event) => { + if (blacklistedElements.has(document.activeElement.tagName)) return; // bail for input elements + if (event.altKey || event.ctrlKey || event.metaKey) return; // bail with special keys + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + case "Escape": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.hideSearchWords(); + event.preventDefault(); + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js new file mode 100644 index 0000000000..162a6ba8d8 --- /dev/null +++ b/_static/documentation_options.js @@ -0,0 +1,14 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '', + NAVIGATION_WITH_KEYS: false, + 
SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: false, +}; \ No newline at end of file diff --git a/_static/file.png b/_static/file.png new file mode 100644 index 0000000000..a858a410e4 Binary files /dev/null and b/_static/file.png differ diff --git a/_static/images/logo_binder.svg b/_static/images/logo_binder.svg new file mode 100644 index 0000000000..45fecf7511 --- /dev/null +++ b/_static/images/logo_binder.svg @@ -0,0 +1,19 @@ + + + diff --git a/_static/images/logo_colab.png b/_static/images/logo_colab.png new file mode 100644 index 0000000000..b7560ec216 Binary files /dev/null and b/_static/images/logo_colab.png differ diff --git a/_static/images/logo_deepnote.svg b/_static/images/logo_deepnote.svg new file mode 100644 index 0000000000..fa77ebfc25 --- /dev/null +++ b/_static/images/logo_deepnote.svg @@ -0,0 +1 @@ + diff --git a/_static/images/logo_jupyterhub.svg b/_static/images/logo_jupyterhub.svg new file mode 100644 index 0000000000..60cfe9f222 --- /dev/null +++ b/_static/images/logo_jupyterhub.svg @@ -0,0 +1 @@ + diff --git a/_static/jquery-3.6.0.js b/_static/jquery-3.6.0.js new file mode 100644 index 0000000000..fc6c299b73 --- /dev/null +++ b/_static/jquery-3.6.0.js @@ -0,0 +1,10881 @@ +/*! + * jQuery JavaScript Library v3.6.0 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright OpenJS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2021-03-02T17:08Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. +"use strict"; + +var arr = []; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var flat = arr.flat ? function( array ) { + return arr.flat.call( array ); +} : function( array ) { + return arr.concat.apply( [], array ); +}; + + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + +var isFunction = function isFunction( obj ) { + + // Support: Chrome <=57, Firefox <=52 + // In some browsers, typeof returns "function" for HTML