diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml
index d74b88160..85ae9f84a 100644
--- a/.github/actions/azureml-test/action.yml
+++ b/.github/actions/azureml-test/action.yml
@@ -76,7 +76,7 @@ runs:
shell: bash
run: pip install --quiet "azureml-core>1,<2" "azure-cli>2,<3"
- name: Log in to Azure
- uses: azure/login@v1
+ uses: azure/login@v2
with:
creds: ${{inputs.AZUREML_TEST_CREDENTIALS}}
- name: Install wheel package
diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml
index 72bb700cf..93e414564 100644
--- a/.github/workflows/azureml-cpu-nightly.yml
+++ b/.github/workflows/azureml-cpu-nightly.yml
@@ -67,7 +67,7 @@ jobs:
strategy:
max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
matrix:
- python-version: ['"python=3.8"', '"python=3.9"']
+ python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"']
test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }}
steps:
- name: Check out repository code
diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml
index efac48774..3b9f6d6b4 100644
--- a/.github/workflows/azureml-gpu-nightly.yml
+++ b/.github/workflows/azureml-gpu-nightly.yml
@@ -67,7 +67,7 @@ jobs:
strategy:
max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
matrix:
- python-version: ['"python=3.8"', '"python=3.9"']
+ python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"']
test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }}
steps:
- name: Check out repository code
diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml
index b3a76f9ea..8f28be6f2 100644
--- a/.github/workflows/azureml-spark-nightly.yml
+++ b/.github/workflows/azureml-spark-nightly.yml
@@ -66,7 +66,7 @@ jobs:
strategy:
max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
matrix:
- python-version: ['"python=3.8"', '"python=3.9"']
+ python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"']
test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }}
steps:
- name: Check out repository code
diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml
index 13ed56005..b39268318 100644
--- a/.github/workflows/azureml-unit-tests.yml
+++ b/.github/workflows/azureml-unit-tests.yml
@@ -56,7 +56,7 @@ jobs:
strategy:
max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
matrix:
- python-version: ['"python=3.8"', '"python=3.9"']
+ python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"']
test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }}
steps:
- name: Check out repository code
diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml
index 766b31645..90d03fef6 100644
--- a/.github/workflows/sarplus.yml
+++ b/.github/workflows/sarplus.yml
@@ -39,7 +39,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
- python-version: ["3.8", "3.9"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v4
diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py
index 4009ec751..f755f5310 100644
--- a/contrib/sarplus/python/setup.py
+++ b/contrib/sarplus/python/setup.py
@@ -42,6 +42,7 @@ def __str__(self):
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Mathematics",
@@ -49,7 +50,7 @@ def __str__(self):
setup_requires=["pytest-runner"],
install_requires=DEPENDENCIES,
tests_require=["pytest"],
- python_requires=">=3.6,<3.11",
+ python_requires=">=3.6,<3.12",
packages=["pysarplus"],
package_data={"": ["VERSION"]},
ext_modules=[
diff --git a/examples/00_quick_start/fastai_movielens.ipynb b/examples/00_quick_start/fastai_movielens.ipynb
index 517673178..944b92623 100644
--- a/examples/00_quick_start/fastai_movielens.ipynb
+++ b/examples/00_quick_start/fastai_movielens.ipynb
@@ -27,17 +27,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "System version: 3.6.11 | packaged by conda-forge | (default, Aug 5 2020, 20:09:42) \n",
- "[GCC 7.5.0]\n",
- "Pandas version: 0.25.3\n",
- "Fast AI version: 1.0.46\n",
- "Torch version: 1.4.0\n",
- "Cuda Available: False\n",
+ "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n",
+ "[GCC 11.2.0]\n",
+ "Pandas version: 1.5.3\n",
+ "Fast AI version: 2.7.11\n",
+ "Torch version: 1.13.1+cu117\n",
+ "CUDA Available: True\n",
"CuDNN Enabled: True\n"
]
}
],
"source": [
+ "# Suppress all warnings\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
"import os\n",
"import sys\n",
"import numpy as np\n",
@@ -46,7 +50,7 @@
"import fastai\n",
"from tempfile import TemporaryDirectory\n",
"\n",
- "from fastai.collab import collab_learner, CollabDataBunch, load_learner\n",
+ "from fastai.collab import collab_learner, CollabDataLoaders, load_learner\n",
"\n",
"from recommenders.utils.constants import (\n",
" DEFAULT_USER_COL as USER, \n",
@@ -67,7 +71,7 @@
"print(\"Pandas version: {}\".format(pd.__version__))\n",
"print(\"Fast AI version: {}\".format(fastai.__version__))\n",
"print(\"Torch version: {}\".format(torch.__version__))\n",
- "print(\"Cuda Available: {}\".format(torch.cuda.is_available()))\n",
+ "print(\"CUDA Available: {}\".format(torch.cuda.is_available()))\n",
"print(\"CuDNN Enabled: {}\".format(torch.backends.cudnn.enabled))"
]
},
@@ -80,7 +84,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {
"tags": [
"parameters"
@@ -101,14 +105,14 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|██████████| 4.81k/4.81k [00:01<00:00, 4.49kKB/s]\n"
+ "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.52kKB/s]\n"
]
},
{
@@ -132,10 +136,10 @@
" \n",
" \n",
" | \n",
- " UserId | \n",
- " MovieId | \n",
- " Rating | \n",
- " Timestamp | \n",
+ " userID | \n",
+ " itemID | \n",
+ " rating | \n",
+ " timestamp | \n",
"
\n",
" \n",
"
\n",
@@ -179,15 +183,15 @@
""
],
"text/plain": [
- " UserId MovieId Rating Timestamp\n",
- "0 196 242 3.0 881250949\n",
- "1 186 302 3.0 891717742\n",
- "2 22 377 1.0 878887116\n",
- "3 244 51 2.0 880606923\n",
- "4 166 346 1.0 886397596"
+ " userID itemID rating timestamp\n",
+ "0 196 242 3.0 881250949\n",
+ "1 186 302 3.0 891717742\n",
+ "2 22 377 1.0 878887116\n",
+ "3 244 51 2.0 880606923\n",
+ "4 166 346 1.0 886397596"
]
},
- "execution_count": 4,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -207,7 +211,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -224,7 +228,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -258,11 +262,11 @@
"outputs": [],
"source": [
"with Timer() as preprocess_time:\n",
- " data = CollabDataBunch.from_df(train_valid_df, \n",
- " user_name=USER, \n",
- " item_name=ITEM, \n",
- " rating_name=RATING, \n",
- " valid_pct=0)\n"
+ " data = CollabDataLoaders.from_df(train_valid_df, \n",
+ " user_name=USER, \n",
+ " item_name=ITEM, \n",
+ " rating_name=RATING, \n",
+ " valid_pct=0)\n"
]
},
{
@@ -276,37 +280,73 @@
"\n",
" \n",
" \n",
- " UserId | \n",
- " MovieId | \n",
- " target | \n",
+ " | \n",
+ " userID | \n",
+ " itemID | \n",
+ " rating | \n",
"
\n",
" \n",
" \n",
" \n",
- " 543 | \n",
- " 1555 | \n",
- " 3.0 | \n",
+ " 0 | \n",
+ " 104 | \n",
+ " 840 | \n",
+ " 1.0 | \n",
"
\n",
" \n",
- " 90 | \n",
- " 945 | \n",
- " 5.0 | \n",
+ " 1 | \n",
+ " 881 | \n",
+ " 112 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 746 | \n",
+ " 506 | \n",
+ " 3.0 | \n",
"
\n",
" \n",
- " 292 | \n",
- " 515 | \n",
+ " 3 | \n",
+ " 104 | \n",
+ " 257 | \n",
" 4.0 | \n",
"
\n",
" \n",
- " 303 | \n",
- " 1092 | \n",
- " 1.0 | \n",
+ " 4 | \n",
+ " 511 | \n",
+ " 1527 | \n",
+ " 4.0 | \n",
"
\n",
" \n",
+ " 5 | \n",
" 497 | \n",
- " 946 | \n",
+ " 763 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 407 | \n",
+ " 869 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 291 | \n",
+ " 924 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 109 | \n",
+ " 94 | \n",
" 4.0 | \n",
"
\n",
+ " \n",
+ " 9 | \n",
+ " 82 | \n",
+ " 597 | \n",
+ " 3.0 | \n",
+ "
\n",
" \n",
"
"
],
@@ -369,6 +409,33 @@
"execution_count": 10,
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
"text/html": [
@@ -383,34 +450,34 @@
" \n",
" \n",
" \n",
+ " 0 | \n",
+ " 0.961789 | \n",
+ " None | \n",
+ " 00:09 | \n",
+ "
\n",
+ " \n",
" 1 | \n",
- " 0.985993 | \n",
- " | \n",
- " 00:05 | \n",
+ " 0.863359 | \n",
+ " None | \n",
+ " 00:08 | \n",
"
\n",
" \n",
" 2 | \n",
- " 0.885496 | \n",
- " | \n",
- " 00:05 | \n",
+ " 0.750853 | \n",
+ " None | \n",
+ " 00:07 | \n",
"
\n",
" \n",
" 3 | \n",
- " 0.777637 | \n",
- " | \n",
- " 00:05 | \n",
+ " 0.637868 | \n",
+ " None | \n",
+ " 00:08 | \n",
"
\n",
" \n",
" 4 | \n",
- " 0.628971 | \n",
- " | \n",
- " 00:05 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 0.532328 | \n",
- " | \n",
- " 00:06 | \n",
+ " 0.526907 | \n",
+ " None | \n",
+ " 00:09 | \n",
"
\n",
" \n",
""
@@ -426,13 +493,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Took 29.5549 seconds for training.\n"
+ "Took 51.5260 seconds for training.\n"
]
}
],
"source": [
"with Timer() as train_time:\n",
- " learn.fit_one_cycle(EPOCHS, max_lr=5e-3)\n",
+ " learn.fit_one_cycle(EPOCHS, lr_max=5e-3)\n",
"\n",
"print(\"Took {} seconds for training.\".format(train_time))"
]
@@ -446,7 +513,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -456,7 +523,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -474,11 +541,11 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
- "learner = load_learner(tmp.name, \"movielens_model.pkl\")"
+ "learner = load_learner(model_path)"
]
},
{
@@ -490,11 +557,11 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
- "total_users, total_items = learner.data.train_ds.x.classes.values()\n",
+ "total_users, total_items = learner.dls.classes.values()\n",
"total_items = total_items[1:]\n",
"total_users = total_users[1:]"
]
@@ -508,7 +575,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -525,7 +592,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -545,7 +612,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {
"scrolled": false
},
@@ -564,14 +631,14 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Took 1.9734 seconds for 1511060 predictions.\n"
+ "Took 5.1570 seconds for 1511060 predictions.\n"
]
}
],
@@ -595,7 +662,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -606,7 +673,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -617,7 +684,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -628,7 +695,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -639,27 +706,27 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Model:\tCollabLearner\n",
- "Top K:\t10\n",
- "MAP:\t0.026115\n",
- "NDCG:\t0.155065\n",
- "Precision@K:\t0.136691\n",
- "Recall@K:\t0.054940\n"
+ "Model:\t\tLearner\n",
+ "Top K:\t\t10\n",
+ "MAP:\t\t0.024119\n",
+ "NDCG:\t\t0.152808\n",
+ "Precision@K:\t0.139130\n",
+ "Recall@K:\t0.054943\n"
]
}
],
"source": [
- "print(\"Model:\\t\" + learn.__class__.__name__,\n",
- " \"Top K:\\t%d\" % TOP_K,\n",
- " \"MAP:\\t%f\" % eval_map,\n",
- " \"NDCG:\\t%f\" % eval_ndcg,\n",
+ "print(\"Model:\\t\\t\" + learn.__class__.__name__,\n",
+ " \"Top K:\\t\\t%d\" % TOP_K,\n",
+ " \"MAP:\\t\\t%f\" % eval_map,\n",
+ " \"NDCG:\\t\\t%f\" % eval_ndcg,\n",
" \"Precision@K:\\t%f\" % eval_precision,\n",
" \"Recall@K:\\t%f\" % eval_recall, sep='\\n')"
]
@@ -673,7 +740,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -693,18 +760,18 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Model:\tCollabLearner\n",
- "RMSE:\t0.902379\n",
- "MAE:\t0.712163\n",
- "Explained variance:\t0.346523\n",
- "R squared:\t0.345672\n"
+ "Model:\t\t\tLearner\n",
+ "RMSE:\t\t\t0.904589\n",
+ "MAE:\t\t\t0.715827\n",
+ "Explained variance:\t0.356082\n",
+ "R squared:\t\t0.355173\n"
]
}
],
@@ -714,36 +781,35 @@
"eval_mae = mae(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n",
"eval_exp_var = exp_var(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n",
"\n",
- "print(\"Model:\\t\" + learn.__class__.__name__,\n",
- " \"RMSE:\\t%f\" % eval_rmse,\n",
- " \"MAE:\\t%f\" % eval_mae,\n",
+ "print(\"Model:\\t\\t\\t\" + learn.__class__.__name__,\n",
+ " \"RMSE:\\t\\t\\t%f\" % eval_rmse,\n",
+ " \"MAE:\\t\\t\\t%f\" % eval_mae,\n",
" \"Explained variance:\\t%f\" % eval_exp_var,\n",
- " \"R squared:\\t%f\" % eval_r2, sep='\\n')"
+ " \"R squared:\\t\\t%f\" % eval_r2, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "That RMSE is actually quite good when compared to these benchmarks: https://www.librec.net/release/v1.3/example.html"
+ "That RMSE is competitive in comparison with other models."
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.02611475567509659,
+ "application/notebook_utils.json+json": {
+ "data": 0.024118782738867094,
"encoder": "json",
- "name": "map",
- "version": 1
+ "name": "map"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "map"
@@ -753,15 +819,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.15506533130248687,
+ "application/notebook_utils.json+json": {
+ "data": 0.1528081472533914,
"encoder": "json",
- "name": "ndcg",
- "version": 1
+ "name": "ndcg"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "ndcg"
@@ -771,15 +836,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.13669141039236482,
+ "application/notebook_utils.json+json": {
+ "data": 0.13913043478260873,
"encoder": "json",
- "name": "precision",
- "version": 1
+ "name": "precision"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "precision"
@@ -789,15 +853,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.05493986799753499,
+ "application/notebook_utils.json+json": {
+ "data": 0.05494302697544413,
"encoder": "json",
- "name": "recall",
- "version": 1
+ "name": "recall"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "recall"
@@ -807,15 +870,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.9023793356156464,
+ "application/notebook_utils.json+json": {
+ "data": 0.9045892929999733,
"encoder": "json",
- "name": "rmse",
- "version": 1
+ "name": "rmse"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "rmse"
@@ -825,15 +887,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.7121634655740025,
+ "application/notebook_utils.json+json": {
+ "data": 0.7158267242352735,
"encoder": "json",
- "name": "mae",
- "version": 1
+ "name": "mae"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "mae"
@@ -843,15 +904,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.34652281723228295,
+ "application/notebook_utils.json+json": {
+ "data": 0.3560824305444269,
"encoder": "json",
- "name": "exp_var",
- "version": 1
+ "name": "exp_var"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "exp_var"
@@ -861,15 +921,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 0.3456716162958503,
+ "application/notebook_utils.json+json": {
+ "data": 0.35517333876960555,
"encoder": "json",
- "name": "rsquared",
- "version": 1
+ "name": "rsquared"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "rsquared"
@@ -879,15 +938,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 29.554921820759773,
+ "application/notebook_utils.json+json": {
+ "data": 51.52598460000445,
"encoder": "json",
- "name": "train_time",
- "version": 1
+ "name": "train_time"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "train_time"
@@ -897,15 +955,14 @@
},
{
"data": {
- "application/scrapbook.scrap.json+json": {
- "data": 1.973397959023714,
+ "application/notebook_utils.json+json": {
+ "data": 5.156951100005244,
"encoder": "json",
- "name": "test_time",
- "version": 1
+ "name": "test_time"
}
},
"metadata": {
- "scrapbook": {
+ "notebook_utils": {
"data": true,
"display": false,
"name": "test_time"
@@ -930,7 +987,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -946,9 +1003,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
- "display_name": "Python (reco_gpu)",
+ "display_name": "recommenders",
"language": "python",
- "name": "reco_gpu"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -960,7 +1017,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.11"
+ "version": "3.9.16"
}
},
"nbformat": 4,
diff --git a/examples/06_benchmarks/benchmark_utils.py b/examples/06_benchmarks/benchmark_utils.py
index e28fa6ab7..c62518838 100644
--- a/examples/06_benchmarks/benchmark_utils.py
+++ b/examples/06_benchmarks/benchmark_utils.py
@@ -15,7 +15,7 @@
except ImportError:
pass # skip this import if we are not in a Spark environment
try:
- from fastai.collab import collab_learner, CollabDataBunch
+ from fastai.collab import collab_learner, CollabDataLoaders
except ImportError:
pass # skip this import if we are not in a GPU environment
@@ -181,7 +181,7 @@ def prepare_training_fastai(train, test):
data = train.copy()
data[DEFAULT_USER_COL] = data[DEFAULT_USER_COL].astype("str")
data[DEFAULT_ITEM_COL] = data[DEFAULT_ITEM_COL].astype("str")
- data = CollabDataBunch.from_df(
+ data = CollabDataLoaders.from_df(
data,
user_name=DEFAULT_USER_COL,
item_name=DEFAULT_ITEM_COL,
@@ -196,7 +196,7 @@ def train_fastai(params, data):
data, n_factors=params["n_factors"], y_range=params["y_range"], wd=params["wd"]
)
with Timer() as t:
- model.fit_one_cycle(cyc_len=params["epochs"], max_lr=params["max_lr"])
+ model.fit_one_cycle(params["epochs"], lr_max=params["lr_max"])
return model, t
@@ -221,9 +221,9 @@ def predict_fastai(model, test):
def recommend_k_fastai(model, test, train, top_k=DEFAULT_K, remove_seen=True):
with Timer() as t:
- total_users, total_items = model.data.train_ds.x.classes.values()
- total_items = total_items[1:]
- total_users = total_users[1:]
+ total_users, total_items = model.dls.classes.values()
+ total_items = np.array(total_items[1:])
+ total_users = np.array(total_users[1:])
test_users = test[DEFAULT_USER_COL].unique()
test_users = np.intersect1d(test_users, total_users)
users_items = cartesian_product(test_users, total_items)
diff --git a/examples/06_benchmarks/movielens.ipynb b/examples/06_benchmarks/movielens.ipynb
index 2f7a857ce..8c8ee6d2f 100644
--- a/examples/06_benchmarks/movielens.ipynb
+++ b/examples/06_benchmarks/movielens.ipynb
@@ -299,7 +299,7 @@
" \"n_factors\": 40, \n",
" \"y_range\": [0,5.5], \n",
" \"wd\": 1e-1,\n",
- " \"max_lr\": 5e-3,\n",
+ " \"lr_max\": 5e-3,\n",
" \"epochs\": 15\n",
"}\n",
"\n",
diff --git a/recommenders/models/fastai/fastai_utils.py b/recommenders/models/fastai/fastai_utils.py
index ab756c7e8..f6b6a8986 100644
--- a/recommenders/models/fastai/fastai_utils.py
+++ b/recommenders/models/fastai/fastai_utils.py
@@ -6,6 +6,7 @@
import pandas as pd
import fastai
import fastprogress
+import torch
from fastprogress.fastprogress import force_console_behavior
from recommenders.utils import constants as cc
@@ -51,24 +52,32 @@ def score(
pandas.DataFrame: Result of recommendation
"""
# replace values not known to the model with NaN
- total_users, total_items = learner.data.train_ds.x.classes.values()
+ total_users, total_items = learner.dls.classes.values()
test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan
test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan
# map ids to embedding ids
- u = learner.get_idx(test_df[user_col], is_item=False)
- m = learner.get_idx(test_df[item_col], is_item=True)
+ u = learner._get_idx(test_df[user_col], is_item=False)
+ m = learner._get_idx(test_df[item_col], is_item=True)
# score the pytorch model
- pred = learner.model.forward(u, m)
+ x = torch.column_stack((u, m))
+
+ if torch.cuda.is_available():
+ x = x.to("cuda")
+ learner.model = learner.model.to("cuda")
+
+ pred = learner.model.forward(x).detach().cpu().numpy()
scores = pd.DataFrame(
{user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred}
)
scores = scores.sort_values([user_col, prediction_col], ascending=[True, False])
+
if top_k is not None:
top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True)
else:
top_scores = scores
+
return top_scores
@@ -77,7 +86,7 @@ def hide_fastai_progress_bar():
fastprogress.fastprogress.NO_BAR = True
fastprogress.fastprogress.WRITER_FN = str
master_bar, progress_bar = force_console_behavior()
- fastai.basic_train.master_bar, fastai.basic_train.progress_bar = (
+ fastai.callback.progress.master_bar, fastai.callback.progress.progress_bar = (
master_bar,
progress_bar,
)
diff --git a/recommenders/models/rlrmc/RLRMCdataset.py b/recommenders/models/rlrmc/RLRMCdataset.py
index 6b1329d1d..7670105b3 100644
--- a/recommenders/models/rlrmc/RLRMCdataset.py
+++ b/recommenders/models/rlrmc/RLRMCdataset.py
@@ -68,8 +68,8 @@ def _data_processing(self, train, validation=None, test=None, mean_center=True):
"""
# Data processing and reindexing code is adopted from https://github.com/Microsoft/Recommenders/blob/main/recommenders/models/ncf/dataset.py
# If validation dataset is None
- df = train if validation is None else train.append(validation)
- df = df if test is None else df.append(test)
+ df = train if validation is None else pd.concat([train, validation])
+ df = df if test is None else pd.concat([df, test])
# Reindex user and item index
if self.user_idx is None:
diff --git a/recommenders/models/tfidf/tfidf_utils.py b/recommenders/models/tfidf/tfidf_utils.py
index 24575121c..6a6d22389 100644
--- a/recommenders/models/tfidf/tfidf_utils.py
+++ b/recommenders/models/tfidf/tfidf_utils.py
@@ -115,7 +115,7 @@ def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"):
return df
def tokenize_text(
- self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0
+ self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0.0
):
"""Tokenize the input text.
For more details on the TfidfVectorizer, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
@@ -124,7 +124,7 @@ def tokenize_text(
df_clean (pandas.DataFrame): Dataframe with cleaned text in the new column.
text_col (str): Name of column containing the cleaned text.
ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
- min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
+ min_df (float): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
Returns:
TfidfVectorizer, pandas.Series:
diff --git a/setup.py b/setup.py
index 758765690..c5fc49bb8 100644
--- a/setup.py
+++ b/setup.py
@@ -27,53 +27,45 @@
version += ".post" + str(int(time.time()))
install_requires = [
- "numpy>=1.19", # 1.19 required by tensorflow 2.6
- "pandas>1.0.3,<2",
- "scipy>=1.0.0,<1.11.0", # FIXME: We limit <1.11.0 until #1954 is fixed
- "tqdm>=4.31.1,<5",
- "matplotlib>=2.2.2,<4",
- "scikit-learn>=0.22.1,<1.0.3",
- "numba>=0.38.1,<1",
- "lightfm>=1.15,<2",
- "lightgbm>=2.2.1",
- "memory_profiler>=0.54.0,<1",
- "nltk>=3.4,<4",
- "seaborn>=0.8.1,<1",
- "transformers>=2.5.0,<5",
- "category_encoders>=1.3.0,<2",
- "jinja2>=2,<3.1",
- "requests>=2.31.0,<3",
- "cornac>=1.1.2,<1.15.2;python_version<='3.7'",
- "cornac>=1.15.2,<2;python_version>='3.8'", # After 1.15.2, Cornac requires python 3.8
- "retrying>=1.3.3",
+ "category-encoders>=2.6.0,<3", # requires packaging
+ "cornac>=1.15.2,<2", # requires packaging, tqdm
+ "hyperopt>=0.2.7,<1",
+ "lightfm>=1.17,<2", # requires requests
+ "lightgbm>=4.0.0,<5",
+ "locust>=2.12.2,<3", # requires jinja2
+ "memory-profiler>=0.61.0,<1",
+ "nltk>=3.8.1,<4", # requires tqdm
+ "notebook>=7.0.0,<8", # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests
+ "numba>=0.57.0,<1",
+ "pandas>2.0.0,<3.0.0", # requires numpy
"pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'", # For generating fake datasets
- "pandera[strategies]>=0.6.5;python_version>='3.9'",
- "scikit-surprise>=1.0.6",
- "hyperopt>=0.1.2,<1",
- "ipykernel>=4.6.1,<7",
- "jupyter>=1,<2",
- "locust>=1,<2",
+ "pandera[strategies]>=0.15.0;python_version>='3.9'",
+ "retrying>=1.3.4,<2",
+ "scikit-learn>=1.2.0,<2", # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df
+ "scikit-surprise>=1.1.3",
+ "scipy>=1.10.1,<1.11.0", # FIXME: We limit <1.11.0 until #1954 is fixed
+ "seaborn>=0.13.0,<1", # requires matplotlib, packaging
+ "transformers>=4.27.0,<5", # requires packaging, pyyaml, requests, tqdm
]
# shared dependencies
extras_require = {
"gpu": [
- "nvidia-ml-py3>=7.352.0",
- "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<3",
- "tf-slim>=1.1.0",
- "torch>=1.13.1", # for CUDA 11 support
- "fastai>=1.0.46,<2",
+ "fastai>=2.7.11,<3",
+ "nvidia-ml-py>=11.525.84",
+ "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<2.16", # Fixed TF due to constant security problems and breaking changes #2073
+ "tf-slim>=1.1.0", # No python_requires in its setup.py
+ "torch>=2.0.1,<3",
],
"spark": [
- "pyarrow>=0.12.1,<7.0.0",
- "pyspark>=2.4.5,<3.3.0",
+ "pyarrow>=10.0.1",
+        "pyspark>=3.3.0,<4",
],
"dev": [
- "black>=18.6b4,<21",
- "pytest>=3.6.4",
- "pytest-cov>=2.12.1",
- "pytest-mock>=3.6.1", # for access to mock fixtures in pytest
- "packaging>=20.9", # for version comparison in test_dependency_security.py
+ "black>=23.3.0",
+ "pytest>=7.2.1",
+ "pytest-cov>=4.1.0",
+ "pytest-mock>=3.10.0", # for access to mock fixtures in pytest
],
}
# For the brave of heart
@@ -117,6 +109,8 @@
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
"Operating System :: POSIX :: Linux",
],
extras_require=extras_require,
diff --git a/tests/README.md b/tests/README.md
index 813b433ba..a6068daec 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -213,11 +213,26 @@ Then, follow the steps below to create the AzureML infrastructure:
2. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new.
- Select the CPU VM base. Anything above 64GB of RAM, and 8 cores should be fine.
- Select the GPU VM base. Anything above 56GB of RAM, and 6 cores, and an NVIDIA K80 should be fine.
-3. Add the subscription ID to GitHub action secrets [here](https://github.com/microsoft/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value.
+3. Add the subscription ID to GitHub action secrets [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value.
4. Make sure you have installed [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), and that you are logged in: `az login`.
5. Select your subscription: `az account set -s $AZURE_SUBSCRIPTION_ID`.
-5. Create a Service Principal: `az ad sp create-for-rbac --name "recommenders-cicd" --role contributor --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --sdk-auth`.
-6. Add the output from the Service Principal (should be a JSON blob) as an action secret `AZUREML_TEST_CREDENTIALS`.
+6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role contributor --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal:
+ ```
+ {
+ "clientId": "XXXXXXXXXXXXXXXXXXXXX",
+ "clientSecret": "XXXXXXXXXXXXXXXXXXXXX",
+ "subscriptionId": "XXXXXXXXXXXXXXXXXXXXX",
+ "tenantId": "XXXXXXXXXXXXXXXXXXXXX",
+ "activeDirectoryEndpointUrl": "https://login.microsoftonline.com",
+ "resourceManagerEndpointUrl": "https://management.azure.com/",
+ "activeDirectoryGraphResourceId": "https://graph.windows.net/",
+ "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/",
+ "galleryEndpointUrl": "https://gallery.azure.com/",
+ "managementEndpointUrl": "https://management.core.windows.net/"
+ }
+ ```
+7. Add the output as a GitHub Actions secret `AZUREML_TEST_CREDENTIALS` under the repository's **Settings > Security > Secrets and variables > Actions**.
+
## How to execute tests in your local environment
diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
index 86d0c80ab..adda7e172 100644
--- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
+++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py
@@ -29,11 +29,12 @@
Example:
Usually, this script is run by a DevOps pipeline. It can also be
run from cmd line.
- >>> python tests/ci/refac.py --clustername 'cluster-d3-v2'
- --subid '12345678-9012-3456-abcd-123456789012'
- --pr '666'
- --reponame 'Recommenders'
- --branch 'staging'
+ >>> python tests/ci/submit_groupwise_azureml_pytest.py \
+ --clustername 'cluster-d3-v2' \
+ --subid '12345678-9012-3456-abcd-123456789012' \
+ --pr '666' \
+ --reponame 'Recommenders' \
+ --branch 'staging'
"""
import argparse
import logging
@@ -41,7 +42,7 @@
from azureml.core.authentication import AzureCliAuthentication
from azureml.core import Workspace
from azureml.core import Experiment
-from azureml.core.runconfig import RunConfiguration
+from azureml.core.runconfig import RunConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
@@ -175,7 +176,6 @@ def create_run_config(
run_azuremlcompute = RunConfiguration()
run_azuremlcompute.target = cpu_cluster
- run_azuremlcompute.environment.docker.enabled = True
if not add_gpu_dependencies:
# https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04
run_azuremlcompute.environment.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04"
@@ -197,12 +197,14 @@ def create_run_config(
apt-get clean -y && \
rm -rf /var/lib/apt/lists/*
# Conda Environment
+# Pin pip=20.1.1 due to the issue: No module named 'ruamel'
+# See https://learn.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py#troubleshooting
ENV MINICONDA_VERSION py38_23.3.1-0
ENV PATH /opt/miniconda/bin:$PATH
ENV CONDA_PACKAGE 23.5.0
RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \
bash /tmp/miniconda.sh -bf -p /opt/miniconda && \
- conda install conda=${CONDA_PACKAGE} -y && \
+ conda install -y conda=${CONDA_PACKAGE} python=3.8 pip=20.1.1 && \
conda update --all -c conda-forge -y && \
conda clean -ay && \
rm -rf /opt/miniconda/pkgs && \
@@ -290,8 +292,10 @@ def submit_experiment_to_azureml(
source_directory=".",
script=test,
run_config=run_config,
+ docker_runtime_config=DockerConfiguration(use_docker=True),
arguments=arguments,
)
+
run = experiment.submit(script_run_config)
# waits only for configuration to complete
run.wait_for_completion(show_output=True, wait_post_processing=True)
diff --git a/tests/functional/examples/test_notebooks_gpu.py b/tests/functional/examples/test_notebooks_gpu.py
index 2007cc1a7..05b53c68e 100644
--- a/tests/functional/examples/test_notebooks_gpu.py
+++ b/tests/functional/examples/test_notebooks_gpu.py
@@ -247,7 +247,9 @@ def test_wide_deep_functional(
os.path.join("tests", "resources", "deeprec", "slirec"),
10,
400,
- {"auc": 0.7183}, # Don't do logloss check as SLi-Rec uses ranking loss, not a point-wise loss
+ {
+ "auc": 0.7183
+ }, # Don't do logloss check as SLi-Rec uses ranking loss, not a point-wise loss
42,
)
],
@@ -278,7 +280,7 @@ def test_slirec_quickstart_functional(
results = read_notebook(output_notebook)
assert results["auc"] == pytest.approx(expected_values["auc"], rel=TOL, abs=ABS_TOL)
-
+
@pytest.mark.gpu
@pytest.mark.notebooks
@@ -567,7 +569,7 @@ def test_dkn_quickstart_functional(notebooks, output_notebook, kernel_name):
notebook_path,
output_notebook,
kernel_name=kernel_name,
- parameters=dict(EPOCHS=5, BATCH_SIZE=500),
+ parameters=dict(EPOCHS=5, BATCH_SIZE=200),
)
results = read_notebook(output_notebook)
diff --git a/tests/unit/recommenders/evaluation/test_spark_evaluation.py b/tests/unit/recommenders/evaluation/test_spark_evaluation.py
index 278a2e287..55c064e8b 100644
--- a/tests/unit/recommenders/evaluation/test_spark_evaluation.py
+++ b/tests/unit/recommenders/evaluation/test_spark_evaluation.py
@@ -5,7 +5,7 @@
import pytest
import numpy as np
import pandas as pd
-from pandas.util.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal
from recommenders.evaluation.python_evaluation import (
precision_at_k,