diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml index d74b88160..85ae9f84a 100644 --- a/.github/actions/azureml-test/action.yml +++ b/.github/actions/azureml-test/action.yml @@ -76,7 +76,7 @@ runs: shell: bash run: pip install --quiet "azureml-core>1,<2" "azure-cli>2,<3" - name: Log in to Azure - uses: azure/login@v1 + uses: azure/login@v2 with: creds: ${{inputs.AZUREML_TEST_CREDENTIALS}} - name: Install wheel package diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml index 72bb700cf..93e414564 100644 --- a/.github/workflows/azureml-cpu-nightly.yml +++ b/.github/workflows/azureml-cpu-nightly.yml @@ -67,7 +67,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"'] + python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml index efac48774..3b9f6d6b4 100644 --- a/.github/workflows/azureml-gpu-nightly.yml +++ b/.github/workflows/azureml-gpu-nightly.yml @@ -67,7 +67,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"'] + python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml index b3a76f9ea..8f28be6f2 100644 --- a/.github/workflows/azureml-spark-nightly.yml +++ b/.github/workflows/azureml-spark-nightly.yml @@ -66,7 +66,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"'] + python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml index 13ed56005..b39268318 100644 --- a/.github/workflows/azureml-unit-tests.yml +++ b/.github/workflows/azureml-unit-tests.yml @@ -56,7 +56,7 @@ jobs: strategy: max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration matrix: - python-version: ['"python=3.8"', '"python=3.9"'] + python-version: ['"python=3.8"', '"python=3.9"', '"python=3.10"', '"python=3.11"'] test-group: ${{ fromJSON(needs.get-test-groups.outputs.test_groups) }} steps: - name: Check out repository code diff --git a/.github/workflows/sarplus.yml b/.github/workflows/sarplus.yml index 766b31645..90d03fef6 100644 --- a/.github/workflows/sarplus.yml +++ b/.github/workflows/sarplus.yml @@ -39,7 +39,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python-version: ["3.8", "3.9"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v4 diff --git a/contrib/sarplus/python/setup.py b/contrib/sarplus/python/setup.py index 4009ec751..f755f5310 100644 --- 
a/contrib/sarplus/python/setup.py +++ b/contrib/sarplus/python/setup.py @@ -42,6 +42,7 @@ def __str__(self): "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Mathematics", @@ -49,7 +50,7 @@ def __str__(self): setup_requires=["pytest-runner"], install_requires=DEPENDENCIES, tests_require=["pytest"], - python_requires=">=3.6,<3.11", + python_requires=">=3.6,<3.12", packages=["pysarplus"], package_data={"": ["VERSION"]}, ext_modules=[ diff --git a/examples/00_quick_start/fastai_movielens.ipynb b/examples/00_quick_start/fastai_movielens.ipynb index 517673178..944b92623 100644 --- a/examples/00_quick_start/fastai_movielens.ipynb +++ b/examples/00_quick_start/fastai_movielens.ipynb @@ -27,17 +27,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.11 | packaged by conda-forge | (default, Aug 5 2020, 20:09:42) \n", - "[GCC 7.5.0]\n", - "Pandas version: 0.25.3\n", - "Fast AI version: 1.0.46\n", - "Torch version: 1.4.0\n", - "Cuda Available: False\n", + "System version: 3.9.16 (main, May 15 2023, 23:46:34) \n", + "[GCC 11.2.0]\n", + "Pandas version: 1.5.3\n", + "Fast AI version: 2.7.11\n", + "Torch version: 1.13.1+cu117\n", + "CUDA Available: True\n", "CuDNN Enabled: True\n" ] } ], "source": [ + "# Suppress all warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", "import os\n", "import sys\n", "import numpy as np\n", @@ -46,7 +50,7 @@ "import fastai\n", "from tempfile import TemporaryDirectory\n", "\n", - "from fastai.collab import collab_learner, CollabDataBunch, load_learner\n", + "from fastai.collab import collab_learner, CollabDataLoaders, load_learner\n", "\n", "from recommenders.utils.constants import (\n", " DEFAULT_USER_COL as USER, \n", @@ -67,7 +71,7 @@ "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"Fast AI version: {}\".format(fastai.__version__))\n", "print(\"Torch version: {}\".format(torch.__version__))\n", - "print(\"Cuda Available: {}\".format(torch.cuda.is_available()))\n", + "print(\"CUDA Available: {}\".format(torch.cuda.is_available()))\n", "print(\"CuDNN Enabled: {}\".format(torch.backends.cudnn.enabled))" ] }, @@ -80,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -101,14 +105,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4.81k/4.81k [00:01<00:00, 4.49kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:01<00:00, 3.52kKB/s]\n" ] }, { @@ -132,10 +136,10 @@ " \n", " \n", " \n", - " UserId\n", - " MovieId\n", - " Rating\n", - " Timestamp\n", + " userID\n", + " itemID\n", + " rating\n", + " timestamp\n", " \n", " \n", " \n", @@ -179,15 +183,15 @@ "" ], "text/plain": [ - " UserId MovieId Rating Timestamp\n", - "0 196 242 3.0 881250949\n", - "1 186 302 3.0 891717742\n", - "2 22 377 1.0 878887116\n", - "3 244 51 2.0 880606923\n", - "4 166 346 1.0 886397596" + " userID itemID rating timestamp\n", + "0 196 242 3.0 881250949\n", + "1 186 302 3.0 891717742\n", + "2 22 377 1.0 878887116\n", + "3 244 51 2.0 880606923\n", + "4 166 346 1.0 886397596" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -207,7 +211,7 @@ }, { 
"cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -224,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -258,11 +262,11 @@ "outputs": [], "source": [ "with Timer() as preprocess_time:\n", - " data = CollabDataBunch.from_df(train_valid_df, \n", - " user_name=USER, \n", - " item_name=ITEM, \n", - " rating_name=RATING, \n", - " valid_pct=0)\n" + " data = CollabDataLoaders.from_df(train_valid_df, \n", + " user_name=USER, \n", + " item_name=ITEM, \n", + " rating_name=RATING, \n", + " valid_pct=0)\n" ] }, { @@ -276,37 +280,73 @@ "\n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
UserIdMovieIdtargetuserIDitemIDrating
54315553.001048401.0
909455.018811122.0
27465063.0
29251531042574.0
30310921.0451115274.0
54979467633.0
64078693.0
72919244.0
8109944.0
9825973.0
" ], @@ -369,6 +409,33 @@ "execution_count": 10, "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -383,34 +450,34 @@ " \n", " \n", " \n", + " 0\n", + " 0.961789\n", + " None\n", + " 00:09\n", + " \n", + " \n", " 1\n", - " 0.985993\n", - " \n", - " 00:05\n", + " 0.863359\n", + " None\n", + " 00:08\n", " \n", " \n", " 2\n", - " 0.885496\n", - " \n", - " 00:05\n", + " 0.750853\n", + " None\n", + " 00:07\n", " \n", " \n", " 3\n", - " 0.777637\n", - " \n", - " 00:05\n", + " 0.637868\n", + " None\n", + " 00:08\n", " \n", " \n", " 4\n", - " 0.628971\n", - " \n", - " 00:05\n", - " \n", - " \n", - " 5\n", - " 0.532328\n", - " \n", - " 00:06\n", + " 0.526907\n", + " None\n", + " 00:09\n", " \n", " \n", "" @@ -426,13 +493,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Took 29.5549 seconds for training.\n" + "Took 51.5260 seconds for training.\n" ] } ], "source": [ "with Timer() as train_time:\n", - " learn.fit_one_cycle(EPOCHS, max_lr=5e-3)\n", + " learn.fit_one_cycle(EPOCHS, lr_max=5e-3)\n", "\n", "print(\"Took {} seconds for training.\".format(train_time))" ] @@ -446,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -456,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -474,11 +541,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "learner = load_learner(tmp.name, \"movielens_model.pkl\")" + "learner = load_learner(model_path)" ] }, { @@ -490,11 +557,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "total_users, total_items = learner.data.train_ds.x.classes.values()\n", + "total_users, total_items = learner.dls.classes.values()\n", "total_items = total_items[1:]\n", "total_users = total_users[1:]" ] @@ -508,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -525,7 +592,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -545,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "scrolled": false }, @@ -564,14 +631,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Took 1.9734 seconds for 1511060 predictions.\n" + "Took 5.1570 seconds for 1511060 predictions.\n" ] } ], @@ -595,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -606,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -617,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -628,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -639,27 +706,27 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"Model:\tCollabLearner\n", - "Top K:\t10\n", - "MAP:\t0.026115\n", - "NDCG:\t0.155065\n", - "Precision@K:\t0.136691\n", - "Recall@K:\t0.054940\n" + "Model:\t\tLearner\n", + "Top K:\t\t10\n", + "MAP:\t\t0.024119\n", + "NDCG:\t\t0.152808\n", + "Precision@K:\t0.139130\n", + "Recall@K:\t0.054943\n" ] } ], "source": [ - "print(\"Model:\\t\" + learn.__class__.__name__,\n", - " \"Top K:\\t%d\" % TOP_K,\n", - " \"MAP:\\t%f\" % eval_map,\n", - " \"NDCG:\\t%f\" % eval_ndcg,\n", + "print(\"Model:\\t\\t\" + learn.__class__.__name__,\n", + " \"Top K:\\t\\t%d\" % TOP_K,\n", + " \"MAP:\\t\\t%f\" % eval_map,\n", + " \"NDCG:\\t\\t%f\" % eval_ndcg,\n", " \"Precision@K:\\t%f\" % eval_precision,\n", " \"Recall@K:\\t%f\" % eval_recall, sep='\\n')" ] @@ -673,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -693,18 +760,18 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Model:\tCollabLearner\n", - "RMSE:\t0.902379\n", - "MAE:\t0.712163\n", - "Explained variance:\t0.346523\n", - "R squared:\t0.345672\n" + "Model:\t\t\tLearner\n", + "RMSE:\t\t\t0.904589\n", + "MAE:\t\t\t0.715827\n", + "Explained variance:\t0.356082\n", + "R squared:\t\t0.355173\n" ] } ], @@ -714,36 +781,35 @@ "eval_mae = mae(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", "eval_exp_var = exp_var(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n", "\n", - "print(\"Model:\\t\" + learn.__class__.__name__,\n", - " \"RMSE:\\t%f\" % eval_rmse,\n", - " \"MAE:\\t%f\" % eval_mae,\n", + "print(\"Model:\\t\\t\\t\" + learn.__class__.__name__,\n", + " \"RMSE:\\t\\t\\t%f\" % eval_rmse,\n", + " \"MAE:\\t\\t\\t%f\" % eval_mae,\n", " \"Explained variance:\\t%f\" % eval_exp_var,\n", - " \"R squared:\\t%f\" % eval_r2, sep='\\n')" + " \"R squared:\\t\\t%f\" % eval_r2, sep='\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That RMSE is actually quite good when compared to these benchmarks: https://www.librec.net/release/v1.3/example.html" + "That RMSE is competitive in comparison with other models." 
] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.02611475567509659, + "application/notebook_utils.json+json": { + "data": 0.024118782738867094, "encoder": "json", - "name": "map", - "version": 1 + "name": "map" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "map" @@ -753,15 +819,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.15506533130248687, + "application/notebook_utils.json+json": { + "data": 0.1528081472533914, "encoder": "json", - "name": "ndcg", - "version": 1 + "name": "ndcg" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "ndcg" @@ -771,15 +836,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.13669141039236482, + "application/notebook_utils.json+json": { + "data": 0.13913043478260873, "encoder": "json", - "name": "precision", - "version": 1 + "name": "precision" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "precision" @@ -789,15 +853,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.05493986799753499, + "application/notebook_utils.json+json": { + "data": 0.05494302697544413, "encoder": "json", - "name": "recall", - "version": 1 + "name": "recall" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "recall" @@ -807,15 +870,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.9023793356156464, + "application/notebook_utils.json+json": { + "data": 0.9045892929999733, "encoder": "json", - "name": "rmse", - "version": 1 + "name": "rmse" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "rmse" @@ -825,15 +887,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.7121634655740025, + "application/notebook_utils.json+json": { + "data": 0.7158267242352735, "encoder": "json", - "name": "mae", - "version": 1 + "name": "mae" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "mae" @@ -843,15 +904,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.34652281723228295, + "application/notebook_utils.json+json": { + "data": 0.3560824305444269, "encoder": "json", - "name": "exp_var", - "version": 1 + "name": "exp_var" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "exp_var" @@ -861,15 +921,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 0.3456716162958503, + "application/notebook_utils.json+json": { + "data": 0.35517333876960555, "encoder": "json", - "name": "rsquared", - "version": 1 + "name": "rsquared" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "rsquared" @@ -879,15 +938,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 29.554921820759773, + "application/notebook_utils.json+json": { + "data": 51.52598460000445, "encoder": "json", - "name": "train_time", - "version": 1 + "name": "train_time" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "train_time" @@ -897,15 +955,14 @@ }, { "data": { - "application/scrapbook.scrap.json+json": { - "data": 1.973397959023714, + "application/notebook_utils.json+json": { + "data": 
5.156951100005244, "encoder": "json", - "name": "test_time", - "version": 1 + "name": "test_time" } }, "metadata": { - "scrapbook": { + "notebook_utils": { "data": true, "display": false, "name": "test_time" @@ -930,7 +987,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -946,9 +1003,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco_gpu)", + "display_name": "recommenders", "language": "python", - "name": "reco_gpu" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -960,7 +1017,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.11" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/examples/06_benchmarks/benchmark_utils.py b/examples/06_benchmarks/benchmark_utils.py index e28fa6ab7..c62518838 100644 --- a/examples/06_benchmarks/benchmark_utils.py +++ b/examples/06_benchmarks/benchmark_utils.py @@ -15,7 +15,7 @@ except ImportError: pass # skip this import if we are not in a Spark environment try: - from fastai.collab import collab_learner, CollabDataBunch + from fastai.collab import collab_learner, CollabDataLoaders except ImportError: pass # skip this import if we are not in a GPU environment @@ -181,7 +181,7 @@ def prepare_training_fastai(train, test): data = train.copy() data[DEFAULT_USER_COL] = data[DEFAULT_USER_COL].astype("str") data[DEFAULT_ITEM_COL] = data[DEFAULT_ITEM_COL].astype("str") - data = CollabDataBunch.from_df( + data = CollabDataLoaders.from_df( data, user_name=DEFAULT_USER_COL, item_name=DEFAULT_ITEM_COL, @@ -196,7 +196,7 @@ def train_fastai(params, data): data, n_factors=params["n_factors"], y_range=params["y_range"], wd=params["wd"] ) with Timer() as t: - model.fit_one_cycle(cyc_len=params["epochs"], max_lr=params["max_lr"]) + model.fit_one_cycle(params["epochs"], lr_max=params["lr_max"]) return model, t @@ -221,9 +221,9 @@ def predict_fastai(model, test): def recommend_k_fastai(model, test, train, top_k=DEFAULT_K, remove_seen=True): with Timer() as t: - total_users, total_items = model.data.train_ds.x.classes.values() - total_items = total_items[1:] - total_users = total_users[1:] + total_users, total_items = model.dls.classes.values() + total_items = np.array(total_items[1:]) + total_users = np.array(total_users[1:]) test_users = test[DEFAULT_USER_COL].unique() test_users = np.intersect1d(test_users, total_users) users_items = cartesian_product(test_users, total_items) diff --git a/examples/06_benchmarks/movielens.ipynb b/examples/06_benchmarks/movielens.ipynb index 2f7a857ce..8c8ee6d2f 100644 --- a/examples/06_benchmarks/movielens.ipynb +++ b/examples/06_benchmarks/movielens.ipynb @@ -299,7 +299,7 @@ " \"n_factors\": 40, \n", " \"y_range\": [0,5.5], \n", " \"wd\": 1e-1,\n", - " \"max_lr\": 5e-3,\n", + " \"lr_max\": 5e-3,\n", " \"epochs\": 15\n", "}\n", "\n", diff --git a/recommenders/models/fastai/fastai_utils.py b/recommenders/models/fastai/fastai_utils.py index ab756c7e8..f6b6a8986 100644 --- a/recommenders/models/fastai/fastai_utils.py +++ b/recommenders/models/fastai/fastai_utils.py @@ -6,6 +6,7 @@ import pandas as pd import fastai import fastprogress +import torch from fastprogress.fastprogress import force_console_behavior from recommenders.utils import constants as cc @@ -51,24 +52,32 @@ def score( pandas.DataFrame: Result of recommendation """ # replace values not known to the model with NaN - total_users, total_items = learner.data.train_ds.x.classes.values() + 
total_users, total_items = learner.dls.classes.values() test_df.loc[~test_df[user_col].isin(total_users), user_col] = np.nan test_df.loc[~test_df[item_col].isin(total_items), item_col] = np.nan # map ids to embedding ids - u = learner.get_idx(test_df[user_col], is_item=False) - m = learner.get_idx(test_df[item_col], is_item=True) + u = learner._get_idx(test_df[user_col], is_item=False) + m = learner._get_idx(test_df[item_col], is_item=True) # score the pytorch model - pred = learner.model.forward(u, m) + x = torch.column_stack((u, m)) + + if torch.cuda.is_available(): + x = x.to("cuda") + learner.model = learner.model.to("cuda") + + pred = learner.model.forward(x).detach().cpu().numpy() scores = pd.DataFrame( {user_col: test_df[user_col], item_col: test_df[item_col], prediction_col: pred} ) scores = scores.sort_values([user_col, prediction_col], ascending=[True, False]) + if top_k is not None: top_scores = scores.groupby(user_col).head(top_k).reset_index(drop=True) else: top_scores = scores + return top_scores @@ -77,7 +86,7 @@ def hide_fastai_progress_bar(): fastprogress.fastprogress.NO_BAR = True fastprogress.fastprogress.WRITER_FN = str master_bar, progress_bar = force_console_behavior() - fastai.basic_train.master_bar, fastai.basic_train.progress_bar = ( + fastai.callback.progress.master_bar, fastai.callback.progress.progress_bar = ( master_bar, progress_bar, ) diff --git a/recommenders/models/rlrmc/RLRMCdataset.py b/recommenders/models/rlrmc/RLRMCdataset.py index 6b1329d1d..7670105b3 100644 --- a/recommenders/models/rlrmc/RLRMCdataset.py +++ b/recommenders/models/rlrmc/RLRMCdataset.py @@ -68,8 +68,8 @@ def _data_processing(self, train, validation=None, test=None, mean_center=True): """ # Data processing and reindexing code is adopted from https://github.com/Microsoft/Recommenders/blob/main/recommenders/models/ncf/dataset.py # If validation dataset is None - df = train if validation is None else train.append(validation) - df = df if test is None else df.append(test) + df = train if validation is None else pd.concat([train, validation]) + df = df if test is None else pd.concat([df, test]) # Reindex user and item index if self.user_idx is None: diff --git a/recommenders/models/tfidf/tfidf_utils.py b/recommenders/models/tfidf/tfidf_utils.py index 24575121c..6a6d22389 100644 --- a/recommenders/models/tfidf/tfidf_utils.py +++ b/recommenders/models/tfidf/tfidf_utils.py @@ -115,7 +115,7 @@ def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"): return df def tokenize_text( - self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0 + self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0.0 ): """Tokenize the input text. For more details on the TfidfVectorizer, see https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html @@ -124,7 +124,7 @@ def tokenize_text( df_clean (pandas.DataFrame): Dataframe with cleaned text in the new column. text_col (str): Name of column containing the cleaned text. ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted. - min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. + min_df (float): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. 
Returns: TfidfVectorizer, pandas.Series: diff --git a/setup.py b/setup.py index 758765690..c5fc49bb8 100644 --- a/setup.py +++ b/setup.py @@ -27,53 +27,45 @@ version += ".post" + str(int(time.time())) install_requires = [ - "numpy>=1.19", # 1.19 required by tensorflow 2.6 - "pandas>1.0.3,<2", - "scipy>=1.0.0,<1.11.0", # FIXME: We limit <1.11.0 until #1954 is fixed - "tqdm>=4.31.1,<5", - "matplotlib>=2.2.2,<4", - "scikit-learn>=0.22.1,<1.0.3", - "numba>=0.38.1,<1", - "lightfm>=1.15,<2", - "lightgbm>=2.2.1", - "memory_profiler>=0.54.0,<1", - "nltk>=3.4,<4", - "seaborn>=0.8.1,<1", - "transformers>=2.5.0,<5", - "category_encoders>=1.3.0,<2", - "jinja2>=2,<3.1", - "requests>=2.31.0,<3", - "cornac>=1.1.2,<1.15.2;python_version<='3.7'", - "cornac>=1.15.2,<2;python_version>='3.8'", # After 1.15.2, Cornac requires python 3.8 - "retrying>=1.3.3", + "category-encoders>=2.6.0,<3", # requires packaging + "cornac>=1.15.2,<2", # requires packaging, tqdm + "hyperopt>=0.2.7,<1", + "lightfm>=1.17,<2", # requires requests + "lightgbm>=4.0.0,<5", + "locust>=2.12.2,<3", # requires jinja2 + "memory-profiler>=0.61.0,<1", + "nltk>=3.8.1,<4", # requires tqdm + "notebook>=7.0.0,<8", # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests + "numba>=0.57.0,<1", + "pandas>2.0.0,<3.0.0", # requires numpy "pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'", # For generating fake datasets - "pandera[strategies]>=0.6.5;python_version>='3.9'", - "scikit-surprise>=1.0.6", - "hyperopt>=0.1.2,<1", - "ipykernel>=4.6.1,<7", - "jupyter>=1,<2", - "locust>=1,<2", + "pandera[strategies]>=0.15.0;python_version>='3.9'", + "retrying>=1.3.4,<2", + "scikit-learn>=1.2.0,<2", # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df + "scikit-surprise>=1.1.3", + "scipy>=1.10.1,<1.11.0", # FIXME: We limit <1.11.0 until #1954 is fixed + "seaborn>=0.13.0,<1", # requires matplotlib, packaging + "transformers>=4.27.0,<5", # requires packaging, pyyaml, requests, tqdm ] # shared dependencies extras_require = { "gpu": [ - "nvidia-ml-py3>=7.352.0", - "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<3", - "tf-slim>=1.1.0", - "torch>=1.13.1", # for CUDA 11 support - "fastai>=1.0.46,<2", + "fastai>=2.7.11,<3", + "nvidia-ml-py>=11.525.84", + "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<2.16", # Fixed TF due to constant security problems and breaking changes #2073 + "tf-slim>=1.1.0", # No python_requires in its setup.py + "torch>=2.0.1,<3", ], "spark": [ - "pyarrow>=0.12.1,<7.0.0", - "pyspark>=2.4.5,<3.3.0", + "pyarrow>=10.0.1", + "pyspark>=3.3.0,<=4", ], "dev": [ - "black>=18.6b4,<21", - "pytest>=3.6.4", - "pytest-cov>=2.12.1", - "pytest-mock>=3.6.1", # for access to mock fixtures in pytest - "packaging>=20.9", # for version comparison in test_dependency_security.py + "black>=23.3.0", + "pytest>=7.2.1", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", # for access to mock fixtures in pytest ], } # For the brave of heart @@ -117,6 +109,8 @@ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Operating System :: POSIX :: Linux", ], extras_require=extras_require, diff --git a/tests/README.md b/tests/README.md index 813b433ba..a6068daec 100644 --- a/tests/README.md +++ b/tests/README.md @@ -213,11 +213,26 @@ Then, follow the steps below to create the AzureML infrastructure: 2. 
Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new. - Select the CPU VM base. Anything above 64GB of RAM, and 8 cores should be fine. - Select the GPU VM base. Anything above 56GB of RAM, and 6 cores, and an NVIDIA K80 should be fine. -3. Add the subscription ID to GitHub action secrets [here](https://github.com/microsoft/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value. +3. Add the subscription ID to GitHub action secrets [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value. 4. Make sure you have installed [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), and that you are logged in: `az login`. 5. Select your subscription: `az account set -s $AZURE_SUBSCRIPTION_ID`. -5. Create a Service Principal: `az ad sp create-for-rbac --name "recommenders-cicd" --role contributor --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --sdk-auth`. -6. Add the output from the Service Principal (should be a JSON blob) as an action secret `AZUREML_TEST_CREDENTIALS`. +6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role contributor --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal: + ``` + { + "clientId": "XXXXXXXXXXXXXXXXXXXXX", + "clientSecret": "XXXXXXXXXXXXXXXXXXXXX", + "subscriptionId": "XXXXXXXXXXXXXXXXXXXXX", + "tenantId": "XXXXXXXXXXXXXXXXXXXXX", + "activeDirectoryEndpointUrl": "https://login.microsoftonline.com", + "resourceManagerEndpointUrl": "https://management.azure.com/", + "activeDirectoryGraphResourceId": "https://graph.windows.net/", + "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/", + "galleryEndpointUrl": "https://gallery.azure.com/", + "managementEndpointUrl": "https://management.core.windows.net/" + } + ``` +7. Add the output as a GitHub Actions secret named `AZUREML_TEST_CREDENTIALS` under the repository's **Settings > Security > Secrets and variables > Actions**. + ## How to execute tests in your local environment diff --git a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py index 86d0c80ab..adda7e172 100644 --- a/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py +++ b/tests/ci/azureml_tests/submit_groupwise_azureml_pytest.py @@ -29,11 +29,12 @@ Example: Usually, this script is run by a DevOps pipeline. It can also be run from cmd line.
- >>> python tests/ci/refac.py --clustername 'cluster-d3-v2' - --subid '12345678-9012-3456-abcd-123456789012' - --pr '666' - --reponame 'Recommenders' - --branch 'staging' + >>> python tests/ci/submit_groupwise_azureml_pytest.py \ + --clustername 'cluster-d3-v2' \ + --subid '12345678-9012-3456-abcd-123456789012' \ + --pr '666' \ + --reponame 'Recommenders' \ + --branch 'staging' """ import argparse import logging @@ -41,7 +42,7 @@ from azureml.core.authentication import AzureCliAuthentication from azureml.core import Workspace from azureml.core import Experiment -from azureml.core.runconfig import RunConfiguration +from azureml.core.runconfig import RunConfiguration, DockerConfiguration from azureml.core.conda_dependencies import CondaDependencies from azureml.core.script_run_config import ScriptRunConfig from azureml.core.compute import ComputeTarget, AmlCompute @@ -175,7 +176,6 @@ def create_run_config( run_azuremlcompute = RunConfiguration() run_azuremlcompute.target = cpu_cluster - run_azuremlcompute.environment.docker.enabled = True if not add_gpu_dependencies: # https://github.com/Azure/AzureML-Containers/blob/master/base/cpu/openmpi4.1.0-ubuntu22.04 run_azuremlcompute.environment.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04" @@ -197,12 +197,14 @@ def create_run_config( apt-get clean -y && \ rm -rf /var/lib/apt/lists/* # Conda Environment +# Pin pip=20.1.1 due to the issue: No module named 'ruamel' +# See https://learn.microsoft.com/en-us/python/api/overview/azure/ml/install?view=azure-ml-py#troubleshooting ENV MINICONDA_VERSION py38_23.3.1-0 ENV PATH /opt/miniconda/bin:$PATH ENV CONDA_PACKAGE 23.5.0 RUN wget -qO /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh && \ bash /tmp/miniconda.sh -bf -p /opt/miniconda && \ - conda install conda=${CONDA_PACKAGE} -y && \ + conda install -y conda=${CONDA_PACKAGE} python=3.8 pip=20.1.1 && \ conda update --all -c conda-forge -y && \ conda clean -ay && \ rm -rf /opt/miniconda/pkgs && \ @@ -290,8 +292,10 @@ def submit_experiment_to_azureml( source_directory=".", script=test, run_config=run_config, + docker_runtime_config=DockerConfiguration(use_docker=True), arguments=arguments, ) + run = experiment.submit(script_run_config) # waits only for configuration to complete run.wait_for_completion(show_output=True, wait_post_processing=True) diff --git a/tests/functional/examples/test_notebooks_gpu.py b/tests/functional/examples/test_notebooks_gpu.py index 2007cc1a7..05b53c68e 100644 --- a/tests/functional/examples/test_notebooks_gpu.py +++ b/tests/functional/examples/test_notebooks_gpu.py @@ -247,7 +247,9 @@ def test_wide_deep_functional( os.path.join("tests", "resources", "deeprec", "slirec"), 10, 400, - {"auc": 0.7183}, # Don't do logloss check as SLi-Rec uses ranking loss, not a point-wise loss + { + "auc": 0.7183 + }, # Don't do logloss check as SLi-Rec uses ranking loss, not a point-wise loss 42, ) ], @@ -278,7 +280,7 @@ def test_slirec_quickstart_functional( results = read_notebook(output_notebook) assert results["auc"] == pytest.approx(expected_values["auc"], rel=TOL, abs=ABS_TOL) - + @pytest.mark.gpu @pytest.mark.notebooks @@ -567,7 +569,7 @@ def test_dkn_quickstart_functional(notebooks, output_notebook, kernel_name): notebook_path, output_notebook, kernel_name=kernel_name, - parameters=dict(EPOCHS=5, BATCH_SIZE=500), + parameters=dict(EPOCHS=5, BATCH_SIZE=200), ) results = read_notebook(output_notebook) diff --git 
a/tests/unit/recommenders/evaluation/test_spark_evaluation.py b/tests/unit/recommenders/evaluation/test_spark_evaluation.py index 278a2e287..55c064e8b 100644 --- a/tests/unit/recommenders/evaluation/test_spark_evaluation.py +++ b/tests/unit/recommenders/evaluation/test_spark_evaluation.py @@ -5,7 +5,7 @@ import pytest import numpy as np import pandas as pd -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal from recommenders.evaluation.python_evaluation import ( precision_at_k,