From 0b1af830e6e145cff08af0ab4dbe97f51eaf8413 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 22 Oct 2024 18:43:28 +0800 Subject: [PATCH] Add cat test. --- python-package/xgboost/testing/data_iter.py | 72 +++++++++++ python-package/xgboost/testing/updater.py | 128 +++++++------------- tests/python-gpu/test_gpu_data_iterator.py | 29 +++++ tests/python/test_data_iterator.py | 24 ++++ 4 files changed, 169 insertions(+), 84 deletions(-) diff --git a/python-package/xgboost/testing/data_iter.py b/python-package/xgboost/testing/data_iter.py index e107557d3049..a26035d4d64e 100644 --- a/python-package/xgboost/testing/data_iter.py +++ b/python-package/xgboost/testing/data_iter.py @@ -1,10 +1,14 @@ """Tests related to the `DataIter` interface.""" +from typing import Callable, Optional + import numpy as np import xgboost from xgboost import testing as tm +from ..compat import concat + def run_mixed_sparsity(device: str) -> None: """Check QDM with mixed batches.""" @@ -33,3 +37,71 @@ def run_mixed_sparsity(device: str) -> None: Xy_1 = xgboost.QuantileDMatrix(X_arr, y_arr) assert tm.predictor_equal(Xy_0, Xy_1) + + +class CatIter(xgboost.DataIter): # pylint: disable=too-many-instance-attributes + """An iterator for testing categorical features.""" + + def __init__( # pylint: disable=too-many-arguments + self, + n_samples_per_batch: int, + n_features: int, + *, + n_batches: int, + n_cats: int, + sparsity: float, + onehot: bool, + device: str, + cache: Optional[str], + ) -> None: + super().__init__(cache_prefix=cache) + self.n_samples_per_batch = n_samples_per_batch + self.n_features = n_features + self.n_batches = n_batches + self.n_cats = n_cats + self.sparsity = sparsity + self.onehot = onehot + self.device = device + + xs, ys = [], [] + for i in range(n_batches): + cat, y = tm.make_categorical( + self.n_samples_per_batch, + self.n_features, + n_categories=self.n_cats, + onehot=self.onehot, + sparsity=self.sparsity, + random_state=self.n_samples_per_batch * self.n_features * i, + ) + xs.append(cat) + ys.append(y) + + self.xs = xs + self.ys = ys + + self.x = concat(xs) + self.y = concat(ys) + + self._it = 0 + + def xy(self) -> tuple: + """Return the concatenated data.""" + return self.x, self.y + + def next(self, input_data: Callable) -> bool: + if self._it == self.n_batches: + # return False to let XGBoost know this is the end of iteration + return False + X, y = self.xs[self._it], self.ys[self._it] + if self.device == "cuda": + import cudf # pylint: disable=import-error + import cupy # pylint: disable=import-error + + X = cudf.DataFrame(X) + y = cupy.array(y) + input_data(data=X, label=y) + self._it += 1 + return True + + def reset(self) -> None: + self._it = 0 diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index 20ec65dae5a1..8a85989f0bd6 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -2,7 +2,7 @@ import json from functools import partial, update_wrapper -from typing import Any, Callable, Dict, List +from typing import Any, Dict, List import numpy as np import pytest @@ -11,8 +11,8 @@ import xgboost.testing as tm from xgboost.data import is_pd_cat_dtype -from ..compat import concat from ..core import DataIter +from .data_iter import CatIter def get_basescore(model: xgb.XGBModel) -> float: @@ -206,18 +206,32 @@ def check_extmem_qdm( # pylint: disable=too-many-arguments n_bins: int, device: str, on_host: bool, + onehot: bool, + is_cat: bool, ) -> None: """Basic test for the `ExtMemQuantileDMatrix`.""" - it = tm.IteratorForTest( - *tm.make_batches( - n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu" - ), - cache="cache", - on_host=on_host, - ) + if is_cat: + it: DataIter = CatIter( + n_samples_per_batch=n_samples_per_batch, + n_features=n_features, + n_batches=n_batches, + n_cats=5, + sparsity=0.0, + onehot=onehot, + device=device, + cache="cache", + ) + else: + it = tm.IteratorForTest( + *tm.make_batches( + n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu" + ), + cache="cache", + on_host=on_host, + ) - Xy_it = xgb.ExtMemQuantileDMatrix(it, max_bin=n_bins) + Xy_it = xgb.ExtMemQuantileDMatrix(it, max_bin=n_bins, enable_categorical=is_cat) with pytest.raises(ValueError, match="Only the `hist`"): booster_it = xgb.train( {"device": device, "tree_method": "approx", "max_bin": n_bins}, @@ -228,13 +242,25 @@ def check_extmem_qdm( # pylint: disable=too-many-arguments booster_it = xgb.train( {"device": device, "max_bin": n_bins}, Xy_it, num_boost_round=8 ) - it = tm.IteratorForTest( - *tm.make_batches( - n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu" - ), - cache=None, - ) - Xy = xgb.QuantileDMatrix(it, max_bin=n_bins) + if is_cat: + it = CatIter( + n_samples_per_batch=n_samples_per_batch, + n_features=n_features, + n_batches=n_batches, + n_cats=5, + sparsity=0.0, + onehot=onehot, + device=device, + cache=None, + ) + else: + it = tm.IteratorForTest( + *tm.make_batches( + n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu" + ), + cache=None, + ) + Xy = xgb.QuantileDMatrix(it, max_bin=n_bins, enable_categorical=is_cat) booster = xgb.train({"device": device, "max_bin": n_bins}, Xy, num_boost_round=8) cut_it = Xy_it.get_quantile_cut() @@ -349,73 +375,6 @@ def check_get_quantile_cut(tree_method: str, device: str) -> None: USE_PART = 1 -class CatIter(DataIter): # pylint: disable=too-many-instance-attributes - """An iterator for testing categorical features.""" - - def __init__( # pylint: disable=too-many-arguments - self, - n_samples_per_batch: int, - n_features: int, - *, - n_batches: int, - n_cats: int, - sparsity: float, - onehot: bool, - device: str, - ) -> None: - super().__init__(cache_prefix="cache") - self.n_samples_per_batch = n_samples_per_batch - self.n_features = n_features - self.n_batches = n_batches - self.n_cats = n_cats - self.sparsity = sparsity - self.onehot = onehot - self.device = device - - xs, ys = [], [] - for i in range(n_batches): - cat, y = tm.make_categorical( - self.n_samples_per_batch, - self.n_features, - n_categories=self.n_cats, - onehot=self.onehot, - sparsity=self.sparsity, - random_state=self.n_samples_per_batch * self.n_features * i, - ) - xs.append(cat) - ys.append(y) - - self.xs = xs - self.ys = ys - - self.x = concat(xs) - self.y = concat(ys) - - self._it = 0 - - def xy(self) -> tuple: - """Return the concatenated data.""" - return self.x, self.y - - def next(self, input_data: Callable) -> bool: - if self._it == self.n_batches: - # return False to let XGBoost know this is the end of iteration - return False - X, y = self.xs[self._it], self.ys[self._it] - if self.device == "cuda": - import cudf # pylint: disable=import-error - import cupy # pylint: disable=import-error - - X = cudf.DataFrame(X) - y = cupy.array(y) - input_data(data=X, label=y) - self._it += 1 - return True - - def reset(self) -> None: - self._it = 0 - - def _create_dmatrix( # pylint: disable=too-many-arguments n_samples: int, n_features: int, @@ -437,6 +396,7 @@ def _create_dmatrix( # pylint: disable=too-many-arguments n_cats=n_cats, onehot=onehot, device=device, + cache="cache" if extmem else None, ) if extmem: if tree_method == "hist": diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py index d0f850d0db7a..c2bb90a71131 100644 --- a/tests/python-gpu/test_gpu_data_iterator.py +++ b/tests/python-gpu/test_gpu_data_iterator.py @@ -92,6 +92,35 @@ def test_extmem_qdm( n_bins=n_bins, device="cuda", on_host=on_host, + is_cat=False, + onehot=False, + ) + + +@given( + strategies.integers(1, 2048), + strategies.integers(1, 4), + strategies.integers(2, 16), + strategies.booleans(), + strategies.booleans(), +) +@settings(deadline=None, max_examples=10, print_blob=True) +def test_categorical_extmem_qdm( + n_samples_per_batch: int, + n_batches: int, + n_bins: int, + on_host: bool, + onehot: bool, +) -> None: + check_extmem_qdm( + n_samples_per_batch, + 4, + n_batches=n_batches, + n_bins=n_bins, + device="cuda", + on_host=on_host, + onehot=onehot, + is_cat=True, ) diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index d6c551dada1a..e8940b4de68b 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -328,6 +328,30 @@ def test_extmem_qdm( n_bins=n_bins, device="cpu", on_host=False, + onehot=False, + is_cat=False, + ) + + +@given( + strategies.integers(1, 4096), + strategies.integers(1, 4), + strategies.integers(2, 16), + strategies.booleans(), +) +@settings(deadline=None, max_examples=10, print_blob=True) +def test_categorical_extmem_qdm( + n_samples_per_batch: int, n_batches: int, n_bins: int, onehot: bool +) -> None: + check_extmem_qdm( + n_samples_per_batch, + 4, + n_batches=n_batches, + n_bins=n_bins, + device="cpu", + on_host=False, + onehot=onehot, + is_cat=True, )