From fd41e03320ce74ba55767774f6f3a39f777b226a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 23 Apr 2024 10:14:18 +0200 Subject: [PATCH] test(python): Update benchmark tests (#15825) --- .github/workflows/benchmark.yml | 27 --- .github/workflows/codecov.yml | 4 +- .github/workflows/test-python.yml | 4 +- docs/development/contributing/test.md | 8 +- py-polars/Makefile | 2 +- py-polars/pyproject.toml | 3 +- py-polars/tests/benchmark/__init__.py | 3 +- py-polars/tests/benchmark/conftest.py | 33 +-- .../tests/benchmark/data/groupby-datagen.R | 53 ---- py-polars/tests/benchmark/datagen_groupby.py | 193 +++++++++++++++ py-polars/tests/benchmark/test_filter.py | 16 +- py-polars/tests/benchmark/test_group_by.py | 89 +++---- py-polars/tests/benchmark/test_io.py | 23 ++ py-polars/tests/benchmark/test_release.py | 229 ------------------ py-polars/tests/unit/io/test_parquet.py | 20 ++ .../aggregation/__init__.py | 0 .../{ => aggregation}/test_aggregations.py | 34 ++- .../aggregation/test_horizontal.py | 0 .../aggregation/test_vertical.py | 20 ++ .../tests/unit/operations/test_group_by.py | 70 ++++++ py-polars/tests/unit/operations/test_join.py | 38 +++ py-polars/tests/unit/operations/test_sort.py | 12 + .../tests/unit/operations/test_window.py | 21 ++ 23 files changed, 484 insertions(+), 418 deletions(-) delete mode 100644 py-polars/tests/benchmark/data/groupby-datagen.R create mode 100644 py-polars/tests/benchmark/datagen_groupby.py create mode 100644 py-polars/tests/benchmark/test_io.py delete mode 100644 py-polars/tests/benchmark/test_release.py rename py-polars/tests/unit/{functions => operations}/aggregation/__init__.py (100%) rename py-polars/tests/unit/operations/{ => aggregation}/test_aggregations.py (94%) rename py-polars/tests/unit/{functions => operations}/aggregation/test_horizontal.py (100%) rename py-polars/tests/unit/{functions => operations}/aggregation/test_vertical.py (75%) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a291dd5cbe5e..1f8d27ee4735 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -48,33 +48,6 @@ jobs: working-directory: py-polars run: pip install -r requirements-dev.txt - - name: Load benchmark data from cache - id: cache-data - uses: actions/cache/restore@v4 - with: - path: py-polars/tests/benchmark/data/G1_1e7_1e2_5_0.csv - key: benchmark-data - - - name: Set up R - if: steps.cache-data.outputs.cache-hit != 'true' - uses: r-lib/actions/setup-r@v2 - with: - r-version: '4.3.3' - - - name: Generate data - if: steps.cache-data.outputs.cache-hit != 'true' - working-directory: py-polars/tests/benchmark/data - run: | - Rscript -e 'install.packages("data.table", repos="https://cloud.r-project.org")' - Rscript groupby-datagen.R 1e7 1e2 5 0 - - - name: Save benchmark data in cache - if: github.ref_name == 'main' - uses: actions/cache/save@v4 - with: - path: py-polars/tests/benchmark/data/G1_1e7_1e2_5_0.csv - key: ${{ steps.cache-data.outputs.cache-primary-key }} - - name: Set up Rust run: rustup show diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index d5b937c2b1a6..65cf0cba5e8e 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -113,14 +113,14 @@ jobs: - name: Run Python tests working-directory: py-polars - run: pytest --cov -n auto --dist loadgroup -m "not benchmark and not docs" --cov-report xml:main.xml + run: pytest --cov -n auto --dist loadgroup -m "not release and not benchmark and not docs" --cov-report xml:main.xml 
continue-on-error: true - name: Run Python tests - async reader working-directory: py-polars env: POLARS_FORCE_ASYNC: 1 - run: pytest --cov -m "not benchmark and not docs" tests/unit/io/ --cov-report xml:async.xml + run: pytest --cov -m "not release and not benchmark and not docs" tests/unit/io/ --cov-report xml:async.xml continue-on-error: true - name: Report Rust coverage diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml index 0b2a10f38e6b..1646e5bc5032 100644 --- a/.github/workflows/test-python.yml +++ b/.github/workflows/test-python.yml @@ -89,13 +89,13 @@ jobs: # Currently skipped due to performance issues in coverage: # https://github.com/nedbat/coveragepy/issues/1665 COV: ${{ !(matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12') && '--cov' || '--no-cov' }} - run: pytest $COV -n auto --dist loadgroup -m "not benchmark and not docs" + run: pytest $COV -n auto --dist loadgroup -m "not release and not benchmark and not docs" - name: Run tests async reader tests if: github.ref_name != 'main' && matrix.os != 'windows-latest' env: POLARS_FORCE_ASYNC: 1 - run: pytest -m "not benchmark and not docs" tests/unit/io/ + run: pytest -m "not release and not benchmark and not docs" tests/unit/io/ - name: Check import without optional dependencies if: github.ref_name != 'main' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest' diff --git a/docs/development/contributing/test.md b/docs/development/contributing/test.md index e2ee89c221b6..9aa245a1fd6c 100644 --- a/docs/development/contributing/test.md +++ b/docs/development/contributing/test.md @@ -102,12 +102,10 @@ Polars uses [CodSpeed](https://codspeed.io/pola-rs/polars) for tracking the perf ### Generating data -For many tests, a relatively large dataset must be generated first. -We use an [R](https://www.r-project.org/) script to generate this data. -The script was taken from the [H2O AI database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests. +For most tests, a relatively large dataset must be generated first. +This is done as part of the `pytest` setup process. -For the exact steps to generate the data, please refer to the [benchmark workflow](https://github.com/pola-rs/polars/blob/main/.github/workflows/benchmark.yml). -It involves [installing R](https://cran.r-project.org/), installing the [data.table](https://cran.r-project.org/web/packages/data.table/) dependency, and executing a data generation script. +The data generation logic was taken from the [H2O.ai database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests. ### Running the benchmark tests diff --git a/py-polars/Makefile b/py-polars/Makefile index 93fad3c00931..c8dd4707155d 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -100,7 +100,7 @@ test-all: .venv build ## Run all tests .PHONY: coverage coverage: .venv build ## Run tests and report coverage - $(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not benchmark" + $(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not release and not benchmark" .PHONY: clean clean: ## Clean up caches and build artifacts diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 08a325f3f1a9..2031f8ba66ab 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -206,12 +206,13 @@ addopts = [ "--strict-markers", "--import-mode=importlib", # Default to running fast tests only. 
To run ALL tests, run: pytest -m "" - "-m not slow and not write_disk and not benchmark and not hypothesis and not docs", + "-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark", ] markers = [ "slow: Tests with a longer than average runtime.", "write_disk: Tests that write to disk", "debug: Tests that should be run on a Polars debug build.", + "release: Tests that should be run on a Polars release build.", "docs: Documentation code snippets", ] filterwarnings = [ diff --git a/py-polars/tests/benchmark/__init__.py b/py-polars/tests/benchmark/__init__.py index 757f2b70de88..643565f66680 100644 --- a/py-polars/tests/benchmark/__init__.py +++ b/py-polars/tests/benchmark/__init__.py @@ -1,7 +1,8 @@ """ Benchmark tests. -These tests are skipped by default as a large dataset must be generated first. +These tests are skipped by default as a relatively large dataset must be generated +first. See the documentation on how to run these tests: https://docs.pola.rs/development/contributing/test/#benchmark-tests diff --git a/py-polars/tests/benchmark/conftest.py b/py-polars/tests/benchmark/conftest.py index efdc821f8f83..ca8ffb88b298 100644 --- a/py-polars/tests/benchmark/conftest.py +++ b/py-polars/tests/benchmark/conftest.py @@ -1,34 +1,9 @@ -from pathlib import Path - import pytest import polars as pl +from tests.benchmark.datagen_groupby import generate_group_by_data -@pytest.fixture(scope="module") -def data_path() -> Path: - return Path(__file__).parent / "data" - - -@pytest.fixture(scope="module") -def h2aoi_groupby_data_path(data_path: Path) -> Path: - return data_path / "G1_1e7_1e2_5_0.csv" - - -@pytest.fixture(scope="module") -def h2oai_groupby_data(h2aoi_groupby_data_path: Path) -> pl.DataFrame: - if not h2aoi_groupby_data_path.is_file(): - pytest.skip("Dataset must be generated before running this test.") - - df = pl.read_csv( - h2aoi_groupby_data_path, - dtypes={ - "id4": pl.Int32, - "id5": pl.Int32, - "id6": pl.Int32, - "v1": pl.Int32, - "v2": pl.Int32, - "v3": pl.Float64, - }, - ) - return df +@pytest.fixture(scope="session") +def groupby_data() -> pl.DataFrame: + return generate_group_by_data(10_000, 100, null_ratio=0.05) diff --git a/py-polars/tests/benchmark/data/groupby-datagen.R b/py-polars/tests/benchmark/data/groupby-datagen.R deleted file mode 100644 index b5303678913d..000000000000 --- a/py-polars/tests/benchmark/data/groupby-datagen.R +++ /dev/null @@ -1,53 +0,0 @@ -# Rscript groupby-datagen.R 1e7 1e2 0 0 ## 1e7 rows, 1e2 K, 0% NAs, random order -# Rscript groupby-datagen.R 1e8 1e1 5 1 ## 1e8 rows, 10 K, 5% NAs, sorted order -args = commandArgs(TRUE) - -pretty_sci = function(x) { - tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] - if(length(tmp)==1L) { - paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) - } else if(length(tmp)==2L){ - paste0(tmp[1L], as.character(as.integer(tmp[2L]))) - } -} - -library(data.table) -N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) -stopifnot(nas<=100L, nas>=0L, sort%in%c(0L,1L)) -set.seed(108) -cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort)) -DT = list() -DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char) -DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # small groups (char) -DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # large groups (char) -DT[["id4"]] = sample(K, N, TRUE) # large groups (int) -DT[["id5"]] = sample(K, N, 
TRUE) # small groups (int) -DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int) -DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5] -DT[["v2"]] = sample(15, N, TRUE) # int in range [1,15] -DT[["v3"]] = round(runif(N,max=100),6) # numeric e.g. 23.574912 -setDT(DT) -if (nas>0L) { - cat("Inputting NAs\n") - for (col in paste0("id",1:6)) { - ucol = unique(DT[[col]]) - nna = as.integer(length(ucol) * (nas/100)) - if (nna) - set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) - rm(ucol) - } - nna = as.integer(nrow(DT) * (nas/100)) - if (nna) { - for (col in paste0("v",1:3)) - set(DT, sample(nrow(DT), nna), col, NA) - } -} -if (sort==1L) { - cat("Sorting data\n") - setkeyv(DT, paste0("id", 1:6)) -} -file = sprintf("G1_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort) -cat(sprintf("Writing data to %s\n", file)) -fwrite(DT, file) -cat(sprintf("Data written to %s, quitting\n", file)) -if (!interactive()) quit("no", status=0) diff --git a/py-polars/tests/benchmark/datagen_groupby.py b/py-polars/tests/benchmark/datagen_groupby.py new file mode 100644 index 000000000000..782885cf5ec2 --- /dev/null +++ b/py-polars/tests/benchmark/datagen_groupby.py @@ -0,0 +1,193 @@ +""" +Script to generate data for benchmarking group-by operations. + +Data generation logic was adapted from the H2O.ai benchmark. +The original R script is located here: +https://github.com/h2oai/db-benchmark/blob/master/_data/groupby-datagen.R + +Examples +-------- +10 million rows, 100 groups, no nulls, random order: +$ python datagen_groupby.py 1e7 1e2 --null-percentage 0 + +100 million rows, 10 groups, 5% nulls, sorted: +$ python datagen_groupby.py 1e8 1e1 --null-percentage 5 --sorted +""" + +import argparse +import logging + +import numpy as np +from numpy.random import default_rng + +import polars as pl + +logging.basicConfig(level=logging.INFO) + +SEED = 0 +rng = default_rng(seed=SEED) + +__all__ = ["generate_group_by_data"] + + +def generate_group_by_data( + n_rows: int, n_groups: int, null_ratio: float = 0.0, *, sort: bool = False +) -> pl.DataFrame: + """Generate data for benchmarking group-by operations.""" + logging.info("Generating data...") + df = _generate_data(n_rows, n_groups) + + if null_ratio > 0.0: + logging.info("Setting nulls...") + df = _set_nulls(df, null_ratio) + + if sort: + logging.info("Sorting data...") + df = df.sort(c for c in df.columns if c.startswith("id")) + + logging.info("Done generating data.") + return df + + +def _generate_data(n_rows: int, n_groups: int) -> pl.DataFrame: + N = n_rows + K = n_groups + + group_str_small = [f"id{str(i).zfill(3)}" for i in range(1, K + 1)] + group_str_large = [f"id{str(i).zfill(10)}" for i in range(1, int(N / K) + 1)] + group_int_small = range(1, K + 1) + group_int_large = range(1, int(N / K) + 1) + + var_int_small = range(1, 6) + var_int_large = range(1, 16) + var_float = rng.uniform(0, 100, N) + + return pl.DataFrame( + { + "id1": rng.choice(group_str_small, N), + "id2": rng.choice(group_str_small, N), + "id3": rng.choice(group_str_large, N), + "id4": rng.choice(group_int_small, N), + "id5": rng.choice(group_int_small, N), + "id6": rng.choice(group_int_large, N), + "v1": rng.choice(var_int_small, N), + "v2": rng.choice(var_int_large, N), + "v3": np.round(var_float, 6), + }, + schema={ + "id1": pl.String, + "id2": pl.String, + "id3": pl.String, + "id4": pl.Int32, + "id5": pl.Int32, + "id6": pl.Int32, + "v1": pl.Int32, + "v2": pl.Int32, + "v3": pl.Float64, + }, + ) + + +def _set_nulls(df: pl.DataFrame, null_ratio: float) -> pl.DataFrame: + 
"""Set null values according to the given ratio.""" + + def set_nulls_var(s: pl.Series, ratio: float) -> pl.Series: + """Set Series values to null according to the given ratio.""" + len = s.len() + n_null = int(ratio * len) + if n_null == 0: + return s + + indices = rng.choice(len, size=n_null, replace=False) + return s.scatter(indices, None) + + def set_nulls_group(s: pl.Series, ratio: float) -> pl.Series: + """Set Series unique values to null according to the given ratio.""" + uniques = s.unique() + n_null = int(ratio * uniques.len()) + if n_null == 0: + return s + + to_replace = rng.choice(uniques, size=n_null, replace=False) + return ( + s.to_frame() + .select( + pl.when(pl.col(s.name).is_in(to_replace)) + .then(None) + .otherwise(pl.col(s.name)) + .alias(s.name) + ) + .to_series() + ) + + return df.with_columns( + set_nulls_group(s, null_ratio) + if s.name.startswith("id") + else set_nulls_var(s, null_ratio) + for s in df.get_columns() + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate data for benchmarking group-by operations" + ) + + parser.add_argument("rows", type=float, help="Number of rows") + parser.add_argument("groups", type=float, help="Number of groups") + parser.add_argument( + "-n", + "--null-percentage", + type=int, + default=0, + choices=range(1, 101), + metavar="[0-100]", + help="Percentage of null values", + ) + parser.add_argument( + "-s", + "--sort", + action="store_true", + help="Sort the data by group", + ) + parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="Output filename", + ) + + args = parser.parse_args() + + # Convert arguments to appropriate types + n_rows = int(args.rows) + n_groups = int(args.groups) + null_ratio = args.null_percentage / 100 + sort = args.sort + + logging.info( + f"Generating data: {n_rows} rows, {n_groups} groups, {null_ratio} null ratio, sorted: {args.sort}" + ) + + df = generate_group_by_data(n_rows, n_groups, null_ratio=null_ratio, sort=sort) + write_data(df, args) + + +def write_data(df: pl.DataFrame, args: argparse.Namespace) -> None: + def format_int(i: int) -> str: + base, exp = f"{i:e}".split("e") + return f"{float(base):g}e{int(exp)}" + + if args.output is not None: + filename = args.output + else: + filename = f"G1_{format_int(args.rows)}_{format_int(args.groups)}_{args.null_percentage}_{int(args.sort)}.csv" + + logging.info(f"Writing data to {filename}") + df.write_csv(filename) + logging.info("Done writing data.") + + +if __name__ == "__main__": + main() diff --git a/py-polars/tests/benchmark/test_filter.py b/py-polars/tests/benchmark/test_filter.py index 0923c791df7e..047404f01bb1 100644 --- a/py-polars/tests/benchmark/test_filter.py +++ b/py-polars/tests/benchmark/test_filter.py @@ -9,9 +9,9 @@ pytestmark = pytest.mark.benchmark() -def test_filter1(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_filter1(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .filter(pl.col("id1").eq_missing(pl.lit("id046"))) .select( pl.col("id6").cast(pl.Int64).sum(), @@ -19,13 +19,11 @@ def test_filter1(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result["id6"].item() == 4_762_459_723 - assert result["v3"].item() == pytest.approx(4766795.205196999) -def test_filter2(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_filter2(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .filter(~(pl.col("id1").eq_missing(pl.lit("id046")))) .select( 
pl.col("id6").cast(pl.Int64).sum(), @@ -33,5 +31,3 @@ def test_filter2(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result["id6"].item() == 470_453_297_090 - assert result["v3"].item() == pytest.approx(470202778.84258103) diff --git a/py-polars/tests/benchmark/test_group_by.py b/py-polars/tests/benchmark/test_group_by.py index d29d716deede..7547b13b5e62 100644 --- a/py-polars/tests/benchmark/test_group_by.py +++ b/py-polars/tests/benchmark/test_group_by.py @@ -1,7 +1,7 @@ """ -Benchmark tests for the group by operation. +Benchmark tests for the group-by operation. -These tests are based on the H2O AI database benchmark. +These tests are based on the H2O.ai database benchmark. See: https://h2oai.github.io/db-benchmark/ @@ -16,35 +16,31 @@ pytestmark = pytest.mark.benchmark() -def test_h2oai_groupby_q1(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q1(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id1") .agg( pl.sum("v1").alias("v1_sum"), ) .collect() ) - assert result.shape == (96, 2) - assert result["v1_sum"].sum() == 28498857 -def test_h2oai_groupby_q2(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q2(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id1", "id2") .agg( pl.sum("v1").alias("v1_sum"), ) .collect() ) - assert result.shape == (9216, 3) - assert result["v1_sum"].sum() == 28498857 -def test_h2oai_groupby_q3(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q3(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id3") .agg( pl.sum("v1").alias("v1_sum"), @@ -52,14 +48,11 @@ def test_h2oai_groupby_q3(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (95001, 3) - assert result["v1_sum"].sum() == 28498857 - assert result["v3_mean"].sum() == pytest.approx(4749467.631946) -def test_h2oai_groupby_q4(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q4(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id4") .agg( pl.mean("v1").alias("v1_mean"), @@ -68,15 +61,11 @@ def test_h2oai_groupby_q4(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (96, 4) - assert result["v1_mean"].sum() == pytest.approx(287.989430) - assert result["v2_mean"].sum() == pytest.approx(767.852921) - assert result["v3_mean"].sum() == pytest.approx(4799.873270) -def test_h2oai_groupby_q5(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q5(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id6") .agg( pl.sum("v1").alias("v1_sum"), @@ -85,15 +74,11 @@ def test_h2oai_groupby_q5(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (95001, 4) - assert result["v1_sum"].sum() == 28498857 - assert result["v2_sum"].sum() == 75988394 - assert result["v3_sum"].sum() == pytest.approx(474969574.047777) -def test_h2oai_groupby_q6(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q6(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id4", "id5") .agg( pl.median("v3").alias("v3_median"), @@ -101,49 +86,40 @@ def test_h2oai_groupby_q6(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (9216, 4) - 
assert result["v3_median"].sum() == pytest.approx(460771.216444) - assert result["v3_std"].sum() == pytest.approx(266006.904622) -def test_h2oai_groupby_q7(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q7(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id3") .agg((pl.max("v1") - pl.min("v2")).alias("range_v1_v2")) .collect() ) - assert result.shape == (95001, 2) - assert result["range_v1_v2"].sum() == 379850 -def test_h2oai_groupby_q8(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q8(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .drop_nulls("v3") .group_by("id6") .agg(pl.col("v3").top_k(2).alias("largest2_v3")) .explode("largest2_v3") .collect() ) - assert result.shape == (190002, 2) - assert result["largest2_v3"].sum() == pytest.approx(18700554.779632) -def test_h2oai_groupby_q9(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q9(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id2", "id4") .agg((pl.corr("v1", "v2") ** 2).alias("r2")) .collect() ) - assert result.shape == (9216, 3) - assert result["r2"].sum() == pytest.approx(9.940515) -def test_h2oai_groupby_q10(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q10(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id1", "id2", "id3", "id4", "id5", "id6") .agg( pl.sum("v3").alias("v3_sum"), @@ -151,4 +127,3 @@ def test_h2oai_groupby_q10(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (9999993, 8) diff --git a/py-polars/tests/benchmark/test_io.py b/py-polars/tests/benchmark/test_io.py new file mode 100644 index 000000000000..44e165a61a03 --- /dev/null +++ b/py-polars/tests/benchmark/test_io.py @@ -0,0 +1,23 @@ +"""Benchmark tests for the I/O operations.""" + +from pathlib import Path + +import pytest + +import polars as pl + +pytestmark = pytest.mark.benchmark() + + +def test_write_read_scan_large_csv(groupby_data: pl.DataFrame, tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + data_path = tmp_path / "data.csv" + groupby_data.write_csv(data_path) + + predicate = pl.col("v2") < 5 + + shape_eager = pl.read_csv(data_path).filter(predicate).shape + shape_lazy = pl.scan_csv(data_path).filter(predicate).collect().shape + + assert shape_lazy == shape_eager diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py deleted file mode 100644 index 03882c4a7940..000000000000 --- a/py-polars/tests/benchmark/test_release.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Various benchmark tests. - -Tests in this module will be run in the CI using a release build of Polars. 
- -To run these tests: pytest -m release -""" - -import time -from pathlib import Path -from typing import cast - -import numpy as np -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - -# Mark all tests in this module as benchmark tests -pytestmark = pytest.mark.benchmark() - - -def test_read_scan_large_csv(h2aoi_groupby_data_path: Path) -> None: - if not h2aoi_groupby_data_path.is_file(): - pytest.skip("Dataset must be generated before running this test.") - - predicate = pl.col("v2") < 5 - - shape_eager = pl.read_csv(h2aoi_groupby_data_path).filter(predicate).shape - shape_lazy = ( - (pl.scan_csv(h2aoi_groupby_data_path).filter(predicate)).collect().shape - ) - - assert shape_lazy == shape_eager - - -def test_sort_nan_1942() -> None: - # https://github.com/pola-rs/polars/issues/1942 - t0 = time.time() - pl.repeat(float("nan"), 2 << 12, eager=True).sort() - assert (time.time() - t0) < 1 - - -def test_mean_overflow() -> None: - np.random.seed(1) - expected = 769.5607652 - - df = pl.DataFrame(np.random.randint(500, 1040, 5000000), schema=["value"]) - - result = df.with_columns(pl.mean("value"))[0, 0] - assert np.isclose(result, expected) - - result = df.with_columns(pl.col("value").cast(pl.Int32)).with_columns( - pl.mean("value") - )[0, 0] - assert np.isclose(result, expected) - - result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean() - assert np.isclose(result, expected) - - -def test_min_max_2850() -> None: - # https://github.com/pola-rs/polars/issues/2850 - df = pl.DataFrame( - { - "id": [ - 130352432, - 130352277, - 130352611, - 130352833, - 130352305, - 130352258, - 130352764, - 130352475, - 130352368, - 130352346, - ] - } - ) - - minimum = 130352258 - maximum = 130352833.0 - - for _ in range(10): - permuted = df.sample(fraction=1.0, seed=0) - computed = permuted.select( - [pl.col("id").min().alias("min"), pl.col("id").max().alias("max")] - ) - assert cast(int, computed[0, "min"]) == minimum - assert cast(float, computed[0, "max"]) == maximum - - -def test_windows_not_cached() -> None: - ldf = ( - pl.DataFrame( - [ - pl.Series("key", ["a", "a", "b", "b"]), - pl.Series("val", [2, 2, 1, 3]), - ] - ) - .lazy() - .filter( - (pl.col("key").cum_count().over("key") == 1) - | (pl.col("val").shift(1).over("key").is_not_null()) - | (pl.col("val") != pl.col("val").shift(1).over("key")) - ) - ) - # this might fail if they are cached - for _ in range(1000): - ldf.collect() - - -def test_cross_join() -> None: - # triggers > 100 rows implementation - # https://github.com/pola-rs/polars/blob/5f5acb2a523ce01bc710768b396762b8e69a9e07/polars/polars-core/src/frame/cross_join.rs#L34 - df1 = pl.DataFrame({"col1": ["a"], "col2": ["d"]}) - df2 = pl.DataFrame({"frame2": pl.arange(0, 100, eager=True)}) - out = df2.join(df1, how="cross") - df2 = pl.DataFrame({"frame2": pl.arange(0, 101, eager=True)}) - assert_frame_equal(df2.join(df1, how="cross").slice(0, 100), out) - - -def test_cross_join_slice_pushdown() -> None: - # this will likely go out of memory if we did not pushdown the slice - df = ( - pl.Series("x", pl.arange(0, 2**16 - 1, eager=True, dtype=pl.UInt16) % 2**15) - ).to_frame() - - result = df.lazy().join(df.lazy(), how="cross", suffix="_").slice(-5, 10).collect() - expected = pl.DataFrame( - { - "x": [32766, 32766, 32766, 32766, 32766], - "x_": [32762, 32763, 32764, 32765, 32766], - }, - schema={"x": pl.UInt16, "x_": pl.UInt16}, - ) - assert_frame_equal(result, expected) - - result = df.lazy().join(df.lazy(), how="cross", 
suffix="_").slice(2, 10).collect() - expected = pl.DataFrame( - { - "x": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - "x_": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], - }, - schema={"x": pl.UInt16, "x_": pl.UInt16}, - ) - - -def test_max_statistic_parquet_writer() -> None: - # this hits the maximal page size - # so the row group will be split into multiple pages - # the page statistics need to be correctly reduced - # for this query to make sense - n = 150_000 - - # int64 is important to hit the page size - df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() - f = "/tmp/tmp.parquet" - df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) - result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() - expected = pl.DataFrame({"int": [149998, 149999]}) - assert_frame_equal(result, expected) - - -def test_boolean_min_max_agg() -> None: - np.random.seed(0) - idx = np.random.randint(0, 500, 1000) - c = np.random.randint(0, 500, 1000) > 250 - - df = pl.DataFrame({"idx": idx, "c": c}) - aggs = [pl.col("c").min().alias("c_min"), pl.col("c").max().alias("c_max")] - - result = df.group_by("idx").agg(aggs).sum() - - schema = {"idx": pl.Int64, "c_min": pl.UInt32, "c_max": pl.UInt32} - expected = pl.DataFrame( - { - "idx": [107583], - "c_min": [120], - "c_max": [321], - }, - schema=schema, - ) - assert_frame_equal(result, expected) - - nulls = np.random.randint(0, 500, 1000) < 100 - - result = ( - df.with_columns(c=pl.when(pl.lit(nulls)).then(None).otherwise(pl.col("c"))) - .group_by("idx") - .agg(aggs) - .sum() - ) - - expected = pl.DataFrame( - { - "idx": [107583], - "c_min": [133], - "c_max": [276], - }, - schema=schema, - ) - assert_frame_equal(result, expected) - - -def test_categorical_vs_str_group_by() -> None: - # this triggers the perfect hash table - s = pl.Series("a", np.random.randint(0, 50, 100)) - s_with_nulls = pl.select( - pl.when(s < 3).then(None).otherwise(s).alias("a") - ).to_series() - - for s_ in [s, s_with_nulls]: - s_ = s_.cast(str) - cat_out = ( - s_.cast(pl.Categorical) - .to_frame("a") - .group_by("a") - .agg(pl.first().alias("first")) - ) - - str_out = s_.to_frame("a").group_by("a").agg(pl.first().alias("first")) - cat_out.with_columns(pl.col("a").cast(str)) - assert_frame_equal( - cat_out.with_columns( - pl.col("a").cast(str), pl.col("first").cast(pl.List(str)) - ).sort("a"), - str_out.sort("a"), - ) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 71e328198a80..9ace31e42ccf 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -835,3 +835,23 @@ def test_read_parquet_only_loads_selected_columns_15098( # Only one column's worth of memory should be used; 2 columns would be # 16_000_000 at least, but there's some overhead. 
assert 8_000_000 < memory_usage_without_pyarrow.get_peak() < 13_000_000 + + +@pytest.mark.release() +@pytest.mark.write_disk() +def test_max_statistic_parquet_writer(tmp_path: Path) -> None: + # this hits the maximal page size + # so the row group will be split into multiple pages + # the page statistics need to be correctly reduced + # for this query to make sense + n = 150_000 + + tmp_path.mkdir(exist_ok=True) + + # int64 is important to hit the page size + df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() + f = tmp_path / "tmp.parquet" + df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) + result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() + expected = pl.DataFrame({"int": [149998, 149999]}) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/functions/aggregation/__init__.py b/py-polars/tests/unit/operations/aggregation/__init__.py similarity index 100% rename from py-polars/tests/unit/functions/aggregation/__init__.py rename to py-polars/tests/unit/operations/aggregation/__init__.py diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/aggregation/test_aggregations.py similarity index 94% rename from py-polars/tests/unit/operations/test_aggregations.py rename to py-polars/tests/unit/operations/aggregation/test_aggregations.py index ca3153cdfc1b..7fcdb30e023b 100644 --- a/py-polars/tests/unit/operations/test_aggregations.py +++ b/py-polars/tests/unit/operations/aggregation/test_aggregations.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import numpy as np import pytest @@ -538,3 +538,35 @@ def test_group_count_over_null_column_15705() -> None: ) out = df.group_by("a", maintain_order=True).agg(pl.col("c").count()) assert out["c"].to_list() == [0, 0, 0] + + +@pytest.mark.release() +def test_min_max_2850() -> None: + # https://github.com/pola-rs/polars/issues/2850 + df = pl.DataFrame( + { + "id": [ + 130352432, + 130352277, + 130352611, + 130352833, + 130352305, + 130352258, + 130352764, + 130352475, + 130352368, + 130352346, + ] + } + ) + + minimum = 130352258 + maximum = 130352833.0 + + for _ in range(10): + permuted = df.sample(fraction=1.0, seed=0) + computed = permuted.select( + [pl.col("id").min().alias("min"), pl.col("id").max().alias("max")] + ) + assert cast(int, computed[0, "min"]) == minimum + assert cast(float, computed[0, "max"]) == maximum diff --git a/py-polars/tests/unit/functions/aggregation/test_horizontal.py b/py-polars/tests/unit/operations/aggregation/test_horizontal.py similarity index 100% rename from py-polars/tests/unit/functions/aggregation/test_horizontal.py rename to py-polars/tests/unit/operations/aggregation/test_horizontal.py diff --git a/py-polars/tests/unit/functions/aggregation/test_vertical.py b/py-polars/tests/unit/operations/aggregation/test_vertical.py similarity index 75% rename from py-polars/tests/unit/functions/aggregation/test_vertical.py rename to py-polars/tests/unit/operations/aggregation/test_vertical.py index 8e232ba33382..26f01dacc3d2 100644 --- a/py-polars/tests/unit/functions/aggregation/test_vertical.py +++ b/py-polars/tests/unit/operations/aggregation/test_vertical.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import pytest import polars as pl @@ -54,3 +55,22 @@ def test_alias_for_col_agg(function: str, input: str) -> None: expected = 
getattr(pl.col(input), function)() # e.g. pl.col(input).min() context = pl.DataFrame({"a": [1, 4], "b": [3, 2]}) assert_expr_equal(result, expected, context) + + +@pytest.mark.release() +def test_mean_overflow() -> None: + np.random.seed(1) + expected = 769.5607652 + + df = pl.DataFrame(np.random.randint(500, 1040, 5000000), schema=["value"]) + + result = df.with_columns(pl.mean("value"))[0, 0] + assert np.isclose(result, expected) + + result = df.with_columns(pl.col("value").cast(pl.Int32)).with_columns( + pl.mean("value") + )[0, 0] + assert np.isclose(result, expected) + + result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean() + assert np.isclose(result, expected) diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index 82a24afaaef4..51127cf2c6b2 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta from typing import TYPE_CHECKING, Any +import numpy as np import pytest import polars as pl @@ -990,3 +991,72 @@ def test_aggregated_scalar_elementwise_15602() -> None: def test_group_by_multiple_null_cols_15623() -> None: df = pl.DataFrame(schema={"a": pl.Null, "b": pl.Null}).group_by(pl.all()).len() assert df.is_empty() + + +@pytest.mark.release() +def test_categorical_vs_str_group_by() -> None: + # this triggers the perfect hash table + s = pl.Series("a", np.random.randint(0, 50, 100)) + s_with_nulls = pl.select( + pl.when(s < 3).then(None).otherwise(s).alias("a") + ).to_series() + + for s_ in [s, s_with_nulls]: + s_ = s_.cast(str) + cat_out = ( + s_.cast(pl.Categorical) + .to_frame("a") + .group_by("a") + .agg(pl.first().alias("first")) + ) + + str_out = s_.to_frame("a").group_by("a").agg(pl.first().alias("first")) + cat_out.with_columns(pl.col("a").cast(str)) + assert_frame_equal( + cat_out.with_columns( + pl.col("a").cast(str), pl.col("first").cast(pl.List(str)) + ).sort("a"), + str_out.sort("a"), + ) + + +@pytest.mark.release() +def test_boolean_min_max_agg() -> None: + np.random.seed(0) + idx = np.random.randint(0, 500, 1000) + c = np.random.randint(0, 500, 1000) > 250 + + df = pl.DataFrame({"idx": idx, "c": c}) + aggs = [pl.col("c").min().alias("c_min"), pl.col("c").max().alias("c_max")] + + result = df.group_by("idx").agg(aggs).sum() + + schema = {"idx": pl.Int64, "c_min": pl.UInt32, "c_max": pl.UInt32} + expected = pl.DataFrame( + { + "idx": [107583], + "c_min": [120], + "c_max": [321], + }, + schema=schema, + ) + assert_frame_equal(result, expected) + + nulls = np.random.randint(0, 500, 1000) < 100 + + result = ( + df.with_columns(c=pl.when(pl.lit(nulls)).then(None).otherwise(pl.col("c"))) + .group_by("idx") + .agg(aggs) + .sum() + ) + + expected = pl.DataFrame( + { + "idx": [107583], + "c_min": [133], + "c_max": [276], + }, + schema=schema, + ) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 89c7ef6c917b..020882792b7a 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -869,3 +869,41 @@ def test_join_4_columns_with_validity() -> None: 115, 4, ) + + +@pytest.mark.release() +def test_cross_join() -> None: + # triggers > 100 rows implementation + # https://github.com/pola-rs/polars/blob/5f5acb2a523ce01bc710768b396762b8e69a9e07/polars/polars-core/src/frame/cross_join.rs#L34 + df1 = pl.DataFrame({"col1": ["a"], "col2": 
["d"]}) + df2 = pl.DataFrame({"frame2": pl.arange(0, 100, eager=True)}) + out = df2.join(df1, how="cross") + df2 = pl.DataFrame({"frame2": pl.arange(0, 101, eager=True)}) + assert_frame_equal(df2.join(df1, how="cross").slice(0, 100), out) + + +@pytest.mark.release() +def test_cross_join_slice_pushdown() -> None: + # this will likely go out of memory if we did not pushdown the slice + df = ( + pl.Series("x", pl.arange(0, 2**16 - 1, eager=True, dtype=pl.UInt16) % 2**15) + ).to_frame() + + result = df.lazy().join(df.lazy(), how="cross", suffix="_").slice(-5, 10).collect() + expected = pl.DataFrame( + { + "x": [32766, 32766, 32766, 32766, 32766], + "x_": [32762, 32763, 32764, 32765, 32766], + }, + schema={"x": pl.UInt16, "x_": pl.UInt16}, + ) + assert_frame_equal(result, expected) + + result = df.lazy().join(df.lazy(), how="cross", suffix="_").slice(2, 10).collect() + expected = pl.DataFrame( + { + "x": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "x_": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], + }, + schema={"x": pl.UInt16, "x_": pl.UInt16}, + ) diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index a0cbd2b2a820..499127166601 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -1078,3 +1078,15 @@ def test_sort_descending_nulls_last(descending: bool, nulls_last: bool) -> None: df.sort(["x", "y"], descending=descending, nulls_last=nulls_last), pl.DataFrame({"x": ref_x, "y": ref_y}), ) + + +@pytest.mark.release() +def test_sort_nan_1942() -> None: + # https://github.com/pola-rs/polars/issues/1942 + import time + + start = time.time() + pl.repeat(float("nan"), 2**13, eager=True).sort() + end = time.time() + + assert (end - start) < 1.0 diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index 90cd421a8b41..dfffd6e5cca9 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -469,3 +469,24 @@ def test_window_agg_list_null_15437() -> None: output = df.select(pl.concat_list("a").over(1)) expected = pl.DataFrame({"a": [[None]]}) assert_frame_equal(output, expected) + + +@pytest.mark.release() +def test_windows_not_cached() -> None: + ldf = ( + pl.DataFrame( + [ + pl.Series("key", ["a", "a", "b", "b"]), + pl.Series("val", [2, 2, 1, 3]), + ] + ) + .lazy() + .filter( + (pl.col("key").cum_count().over("key") == 1) + | (pl.col("val").shift(1).over("key").is_not_null()) + | (pl.col("val") != pl.col("val").shift(1).over("key")) + ) + ) + # this might fail if they are cached + for _ in range(1000): + ldf.collect()