From fd41e03320ce74ba55767774f6f3a39f777b226a Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 23 Apr 2024 10:14:18 +0200 Subject: [PATCH] test(python): Update benchmark tests (#15825) --- .github/workflows/benchmark.yml | 27 --- .github/workflows/codecov.yml | 4 +- .github/workflows/test-python.yml | 4 +- docs/development/contributing/test.md | 8 +- py-polars/Makefile | 2 +- py-polars/pyproject.toml | 3 +- py-polars/tests/benchmark/__init__.py | 3 +- py-polars/tests/benchmark/conftest.py | 33 +-- .../tests/benchmark/data/groupby-datagen.R | 53 ---- py-polars/tests/benchmark/datagen_groupby.py | 193 +++++++++++++++ py-polars/tests/benchmark/test_filter.py | 16 +- py-polars/tests/benchmark/test_group_by.py | 89 +++---- py-polars/tests/benchmark/test_io.py | 23 ++ py-polars/tests/benchmark/test_release.py | 229 ------------------ py-polars/tests/unit/io/test_parquet.py | 20 ++ .../aggregation/__init__.py | 0 .../{ => aggregation}/test_aggregations.py | 34 ++- .../aggregation/test_horizontal.py | 0 .../aggregation/test_vertical.py | 20 ++ .../tests/unit/operations/test_group_by.py | 70 ++++++ py-polars/tests/unit/operations/test_join.py | 38 +++ py-polars/tests/unit/operations/test_sort.py | 12 + .../tests/unit/operations/test_window.py | 21 ++ 23 files changed, 484 insertions(+), 418 deletions(-) delete mode 100644 py-polars/tests/benchmark/data/groupby-datagen.R create mode 100644 py-polars/tests/benchmark/datagen_groupby.py create mode 100644 py-polars/tests/benchmark/test_io.py delete mode 100644 py-polars/tests/benchmark/test_release.py rename py-polars/tests/unit/{functions => operations}/aggregation/__init__.py (100%) rename py-polars/tests/unit/operations/{ => aggregation}/test_aggregations.py (94%) rename py-polars/tests/unit/{functions => operations}/aggregation/test_horizontal.py (100%) rename py-polars/tests/unit/{functions => operations}/aggregation/test_vertical.py (75%) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a291dd5cbe5e..1f8d27ee4735 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -48,33 +48,6 @@ jobs: working-directory: py-polars run: pip install -r requirements-dev.txt - - name: Load benchmark data from cache - id: cache-data - uses: actions/cache/restore@v4 - with: - path: py-polars/tests/benchmark/data/G1_1e7_1e2_5_0.csv - key: benchmark-data - - - name: Set up R - if: steps.cache-data.outputs.cache-hit != 'true' - uses: r-lib/actions/setup-r@v2 - with: - r-version: '4.3.3' - - - name: Generate data - if: steps.cache-data.outputs.cache-hit != 'true' - working-directory: py-polars/tests/benchmark/data - run: | - Rscript -e 'install.packages("data.table", repos="https://cloud.r-project.org")' - Rscript groupby-datagen.R 1e7 1e2 5 0 - - - name: Save benchmark data in cache - if: github.ref_name == 'main' - uses: actions/cache/save@v4 - with: - path: py-polars/tests/benchmark/data/G1_1e7_1e2_5_0.csv - key: ${{ steps.cache-data.outputs.cache-primary-key }} - - name: Set up Rust run: rustup show diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index d5b937c2b1a6..65cf0cba5e8e 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -113,14 +113,14 @@ jobs: - name: Run Python tests working-directory: py-polars - run: pytest --cov -n auto --dist loadgroup -m "not benchmark and not docs" --cov-report xml:main.xml + run: pytest --cov -n auto --dist loadgroup -m "not release and not benchmark and not docs" --cov-report xml:main.xml 
continue-on-error: true - name: Run Python tests - async reader working-directory: py-polars env: POLARS_FORCE_ASYNC: 1 - run: pytest --cov -m "not benchmark and not docs" tests/unit/io/ --cov-report xml:async.xml + run: pytest --cov -m "not release and not benchmark and not docs" tests/unit/io/ --cov-report xml:async.xml continue-on-error: true - name: Report Rust coverage diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml index 0b2a10f38e6b..1646e5bc5032 100644 --- a/.github/workflows/test-python.yml +++ b/.github/workflows/test-python.yml @@ -89,13 +89,13 @@ jobs: # Currently skipped due to performance issues in coverage: # https://github.com/nedbat/coveragepy/issues/1665 COV: ${{ !(matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12') && '--cov' || '--no-cov' }} - run: pytest $COV -n auto --dist loadgroup -m "not benchmark and not docs" + run: pytest $COV -n auto --dist loadgroup -m "not release and not benchmark and not docs" - name: Run tests async reader tests if: github.ref_name != 'main' && matrix.os != 'windows-latest' env: POLARS_FORCE_ASYNC: 1 - run: pytest -m "not benchmark and not docs" tests/unit/io/ + run: pytest -m "not release and not benchmark and not docs" tests/unit/io/ - name: Check import without optional dependencies if: github.ref_name != 'main' && matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest' diff --git a/docs/development/contributing/test.md b/docs/development/contributing/test.md index e2ee89c221b6..9aa245a1fd6c 100644 --- a/docs/development/contributing/test.md +++ b/docs/development/contributing/test.md @@ -102,12 +102,10 @@ Polars uses [CodSpeed](https://codspeed.io/pola-rs/polars) for tracking the perf ### Generating data -For many tests, a relatively large dataset must be generated first. -We use an [R](https://www.r-project.org/) script to generate this data. -The script was taken from the [H2O AI database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests. +For most tests, a relatively large dataset must be generated first. +This is done as part of the `pytest` setup process. -For the exact steps to generate the data, please refer to the [benchmark workflow](https://github.com/pola-rs/polars/blob/main/.github/workflows/benchmark.yml). -It involves [installing R](https://cran.r-project.org/), installing the [data.table](https://cran.r-project.org/web/packages/data.table/) dependency, and executing a data generation script. +The data generation logic was taken from the [H2O.ai database benchmark](https://github.com/h2oai/db-benchmark), which is the foundation for many of the benchmark tests. ### Running the benchmark tests diff --git a/py-polars/Makefile b/py-polars/Makefile index 93fad3c00931..c8dd4707155d 100644 --- a/py-polars/Makefile +++ b/py-polars/Makefile @@ -100,7 +100,7 @@ test-all: .venv build ## Run all tests .PHONY: coverage coverage: .venv build ## Run tests and report coverage - $(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not benchmark" + $(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not release and not benchmark" .PHONY: clean clean: ## Clean up caches and build artifacts diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml index 08a325f3f1a9..2031f8ba66ab 100644 --- a/py-polars/pyproject.toml +++ b/py-polars/pyproject.toml @@ -206,12 +206,13 @@ addopts = [ "--strict-markers", "--import-mode=importlib", # Default to running fast tests only. 
To run ALL tests, run: pytest -m "" - "-m not slow and not write_disk and not benchmark and not hypothesis and not docs", + "-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark", ] markers = [ "slow: Tests with a longer than average runtime.", "write_disk: Tests that write to disk", "debug: Tests that should be run on a Polars debug build.", + "release: Tests that should be run on a Polars release build.", "docs: Documentation code snippets", ] filterwarnings = [ diff --git a/py-polars/tests/benchmark/__init__.py b/py-polars/tests/benchmark/__init__.py index 757f2b70de88..643565f66680 100644 --- a/py-polars/tests/benchmark/__init__.py +++ b/py-polars/tests/benchmark/__init__.py @@ -1,7 +1,8 @@ """ Benchmark tests. -These tests are skipped by default as a large dataset must be generated first. +These tests are skipped by default as a relatively large dataset must be generated +first. See the documentation on how to run these tests: https://docs.pola.rs/development/contributing/test/#benchmark-tests diff --git a/py-polars/tests/benchmark/conftest.py b/py-polars/tests/benchmark/conftest.py index efdc821f8f83..ca8ffb88b298 100644 --- a/py-polars/tests/benchmark/conftest.py +++ b/py-polars/tests/benchmark/conftest.py @@ -1,34 +1,9 @@ -from pathlib import Path - import pytest import polars as pl +from tests.benchmark.datagen_groupby import generate_group_by_data -@pytest.fixture(scope="module") -def data_path() -> Path: - return Path(__file__).parent / "data" - - -@pytest.fixture(scope="module") -def h2aoi_groupby_data_path(data_path: Path) -> Path: - return data_path / "G1_1e7_1e2_5_0.csv" - - -@pytest.fixture(scope="module") -def h2oai_groupby_data(h2aoi_groupby_data_path: Path) -> pl.DataFrame: - if not h2aoi_groupby_data_path.is_file(): - pytest.skip("Dataset must be generated before running this test.") - - df = pl.read_csv( - h2aoi_groupby_data_path, - dtypes={ - "id4": pl.Int32, - "id5": pl.Int32, - "id6": pl.Int32, - "v1": pl.Int32, - "v2": pl.Int32, - "v3": pl.Float64, - }, - ) - return df +@pytest.fixture(scope="session") +def groupby_data() -> pl.DataFrame: + return generate_group_by_data(10_000, 100, null_ratio=0.05) diff --git a/py-polars/tests/benchmark/data/groupby-datagen.R b/py-polars/tests/benchmark/data/groupby-datagen.R deleted file mode 100644 index b5303678913d..000000000000 --- a/py-polars/tests/benchmark/data/groupby-datagen.R +++ /dev/null @@ -1,53 +0,0 @@ -# Rscript groupby-datagen.R 1e7 1e2 0 0 ## 1e7 rows, 1e2 K, 0% NAs, random order -# Rscript groupby-datagen.R 1e8 1e1 5 1 ## 1e8 rows, 10 K, 5% NAs, sorted order -args = commandArgs(TRUE) - -pretty_sci = function(x) { - tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] - if(length(tmp)==1L) { - paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) - } else if(length(tmp)==2L){ - paste0(tmp[1L], as.character(as.integer(tmp[2L]))) - } -} - -library(data.table) -N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) -stopifnot(nas<=100L, nas>=0L, sort%in%c(0L,1L)) -set.seed(108) -cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort)) -DT = list() -DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char) -DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # small groups (char) -DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # large groups (char) -DT[["id4"]] = sample(K, N, TRUE) # large groups (int) -DT[["id5"]] = sample(K, N, 
TRUE) # small groups (int) -DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int) -DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5] -DT[["v2"]] = sample(15, N, TRUE) # int in range [1,15] -DT[["v3"]] = round(runif(N,max=100),6) # numeric e.g. 23.574912 -setDT(DT) -if (nas>0L) { - cat("Inputting NAs\n") - for (col in paste0("id",1:6)) { - ucol = unique(DT[[col]]) - nna = as.integer(length(ucol) * (nas/100)) - if (nna) - set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) - rm(ucol) - } - nna = as.integer(nrow(DT) * (nas/100)) - if (nna) { - for (col in paste0("v",1:3)) - set(DT, sample(nrow(DT), nna), col, NA) - } -} -if (sort==1L) { - cat("Sorting data\n") - setkeyv(DT, paste0("id", 1:6)) -} -file = sprintf("G1_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort) -cat(sprintf("Writing data to %s\n", file)) -fwrite(DT, file) -cat(sprintf("Data written to %s, quitting\n", file)) -if (!interactive()) quit("no", status=0) diff --git a/py-polars/tests/benchmark/datagen_groupby.py b/py-polars/tests/benchmark/datagen_groupby.py new file mode 100644 index 000000000000..782885cf5ec2 --- /dev/null +++ b/py-polars/tests/benchmark/datagen_groupby.py @@ -0,0 +1,193 @@ +""" +Script to generate data for benchmarking group-by operations. + +Data generation logic was adapted from the H2O.ai benchmark. +The original R script is located here: +https://github.com/h2oai/db-benchmark/blob/master/_data/groupby-datagen.R + +Examples +-------- +10 million rows, 100 groups, no nulls, random order: +$ python datagen_groupby.py 1e7 1e2 --null-percentage 0 + +100 million rows, 10 groups, 5% nulls, sorted: +$ python datagen_groupby.py 1e8 1e1 --null-percentage 5 --sorted +""" + +import argparse +import logging + +import numpy as np +from numpy.random import default_rng + +import polars as pl + +logging.basicConfig(level=logging.INFO) + +SEED = 0 +rng = default_rng(seed=SEED) + +__all__ = ["generate_group_by_data"] + + +def generate_group_by_data( + n_rows: int, n_groups: int, null_ratio: float = 0.0, *, sort: bool = False +) -> pl.DataFrame: + """Generate data for benchmarking group-by operations.""" + logging.info("Generating data...") + df = _generate_data(n_rows, n_groups) + + if null_ratio > 0.0: + logging.info("Setting nulls...") + df = _set_nulls(df, null_ratio) + + if sort: + logging.info("Sorting data...") + df = df.sort(c for c in df.columns if c.startswith("id")) + + logging.info("Done generating data.") + return df + + +def _generate_data(n_rows: int, n_groups: int) -> pl.DataFrame: + N = n_rows + K = n_groups + + group_str_small = [f"id{str(i).zfill(3)}" for i in range(1, K + 1)] + group_str_large = [f"id{str(i).zfill(10)}" for i in range(1, int(N / K) + 1)] + group_int_small = range(1, K + 1) + group_int_large = range(1, int(N / K) + 1) + + var_int_small = range(1, 6) + var_int_large = range(1, 16) + var_float = rng.uniform(0, 100, N) + + return pl.DataFrame( + { + "id1": rng.choice(group_str_small, N), + "id2": rng.choice(group_str_small, N), + "id3": rng.choice(group_str_large, N), + "id4": rng.choice(group_int_small, N), + "id5": rng.choice(group_int_small, N), + "id6": rng.choice(group_int_large, N), + "v1": rng.choice(var_int_small, N), + "v2": rng.choice(var_int_large, N), + "v3": np.round(var_float, 6), + }, + schema={ + "id1": pl.String, + "id2": pl.String, + "id3": pl.String, + "id4": pl.Int32, + "id5": pl.Int32, + "id6": pl.Int32, + "v1": pl.Int32, + "v2": pl.Int32, + "v3": pl.Float64, + }, + ) + + +def _set_nulls(df: pl.DataFrame, null_ratio: float) -> pl.DataFrame: + 
"""Set null values according to the given ratio.""" + + def set_nulls_var(s: pl.Series, ratio: float) -> pl.Series: + """Set Series values to null according to the given ratio.""" + len = s.len() + n_null = int(ratio * len) + if n_null == 0: + return s + + indices = rng.choice(len, size=n_null, replace=False) + return s.scatter(indices, None) + + def set_nulls_group(s: pl.Series, ratio: float) -> pl.Series: + """Set Series unique values to null according to the given ratio.""" + uniques = s.unique() + n_null = int(ratio * uniques.len()) + if n_null == 0: + return s + + to_replace = rng.choice(uniques, size=n_null, replace=False) + return ( + s.to_frame() + .select( + pl.when(pl.col(s.name).is_in(to_replace)) + .then(None) + .otherwise(pl.col(s.name)) + .alias(s.name) + ) + .to_series() + ) + + return df.with_columns( + set_nulls_group(s, null_ratio) + if s.name.startswith("id") + else set_nulls_var(s, null_ratio) + for s in df.get_columns() + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate data for benchmarking group-by operations" + ) + + parser.add_argument("rows", type=float, help="Number of rows") + parser.add_argument("groups", type=float, help="Number of groups") + parser.add_argument( + "-n", + "--null-percentage", + type=int, + default=0, + choices=range(1, 101), + metavar="[0-100]", + help="Percentage of null values", + ) + parser.add_argument( + "-s", + "--sort", + action="store_true", + help="Sort the data by group", + ) + parser.add_argument( + "-o", + "--output", + type=str, + default=None, + help="Output filename", + ) + + args = parser.parse_args() + + # Convert arguments to appropriate types + n_rows = int(args.rows) + n_groups = int(args.groups) + null_ratio = args.null_percentage / 100 + sort = args.sort + + logging.info( + f"Generating data: {n_rows} rows, {n_groups} groups, {null_ratio} null ratio, sorted: {args.sort}" + ) + + df = generate_group_by_data(n_rows, n_groups, null_ratio=null_ratio, sort=sort) + write_data(df, args) + + +def write_data(df: pl.DataFrame, args: argparse.Namespace) -> None: + def format_int(i: int) -> str: + base, exp = f"{i:e}".split("e") + return f"{float(base):g}e{int(exp)}" + + if args.output is not None: + filename = args.output + else: + filename = f"G1_{format_int(args.rows)}_{format_int(args.groups)}_{args.null_percentage}_{int(args.sort)}.csv" + + logging.info(f"Writing data to {filename}") + df.write_csv(filename) + logging.info("Done writing data.") + + +if __name__ == "__main__": + main() diff --git a/py-polars/tests/benchmark/test_filter.py b/py-polars/tests/benchmark/test_filter.py index 0923c791df7e..047404f01bb1 100644 --- a/py-polars/tests/benchmark/test_filter.py +++ b/py-polars/tests/benchmark/test_filter.py @@ -9,9 +9,9 @@ pytestmark = pytest.mark.benchmark() -def test_filter1(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_filter1(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .filter(pl.col("id1").eq_missing(pl.lit("id046"))) .select( pl.col("id6").cast(pl.Int64).sum(), @@ -19,13 +19,11 @@ def test_filter1(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result["id6"].item() == 4_762_459_723 - assert result["v3"].item() == pytest.approx(4766795.205196999) -def test_filter2(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_filter2(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .filter(~(pl.col("id1").eq_missing(pl.lit("id046")))) .select( 
pl.col("id6").cast(pl.Int64).sum(), @@ -33,5 +31,3 @@ def test_filter2(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result["id6"].item() == 470_453_297_090 - assert result["v3"].item() == pytest.approx(470202778.84258103) diff --git a/py-polars/tests/benchmark/test_group_by.py b/py-polars/tests/benchmark/test_group_by.py index d29d716deede..7547b13b5e62 100644 --- a/py-polars/tests/benchmark/test_group_by.py +++ b/py-polars/tests/benchmark/test_group_by.py @@ -1,7 +1,7 @@ """ -Benchmark tests for the group by operation. +Benchmark tests for the group-by operation. -These tests are based on the H2O AI database benchmark. +These tests are based on the H2O.ai database benchmark. See: https://h2oai.github.io/db-benchmark/ @@ -16,35 +16,31 @@ pytestmark = pytest.mark.benchmark() -def test_h2oai_groupby_q1(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q1(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id1") .agg( pl.sum("v1").alias("v1_sum"), ) .collect() ) - assert result.shape == (96, 2) - assert result["v1_sum"].sum() == 28498857 -def test_h2oai_groupby_q2(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q2(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id1", "id2") .agg( pl.sum("v1").alias("v1_sum"), ) .collect() ) - assert result.shape == (9216, 3) - assert result["v1_sum"].sum() == 28498857 -def test_h2oai_groupby_q3(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q3(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id3") .agg( pl.sum("v1").alias("v1_sum"), @@ -52,14 +48,11 @@ def test_h2oai_groupby_q3(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (95001, 3) - assert result["v1_sum"].sum() == 28498857 - assert result["v3_mean"].sum() == pytest.approx(4749467.631946) -def test_h2oai_groupby_q4(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q4(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id4") .agg( pl.mean("v1").alias("v1_mean"), @@ -68,15 +61,11 @@ def test_h2oai_groupby_q4(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (96, 4) - assert result["v1_mean"].sum() == pytest.approx(287.989430) - assert result["v2_mean"].sum() == pytest.approx(767.852921) - assert result["v3_mean"].sum() == pytest.approx(4799.873270) -def test_h2oai_groupby_q5(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q5(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id6") .agg( pl.sum("v1").alias("v1_sum"), @@ -85,15 +74,11 @@ def test_h2oai_groupby_q5(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (95001, 4) - assert result["v1_sum"].sum() == 28498857 - assert result["v2_sum"].sum() == 75988394 - assert result["v3_sum"].sum() == pytest.approx(474969574.047777) -def test_h2oai_groupby_q6(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q6(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id4", "id5") .agg( pl.median("v3").alias("v3_median"), @@ -101,49 +86,40 @@ def test_h2oai_groupby_q6(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (9216, 4) - 
assert result["v3_median"].sum() == pytest.approx(460771.216444) - assert result["v3_std"].sum() == pytest.approx(266006.904622) -def test_h2oai_groupby_q7(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q7(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id3") .agg((pl.max("v1") - pl.min("v2")).alias("range_v1_v2")) .collect() ) - assert result.shape == (95001, 2) - assert result["range_v1_v2"].sum() == 379850 -def test_h2oai_groupby_q8(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q8(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .drop_nulls("v3") .group_by("id6") .agg(pl.col("v3").top_k(2).alias("largest2_v3")) .explode("largest2_v3") .collect() ) - assert result.shape == (190002, 2) - assert result["largest2_v3"].sum() == pytest.approx(18700554.779632) -def test_h2oai_groupby_q9(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q9(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id2", "id4") .agg((pl.corr("v1", "v2") ** 2).alias("r2")) .collect() ) - assert result.shape == (9216, 3) - assert result["r2"].sum() == pytest.approx(9.940515) -def test_h2oai_groupby_q10(h2oai_groupby_data: pl.DataFrame) -> None: - result = ( - h2oai_groupby_data.lazy() +def test_groupby_h2oai_q10(groupby_data: pl.DataFrame) -> None: + ( + groupby_data.lazy() .group_by("id1", "id2", "id3", "id4", "id5", "id6") .agg( pl.sum("v3").alias("v3_sum"), @@ -151,4 +127,3 @@ def test_h2oai_groupby_q10(h2oai_groupby_data: pl.DataFrame) -> None: ) .collect() ) - assert result.shape == (9999993, 8) diff --git a/py-polars/tests/benchmark/test_io.py b/py-polars/tests/benchmark/test_io.py new file mode 100644 index 000000000000..44e165a61a03 --- /dev/null +++ b/py-polars/tests/benchmark/test_io.py @@ -0,0 +1,23 @@ +"""Benchmark tests for the I/O operations.""" + +from pathlib import Path + +import pytest + +import polars as pl + +pytestmark = pytest.mark.benchmark() + + +def test_write_read_scan_large_csv(groupby_data: pl.DataFrame, tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + data_path = tmp_path / "data.csv" + groupby_data.write_csv(data_path) + + predicate = pl.col("v2") < 5 + + shape_eager = pl.read_csv(data_path).filter(predicate).shape + shape_lazy = pl.scan_csv(data_path).filter(predicate).collect().shape + + assert shape_lazy == shape_eager diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py deleted file mode 100644 index 03882c4a7940..000000000000 --- a/py-polars/tests/benchmark/test_release.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Various benchmark tests. - -Tests in this module will be run in the CI using a release build of Polars. 
- -To run these tests: pytest -m release -""" - -import time -from pathlib import Path -from typing import cast - -import numpy as np -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - -# Mark all tests in this module as benchmark tests -pytestmark = pytest.mark.benchmark() - - -def test_read_scan_large_csv(h2aoi_groupby_data_path: Path) -> None: - if not h2aoi_groupby_data_path.is_file(): - pytest.skip("Dataset must be generated before running this test.") - - predicate = pl.col("v2") < 5 - - shape_eager = pl.read_csv(h2aoi_groupby_data_path).filter(predicate).shape - shape_lazy = ( - (pl.scan_csv(h2aoi_groupby_data_path).filter(predicate)).collect().shape - ) - - assert shape_lazy == shape_eager - - -def test_sort_nan_1942() -> None: - # https://github.com/pola-rs/polars/issues/1942 - t0 = time.time() - pl.repeat(float("nan"), 2 << 12, eager=True).sort() - assert (time.time() - t0) < 1 - - -def test_mean_overflow() -> None: - np.random.seed(1) - expected = 769.5607652 - - df = pl.DataFrame(np.random.randint(500, 1040, 5000000), schema=["value"]) - - result = df.with_columns(pl.mean("value"))[0, 0] - assert np.isclose(result, expected) - - result = df.with_columns(pl.col("value").cast(pl.Int32)).with_columns( - pl.mean("value") - )[0, 0] - assert np.isclose(result, expected) - - result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean() - assert np.isclose(result, expected) - - -def test_min_max_2850() -> None: - # https://github.com/pola-rs/polars/issues/2850 - df = pl.DataFrame( - { - "id": [ - 130352432, - 130352277, - 130352611, - 130352833, - 130352305, - 130352258, - 130352764, - 130352475, - 130352368, - 130352346, - ] - } - ) - - minimum = 130352258 - maximum = 130352833.0 - - for _ in range(10): - permuted = df.sample(fraction=1.0, seed=0) - computed = permuted.select( - [pl.col("id").min().alias("min"), pl.col("id").max().alias("max")] - ) - assert cast(int, computed[0, "min"]) == minimum - assert cast(float, computed[0, "max"]) == maximum - - -def test_windows_not_cached() -> None: - ldf = ( - pl.DataFrame( - [ - pl.Series("key", ["a", "a", "b", "b"]), - pl.Series("val", [2, 2, 1, 3]), - ] - ) - .lazy() - .filter( - (pl.col("key").cum_count().over("key") == 1) - | (pl.col("val").shift(1).over("key").is_not_null()) - | (pl.col("val") != pl.col("val").shift(1).over("key")) - ) - ) - # this might fail if they are cached - for _ in range(1000): - ldf.collect() - - -def test_cross_join() -> None: - # triggers > 100 rows implementation - # https://github.com/pola-rs/polars/blob/5f5acb2a523ce01bc710768b396762b8e69a9e07/polars/polars-core/src/frame/cross_join.rs#L34 - df1 = pl.DataFrame({"col1": ["a"], "col2": ["d"]}) - df2 = pl.DataFrame({"frame2": pl.arange(0, 100, eager=True)}) - out = df2.join(df1, how="cross") - df2 = pl.DataFrame({"frame2": pl.arange(0, 101, eager=True)}) - assert_frame_equal(df2.join(df1, how="cross").slice(0, 100), out) - - -def test_cross_join_slice_pushdown() -> None: - # this will likely go out of memory if we did not pushdown the slice - df = ( - pl.Series("x", pl.arange(0, 2**16 - 1, eager=True, dtype=pl.UInt16) % 2**15) - ).to_frame() - - result = df.lazy().join(df.lazy(), how="cross", suffix="_").slice(-5, 10).collect() - expected = pl.DataFrame( - { - "x": [32766, 32766, 32766, 32766, 32766], - "x_": [32762, 32763, 32764, 32765, 32766], - }, - schema={"x": pl.UInt16, "x_": pl.UInt16}, - ) - assert_frame_equal(result, expected) - - result = df.lazy().join(df.lazy(), how="cross", 
suffix="_").slice(2, 10).collect() - expected = pl.DataFrame( - { - "x": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - "x_": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], - }, - schema={"x": pl.UInt16, "x_": pl.UInt16}, - ) - - -def test_max_statistic_parquet_writer() -> None: - # this hits the maximal page size - # so the row group will be split into multiple pages - # the page statistics need to be correctly reduced - # for this query to make sense - n = 150_000 - - # int64 is important to hit the page size - df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() - f = "/tmp/tmp.parquet" - df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) - result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() - expected = pl.DataFrame({"int": [149998, 149999]}) - assert_frame_equal(result, expected) - - -def test_boolean_min_max_agg() -> None: - np.random.seed(0) - idx = np.random.randint(0, 500, 1000) - c = np.random.randint(0, 500, 1000) > 250 - - df = pl.DataFrame({"idx": idx, "c": c}) - aggs = [pl.col("c").min().alias("c_min"), pl.col("c").max().alias("c_max")] - - result = df.group_by("idx").agg(aggs).sum() - - schema = {"idx": pl.Int64, "c_min": pl.UInt32, "c_max": pl.UInt32} - expected = pl.DataFrame( - { - "idx": [107583], - "c_min": [120], - "c_max": [321], - }, - schema=schema, - ) - assert_frame_equal(result, expected) - - nulls = np.random.randint(0, 500, 1000) < 100 - - result = ( - df.with_columns(c=pl.when(pl.lit(nulls)).then(None).otherwise(pl.col("c"))) - .group_by("idx") - .agg(aggs) - .sum() - ) - - expected = pl.DataFrame( - { - "idx": [107583], - "c_min": [133], - "c_max": [276], - }, - schema=schema, - ) - assert_frame_equal(result, expected) - - -def test_categorical_vs_str_group_by() -> None: - # this triggers the perfect hash table - s = pl.Series("a", np.random.randint(0, 50, 100)) - s_with_nulls = pl.select( - pl.when(s < 3).then(None).otherwise(s).alias("a") - ).to_series() - - for s_ in [s, s_with_nulls]: - s_ = s_.cast(str) - cat_out = ( - s_.cast(pl.Categorical) - .to_frame("a") - .group_by("a") - .agg(pl.first().alias("first")) - ) - - str_out = s_.to_frame("a").group_by("a").agg(pl.first().alias("first")) - cat_out.with_columns(pl.col("a").cast(str)) - assert_frame_equal( - cat_out.with_columns( - pl.col("a").cast(str), pl.col("first").cast(pl.List(str)) - ).sort("a"), - str_out.sort("a"), - ) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 71e328198a80..9ace31e42ccf 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -835,3 +835,23 @@ def test_read_parquet_only_loads_selected_columns_15098( # Only one column's worth of memory should be used; 2 columns would be # 16_000_000 at least, but there's some overhead. 
assert 8_000_000 < memory_usage_without_pyarrow.get_peak() < 13_000_000 + + +@pytest.mark.release() +@pytest.mark.write_disk() +def test_max_statistic_parquet_writer(tmp_path: Path) -> None: + # this hits the maximal page size + # so the row group will be split into multiple pages + # the page statistics need to be correctly reduced + # for this query to make sense + n = 150_000 + + tmp_path.mkdir(exist_ok=True) + + # int64 is important to hit the page size + df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() + f = tmp_path / "tmp.parquet" + df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) + result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() + expected = pl.DataFrame({"int": [149998, 149999]}) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/functions/aggregation/__init__.py b/py-polars/tests/unit/operations/aggregation/__init__.py similarity index 100% rename from py-polars/tests/unit/functions/aggregation/__init__.py rename to py-polars/tests/unit/operations/aggregation/__init__.py diff --git a/py-polars/tests/unit/operations/test_aggregations.py b/py-polars/tests/unit/operations/aggregation/test_aggregations.py similarity index 94% rename from py-polars/tests/unit/operations/test_aggregations.py rename to py-polars/tests/unit/operations/aggregation/test_aggregations.py index ca3153cdfc1b..7fcdb30e023b 100644 --- a/py-polars/tests/unit/operations/test_aggregations.py +++ b/py-polars/tests/unit/operations/aggregation/test_aggregations.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import numpy as np import pytest @@ -538,3 +538,35 @@ def test_group_count_over_null_column_15705() -> None: ) out = df.group_by("a", maintain_order=True).agg(pl.col("c").count()) assert out["c"].to_list() == [0, 0, 0] + + +@pytest.mark.release() +def test_min_max_2850() -> None: + # https://github.com/pola-rs/polars/issues/2850 + df = pl.DataFrame( + { + "id": [ + 130352432, + 130352277, + 130352611, + 130352833, + 130352305, + 130352258, + 130352764, + 130352475, + 130352368, + 130352346, + ] + } + ) + + minimum = 130352258 + maximum = 130352833.0 + + for _ in range(10): + permuted = df.sample(fraction=1.0, seed=0) + computed = permuted.select( + [pl.col("id").min().alias("min"), pl.col("id").max().alias("max")] + ) + assert cast(int, computed[0, "min"]) == minimum + assert cast(float, computed[0, "max"]) == maximum diff --git a/py-polars/tests/unit/functions/aggregation/test_horizontal.py b/py-polars/tests/unit/operations/aggregation/test_horizontal.py similarity index 100% rename from py-polars/tests/unit/functions/aggregation/test_horizontal.py rename to py-polars/tests/unit/operations/aggregation/test_horizontal.py diff --git a/py-polars/tests/unit/functions/aggregation/test_vertical.py b/py-polars/tests/unit/operations/aggregation/test_vertical.py similarity index 75% rename from py-polars/tests/unit/functions/aggregation/test_vertical.py rename to py-polars/tests/unit/operations/aggregation/test_vertical.py index 8e232ba33382..26f01dacc3d2 100644 --- a/py-polars/tests/unit/functions/aggregation/test_vertical.py +++ b/py-polars/tests/unit/operations/aggregation/test_vertical.py @@ -1,5 +1,6 @@ from __future__ import annotations +import numpy as np import pytest import polars as pl @@ -54,3 +55,22 @@ def test_alias_for_col_agg(function: str, input: str) -> None: expected = 
getattr(pl.col(input), function)() # e.g. pl.col(input).min() context = pl.DataFrame({"a": [1, 4], "b": [3, 2]}) assert_expr_equal(result, expected, context) + + +@pytest.mark.release() +def test_mean_overflow() -> None: + np.random.seed(1) + expected = 769.5607652 + + df = pl.DataFrame(np.random.randint(500, 1040, 5000000), schema=["value"]) + + result = df.with_columns(pl.mean("value"))[0, 0] + assert np.isclose(result, expected) + + result = df.with_columns(pl.col("value").cast(pl.Int32)).with_columns( + pl.mean("value") + )[0, 0] + assert np.isclose(result, expected) + + result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean() + assert np.isclose(result, expected) diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index 82a24afaaef4..51127cf2c6b2 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta from typing import TYPE_CHECKING, Any +import numpy as np import pytest import polars as pl @@ -990,3 +991,72 @@ def test_aggregated_scalar_elementwise_15602() -> None: def test_group_by_multiple_null_cols_15623() -> None: df = pl.DataFrame(schema={"a": pl.Null, "b": pl.Null}).group_by(pl.all()).len() assert df.is_empty() + + +@pytest.mark.release() +def test_categorical_vs_str_group_by() -> None: + # this triggers the perfect hash table + s = pl.Series("a", np.random.randint(0, 50, 100)) + s_with_nulls = pl.select( + pl.when(s < 3).then(None).otherwise(s).alias("a") + ).to_series() + + for s_ in [s, s_with_nulls]: + s_ = s_.cast(str) + cat_out = ( + s_.cast(pl.Categorical) + .to_frame("a") + .group_by("a") + .agg(pl.first().alias("first")) + ) + + str_out = s_.to_frame("a").group_by("a").agg(pl.first().alias("first")) + cat_out.with_columns(pl.col("a").cast(str)) + assert_frame_equal( + cat_out.with_columns( + pl.col("a").cast(str), pl.col("first").cast(pl.List(str)) + ).sort("a"), + str_out.sort("a"), + ) + + +@pytest.mark.release() +def test_boolean_min_max_agg() -> None: + np.random.seed(0) + idx = np.random.randint(0, 500, 1000) + c = np.random.randint(0, 500, 1000) > 250 + + df = pl.DataFrame({"idx": idx, "c": c}) + aggs = [pl.col("c").min().alias("c_min"), pl.col("c").max().alias("c_max")] + + result = df.group_by("idx").agg(aggs).sum() + + schema = {"idx": pl.Int64, "c_min": pl.UInt32, "c_max": pl.UInt32} + expected = pl.DataFrame( + { + "idx": [107583], + "c_min": [120], + "c_max": [321], + }, + schema=schema, + ) + assert_frame_equal(result, expected) + + nulls = np.random.randint(0, 500, 1000) < 100 + + result = ( + df.with_columns(c=pl.when(pl.lit(nulls)).then(None).otherwise(pl.col("c"))) + .group_by("idx") + .agg(aggs) + .sum() + ) + + expected = pl.DataFrame( + { + "idx": [107583], + "c_min": [133], + "c_max": [276], + }, + schema=schema, + ) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 89c7ef6c917b..020882792b7a 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -869,3 +869,41 @@ def test_join_4_columns_with_validity() -> None: 115, 4, ) + + +@pytest.mark.release() +def test_cross_join() -> None: + # triggers > 100 rows implementation + # https://github.com/pola-rs/polars/blob/5f5acb2a523ce01bc710768b396762b8e69a9e07/polars/polars-core/src/frame/cross_join.rs#L34 + df1 = pl.DataFrame({"col1": ["a"], "col2": 
["d"]}) + df2 = pl.DataFrame({"frame2": pl.arange(0, 100, eager=True)}) + out = df2.join(df1, how="cross") + df2 = pl.DataFrame({"frame2": pl.arange(0, 101, eager=True)}) + assert_frame_equal(df2.join(df1, how="cross").slice(0, 100), out) + + +@pytest.mark.release() +def test_cross_join_slice_pushdown() -> None: + # this will likely go out of memory if we did not pushdown the slice + df = ( + pl.Series("x", pl.arange(0, 2**16 - 1, eager=True, dtype=pl.UInt16) % 2**15) + ).to_frame() + + result = df.lazy().join(df.lazy(), how="cross", suffix="_").slice(-5, 10).collect() + expected = pl.DataFrame( + { + "x": [32766, 32766, 32766, 32766, 32766], + "x_": [32762, 32763, 32764, 32765, 32766], + }, + schema={"x": pl.UInt16, "x_": pl.UInt16}, + ) + assert_frame_equal(result, expected) + + result = df.lazy().join(df.lazy(), how="cross", suffix="_").slice(2, 10).collect() + expected = pl.DataFrame( + { + "x": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "x_": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], + }, + schema={"x": pl.UInt16, "x_": pl.UInt16}, + ) diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index a0cbd2b2a820..499127166601 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -1078,3 +1078,15 @@ def test_sort_descending_nulls_last(descending: bool, nulls_last: bool) -> None: df.sort(["x", "y"], descending=descending, nulls_last=nulls_last), pl.DataFrame({"x": ref_x, "y": ref_y}), ) + + +@pytest.mark.release() +def test_sort_nan_1942() -> None: + # https://github.com/pola-rs/polars/issues/1942 + import time + + start = time.time() + pl.repeat(float("nan"), 2**13, eager=True).sort() + end = time.time() + + assert (end - start) < 1.0 diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py index 90cd421a8b41..dfffd6e5cca9 100644 --- a/py-polars/tests/unit/operations/test_window.py +++ b/py-polars/tests/unit/operations/test_window.py @@ -469,3 +469,24 @@ def test_window_agg_list_null_15437() -> None: output = df.select(pl.concat_list("a").over(1)) expected = pl.DataFrame({"a": [[None]]}) assert_frame_equal(output, expected) + + +@pytest.mark.release() +def test_windows_not_cached() -> None: + ldf = ( + pl.DataFrame( + [ + pl.Series("key", ["a", "a", "b", "b"]), + pl.Series("val", [2, 2, 1, 3]), + ] + ) + .lazy() + .filter( + (pl.col("key").cum_count().over("key") == 1) + | (pl.col("val").shift(1).over("key").is_not_null()) + | (pl.col("val") != pl.col("val").shift(1).over("key")) + ) + ) + # this might fail if they are cached + for _ in range(1000): + ldf.collect()