diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d155b2949..6f236e93b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -110,10 +110,10 @@ jobs:
           rust-version: ${{ env.rust_msrv }}
 
       - name: Test
-        run: cargo test --no-fail-fast --all-targets --all-features --workspace
+        run: cargo test --no-fail-fast --lib --examples --tests --all-features --workspace
 
       - name: Async-std Test
-        run: cargo test --no-fail-fast --all-targets --no-default-features --features "async-std" --features "storage-fs" --workspace
+        run: cargo test --no-fail-fast --lib --examples --tests --no-default-features --features "async-std" --features "storage-fs" --workspace
 
       - name: Doc Test
         run: cargo test --no-fail-fast --doc --all-features --workspace
diff --git a/.gitignore b/.gitignore
index 05c11eda6..69d8a70ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,10 @@ Cargo.lock
 .vscode
 **/.DS_Store
 dist/*
+
 **/venv
 *.so
 *.pyc
+
+crates/iceberg/testdata/performance/raw_data/*
+crates/iceberg/testdata/performance/warehouse
diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml
index de5b7cdc5..0b9ea59f7 100644
--- a/crates/iceberg/Cargo.toml
+++ b/crates/iceberg/Cargo.toml
@@ -79,8 +79,19 @@ url = { workspace = true }
 uuid = { workspace = true }
 
 [dev-dependencies]
+criterion = { version = "0.3", features = ["async_tokio", "async_futures"] }
 ctor = { workspace = true }
+futures-util = "0.3"
+iceberg-catalog-rest = { path = "../catalog/rest" }
 iceberg_test_utils = { path = "../test_utils", features = ["tests"] }
 pretty_assertions = { workspace = true }
 tempfile = { workspace = true }
 tera = { workspace = true }
+
+[[bench]]
+name = "table_scan_plan_files"
+harness = false
+
+[[bench]]
+name = "table_scan_execute_query"
+harness = false
diff --git a/crates/iceberg/benches/table_scan_execute_query.rs b/crates/iceberg/benches/table_scan_execute_query.rs
new file mode 100644
index 000000000..ba26cb7cc
--- /dev/null
+++ b/crates/iceberg/benches/table_scan_execute_query.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
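+
+//! Benchmarks for table-scan read execution: each benchmark plans a scan once
+//! during setup, then measures streaming the resulting Arrow record batches
+//! from the pre-planned file-scan tasks. Requires the docker-compose
+//! environment in `crates/iceberg/testdata/performance` to be running
+//! (`just perf-start`); run the suite with `just perf-run`.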
+ +use criterion::*; +use iceberg::expr::Reference; +use iceberg::spec::Datum; +use tokio::runtime::Runtime; + +mod utils; +use utils::{create_file_plan, create_task_stream, exec_plan, setup}; + +pub fn bench_read_all_files_all_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + let scan = table.scan().build().unwrap(); + let tasks = create_file_plan(&runtime, scan); + + c.bench_function("scan: read (all files, all rows)", |b| { + b.to_async(&runtime).iter_batched( + || create_task_stream(tasks.clone()), + |plan| exec_plan(table.clone(), plan), + BatchSize::SmallInput, + ) + }); +} + +pub fn bench_read_all_files_some_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + let scan = table + .scan() + .with_filter(Reference::new("passenger_count").equal_to(Datum::double(1.0))) + .build() + .unwrap(); + let tasks = create_file_plan(&runtime, scan); + + c.bench_function("scan: read (all files, some rows)", |b| { + b.to_async(&runtime).iter_batched( + || create_task_stream(tasks.clone()), + |plan| exec_plan(table.clone(), plan), + BatchSize::SmallInput, + ) + }); +} + +pub fn bench_read_some_files_all_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + let scan = table + .scan() + .with_filter( + Reference::new("tpep_pickup_datetime") + .greater_than_or_equal_to( + Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(), + ) + .and(Reference::new("tpep_pickup_datetime").less_than( + Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(), + )), + ) + .build() + .unwrap(); + let tasks = create_file_plan(&runtime, scan); + + c.bench_function("scan: read (some files, all rows)", |b| { + b.to_async(&runtime).iter_batched( + || create_task_stream(tasks.clone()), + |plan| exec_plan(table.clone(), plan), + BatchSize::SmallInput, + ) + }); +} + +pub fn bench_read_some_files_some_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + let scan = + table + .scan() + .with_filter( + Reference::new("tpep_pickup_datetime") + .greater_than_or_equal_to( + Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(), + ) + .and(Reference::new("tpep_pickup_datetime").less_than( + Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(), + )) + .and(Reference::new("passenger_count").equal_to(Datum::double(1.0))), + ) + .build() + .unwrap(); + let tasks = create_file_plan(&runtime, scan); + + c.bench_function("scan: read (some files, some rows)", |b| { + b.to_async(&runtime).iter_batched( + || create_task_stream(tasks.clone()), + |plan| exec_plan(table.clone(), plan), + BatchSize::SmallInput, + ) + }); +} + +criterion_group! { + name = benches; + config = Criterion::default().sample_size(10); + targets = bench_read_some_files_some_rows, bench_read_some_files_all_rows, bench_read_all_files_some_rows, bench_read_all_files_all_rows +} + +criterion_main!(benches); diff --git a/crates/iceberg/benches/table_scan_plan_files.rs b/crates/iceberg/benches/table_scan_plan_files.rs new file mode 100644 index 000000000..bde1aad2a --- /dev/null +++ b/crates/iceberg/benches/table_scan_plan_files.rs @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use criterion::*; +use futures_util::StreamExt; +use iceberg::expr::Reference; +use iceberg::spec::Datum; +use iceberg::table::Table; +use tokio::runtime::Runtime; +mod utils; +use utils::setup; + +async fn all_files_all_rows(table: &Table) { + let scan = table.scan().build().unwrap(); + let mut stream = scan.plan_files().await.unwrap(); + + while let Some(item) = stream.next().await { + black_box(item.unwrap()); + } +} + +async fn one_file_all_rows(table: &Table) { + let scan = table + .scan() + .with_filter( + Reference::new("tpep_pickup_datetime") + .greater_than_or_equal_to( + Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(), + ) + .and(Reference::new("tpep_pickup_datetime").less_than( + Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(), + )), + ) + .build() + .unwrap(); + let mut stream = scan.plan_files().await.unwrap(); + + while let Some(item) = stream.next().await { + black_box(item.unwrap()); + } +} + +async fn all_files_some_rows(table: &Table) { + let scan = table + .scan() + .with_filter(Reference::new("passenger_count").equal_to(Datum::double(1.0))) + .build() + .unwrap(); + let mut stream = scan.plan_files().await.unwrap(); + + while let Some(item) = stream.next().await { + black_box(item.unwrap()); + } +} + +async fn one_file_some_rows(table: &Table) { + let scan = + table + .scan() + .with_filter( + Reference::new("tpep_pickup_datetime") + .greater_than_or_equal_to( + Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(), + ) + .and(Reference::new("tpep_pickup_datetime").less_than( + Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(), + )) + .and(Reference::new("passenger_count").equal_to(Datum::double(1.0))), + ) + .build() + .unwrap(); + let mut stream = scan.plan_files().await.unwrap(); + + while let Some(item) = stream.next().await { + black_box(item.unwrap()); + } +} + +pub fn bench_all_files_all_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + println!("setup complete"); + + c.bench_function("scan: plan (all files, all rows)", |b| { + b.to_async(&runtime).iter(|| all_files_all_rows(&table)) + }); +} + +pub fn bench_one_file_all_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + println!("setup complete"); + + c.bench_function("scan: plan (one file, all rows)", |b| { + b.to_async(&runtime).iter(|| one_file_all_rows(&table)) + }); +} + +pub fn bench_all_files_some_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + println!("setup complete"); + + c.bench_function("scan: plan (all files, some rows)", |b| { + b.to_async(&runtime).iter(|| all_files_some_rows(&table)) + }); +} + +pub fn bench_one_file_some_rows(c: &mut Criterion) { + let runtime = Runtime::new().unwrap(); + let table = setup(&runtime); + println!("setup complete"); + + c.bench_function("scan: plan (one file, 
some rows)", |b| { + b.to_async(&runtime).iter(|| one_file_some_rows(&table)) + }); +} + +criterion_group! { + name = benches; + config = Criterion::default().sample_size(10); + targets = bench_all_files_all_rows, bench_all_files_some_rows, bench_one_file_all_rows, bench_one_file_some_rows +} + +criterion_main!(benches); diff --git a/crates/iceberg/benches/utils.rs b/crates/iceberg/benches/utils.rs new file mode 100644 index 000000000..9444fc7bb --- /dev/null +++ b/crates/iceberg/benches/utils.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; +use std::pin::Pin; +use std::sync::Arc; +use std::vec::IntoIter; + +use criterion::black_box; +use futures::stream::StreamExt; +use futures_util::stream::Iter; +use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::scan::{FileScanTask, FileScanTaskStream, TableScan}; +use iceberg::table::Table; +use iceberg::{Catalog, Error}; +use iceberg_catalog_rest::{RestCatalog, RestCatalogConfig}; +use iceberg_test_utils::docker::DockerCompose; +use tokio::runtime::Runtime; + +pub fn get_docker_compose() -> DockerCompose { + DockerCompose::new( + "iceberg-rust-performance", + format!( + "{}/../iceberg/testdata/performance", + env!("CARGO_MANIFEST_DIR") + ), + ) + .with_persistence(true) +} + +pub async fn build_catalog() -> RestCatalog { + let docker_compose = get_docker_compose(); + + let rest_api_container_ip = docker_compose.get_container_ip("rest"); + let warehouse_container_ip = docker_compose.get_container_ip("haproxy"); + // let warehouse_container_ip = docker_compose.get_container_ip("minio"); + + let catalog_uri = format!("http://{}:{}", rest_api_container_ip, 8181); + let warehouse_uri = format!("http://{}:{}", warehouse_container_ip, 9080); + // let warehouse_uri = format!("http://{}:{}", warehouse_container_ip, 9000); + + let user_props = HashMap::from_iter( + vec![ + (S3_ENDPOINT.to_string(), warehouse_uri), + (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), + (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), + (S3_REGION.to_string(), "us-east-1".to_string()), + ] + .into_iter(), + ); + + RestCatalog::new( + RestCatalogConfig::builder() + .uri(catalog_uri) + .props(user_props) + .build(), + ) +} + +async fn setup_async() -> Arc { + let catalog = build_catalog().await; + let namespaces = catalog.list_namespaces(None).await.unwrap(); + let table_idents = catalog.list_tables(&namespaces[0]).await.unwrap(); + let table = catalog.load_table(&table_idents[0]).await.unwrap(); + Arc::new(table) +} + +pub fn setup(runtime: &Runtime) -> Arc
{ + runtime.block_on(setup_async()) +} + +#[allow(dead_code)] // not dead, used in table_scan_execute_query +pub fn create_file_plan(runtime: &Runtime, scan: TableScan) -> Arc> { + let tasks = runtime.block_on(async { + let stream = scan.plan_files().await.unwrap(); + stream.map(|x| x.unwrap()).collect::>().await + }); + Arc::new(tasks) +} + +#[allow(dead_code)] // not dead, used in table_scan_execute_query +pub fn create_task_stream( + tasks: Arc>, +) -> Pin>>>> { + let tasks: Vec<_> = tasks + .iter() + .map(|t| Ok::(t.clone())) + .collect(); + Box::pin(futures_util::stream::iter(tasks)) +} + +#[allow(dead_code)] // not dead, used in table_scan_execute_query +pub async fn exec_plan(table: Arc
, plan: FileScanTaskStream) { + let reader = table.reader_builder().build(); + + let mut record_batch_stream = reader.read(plan).unwrap(); + while let Some(item) = record_batch_stream.next().await { + black_box(item.unwrap()); + } +} diff --git a/crates/iceberg/testdata/performance/docker-compose.yaml b/crates/iceberg/testdata/performance/docker-compose.yaml new file mode 100644 index 000000000..13d7c90ef --- /dev/null +++ b/crates/iceberg/testdata/performance/docker-compose.yaml @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +version: "3" + +services: + spark-iceberg: + image: tabulario/spark-iceberg + build: spark/ + networks: + iceberg_perf: + depends_on: + - rest + - minio + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + - ./raw_data:/home/iceberg/raw_data + - ./spark_scripts:/home/iceberg/spark_scripts + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + ports: + - 8888:8888 + - 8080:8080 + - 10000:10000 + - 10001:10001 + + rest: + image: tabulario/iceberg-rest + networks: + iceberg_perf: + ports: + - 8181:8181 + volumes: + - ./warehouse:/warehouse + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - CATALOG_WAREHOUSE=s3://warehouse/ + - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO + - CATALOG_S3_ENDPOINT=http://minio:9000 + - CATALOG_URI=jdbc:sqlite:file:/warehouse/catalog.sqlite + + minio: + image: minio/minio + environment: + - MINIO_ROOT_USER=admin + - MINIO_ROOT_PASSWORD=password + - MINIO_DOMAIN=minio + networks: + iceberg_perf: + aliases: + - warehouse.minio + volumes: + - ./warehouse:/warehouse + ports: + - 9001:9001 + - 9000:9000 + command: ["server", "/warehouse", "--console-address", ":9001"] + + mc: + depends_on: + - minio + image: minio/mc + container_name: mc + networks: + iceberg_perf: + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; + /usr/bin/mc mb minio/warehouse; + /usr/bin/mc policy set public minio/warehouse; + tail -f /dev/null + " + + haproxy: + image: haproxy:lts-bookworm + networks: + iceberg_perf: + ports: + - 9080 + volumes: + - type: bind + source: ./haproxy.cfg + target: /usr/local/etc/haproxy/haproxy.cfg + read_only: true + depends_on: + - minio + +networks: + iceberg_perf: diff --git a/crates/iceberg/testdata/performance/haproxy.cfg b/crates/iceberg/testdata/performance/haproxy.cfg new file mode 100644 index 000000000..9da3e1289 --- /dev/null +++ b/crates/iceberg/testdata/performance/haproxy.cfg @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +global + log stdout format raw local0 debug + fd-hard-limit 50000 + +defaults + mode http + timeout client 10s + timeout connect 5s + timeout server 10s + timeout http-request 10s + log global + +frontend slowio + bind :9080 + default_backend minio + tcp-request inspect-delay 50ms + tcp-request content accept if WAIT_END + filter bwlim-out mylimit default-limit 625000 default-period 1s + +backend minio +# server s1 minio.iceberg-performance-tests.orb.local:9000 check maxconn 4 + server s1 minio:9000 check maxconn 4 diff --git a/crates/iceberg/testdata/performance/spark_scripts/setup.sql b/crates/iceberg/testdata/performance/spark_scripts/setup.sql new file mode 100644 index 000000000..858f1b871 --- /dev/null +++ b/crates/iceberg/testdata/performance/spark_scripts/setup.sql @@ -0,0 +1,54 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at + +-- http://www.apache.org/licenses/LICENSE-2.0 + +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. 
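+
+-- Spark SQL setup for the performance suite: creates the day-partitioned
+-- `nyc.taxis` Iceberg table and loads the downloaded "yellow taxi" parquet
+-- files into it through a temporary parquet-backed view.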
+
+SET TIME ZONE 'UTC';
+
+CREATE DATABASE IF NOT EXISTS nyc;
+CREATE TABLE IF NOT EXISTS nyc.taxis (
+    VendorID bigint,
+    tpep_pickup_datetime timestamp,
+    tpep_dropoff_datetime timestamp,
+    passenger_count double,
+    trip_distance double,
+    RatecodeID double,
+    store_and_fwd_flag string,
+    PULocationID bigint,
+    DOLocationID bigint,
+    payment_type bigint,
+    fare_amount double,
+    extra double,
+    mta_tax double,
+    tip_amount double,
+    tolls_amount double,
+    improvement_surcharge double,
+    total_amount double,
+    congestion_surcharge double,
+    airport_fee double
+)
+USING iceberg
+PARTITIONED BY (days(tpep_pickup_datetime));
+
+CREATE TEMPORARY VIEW parquetTable
+USING org.apache.spark.sql.parquet
+OPTIONS (
+    path "/home/iceberg/raw_data/yellow_tripdata_*.parquet"
+);
+
+-- Repeat the insert a few times to accumulate multiple snapshots and manifest files
+INSERT INTO nyc.taxis SELECT * FROM parquetTable;
+INSERT INTO nyc.taxis SELECT * FROM parquetTable;
+INSERT INTO nyc.taxis SELECT * FROM parquetTable;
diff --git a/crates/test_utils/Cargo.toml b/crates/test_utils/Cargo.toml
index d4f6e1696..54ad15b26 100644
--- a/crates/test_utils/Cargo.toml
+++ b/crates/test_utils/Cargo.toml
@@ -31,3 +31,8 @@ log = "0.4.20"
 
 [features]
 tests = []
+
+[lib]
+name = "iceberg_test_utils"
+path = "src/lib.rs"
+
diff --git a/crates/test_utils/src/docker.rs b/crates/test_utils/src/docker.rs
index bde9737b1..c32bd5e7d 100644
--- a/crates/test_utils/src/docker.rs
+++ b/crates/test_utils/src/docker.rs
@@ -27,6 +27,7 @@ use crate::cmd::{get_cmd_output, get_cmd_output_result, run_command};
 pub struct DockerCompose {
     project_name: String,
     docker_compose_dir: String,
+    is_persistent: bool,
 }
 
 impl DockerCompose {
@@ -34,9 +35,15 @@ impl DockerCompose {
         Self {
             project_name: project_name.to_string(),
             docker_compose_dir: docker_compose_dir.to_string(),
+            is_persistent: false,
         }
     }
 
+    pub fn with_persistence(mut self, is_persistent: bool) -> Self {
+        self.is_persistent = is_persistent;
+        self
+    }
+
     pub fn project_name(&self) -> &str {
         self.project_name.as_str()
     }
@@ -114,6 +121,10 @@ impl DockerCompose {
 impl Drop for DockerCompose {
     fn drop(&mut self) {
+        if self.is_persistent {
+            return;
+        }
+
         let mut cmd = Command::new("docker");
         cmd.current_dir(&self.docker_compose_dir);
diff --git a/crates/test_utils/src/lib.rs b/crates/test_utils/src/lib.rs
index 4f63b8dd7..56a282ffc 100644
--- a/crates/test_utils/src/lib.rs
+++ b/crates/test_utils/src/lib.rs
@@ -19,15 +19,9 @@
 //!
 //! It's not intended for use outside of `iceberg-rust`.
 
-#[cfg(feature = "tests")]
 mod cmd;
-#[cfg(feature = "tests")]
 pub mod docker;
-
-#[cfg(feature = "tests")]
 pub use common::*;
-
-#[cfg(feature = "tests")]
 mod common {
     use std::sync::Once;
diff --git a/justfile b/justfile
new file mode 100644
index 000000000..019a15ed8
--- /dev/null
+++ b/justfile
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. + +default: + just --list --unsorted + +# start the performance-testing docker-compose project. +perf-start: + docker-compose -p iceberg-rust-performance --project-directory ./crates/iceberg/testdata/performance up -d + # initialize the perf test if the environment is clean + if [ ! -d ./crates/iceberg/testdata/performance/warehouse/warehouse/nyc/taxis/data ]; then just perf-init; fi + +# stop the performance-testing docker-compose project +perf-stop: + docker-compose -p iceberg-rust-performance --project-directory ./crates/iceberg/testdata/performance down + +# performance testing: initialize the tables and data +perf-init: perf-download-data + docker exec iceberg-rust-performance-spark-iceberg-1 spark-sql --driver-memory 4G --executor-memory 4G -f /home/iceberg/spark_scripts/setup.sql + +# stop and clean up the performance-testing docker-compose project docker images and data +perf-clean: + docker-compose -p iceberg-rust-performance --project-directory ./crates/iceberg/testdata/performance down --remove-orphans -v --rmi all + rm ./crates/iceberg/testdata/performance/raw_data/*.parquet + rm ./crates/iceberg/testdata/performance/warehouse/catalog.sqlite + rm -rf ./crates/iceberg/testdata/performance/warehouse/warehouse + rm -rf ./crates/iceberg/testdata/performance/warehouse/.minio.sys + +# run the performance test suite +perf-run: + # start the perf testing env if it is not running + docker container inspect iceberg-rust-performance-rest-1 >/dev/null || just perf-start + cargo criterion --benches + +# download the "NYC Taxi" data required to populate the performance test suite +perf-download-data: + if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-01.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet -P crates/iceberg/testdata/performance/raw_data; fi + if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-02.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet -P crates/iceberg/testdata/performance/raw_data; fi + if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-03.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet -P crates/iceberg/testdata/performance/raw_data; fi + if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-04.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet -P crates/iceberg/testdata/performance/raw_data; fi + if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-05.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet -P crates/iceberg/testdata/performance/raw_data; fi
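
# NOTE: `perf-run` invokes `cargo criterion`, which requires the cargo-criterion
# plugin to be installed (`cargo install cargo-criterion`); the download recipes
# above assume `wget` is available on the host.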