diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d155b2949..6f236e93b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -110,10 +110,10 @@ jobs:
rust-version: ${{ env.rust_msrv }}
- name: Test
- run: cargo test --no-fail-fast --all-targets --all-features --workspace
+ run: cargo test --no-fail-fast --lib --examples --tests --all-features --workspace
- name: Async-std Test
- run: cargo test --no-fail-fast --all-targets --no-default-features --features "async-std" --features "storage-fs" --workspace
+ run: cargo test --no-fail-fast --lib --examples --tests --no-default-features --features "async-std" --features "storage-fs" --workspace
- name: Doc Test
run: cargo test --no-fail-fast --doc --all-features --workspace
diff --git a/.gitignore b/.gitignore
index 05c11eda6..69d8a70ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,10 @@ Cargo.lock
.vscode
**/.DS_Store
dist/*
+
**/venv
*.so
*.pyc
+
+crates/iceberg/testdata/performance/raw_data/*
+crates/iceberg/testdata/performance/warehouse
diff --git a/Makefile b/Makefile
index ed10a8acd..ae77164a7 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,7 @@ check-toml: install-taplo-cli
check: check-fmt check-clippy cargo-sort check-toml cargo-machete
doc-test:
- cargo test --no-fail-fast --doc --all-features --workspace
+ cargo test --no-fail-fast --doc --lib --examples '*' --tests '*' --workspace
unit-test: doc-test
cargo test --no-fail-fast --lib --all-features --workspace
diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml
index de5b7cdc5..0b9ea59f7 100644
--- a/crates/iceberg/Cargo.toml
+++ b/crates/iceberg/Cargo.toml
@@ -79,8 +79,19 @@ url = { workspace = true }
uuid = { workspace = true }
[dev-dependencies]
+criterion = { version = "0.3", features = ["async_tokio", "async_futures"] }
ctor = { workspace = true }
+futures-util = "0.3"
+iceberg-catalog-rest = { path = "../catalog/rest" }
iceberg_test_utils = { path = "../test_utils", features = ["tests"] }
pretty_assertions = { workspace = true }
tempfile = { workspace = true }
tera = { workspace = true }
+
+[[bench]]
+name = "table_scan_plan_files"
+harness = false
+
+[[bench]]
+name = "table_scan_execute_query"
+harness = false
diff --git a/crates/iceberg/benches/table_scan_execute_query.rs b/crates/iceberg/benches/table_scan_execute_query.rs
new file mode 100644
index 000000000..ba26cb7cc
--- /dev/null
+++ b/crates/iceberg/benches/table_scan_execute_query.rs
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use criterion::*;
+use iceberg::expr::Reference;
+use iceberg::spec::Datum;
+use tokio::runtime::Runtime;
+
+mod utils;
+use utils::{create_file_plan, create_task_stream, exec_plan, setup};
+
+pub fn bench_read_all_files_all_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ let scan = table.scan().build().unwrap();
+ let tasks = create_file_plan(&runtime, scan);
+
+ c.bench_function("scan: read (all files, all rows)", |b| {
+ b.to_async(&runtime).iter_batched(
+ || create_task_stream(tasks.clone()),
+ |plan| exec_plan(table.clone(), plan),
+ BatchSize::SmallInput,
+ )
+ });
+}
+
+pub fn bench_read_all_files_some_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ let scan = table
+ .scan()
+ .with_filter(Reference::new("passenger_count").equal_to(Datum::double(1.0)))
+ .build()
+ .unwrap();
+ let tasks = create_file_plan(&runtime, scan);
+
+ c.bench_function("scan: read (all files, some rows)", |b| {
+ b.to_async(&runtime).iter_batched(
+ || create_task_stream(tasks.clone()),
+ |plan| exec_plan(table.clone(), plan),
+ BatchSize::SmallInput,
+ )
+ });
+}
+
+pub fn bench_read_some_files_all_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ let scan = table
+ .scan()
+ .with_filter(
+ Reference::new("tpep_pickup_datetime")
+ .greater_than_or_equal_to(
+ Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(),
+ )
+ .and(Reference::new("tpep_pickup_datetime").less_than(
+ Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(),
+ )),
+ )
+ .build()
+ .unwrap();
+ let tasks = create_file_plan(&runtime, scan);
+
+ c.bench_function("scan: read (some files, all rows)", |b| {
+ b.to_async(&runtime).iter_batched(
+ || create_task_stream(tasks.clone()),
+ |plan| exec_plan(table.clone(), plan),
+ BatchSize::SmallInput,
+ )
+ });
+}
+
+pub fn bench_read_some_files_some_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ let scan =
+ table
+ .scan()
+ .with_filter(
+ Reference::new("tpep_pickup_datetime")
+ .greater_than_or_equal_to(
+ Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(),
+ )
+ .and(Reference::new("tpep_pickup_datetime").less_than(
+ Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(),
+ ))
+ .and(Reference::new("passenger_count").equal_to(Datum::double(1.0))),
+ )
+ .build()
+ .unwrap();
+ let tasks = create_file_plan(&runtime, scan);
+
+ c.bench_function("scan: read (some files, some rows)", |b| {
+ b.to_async(&runtime).iter_batched(
+ || create_task_stream(tasks.clone()),
+ |plan| exec_plan(table.clone(), plan),
+ BatchSize::SmallInput,
+ )
+ });
+}
+
+criterion_group! {
+ name = benches;
+ config = Criterion::default().sample_size(10);
+ targets = bench_read_some_files_some_rows, bench_read_some_files_all_rows, bench_read_all_files_some_rows, bench_read_all_files_all_rows
+}
+
+criterion_main!(benches);
diff --git a/crates/iceberg/benches/table_scan_plan_files.rs b/crates/iceberg/benches/table_scan_plan_files.rs
new file mode 100644
index 000000000..bde1aad2a
--- /dev/null
+++ b/crates/iceberg/benches/table_scan_plan_files.rs
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use criterion::*;
+use futures_util::StreamExt;
+use iceberg::expr::Reference;
+use iceberg::spec::Datum;
+use iceberg::table::Table;
+use tokio::runtime::Runtime;
+mod utils;
+use utils::setup;
+
+async fn all_files_all_rows(table: &Table) {
+ let scan = table.scan().build().unwrap();
+ let mut stream = scan.plan_files().await.unwrap();
+
+ while let Some(item) = stream.next().await {
+ black_box(item.unwrap());
+ }
+}
+
+async fn one_file_all_rows(table: &Table) {
+ let scan = table
+ .scan()
+ .with_filter(
+ Reference::new("tpep_pickup_datetime")
+ .greater_than_or_equal_to(
+ Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(),
+ )
+ .and(Reference::new("tpep_pickup_datetime").less_than(
+ Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(),
+ )),
+ )
+ .build()
+ .unwrap();
+ let mut stream = scan.plan_files().await.unwrap();
+
+ while let Some(item) = stream.next().await {
+ black_box(item.unwrap());
+ }
+}
+
+async fn all_files_some_rows(table: &Table) {
+ let scan = table
+ .scan()
+ .with_filter(Reference::new("passenger_count").equal_to(Datum::double(1.0)))
+ .build()
+ .unwrap();
+ let mut stream = scan.plan_files().await.unwrap();
+
+ while let Some(item) = stream.next().await {
+ black_box(item.unwrap());
+ }
+}
+
+async fn one_file_some_rows(table: &Table) {
+ let scan =
+ table
+ .scan()
+ .with_filter(
+ Reference::new("tpep_pickup_datetime")
+ .greater_than_or_equal_to(
+ Datum::timestamptz_from_str("2024-02-01T00:00:00.000 UTC").unwrap(),
+ )
+ .and(Reference::new("tpep_pickup_datetime").less_than(
+ Datum::timestamptz_from_str("2024-02-02T00:00:00.000 UTC").unwrap(),
+ ))
+ .and(Reference::new("passenger_count").equal_to(Datum::double(1.0))),
+ )
+ .build()
+ .unwrap();
+ let mut stream = scan.plan_files().await.unwrap();
+
+ while let Some(item) = stream.next().await {
+ black_box(item.unwrap());
+ }
+}
+
+pub fn bench_all_files_all_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ println!("setup complete");
+
+ c.bench_function("scan: plan (all files, all rows)", |b| {
+ b.to_async(&runtime).iter(|| all_files_all_rows(&table))
+ });
+}
+
+pub fn bench_one_file_all_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ println!("setup complete");
+
+ c.bench_function("scan: plan (one file, all rows)", |b| {
+ b.to_async(&runtime).iter(|| one_file_all_rows(&table))
+ });
+}
+
+pub fn bench_all_files_some_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ println!("setup complete");
+
+ c.bench_function("scan: plan (all files, some rows)", |b| {
+ b.to_async(&runtime).iter(|| all_files_some_rows(&table))
+ });
+}
+
+pub fn bench_one_file_some_rows(c: &mut Criterion) {
+ let runtime = Runtime::new().unwrap();
+ let table = setup(&runtime);
+ println!("setup complete");
+
+ c.bench_function("scan: plan (one file, some rows)", |b| {
+ b.to_async(&runtime).iter(|| one_file_some_rows(&table))
+ });
+}
+
+criterion_group! {
+ name = benches;
+ config = Criterion::default().sample_size(10);
+ targets = bench_all_files_all_rows, bench_all_files_some_rows, bench_one_file_all_rows, bench_one_file_some_rows
+}
+
+criterion_main!(benches);
diff --git a/crates/iceberg/benches/utils.rs b/crates/iceberg/benches/utils.rs
new file mode 100644
index 000000000..9444fc7bb
--- /dev/null
+++ b/crates/iceberg/benches/utils.rs
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::vec::IntoIter;
+
+use criterion::black_box;
+use futures::stream::StreamExt;
+use futures_util::stream::Iter;
+use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY};
+use iceberg::scan::{FileScanTask, FileScanTaskStream, TableScan};
+use iceberg::table::Table;
+use iceberg::{Catalog, Error};
+use iceberg_catalog_rest::{RestCatalog, RestCatalogConfig};
+use iceberg_test_utils::docker::DockerCompose;
+use tokio::runtime::Runtime;
+
+pub fn get_docker_compose() -> DockerCompose {
+ DockerCompose::new(
+ "iceberg-rust-performance",
+ format!(
+ "{}/../iceberg/testdata/performance",
+ env!("CARGO_MANIFEST_DIR")
+ ),
+ )
+ .with_persistence(true)
+}
+
+pub async fn build_catalog() -> RestCatalog {
+ let docker_compose = get_docker_compose();
+
+ let rest_api_container_ip = docker_compose.get_container_ip("rest");
+ let warehouse_container_ip = docker_compose.get_container_ip("haproxy");
+ // let warehouse_container_ip = docker_compose.get_container_ip("minio");
+
+ let catalog_uri = format!("http://{}:{}", rest_api_container_ip, 8181);
+ let warehouse_uri = format!("http://{}:{}", warehouse_container_ip, 9080);
+ // let warehouse_uri = format!("http://{}:{}", warehouse_container_ip, 9000);
+
+ let user_props = HashMap::from_iter(
+ vec![
+ (S3_ENDPOINT.to_string(), warehouse_uri),
+ (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()),
+ (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()),
+ (S3_REGION.to_string(), "us-east-1".to_string()),
+ ]
+ .into_iter(),
+ );
+
+ RestCatalog::new(
+ RestCatalogConfig::builder()
+ .uri(catalog_uri)
+ .props(user_props)
+ .build(),
+ )
+}
+
+async fn setup_async() -> Arc
{
+ let catalog = build_catalog().await;
+ let namespaces = catalog.list_namespaces(None).await.unwrap();
+ let table_idents = catalog.list_tables(&namespaces[0]).await.unwrap();
+ let table = catalog.load_table(&table_idents[0]).await.unwrap();
+ Arc::new(table)
+}
+
+pub fn setup(runtime: &Runtime) -> Arc {
+ runtime.block_on(setup_async())
+}
+
+#[allow(dead_code)] // not dead, used in table_scan_execute_query
+pub fn create_file_plan(runtime: &Runtime, scan: TableScan) -> Arc> {
+ let tasks = runtime.block_on(async {
+ let stream = scan.plan_files().await.unwrap();
+ stream.map(|x| x.unwrap()).collect::>().await
+ });
+ Arc::new(tasks)
+}
+
+#[allow(dead_code)] // not dead, used in table_scan_execute_query
+pub fn create_task_stream(
+ tasks: Arc>,
+) -> Pin>>>> {
+ let tasks: Vec<_> = tasks
+ .iter()
+ .map(|t| Ok::(t.clone()))
+ .collect();
+ Box::pin(futures_util::stream::iter(tasks))
+}
+
+#[allow(dead_code)] // not dead, used in table_scan_execute_query
+pub async fn exec_plan(table: Arc, plan: FileScanTaskStream) {
+ let reader = table.reader_builder().build();
+
+ let mut record_batch_stream = reader.read(plan).unwrap();
+ while let Some(item) = record_batch_stream.next().await {
+ black_box(item.unwrap());
+ }
+}
diff --git a/crates/iceberg/testdata/performance/docker-compose.yaml b/crates/iceberg/testdata/performance/docker-compose.yaml
new file mode 100644
index 000000000..13d7c90ef
--- /dev/null
+++ b/crates/iceberg/testdata/performance/docker-compose.yaml
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+version: "3"
+
+services:
+ spark-iceberg:
+ image: tabulario/spark-iceberg
+ build: spark/
+ networks:
+ iceberg_perf:
+ depends_on:
+ - rest
+ - minio
+ volumes:
+ - ./warehouse:/home/iceberg/warehouse
+ - ./notebooks:/home/iceberg/notebooks/notebooks
+ - ./raw_data:/home/iceberg/raw_data
+ - ./spark_scripts:/home/iceberg/spark_scripts
+ environment:
+ - AWS_ACCESS_KEY_ID=admin
+ - AWS_SECRET_ACCESS_KEY=password
+ - AWS_REGION=us-east-1
+ ports:
+ - 8888:8888
+ - 8080:8080
+ - 10000:10000
+ - 10001:10001
+
+ rest:
+ image: tabulario/iceberg-rest
+ networks:
+ iceberg_perf:
+ ports:
+ - 8181:8181
+ volumes:
+ - ./warehouse:/warehouse
+ environment:
+ - AWS_ACCESS_KEY_ID=admin
+ - AWS_SECRET_ACCESS_KEY=password
+ - AWS_REGION=us-east-1
+ - CATALOG_WAREHOUSE=s3://warehouse/
+ - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+ - CATALOG_S3_ENDPOINT=http://minio:9000
+ - CATALOG_URI=jdbc:sqlite:file:/warehouse/catalog.sqlite
+
+ minio:
+ image: minio/minio
+ environment:
+ - MINIO_ROOT_USER=admin
+ - MINIO_ROOT_PASSWORD=password
+ - MINIO_DOMAIN=minio
+ networks:
+ iceberg_perf:
+ aliases:
+ - warehouse.minio
+ volumes:
+ - ./warehouse:/warehouse
+ ports:
+ - 9001:9001
+ - 9000:9000
+ command: ["server", "/warehouse", "--console-address", ":9001"]
+
+ mc:
+ depends_on:
+ - minio
+ image: minio/mc
+ container_name: mc
+ networks:
+ iceberg_perf:
+ environment:
+ - AWS_ACCESS_KEY_ID=admin
+ - AWS_SECRET_ACCESS_KEY=password
+ - AWS_REGION=us-east-1
+ entrypoint: >
+ /bin/sh -c "
+ until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
+ /usr/bin/mc mb minio/warehouse;
+ /usr/bin/mc policy set public minio/warehouse;
+ tail -f /dev/null
+ "
+
+ haproxy:
+ image: haproxy:lts-bookworm
+ networks:
+ iceberg_perf:
+ ports:
+ - 9080
+ volumes:
+ - type: bind
+ source: ./haproxy.cfg
+ target: /usr/local/etc/haproxy/haproxy.cfg
+ read_only: true
+ depends_on:
+ - minio
+
+networks:
+ iceberg_perf:
diff --git a/crates/iceberg/testdata/performance/haproxy.cfg b/crates/iceberg/testdata/performance/haproxy.cfg
new file mode 100644
index 000000000..9da3e1289
--- /dev/null
+++ b/crates/iceberg/testdata/performance/haproxy.cfg
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+global
+ log stdout format raw local0 debug
+ fd-hard-limit 50000
+
+defaults
+ mode http
+ timeout client 10s
+ timeout connect 5s
+ timeout server 10s
+ timeout http-request 10s
+ log global
+
+frontend slowio
+ bind :9080
+ default_backend minio
+ tcp-request inspect-delay 50ms
+ tcp-request content accept if WAIT_END
+ filter bwlim-out mylimit default-limit 625000 default-period 1s
+
+backend minio
+# server s1 minio.iceberg-performance-tests.orb.local:9000 check maxconn 4
+ server s1 minio:9000 check maxconn 4
diff --git a/crates/iceberg/testdata/performance/spark_scripts/setup.sql b/crates/iceberg/testdata/performance/spark_scripts/setup.sql
new file mode 100644
index 000000000..858f1b871
--- /dev/null
+++ b/crates/iceberg/testdata/performance/spark_scripts/setup.sql
@@ -0,0 +1,54 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+
+-- http://www.apache.org/licenses/LICENSE-2.0
+
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied. See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+SET TIME ZONE 'UTC';
+
+CREATE DATABASE IF NOT EXISTS nyc.taxis;
+CREATE TABLE IF NOT EXISTS nyc.taxis (
+ VendorID bigint,
+ tpep_pickup_datetime timestamp,
+ tpep_dropoff_datetime timestamp,
+ passenger_count double,
+ trip_distance double,
+ RatecodeID double,
+ store_and_fwd_flag string,
+ PULocationID bigint,
+ DOLocationID bigint,
+ payment_type bigint,
+ fare_amount double,
+ extra double,
+ mta_tax double,
+ tip_amount double,
+ tolls_amount double,
+ improvement_surcharge double,
+ total_amount double,
+ congestion_surcharge double,
+ airport_fee double
+)
+USING iceberg
+PARTITIONED BY (days(tpep_pickup_datetime));
+
+CREATE TEMPORARY VIEW parquetTable
+USING org.apache.spark.sql.parquet
+OPTIONS (
+ path "/home/iceberg/raw_data/yellow_tripdata_*.parquet"
+);
+
+-- Repeat the insert a few times to accumulate multiple snapshots and manifest files
+INSERT INTO nyc.taxis SELECT * FROM parquetTable;
+INSERT INTO nyc.taxis SELECT * FROM parquetTable;
+INSERT INTO nyc.taxis SELECT * FROM parquetTable;
diff --git a/crates/test_utils/Cargo.toml b/crates/test_utils/Cargo.toml
index d4f6e1696..54ad15b26 100644
--- a/crates/test_utils/Cargo.toml
+++ b/crates/test_utils/Cargo.toml
@@ -31,3 +31,8 @@ log = "0.4.20"
[features]
tests = []
+
+[lib]
+name = "iceberg_test_utils"
+path = "src/lib.rs"
+
diff --git a/crates/test_utils/src/docker.rs b/crates/test_utils/src/docker.rs
index bde9737b1..c32bd5e7d 100644
--- a/crates/test_utils/src/docker.rs
+++ b/crates/test_utils/src/docker.rs
@@ -27,6 +27,7 @@ use crate::cmd::{get_cmd_output, get_cmd_output_result, run_command};
pub struct DockerCompose {
project_name: String,
docker_compose_dir: String,
+ is_persistent: bool,
}
impl DockerCompose {
@@ -34,9 +35,15 @@ impl DockerCompose {
Self {
project_name: project_name.to_string(),
docker_compose_dir: docker_compose_dir.to_string(),
+ is_persistent: false,
}
}
+ pub fn with_persistence(mut self, is_persistent: bool) -> Self {
+ self.is_persistent = is_persistent;
+ self
+ }
+
pub fn project_name(&self) -> &str {
self.project_name.as_str()
}
@@ -114,6 +121,10 @@ impl DockerCompose {
impl Drop for DockerCompose {
fn drop(&mut self) {
+ if self.is_persistent {
+ return;
+ }
+
let mut cmd = Command::new("docker");
cmd.current_dir(&self.docker_compose_dir);
diff --git a/crates/test_utils/src/lib.rs b/crates/test_utils/src/lib.rs
index 4f63b8dd7..56a282ffc 100644
--- a/crates/test_utils/src/lib.rs
+++ b/crates/test_utils/src/lib.rs
@@ -19,15 +19,9 @@
//!
//! It's not intended for use outside of `iceberg-rust`.
-#[cfg(feature = "tests")]
mod cmd;
-#[cfg(feature = "tests")]
pub mod docker;
-
-#[cfg(feature = "tests")]
pub use common::*;
-
-#[cfg(feature = "tests")]
mod common {
use std::sync::Once;
diff --git a/justfile b/justfile
new file mode 100644
index 000000000..019a15ed8
--- /dev/null
+++ b/justfile
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+default:
+ just --list --unsorted
+
+# start the performance-testing docker-compose project.
+perf-start:
+ docker-compose -p iceberg-rust-performance --project-directory ./crates/iceberg/testdata/performance up -d
+ # initialize the perf test if the environment is clean
+ if [ ! -d ./crates/iceberg/testdata/performance/warehouse/warehouse/nyc/taxis/data ]; then just perf-init; fi
+
+# stop the performance-testing docker-compose project
+perf-stop:
+ docker-compose -p iceberg-rust-performance --project-directory ./crates/iceberg/testdata/performance down
+
+# performance testing: initialize the tables and data
+perf-init: perf-download-data
+ docker exec iceberg-rust-performance-spark-iceberg-1 spark-sql --driver-memory 4G --executor-memory 4G -f /home/iceberg/spark_scripts/setup.sql
+
+# stop and clean up the performance-testing docker-compose project docker images and data
+perf-clean:
+ docker-compose -p iceberg-rust-performance --project-directory ./crates/iceberg/testdata/performance down --remove-orphans -v --rmi all
+ rm ./crates/iceberg/testdata/performance/raw_data/*.parquet
+ rm ./crates/iceberg/testdata/performance/warehouse/catalog.sqlite
+ rm -rf ./crates/iceberg/testdata/performance/warehouse/warehouse
+ rm -rf ./crates/iceberg/testdata/performance/warehouse/.minio.sys
+
+# run the performance test suite
+perf-run:
+ # start the perf testing env if it is not running
+ docker container inspect iceberg-rust-performance-rest-1 >/dev/null || just perf-start
+ cargo criterion --benches
+
+# download the "NYC Taxi" data required to populate the performance test suite
+perf-download-data:
+ if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-01.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet -P crates/iceberg/testdata/performance/raw_data; fi
+ if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-02.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet -P crates/iceberg/testdata/performance/raw_data; fi
+ if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-03.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet -P crates/iceberg/testdata/performance/raw_data; fi
+ if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-04.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet -P crates/iceberg/testdata/performance/raw_data; fi
+ if [ ! -f ./crates/iceberg/testdata/performance/raw_data/yellow_tripdata_2024-05.parquet ]; then wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet -P crates/iceberg/testdata/performance/raw_data; fi