diff --git a/.asf.yaml b/.asf.yaml index 364b9b254..90db3557a 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -42,6 +42,7 @@ github: required_approving_review_count: 1 required_linear_history: true + features: wiki: false issues: true @@ -50,6 +51,8 @@ github: - Xuanwo - liurenjie1024 - JanKaul + ghp_branch: gh-pages + ghp_path: / notifications: commits: commits@iceberg.apache.org diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml new file mode 100644 index 000000000..abf979206 --- /dev/null +++ b/.github/workflows/website.yml @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Website + +on: + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Setup mdBook + uses: peaceiris/actions-mdbook@v1 + with: + mdbook-version: '0.4.36' + + - name: Build + working-directory: website + run: mdbook build + + - name: Copy asf file + run: cp .asf.yaml ./website/book/.asf.yaml + + - name: Deploy to gh-pages + uses: peaceiris/actions-gh-pages@v3.9.3 + if: github.event_name == 'push' && github.ref_name == 'main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: website/book + publish_branch: gh-pages diff --git a/.licenserc.yaml b/.licenserc.yaml index cd362bc94..662cea8d5 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -24,5 +24,6 @@ header: - 'LICENSE' - 'NOTICE' - '**/*.json' - + # Generated content by mdbook + - 'website/book' comment: on-failure diff --git a/Cargo.toml b/Cargo.toml index 2adce14f6..f05afc2a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,7 +41,7 @@ log = "^0.4" mockito = "^1" murmur3 = "0.5.2" once_cell = "1" -opendal = "0.43" +opendal = "0.44" ordered-float = "4.0.0" pretty_assertions = "1.4.0" port_scanner = "0.1.5" diff --git a/README.md b/README.md index d7caa34bc..325fba06a 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,77 @@ Native Rust implementation of [Apache Iceberg](https://iceberg.apache.org/). 
+## Roadmap + +### Catalog + +| Catalog Type | Status | +|--------------|-------------| +| Rest | Done | +| Hive | In Progress | +| Sql | Not Started | +| Glue | Not Started | +| DynamoDB | Not Started | + +### FileIO + +| FileIO Type | Status | +|-------------|-------------| +| S3 | Done | +| Local File | Done | +| GCS | Not Started | +| HDFS | Not Started | + +Our `FileIO` is powered by [Apache OpenDAL](https://github.com/apache/incubator-opendal), so it would be quite easy to +expand to other services. + +### Table API + +#### Reader + +| Feature | Status | +|------------------------------------------------------------|-------------| +| File based task planning | In progress | +| Size based task planning | Not started | +| Filter pushdown (manifest evaluation, partition pruning) | Not started | +| Apply deletions, including equality and position deletions | Not started | +| Read into arrow record batch | Not started | +| Parquet file support | Not started | +| ORC file support | Not started | + +#### Writer + +| Feature | Status | +|--------------------------|-------------| +| Data writer | Not started | +| Equality deletion writer | Not started | +| Position deletion writer | Not started | +| Partitioned writer | Not started | +| Upsert writer | Not started | +| Parquet file support | Not started | +| ORC file support | Not started | + +#### Transaction + +| Feature | Status | +|-----------------------|-------------| +| Schema evolution | Not started | +| Update partition spec | Not started | +| Update properties | Not started | +| Replace sort order | Not started | +| Update location | Not started | +| Append files | Not started | +| Rewrite files | Not started | +| Rewrite manifests | Not started | +| Overwrite files | Not started | +| Row level updates | Not started | +| Replace partitions | Not started | +| Snapshot management | Not started | + +### Integrations + +We will add integrations with other Rust-based data systems, such as polars, datafusion, etc.
+ ## Contribute Iceberg is an active open-source project. We are always open to people who want to use it or contribute to it. Here are some ways to go. diff --git a/crates/iceberg/src/expr/mod.rs b/crates/iceberg/src/expr/mod.rs new file mode 100644 index 000000000..aef144441 --- /dev/null +++ b/crates/iceberg/src/expr/mod.rs @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This module contains expressions. + +mod term; +pub use term::*; +mod predicate; +pub use predicate::*; + +/// Predicate operators used in expressions. +#[allow(missing_docs)] +pub enum PredicateOperator { + IsNull, + NotNull, + IsNan, + NotNan, + LessThan, + LessThanOrEq, + GreaterThan, + GreaterThanOrEq, + Eq, + NotEq, + In, + NotIn, + StartsWith, + NotStartsWith, +} diff --git a/crates/iceberg/src/expr/predicate.rs b/crates/iceberg/src/expr/predicate.rs new file mode 100644 index 000000000..9d6bf8662 --- /dev/null +++ b/crates/iceberg/src/expr/predicate.rs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This module contains predicate expressions. +//! Predicate expressions are used to filter data, and evaluate to a boolean value. For example, +//! `a > 10` is a predicate expression, and it evaluates to `true` if `a` is greater than `10`. + +use crate::expr::{BoundReference, PredicateOperator, UnboundReference}; +use crate::spec::Literal; +use std::collections::HashSet; + +/// Logical expression, such as `AND`, `OR`, `NOT`. +pub struct LogicalExpression { + inputs: [Box; N], +} + +/// Unary predicate, for example, `a IS NULL`. +pub struct UnaryExpression { + /// Operator of this predicate, must be single operand operator. + op: PredicateOperator, + /// Term of this predicate, for example, `a` in `a IS NULL`. + term: T, +} + +/// Binary predicate, for example, `a > 10`. +pub struct BinaryExpression { + /// Operator of this predicate, must be binary operator, such as `=`, `>`, `<`, etc. + op: PredicateOperator, + /// Term of this predicate, for example, `a` in `a > 10`. + term: T, + /// Literal of this predicate, for example, `10` in `a > 10`. + literal: Literal, +} + +/// Set predicates, for example, `a in (1, 2, 3)`. +pub struct SetExpression { + /// Operator of this predicate, must be set operator, such as `IN`, `NOT IN`, etc. + op: PredicateOperator, + /// Term of this predicate, for example, `a` in `a in (1, 2, 3)`.
+ term: T, + /// Literals of this predicate, for example, `(1, 2, 3)` in `a in (1, 2, 3)`. + literals: HashSet, +} + +/// Unbound predicate expression before binding to a schema. +pub enum UnboundPredicate { + /// And predicate, for example, `a > 10 AND b < 20`. + And(LogicalExpression), + /// Or predicate, for example, `a > 10 OR b < 20`. + Or(LogicalExpression), + /// Not predicate, for example, `NOT (a > 10)`. + Not(LogicalExpression), + /// Unary expression, for example, `a IS NULL`. + Unary(UnaryExpression), + /// Binary expression, for example, `a > 10`. + Binary(BinaryExpression), + /// Set predicates, for example, `a in (1, 2, 3)`. + Set(SetExpression), +} + +/// Bound predicate expression after binding to a schema. +pub enum BoundPredicate { + /// An expression always evaluates to true. + AlwaysTrue, + /// An expression always evaluates to false. + AlwaysFalse, + /// An expression combined by `AND`, for example, `a > 10 AND b < 20`. + And(LogicalExpression), + /// An expression combined by `OR`, for example, `a > 10 OR b < 20`. + Or(LogicalExpression), + /// An expression combined by `NOT`, for example, `NOT (a > 10)`. + Not(LogicalExpression), + /// Unary expression, for example, `a IS NULL`. + Unary(UnaryExpression), + /// Binary expression, for example, `a > 10`. + Binary(BinaryExpression), + /// Set predicates, for example, `a in (1, 2, 3)`. + Set(SetExpression), +} diff --git a/crates/iceberg/src/expr/term.rs b/crates/iceberg/src/expr/term.rs new file mode 100644 index 000000000..5a81ecdfc --- /dev/null +++ b/crates/iceberg/src/expr/term.rs @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Term definition. + +use crate::spec::NestedFieldRef; + +/// Unbound term before binding to a schema. +pub type UnboundTerm = UnboundReference; + +/// A named reference in an unbound expression. +/// For example, `a` in `a > 10`. +pub struct UnboundReference { + name: String, +} + +/// A named reference in a bound expression after binding to a schema. +pub struct BoundReference { + field: NestedFieldRef, +} + +/// Bound term after binding to a schema. +pub type BoundTerm = BoundReference; diff --git a/crates/iceberg/src/lib.rs b/crates/iceberg/src/lib.rs index 3de082052..7d652d8b0 100644 --- a/crates/iceberg/src/lib.rs +++ b/crates/iceberg/src/lib.rs @@ -28,6 +28,7 @@ pub use error::ErrorKind; pub use error::Result; mod catalog; + pub use catalog::Catalog; pub use catalog::Namespace; pub use catalog::NamespaceIdent; @@ -45,5 +46,8 @@ pub mod io; pub mod spec; mod scan; + +#[allow(dead_code)] +pub mod expr; pub mod transaction; pub mod transform; diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index 654e1b1c7..bdd0d0a56 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -189,7 +189,7 @@ impl ManifestWriter { let entry = self .field_summary .remove(&field.source_id) - .unwrap_or(FieldSummary::default()); + .unwrap_or_default(); partition_summary.push(entry); } partition_summary diff --git a/rust-toolchain.toml b/rust-toolchain.toml index a5a7402a5..0763a9d50 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -16,5 +16,5 @@ # under the License. 
[toolchain] -channel = "1.72.1" +channel = "1.75.0" components = ["rustfmt", "clippy"] diff --git a/website/.gitignore b/website/.gitignore new file mode 100644 index 000000000..6155ce079 --- /dev/null +++ b/website/.gitignore @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +book diff --git a/website/README.md b/website/README.md new file mode 100644 index 000000000..9914b8222 --- /dev/null +++ b/website/README.md @@ -0,0 +1,40 @@ + + +# Iceberg Rust Website + +## Setup + +Install mdbook first + +```shell +cargo install mdbook +``` + +## Preview + +```shell +mdbook serve +``` + +## Build + +```shell +mdbook build +``` diff --git a/website/book.toml b/website/book.toml new file mode 100644 index 000000000..780208159 --- /dev/null +++ b/website/book.toml @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[book] +authors = ["Iceberg Community"] +language = "en" +multilingual = false +src = "src" +title = "Iceberg Rust" + +[output.html] +git-repository-url = "https://github.com/apache/iceberg-rust" +git-repository-icon = "fa-github" +edit-url-template = "https://github.com/apache/iceberg-rust/edit/main/website/{path}" +cname = "rust.iceberg.apache.org" +no-section-label = true \ No newline at end of file diff --git a/website/src/CONTRIBUTING.md b/website/src/CONTRIBUTING.md new file mode 120000 index 000000000..f939e75f2 --- /dev/null +++ b/website/src/CONTRIBUTING.md @@ -0,0 +1 @@ +../../CONTRIBUTING.md \ No newline at end of file diff --git a/website/src/SUMMARY.md b/website/src/SUMMARY.md new file mode 100644 index 000000000..c0aa07fbc --- /dev/null +++ b/website/src/SUMMARY.md @@ -0,0 +1,28 @@ + + +- [Introduction](./introduction.md) + +# User Guide + +- [Install](./install.md) + +# Developer Guide + +- [Contributing](./CONTRIBUTING.md) \ No newline at end of file diff --git a/website/src/install.md b/website/src/install.md new file mode 100644 index 000000000..3a96cb5b0 --- /dev/null +++ b/website/src/install.md @@ -0,0 +1,32 @@ + + +# Install + +Add `iceberg` to the `Cargo.toml` dependencies: + +```toml +iceberg = "0.2.0" +``` + +iceberg is under active development, so you may want to use the git version instead: + +```toml +iceberg = { git = "https://github.com/apache/iceberg-rust", rev = "commit-hash" } +``` diff --git a/website/src/introduction.md b/website/src/introduction.md new file mode 100644 index 000000000..260ec690e --- /dev/null +++
b/website/src/introduction.md @@ -0,0 +1,22 @@ + + +# Iceberg Rust + +Iceberg Rust is a Rust implementation for accessing Iceberg tables.