diff --git a/rust/parquet/Cargo.toml b/rust/parquet/Cargo.toml index aa7eac224c0cf..7478992327ddc 100644 --- a/rust/parquet/Cargo.toml +++ b/rust/parquet/Cargo.toml @@ -17,9 +17,11 @@ [package] name = "parquet" -version = "0.12.0-SNAPSHOT" +version = "0.5.0-SNAPSHOT" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" +homepage = "https://github.com/apache/arrow" +repository = "https://github.com/apache/arrow" authors = ["Apache Arrow <dev@arrow.apache.org>"] keywords = [ "arrow", "parquet", "hadoop" ] readme = "README.md" diff --git a/rust/parquet/README.md b/rust/parquet/README.md new file mode 100644 index 0000000000000..e9238ffba8a13 --- /dev/null +++ b/rust/parquet/README.md @@ -0,0 +1,111 @@ + + +# An Apache Parquet implementation in Rust + +## Usage +Add this to your Cargo.toml: +```toml +[dependencies] +parquet = "0.4" +``` + +and this to your crate root: +```rust +extern crate parquet; +``` + +Example usage of reading data: +```rust +use std::fs::File; +use std::path::Path; +use parquet::file::reader::{FileReader, SerializedFileReader}; + +let file = File::open(&Path::new("/path/to/file")).unwrap(); +let reader = SerializedFileReader::new(file).unwrap(); +let mut iter = reader.get_row_iter(None).unwrap(); +while let Some(record) = iter.next() { + println!("{}", record); +} +``` +See the [crate documentation](https://docs.rs/crate/parquet/0.4.2) for the available API. + +## Supported Parquet Version +- Parquet-format 2.4.0 + +To update Parquet format to a newer version, check whether a newer [parquet-format](https://github.com/sunchao/parquet-format-rs) +version is available. Then simply update the version of the `parquet-format` crate in Cargo.toml. 
+ +## Features +- [X] All encodings supported +- [X] All compression codecs supported +- [X] Read support + - [X] Primitive column value readers + - [X] Row record reader + - [ ] Arrow record reader +- [X] Statistics support +- [X] Write support + - [X] Primitive column value writers + - [ ] Row record writer + - [ ] Arrow record writer +- [ ] Predicate pushdown +- [ ] Parquet format 2.5 support +- [ ] HDFS support + +## Requirements +- Rust nightly + +See [Working with nightly Rust](https://github.com/rust-lang-nursery/rustup.rs/blob/master/README.md#working-with-nightly-rust) +to install the nightly toolchain and set it as the default. + +## Build +Run `cargo build`, or `cargo build --release` to build in release mode. +Some features take advantage of SSE4.2 instructions, which can be +enabled by adding `RUSTFLAGS="-C target-feature=+sse4.2"` before the +`cargo build` command. + +## Test +Run `cargo test` for unit tests. + +## Binaries +The following binaries are provided (use `cargo install` to install them): +- **parquet-schema** for printing Parquet file schema and metadata. +`Usage: parquet-schema <file-path> [verbose]`, where `file-path` is the path to a Parquet file, +and the optional `verbose` is a boolean flag that selects between printing the full metadata and the schema only +(when not specified, only the schema is printed). + +- **parquet-read** for reading records from a Parquet file. +`Usage: parquet-read <file-path> [num-records]`, where `file-path` is the path to a Parquet file, +and `num-records` is the number of records to read from the file (when not specified, all records are +printed). + +If you see a `Library not loaded` error, please make sure `LD_LIBRARY_PATH` is set properly: +``` +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(rustc --print sysroot)/lib +``` + +## Benchmarks +Run `cargo bench` for benchmarks. + +## Docs +To build documentation, run `cargo doc --no-deps`. +To compile and view in the browser, run `cargo doc --no-deps --open`. 
+ +## License +Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. diff --git a/rust/parquet/src/bin/parquet-read.rs b/rust/parquet/src/bin/parquet-read.rs new file mode 100644 index 0000000000000..c86b26e3e7a4d --- /dev/null +++ b/rust/parquet/src/bin/parquet-read.rs @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to read data from a Parquet file. +//! +//! # Install +//! +//! `parquet-read` can be installed using `cargo`: +//! ``` +//! cargo install parquet +//! ``` +//! After this `parquet-read` should be globally available: +//! ``` +//! parquet-read XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --bin parquet-read XYZ.parquet +//! ``` +//! +//! # Usage +//! +//! ``` +//! parquet-read <file-path> [num-records] +//! ``` +//! where `file-path` is the path to a Parquet file and `num-records` is an optional +//! number that limits how many records are read from the file. +//! When not provided, all records are read. +//! +//! Note that `parquet-read` reads the full file schema; no projection or filtering is +//! applied.
+ +extern crate parquet; + +use std::{env, fs::File, path::Path, process}; + +use parquet::file::reader::{FileReader, SerializedFileReader}; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() != 2 && args.len() != 3 { + println!("Usage: parquet-read [num-records]"); + process::exit(1); + } + + let mut num_records: Option = None; + if args.len() == 3 { + match args[2].parse() { + Ok(value) => num_records = Some(value), + Err(e) => panic!("Error when reading value for [num-records], {}", e), + } + } + + let path = Path::new(&args[1]); + let file = File::open(&path).unwrap(); + let parquet_reader = SerializedFileReader::new(file).unwrap(); + + // Use full schema as projected schema + let mut iter = parquet_reader.get_row_iter(None).unwrap(); + + let mut start = 0; + let end = num_records.unwrap_or(0); + let all_records = num_records.is_none(); + + while all_records || start < end { + match iter.next() { + Some(row) => println!("{}", row), + None => break, + } + start += 1; + } +} diff --git a/rust/parquet/src/bin/parquet-schema.rs b/rust/parquet/src/bin/parquet-schema.rs new file mode 100644 index 0000000000000..2eaf7652ae9d6 --- /dev/null +++ b/rust/parquet/src/bin/parquet-schema.rs @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary file to print the schema and metadata of a Parquet file. +//! +//! # Install +//! +//! `parquet-schema` can be installed using `cargo`: +//! ``` +//! cargo install parquet +//! ``` +//! After this `parquet-schema` should be globally available: +//! ``` +//! parquet-schema XYZ.parquet +//! ``` +//! +//! The binary can also be built from the source code and run as follows: +//! ``` +//! cargo run --bin parquet-schema XYZ.parquet +//! ``` +//! +//! # Usage +//! +//! ``` +//! parquet-schema [verbose] +//! ``` +//! where `file-path` is the path to a Parquet file and `verbose` is the optional boolean +//! flag that allows to print schema only, when set to `false` (default behaviour when +//! not provided), or print full file metadata, when set to `true`. + +extern crate parquet; + +use std::{env, fs::File, path::Path, process}; + +use parquet::{ + file::reader::{FileReader, SerializedFileReader}, + schema::printer::{print_file_metadata, print_parquet_metadata}, +}; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() != 2 && args.len() != 3 { + println!("Usage: parquet-schema [verbose]"); + process::exit(1); + } + let path = Path::new(&args[1]); + let mut verbose = false; + if args.len() == 3 { + match args[2].parse() { + Ok(b) => verbose = b, + Err(e) => panic!( + "Error when reading value for [verbose] (expected either 'true' or 'false'): {}", + e + ), + } + } + let file = match File::open(&path) { + Err(e) => panic!("Error when opening file {}: {}", path.display(), e), + Ok(f) => f, + }; + match SerializedFileReader::new(file) { + Err(e) => panic!("Error when parsing Parquet file: {}", e), + Ok(parquet_reader) => { + let metadata = parquet_reader.metadata(); + println!("Metadata for file: {}", &args[1]); + println!(""); + if verbose { + print_parquet_metadata(&mut std::io::stdout(), &metadata); + } else { + 
print_file_metadata(&mut std::io::stdout(), &metadata.file_metadata()); + } + } + } +}