forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARROW-4160: [Rust] Add README and executable files to parquet
Author: Chao Sun <[email protected]> Closes apache#3314 from sunchao/ARROW-4160 and squashes the following commits: 9d215df <Chao Sun> ARROW-4160: Add README and executable files to parquet
- Loading branch information
Showing
4 changed files
with
289 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,9 +17,11 @@ | |
|
||
[package] | ||
name = "parquet" | ||
version = "0.12.0-SNAPSHOT" | ||
version = "0.5.0-SNAPSHOT" | ||
license = "Apache-2.0" | ||
description = "Apache Parquet implementation in Rust" | ||
homepage = "https://github.com/apache/arrow" | ||
repository = "https://github.com/apache/arrow" | ||
authors = ["Apache Arrow <[email protected]>"] | ||
keywords = [ "arrow", "parquet", "hadoop" ] | ||
readme = "README.md" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
<!--- | ||
Licensed to the Apache Software Foundation (ASF) under one | ||
or more contributor license agreements. See the NOTICE file | ||
distributed with this work for additional information | ||
regarding copyright ownership. The ASF licenses this file | ||
to you under the Apache License, Version 2.0 (the | ||
"License"); you may not use this file except in compliance | ||
with the License. You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, | ||
software distributed under the License is distributed on an | ||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
KIND, either express or implied. See the License for the | ||
specific language governing permissions and limitations | ||
under the License. | ||
--> | ||
|
||
# An Apache Parquet implementation in Rust | ||
|
||
## Usage | ||
Add this to your Cargo.toml: | ||
```toml | ||
[dependencies] | ||
parquet = "0.4" | ||
``` | ||
|
||
and this to your crate root: | ||
```rust | ||
extern crate parquet; | ||
``` | ||
|
||
Example usage of reading data: | ||
```rust | ||
use std::fs::File; | ||
use std::path::Path; | ||
use parquet::file::reader::{FileReader, SerializedFileReader}; | ||
|
||
let file = File::open(&Path::new("/path/to/file")).unwrap(); | ||
let reader = SerializedFileReader::new(file).unwrap(); | ||
let mut iter = reader.get_row_iter(None).unwrap(); | ||
while let Some(record) = iter.next() { | ||
println!("{}", record); | ||
} | ||
``` | ||
See the [crate documentation](https://docs.rs/crate/parquet/0.4.2) for the available API. | ||
|
||
## Supported Parquet Version | ||
- Parquet-format 2.4.0 | ||
|
||
To update Parquet format to a newer version, check if a newer version of [parquet-format](https://github.com/sunchao/parquet-format-rs) | ||
is available. Then simply update the version of the `parquet-format` crate in Cargo.toml. | ||
|
||
## Features | ||
- [X] All encodings supported | ||
- [X] All compression codecs supported | ||
- [X] Read support | ||
- [X] Primitive column value readers | ||
- [X] Row record reader | ||
- [ ] Arrow record reader | ||
- [X] Statistics support | ||
- [X] Write support | ||
- [X] Primitive column value writers | ||
- [ ] Row record writer | ||
- [ ] Arrow record writer | ||
- [ ] Predicate pushdown | ||
- [ ] Parquet format 2.5 support | ||
- [ ] HDFS support | ||
|
||
## Requirements | ||
- Rust nightly | ||
|
||
See [Working with nightly Rust](https://github.com/rust-lang-nursery/rustup.rs/blob/master/README.md#working-with-nightly-rust) | ||
to install the nightly toolchain and set it as the default. | ||
|
||
## Build | ||
Run `cargo build` for a debug build, or `cargo build --release` to build in release mode. | ||
Some features take advantage of SSE4.2 instructions, which can be | ||
enabled by adding `RUSTFLAGS="-C target-feature=+sse4.2"` before the | ||
`cargo build` command. | ||
|
||
## Test | ||
Run `cargo test` for unit tests. | ||
|
||
## Binaries | ||
The following binaries are provided (use `cargo install` to install them): | ||
- **parquet-schema** for printing Parquet file schema and metadata. | ||
`Usage: parquet-schema <file-path> [verbose]`, where `file-path` is the path to a Parquet file, | ||
and the optional `verbose` is a boolean flag that selects between printing the full metadata (`true`) | ||
and the schema only (`false`, which is the default when the flag is not specified). | ||
|
||
- **parquet-read** for reading records from a Parquet file. | ||
`Usage: parquet-read <file-path> [num-records]`, where `file-path` is the path to a Parquet file, | ||
and `num-records` is the number of records to read from a file (when not specified all records will | ||
be printed). | ||
|
||
If you see `Library not loaded` error, please make sure `LD_LIBRARY_PATH` is set properly: | ||
``` | ||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(rustc --print sysroot)/lib | ||
``` | ||
|
||
## Benchmarks | ||
Run `cargo bench` for benchmarks. | ||
|
||
## Docs | ||
To build documentation, run `cargo doc --no-deps`. | ||
To compile and view in the browser, run `cargo doc --no-deps --open`. | ||
|
||
## License | ||
Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Binary file to read data from a Parquet file. | ||
//! | ||
//! # Install | ||
//! | ||
//! `parquet-read` can be installed using `cargo`: | ||
//! ``` | ||
//! cargo install parquet | ||
//! ``` | ||
//! After this `parquet-read` should be globally available: | ||
//! ``` | ||
//! parquet-read XYZ.parquet | ||
//! ``` | ||
//! | ||
//! The binary can also be built from the source code and run as follows: | ||
//! ``` | ||
//! cargo run --bin parquet-read XYZ.parquet | ||
//! ``` | ||
//! | ||
//! # Usage | ||
//! | ||
//! ``` | ||
//! parquet-read <file-path> [num-records] | ||
//! ``` | ||
//! where `file-path` is the path to a Parquet file and `num-records` is an optional | ||
//! number that specifies how many records to read from the file. | ||
//! When not provided, all records are read. | ||
//! | ||
//! Note that `parquet-read` reads the full file schema; no projection or filtering is | ||
//! applied. | ||
|
||
extern crate parquet; | ||
|
||
use std::{env, fs::File, path::Path, process}; | ||
|
||
use parquet::file::reader::{FileReader, SerializedFileReader}; | ||
|
||
fn main() { | ||
let args: Vec<String> = env::args().collect(); | ||
if args.len() != 2 && args.len() != 3 { | ||
println!("Usage: parquet-read <file-path> [num-records]"); | ||
process::exit(1); | ||
} | ||
|
||
let mut num_records: Option<usize> = None; | ||
if args.len() == 3 { | ||
match args[2].parse() { | ||
Ok(value) => num_records = Some(value), | ||
Err(e) => panic!("Error when reading value for [num-records], {}", e), | ||
} | ||
} | ||
|
||
let path = Path::new(&args[1]); | ||
let file = File::open(&path).unwrap(); | ||
let parquet_reader = SerializedFileReader::new(file).unwrap(); | ||
|
||
// Use full schema as projected schema | ||
let mut iter = parquet_reader.get_row_iter(None).unwrap(); | ||
|
||
let mut start = 0; | ||
let end = num_records.unwrap_or(0); | ||
let all_records = num_records.is_none(); | ||
|
||
while all_records || start < end { | ||
match iter.next() { | ||
Some(row) => println!("{}", row), | ||
None => break, | ||
} | ||
start += 1; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Binary file to print the schema and metadata of a Parquet file. | ||
//! | ||
//! # Install | ||
//! | ||
//! `parquet-schema` can be installed using `cargo`: | ||
//! ``` | ||
//! cargo install parquet | ||
//! ``` | ||
//! After this `parquet-schema` should be globally available: | ||
//! ``` | ||
//! parquet-schema XYZ.parquet | ||
//! ``` | ||
//! | ||
//! The binary can also be built from the source code and run as follows: | ||
//! ``` | ||
//! cargo run --bin parquet-schema XYZ.parquet | ||
//! ``` | ||
//! | ||
//! # Usage | ||
//! | ||
//! ``` | ||
//! parquet-schema <file-path> [verbose] | ||
//! ``` | ||
//! where `file-path` is the path to a Parquet file and `verbose` is an optional boolean | ||
//! flag: when set to `false` (the default when not provided) only the schema is printed, | ||
//! and when set to `true` the full file metadata is printed. | ||
|
||
extern crate parquet; | ||
|
||
use std::{env, fs::File, path::Path, process}; | ||
|
||
use parquet::{ | ||
file::reader::{FileReader, SerializedFileReader}, | ||
schema::printer::{print_file_metadata, print_parquet_metadata}, | ||
}; | ||
|
||
fn main() { | ||
let args: Vec<String> = env::args().collect(); | ||
if args.len() != 2 && args.len() != 3 { | ||
println!("Usage: parquet-schema <file-path> [verbose]"); | ||
process::exit(1); | ||
} | ||
let path = Path::new(&args[1]); | ||
let mut verbose = false; | ||
if args.len() == 3 { | ||
match args[2].parse() { | ||
Ok(b) => verbose = b, | ||
Err(e) => panic!( | ||
"Error when reading value for [verbose] (expected either 'true' or 'false'): {}", | ||
e | ||
), | ||
} | ||
} | ||
let file = match File::open(&path) { | ||
Err(e) => panic!("Error when opening file {}: {}", path.display(), e), | ||
Ok(f) => f, | ||
}; | ||
match SerializedFileReader::new(file) { | ||
Err(e) => panic!("Error when parsing Parquet file: {}", e), | ||
Ok(parquet_reader) => { | ||
let metadata = parquet_reader.metadata(); | ||
println!("Metadata for file: {}", &args[1]); | ||
println!(""); | ||
if verbose { | ||
print_parquet_metadata(&mut std::io::stdout(), &metadata); | ||
} else { | ||
print_file_metadata(&mut std::io::stdout(), &metadata.file_metadata()); | ||
} | ||
} | ||
} | ||
} |