Skip to content

Commit

Permalink
refine
Browse files Browse the repository at this point in the history
  • Loading branch information
ZENOTME committed Feb 5, 2024
1 parent 71a1cbf commit ce003a7
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 23 deletions.
35 changes: 13 additions & 22 deletions crates/iceberg/src/writer/file_writer/location_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@

use std::sync::{atomic::AtomicU64, Arc};

use crate::spec::DataFileFormat;

/// `LocationGenerator` used to generate the location of data file.
pub trait LocationGenerator: Clone + Send + 'static {
/// Generate a absolute path for the given file name.
/// Generate an absolute path for the given file name.
/// e.g
/// For file name "part-00000.parquet", the generated location maybe "/table/data/part-00000.parquet"
fn generate_location(&self, file_name: &str) -> String;
Expand All @@ -35,39 +37,28 @@ pub trait FileNameGenerator: Clone + Send + 'static {

/// `DefaultFileNameGenerator` used to generate file name for data file. The file name can be
/// passed to `LocationGenerator` to generate the location of the file.
/// The rule of file name is aligned with the OutputFileFactory in iceberg-java
/// The file name format is "{prefix}-{file_count}[-{suffix}].{file_format}".
#[derive(Clone)]
pub struct DefaultFileNameGenerator {
partition_id: u64,
task_id: u64,
// The purpose of this id is to be able to know from two paths that they were written by the
// same operation.
// That's useful, for example, if a Spark job dies and leaves files in the file system, you can
// identify them all
// with a recursive listing and grep.
operator_id: String,
prefix: String,
suffix: String,
format: String,
file_count: Arc<AtomicU64>,
}

impl DefaultFileNameGenerator {
/// Create a new `FileNameGenerator`.
pub fn new(
partition_id: u64,
task_id: u64,
operator_id: String,
suffix: Option<String>,
) -> Self {
pub fn new(prefix: String, suffix: Option<String>, format: DataFileFormat) -> Self {
let suffix = if let Some(suffix) = suffix {
format!("--{}", suffix)
format!("-{}", suffix)
} else {
"".to_string()
};

Self {
partition_id,
task_id,
operator_id,
prefix,
suffix,
format: format.to_string(),
file_count: Arc::new(AtomicU64::new(0)),
}
}
Expand All @@ -79,8 +70,8 @@ impl FileNameGenerator for DefaultFileNameGenerator {
.file_count
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
format!(
"{:05}-{}-{}-{:05}{}",
self.partition_id, self.task_id, self.operator_id, file_id, self.suffix
"{}-{:05}{}.{}",
self.prefix, file_id, self.suffix, self.format
)
}
}
Expand Down
3 changes: 2 additions & 1 deletion crates/iceberg/src/writer/file_writer/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ mod tests {
let file_io = FileIOBuilder::new_fs_io().build().unwrap();
let loccation_gen =
MockLocationGenerator::new(temp_dir.path().to_str().unwrap().to_string());
let file_name_gen = DefaultFileNameGenerator::new(0, 0, "test".to_string(), None);
let file_name_gen =
DefaultFileNameGenerator::new("test".to_string(), None, DataFileFormat::Parquet);

// prepare data
let schema = {
Expand Down

0 comments on commit ce003a7

Please sign in to comment.