refine

apache · Feb 5, 2024 · ce003a7 · ce003a7
1 parent 71a1cbf
commit ce003a7
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 23 deletions.
diff --git a/crates/iceberg/src/writer/file_writer/location_generator.rs b/crates/iceberg/src/writer/file_writer/location_generator.rs
@@ -19,9 +19,11 @@
 
 use std::sync::{atomic::AtomicU64, Arc};
 
+use crate::spec::DataFileFormat;
+
 /// `LocationGenerator` used to generate the location of data file.
 pub trait LocationGenerator: Clone + Send + 'static {
-    /// Generate a absolute path for the given file name.
+    /// Generate an absolute path for the given file name.
     /// e.g
     /// For file name "part-00000.parquet", the generated location maybe "/table/data/part-00000.parquet"
     fn generate_location(&self, file_name: &str) -> String;
@@ -35,39 +37,28 @@ pub trait FileNameGenerator: Clone + Send + 'static {
 
 /// `DefaultFileNameGenerator` used to generate file name for data file. The file name can be
 /// passed to `LocationGenerator` to generate the location of the file.
-/// The rule of file name is aligned with the OutputFileFactory in iceberg-java
+/// The file name format is "{prefix}-{file_count}[-{suffix}].{file_format}".
 #[derive(Clone)]
 pub struct DefaultFileNameGenerator {
-    partition_id: u64,
-    task_id: u64,
-    // The purpose of this id is to be able to know from two paths that they were written by the
-    // same operation.
-    // That's useful, for example, if a Spark job dies and leaves files in the file system, you can
-    // identify them all
-    // with a recursive listing and grep.
-    operator_id: String,
+    prefix: String,
     suffix: String,
+    format: String,
     file_count: Arc<AtomicU64>,
 }
 
 impl DefaultFileNameGenerator {
     /// Create a new `FileNameGenerator`.
-    pub fn new(
-        partition_id: u64,
-        task_id: u64,
-        operator_id: String,
-        suffix: Option<String>,
-    ) -> Self {
+    pub fn new(prefix: String, suffix: Option<String>, format: DataFileFormat) -> Self {
         let suffix = if let Some(suffix) = suffix {
-            format!("--{}", suffix)
+            format!("-{}", suffix)
         } else {
             "".to_string()
         };
+
         Self {
-            partition_id,
-            task_id,
-            operator_id,
+            prefix,
             suffix,
+            format: format.to_string(),
             file_count: Arc::new(AtomicU64::new(0)),
         }
     }
@@ -79,8 +70,8 @@ impl FileNameGenerator for DefaultFileNameGenerator {
             .file_count
             .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
         format!(
-            "{:05}-{}-{}-{:05}{}",
-            self.partition_id, self.task_id, self.operator_id, file_id, self.suffix
+            "{}-{:05}{}.{}",
+            self.prefix, file_id, self.suffix, self.format
         )
     }
 }

diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs
@@ -290,7 +290,8 @@ mod tests {
         let file_io = FileIOBuilder::new_fs_io().build().unwrap();
         let loccation_gen =
             MockLocationGenerator::new(temp_dir.path().to_str().unwrap().to_string());
-        let file_name_gen = DefaultFileNameGenerator::new(0, 0, "test".to_string(), None);
+        let file_name_gen =
+            DefaultFileNameGenerator::new("test".to_string(), None, DataFileFormat::Parquet);
 
         // prepare data
         let schema = {