Merge pull request #31 from DPGrev/dpgrev/improve-generating-test-data-documentation

Data Generator: bump Iceberg version, improve documentation
samansmink authored Dec 11, 2023
2 parents e16988b + f187a91 commit 7aa3d8e
Showing 3 changed files with 7 additions and 5 deletions.
8 changes: 5 additions & 3 deletions README.md
@@ -25,7 +25,7 @@ some issue that seems to cause issues with the avro files produced by the spark
### Test data generation

To generate test data, the script in 'scripts/test_data_generator' is used to have spark generate some test data. This is
-based on pyspark 3.4, which you can install through pip.
+based on pyspark 3.5, which you can install through pip.

### Building the extension

@@ -49,8 +49,10 @@ To generate the test data, run:
```shell
make data
```
-Note that the script requires python3, pyspark and duckdb-python to be installed. Assuming python3 is already installed,
-running `python3 -m pip install duckdb pyspark` should do the trick.
+
+**Note** that the script requires python3, pyspark and duckdb-python to be installed. Make sure that the correct versions for pyspark (3.5.0), java and scala (2.12) are installed.
+
+running `python3 -m pip install duckdb pyspark[sql]==3.5.0` should do the trick.

#### Running unit tests

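The updated note pins pyspark to 3.5.0 with Scala 2.12. Below is a minimal sketch, assuming pyspark is already installed and `java` is on PATH, for sanity-checking those versions before running `make data` (not part of the change):

```python
# Sketch: check the versions the README's note asks for.
import subprocess

import pyspark

# pyspark should be on the pinned 3.5.x line.
assert pyspark.__version__.startswith("3.5"), pyspark.__version__

# The default pyspark 3.5 wheel bundles a Scala 2.12 build of Spark, so only
# the Java runtime needs a separate check; `java -version` writes to stderr.
java_banner = subprocess.run(
    ["java", "-version"], capture_output=True, text=True
).stderr.splitlines()[0]
print("pyspark:", pyspark.__version__)
print("java:", java_banner)
```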
4 changes: 2 additions & 2 deletions scripts/test_data_generator/generate_iceberg.py
@@ -35,7 +35,7 @@
conf.set('spark.sql.catalog.iceberg_catalog.warehouse', DEST_PATH)
conf.set('spark.sql.parquet.outputTimestampType', 'TIMESTAMP_MICROS')
conf.set('spark.driver.memory', '10g')
-conf.set('spark.jars', f'{SCRIPT_DIR}/iceberg-spark-runtime-3.4_2.12-1.3.0.jar')
+conf.set('spark.jars', f'{SCRIPT_DIR}/iceberg-spark-runtime-3.5_2.12-1.4.2.jar')
conf.set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext
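For context, the new jar name encodes the compatibility triple this commit bumps: Spark 3.5, Scala 2.12, Iceberg 1.4.2. A hypothetical smoke test for the session built above (the namespace `db` and table `t` are illustrative, and the lines registering `iceberg_catalog` as an Iceberg catalog are elided from this hunk):

```python
# Hypothetical smoke test; assumes the elided conf lines register
# `iceberg_catalog` as an Iceberg SparkCatalog with its warehouse at DEST_PATH.
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg_catalog.db")
spark.sql("CREATE TABLE iceberg_catalog.db.t (id BIGINT) USING iceberg")
spark.sql("INSERT INTO iceberg_catalog.db.t VALUES (1), (2)")
spark.sql("SELECT count(*) FROM iceberg_catalog.db.t").show()
```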
@@ -95,4 +95,4 @@
### Finally, we copy the latest results to a "final" dir for easy test writing
###
import shutil
-shutil.copytree(f"{DEST_PATH}/expected_results/{last_file}", f"{DEST_PATH}/expected_results/last")
\ No newline at end of file
+shutil.copytree(f"{DEST_PATH}/expected_results/{last_file}", f"{DEST_PATH}/expected_results/last")
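One caveat on this line (an observation, not part of the diff): `shutil.copytree` raises `FileExistsError` when the destination already exists, so rerunning the generator against a stale `expected_results/last` directory would fail. A sketch of a rerun-tolerant variant, which needs Python 3.8+:

```python
# Sketch: tolerate an existing "last" dir on reruns (dirs_exist_ok needs Python 3.8+).
shutil.copytree(
    f"{DEST_PATH}/expected_results/{last_file}",
    f"{DEST_PATH}/expected_results/last",
    dirs_exist_ok=True,
)
```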
Binary file not shown.
