Commit

Deriving Spark DataFrame schema on converting from RDD to DataFrame (#…
dvadym authored Nov 22, 2023
1 parent 36fc26c commit acf9f67
Showing 2 changed files with 54 additions and 4 deletions.
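
The change below makes SparkConverter build an explicit Spark schema instead of letting createDataFrame infer one from the result RDD. As a minimal standalone sketch of why this matters (not part of the commit; the local SparkSession and column names are illustrative assumptions): Spark cannot infer a schema from an empty RDD, so converting the DP result back to a DataFrame fails whenever it contains no rows unless a schema is supplied.

```python
# Minimal sketch, assuming a local SparkSession and hypothetical column names.
import pyspark
from pyspark.sql import types

spark = pyspark.sql.SparkSession.builder.master("local[1]").getOrCreate()
empty_rdd = spark.sparkContext.emptyRDD()

schema = types.StructType([
    types.StructField("group_key", types.StringType(), nullable=False),
    types.StructField("count_column", types.DoubleType(), nullable=False),
])

# spark.createDataFrame(empty_rdd) raises an error because a schema cannot
# be inferred from an empty RDD; with an explicit schema an empty DataFrame
# with the expected columns is returned instead.
df = spark.createDataFrame(empty_rdd, schema)
assert df.toPandas().empty
```
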
pipeline_dp/dataframes.py: 32 changes (28 additions & 4 deletions)
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Computing DP aggregations on (Pandas, Spark, Beam) Dataframes."""
 import abc
+import copy
 from collections import namedtuple
 from dataclasses import dataclass
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
@@ -56,7 +57,7 @@ def dataframe_to_collection(df, columns: Columns):
         pass

     @abc.abstractmethod
-    def collection_to_dataframe(col, group_key_column: str):
+    def collection_to_dataframe(col, metric_output_columns: Sequence[str]):
         pass

@@ -65,9 +66,11 @@ class SparkConverter(DataFrameConvertor):

     def __init__(self, spark: pyspark.sql.SparkSession):
         self._spark = spark
+        self._partition_key_schema = None

     def dataframe_to_collection(self, df: SparkDataFrame,
                                 columns: Columns) -> pyspark.RDD:
+        self._save_partition_key_schema(df, columns.partition_key)
         columns_to_keep = [columns.privacy_key]
         if isinstance(columns.partition_key, str):
             num_partition_columns = 1
@@ -90,8 +93,29 @@ def extractor(row):

         return df.rdd.map(extractor)

-    def collection_to_dataframe(self, col: pyspark.RDD) -> SparkDataFrame:
-        return self._spark.createDataFrame(col)
+    def _save_partition_key_schema(self, df: SparkDataFrame,
+                                   partition_key: Union[str, Sequence[str]]):
+        col_name_to_schema = dict((col.name, col) for col in df.schema)
+        self._partition_key_schema = []
+        if isinstance(partition_key, str):
+            self._partition_key_schema.append(col_name_to_schema[partition_key])
+        else:
+            for column_name in partition_key:
+                self._partition_key_schema.append(
+                    col_name_to_schema[column_name])
+
+    def collection_to_dataframe(
+            self, col: pyspark.RDD,
+            metric_output_columns: Sequence[str]) -> SparkDataFrame:
+        schema_fields = copy.deepcopy(self._partition_key_schema)
+        float_type = pyspark.sql.types.DoubleType()
+        for metric_column in metric_output_columns:
+            schema_fields.append(
+                pyspark.sql.types.StructField(metric_column,
+                                              float_type,
+                                              nullable=False))
+        schema = pyspark.sql.types.StructType(schema_fields)
+        return self._spark.createDataFrame(col, schema)


 def _create_backend_for_dataframe(
@@ -217,7 +241,7 @@ def convert_to_partition_metrics_tuple(row):
                                 "Convert to NamedTuple")
         # dp_result: PartitionMetricsTuple

-        return converter.collection_to_dataframe(dp_result)
+        return converter.collection_to_dataframe(dp_result, output_columns)


 @dataclass
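
For illustration, here is a simplified standalone version of the approach above (not part of the commit; the example DataFrame and column names are assumptions): the partition-key StructFields are reused from the input DataFrame's schema, and one non-nullable DoubleType field is appended per metric output column.

```python
# Simplified sketch of deriving the output schema, assuming a local
# SparkSession and hypothetical column names.
import copy

import pyspark
from pyspark.sql import types

spark = pyspark.sql.SparkSession.builder.master("local[1]").getOrCreate()
input_df = spark.createDataFrame([("user1", "group1", 1.0)],
                                 ["privacy_key", "group_key", "value"])

# Reuse the partition-key field (name, type, nullability) from the input schema.
partition_key_fields = [f for f in input_df.schema if f.name == "group_key"]

# Append one non-nullable double field per metric output column.
schema_fields = copy.deepcopy(partition_key_fields)
for metric_column in ["count_column"]:
    schema_fields.append(
        types.StructField(metric_column, types.DoubleType(), nullable=False))
output_schema = types.StructType(schema_fields)

# The explicit schema applies to the result RDD, even when it is empty.
result_rdd = spark.sparkContext.parallelize([("group1", 12.0)])
print(spark.createDataFrame(result_rdd, output_schema).collect())
```
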
tests/dataframes_test.py: 26 changes (26 additions & 0 deletions)
@@ -402,6 +402,32 @@ def test_run_query_multiple_partition_keys_e2e_run(self):
         self.assertAlmostEqual(row1["count"], 1, delta=1e-3)
         self.assertAlmostEqual(row1["sum_column"], 5, delta=1e-3)

+    def test_run_query_e2e_run_empty_result(self):
+        # Arrange
+        spark = self._get_spark_session()
+        df = spark.createDataFrame(get_pandas_df())
+        columns = dataframes.Columns("privacy_key", "group_key", "value")
+        metrics = {pipeline_dp.Metrics.COUNT: "count_column"}
+        bounds = dataframes.ContributionBounds(
+            max_partitions_contributed=2,
+            max_contributions_per_partition=2,
+            min_value=-5,
+            max_value=5)
+        query = dataframes.Query(df,
+                                 columns,
+                                 metrics,
+                                 bounds,
+                                 public_partitions=None)
+
+        # Act
+        budget = dataframes.Budget(1, 1e-10)
+        result_df = query.run_query(budget)
+
+        # Assert
+        # With such a small input dataset and private partition selection,
+        # the result is almost surely empty.
+        self.assertTrue(result_df.toPandas().empty)
+

 if __name__ == '__main__':
     absltest.main()
