diff --git a/analysis/parameter_tuning.py b/analysis/parameter_tuning.py
index e0a58c88..62edb32c 100644
--- a/analysis/parameter_tuning.py
+++ b/analysis/parameter_tuning.py
@@ -138,7 +138,8 @@ def _find_candidate_parameters(
     Args:
       hist: dataset contribution histogram.
      parameters_to_tune: which parameters to tune.
-      metric: dp aggregation for which candidates are computed.
+      metric: dp aggregation for which candidates are computed. If metric is
+        None, no metrics are computed and only partition selection is tuned.
      strategy: determines the strategy how to select candidates, see comments
        to enum values for full description of the respective strategies.
      max_candidates: how many candidates ((l0, linf) pairs) can be in the
@@ -243,8 +244,8 @@ def tune(col,
     """Tunes parameters.
 
     It works in the following way:
-    1. Based on quantiles of privacy id contributions, candidates for
-    contribution bounding parameters chosen.
+    1. Candidates for contribution bounding parameters are chosen based on
+    options.parameters_search_strategy.
     2. Utility analysis run for those parameters.
     3. The best parameter set is chosen according to
       options.minimizing_function.
@@ -252,6 +253,9 @@
     The result contains output metrics for all utility analysis which were
     performed.
 
+    For tuning parameters for DPEngine.select_partitions, set
+    options.aggregate_params.metrics to an empty list.
+
     Args:
       col: collection where all elements are of the same type.
      contribution_histograms:
@@ -341,19 +345,23 @@ def _convert_utility_analysis_to_tune_result(
 def _check_tune_args(options: TuneOptions, is_public_partitions: bool):
     # Check metrics to tune.
     metrics = options.aggregate_params.metrics
-    if len(metrics) > 1:
+    if not metrics:
+        # Empty metrics means tuning for select_partitions.
+        if is_public_partitions:
+            # Partition selection is not performed when partitions are public.
+            raise ValueError("empty metrics means tuning of partition selection"
+                             " but public partitions were provided")
+    elif len(metrics) > 1:
         raise NotImplementedError(
             f"Tuning supports only one metrics, but {metrics} given.")
-    if len(metrics) == 1 and metrics[0] not in [
-            pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
-    ]:
-        raise NotImplementedError(
-            f"Tuning is supported only for Count and Privacy id count, but {metrics[0]} given."
-        )
+    else:  # len(metrics) == 1
+        if metrics[0] not in [
+                pipeline_dp.Metrics.COUNT, pipeline_dp.Metrics.PRIVACY_ID_COUNT
+        ]:
+            raise NotImplementedError(
+                f"Tuning is supported only for Count and Privacy id count, but {metrics[0]} given."
+            )
 
-    if is_public_partitions and not metrics:
-        raise ValueError("empty metrics means private partition selection "
-                         "but public partitions were provided")
     if options.function_to_minimize != MinimizingFunction.ABSOLUTE_ERROR:
         raise NotImplementedError(
             f"Only {MinimizingFunction.ABSOLUTE_ERROR} is implemented.")
diff --git a/analysis/tests/parameter_tuning_test.py b/analysis/tests/parameter_tuning_test.py
index bb2ab697..abfbccd1 100644
--- a/analysis/tests/parameter_tuning_test.py
+++ b/analysis/tests/parameter_tuning_test.py
@@ -16,6 +16,7 @@
 from absl.testing import absltest
 from absl.testing import parameterized
 from unittest import mock
+from unittest.mock import patch
 from typing import List
 
 import pipeline_dp
@@ -206,7 +207,7 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
             max_candidates=5,
             # ceil(1000^(i / 4)), where i in [0, 1, 2, 3, 4]
             expected_candidates=[1, 6, 32, 178, 1000]))
-    def test_find_candidate_parameters_constant_relative_ste_strategy(
+    def test_find_candidate_parameters_constant_relative_step_strategy(
             self, max_value, max_candidates, expected_candidates):
         mock_l0_histogram = histograms.Histogram(None, None)
         mock_l0_histogram.max_value = mock.Mock(return_value=max_value)
@@ -227,6 +228,60 @@ def test_find_candidate_parameters_constant_relative_ste_strategy(
         self.assertEqual(expected_candidates,
                          candidates.max_partitions_contributed)
 
+    @parameterized.named_parameters(
+        dict(
+            testcase_name='COUNT',
+            metric=pipeline_dp.Metrics.COUNT,
+            expected_generate_linf=True,
+        ),
+        dict(
+            testcase_name='PRIVACY_ID_COUNT',
+            metric=pipeline_dp.Metrics.PRIVACY_ID_COUNT,
+            expected_generate_linf=False,
+        ),
+        dict(
+            testcase_name='No metric (select partition)',
+            metric=None,
+            expected_generate_linf=False,
+        ))
+    @patch('analysis.parameter_tuning._find_candidates_constant_relative_step')
+    def test_find_candidate_parameters_generate_linf(
+            self, mock_find_candidate_from_histogram, metric,
+            expected_generate_linf):
+        mock_l0_histogram = histograms.Histogram(
+            histograms.HistogramType.L0_CONTRIBUTIONS, None)
+        mock_linf_histogram = histograms.Histogram(
+            histograms.HistogramType.LINF_CONTRIBUTIONS, None)
+        mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
+                                                       mock_linf_histogram,
+                                                       None, None)
+
+        mock_find_candidate_from_histogram.return_value = [1, 2]
+
+        parameters_to_tune = parameter_tuning.ParametersToTune(
+            max_partitions_contributed=True,
+            max_contributions_per_partition=True)
+
+        candidates = parameter_tuning._find_candidate_parameters(
+            mock_histograms,
+            parameters_to_tune,
+            metric,
+            ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
+            max_candidates=100)
+
+        mock_find_candidate_from_histogram.assert_any_call(
+            mock_l0_histogram, mock.ANY)
+        if expected_generate_linf:
+            self.assertEqual(candidates.max_partitions_contributed,
+                             [1, 1, 2, 2])
+            self.assertEqual(candidates.max_contributions_per_partition,
+                             [1, 2, 1, 2])
+            mock_find_candidate_from_histogram.assert_any_call(
+                mock_linf_histogram, mock.ANY)
+        else:
+            self.assertEqual(candidates.max_partitions_contributed, [1, 2])
+            self.assertIsNone(candidates.max_contributions_per_partition)
+
     def test_tune_count(self):
         # Arrange.
         input = [(i % 10, f"pk{i/10}") for i in range(10)]
@@ -264,6 +319,42 @@ def test_tune_count(self):
         self.assertEqual(utility_reports[0].metric_errors[0].metric,
                          pipeline_dp.Metrics.COUNT)
 
+    def test_select_partitions(self):
+        # Arrange.
+        input = [(i % 10, f"pk{i/10}") for i in range(10)]
+        data_extractors = pipeline_dp.DataExtractors(
+            privacy_id_extractor=lambda x: x[0],
+            partition_extractor=lambda x: x[1],
+            value_extractor=lambda x: 0)
+
+        contribution_histograms = list(
+            computing_histograms.compute_dataset_histograms(
+                input, data_extractors, pipeline_dp.LocalBackend()))[0]
+
+        tune_options = _get_tune_options()
+        # Setting metrics to an empty list runs only partition selection.
+        tune_options.aggregate_params.metrics = []
+
+        # Act.
+        result = parameter_tuning.tune(input, pipeline_dp.LocalBackend(),
+                                       contribution_histograms, tune_options,
+                                       data_extractors)
+
+        # Assert.
+        tune_result, per_partition_utility_analysis = result
+        per_partition_utility_analysis = list(per_partition_utility_analysis)
+        self.assertLen(per_partition_utility_analysis, 10)
+
+        tune_result = list(tune_result)[0]
+
+        self.assertEqual(tune_options, tune_result.options)
+        self.assertEqual(contribution_histograms,
+                         tune_result.contribution_histograms)
+        utility_reports = tune_result.utility_reports
+        self.assertLen(utility_reports, 1)
+        self.assertIsInstance(utility_reports[0], metrics.UtilityReport)
+        self.assertIsNone(utility_reports[0].metric_errors)
+
     def test_tune_privacy_id_count(self):
         # Arrange.
         input = [(i % 10, f"pk{i/10}") for i in range(10)]
@@ -299,6 +390,9 @@ def test_tune_privacy_id_count(self):
         self.assertEqual(utility_reports[0].metric_errors[0].metric,
                          pipeline_dp.Metrics.PRIVACY_ID_COUNT)
 
+    def test_tune_params_validation(self):
+        pass
+
 
 if __name__ == '__main__':
     absltest.main()
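
A minimal usage sketch of the new empty-metrics mode (not part of the diff), mirroring the added test_select_partitions test. The import paths and the helper name tune_for_select_partitions are assumptions based on the module names used in the test file; tune_options stands for an already configured parameter_tuning.TuneOptions.

from analysis import computing_histograms
from analysis import parameter_tuning


def tune_for_select_partitions(col, backend, data_extractors, tune_options):
    """Tunes contribution bounds for DPEngine.select_partitions only."""
    # Contribution histograms drive candidate generation for the l0 bound.
    contribution_histograms = list(
        computing_histograms.compute_dataset_histograms(
            col, data_extractors, backend))[0]
    # An empty metrics list now means "no aggregations to tune, only
    # partition selection"; public partitions must not be provided here.
    tune_options.aggregate_params.metrics = []
    return parameter_tuning.tune(col, backend, contribution_histograms,
                                 tune_options, data_extractors)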