From dada1fed8469adc21a55a9dc30ee6c37aaf3ab3c Mon Sep 17 00:00:00 2001 From: Mikhail Pravilov Date: Thu, 29 Jun 2023 14:21:11 +0200 Subject: [PATCH] Implement pure logarithmic grid for candidates search. (#461) --- analysis/parameter_tuning.py | 38 +++++----- analysis/tests/parameter_tuning_test.py | 99 ++++++++++--------------- 2 files changed, 59 insertions(+), 78 deletions(-) diff --git a/analysis/parameter_tuning.py b/analysis/parameter_tuning.py index 64505e02..a72ed190 100644 --- a/analysis/parameter_tuning.py +++ b/analysis/parameter_tuning.py @@ -41,7 +41,9 @@ class ParametersSearchStrategy(Enum): # Picks up candidates that correspond tp a predefined list of quantiles. QUANTILES = 1 # Candidates are a sequence starting from 1 where relative difference - # between two neighbouring elements is (almost) the same. + # between two neighbouring elements is the same. Mathematically it means + # that candidates are a sequence a_i, where + # a_i = max_value^(i / (max_candidates - 1)), i in [0..(max_candidates - 1)] CONSTANT_RELATIVE_STEP = 2 @@ -204,22 +206,24 @@ def _find_candidates_constant_relative_step(histogram: histograms.Histogram, max_candidates: int) -> List[int]: """Implementation of CONSTANT_RELATIVE_STEP strategy.""" max_value = histogram.max_value - # relative step varies from 1% to 0.1% - # because generate_possible_contribution_bounds generate bounds by changing - # only up to first 3 digits, for example 100000, 101000, 102000... Then - # relative step between neighbouring elements - # varies (101000 - 100000) / 100000 = 0.01 and - # (1000000 - 999000) / 999000 ~= 0.001. 
- candidates = private_contribution_bounds.generate_possible_contribution_bounds( - max_value) - n_max_without_max_value = max_candidates - 1 - if len(candidates) > n_max_without_max_value: - delta = len(candidates) / n_max_without_max_value - candidates = [ - candidates[int(i * delta)] for i in range(n_max_without_max_value) - ] - if candidates[-1] != max_value: - candidates.append(max_value) + assert max_value >= 1, "max_value has to be >= 1." + max_candidates = min(max_candidates, max_value) + assert max_candidates > 0, "max_candidates have to be positive" + if max_candidates == 1: + return [1] + step = pow(max_value, 1 / (max_candidates - 1)) + candidates = [1] + accumulated = 1 + for i in range(1, max_candidates): + previous_candidate = candidates[-1] + if previous_candidate >= max_value: + break + accumulated *= step + next_candidate = max(previous_candidate + 1, math.ceil(accumulated)) + candidates.append(next_candidate) + # Floating-point arithmetic may be imprecise, but the last candidate must + # always be exactly max_value. + candidates[-1] = max_value return candidates diff --git a/analysis/tests/parameter_tuning_test.py b/analysis/tests/parameter_tuning_test.py index 97f53d80..a4475c1e 100644 --- a/analysis/tests/parameter_tuning_test.py +++ b/analysis/tests/parameter_tuning_test.py @@ -169,10 +169,42 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_ self.assertEqual([3, 4, 5, 6, 7, 3, 4, 5, 6, 7], candidates.max_contributions_per_partition) - def test_find_candidate_parameters_constant_relative_step_strategy_big_n_max( - self): + @parameterized.named_parameters( + dict(testcase_name='max_value=1, returns [1]', + max_value=1, + max_candidates=1000, + expected_candidates=[1]), + dict(testcase_name='max_candidates=1, returns [1]', + max_value=1000, + max_candidates=1, + expected_candidates=[1]), + dict(testcase_name='max_candidates=2, returns 1 and max_value', + max_value=1003, + max_candidates=2, + expected_candidates=[1, 
1003]), + dict(testcase_name='max_candidates is equal to max_value, returns ' + 'all possible candidates', + max_value=10, + max_candidates=10, + expected_candidates=list(range(1, 11))), + dict( + testcase_name='max_candidates is larger than max_value, returns all' + ' possible candidates up to max_value', + max_value=10, + max_candidates=100, + expected_candidates=list(range(1, 11))), + dict( + testcase_name='max_candidates is smaller than max_value, returns ' + 'logarithmic subset of values and last value is ' + 'max_value', + max_value=1000, + max_candidates=5, + # ceil(1000^(i / 4)), where i in [0, 1, 2, 3, 4] + expected_candidates=[1, 6, 32, 178, 1000])) + def test_find_candidate_parameters_constant_relative_ste_strategy( + self, max_value, max_candidates, expected_candidates): mock_l0_histogram = histograms.Histogram(None, None) - setattr(histograms.Histogram, 'max_value', 999999) + setattr(histograms.Histogram, 'max_value', max_value) mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None, None, None, None) @@ -185,65 +217,10 @@ def test_find_candidate_parameters_constant_relative_step_strategy_big_n_max( parameters_to_tune, pipeline_dp.Metrics.COUNT, ParametersSearchStrategy.CONSTANT_RELATIVE_STEP, - max_candidates=1000) - - expected_superset = set( - list(range(1, 1000, 1)) + list(range(1000, 10000, 10)) + - list(range(10000, 100000, 100)) + - list(range(100000, 1000000, 1000))).union({999999}) - self.assertTrue( - set(candidates.max_partitions_contributed).issubset( - expected_superset)) - self.assertLen(set(candidates.max_partitions_contributed), - len(candidates.max_partitions_contributed)) - self.assertLen(candidates.max_partitions_contributed, 1000) - self.assertEqual(sorted(candidates.max_partitions_contributed), - candidates.max_partitions_contributed) - - def test_find_candidate_parameters_constant_relative_step_strategy_small_n_max( - self): - mock_linf_histogram = histograms.Histogram(None, None) - setattr(histograms.Histogram, 
'max_value', 999999) - - mock_histograms = histograms.DatasetHistograms(None, None, - mock_linf_histogram, - None, None) - parameters_to_tune = parameter_tuning.ParametersToTune( - max_partitions_contributed=False, - max_contributions_per_partition=True) - - candidates = parameter_tuning._find_candidate_parameters( - mock_histograms, - parameters_to_tune, - pipeline_dp.Metrics.COUNT, - ParametersSearchStrategy.CONSTANT_RELATIVE_STEP, - max_candidates=10) - - self.assertEqual( - [1, 412, 823, 3340, 7450, 25600, 66700, 178000, 589000, 999999], - candidates.max_contributions_per_partition) + max_candidates=max_candidates) - def test_find_candidate_parameters_constant_relative_step_strategy_number_of_candidates_returned_is_less_than_maximum_number_of_candidates( - self): - mock_linf_histogram = histograms.Histogram(None, None) - setattr(histograms.Histogram, 'max_value', 50) - - mock_histograms = histograms.DatasetHistograms(None, None, - mock_linf_histogram, - None, None) - parameters_to_tune = parameter_tuning.ParametersToTune( - max_partitions_contributed=False, - max_contributions_per_partition=True) - - candidates = parameter_tuning._find_candidate_parameters( - mock_histograms, - parameters_to_tune, - pipeline_dp.Metrics.COUNT, - ParametersSearchStrategy.CONSTANT_RELATIVE_STEP, - max_candidates=100) - - self.assertEqual(list(range(1, 51)), - candidates.max_contributions_per_partition) + self.assertEqual(expected_candidates, + candidates.max_partitions_contributed) def test_tune_count(self): # Arrange.