From dada1fed8469adc21a55a9dc30ee6c37aaf3ab3c Mon Sep 17 00:00:00 2001
From: Mikhail Pravilov <mpravilov@google.com>
Date: Thu, 29 Jun 2023 14:21:11 +0200
Subject: [PATCH] Implement pure logarithmic grid for candidates search. (#461)

---
 analysis/parameter_tuning.py            | 38 +++++-----
 analysis/tests/parameter_tuning_test.py | 99 ++++++++++---------------
 2 files changed, 59 insertions(+), 78 deletions(-)

diff --git a/analysis/parameter_tuning.py b/analysis/parameter_tuning.py
index 64505e02..a72ed190 100644
--- a/analysis/parameter_tuning.py
+++ b/analysis/parameter_tuning.py
@@ -41,7 +41,9 @@ class ParametersSearchStrategy(Enum):
     # Picks up candidates that correspond tp a predefined list of quantiles.
     QUANTILES = 1
     # Candidates are a sequence starting from 1 where relative difference
-    # between two neighbouring elements is (almost) the same.
+    # between two neighbouring elements is the same. Mathematically it means
+    # that candidates are a sequence a_i (each rounded up to an integer), where
+    # a_i = max_value^(i / (max_candidates - 1)), i in [0..(max_candidates - 1)]
     CONSTANT_RELATIVE_STEP = 2
 
 
@@ -204,22 +206,24 @@ def _find_candidates_constant_relative_step(histogram: histograms.Histogram,
                                             max_candidates: int) -> List[int]:
     """Implementation of CONSTANT_RELATIVE_STEP strategy."""
     max_value = histogram.max_value
-    # relative step varies from 1% to 0.1%
-    # because generate_possible_contribution_bounds generate bounds by changing
-    # only up to first 3 digits, for example 100000, 101000, 102000... Then
-    # relative step between neighbouring elements
-    # varies (101000 - 100000) / 100000 = 0.01 and
-    # (1000000 - 999000) / 999000 ~= 0.001.
-    candidates = private_contribution_bounds.generate_possible_contribution_bounds(
-        max_value)
-    n_max_without_max_value = max_candidates - 1
-    if len(candidates) > n_max_without_max_value:
-        delta = len(candidates) / n_max_without_max_value
-        candidates = [
-            candidates[int(i * delta)] for i in range(n_max_without_max_value)
-        ]
-    if candidates[-1] != max_value:
-        candidates.append(max_value)
+    assert max_value >= 1, "max_value has to be >= 1."
+    max_candidates = min(max_candidates, max_value)
+    assert max_candidates > 0, "max_candidates have to be positive"
+    if max_candidates == 1:
+        return [1]
+    step = pow(max_value, 1 / (max_candidates - 1))
+    candidates = [1]
+    accumulated = 1
+    for i in range(1, max_candidates):
+        previous_candidate = candidates[-1]
+        if previous_candidate >= max_value:
+            break
+        accumulated *= step
+        next_candidate = max(previous_candidate + 1, math.ceil(accumulated))
+        candidates.append(next_candidate)
+    # Float calculations might not be precise enough, but the last candidate
+    # always has to be max_value.
+    candidates[-1] = max_value
     return candidates
 
 
diff --git a/analysis/tests/parameter_tuning_test.py b/analysis/tests/parameter_tuning_test.py
index 97f53d80..a4475c1e 100644
--- a/analysis/tests/parameter_tuning_test.py
+++ b/analysis/tests/parameter_tuning_test.py
@@ -169,10 +169,42 @@ def test_find_candidate_parameters_more_candidates_for_l_inf_when_not_so_many_l_
         self.assertEqual([3, 4, 5, 6, 7, 3, 4, 5, 6, 7],
                          candidates.max_contributions_per_partition)
 
-    def test_find_candidate_parameters_constant_relative_step_strategy_big_n_max(
-            self):
+    @parameterized.named_parameters(
+        dict(testcase_name='max_value=1, returns [1]',
+             max_value=1,
+             max_candidates=1000,
+             expected_candidates=[1]),
+        dict(testcase_name='max_candidates=1, returns [1]',
+             max_value=1000,
+             max_candidates=1,
+             expected_candidates=[1]),
+        dict(testcase_name='max_candidates=2, returns 1 and max_value',
+             max_value=1003,
+             max_candidates=2,
+             expected_candidates=[1, 1003]),
+        dict(testcase_name='max_candidates is equal to max_value, returns '
+             'all possible candidates',
+             max_value=10,
+             max_candidates=10,
+             expected_candidates=list(range(1, 11))),
+        dict(
+            testcase_name='max_candidates is larger than max_value, returns all'
+            ' possible candidates up to max_value',
+            max_value=10,
+            max_candidates=100,
+            expected_candidates=list(range(1, 11))),
+        dict(
+            testcase_name='max_candidates is smaller than max_value, returns '
+            'logarithmic subset of values and last value is '
+            'max_value',
+            max_value=1000,
+            max_candidates=5,
+            # ceil(1000^(i / 4)), where i in [0, 1, 2, 3, 4]
+            expected_candidates=[1, 6, 32, 178, 1000]))
+    def test_find_candidate_parameters_constant_relative_ste_strategy(
+            self, max_value, max_candidates, expected_candidates):
         mock_l0_histogram = histograms.Histogram(None, None)
-        setattr(histograms.Histogram, 'max_value', 999999)
+        setattr(histograms.Histogram, 'max_value', max_value)
 
         mock_histograms = histograms.DatasetHistograms(mock_l0_histogram, None,
                                                        None, None, None)
@@ -185,65 +217,10 @@ def test_find_candidate_parameters_constant_relative_step_strategy_big_n_max(
             parameters_to_tune,
             pipeline_dp.Metrics.COUNT,
             ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
-            max_candidates=1000)
-
-        expected_superset = set(
-            list(range(1, 1000, 1)) + list(range(1000, 10000, 10)) +
-            list(range(10000, 100000, 100)) +
-            list(range(100000, 1000000, 1000))).union({999999})
-        self.assertTrue(
-            set(candidates.max_partitions_contributed).issubset(
-                expected_superset))
-        self.assertLen(set(candidates.max_partitions_contributed),
-                       len(candidates.max_partitions_contributed))
-        self.assertLen(candidates.max_partitions_contributed, 1000)
-        self.assertEqual(sorted(candidates.max_partitions_contributed),
-                         candidates.max_partitions_contributed)
-
-    def test_find_candidate_parameters_constant_relative_step_strategy_small_n_max(
-            self):
-        mock_linf_histogram = histograms.Histogram(None, None)
-        setattr(histograms.Histogram, 'max_value', 999999)
-
-        mock_histograms = histograms.DatasetHistograms(None, None,
-                                                       mock_linf_histogram,
-                                                       None, None)
-        parameters_to_tune = parameter_tuning.ParametersToTune(
-            max_partitions_contributed=False,
-            max_contributions_per_partition=True)
-
-        candidates = parameter_tuning._find_candidate_parameters(
-            mock_histograms,
-            parameters_to_tune,
-            pipeline_dp.Metrics.COUNT,
-            ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
-            max_candidates=10)
-
-        self.assertEqual(
-            [1, 412, 823, 3340, 7450, 25600, 66700, 178000, 589000, 999999],
-            candidates.max_contributions_per_partition)
+            max_candidates=max_candidates)
 
-    def test_find_candidate_parameters_constant_relative_step_strategy_number_of_candidates_returned_is_less_than_maximum_number_of_candidates(
-            self):
-        mock_linf_histogram = histograms.Histogram(None, None)
-        setattr(histograms.Histogram, 'max_value', 50)
-
-        mock_histograms = histograms.DatasetHistograms(None, None,
-                                                       mock_linf_histogram,
-                                                       None, None)
-        parameters_to_tune = parameter_tuning.ParametersToTune(
-            max_partitions_contributed=False,
-            max_contributions_per_partition=True)
-
-        candidates = parameter_tuning._find_candidate_parameters(
-            mock_histograms,
-            parameters_to_tune,
-            pipeline_dp.Metrics.COUNT,
-            ParametersSearchStrategy.CONSTANT_RELATIVE_STEP,
-            max_candidates=100)
-
-        self.assertEqual(list(range(1, 51)),
-                         candidates.max_contributions_per_partition)
+        self.assertEqual(expected_candidates,
+                         candidates.max_partitions_contributed)
 
     def test_tune_count(self):
         # Arrange.