
Commit bde69c0
fixes
1 parent a7cc678
File tree: 3 files changed (+13, -81 lines)

analysis/parameter_tuning.py

Lines changed: 13 additions & 8 deletions
@@ -162,6 +162,8 @@ def _find_candidate_parameters(
     if tune_count_linf:
         linf_count_bounds = _find_candidates_constant_relative_step(
             hist.linf_contributions_histogram, max_linf_candidates)
+    elif pipeline_dp.Metrics.COUNT in metrics:
+        linf_count_bounds = [aggregate_params.max_contributions_per_partition]
 
     linf_sum_bounds = None
     if tune_sum_linf:
@@ -331,7 +333,8 @@ def tune(col,
                                 pipeline_dp.PreAggregateExtractors],
          public_partitions=None,
          strategy_selector_factory: Optional[
-             dp_strategy_selector.DPStrategySelectorFactory] = None):
+             dp_strategy_selector.DPStrategySelectorFactory] = None,
+         candidates: Optional[analysis.MultiParameterConfiguration] = None):
     """Tunes parameters.
 
     It works in the following way:
@@ -371,13 +374,14 @@ def tune(col,
         strategy_selector_factory = dp_strategy_selector.DPStrategySelectorFactory(
         )
 
-    candidates: analysis.MultiParameterConfiguration = (
-        _find_candidate_parameters(
-            hist=contribution_histograms,
-            parameters_to_tune=options.parameters_to_tune,
-            aggregate_params=options.aggregate_params,
-            max_candidates=options.number_of_parameter_candidates,
-        ))
+    if candidates is None:
+        candidates: analysis.MultiParameterConfiguration = (
+            _find_candidate_parameters(
+                hist=contribution_histograms,
+                parameters_to_tune=options.parameters_to_tune,
+                aggregate_params=options.aggregate_params,
+                max_candidates=options.number_of_parameter_candidates,
+            ))
 
     # Add DP strategy (noise_kind, partition_selection_strategy) to multi
     # parameter configuration.
@@ -429,6 +433,7 @@ def _convert_utility_analysis_to_tune_result(
     # TODO(dvadym): implement relative error.
     # TODO(dvadym): take into consideration partition selection from private
     # partition selection.
+    assert tune_options.function_to_minimize == MinimizingFunction.ABSOLUTE_ERROR
 
     # Sort utility reports by configuration index.
     sorted_utility_reports = sorted(utility_reports,
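
Note on the tune() change above: callers may now supply their own candidate grid, and tune() derives one from the contribution histograms only when candidates is None (the default, which preserves the previous behavior). Below is a minimal caller-side sketch; the MultiParameterConfiguration field names and the argument order ahead of data_extractors are assumptions inferred from the surrounding code, not part of this commit.

    # Sketch: passing an explicit candidate grid to tune().
    # col, backend, contribution_histograms, tune_options and data_extractors
    # are assumed to be prepared exactly as for any other tune() call.
    explicit_candidates = analysis.MultiParameterConfiguration(
        max_partitions_contributed=[1, 2, 4],        # assumed field name
        max_contributions_per_partition=[1, 5, 10],  # assumed field name
    )

    result = parameter_tuning.tune(col,
                                   backend,
                                   contribution_histograms,
                                   tune_options,
                                   data_extractors,
                                   candidates=explicit_candidates)
    # With candidates=None, tune() falls back to _find_candidate_parameters()
    # as before this commit.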

tests/dp_engine_test.py

Lines changed: 0 additions & 46 deletions
@@ -288,38 +288,6 @@ def test_calculate_private_contribution_works_on_beam(self):
             max_partitions_contributed=2)
         ]))
 
-    @unittest.skipIf(
-        sys.version_info.minor <= 7 and sys.version_info.major == 3,
-        "There are some problems with PySpark setup on older python")
-    def test_calculate_private_contribution_does_not_work_on_spark_due_to_unsupported_operations(
-            self):
-        # Arrange
-        import pyspark
-        engine = pipeline_dp.DPEngine(budget_accountant=None,
-                                      backend=pipeline_dp.SparkRDDBackend(
-                                          pyspark.SparkContext.getOrCreate(
-                                              pyspark.SparkConf())))
-        params = pipeline_dp.CalculatePrivateContributionBoundsParams(
-            aggregation_eps=0.9,
-            aggregation_delta=0.001,
-            calculation_eps=0.1,
-            aggregation_noise_kind=pipeline_dp.NoiseKind.LAPLACE,
-            max_partitions_contributed_upper_bound=2)
-        # user 0 contributes only 1 partitions, others contribute to both
-        data = [("pk0", 0)]
-        for i in range(10000):
-            data += [("pk0", i + 1), ("pk1", i + 1)]
-        data_extractors = pipeline_dp.DataExtractors(
-            partition_extractor=lambda x: x[0],
-            privacy_id_extractor=lambda x: x[1],
-            value_extractor=lambda _: 1,
-        )
-        partitions = ["pk0", "pk1"]
-
-        with self.assertRaises(NotImplementedError):
-            engine.calculate_private_contribution_bounds(
-                data, params, data_extractors, partitions)
-
     def _create_params_default(
             self) -> Tuple[pipeline_dp.AggregateParams, list]:
         """Returns default params and default public partitions."""
@@ -1216,20 +1184,6 @@ def test_run_e2e_partition_selection_local(self):
 
         self.assertLen(list(output), 5)
 
-    @unittest.skip("There are some problems with serialization in this test. "
-                   "Tests in private_spark_test.py work normaly so probably it"
-                   " is because of some missing setup.")
-    def test_run_e2e_partition_selection_spark(self):
-        import pyspark
-        conf = pyspark.SparkConf()
-        sc = pyspark.SparkContext.getOrCreate(conf=conf)
-        input = sc.parallelize(list(range(10)))
-
-        output = self.run_e2e_private_partition_selection_large_budget(
-            input, pipeline_dp.SparkRDDBackend(sc))
-
-        self.assertLen(collect_to_container(), 5)
-
     def test_run_e2e_partition_selection_beam(self):
         with test_pipeline.TestPipeline() as p:
             input = p | "Create input" >> beam.Create(list(range(10)))

tests/pipeline_functions_test.py

Lines changed: 0 additions & 27 deletions
@@ -120,33 +120,6 @@ def test_min_max_per_key(self, col, expected_min_max):
 
         beam_util.assert_that(result, beam_util.equal_to(expected_min_max))
 
-
-@unittest.skipIf(sys.version_info.minor <= 7 and sys.version_info.major == 3,
-                 "There are some problems with PySpark setup on older python.")
-class SparkRDDBackendTest(parameterized.TestCase):
-
-    @classmethod
-    def setUpClass(cls):
-        import pyspark
-        conf = pyspark.SparkConf()
-        cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)
-        cls.backend = pipeline_dp.SparkRDDBackend(cls.sc)
-
-    def test_key_by_extracts_keys_and_keeps_values_untouched(self):
-        col = self.sc.parallelize(["key1_value1", "key1_value2", "key2_value1"])
-
-        def underscore_separated_key_extractor(el):
-            return el.split("_")[0]
-
-        result = composite_funcs.key_by(self.backend,
-                                        col,
-                                        underscore_separated_key_extractor,
-                                        stage_name="Key by").collect()
-
-        self.assertSetEqual(
-            {("key1", "key1_value1"), ("key1", "key1_value2"),
-             ("key2", "key2_value1")}, set(result))
-
     def test_size_accounts_for_duplicates(self):
         col = self.sc.parallelize([3, 2, 1, 1])
