Commit b5a41d9

Ignore execution_time/score regressions when binaries have same hash (#140)
We can remove a lot of the noise from execution time results by ignoring changes when the binary hashes are the same. This is implemented by adding an ignore_same_hash column to the TestSuiteSampleFields table, similar to bigger_is_better, and setting it to true for execution_time and score. We still want to flag regressions on identical binaries for metrics such as compile time, so the flag stays off for those. This was my first time poking around the migrations infrastructure; migrations are applied automatically whenever the server is launched, so there shouldn't be any need to run any scripts manually.
1 parent 1f1f380 commit b5a41d9
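
With ignore_same_hash set on a metric, a delta between two runs built from the same binary is reported as unchanged, while the same delta across different binaries is still flagged. A minimal sketch of that behavior using the ComparisonResult constructor as extended in this commit (the sample values and the 'abc'/'123' hashes are made up; the import mirrors the updated test file below):

from lnt.server.reporting.analysis import (ComparisonResult, REGRESSED,
                                           UNCHANGED_PASS)

# An apparent slowdown from 5s to 10s, but both runs carry the same binary
# hash, so the change is suppressed for a hash-sensitive metric.
same_binary = ComparisonResult(min, False, False, [10.], [5.],
                               'abc', 'abc', ignore_same_hash=True)
assert same_binary.get_value_status() == UNCHANGED_PASS

# The identical slowdown across two different binaries is still a regression.
new_binary = ComparisonResult(min, False, False, [10.], [5.],
                              'abc', '123', ignore_same_hash=True)
assert new_binary.get_value_status() == REGRESSED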

File tree: 6 files changed (+81, -8 lines)

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+"""Adds an ignore_same_hash column to the sample fields table and sets it to
+true for execution_time.
+
+"""
+
+from sqlalchemy import Column, Integer, update
+
+from lnt.server.db.migrations.util import introspect_table
+from lnt.server.db.util import add_column
+
+
+def upgrade(engine):
+    ignore_same_hash = Column("ignore_same_hash", Integer, default=0)
+    add_column(engine, "TestSuiteSampleFields", ignore_same_hash)
+
+    test_suite_sample_fields = introspect_table(engine, "TestSuiteSampleFields")
+    set_init_value = update(test_suite_sample_fields).values(ignore_same_hash=0)
+    set_exec_time = (
+        update(test_suite_sample_fields)
+        .where(
+            (test_suite_sample_fields.c.Name == "execution_time") |
+            (test_suite_sample_fields.c.Name == "score")
+        )
+        .values(ignore_same_hash=1)
+    )
+
+    with engine.begin() as trans:
+        trans.execute(set_init_value)
+        trans.execute(set_exec_time)
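
Migrations like this one are applied automatically the next time the LNT server starts, as the commit message notes. Purely as an illustration, upgrade() could also be driven by hand against an existing instance database; a rough sketch, where both the module name upgrade_X_to_Y and the database path are placeholders rather than values taken from this commit:

from sqlalchemy import create_engine

# Placeholder module name: the migration script's real file name is not shown
# in this diff; LNT migrations typically follow an upgrade_<from>_to_<to> naming.
from lnt.server.db.migrations import upgrade_X_to_Y as migration

# Placeholder path to an LNT instance database.
engine = create_engine("sqlite:////path/to/lnt/instance/data/lnt.db")
migration.upgrade(engine)  # adds the column and enables it for execution_time/score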

lnt/server/db/testsuite.py

Lines changed: 11 additions & 1 deletion
@@ -173,6 +173,7 @@ def from_json(data):
         for index, metric_desc in enumerate(data['metrics']):
             name = metric_desc['name']
             bigger_is_better = metric_desc.get('bigger_is_better', False)
+            ignore_same_hash = metric_desc.get('ignore_same_hash', False)
             metric_type_name = metric_desc.get('type', 'Real')
             display_name = metric_desc.get('display_name')
             unit = metric_desc.get('unit')
@@ -182,8 +183,10 @@ def from_json(data):
                                  metric_type_name)
             metric_type = SampleType(metric_type_name)
             bigger_is_better_int = 1 if bigger_is_better else 0
+            ignore_same_hash_int = 1 if ignore_same_hash else 0
             field = SampleField(name, metric_type, index, status_field=None,
                                 bigger_is_better=bigger_is_better_int,
+                                ignore_same_hash=ignore_same_hash_int,
                                 display_name=display_name, unit=unit,
                                 unit_abbrev=unit_abbrev)
             sample_fields.append(field)
@@ -196,6 +199,7 @@ def __json__(self):
         for sample_field in self.sample_fields:
             metric = {
                 'bigger_is_better': (sample_field.bigger_is_better != 0),
+                'ignore_same_hash': (sample_field.ignore_same_hash != 0),
                 'display_name': sample_field.display_name,
                 'name': sample_field.name,
                 'type': sample_field.type.name,
@@ -340,12 +344,18 @@ class SampleField(FieldMixin, Base):
     # This assumption can be inverted by setting this column to nonzero.
     bigger_is_better = Column("bigger_is_better", Integer)
 
+    # Some fields like execution_time should ignore changes if the binary hash
+    # is the same.
+    ignore_same_hash = Column("ignore_same_hash", Integer, default=0)
+
     def __init__(self, name, type, schema_index, status_field=None, bigger_is_better=0,
+                 ignore_same_hash=0,
                  display_name=None, unit=None, unit_abbrev=None):
         self.name = name
         self.type = type
         self.status_field = status_field
         self.bigger_is_better = bigger_is_better
+        self.ignore_same_hash = ignore_same_hash
         self.display_name = name if display_name is None else display_name
         self.unit = unit
         self.unit_abbrev = unit_abbrev
@@ -367,7 +377,7 @@ def __repr__(self):
 
     def __copy__(self):
         return SampleField(self.name, self.type, self.schema_index, self.status_field,
-                           self.bigger_is_better, self.display_name, self.unit,
+                           self.bigger_is_better, self.ignore_same_hash, self.display_name, self.unit,
                            self.unit_abbrev)
 
     def copy_info(self, other):
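
The flag follows the same path through the schema code as bigger_is_better: from_json() reads an optional boolean from each metric description and stores it as an integer column, and __json__() exposes it back as a boolean. A small standalone sketch of that conversion (the metric dict is illustrative, not taken from a real schema file):

# A metric entry such as might appear under 'metrics' in a schema description.
metric_desc = {'name': 'execution_time', 'type': 'Real', 'ignore_same_hash': True}

# Missing keys default to False, matching from_json() above; the value is
# stored as an Integer column, like bigger_is_better.
ignore_same_hash = metric_desc.get('ignore_same_hash', False)
ignore_same_hash_int = 1 if ignore_same_hash else 0

# __json__() converts the stored integer back to a boolean.
assert (ignore_same_hash_int != 0) is True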

lnt/server/reporting/analysis.py

Lines changed: 15 additions & 7 deletions
@@ -54,7 +54,8 @@ class ComparisonResult:
     def __init__(self, aggregation_fn,
                  cur_failed, prev_failed, samples, prev_samples,
                  cur_hash, prev_hash, cur_profile=None, prev_profile=None,
-                 confidence_lv=0.05, bigger_is_better=False):
+                 confidence_lv=0.05, bigger_is_better=False,
+                 ignore_same_hash=False):
         self.aggregation_fn = aggregation_fn
 
         # Special case: if we're using the minimum to aggregate, swap it for
@@ -103,6 +104,7 @@ def __init__(self, aggregation_fn,
 
         self.confidence_lv = confidence_lv
         self.bigger_is_better = bigger_is_better
+        self.ignore_same_hash = ignore_same_hash
 
     def __repr__(self):
         """Print this ComparisonResult's constructor.
@@ -118,7 +120,8 @@ def __repr__(self):
                 self.samples,
                 self.prev_samples,
                 self.confidence_lv,
-                bool(self.bigger_is_better))
+                bool(self.bigger_is_better),
+                bool(self.ignore_same_hash))
 
     def __json__(self):
         simple_dict = self.__dict__
@@ -155,9 +158,6 @@ def get_test_status(self):
         else:
             return UNCHANGED_PASS
 
-    # FIXME: take into account hash of binary - if available. If the hash is
-    # the same, the binary is the same and therefore the difference cannot be
-    # significant - for execution time. It can be significant for compile time.
     def get_value_status(self, confidence_interval=2.576,
                          value_precision=MIN_VALUE_PRECISION,
                          ignore_small=True):
@@ -176,6 +176,12 @@ def get_value_status(self, confidence_interval=2.576,
         elif self.prev_failed:
             return UNCHANGED_PASS
 
+        # Ignore changes if the hash of the binary is the same and the field is
+        # sensitive to the hash, e.g. execution time.
+        if self.ignore_same_hash:
+            if self.cur_hash and self.prev_hash and self.cur_hash == self.prev_hash:
+                return UNCHANGED_PASS
+
         # Always ignore percentage changes below MIN_PERCENTAGE_CHANGE %, for now, we just don't
         # have enough time to investigate that level of stuff.
         if ignore_small and abs(self.pct_delta) < MIN_PERCENTAGE_CHANGE:
@@ -355,7 +361,8 @@ def get_comparison_result(self, runs, compare_runs, test_id, field,
                              prev_values, cur_hash, prev_hash,
                              cur_profile, prev_profile,
                              self.confidence_lv,
-                             bigger_is_better=field.bigger_is_better)
+                             bigger_is_better=field.bigger_is_better,
+                             ignore_same_hash=field.ignore_same_hash)
         return r
 
     def get_geomean_comparison_result(self, run, compare_to, field, tests):
@@ -385,7 +392,8 @@ def get_geomean_comparison_result(self, run, compare_to, field, tests):
                                 cur_hash=cur_hash,
                                 prev_hash=prev_hash,
                                 confidence_lv=0,
-                                bigger_is_better=field.bigger_is_better)
+                                bigger_is_better=field.bigger_is_better,
+                                ignore_same_hash=field.ignore_same_hash)
 
     def _load_samples_for_runs(self, session, run_ids, only_tests):
         # Find the set of new runs to load.

schemas/nts.yaml

Lines changed: 2 additions & 0 deletions
@@ -16,12 +16,14 @@ metrics:
     display_name: Execution Time
     unit: seconds
     unit_abbrev: s
+    ignore_same_hash: true
   - name: execution_status
     type: Status
   - name: score
     type: Real
     bigger_is_better: true
     display_name: Score
+    ignore_same_hash: true
   - name: mem_bytes
     type: Real
     display_name: Memory Usage

tests/server/reporting/analysis.py

Lines changed: 22 additions & 0 deletions
@@ -319,6 +319,28 @@ def test_handle_zero_sample(self):
                                        None, None)
         self.assertEqual(zeroSample.get_value_status(), UNCHANGED_PASS)
 
+    def test_ignore_same_hash(self):
+        """Test ignore_same_hash ignores regressions with the same hash."""
+        same_hash = ComparisonResult(min, False, False, [10.], [5.],
+                                     'abc', 'abc', ignore_same_hash=True)
+        self.assertEqual(same_hash.get_value_status(), UNCHANGED_PASS)
+        self.assertFalse(same_hash.is_result_interesting())
+
+        diff_hash = ComparisonResult(min, False, False, [10.], [5.],
+                                     'abc', '123', ignore_same_hash=True)
+        self.assertEqual(diff_hash.get_value_status(), REGRESSED)
+        self.assertTrue(diff_hash.is_result_interesting())
+
+        no_hash = ComparisonResult(min, False, False, [10.], [5.], None,
+                                   '123', ignore_same_hash=True)
+        self.assertEqual(no_hash.get_value_status(), REGRESSED)
+        self.assertTrue(no_hash.is_result_interesting())
+
+        disabled = ComparisonResult(min, False, False, [10.], [5.],
+                                    'abc', 'abc', ignore_same_hash=False)
+        self.assertEqual(disabled.get_value_status(), REGRESSED)
+        self.assertTrue(disabled.is_result_interesting())
+
 
 class AbsMinTester(unittest.TestCase):
 
tests/server/ui/test_api.py

Lines changed: 2 additions & 0 deletions
@@ -290,6 +290,8 @@ def test_schema(self):
                 m['display_name'] = m['name']
             if 'bigger_is_better' not in m:
                 m['bigger_is_better'] = False
+            if 'ignore_same_hash' not in m:
+                m['ignore_same_hash'] = False
         yaml_schema['metrics'].sort(key=lambda x: x['name'])
         yaml_schema['run_fields'].sort(key=lambda x: x['name'])
         yaml_schema['machine_fields'].sort(key=lambda x: x['name'])
