results-analysis/interop-2022/main.js at c96888dcfa6ee302d78c0897af96a738d8855175 · web-platform-tests/results-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
/* eslint-disable max-len */

'use strict';

/**
 * Implements test results scoring for Interop 2022 as described in the RFC:
 * https://github.com/web-platform-tests/rfcs/blob/master/rfcs/interop_2022.md#metrics
 *
 * Note that the scaling to 90% happens in the https://wpt.fyi/interop-2022 frontend.
 */

const fetch = require('node-fetch');
const flags = require('flags');
const fs = require('fs');
const Git = require('nodegit');
const lib = require('../lib');
const moment = require('moment');
const path = require('path');

const {advanceDateToSkipBadDataIfNecessary} = require('../bad-ranges');

// Command-line flags; the parsed values are read back with flags.get() in
// main().
//   products:     list of products to fetch and score runs for.
//   from / to:    date range to process; 'to' defaults to the date on which
//                 the script is started.
//   experimental: score experimental runs instead of stable runs.
flags.defineStringList('products', ['chrome', 'firefox', 'safari', 'edge'],
    'Products to include (comma-separated)');
flags.defineString('from', '2022-01-01', 'Starting date (inclusive)');
flags.defineString('to', moment().format('YYYY-MM-DD'),
    'Ending date (exclusive)');
flags.defineBoolean('experimental', false,
    'Calculate metrics for experimental runs.');
flags.parse();

// Parent of the directory containing this script. The on-disk runs cache
// (cache/) and the results-analysis-cache.git repo are resolved relative to it.
const ROOT_DIR = path.join(__dirname, '..');

// Labels of the categories that are scored. main() looks up the tests carrying
// each label in the wpt.fyi metadata, scores each label separately, and emits
// one CSV column per product per label, in the order listed here.
const CATEGORIES = [
  'interop-2021-aspect-ratio',
  'interop-2021-flexbox',
  'interop-2021-grid',
  'interop-2021-position-sticky',
  'interop-2021-transforms',
  'interop-2022-cascade',
  'interop-2022-color',
  'interop-2022-contain',
  'interop-2022-dialog',
  'interop-2022-forms',
  'interop-2022-scrolling',
  'interop-2022-subgrid',
  'interop-2022-text',
  'interop-2022-viewport',
  'interop-2022-webcompat',
];

// Base wpt.fyi runs query (aligned runs, max-count=1). The label, product and
// from/to date parameters are appended by fetchAlignedRunsFromServer().
const RUNS_URI = 'https://wpt.fyi/api/runs?aligned=true&max-count=1';

// Tests that are known to have a non-OK harness status (ERROR, TIMEOUT, ...)
// in at least one browser. scoreRuns() throws for any test that has subtests
// and a non-OK harness status unless the test is listed here. Any non-OK
// harness status should be investigated before the test is added to this list,
// so that we don't score tests in the wrong way because of a test or
// infrastructure issue.
const KNOWN_TEST_STATUSES = new Set([
  // TIMEOUT in Safari due to https://webkit.org/b/212201
  '/css/css-grid/grid-definition/grid-limits-001.html',
  // TIMEOUT in Firefox and Safari, all subtests present
  '/css/css-scroll-snap/input/keyboard.html',
  // ERROR in Firefox, TIMEOUT in Safari, all subtests failing in Chrome
  '/css/css-scroll-snap/input/snap-area-overflow-boundary.html',
  // TIMEOUT in Chrome with TIMEOUT subtests
  '/dom/events/Event-dispatch-click.html',
  // ERROR in Safari but linked bug is fixed
  '/html/browsers/browsing-the-web/navigating-across-documents/replace-before-load/form-requestsubmit-during-load.html',
  '/html/browsers/browsing-the-web/navigating-across-documents/replace-before-load/form-requestsubmit-during-pageshow.html',
  // TIMEOUT in Safari, but just a single subtest
  '/html/semantics/forms/form-submission-0/form-double-submit-multiple-targets.html',
  // TIMEOUT in Firefox and Safari, but just a single subtest
  '/html/semantics/forms/form-submission-0/form-double-submit-to-different-origin-frame.html',
  // TIMEOUT in Safari but all passing subtests due to https://bugs.webkit.org/show_bug.cgi?id=235407
  '/html/semantics/forms/form-submission-target/rel-base-target.html',
  '/html/semantics/forms/form-submission-target/rel-button-target.html',
  '/html/semantics/forms/form-submission-target/rel-form-target.html',
  '/html/semantics/forms/form-submission-target/rel-input-target.html',
  // ERROR in Firefox 95 and Safari 15.2, since fixed
  '/html/semantics/interactive-elements/the-dialog-element/dialog-showModal.html',
  // ERROR in Chrome 96, since fixed
  '/html/semantics/interactive-elements/the-dialog-element/modal-dialog-ancestor-is-inert.html',
  // TIMEOUT in Safari, but all subtests present
  '/html/semantics/forms/textfieldselection/select-event.html',
  '/html/semantics/forms/textfieldselection/selection-start-end.html',
  '/html/semantics/forms/textfieldselection/textfieldselection-setRangeText.html',
  '/html/semantics/forms/textfieldselection/textfieldselection-setSelectionRange.html',
  // TIMEOUT in Firefox 98, since fixed
  '/html/semantics/forms/the-input-element/image-click-form-data.html',
  // TIMEOUT in Safari, but all subtests present
  '/html/semantics/forms/the-input-element/range-restore-oninput-onchange-event.html',
  // TIMEOUT in STP 137, since fixed
  '/html/semantics/interactive-elements/the-dialog-element/backdrop-receives-element-events.html',
]);

// Fetches aligned runs from the wpt.fyi server, one day at a time, between the
// |from| (inclusive) and |to| (exclusive) dates. If |experimental| is true
// fetch experimental runs, else stable runs.
//
// |from| and |to| are moment objects; neither is modified by this function.
// |products| is the list of product names to request runs for.
//
// The response for every day (including an empty one) is cached on disk under
// ROOT_DIR/cache, keyed by label, products and date, and is reused on later
// executions.
//
// Returns a Map of date ('YYYY-MM-DD') to the list of runs for that date. Dates
// without any runs are omitted.
//
// Throws if the server responds with JSON that is not an array, or if a
// non-empty set of runs does not contain exactly as many runs as there are
// products.
async function fetchAlignedRunsFromServer(products, from, to, experimental) {
  const label = experimental ? 'experimental' : 'stable';
  let params = `&label=master&label=${label}`;
  for (const product of products) {
    params += `&product=${product}`;
  }
  const runsUri = `${RUNS_URI}${params}`;

  console.log(`Fetching aligned runs from ${from.format('YYYY-MM-DD')} ` +
      `to ${to.format('YYYY-MM-DD')}`);

  // moment objects are mutable and add() works in place, so iterate over a
  // private copy rather than advancing the caller's |from|.
  let day = from.clone();

  let cachedCount = 0;
  const before = moment();
  const alignedRuns = new Map();
  while (day < to) {
    const formattedFrom = day.format('YYYY-MM-DD');
    day.add(1, 'days');
    const formattedTo = day.format('YYYY-MM-DD');

    // We advance the date (if necessary) before doing anything more, so that
    // code later in the loop body can just 'continue' without checking.
    day = advanceDateToSkipBadDataIfNecessary(day, experimental);

    // Attempt to read the runs from the cache.
    // TODO: Consider https://github.com/tidoust/fetch-filecache-for-crawling
    let runs;
    const cacheFile = path.join(ROOT_DIR,
        `cache/${label}-${products.join('-')}-runs-${formattedFrom}.json`);
    try {
      runs = JSON.parse(await fs.promises.readFile(cacheFile));
      if (runs.length) {
        cachedCount++;
      }
    } catch (e) {
      // Any failure to read or parse the cache file (most commonly: it does
      // not exist yet) falls back to asking the server.
      const url = `${runsUri}&from=${formattedFrom}&to=${formattedTo}`;
      const response = await fetch(url);
      runs = await response.json();
      // Refuse to cache anything that is not a list of runs; otherwise an
      // error payload would be stored and this date silently skipped forever.
      if (!Array.isArray(runs)) {
        throw new Error(
            `Unexpected response from ${url}: expected a JSON array of runs`);
      }
      // Many days do not have an aligned set of runs, but we always write to
      // the cache to speed up future executions of this code.
      await fs.promises.writeFile(cacheFile, JSON.stringify(runs));
    }

    if (!runs.length) {
      continue;
    }

    if (runs.length !== products.length) {
      throw new Error(
          `Fetched ${runs.length} runs, expected ${products.length}`);
    }

    alignedRuns.set(formattedFrom, runs);
  }
  const after = moment();
  console.log(`Fetched ${alignedRuns.size} sets of runs in ` +
      `${after - before} ms (${cachedCount} cached)`);

  return alignedRuns;
}

// Score a set of runs (independently) on a set of tests. The runs are presumed
// to be aligned in some way (i.e. they were all run at the same WPT SHA).
//
// |runs| is a list of run objects, each with its results tree loaded into
// |run.tree|; |allTestsSet| is a Set of test names to score.
//
// Returns an array of scores, which is the top-level score (integer 0-1000) for
// each corresponding input run.
//
// Throws if a test with subtests has a non-OK harness status and is not listed
// in KNOWN_TEST_STATUSES. The ids of the runs being scored are appended to the
// message of any error thrown while scoring.
//
// To get the top-level score for a run, each test in that run that is present
// in |allTestsSet| is examined. Each test is scored 0-1000 based on the
// fraction of its subtests that pass, with rounding down so that 1000 means
// all subtests pass. Tests without subtests (e.g. reftests) score either 0 or
// 1000. A test that reports an empty list of subtests scores 0. These test
// scores are then summed and divided by the size of |allTestsSet|, again
// rounding down.
//
// This methodology has several consequences:
//
//   1. Individual tests do have a heavier weight than subtests. This could be
//   gamed, by splitting passing tests into multiple files rather than using
//   subtests (or conversely by combining failing tests into subtests in a
//   single file).
//
//   2. If |allTestsSet| is constant across runs *through time*, older runs may
//   not have entries for tests that were only added recently and will be
//   penalized for that. This is deliberate - see the comment block later in
//   this function for why.
//
//   3. We could show (on wpt.fyi) scores at both the test and category level as
//   a percentage with one decimal point, and what a user would see would be the
//   same numbers that go into the total score, with no hidden rounding error.
//
//   4. Because we round down twice, the score for a category can end up lower
//   than if we used rational numbers.
function scoreRuns(runs, allTestsSet) {
  const scores = [];
  try {
    for (const run of runs) {
      // Sum of the integer 0-1000 scores for each test.
      let score = 0;

      lib.results.walkTests(run.tree, (dirPath, test, results) => {
        const testname = dirPath + '/' + test;
        if (!allTestsSet.has(testname)) {
          return;
        }

        // TODO: Validate the data by checking that all statuses are recognized.

        let subtestPasses = 0;
        let subtestTotal = 1;
        if ('subtests' in results) {
          if (results['status'] !== 'OK' && !KNOWN_TEST_STATUSES.has(testname)) {
            throw new Error(`Unexpected non-OK status for test: ${testname}`);
          }
          const subtests = results['subtests'];
          for (const subtest of subtests) {
            if (subtest['status'] === 'PASS') {
              subtestPasses += 1;
            }
          }
          // An empty subtest list would otherwise yield 0 / 0 = NaN below and
          // turn the score of the whole run into NaN; count it as one failing
          // subtest instead.
          subtestTotal = Math.max(subtests.length, 1);
        } else if (results['status'] === 'PASS') {
          subtestPasses = 1;
        }

        // A single test is scored 0-1000 based on how many of its subtests
        // pass, rounding down so that 1000 always means fully passing.
        score += Math.floor(1000 * subtestPasses / subtestTotal);
      });

      // We always normalize against the number of tests we are looking for,
      // rather than the total number of tests we found. The trade-off is all
      // about new tests being added to the set.
      //
      // If a large chunk of tests are introduced at date X, and they fail in
      // some browser, then runs after date X look worse if you're only
      // counting total tests found - even though the tests would have failed
      // before date X as well.
      //
      // Conversely, if a large chunk of tests are introduced at date X, and
      // they pass in some browser, then runs after date X would get an
      // artificial boost in pass-rate due to this - even if the tests would
      // have passed before date X as well.
      //
      // We consider the former case worse than the latter, so optimize for it
      // by always comparing against the full test list. This does mean that
      // when tests are added to the set, previously generated data is no
      // longer valid and this script should be re-run for all dates.
      scores.push(Math.floor(score / allTestsSet.size));
    }
  } catch (e) {
    // Add the run ids so that the failing data can be located.
    e.message += `\n\tRuns: ${runs.map(r => r.id)}`;
    throw e;
  }

  return scores;
}

// Scores every set of aligned runs in |alignedRuns| (a Map of date to runs)
// against the tests in |testsSet|, and logs how long the scoring took.
//
// Returns a Map of date to {versions, scores}, where |versions| holds the
// browser_version of each run and |scores| the matching scoreRuns() result.
// The whole Map is returned so that the caller can calculate the summary
// across multiple categories.
//
// |category|, |experimental| and |products| are accepted but not used here.
async function scoreCategory(category, experimental, products, alignedRuns,
    testsSet) {
  const startedAt = Date.now();

  const dateToScores = new Map(
      Array.from(alignedRuns.entries(), ([date, runs]) => [date, {
        versions: runs.map(({browser_version: version}) => version),
        scores: scoreRuns(runs, testsSet),
      }]));

  const elapsedMs = Date.now() - startedAt;
  console.log(`Done scoring (took ${elapsedMs} ms)`);

  return dateToScores;
}

// Script entry point. Fetches the aligned runs for the requested products and
// date range, loads their results from the local results-analysis-cache.git
// repo, scores every category in CATEGORIES and writes the combined scores to
// interop-2022-stable.csv or interop-2022-experimental.csv (path relative to
// the current working directory).
//
// Throws (rejects) if a fetched run is missing from the local repo, if the
// wpt.fyi metadata request fails, or if a category has no labeled tests.
async function main() {
  const products = flags.get('products');
  const repo = await Git.Repository.open(
      path.join(ROOT_DIR, 'results-analysis-cache.git'));

  // First, grab aligned runs from the server for the dates that we are
  // interested in.
  const from = moment(flags.get('from'));
  const to = moment(flags.get('to'));
  const experimental = flags.get('experimental');
  const alignedRuns = await fetchAlignedRunsFromServer(
      products, from, to, experimental);

  // Verify that we have data for the fetched runs in the results-analysis-cache
  // repo.
  console.log('Getting local set of run ids from repo');
  let before = Date.now();
  const localRunIds = await lib.results.getLocalRunIds(repo);
  let after = Date.now();
  console.log(`Found ${localRunIds.size} ids (took ${after - before} ms)`);

  // Report every missing run before failing, rather than stopping at the
  // first one.
  let hadErrors = false;
  for (const [date, runs] of alignedRuns.entries()) {
    for (const run of runs) {
      if (!localRunIds.has(run.id)) {
        // If you see this, you probably need to run git-write.js or just update
        // your results-analysis-cache.git repo; see the README.md.
        console.error(`Run ${run.id} missing from local git repo (${date})`);
        hadErrors = true;
      }
    }
  }
  if (hadErrors) {
    throw new Error('Missing data for some runs (see errors logged above). ' +
        'Try running "git fetch --all --tags" in results-analysis-cache/');
  }

  // Load the test result trees into memory; creates a list of recursive tree
  // structures: tree = { trees: [...], tests: [...] }. Each 'tree' represents a
  // directory, each 'test' is the results from a given test file.
  console.log('Iterating over all runs, loading test results');
  before = Date.now();
  for (const runs of alignedRuns.values()) {
    for (const run of runs) {
      // Just in case someone ever adds a 'tree' field to the JSON.
      if (run.tree) {
        throw new Error('Run JSON contains "tree" field; code needs changed.');
      }
      run.tree = await lib.results.getGitTree(repo, run);
    }
  }
  after = Date.now();
  console.log(`Loading ${alignedRuns.size} sets of runs took ` +
      `${after - before} ms`);

  const dateToScoresMaps = new Map();

  // Map from labels to tests (includes)
  const labeledTests = new Map();
  const url = 'https://wpt.fyi/api/metadata?includeTestLevel=true&product=chrome';
  const response = await fetch(url);
  // Fail with a clear message on an HTTP error, instead of a JSON parse error
  // or a misleading "No tests labeled" error further down.
  if (!response.ok) {
    throw new Error(`Failed to fetch test metadata from ${url}: ` +
        `${response.status} ${response.statusText}`);
  }
  const metadata = await response.json();
  for (const [test, metadataList] of Object.entries(metadata)) {
    for (const {label} of metadataList) {
      if (label) {
        if (!labeledTests.has(label)) {
          labeledTests.set(label, new Set());
        }
        labeledTests.get(label).add(test);
      }
    }
  }

  for (const category of CATEGORIES) {
    console.log(`Scoring runs for ${category}`);
    const testsSet = labeledTests.get(category);
    if (!testsSet || !testsSet.size) {
      throw new Error(`No tests labeled for ${category}`);
    }
    const dateToScores = await scoreCategory(category, experimental, products,
        alignedRuns, testsSet);

    // Store the entire dateToScores for producing the unified CSV later.
    dateToScoresMaps.set(category, dateToScores);
  }

  // Header row: 'date', then for each product its version column followed by
  // one column per category.
  // TODO: Once the other score CSVs are no longer used, we can push
  // some of this logic into scoreCategory and simplify things.
  let unifiedCsv = 'date';
  for (const product of products) {
    const categoryLabels = CATEGORIES.map(c => `${product}-${c}`);
    unifiedCsv += `,${product}-version,${categoryLabels.join()}`;
  }
  unifiedCsv += '\n';

  // We know that all dateToScoresMaps have the same dates (as they come from
  // the same runs), so we can just iterate the keys from the first.
  for (const date of dateToScoresMaps.get(CATEGORIES[0]).keys()) {
    let csvLine = [date.slice(0, 10)];
    // This is essentially an inversion loop; we have the data mapped by
    // individual categories, but we need it mapped by product.
    for (let browserIdx = 0; browserIdx < products.length; browserIdx++) {
      let version;
      const productScores = [];
      for (const category of CATEGORIES) {
        const {versions, scores} = dateToScoresMaps.get(category).get(date);
        const score = scores[browserIdx];
        productScores.push(score);
        // The versions should all be the same, so we just grab the latest one.
        version = versions[browserIdx];
      }
      csvLine.push(version);
      csvLine = csvLine.concat(productScores);
    }
    unifiedCsv += `${csvLine.join()}\n`;
  }

  const csvFilename = experimental ?
      `interop-2022-experimental.csv` : `interop-2022-stable.csv`;
  await fs.promises.writeFile(csvFilename, unifiedCsv, 'utf-8');
  console.log(`Wrote scores to ${csvFilename}`);
}

// Run the script. Any failure is printed to stderr and the process exits
// with status 1.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});