perf: Port FrozenOrderedSet to rust by tobni · Pull Request #23200 · pantsbuild/pants

tobni · 2026-03-29T13:23:48Z

Followup to #22501. Same approach — FrozenOrderedSet is now a pyo3 #[pyclass] backed by Py<PyDict> with lazy hash via OnceLock. The end goal is porting more rule code to rust intrinsics.

"""Benchmark: Rust FrozenOrderedSet vs Python FrozenOrderedSet."""

import sys
import timeit
from collections.abc import Hashable, Iterable, Iterator
from typing import AbstractSet, Any, TypeVar

sys.path.insert(0, "src/python")

from pants.engine.internals.native_engine import FrozenOrderedSet as RustFrozenOrderedSet

T = TypeVar("T")


class PyFrozenOrderedSet(AbstractSet[T], Hashable):
    """The old pure-Python FrozenOrderedSet (pre-port).

    Kept in this script only as the baseline that the Rust-backed
    ``FrozenOrderedSet`` is timed against, so its code is intentionally left
    as-is. Items are stored as the keys of a ``dict``, which gives
    de-duplication, insertion-ordered iteration and hash-based membership
    tests. The hash is computed on first use and then cached.
    """

    def __init__(self, iterable=None) -> None:
        """Build the set from ``iterable``.

        A falsy argument (``None`` or an empty container) produces an empty
        set. Duplicates collapse onto their first occurrence because
        ``dict.fromkeys`` keeps the position of the first key it sees.
        """
        self._items = dict.fromkeys(iterable) if iterable else {}
        # Lazily computed by __hash__; None means "not computed yet".
        self._hash = None

    def __len__(self) -> int:
        """Return the number of distinct items."""
        return len(self._items)

    def __contains__(self, key) -> bool:
        """Return whether ``key`` is an item, via a dict key lookup."""
        return key in self._items

    def __iter__(self) -> Iterator:
        """Iterate over the items in insertion order."""
        return iter(self._items)

    def __reversed__(self):
        """Iterate over the items in reverse insertion order."""
        # Copies the keys into a tuple first and reverses that copy.
        return reversed(tuple(self._items.keys()))

    def __eq__(self, other):
        """Order-sensitive equality against another instance of this class.

        Returns ``NotImplemented`` when ``other`` is not an instance of
        ``self.__class__``, leaving the decision to Python's fallback
        comparison logic.
        """
        if not isinstance(other, self.__class__):
            return NotImplemented
        # Same length and pairwise-equal items at every position.
        return len(self._items) == len(other._items) and all(
            x == y for x, y in zip(self._items, other._items)
        )

    def __hash__(self) -> int:
        """Return the XOR of the item hashes, computing it once and caching it."""
        if self._hash is None:
            # XOR ignores item order. That stays consistent with the
            # order-sensitive __eq__: equal sets hold the same items and
            # therefore produce the same hash.
            self._hash = 0
            for item in self._items.keys():
                self._hash ^= hash(item)
        return self._hash

    def __repr__(self) -> str:
        """Return ``PyFrozenOrderedSet([...])`` listing the items in order."""
        return f"PyFrozenOrderedSet({list(self)!r})"

    def __bool__(self) -> bool:
        """Return ``True`` when the set has at least one item."""
        return bool(self._items)

    def union(self, other):
        """Return a new set: this set's items, then the new items of ``other``.

        Items of ``other`` that are already present are skipped; the rest are
        appended in the order ``other`` yields them.
        """
        return self.__class__(list(self) + [x for x in other if x not in self._items])

    def intersection(self, other):
        """Return a new set of this set's items that also occur in ``other``.

        The result keeps this set's order.
        """
        # Snapshot `other` into a builtin set for the membership tests below.
        s = set(other)
        return self.__class__(x for x in self if x in s)

    def difference(self, other):
        """Return a new set of this set's items that do not occur in ``other``.

        The result keeps this set's order.
        """
        # Snapshot `other` into a builtin set for the membership tests below.
        s = set(other)
        return self.__class__(x for x in self if x not in s)

    def issubset(self, other) -> bool:
        """Return whether every item of this set is contained in ``other``.

        ``other`` must support ``len()`` and ``in``.
        """
        # A set larger than `other` cannot be a subset of it.
        if len(self) > len(other):
            return False
        return all(item in other for item in self)


WARMUP = 1000  # untimed executions performed before every measurement


def measure(stmt, number, globs):
    """Return the mean duration of one execution of ``stmt`` in microseconds.

    Args:
        stmt: Source of the statement to benchmark.
        number: How many timed executions to average over.
        globs: Global namespace the statement is executed in.

    Returns:
        Total timed seconds divided by ``number``, scaled to microseconds.

    A single ``timeit.Timer`` is used for both passes, so the statement is
    compiled once and the warm-up runs the very same compiled loop that is
    timed afterwards. Two separate ``timeit.timeit`` calls would each compile
    their own loop, so the warm-up would only warm a throwaway copy.
    """
    timer = timeit.Timer(stmt, globals=globs)
    timer.timeit(number=WARMUP)  # result discarded: warm-up only
    elapsed_seconds = timer.timeit(number=number)
    return elapsed_seconds / number * 1_000_000


# Namespace factories for the benchmark table below. Each one receives the
# per-dataset context as keyword arguments (unused entries are swallowed by
# ``**_``) and returns ``[python_globals, rust_globals]``: the two namespaces
# in which the benchmark statement is timed.


def _construction_globs(data, py, rs, **_):
    """Bind each set class as ``Cls`` together with the raw input ``data``."""
    return [
        {"Cls": PyFrozenOrderedSet, "data": data},
        {"Cls": RustFrozenOrderedSet, "data": data},
    ]


def _set_only_globs(py, rs, **_):
    """Bind the set under test as ``fd``."""
    return [{"fd": py}, {"fd": rs}]


def _lookup_hit_globs(py, rs, mid, **_):
    """Bind ``fd`` plus a key ``k`` taken from the dataset (``mid``)."""
    return [{"fd": py, "k": mid}, {"fd": rs, "k": mid}]


def _lookup_miss_globs(py, rs, **_):
    """Bind ``fd`` plus the probe key ``"MISSING"`` as ``k``."""
    return [{"fd": py, "k": "MISSING"}, {"fd": rs, "k": "MISSING"}]


def _equality_globs(py, rs, py2, rs2, **_):
    """Bind ``fd`` and a second set ``fd2`` of the same implementation."""
    return [{"fd": py, "fd2": py2}, {"fd": rs, "fd2": rs2}]


def _binary_op_globs(py, rs, py_other, rs_other, **_):
    """Bind ``fd`` and the right-hand operand ``other``."""
    return [{"fd": py, "other": py_other}, {"fd": rs, "other": rs_other}]


def _subset_globs(py, rs, py_small, rs_small, **_):
    """Bind the candidate subset ``small`` and the set ``fd`` it is tested against."""
    return [{"small": py_small, "fd": py}, {"small": rs_small, "fd": rs}]


def _dict_key_globs(py, rs, **_):
    """Bind ``fd`` and a one-entry dict ``d`` that uses ``fd`` as its key."""
    return [{"fd": py, "d": {py: 1}}, {"fd": rs, "d": {rs: 1}}]


# (display name, statement to time, namespace factory)
BENCHMARKS = [
    ("Construction", "Cls(data)", _construction_globs),
    ("hash()", "hash(fd)", _set_only_globs),
    ("__contains__", "k in fd", _lookup_hit_globs),
    ("__contains__ miss", "k in fd", _lookup_miss_globs),
    ("__eq__", "fd == fd2", _equality_globs),
    ("iteration", "list(fd)", _set_only_globs),
    ("union", "fd.union(other)", _binary_op_globs),
    ("intersection", "fd.intersection(other)", _binary_op_globs),
    ("difference", "fd.difference(other)", _binary_op_globs),
    ("issubset", "small.issubset(fd)", _subset_globs),
    ("dict key", "d[fd]", _dict_key_globs),
]

# Element lists for the three input sizes under test.
SMALL = list(range(5))
MEDIUM = list(range(20))
LARGE = list(range(200))

DATASETS = [("small (5)", SMALL), ("medium (20)", MEDIUM), ("large (200)", LARGE)]

# benchmark name -> dataset label -> (Python µs per call, Rust µs per call)
all_results: dict[str, dict[str, tuple[float, float]]] = {}

RULE = "=" * 60

for ds_name, data in DATASETS:
    print(f"\n{RULE}")
    print(f"  Dataset: {ds_name}")
    print(RULE)

    # Operands shared by every benchmark of this dataset. "other" overlaps the
    # main set on the first half of the data and adds as many foreign values.
    half = data[: len(data) // 2]
    other_items = half + list(range(1000, 1000 + len(half)))
    ctx = {
        "data": data,
        "py": PyFrozenOrderedSet(data),
        "rs": RustFrozenOrderedSet(data),
        "py2": PyFrozenOrderedSet(data),
        "rs2": RustFrozenOrderedSet(data),
        "py_other": PyFrozenOrderedSet(other_items),
        "rs_other": RustFrozenOrderedSet(other_items),
        "py_small": PyFrozenOrderedSet(data[:3]),
        "rs_small": RustFrozenOrderedSet(data[:3]),
        "mid": data[len(data) // 2],
    }

    # Warm up lazy hashes
    for key in ("py", "py2", "py_other", "py_small"):
        hash(ctx[key])

    # Fewer timed iterations for the large dataset.
    n = 50_000 if len(data) > 20 else 500_000

    for bench_name, stmt, make_globs in BENCHMARKS:
        py_globs, rs_globs = make_globs(**ctx)
        # The Python implementation is always measured before the Rust one.
        py_us = measure(stmt, n, py_globs)
        rs_us = measure(stmt, n, rs_globs)
        speedup = py_us / rs_us
        print(f"  {bench_name:.<20s} Python {py_us:8.3f} µs  Rust {rs_us:8.3f} µs  ({speedup:.1f}x)")
        if bench_name not in all_results:
            all_results[bench_name] = {}
        all_results[bench_name][ds_name] = (py_us, rs_us)

# Summary table: one row per benchmark, one speedup column per dataset.
ds_names = [label for label, _ in DATASETS]
header = "".join([f"  {'Operation':<20s}", *(f" | {name:>12s}" for name in ds_names)])
sep = f"  {'-'*20}" + f"-+-{'-'*12}" * len(ds_names)

print(f"\n{RULE}")
print("  Summary (Python / Rust speedup)")
print(RULE)
print(header)
print(sep)
for bench_name, _, _ in BENCHMARKS:
    cells = []
    for ds_name in ds_names:
        py_us, rs_us = all_results[bench_name][ds_name]
        cells.append(f" | {py_us / rs_us:11.1f}x")
    print(f"  {bench_name:<20s}" + "".join(cells))

  Operation            |    small (5) |  medium (20) |  large (200)
  ---------------------+--------------+--------------+-------------
  Construction         |         1.2x |         1.1x |         1.0x
  hash()               |         2.2x |         2.2x |         2.2x
  __contains__         |         1.8x |         1.7x |         1.7x
  __contains__ miss    |         1.8x |         1.8x |         1.8x
  __eq__               |         3.6x |         2.0x |         1.4x
  iteration            |         1.5x |         1.4x |         1.1x
  union                |         4.1x |         3.0x |         2.5x
  intersection         |         2.7x |         1.7x |         1.2x
  difference           |         3.0x |         1.8x |         1.3x
  issubset             |         7.1x |         6.8x |         7.0x
  dict key             |         2.1x |         2.0x |         2.0x

cburroughs · 2026-03-30T15:47:17Z

Really wish I knew rust better for all these cool performance cases. Cross referencing: #14719

tdyas · 2026-05-19T04:22:01Z

What about just porting FrozenOrderedSet usages to the Python standard frozenset?

tdyas · 2026-05-19T04:26:41Z

What about just porting FrozenOrderedSet usages to the Python standard frozenset?

I assume because frozenset is unordered, but is that a property we need for use cases where we are using a set type?

tobni · 2026-05-24T22:46:46Z

I assume because frozenset is unordered, but is that a property we need for use cases where we are using a set type?

I don't know. It is used a lot.

cburroughs

The __new__ --> __init__ thing needs a one-line mention in the release notes I think, on the off chance someone was using the class?

I ran it through some local benchmarks which suggest single digit percentage performance improvements. I think that matches your expectations? As idle curiosity, I wonder how much of the benefit comes from the lazy hash computation.

cburroughs · 2026-05-29T19:54:29Z

+            .map(|o| to_pyset(&o))
+            .collect::<PyResult<Vec<_>>>()?;
+        filter_keys(self, py, |key| {
+            Ok(sets.iter().all(|s| s.contains(key).unwrap_or(false)))


rust clarity: Is the unwrap_or here intentional? I'm not sure why intersection should handle this type of error differently than difference?

Not intentional. This would be set.__contains__ throwing an exception. Not sure that ever happens, but better to fix it and let the weird case be visible at least 🤷‍♂️

tobni · 2026-05-30T15:25:01Z

The __new__ --> __init__ thing needs a one-line mention in the release notes I think, on the off chance someone was using the class?

I ran it through some local benchmarks which suggest single digit percentage performance improvements. I think that matches your expectations? As idle curiosity, I wonder how much of the benefit comes from the lazy hash computation.

Aye, around 1%, it scales well with large repos with many deps. The python impl also enjoyed a lazy hash, so the port should be net-nothing if it is never accessed. It almost always is though...

I've updated release notes and fixed the correctness bug you caught

tobni force-pushed the add/port-frozen-ordered-set branch from 42a564e to ca376c9 Compare March 29, 2026 13:25

tobni added category:internal CI, fixes for not-yet-released features, etc. release-notes:not-required [CI] PR doesn't require mention in release notes labels Mar 29, 2026

tobni force-pushed the add/port-frozen-ordered-set branch 7 times, most recently from ee17c67 to 0bf4cba Compare March 29, 2026 18:45

tobni force-pushed the add/port-frozen-ordered-set branch from 0bf4cba to c52ba37 Compare April 1, 2026 08:20

cburroughs mentioned this pull request Apr 8, 2026

--changed-since prohibitivley slow in large python heavy repositories #23236

Open

cburroughs reviewed May 29, 2026

View reviewed changes

tobni force-pushed the add/port-frozen-ordered-set branch from c52ba37 to 5b7d8f5 Compare May 30, 2026 15:20

tobni added 3 commits May 30, 2026 17:22

perf: Port FrozenOrderedSet to rust

f0e0cc5

docs: Add plugin api note

64bfa96

fix: Surface set.__contains__ exceptions

5bdcfaf

tobni force-pushed the add/port-frozen-ordered-set branch from 5b7d8f5 to 5bdcfaf Compare May 30, 2026 15:23

chore: Address lint from rebase-upgrade

f0586d9

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

perf: Port FrozenOrderedSet to rust#23200

perf: Port FrozenOrderedSet to rust#23200
tobni wants to merge 4 commits into
pantsbuild:mainfrom
tobni:add/port-frozen-ordered-set

tobni commented Mar 29, 2026 •

edited

Loading

Uh oh!

cburroughs commented Mar 30, 2026

Uh oh!

tdyas commented May 19, 2026

Uh oh!

tdyas commented May 19, 2026

Uh oh!

tobni commented May 24, 2026 •

edited

Loading

Uh oh!

cburroughs left a comment

Uh oh!

cburroughs May 29, 2026

Uh oh!

tobni May 30, 2026

Uh oh!

tobni commented May 30, 2026 •

edited

Loading

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Uh oh!

Conversation

tobni commented Mar 29, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

cburroughs commented Mar 30, 2026

Uh oh!

tdyas commented May 19, 2026

Uh oh!

tdyas commented May 19, 2026

Uh oh!

tobni commented May 24, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

cburroughs left a comment

Choose a reason for hiding this comment

Uh oh!

cburroughs May 29, 2026

Choose a reason for hiding this comment

Uh oh!

tobni May 30, 2026

Choose a reason for hiding this comment

Uh oh!

tobni commented May 30, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

tobni commented Mar 29, 2026 •

edited

Loading

tobni commented May 24, 2026 •

edited

Loading

tobni commented May 30, 2026 •

edited

Loading