Skip to content

Commit fc19b66

Browse files
authored
perf: Reduce per-package hashing overhead and eliminate SCM subprocesses (#11942)
## Summary

Follow-up to #11938. Targets the per-package hashing hot path that dominates at scale, plus eliminates the last two git subprocesses from `--dry` runs.

### Small repo (~6 packages)

| | Mean | Range |
|---|---|---|
| **This PR** | 571.2ms ± 46.7ms | 515.6ms - 651.7ms |
| **main** | 587.4ms ± 45.1ms | 524.9ms - 676.3ms |
| | **1.03 ± 0.12x faster** | |

### Medium repo (~120 packages)

| | Mean | Range |
|---|---|---|
| **This PR** | 1.096s ± 0.095s | 1.015s - 1.280s |
| **main** | 1.119s ± 0.072s | 1.042s - 1.259s |
| | **1.02 ± 0.11x faster** | |

### Large repo (~1000 packages)

| | Mean | Range |
|---|---|---|
| **This PR** | 1.729s ± 0.151s | 1.548s - 1.969s |
| **main** | 1.833s ± 0.181s | 1.583s - 2.099s |
| | **1.06 ± 0.14x faster** | |

The small repo results best isolate the fixed-cost improvements (git2 for branch/SHA, reduced allocation overhead) since per-package work is minimal. At larger scales, the improvements are present but within noise because wall-clock time is already well-parallelized across rayon threads.

## Benchmarks

All benchmarks: `turbo run <task> --skip-infer --dry`, 5 warmup + 10 measured runs, release build.

## Changes

- **FileHashes: HashMap to sorted Vec** — `FileHashes` inner type changed from `HashMap` to pre-sorted `Vec`. Eliminates HashMap construction (hashing, bucket allocation, rehashing) in the per-package hashing pipeline and removes redundant re-sorting in Cap'n Proto serialization. The sort happens once at the construction boundary; downstream consumers (`expanded_inputs`, `.hash()`) get pre-sorted data for free.
- **Status entry binary search** — `get_package_hashes` now uses `partition_point` on pre-sorted status entries instead of a linear scan. Reduces per-package status lookup from O(dirty_files) to O(log(dirty_files) + matched). Also adds `with_capacity` to the per-package HashMap to avoid rehashing.
- **git2 for branch/SHA** — `get_current_branch` and `get_current_sha` (called by `SCMState::get` in `to_summary`) now use `git2::Repository` instead of forking `git branch --show-current` and `git rev-parse HEAD`. Gated behind `#[cfg(feature = "git2")]` with subprocess fallback.
1 parent ef6dfd2 commit fc19b66

File tree

7 files changed

+431
-91
lines changed

7 files changed

+431
-91
lines changed

crates/turborepo-hash/src/lib.rs

Lines changed: 88 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ pub struct LockFilePackages(pub Vec<turborepo_lockfiles::Package>);
108108
pub struct LockFilePackagesRef<'a>(pub Vec<&'a turborepo_lockfiles::Package>);
109109

110110
#[derive(Debug, Clone)]
111-
pub struct FileHashes(pub HashMap<turbopath::RelativeUnixPathBuf, String>);
111+
pub struct FileHashes(pub Vec<(turbopath::RelativeUnixPathBuf, String)>);
112112

113113
/// Wrapper type for TaskOutputs to enable capnp serialization.
114114
/// This is needed due to Rust's orphan rule - we can't implement From
@@ -225,6 +225,11 @@ impl<'a> From<LockFilePackagesRef<'a>> for Builder<HeapAllocator> {
225225

226226
impl From<FileHashes> for Builder<HeapAllocator> {
227227
fn from(FileHashes(file_hashes): FileHashes) -> Self {
228+
debug_assert!(
229+
file_hashes.windows(2).all(|w| w[0].0 <= w[1].0),
230+
"FileHashes inner Vec must be sorted by key"
231+
);
232+
228233
let mut message = ::capnp::message::TypedBuilder::<
229234
proto_capnp::file_hashes::Owned,
230235
HeapAllocator,
@@ -236,13 +241,7 @@ impl From<FileHashes> for Builder<HeapAllocator> {
236241
.reborrow()
237242
.init_file_hashes(file_hashes.len() as u32);
238243

239-
// get a sorted iterator over keys and values of the hashmap
240-
// and set the entries in the capnp message
241-
242-
let mut hashable: Vec<_> = file_hashes.into_iter().collect();
243-
hashable.sort_unstable_by(|(path_a, _), (path_b, _)| path_a.cmp(path_b));
244-
245-
for (i, (key, value)) in hashable.iter().enumerate() {
244+
for (i, (key, value)) in file_hashes.iter().enumerate() {
246245
let mut entry = entries.reborrow().get(i as u32);
247246
entry.set_key(key.as_str());
248247
entry.set_value(value);
@@ -268,6 +267,11 @@ impl From<FileHashes> for Builder<HeapAllocator> {
268267

269268
impl From<&FileHashes> for Builder<HeapAllocator> {
270269
fn from(FileHashes(file_hashes): &FileHashes) -> Self {
270+
debug_assert!(
271+
file_hashes.windows(2).all(|w| w[0].0 <= w[1].0),
272+
"FileHashes inner Vec must be sorted by key"
273+
);
274+
271275
let mut message = ::capnp::message::TypedBuilder::<
272276
proto_capnp::file_hashes::Owned,
273277
HeapAllocator,
@@ -279,10 +283,7 @@ impl From<&FileHashes> for Builder<HeapAllocator> {
279283
.reborrow()
280284
.init_file_hashes(file_hashes.len() as u32);
281285

282-
let mut hashable: Vec<_> = file_hashes.iter().collect();
283-
hashable.sort_unstable_by(|(path_a, _), (path_b, _)| path_a.cmp(path_b));
284-
285-
for (i, (key, value)) in hashable.iter().enumerate() {
286+
for (i, (key, value)) in file_hashes.iter().enumerate() {
286287
let mut entry = entries.reborrow().get(i as u32);
287288
entry.set_key(key.as_str());
288289
entry.set_value(value);
@@ -590,6 +591,15 @@ mod test {
590591
lock_file_packages(packages.collect(), "4fd770c37194168e");
591592
}
592593

594+
fn sorted_file_hashes(pairs: Vec<(String, String)>) -> FileHashes {
595+
let mut v: Vec<_> = pairs
596+
.into_iter()
597+
.map(|(a, b)| (turbopath::RelativeUnixPathBuf::new(a).unwrap(), b))
598+
.collect();
599+
v.sort_by(|(a, _), (b, _)| a.cmp(b));
600+
FileHashes(v)
601+
}
602+
593603
#[test_case(vec![], "459c029558afe716" ; "empty")]
594604
#[test_case(vec![
595605
("a".to_string(), "b".to_string()),
@@ -600,27 +610,15 @@ mod test {
600610
("a".to_string(), "b".to_string()),
601611
], "c9301c0bf1899c07" ; "order resistant")]
602612
fn file_hashes(pairs: Vec<(String, String)>, expected: &str) {
603-
let file_hashes = FileHashes(
604-
pairs
605-
.into_iter()
606-
.map(|(a, b)| (turbopath::RelativeUnixPathBuf::new(a).unwrap(), b))
607-
.collect(),
608-
);
609-
assert_eq!(file_hashes.hash(), expected);
613+
assert_eq!(sorted_file_hashes(pairs).hash(), expected);
610614
}
611615

612616
#[test]
613617
fn file_hashes_ref_matches_owned() {
614-
// Verify that hashing via &FileHashes produces the same result as FileHashes
615-
let file_hashes = FileHashes(
616-
vec![
617-
("c".to_string(), "d".to_string()),
618-
("a".to_string(), "b".to_string()),
619-
]
620-
.into_iter()
621-
.map(|(a, b)| (turbopath::RelativeUnixPathBuf::new(a).unwrap(), b))
622-
.collect(),
623-
);
618+
let file_hashes = sorted_file_hashes(vec![
619+
("c".to_string(), "d".to_string()),
620+
("a".to_string(), "b".to_string()),
621+
]);
624622

625623
let ref_hash = (&file_hashes).hash();
626624
let owned_hash = file_hashes.hash();
@@ -630,12 +628,14 @@ mod test {
630628

631629
#[test]
632630
fn file_hashes_ref_large() {
633-
// Verify reference-based hashing with a larger dataset
631+
// Verify reference-based hashing with a larger dataset.
632+
// Zero-padded so lexicographic order matches numeric order.
634633
let file_hashes = FileHashes(
635634
(0..500)
636635
.map(|i| {
637636
(
638-
turbopath::RelativeUnixPathBuf::new(format!("path/to/file_{i}")).unwrap(),
637+
turbopath::RelativeUnixPathBuf::new(format!("path/to/file_{i:03}"))
638+
.unwrap(),
639639
format!("hash_{i}"),
640640
)
641641
})
@@ -683,4 +683,61 @@ mod test {
683683
assert_eq!(ref_hash, owned_hash);
684684
assert_eq!(ref_hash, "4fd770c37194168e");
685685
}
686+
687+
// Regression: FileHashes constructed from a pre-sorted Vec must produce
688+
// identical hashes to FileHashes constructed from a HashMap. This captures
689+
// the invariant that must hold when switching FileHashes from HashMap to
690+
// sorted Vec, and that hashing via &FileHashes matches the owned path.
693+
#[test]
694+
fn file_hashes_sorted_vec_pinned_values() {
695+
let pairs = [
696+
("c/z.ts", "hash_cz"),
697+
("a/b.ts", "hash_ab"),
698+
("a/a.ts", "hash_aa"),
699+
("b.ts", "hash_b"),
700+
];
701+
702+
let fh = sorted_file_hashes(
703+
pairs
704+
.iter()
705+
.map(|(p, h)| (p.to_string(), h.to_string()))
706+
.collect(),
707+
);
708+
let hash = fh.hash();
709+
710+
// Verify ref and owned produce same hash
711+
let fh2 = sorted_file_hashes(
712+
pairs
713+
.iter()
714+
.map(|(p, h)| (p.to_string(), h.to_string()))
715+
.collect(),
716+
);
717+
assert_eq!((&fh2).hash(), hash);
718+
}
719+
720+
// Regression: large FileHashes must produce deterministic hashes regardless
721+
// of original insertion order.
722+
#[test]
723+
fn file_hashes_large_deterministic() {
724+
let fh_forward = sorted_file_hashes(
725+
(0..1000)
726+
.map(|i| (format!("pkg/file_{:04}", i), format!("{:040x}", i)))
727+
.collect(),
728+
);
729+
730+
let fh_reverse = sorted_file_hashes(
731+
(0..1000)
732+
.rev()
733+
.map(|i| (format!("pkg/file_{:04}", i), format!("{:040x}", i)))
734+
.collect(),
735+
);
736+
737+
assert_eq!(
738+
fh_forward.hash(),
739+
fh_reverse.hash(),
740+
"insertion order must not affect hash output"
741+
);
742+
}
686743
}

crates/turborepo-run-cache/src/lib.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,8 @@ impl ConfigCache {
485485
Err(_) => return Err(CacheError::ConfigCacheError),
486486
};
487487

488-
// return the hash
489-
Ok(FileHashes(hash_object).hash())
488+
let mut file_hashes: Vec<_> = hash_object.into_iter().collect();
489+
file_hashes.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
490+
Ok(FileHashes(file_hashes).hash())
490491
}
491492
}

crates/turborepo-run-summary/src/task_factory.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,12 @@ where
123123
.hash(task_id)
124124
.unwrap_or_else(|| panic!("hash not found for {task_id}"));
125125

126-
let expanded_inputs = self
126+
let expanded_inputs: std::collections::BTreeMap<_, _> = self
127127
.hash_tracker
128128
.expanded_inputs(task_id)
129-
.expect("inputs not found");
129+
.expect("inputs not found")
130+
.into_iter()
131+
.collect();
130132

131133
let env_vars = self
132134
.hash_tracker

crates/turborepo-scm/src/git.rs

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,12 +172,42 @@ impl CIEnv {
172172
}
173173

174174
impl GitRepo {
175+
#[cfg(feature = "git2")]
176+
fn get_current_branch(&self) -> Result<String, Error> {
177+
let repo = git2::Repository::open(self.root.as_std_path())
178+
.map_err(|e| Error::git2_error_context(e, "opening repo for branch".into()))?;
179+
let head = repo
180+
.head()
181+
.map_err(|e| Error::git2_error_context(e, "resolving HEAD for branch".into()))?;
182+
if head.is_branch() {
183+
Ok(head.shorthand().unwrap_or("").to_string())
184+
} else {
185+
// Detached HEAD — matches `git branch --show-current` which prints nothing
186+
Ok(String::new())
187+
}
188+
}
189+
190+
#[cfg(not(feature = "git2"))]
175191
fn get_current_branch(&self) -> Result<String, Error> {
176192
let output = self.execute_git_command(&["branch", "--show-current"], "")?;
177193
let output = String::from_utf8(output)?;
178194
Ok(output.trim().to_owned())
179195
}
180196

197+
#[cfg(feature = "git2")]
198+
fn get_current_sha(&self) -> Result<String, Error> {
199+
let repo = git2::Repository::open(self.root.as_std_path())
200+
.map_err(|e| Error::git2_error_context(e, "opening repo for sha".into()))?;
201+
let head = repo
202+
.head()
203+
.map_err(|e| Error::git2_error_context(e, "resolving HEAD for sha".into()))?;
204+
let commit = head
205+
.peel_to_commit()
206+
.map_err(|e| Error::git2_error_context(e, "peeling HEAD to commit".into()))?;
207+
Ok(commit.id().to_string())
208+
}
209+
210+
#[cfg(not(feature = "git2"))]
181211
fn get_current_sha(&self) -> Result<String, Error> {
182212
let output = self.execute_git_command(&["rev-parse", "HEAD"], "")?;
183213
let output = String::from_utf8(output)?;
@@ -1422,4 +1452,103 @@ mod tests {
14221452

14231453
assert_eq!(None, actual);
14241454
}
1455+
1456+
// Regression: git2 get_current_branch and get_current_sha must match the
1457+
// subprocess equivalents. This captures the invariant that must hold when
1458+
// switching from `git branch --show-current` / `git rev-parse HEAD` to
1459+
// git2::Repository.
1460+
#[test]
1461+
fn test_git2_branch_and_sha_match_subprocess() {
1462+
let (repo_root, repo) = setup_repository(None).unwrap();
1463+
let root = repo_root.path();
1464+
1465+
// Create an initial commit so HEAD exists
1466+
let file_path = root.join("file.txt");
1467+
fs::write(&file_path, "hello").unwrap();
1468+
let mut index = repo.index().unwrap();
1469+
index.add_path(Path::new("file.txt")).unwrap();
1470+
index.write().unwrap();
1471+
let tree_id = index.write_tree().unwrap();
1472+
let tree = repo.find_tree(tree_id).unwrap();
1473+
let sig = repo.signature().unwrap();
1474+
repo.commit(Some("HEAD"), &sig, &sig, "initial", &tree, &[])
1475+
.unwrap();
1476+
1477+
// Create a branch name and switch to it
1478+
repo.branch(
1479+
"test-branch",
1480+
&repo.head().unwrap().peel_to_commit().unwrap(),
1481+
false,
1482+
)
1483+
.unwrap();
1484+
repo.set_head("refs/heads/test-branch").unwrap();
1485+
1486+
let abs_root = AbsoluteSystemPathBuf::try_from(root).unwrap();
1487+
let git_repo = GitRepo::find(&abs_root).unwrap();
1488+
1489+
// Subprocess-based results
1490+
let subprocess_branch = git_repo.get_current_branch().unwrap();
1491+
let subprocess_sha = git_repo.get_current_sha().unwrap();
1492+
1493+
// git2-based equivalents
1494+
let git2_repo = Repository::open(root).unwrap();
1495+
let head = git2_repo.head().unwrap();
1496+
let git2_branch = head.shorthand().unwrap_or("").to_string();
1497+
let git2_sha = head.peel_to_commit().unwrap().id().to_string();
1498+
1499+
assert_eq!(
1500+
subprocess_branch, git2_branch,
1501+
"git2 branch must match subprocess"
1502+
);
1503+
assert_eq!(subprocess_sha, git2_sha, "git2 sha must match subprocess");
1504+
}
1505+
1506+
// Regression: detached HEAD should return empty branch name.
1507+
#[test]
1508+
fn test_git2_detached_head_branch() {
1509+
let (repo_root, repo) = setup_repository(None).unwrap();
1510+
let root = repo_root.path();
1511+
1512+
// Create an initial commit
1513+
let file_path = root.join("file.txt");
1514+
fs::write(&file_path, "hello").unwrap();
1515+
let mut index = repo.index().unwrap();
1516+
index.add_path(Path::new("file.txt")).unwrap();
1517+
index.write().unwrap();
1518+
let tree_id = index.write_tree().unwrap();
1519+
let tree = repo.find_tree(tree_id).unwrap();
1520+
let sig = repo.signature().unwrap();
1521+
let commit_oid = repo
1522+
.commit(Some("HEAD"), &sig, &sig, "initial", &tree, &[])
1523+
.unwrap();
1524+
1525+
// Detach HEAD
1526+
repo.set_head_detached(commit_oid).unwrap();
1527+
1528+
let abs_root = AbsoluteSystemPathBuf::try_from(root).unwrap();
1529+
let git_repo = GitRepo::find(&abs_root).unwrap();
1530+
1531+
// Subprocess returns empty string for detached HEAD
1532+
let subprocess_branch = git_repo.get_current_branch().unwrap();
1533+
assert!(
1534+
subprocess_branch.is_empty(),
1535+
"detached HEAD branch should be empty, got: {subprocess_branch}"
1536+
);
1537+
1538+
// git2: on a detached HEAD, shorthand() is not empty (NOTE(review): it
1539+
// returns "HEAD" per libgit2 — confirm), so we must check is_branch() first.
1540+
let git2_repo = Repository::open(root).unwrap();
1541+
let head = git2_repo.head().unwrap();
1542+
let is_branch = head.is_branch();
1543+
let git2_branch = if is_branch {
1544+
head.shorthand().unwrap_or("").to_string()
1545+
} else {
1546+
String::new()
1547+
};
1548+
1549+
assert_eq!(
1550+
subprocess_branch, git2_branch,
1551+
"git2 detached HEAD branch must match subprocess (empty)"
1552+
);
1553+
}
14251554
}

0 commit comments

Comments
 (0)