Skip to content

Commit d16f081

Browse files
authored
perf: Reduce allocations in SCM hashing, glob preprocessing, and cache lookups (#11916)
## Summary Reduce heap allocations and syscalls across the hot paths of `turbo run`. These are mechanical changes — no behavioral differences, no new APIs. ### Benchmarks (`--dry` runs, `--skip-infer`, 10 runs each, 5 warmup) | Repo | Packages | Tasks | Before | After | Delta | |------|----------|-------|--------|-------|-------| | Large | ~1000 | 1690 | 2.179s ± 0.073s | 2.130s ± 0.049s | **1.02x faster** | | Medium | ~120 | ~200 | 1.235s ± 0.098s | 1.216s ± 0.079s | **1.02x faster** | | Small | ~5 | ~5 | 834.1ms ± 28.4ms | 816.8ms ± 22.8ms | **1.02x faster** | ### Changes **SCM parsing** (`ls_tree.rs`, `status.rs`): `entry.hash.to_vec()` + `String::from_utf8()` allocated twice per entry (intermediate `Vec<u8>` then `String`). Now uses `str::from_utf8()` + `.to_owned()` for a single allocation. BufReader buffer increased from 8KB to 64KB to reduce `read()` syscalls on large git output. **File hashing** (`hash_object.rs`): `hashes.reserve(to_hash.len())` pre-allocates the HashMap to avoid rehashing during insertion. **Package deps** (`package_deps.rs`): Input globs were cloned to `String` just to iterate them. Now works with `&str` references and reuses a single `String` buffer for the `"{package_path}/{glob}"` join instead of allocating per-iteration via `.join("/")`. Capacity hints added to `inclusions`, `exclusions`, `to_hash`, and `hashes` Vecs/HashMaps. When include globs overlap with the git index (the `$TURBO_DEFAULT$` + explicit inputs case), files already known from the index are skipped instead of being re-hashed. **Task hashing** (`lib.rs`): Dependency hash deduplication now collects `&str` references under the mutex lock instead of cloning each `String` into the `HashSet`. Owned strings are only allocated after dedup, halving allocations. Capacity hint added. **Glob preprocessing** (`globwalk/lib.rs`): Capacity hints on include/exclude path Vecs. 
Exclude path processing avoids an unnecessary `.to_string()` by using `Cow::into_owned()`, which is free (no extra allocation) when the Cow is already owned and only clones when it is still borrowed. **Cache lookups** (`fs.rs`): `FSCache::exists()` was doing 3 `format!()` + `join_component()` allocations per call (called once per task). Now reuses a single `String` buffer, truncating and re-appending suffixes for `.tar`, `.tar.zst`, and `-meta.json`.
1 parent fb5ef0e commit d16f081

File tree

7 files changed

+152
-102
lines changed

7 files changed

+152
-102
lines changed

crates/turborepo-cache/src/fs.rs

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,19 +125,29 @@ impl FSCache {
125125

126126
#[tracing::instrument(skip_all)]
127127
pub(crate) fn exists(&self, hash: &str) -> Result<Option<CacheHitMetadata>, CacheError> {
128-
let uncompressed_cache_path = self.cache_directory.join_component(&format!("{hash}.tar"));
129-
let compressed_cache_path = self
130-
.cache_directory
131-
.join_component(&format!("{hash}.tar.zst"));
128+
let cache_dir = self.cache_directory.as_str();
129+
let mut buf = String::with_capacity(cache_dir.len() + 1 + hash.len() + "-meta.json".len());
130+
buf.push_str(cache_dir);
131+
buf.push(std::path::MAIN_SEPARATOR);
132+
buf.push_str(hash);
133+
let prefix_len = buf.len();
134+
135+
buf.push_str(".tar");
136+
let uncompressed_exists = std::path::Path::new(&buf).exists();
132137

133-
if !uncompressed_cache_path.exists() && !compressed_cache_path.exists() {
138+
buf.push_str(".zst");
139+
let compressed_exists = std::path::Path::new(&buf).exists();
140+
141+
if !uncompressed_exists && !compressed_exists {
134142
return Ok(None);
135143
}
136144

145+
buf.truncate(prefix_len);
146+
buf.push_str("-meta.json");
147+
137148
let duration = CacheMetadata::read(
138-
&self
139-
.cache_directory
140-
.join_component(&format!("{hash}-meta.json")),
149+
&AbsoluteSystemPathBuf::try_from(buf.as_str())
150+
.map_err(|_| CacheError::ConfigCacheInvalidBase)?,
141151
)
142152
.map(|meta| meta.duration)
143153
.unwrap_or(0);

crates/turborepo-globwalk/src/lib.rs

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -80,25 +80,22 @@ fn preprocess_paths_and_globs<S: AsRef<str>>(
8080
.ok_or(WalkError::InvalidPath)?;
8181
let base_path_slash = escape_glob_literals(&raw_slash);
8282

83-
let (include_paths, lowest_segment) = include
84-
.iter()
85-
.map(|s| fix_glob_pattern(s.as_ref()).into_owned())
86-
.map(|mut s| {
83+
let (include_paths, lowest_segment) = {
84+
let mut paths = Vec::with_capacity(include.len());
85+
let mut lowest = usize::MAX;
86+
for s in include {
87+
let mut fixed = fix_glob_pattern(s.as_ref()).into_owned();
8788
// We need to check inclusion globs before the join
8889
// as to_slash doesn't preserve Windows drive names.
89-
add_doublestar_to_dir(base_path, &mut s);
90-
s
91-
})
92-
.map(|s| join_unix_like_paths(&base_path_slash, &s))
93-
.filter_map(|s| collapse_path(&s).map(|(s, v)| (s.to_string(), v)))
94-
.fold(
95-
(vec![], usize::MAX),
96-
|(mut vec, lowest_segment), (path, lowest_segment_next)| {
97-
let lowest_segment = std::cmp::min(lowest_segment, lowest_segment_next);
98-
vec.push(path); // we stringify here due to lifetime issues
99-
(vec, lowest_segment)
100-
},
101-
);
90+
add_doublestar_to_dir(base_path, &mut fixed);
91+
let joined = join_unix_like_paths(&base_path_slash, &fixed);
92+
if let Some((collapsed, segment)) = collapse_path(&joined) {
93+
lowest = std::cmp::min(lowest, segment);
94+
paths.push(collapsed.into_owned());
95+
}
96+
}
97+
(paths, lowest)
98+
};
10299

103100
let base_path = base_path
104101
.components()
@@ -108,16 +105,13 @@ fn preprocess_paths_and_globs<S: AsRef<str>>(
108105
)
109106
.collect::<PathBuf>();
110107

111-
let mut exclude_paths = vec![];
112-
for split in exclude
113-
.iter()
114-
.map(|s| fix_glob_pattern(s.as_ref()))
115-
.map(|s| join_unix_like_paths(&base_path_slash, s.as_ref()))
116-
.filter_map(|g| collapse_path(&g).map(|(s, _)| s.to_string()))
117-
{
118-
// if the glob ends with a slash, then we need to add a double star,
119-
// unless it already ends with a double star
120-
add_trailing_double_star(&mut exclude_paths, &split);
108+
let mut exclude_paths = Vec::with_capacity(exclude.len() * 2);
109+
for s in exclude {
110+
let fixed = fix_glob_pattern(s.as_ref());
111+
let joined = join_unix_like_paths(&base_path_slash, fixed.as_ref());
112+
if let Some((collapsed, _)) = collapse_path(&joined) {
113+
add_trailing_double_star(&mut exclude_paths, &collapsed);
114+
}
121115
}
122116

123117
Ok((base_path, include_paths, exclude_paths))

crates/turborepo-scm/src/hash_object.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ pub(crate) fn hash_objects(
4545
) -> Result<(), Error> {
4646
let pkg_prefix = git_root.anchor(pkg_path).ok().map(|a| a.to_unix());
4747

48+
hashes.reserve(to_hash.len());
4849
let results: Vec<Result<Option<(RelativeUnixPathBuf, String)>, Error>> = to_hash
4950
.into_par_iter()
5051
.map(|filename| {

crates/turborepo-scm/src/ls_tree.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,26 +71,32 @@ impl GitRepo {
7171
}
7272

7373
fn read_ls_tree<R: Read>(reader: R, hashes: &mut GitHashes) -> Result<(), Error> {
74-
let mut reader = BufReader::new(reader);
74+
let mut reader = BufReader::with_capacity(64 * 1024, reader);
7575
let mut buffer = Vec::new();
7676
while reader.read_until(b'\0', &mut buffer)? != 0 {
7777
let entry = parse_ls_tree(&buffer)?;
78-
let hash = String::from_utf8(entry.hash.to_vec())?;
79-
let path = RelativeUnixPathBuf::new(String::from_utf8(entry.filename.to_vec())?)?;
80-
hashes.insert(path, hash);
78+
let hash = std::str::from_utf8(entry.hash)
79+
.map_err(|e| Error::git_error(format!("invalid utf8 in ls-tree hash: {e}")))?;
80+
let filename = std::str::from_utf8(entry.filename)
81+
.map_err(|e| Error::git_error(format!("invalid utf8 in ls-tree filename: {e}")))?;
82+
let path = RelativeUnixPathBuf::new(filename)?;
83+
hashes.insert(path, hash.to_owned());
8184
buffer.clear();
8285
}
8386
Ok(())
8487
}
8588

8689
fn read_ls_tree_sorted<R: Read>(reader: R, hashes: &mut SortedGitHashes) -> Result<(), Error> {
87-
let mut reader = BufReader::new(reader);
90+
let mut reader = BufReader::with_capacity(64 * 1024, reader);
8891
let mut buffer = Vec::new();
8992
while reader.read_until(b'\0', &mut buffer)? != 0 {
9093
let entry = parse_ls_tree(&buffer)?;
91-
let hash = String::from_utf8(entry.hash.to_vec())?;
92-
let path = RelativeUnixPathBuf::new(String::from_utf8(entry.filename.to_vec())?)?;
93-
hashes.insert(path, hash);
94+
let hash = std::str::from_utf8(entry.hash)
95+
.map_err(|e| Error::git_error(format!("invalid utf8 in ls-tree hash: {e}")))?;
96+
let filename = std::str::from_utf8(entry.filename)
97+
.map_err(|e| Error::git_error(format!("invalid utf8 in ls-tree filename: {e}")))?;
98+
let path = RelativeUnixPathBuf::new(filename)?;
99+
hashes.insert(path, hash.to_owned());
94100
buffer.clear();
95101
}
96102
Ok(())

crates/turborepo-scm/src/package_deps.rs

Lines changed: 83 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -200,45 +200,30 @@ impl GitRepo {
200200
let package_unix_path_buf = package_path.to_unix();
201201
let package_unix_path = package_unix_path_buf.as_str();
202202

203-
let mut inputs = inputs
204-
.iter()
205-
.map(|s| s.as_ref().to_string())
206-
.collect::<Vec<String>>();
207-
208-
if include_configs {
209-
// Add in package.json and turbo.json to input patterns. Both file paths are
210-
// relative to pkgPath
211-
//
212-
// - package.json is an input because if the `scripts` in the package.json
213-
// change (i.e. the tasks that turbo executes), we want a cache miss, since
214-
// any existing cache could be invalid.
215-
// - turbo.json because it's the definition of the tasks themselves. The root
216-
// turbo.json is similarly included in the global hash. This file may not
217-
// exist in the workspace, but that is ok, because it will get ignored
218-
// downstream.
219-
inputs.push("package.json".to_string());
220-
inputs.push("turbo.json".to_string());
221-
inputs.push("turbo.jsonc".to_string());
222-
}
203+
static CONFIG_FILES: &[&str] = &["package.json", "turbo.json", "turbo.jsonc"];
204+
let extra_inputs = if include_configs { CONFIG_FILES } else { &[] };
205+
let total_inputs = inputs.len() + extra_inputs.len();
206+
207+
let mut inclusions = Vec::with_capacity(total_inputs);
208+
let mut exclusions = Vec::with_capacity(total_inputs);
209+
let mut glob_buf = String::with_capacity(package_unix_path.len() + 1 + 64);
223210

224-
// The input patterns are relative to the package.
225-
// However, we need to change the globbing to be relative to the repo root.
226-
// Prepend the package path to each of the input patterns.
227-
//
228-
// FIXME: we don't yet error on absolute unix paths being passed in as inputs,
229-
// and instead tack them on as if they were relative paths. This should be an
230-
// error further upstream, but since we haven't pulled the switch yet,
231-
// we need to mimic the Go behavior here and trim leading `/`
232-
// characters.
233-
let mut inclusions = vec![];
234-
let mut exclusions = vec![];
235-
for raw_glob in inputs {
211+
let all_inputs = inputs
212+
.iter()
213+
.map(|s| s.as_ref())
214+
.chain(extra_inputs.iter().copied());
215+
for raw_glob in all_inputs {
216+
glob_buf.clear();
236217
if let Some(exclusion) = raw_glob.strip_prefix('!') {
237-
let glob_str = [package_unix_path, exclusion.trim_start_matches('/')].join("/");
238-
exclusions.push(ValidatedGlob::from_str(&glob_str)?);
218+
glob_buf.push_str(package_unix_path);
219+
glob_buf.push('/');
220+
glob_buf.push_str(exclusion.trim_start_matches('/'));
221+
exclusions.push(ValidatedGlob::from_str(&glob_buf)?);
239222
} else {
240-
let glob_str = [package_unix_path, raw_glob.trim_start_matches('/')].join("/");
241-
inclusions.push(ValidatedGlob::from_str(&glob_str)?);
223+
glob_buf.push_str(package_unix_path);
224+
glob_buf.push('/');
225+
glob_buf.push_str(raw_glob.trim_start_matches('/'));
226+
inclusions.push(ValidatedGlob::from_str(&glob_buf)?);
242227
}
243228
}
244229
let files = globwalk::globwalk(
@@ -247,14 +232,11 @@ impl GitRepo {
247232
&exclusions,
248233
globwalk::WalkType::Files,
249234
)?;
250-
let to_hash = files
251-
.iter()
252-
.map(|entry| {
253-
let path = self.root.anchor(entry)?.to_unix();
254-
Ok(path)
255-
})
256-
.collect::<Result<Vec<_>, Error>>()?;
257-
let mut hashes = GitHashes::new();
235+
let mut to_hash = Vec::with_capacity(files.len());
236+
for entry in &files {
237+
to_hash.push(self.root.anchor(entry)?.to_unix());
238+
}
239+
let mut hashes = GitHashes::with_capacity(files.len());
258240
hash_objects(&self.root, &full_pkg_path, to_hash, &mut hashes)?;
259241
Ok(hashes)
260242
}
@@ -283,16 +265,66 @@ impl GitRepo {
283265
}
284266

285267
// Include globs can find files not in the git index (e.g. gitignored files
286-
// that a user explicitly wants to track). We still need globwalk for these
287-
// but can skip re-hashing files already known from the index.
268+
// that a user explicitly wants to track). Walk the filesystem for these
269+
// files but skip re-hashing any already known from the index.
288270
if !includes.is_empty() {
289-
let include_hashes = self.get_package_file_hashes_from_inputs(
271+
let full_pkg_path = turbo_root.resolve(package_path);
272+
let package_unix_path_buf = package_path.to_unix();
273+
let package_unix_path = package_unix_path_buf.as_str();
274+
275+
static CONFIG_FILES: &[&str] = &["package.json", "turbo.json", "turbo.jsonc"];
276+
let mut inclusions = Vec::with_capacity(includes.len() + CONFIG_FILES.len());
277+
let mut exclusions = Vec::new();
278+
let mut glob_buf = String::with_capacity(package_unix_path.len() + 1 + 64);
279+
280+
let all = includes.iter().copied().chain(CONFIG_FILES.iter().copied());
281+
for raw_glob in all {
282+
glob_buf.clear();
283+
if let Some(exclusion) = raw_glob.strip_prefix('!') {
284+
glob_buf.push_str(package_unix_path);
285+
glob_buf.push('/');
286+
glob_buf.push_str(exclusion.trim_start_matches('/'));
287+
exclusions.push(ValidatedGlob::from_str(&glob_buf)?);
288+
} else {
289+
glob_buf.push_str(package_unix_path);
290+
glob_buf.push('/');
291+
glob_buf.push_str(raw_glob.trim_start_matches('/'));
292+
inclusions.push(ValidatedGlob::from_str(&glob_buf)?);
293+
}
294+
}
295+
296+
let files = globwalk::globwalk(
290297
turbo_root,
291-
package_path,
292-
&includes,
293-
true,
298+
&inclusions,
299+
&exclusions,
300+
globwalk::WalkType::Files,
294301
)?;
295-
hashes.extend(include_hashes);
302+
303+
// Only hash files not already present from the git index
304+
let mut to_hash = Vec::new();
305+
for entry in &files {
306+
let git_relative = self.root.anchor(entry)?.to_unix();
307+
let pkg_relative = turbopath::RelativeUnixPath::strip_prefix(
308+
&git_relative,
309+
&package_unix_path_buf,
310+
)
311+
.ok()
312+
.map(|s| s.to_owned());
313+
314+
let already_known = pkg_relative
315+
.as_ref()
316+
.is_some_and(|rel| hashes.contains_key(rel));
317+
318+
if !already_known {
319+
to_hash.push(git_relative);
320+
}
321+
}
322+
323+
if !to_hash.is_empty() {
324+
let mut new_hashes = GitHashes::with_capacity(to_hash.len());
325+
hash_objects(&self.root, &full_pkg_path, to_hash, &mut new_hashes)?;
326+
hashes.extend(new_hashes);
327+
}
296328
}
297329

298330
// Apply excludes via in-memory matching — no filesystem walk needed since

crates/turborepo-scm/src/status.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,13 @@ fn read_status<R: Read>(
8383
hashes: &mut GitHashes,
8484
) -> Result<Vec<RelativeUnixPathBuf>, Error> {
8585
let mut to_hash = Vec::new();
86-
let mut reader = BufReader::new(reader);
86+
let mut reader = BufReader::with_capacity(64 * 1024, reader);
8787
let mut buffer = Vec::new();
8888
while reader.read_until(b'\0', &mut buffer)? != 0 {
8989
let entry = parse_status(&buffer)?;
90-
let path = RelativeUnixPathBuf::new(String::from_utf8(entry.filename.to_owned())?)?;
90+
let filename = std::str::from_utf8(entry.filename)
91+
.map_err(|e| Error::git_error(format!("invalid utf8 in git status: {e}")))?;
92+
let path = RelativeUnixPathBuf::new(filename)?;
9193
if entry.is_delete {
9294
let path = path.strip_prefix(pkg_prefix).map_err(|_| {
9395
Error::git_error(format!(
@@ -106,11 +108,13 @@ fn read_status<R: Read>(
106108

107109
fn read_status_raw<R: Read>(reader: R) -> Result<Vec<RepoStatusEntry>, Error> {
108110
let mut entries = Vec::new();
109-
let mut reader = BufReader::new(reader);
111+
let mut reader = BufReader::with_capacity(64 * 1024, reader);
110112
let mut buffer = Vec::new();
111113
while reader.read_until(b'\0', &mut buffer)? != 0 {
112114
let entry = parse_status(&buffer)?;
113-
let path = RelativeUnixPathBuf::new(String::from_utf8(entry.filename.to_owned())?)?;
115+
let filename = std::str::from_utf8(entry.filename)
116+
.map_err(|e| Error::git_error(format!("invalid utf8 in git status: {e}")))?;
117+
let path = RelativeUnixPathBuf::new(filename)?;
114118
entries.push(RepoStatusEntry {
115119
path,
116120
is_delete: entry.is_delete,

crates/turborepo-task-hash/src/lib.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ impl<'a, R: RunOptsHashInfo> TaskHasher<'a, R> {
419419
.lock()
420420
.expect("hash tracker mutex poisoned");
421421

422-
let mut dependency_hash_set = HashSet::new();
422+
let mut dependency_hash_set = HashSet::with_capacity(dependency_set.len());
423423
for dependency_task in dependency_set {
424424
let TaskNode::Task(dependency_task_id) = dependency_task else {
425425
continue;
@@ -429,11 +429,14 @@ impl<'a, R: RunOptsHashInfo> TaskHasher<'a, R> {
429429
.package_task_hashes
430430
.get(dependency_task_id)
431431
.ok_or_else(|| Error::MissingDependencyTaskHash(dependency_task.to_string()))?;
432-
dependency_hash_set.insert(dependency_hash.clone());
432+
dependency_hash_set.insert(dependency_hash.as_str());
433433
}
434-
drop(state);
435434

436-
let mut dependency_hash_list = dependency_hash_set.into_iter().collect::<Vec<_>>();
435+
let mut dependency_hash_list: Vec<String> = dependency_hash_set
436+
.into_iter()
437+
.map(|s| s.to_owned())
438+
.collect();
439+
drop(state);
437440
dependency_hash_list.sort_unstable();
438441

439442
Ok(dependency_hash_list)

0 commit comments

Comments
 (0)