Skip to content

Commit 47f6751

Browse files
committed
Replace method_references HashMap with SortedVecMap
Method references (8.96M entries) are bulk-loaded during indexing, never
accessed during resolution, and only queried/removed during incremental
updates and FFI queries. This access pattern is ideal for a sorted Vec:

- HashMap: ~16M slots × 41 bytes = ~656 MB (load factor + control bytes)
- SortedVec: 8.96M × 40 bytes = ~342 MB (no overhead)

Introduces SortedVecMap<K, V> backed by a sorted Vec<(K, V)> with O(log n)
binary-search lookups. Uses UnsafeCell for lazy sorting so get() can take
&self. Entries are bulk-pushed unsorted during indexing, then sorted lazily
on first lookup.

## Benchmark (cumulative with NonZeroU64 change)

| Metric                  | Before    | After     | Delta     |
|-------------------------|-----------|-----------|-----------|
| Memory (peak footprint) | 4480 MB   | 3542 MB   | -20.9%    |
| Declarations            | 895,844   | 895,844   | identical |
| Definitions             | 1,063,171 | 1,063,171 | identical |
1 parent f936bb1 commit 47f6751

File tree

4 files changed

+173
-8
lines changed

4 files changed

+173
-8
lines changed

rust/rubydex/src/main.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ fn main() {
8686
eprintln!("{error}");
8787
}
8888

89+
// Sort method references after bulk loading for efficient lookup
90+
graph.sort_method_references();
91+
8992
if let Some(StopAfter::Indexing) = args.stop_after {
9093
return exit(args.stats);
9194
}

rust/rubydex/src/model.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ pub mod identity_maps;
99
pub mod ids;
1010
pub mod name;
1111
pub mod references;
12+
pub mod sorted_vec_map;
1213
pub mod string_ref;
1314
pub mod visibility;

rust/rubydex/src/model/graph.rs

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use crate::model::identity_maps::{IdentityHashMap, IdentityHashSet};
1111
use crate::model::ids::{DeclarationId, DefinitionId, NameId, ReferenceId, StringId, UriId};
1212
use crate::model::name::{Name, NameRef, ParentScope, ResolvedName};
1313
use crate::model::references::{ConstantReference, MethodRef};
14+
use crate::model::sorted_vec_map::SortedVecMap;
1415
use crate::model::string_ref::StringRef;
1516
use crate::stats;
1617

@@ -48,8 +49,9 @@ pub struct Graph {
4849
names: IdentityHashMap<NameId, NameRef>,
4950
// Map of constant references
5051
constant_references: IdentityHashMap<ReferenceId, ConstantReference>,
51-
// Map of method references that still need to be resolved
52-
method_references: IdentityHashMap<ReferenceId, MethodRef>,
52+
// Method references stored as a sorted vec for compact memory usage.
53+
// Not used during resolution — only queried and removed during incremental updates.
54+
method_references: SortedVecMap<ReferenceId, MethodRef>,
5355

5456
/// The position encoding used for LSP line/column locations. Not related to the actual encoding of the file
5557
position_encoding: Encoding,
@@ -69,7 +71,7 @@ impl Graph {
6971
strings: IdentityHashMap::default(),
7072
names: IdentityHashMap::default(),
7173
constant_references: IdentityHashMap::default(),
72-
method_references: IdentityHashMap::default(),
74+
method_references: SortedVecMap::new(),
7375
position_encoding: Encoding::default(),
7476
name_dependents: IdentityHashMap::default(),
7577
}
@@ -358,12 +360,18 @@ impl Graph {
358360
&self.constant_references
359361
}
360362

361-
// Returns an immutable reference to the method references map
363+
// Returns an immutable reference to the method references
362364
#[must_use]
363-
pub fn method_references(&self) -> &IdentityHashMap<ReferenceId, MethodRef> {
365+
pub fn method_references(&self) -> &SortedVecMap<ReferenceId, MethodRef> {
364366
&self.method_references
365367
}
366368

369+
/// Sort method references for efficient lookup.
370+
/// Must be called after bulk loading (indexing) is complete.
371+
pub fn sort_method_references(&mut self) {
372+
self.method_references.sort();
373+
}
374+
367375
#[must_use]
368376
pub fn all_diagnostics(&self) -> Vec<&Diagnostic> {
369377
let document_diagnostics = self.documents.values().flat_map(Document::all_diagnostics);
@@ -776,9 +784,7 @@ impl Graph {
776784
}
777785

778786
for (method_ref_id, method_ref) in method_references {
779-
if self.method_references.insert(method_ref_id, method_ref).is_some() {
780-
debug_assert!(false, "Method ReferenceId collision in global graph");
781-
}
787+
self.method_references.push(method_ref_id, method_ref);
782788
}
783789

784790
for (name_id, deps) in name_dependents {
@@ -945,6 +951,7 @@ impl Graph {
945951
&self.position_encoding
946952
}
947953

954+
948955
#[allow(clippy::cast_precision_loss)]
949956
pub fn print_query_statistics(&self) {
950957
use std::collections::{HashMap, HashSet};
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
use std::cell::{Cell, UnsafeCell};

/// A sorted Vec-based map for compact storage of bulk-loaded key-value pairs.
///
/// Uses a flat `Vec<(K, V)>` sorted by key for O(log n) lookups.
/// Much more memory-efficient than HashMap for large collections that are
/// bulk-loaded then mostly read, with occasional removals.
///
/// # When to use
///
/// Use this instead of HashMap when **all** of the following are true:
/// - The collection is large (millions of entries) — small maps don't benefit
/// - Entries are bulk-loaded (e.g., during indexing), not inserted one-at-a-time in hot loops
/// - Lookups are infrequent or off the critical path (O(log n) binary search, not O(1))
///
/// **Do not use** for collections accessed in tight loops during resolution or
/// other hot paths. Binary search on large arrays causes cache misses that
/// make it slower than HashMap despite the memory savings. For example,
/// `constant_references` (5.64M entries, queried millions of times during
/// resolution) showed a 24% time regression when switched to SortedVecMap.
///
/// # Implementation
///
/// Uses interior mutability for lazy sorting: `get()` takes `&self` and sorts
/// on first access if needed. The `sorted` flag lives in a `Cell` so the
/// common already-sorted path never creates a `&mut` through the `UnsafeCell`
/// — doing so would alias (and invalidate) any `&V` previously handed out by
/// `get`. A `&mut` is created only when a sort is actually required, which can
/// only happen right after a `&mut self` operation (`push`), when no shared
/// borrows of the entries can be live.
pub struct SortedVecMap<K, V> {
    // UnsafeCell allows sorting through `&self` (lazy sorting in `get`).
    // SAFETY: see `ensure_sorted` and the `Send`/`Sync` impls below.
    entries: UnsafeCell<Vec<(K, V)>>,
    // Whether `entries` is currently sorted by key. A `Cell` read needs no
    // `&mut`, so checking the flag cannot invalidate outstanding `&V` borrows.
    sorted: Cell<bool>,
}

// SAFETY: SortedVecMap owns its data; moving it to another thread moves the
// UnsafeCell together with everything it guards, so Send follows from the
// Send-ness of the contents.
unsafe impl<K: Send, V: Send> Send for SortedVecMap<K, V> {}
// SAFETY(review): this Sync impl is sound ONLY under external synchronization.
// Two threads calling `get` concurrently on an *unsorted* map would race on
// the lazy sort. In rubydex the Graph is stored in Arc<Mutex<>> in the MCP
// server, which serializes all access — confirm no other shared-reference
// path exists before relying on this impl.
unsafe impl<K: Send + Sync, V: Send + Sync> Sync for SortedVecMap<K, V> {}

impl<K: std::fmt::Debug, V: std::fmt::Debug> std::fmt::Debug for SortedVecMap<K, V> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // SAFETY: shared read only; the sole `&mut` ever created through the
        // cell (inside `ensure_sorted`) is not live while `fmt` runs.
        let entries = unsafe { &*self.entries.get() };
        f.debug_struct("SortedVecMap")
            .field("entries", entries)
            .field("sorted", &self.sorted.get())
            .finish()
    }
}

impl<K, V> Default for SortedVecMap<K, V> {
    fn default() -> Self {
        Self {
            entries: UnsafeCell::new(Vec::new()),
            // An empty vec is trivially sorted.
            sorted: Cell::new(true),
        }
    }
}

impl<K: Ord + Copy, V> SortedVecMap<K, V> {
    /// Creates an empty map.
    pub fn new() -> Self {
        Self::default()
    }

    /// Shared view of the backing storage.
    fn entries(&self) -> &Vec<(K, V)> {
        // SAFETY: the only `&mut` ever created through the cell is inside
        // `ensure_sorted`, and it is dropped before that function returns.
        unsafe { &*self.entries.get() }
    }

    /// Sorts the entries by key if they are not already sorted.
    ///
    /// The fast path (already sorted) only reads the `Cell` flag and creates
    /// no `&mut`, so it is safe to call while `&V` references from earlier
    /// `get` calls are still alive.
    fn ensure_sorted(&self) {
        if self.sorted.get() {
            return;
        }
        // SAFETY: `sorted == false` implies the most recent access was a
        // `&mut self` operation (`push`), so the borrow checker guarantees no
        // shared references into `entries` are outstanding here.
        let entries = unsafe { &mut *self.entries.get() };
        entries.sort_unstable_by_key(|(k, _)| *k);
        self.sorted.set(true);
    }

    /// Push a new entry without maintaining sort order (bulk-load path).
    /// The next lookup will trigger a full lazy sort.
    pub fn push(&mut self, key: K, value: V) {
        self.entries.get_mut().push((key, value));
        self.sorted.set(false);
    }

    /// Insert a key-value pair, returning the previous value if the key already existed.
    pub fn insert(&mut self, key: K, value: V) -> Option<V> {
        self.ensure_sorted();
        let entries = self.entries.get_mut();
        match entries.binary_search_by_key(&key, |(k, _)| *k) {
            Ok(index) => Some(std::mem::replace(&mut entries[index].1, value)),
            Err(index) => {
                entries.insert(index, (key, value));
                None
            }
        }
    }

    /// Look up a value by key. Lazily sorts if needed.
    pub fn get(&self, key: &K) -> Option<&V> {
        self.ensure_sorted();
        let entries = self.entries();
        entries
            .binary_search_by_key(key, |(k, _)| *k)
            .ok()
            .map(|index| &entries[index].1)
    }

    /// Remove an entry by key, returning the value if found.
    pub fn remove(&mut self, key: &K) -> Option<V> {
        self.ensure_sorted();
        let entries = self.entries.get_mut();
        match entries.binary_search_by_key(key, |(k, _)| *k) {
            Ok(index) => Some(entries.remove(index).1),
            Err(_) => None,
        }
    }

    /// Number of entries currently stored.
    #[must_use]
    pub fn len(&self) -> usize {
        self.entries().len()
    }

    /// Whether the map holds no entries.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.entries().is_empty()
    }

    /// Iterates over keys. Order is key-sorted only once a sort has run since
    /// the last `push`; otherwise it is insertion order.
    pub fn keys(&self) -> impl Iterator<Item = &K> {
        self.entries().iter().map(|(k, _)| k)
    }

    /// Iterates over values (same ordering caveat as `keys`).
    pub fn values(&self) -> impl Iterator<Item = &V> {
        self.entries().iter().map(|(_, v)| v)
    }

    /// Iterates over `(key, value)` pairs (same ordering caveat as `keys`).
    pub fn iter(&self) -> impl Iterator<Item = (&K, &V)> {
        self.entries().iter().map(|(k, v)| (k, v))
    }

    /// Releases excess capacity after bulk loading.
    pub fn shrink_to_fit(&mut self) {
        self.entries.get_mut().shrink_to_fit();
    }

    /// Sort the map if not already sorted.
    pub fn sort(&mut self) {
        self.ensure_sorted();
    }
}

0 commit comments

Comments
 (0)