Skip to content

Commit 2fbeb42

Browse files
committed
join: Benchmark join with actual Unicode data requiring locale collation
1 parent b9372e5 commit 2fbeb42

File tree

1 file changed

+42
-1
lines changed

1 file changed

+42
-1
lines changed

src/uu/join/benches/join_bench.rs

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ fn join_custom_separator(bencher: Bencher) {
110110
});
111111
}
112112

113-
/// Benchmark join with French locale (fr_FR.UTF-8)
113+
/// Benchmark join with French locale (fr_FR.UTF-8) - ASCII data (fast path)
114114
#[divan::bench]
115115
fn join_french_locale(bencher: Bencher) {
116116
let num_lines = 10000;
@@ -126,6 +126,47 @@ fn join_french_locale(bencher: Bencher) {
126126
});
127127
}
128128

129+
/// Create files with Unicode data that requires locale collation
130+
fn create_unicode_join_files(temp_dir: &TempDir, num_lines: usize) -> (String, String) {
131+
let file1_path = temp_dir.path().join("file1.txt");
132+
let file2_path = temp_dir.path().join("file2.txt");
133+
134+
let mut file1 = File::create(&file1_path).unwrap();
135+
let mut file2 = File::create(&file2_path).unwrap();
136+
137+
// Create data with accented characters that require locale collation
138+
let accented_chars = [
139+
"àbc", "àbd", "abc", "abd", "èfg", "efg", "çar", "car", "öst", "ost",
140+
];
141+
142+
for i in 0..num_lines {
143+
let key = &accented_chars[i % accented_chars.len()];
144+
writeln!(file1, "{key}:{i:06} field1_{i}").unwrap();
145+
writeln!(file2, "{key}:{i:06} data1_{i}").unwrap();
146+
}
147+
148+
(
149+
file1_path.to_str().unwrap().to_string(),
150+
file2_path.to_str().unwrap().to_string(),
151+
)
152+
}
153+
154+
/// Benchmark join with actual Unicode data requiring locale collation
155+
#[divan::bench]
156+
fn join_unicode_locale(bencher: Bencher) {
157+
let num_lines = 1000; // Smaller due to complexity
158+
let temp_dir = TempDir::new().unwrap();
159+
let (file1, file2) = create_unicode_join_files(&temp_dir, num_lines);
160+
161+
bencher
162+
.with_inputs(|| unsafe {
163+
std::env::set_var("LC_ALL", "fr_FR.UTF-8");
164+
})
165+
.bench_values(|_| {
166+
black_box(run_util_function(uumain, &[&file1, &file2]));
167+
});
168+
}
169+
129170
/// Entry point of the benchmark binary; it only calls `divan::main`.
fn main() {
130171
divan::main();
131172
}

0 commit comments

Comments
 (0)