
Commit b84390d (parent: e042c48)

More perf adjustments.

4 files changed: 26 additions & 10 deletions

lib/marin/src/marin/processing/classification/deduplication/fuzzy.py

Lines changed: 5 additions & 1 deletion
@@ -95,7 +95,11 @@ def compute_minhash_lsh_batches(batch: pa.RecordBatch) -> Iterator[dict]:
             doc_id_val = doc_id.as_py()
             for b in doc_buckets.as_py():
                 counters.increment("minhash/buckets")
-                yield {"bucket": str(b), "id": doc_id_val}
+                # Reinterpret u64 as signed int64 so Arrow infers int64 instead of
+                # failing on values >= 2^63. The bucket is only a grouping key so
+                # the sign bit doesn't matter.
+                bucket = b if b < (1 << 63) else b - (1 << 64)
+                yield {"bucket": bucket, "id": doc_id_val}
 
     ctx = ZephyrContext(
         name="fuzzy-dedup",

lib/zephyr/src/zephyr/plan.py

Lines changed: 3 additions & 3 deletions
@@ -291,11 +291,11 @@ def _merge_key(row: dict) -> Any:
 
     is_gen = inspect.isgeneratorfunction(reducer_fn)
     for start, end, key_value in _find_group_boundaries(key_col):
-        group_items = item_col[start:end].to_pylist()
+        group_items = (item_col[i].as_py() for i in range(start, end))
         if is_gen:
-            yield from reducer_fn(key_value, iter(group_items))
+            yield from reducer_fn(key_value, group_items)
         else:
-            yield reducer_fn(key_value, iter(group_items))
+            yield reducer_fn(key_value, group_items)
 
 
 def _reduce_gen(
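
The change matters when groups are large: the old slice-plus-`to_pylist()` path converts every element of a group to a Python object before the reducer sees any of them, while the generator converts lazily, one `.as_py()` call per element actually consumed. A small illustration with toy data and a hypothetical `first_item` reducer (neither is from plan.py):

```python
import pyarrow as pa

item_col = pa.array(["a", "b", "c", "d", "e"])
start, end = 1, 4  # one group's boundaries within the sorted column

# Old behavior: materialize the whole group up front.
eager = item_col[start:end].to_pylist()  # ['b', 'c', 'd']

# New behavior: convert elements one at a time, only when the reducer asks.
lazy = (item_col[i].as_py() for i in range(start, end))

def first_item(key, items):
    # Hypothetical reducer that only needs the first element of its group.
    return {"key": key, "first": next(iter(items))}

print(first_item("g1", lazy))  # {'key': 'g1', 'first': 'b'}
```

Since the generator is already an iterator, the `iter(...)` wrapper around `group_items` is also dropped in the calls to `reducer_fn`.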

lib/zephyr/src/zephyr/shuffle.py

Lines changed: 2 additions & 2 deletions
@@ -656,7 +656,7 @@ def _ensure_writer(chunk_schema: pa.Schema) -> pa.Schema:
         seg_file = _segment_path(parquet_path, seg_idx)
         seg_paths.append(seg_file)
         ensure_parent_dir(seg_file)
-        writer = pq.ParquetWriter(seg_file, schema)
+        writer = pq.ParquetWriter(seg_file, schema, compression="zstd", compression_level=1)
     elif chunk_schema != schema:
         _flush_pending()
         writer.close()
@@ -668,7 +668,7 @@ def _ensure_writer(chunk_schema: pa.Schema) -> pa.Schema:
         seg_file = _segment_path(parquet_path, seg_idx)
         seg_paths.append(seg_file)
         ensure_parent_dir(seg_file)
-        writer = pq.ParquetWriter(seg_file, schema)
+        writer = pq.ParquetWriter(seg_file, schema, compression="zstd", compression_level=1)
         logger.info(
             "[shard %d] Schema evolved after %d chunks; starting segment %d",
             source_shard,
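
For context, a minimal sketch of the new writer configuration (the file name and toy table are illustrative, not from shuffle.py). zstd at level 1 typically shrinks shuffle segments noticeably compared with the snappy default while keeping encode speed high, which suits files that are written once and read back once:

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({
    "bucket": pa.array([1, -5, 7], type=pa.int64()),
    "id": ["doc-0", "doc-1", "doc-2"],
})

# Low-level zstd: cheap to encode, still a solid size win for intermediate data.
writer = pq.ParquetWriter("segment-000.parquet", table.schema,
                          compression="zstd", compression_level=1)
writer.write_table(table)
writer.close()
```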

rust/dupekit/src/minhash_ops.rs

Lines changed: 16 additions & 4 deletions
@@ -4,17 +4,23 @@ use pyo3::prelude::*;
 use rand::{Rng, SeedableRng};
 use rand_pcg::Pcg64;
 use regex::Regex;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};
 use xxhash_rust::xxh3;
 
+static WHITESPACE_RE: OnceLock<Regex> = OnceLock::new();
+
+fn whitespace_regex() -> &'static Regex {
+    WHITESPACE_RE.get_or_init(|| Regex::new(r"\s+").unwrap())
+}
+
 /// Clean text using the SlimPajama text cleaning process.
 /// 1. Lowercase
 /// 2. Remove punctuation
 /// 3. Replace multiple whitespace with single space
 /// 4. Trim
 pub fn clean_text(arr: &StringArray) -> PyResult<Arc<StringArray>> {
     let mut builder = StringBuilder::with_capacity(arr.len(), arr.len() * 50);
-    let whitespace_re = Regex::new(r"\s+").map_err(|e| PyValueError::new_err(e.to_string()))?;
+    let whitespace_re = whitespace_regex();
     let punctuation: &[char] = &[
         '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<',
         '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~',
@@ -73,9 +79,15 @@ pub fn compute_minhash(
         let hash = xxh3::xxh3_64(text.as_bytes()) as u128;
         update_signature(&mut signature, hash, &coeffs);
     } else {
+        // Reusable buffer for encoding char windows to bytes, avoiding
+        // a String allocation per ngram.
+        let mut ngram_buf = Vec::with_capacity(ngram_size * 4);
         for window in chars.windows(ngram_size) {
-            let s: String = window.iter().collect();
-            let hash = xxh3::xxh3_64(s.as_bytes()) as u128;
+            ngram_buf.clear();
+            for &ch in window {
+                ngram_buf.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes());
+            }
+            let hash = xxh3::xxh3_64(&ngram_buf) as u128;
             update_signature(&mut signature, hash, &coeffs);
         }
     }
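
The same compile-once idea is easy to see outside Rust. A rough Python analogue (not the dupekit API; `clean_text` here is a toy re-implementation of the documented cleanup steps): the regex is built once at module scope and every call reuses it, just as the `OnceLock` keeps a single compiled `\s+` matcher for the whole process.

```python
import re
import string

# Compiled once at import time; every clean_text call reuses the same matcher.
_WHITESPACE_RE = re.compile(r"\s+")

def clean_text(text: str) -> str:
    # SlimPajama-style cleanup: lowercase, drop punctuation, collapse whitespace, trim.
    no_punct = text.lower().translate(str.maketrans("", "", string.punctuation))
    return _WHITESPACE_RE.sub(" ", no_punct).strip()

print(clean_text("Hello,   WORLD!!"))  # hello world
```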
