1515/**
1616 * Benchmark CharArrayHash vs Lucene CharArrayMap on a large word list.
1717 *
18- * CLI: --file <path> UTF-8 file, one token per line (optional) --target <N>
19- * desired unique tokens (default 600000) --mode <hash|map|both> which
20- * structure(s) to test (default both) --lowercase lowercase tokens before
21- * insert (optional)
22- *
23- * Notes: - Uses Δ(used heap) after GC as a coarse footprint estimator. - For
24- * reproducible results, fix heap: -Xms4g -Xmx4g -XX:+UseG1GC - Requires your
25- * CharArrayHash class on classpath.
18+ * Notes:
19+ * - This version avoids "same char[] instance" lookup bias by using separate
20+ * precomputed probe arrays for hits.
21+ * - Build simulates a copy-required token stream: CharArrayHash copies from a reused buffer
22+ * into its slab, CharArrayMap must allocate a stable char[] per key.
23+ * - Uses Δ(used heap) after GC as a coarse footprint estimator.
2624 */
2725public class CharArrayHashBenchmark
2826{
27+ private static final Random RNG = new Random (0xC0FFEE );
2928
3029 public static void main (String [] args ) throws Exception
3130 {
3231 final String lexicon = "word.csv" ;
3332 InputStream is = French .class .getResourceAsStream (lexicon );
3433 Reader reader = new InputStreamReader (is , StandardCharsets .UTF_8 );
35- List <char []> tokens = loadTokens (reader );
34+
35+ List <String > tokens = loadTokensAsStrings (reader );
3636 System .out .println ("Loaded tokens: " + fmt (tokens .size ()));
3737
38- List <char []> hits = new ArrayList <>(tokens );
39- List <char []> misses = makeMisses (tokens );
38+ // Precompute lookup probes (distinct arrays) so timing excludes allocation.
39+ List <char []> hitProbes = makeHitProbes (tokens );
40+ List <char []> missProbes = makeMissesFromHits (hitProbes );
4041
4142 System .out .println ("\n === Lucene CharArrayMap<Integer> ===" );
42- runMap (tokens , hits , misses );
43+ runMap (tokens , hitProbes , missProbes );
44+
4345 System .out .println ("\n === CharArrayHash ===" );
44- runHash (tokens , hits , misses );
46+ runHash (tokens , hitProbes , missProbes );
4547 }
4648
4749 // ---------------------- CharArrayHash ----------------------
4850
49- private static void runHash (List <char [] > tokens , List <char []> hits , List <char []> misses )
51+ private static void runHash (List <String > tokens , List <char []> hits , List <char []> misses )
5052 {
5153 forceGC ();
5254 long mem0 = usedMem ();
5355 long t0 = System .nanoTime ();
5456
5557 CharArrayHash h = new CharArrayHash (tokens .size ());
58+
59+ // Reused buffer: realistic for token streams (copy-required).
60+ char [] buf = new char [32 ];
5661 int lastOrd = -1 ;
57- for (char [] w : tokens ) {
58- int r = h .add (w , 0 , w .length );
62+
63+ for (String s : tokens ) {
64+ int len = s .length ();
65+ if (buf .length < len ) buf = new char [len ];
66+ s .getChars (0 , len , buf , 0 );
67+ int r = h .add (buf , 0 , len );
5968 lastOrd = (r >= 0 ) ? r : (-r - 1 );
6069 }
6170 h .trimToSize ();
@@ -82,21 +91,26 @@ private static void runHash(List<char[]> tokens, List<char[]> hits, List<char[]>
8291 });
8392
8493 print ("CharArrayHash" , tokens .size (), buildNs , bestHitNs , bestMissNs , bytes );
85- if (lastOrd == 42 ) System .out .print ("" ); // keep reachable
94+ if (lastOrd == 42 ) System .out .print ("" );
8695 }
8796
8897 // ---------------------- CharArrayMap<Integer> ----------------------
8998
90- private static void runMap (List <char [] > tokens , List <char []> hits , List <char []> misses )
99+ private static void runMap (List <String > tokens , List <char []> hits , List <char []> misses )
91100 {
92101 forceGC ();
93102 long mem0 = usedMem ();
94103 long t0 = System .nanoTime ();
95104
96105 CharArrayMap <Integer > map = new CharArrayMap <>(tokens .size (), false );
106+
107+ // Copy-required: store stable char[] keys, so allocate per token.
97108 int id = 0 ;
98- for (char [] w : tokens ) {
99- map .put (w , id ++); // uses char[] API
109+ for (String s : tokens ) {
110+ int len = s .length ();
111+ char [] k = new char [len ];
112+ s .getChars (0 , len , k , 0 );
113+ map .put (k , id ++);
100114 }
101115
102116 long buildNs = System .nanoTime () - t0 ;
@@ -126,27 +140,33 @@ private static void runMap(List<char[]> tokens, List<char[]> hits, List<char[]>
126140 if (id == 7 ) System .out .print ("" );
127141 }
128142
129- // ---------------------- helpers ----------------------
143+ // ---------------------- data prep ----------------------
130144
131- private static final Random RNG = new Random (0xC0FFEE );
132-
133- private static List <char []> loadTokens (Reader reader ) throws IOException
145+ private static List <String > loadTokensAsStrings (Reader reader ) throws IOException
134146 {
135- List <char [] > out = new ArrayList <>(600000 );
147+ List <String > out = new ArrayList <>(600000 );
136148 CSVReader csvReader = new CSVReader (reader , ',' , 1 , '"' , 16384 );
137- // header line
138149 if (!csvReader .readRow ()) throw new IOException ("Empty file?" );
139150 while (csvReader .readRow ()) {
140- final int wordLength = csvReader .getCell (0 ).length ();
141- final char [] word = new char [wordLength ];
142- csvReader .getCell (0 ).getChars (0 , wordLength , word , 0 );
143- out .add (word );
151+ out .add (csvReader .getCellAsString (0 ));
144152 }
145153 Collections .shuffle (out , RNG );
146154 return out ;
147155 }
148156
149- private static List <char []> makeMisses (List <char []> hits )
157+ private static List <char []> makeHitProbes (List <String > tokens )
158+ {
159+ List <char []> hits = new ArrayList <>(tokens .size ());
160+ for (String s : tokens ) {
161+ int len = s .length ();
162+ char [] w = new char [len ];
163+ s .getChars (0 , len , w , 0 );
164+ hits .add (w );
165+ }
166+ return hits ;
167+ }
168+
169+ private static List <char []> makeMissesFromHits (List <char []> hits )
150170 {
151171 List <char []> misses = new ArrayList <>(hits .size ());
152172 for (char [] w : hits ) {
@@ -158,6 +178,8 @@ private static List<char[]> makeMisses(List<char[]> hits)
158178 return misses ;
159179 }
160180
181+ // ---------------------- helpers ----------------------
182+
161183 private static void forceGC ()
162184 {
163185 try {
@@ -204,11 +226,9 @@ private static void print(String name, int n, long buildNs, long hitNs, long mis
204226 System .out .println ("Name: " + name );
205227 System .out .println ("Entries: " + fmt (n ));
206228 System .out .println ("Build time: " + String .format (Locale .ROOT , "%.2f ms" , buildMs ));
207- System .out .println (
208- "Hit lookup: " + String .format (Locale .ROOT , "%.2f ms" , hitMs ) + " (" + fmtQps (hitQps ) + " qps)" );
209- System .out .println (
210- "Miss lookup: " + String .format (Locale .ROOT , "%.2f ms" , missMs ) + " (" + fmtQps (missQps ) + " qps)" );
211- System .out .println ("Heap Δ (bytes): " + fmt (bytes ));
229+ System .out .println ("Hit lookup: " + String .format (Locale .ROOT , "%.2f ms" , hitMs ) + " (" + fmtQps (hitQps ) + " qps)" );
230+ System .out .println ("Miss lookup: " + String .format (Locale .ROOT , "%.2f ms" , missMs ) + " (" + fmtQps (missQps ) + " qps)" );
231+ System .out .println ("Heap \u0394 (bytes): " + fmt (bytes ));
212232 System .out .println ("Bytes/entry: " + (Double .isNaN (bpe ) ? "n/a" : String .format (Locale .ROOT , "%.2f" , bpe )));
213233 }
214234
@@ -223,5 +243,4 @@ private static String fmtQps(double qps)
223243 if (qps >= 1_000 ) return String .format (Locale .ROOT , "%.1fk" , qps );
224244 return String .format (Locale .ROOT , "%.0f" , qps );
225245 }
226-
227246}
0 commit comments