Skip to content

Commit 5d77176

Browse files
committed
Use a branchless binary search for extension<->MIME-Types
This uses the excellent branchless binary search by Orson Peters[1], based on Malte Skarupke's version[2] of Leonard E. Shar's version of a binary search. It's a fascinating implementation that ends up becoming just a very tight loop using CMOV instructions, and finishing with another CMOV instruction. It's so clean that I removed the fast path using STRING_SWITCH as that's not necessary anymore. A nice side effect of this change is that the string "application/octet-stream" doesn't appear in the binary anymore (except in the debug version due to assertions), as it's now part of the compressed blob that mimegen generates. A good follow-up to this commit would be porting the other usage of bsearch() to this implementation. [1] https://orlp.net/blog/bitwise-binary-search/ [2] https://probablydance.com/2023/04/27/beautiful-branchless-binary-search/
1 parent f9eedda commit 5d77176

File tree

2 files changed

+48
-58
lines changed

2 files changed

+48
-58
lines changed

src/bin/tools/mimegen.c

+31-28
Original file line numberDiff line numberDiff line change
@@ -161,29 +161,6 @@ static char *compress_output(const struct output *output, size_t *outlen)
161161
return compressed;
162162
}
163163

164-
static bool is_builtin_ext(const char *ext)
165-
{
166-
/* STRING_SWITCH_L() is not used here to not bring in lwan.h */
167-
/* FIXME: maybe use an X-macro to keep in sync with lwan-tables.c? */
168-
if (strcaseequal_neutral(ext, "css"))
169-
return true;
170-
if (strcaseequal_neutral(ext, "gif"))
171-
return true;
172-
if (strcaseequal_neutral(ext, "htm"))
173-
return true;
174-
if (strcaseequal_neutral(ext, "html"))
175-
return true;
176-
if (strcaseequal_neutral(ext, "jpg"))
177-
return true;
178-
if (strcaseequal_neutral(ext, "js"))
179-
return true;
180-
if (strcaseequal_neutral(ext, "png"))
181-
return true;
182-
if (strcaseequal_neutral(ext, "txt"))
183-
return true;
184-
return false;
185-
}
186-
187164
int main(int argc, char *argv[])
188165
{
189166
/* 32k is sufficient for the provided mime.types, but we can reallocate
@@ -258,11 +235,6 @@ int main(int argc, char *argv[])
258235
ext[8] = '\0';
259236
}
260237

261-
/* Lwan has a fast-path for some common extensions, so don't bundle them
262-
* in this table if not really needed. */
263-
if (is_builtin_ext(ext))
264-
continue;
265-
266238
k = strdup(ext);
267239
v = strdup(mime_type);
268240

@@ -286,6 +258,22 @@ int main(int argc, char *argv[])
286258
}
287259
}
288260

261+
{
262+
char *k = strdup("bin");
263+
char *v = strdup("application/octet-stream");
264+
if (!k || !v) {
265+
fprintf(stderr, "Could not allocate memory\n");
266+
fclose(fp);
267+
return 1;
268+
}
269+
int r = hash_add_unique(ext_mime, k, v);
270+
if (r != 0 && r != -EEXIST) {
271+
fprintf(stderr, "Could not add fallback mime entry\n");
272+
fclose(fp);
273+
return 1;
274+
}
275+
}
276+
289277
/* Get sorted list of extensions. */
290278
exts = calloc(hash_get_count(ext_mime), sizeof(char *));
291279
if (!exts) {
@@ -305,6 +293,7 @@ int main(int argc, char *argv[])
305293
fclose(fp);
306294
return 1;
307295
}
296+
ssize_t bin_index = -1;
308297
for (i = 0; i < hash_get_count(ext_mime); i++) {
309298
uint64_t ext_lower = 0;
310299

@@ -322,6 +311,9 @@ int main(int argc, char *argv[])
322311
fclose(fp);
323312
return 1;
324313
}
314+
315+
if (bin_index < 0 && streq(exts[i], "bin"))
316+
bin_index = (ssize_t)i;
325317
}
326318
for (i = 0; i < hash_get_count(ext_mime); i++) {
327319
if (output_append(&output, hash_find(ext_mime, exts[i])) < 0) {
@@ -331,6 +323,12 @@ int main(int argc, char *argv[])
331323
}
332324
}
333325

326+
if (bin_index < 0) {
327+
fprintf(stderr, "Could not find fallback item after sorting!\n");
328+
fclose(fp);
329+
return 1;
330+
}
331+
334332
/* Compress blob. */
335333
compressed = compress_output(&output, &compressed_size);
336334
if (!compressed) {
@@ -349,10 +347,15 @@ int main(int argc, char *argv[])
349347
#else
350348
printf("/* Compressed with zlib (deflate) */\n");
351349
#endif
350+
351+
unsigned int entries_floor = 1u << (31 - __builtin_clz(hash_get_count(ext_mime)));
352+
352353
printf("#pragma once\n");
353354
printf("#define MIME_UNCOMPRESSED_LEN %zu\n", output.used);
354355
printf("#define MIME_COMPRESSED_LEN %lu\n", compressed_size);
355356
printf("#define MIME_ENTRIES %d\n", hash_get_count(ext_mime));
357+
printf("#define MIME_ENTRIES_FLOOR %d\n", entries_floor);
358+
printf("#define MIME_ENTRY_FALLBACK %ld\n", bin_index);
356359
printf("static const unsigned char mime_entries_compressed[] = {\n");
357360
for (i = 1; compressed_size; compressed_size--, i++)
358361
printf("0x%02x,%c", compressed[i - 1] & 0xff, " \n"[i % 13 == 0]);

src/lib/lwan-tables.c

+17-30
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636

3737
static unsigned char uncompressed_mime_entries[MIME_UNCOMPRESSED_LEN];
3838
static char *mime_types[MIME_ENTRIES];
39+
static uint64_t *mime_extensions;
3940
static bool mime_entries_initialized = false;
4041

4142
void lwan_tables_shutdown(void)
@@ -86,6 +87,7 @@ void lwan_tables_init(void)
8687
mime_types[i] = (char *)ptr;
8788
ptr += strlen((const char *)ptr) + 1;
8889
}
90+
mime_extensions = (uint64_t *)uncompressed_mime_entries;
8991

9092
mime_entries_initialized = true;
9193

@@ -120,34 +122,25 @@ LWAN_SELF_TEST(status_codes)
120122
#undef ASSERT_STATUS
121123
}
122124

123-
static int compare_mime_entry(const void *a, const void *b)
125+
static ALWAYS_INLINE const char *bsearch_mime_type(uint64_t ext)
124126
{
125-
const uint64_t exta = string_as_uint64((const char *)a);
126-
const uint64_t extb = string_as_uint64((const char *)b);
127-
128-
return (exta > extb) - (exta < extb);
127+
/* Based on https://orlp.net/blog/bitwise-binary-search/ */
128+
int64_t b = ext > mime_extensions[MIME_ENTRIES / 2]
129+
? MIME_ENTRIES - MIME_ENTRIES_FLOOR
130+
: -1;
131+
for (uint64_t bit = MIME_ENTRIES_FLOOR >> 1; bit != 0; bit >>= 1) {
132+
if (ext > mime_extensions[b + (int64_t)bit])
133+
b += (int64_t)bit;
134+
}
135+
return mime_types[mime_extensions[b + 1] == ext ? b + 1
136+
: MIME_ENTRY_FALLBACK];
129137
}
130138

131-
const char *
132-
lwan_determine_mime_type_for_file_name(const char *file_name)
139+
const char *lwan_determine_mime_type_for_file_name(const char *file_name)
133140
{
134141
char *last_dot = strrchr(file_name, '.');
135-
if (UNLIKELY(!last_dot))
136-
goto fallback;
137-
138-
STRING_SWITCH_L(last_dot) {
139-
case STR4_INT_L('.','c','s','s'): return "text/css";
140-
case STR4_INT_L('.','g','i','f'): return "image/gif";
141-
case STR4_INT_L('.','h','t','m'): return "text/html";
142-
case STR4_INT_L('.','j','p','g'): return "image/jpeg";
143-
case STR4_INT_L('.','j','s',' '): return "text/javascript";
144-
case STR4_INT_L('.','p','n','g'): return "image/png";
145-
case STR4_INT_L('.','t','x','t'): return "text/plain";
146-
}
147-
148-
if (LIKELY(*last_dot)) {
142+
if (LIKELY(last_dot && *last_dot)) {
149143
uint64_t key = 0;
150-
const unsigned char *extension;
151144

152145
#pragma GCC diagnostic push
153146
#pragma GCC diagnostic ignored "-Wstringop-truncation"
@@ -157,17 +150,11 @@ lwan_determine_mime_type_for_file_name(const char *file_name)
157150
* 8 bytes per extension. */
158151
strncpy((char *)&key, last_dot + 1, 8);
159152
#pragma GCC diagnostic pop
160-
key &= ~0x2020202020202020ull;
161-
key = htobe64(key);
162153

163-
extension = bsearch(&key, uncompressed_mime_entries, MIME_ENTRIES, 8,
164-
compare_mime_entry);
165-
if (LIKELY(extension))
166-
return mime_types[(extension - uncompressed_mime_entries) / 8];
154+
return bsearch_mime_type(htobe64(key & ~0x2020202020202020ull));
167155
}
168156

169-
fallback:
170-
return "application/octet-stream";
157+
return mime_types[MIME_ENTRY_FALLBACK];
171158
}
172159

173160
#include "lookup-http-status.h" /* generated by statuslookupgen */

0 commit comments

Comments
 (0)