Skip to content

Commit eb4859d

Browse files
authored
Merge pull request #378 from Shopify/key-cache-2
Unpacker: `key_cache` option.
2 parents 83a2600 + f8bc6da commit eb4859d

File tree

6 files changed

+188
-8
lines changed

6 files changed

+188
-8
lines changed

doclib/msgpack/unpacker.rb

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ class Unpacker
1919
# Supported options:
2020
#
2121
# * *:symbolize_keys* deserialize keys of Hash objects as Symbol instead of String
22+
# * *:freeze* freeze the deserialized objects. Can allow string deduplication and some allocation elision.
23+
# * *:key_cache* Enable caching of map keys. This can improve performance significantly if the same map keys are frequently encountered, but can also degrade performance if that's not the case.
2224
# * *:allow_unknown_ext* allow to deserialize ext type object with unknown type id as ExtensionValue instance. Otherwise (by default), unpacker throws UnknownExtTypeError.
2325
#
2426
# See also Buffer#initialize for other options.

ext/msgpack/buffer.h

+127
Original file line numberDiff line numberDiff line change
@@ -473,4 +473,131 @@ static inline VALUE msgpack_buffer_read_top_as_symbol(msgpack_buffer_t* b, size_
473473
return rb_str_intern(msgpack_buffer_read_top_as_string(b, length, true, utf8));
474474
}
475475

476+
// Hash keys are likely to be repeated, and are frozen.
477+
// As such we can re-use them if we keep a cache of the ones we've seen so far,
478+
// and save much more expensive lookups into the global fstring table.
479+
// This cache implementation is deliberately simple, as we're optimizing for compactness,
480+
// to be able to fit easily embedded inside msgpack_unpacker_t.
481+
// As such, binary search into a sorted array gives a good tradeoff between compactness and
482+
// performance.
483+
#define MSGPACK_KEY_CACHE_CAPACITY 63
484+
485+
typedef struct msgpack_key_cache_t msgpack_key_cache_t;
486+
struct msgpack_key_cache_t {
487+
int length;
488+
VALUE entries[MSGPACK_KEY_CACHE_CAPACITY];
489+
};
490+
491+
static inline VALUE build_interned_string(const char *str, const long length)
492+
{
493+
# ifdef HAVE_RB_ENC_INTERNED_STR
494+
return rb_enc_interned_str(str, length, rb_utf8_encoding());
495+
# else
496+
VALUE rstring = rb_utf8_str_new(str, length);
497+
return rb_funcall(rb_str_freeze(rstring), s_uminus, 0);
498+
# endif
499+
}
500+
501+
/*
 * Build a Ruby Symbol from `length` bytes starting at `str`, going through
 * an interned UTF-8 string.
 */
static inline VALUE build_symbol(const char *str, const long length)
{
    VALUE name = build_interned_string(str, length);
    return rb_str_intern(name);
}
505+
506+
/*
 * Insert `rstring` at position `index`, shifting the entries from `index`
 * onwards one slot to the right.
 *
 * No capacity check is performed here: callers must make sure that
 * cache->length is below MSGPACK_KEY_CACHE_CAPACITY before calling.
 */
static void rvalue_cache_insert_at(msgpack_key_cache_t *cache, int index, VALUE rstring)
{
    VALUE *slot = &cache->entries[index];
    const int tail_count = cache->length - index;

    /* Regions overlap, hence MEMMOVE rather than a plain copy. */
    MEMMOVE(slot + 1, slot, VALUE, tail_count);
    *slot = rstring;
    cache->length++;
}
512+
513+
/*
 * Three-way comparison between the raw bytes (`str`, `length`) and the Ruby
 * string `rstring`, used to order the key cache.
 *
 * Strings are ordered by length first, and by memcmp() for equal lengths.
 * Returns a negative value, zero, or a positive value; only the sign is
 * meaningful.
 *
 * The length difference is a `long`; narrowing it to `int` could truncate
 * and flip its sign for very large differences, so the sign is returned
 * explicitly instead of the difference itself.
 */
static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
{
    const long rstring_length = RSTRING_LEN(rstring);

    if (length != rstring_length) {
        return (length < rstring_length) ? -1 : 1;
    }
    return memcmp(str, RSTRING_PTR(rstring), length);
}
522+
523+
/*
 * Return the interned string for the bytes (`str`, `length`), using `cache`.
 *
 * The cache is binary-searched; on a hit the cached string is returned.
 * On a miss a new interned string is built and, if the cache still has room,
 * inserted at the position the search ended on. When the cache is full the
 * new string is returned without being cached.
 */
static VALUE rstring_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
    int first = 0;
    int last = cache->length - 1;

    while (first <= last) {
        const int probe = (first + last) / 2;
        const VALUE candidate = cache->entries[probe];
        const int order = rstring_cache_cmp(str, length, candidate);

        if (order == 0) {
            return candidate;
        }
        if (order > 0) {
            first = probe + 1;
        } else {
            last = probe - 1;
        }
    }

    /* Miss: when the loop exits, `first` is the index the key belongs at. */
    const VALUE interned = build_interned_string(str, length);

    if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
        rvalue_cache_insert_at(cache, first, interned);
    }
    return interned;
}
555+
556+
/*
 * Return the Symbol for the bytes (`str`, `length`), using `cache`.
 *
 * Cached symbols are compared through their string form (rb_sym2str).
 * On a hit the cached symbol is returned. On a miss a new symbol is built
 * and, if the cache still has room, inserted at the position the search
 * ended on. When the cache is full the symbol is returned without being
 * cached.
 */
static VALUE rsymbol_cache_fetch(msgpack_key_cache_t *cache, const char *str, const long length)
{
    int first = 0;
    int last = cache->length - 1;

    while (first <= last) {
        const int probe = (first + last) / 2;
        const VALUE candidate = cache->entries[probe];
        const int order = rstring_cache_cmp(str, length, rb_sym2str(candidate));

        if (order == 0) {
            return candidate;
        }
        if (order > 0) {
            first = probe + 1;
        } else {
            last = probe - 1;
        }
    }

    /* Miss: when the loop exits, `first` is the index the key belongs at. */
    const VALUE symbol = build_symbol(str, length);

    if (cache->length < MSGPACK_KEY_CACHE_CAPACITY) {
        rvalue_cache_insert_at(cache, first, symbol);
    }
    return symbol;
}
588+
589+
/*
 * Read `length` bytes from the head of buffer `b` as a Symbol, going through
 * the key cache, then mark those bytes as consumed.
 *
 * The lookup must happen before the bytes are consumed, since it reads them
 * directly from b->read_buffer.
 */
static inline VALUE msgpack_buffer_read_top_as_interned_symbol(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
    VALUE symbol = rsymbol_cache_fetch(cache, b->read_buffer, length);

    _msgpack_buffer_consumed(b, length);
    return symbol;
}
595+
596+
/*
 * Read `length` bytes from the head of buffer `b` as an interned string,
 * going through the key cache, then mark those bytes as consumed.
 *
 * The lookup must happen before the bytes are consumed, since it reads them
 * directly from b->read_buffer.
 */
static inline VALUE msgpack_buffer_read_top_as_interned_string(msgpack_buffer_t* b, msgpack_key_cache_t *cache, size_t length)
{
    VALUE string = rstring_cache_fetch(cache, b->read_buffer, length);

    _msgpack_buffer_consumed(b, length);
    return string;
}
602+
476603
#endif

ext/msgpack/extconf.rb

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
have_func("rb_enc_interned_str", "ruby.h") # Ruby 3.0+
44
have_func("rb_hash_new_capa", "ruby.h") # Ruby 3.2+
55
have_func("rb_proc_call_with_block", "ruby.h") # CRuby (TruffleRuby doesn't have it)
6+
have_func("rb_gc_mark_locations", "ruby.h") # Missing on TruffleRuby
67

78
append_cflags([
89
"-fvisibility=hidden",

ext/msgpack/unpacker.c

+41-4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,19 @@
2626
#define rb_proc_call_with_block(recv, argc, argv, block) rb_funcallv(recv, rb_intern("call"), argc, argv)
2727
#endif
2828

29+
#ifndef HAVE_RB_GC_MARK_LOCATIONS
30+
// For TruffleRuby
31+
/*
 * Fallback for Ruby implementations that lack rb_gc_mark_locations
 * (TruffleRuby): mark every VALUE in the half-open range [start, end).
 *
 * The cursor is a pointer-to-const so that the const qualifier of `start`
 * is not discarded.
 */
void rb_gc_mark_locations(const VALUE *start, const VALUE *end)
{
    for (const VALUE *value = start; value < end; value++) {
        rb_gc_mark(*value);
    }
}
40+
#endif
41+
2942
struct protected_proc_call_args {
3043
VALUE proc;
3144
int argc;
@@ -130,11 +143,18 @@ void msgpack_unpacker_mark_stack(msgpack_unpacker_stack_t* stack)
130143
}
131144
}
132145

146+
/*
 * GC mark hook for the key cache: marks the `length` entries currently in
 * use so the cached keys are kept alive.
 */
void msgpack_unpacker_mark_key_cache(msgpack_key_cache_t *cache)
{
    const VALUE *first = cache->entries;
    const VALUE *past_last = first + cache->length;

    rb_gc_mark_locations(first, past_last);
}
151+
133152
void msgpack_unpacker_mark(msgpack_unpacker_t* uk)
134153
{
135154
rb_gc_mark(uk->last_object);
136155
rb_gc_mark(uk->reading_raw);
137156
msgpack_unpacker_mark_stack(&uk->stack);
157+
msgpack_unpacker_mark_key_cache(&uk->key_cache);
138158
/* See MessagePack_Buffer_wrap */
139159
/* msgpack_buffer_mark(UNPACKER_BUFFER_(uk)); */
140160
rb_gc_mark(uk->buffer_ref);
@@ -374,15 +394,32 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type)
374394
size_t length = uk->reading_raw_remaining;
375395
if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) {
376396
int ret;
377-
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) {
397+
if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type)) {
378398
VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY);
379399
ret = object_complete_symbol(uk, symbol);
400+
} else if (is_reading_map_key(uk) && raw_type == RAW_TYPE_STRING) {
401+
/* don't use zerocopy for hash keys but get a frozen string directly
402+
* because rb_hash_aset freezes keys and it causes copying */
403+
VALUE key;
404+
if (uk->symbolize_keys) {
405+
if (uk->use_key_cache) {
406+
key = msgpack_buffer_read_top_as_interned_symbol(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
407+
} else {
408+
key = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, true);
409+
}
410+
ret = object_complete_symbol(uk, key);
411+
} else {
412+
if (uk->use_key_cache) {
413+
key = msgpack_buffer_read_top_as_interned_string(UNPACKER_BUFFER_(uk), &uk->key_cache, length);
414+
} else {
415+
key = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, true, true);
416+
}
417+
418+
ret = object_complete(uk, key);
419+
}
380420
} else {
381421
bool will_freeze = uk->freeze;
382422
if(raw_type == RAW_TYPE_STRING || raw_type == RAW_TYPE_BINARY) {
383-
/* don't use zerocopy for hash keys but get a frozen string directly
384-
* because rb_hash_aset freezes keys and it causes copying */
385-
will_freeze = will_freeze || is_reading_map_key(uk);
386423
VALUE string = msgpack_buffer_read_top_as_string(UNPACKER_BUFFER_(uk), length, will_freeze, raw_type == RAW_TYPE_STRING);
387424
ret = object_complete(uk, string);
388425
} else {

ext/msgpack/unpacker.h

+12-4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ struct msgpack_unpacker_stack_t {
5050
struct msgpack_unpacker_t {
5151
msgpack_buffer_t buffer;
5252
msgpack_unpacker_stack_t stack;
53+
msgpack_key_cache_t key_cache;
5354

5455
VALUE self;
5556
VALUE last_object;
@@ -66,10 +67,12 @@ struct msgpack_unpacker_t {
6667

6768
/* options */
6869
int symbol_ext_type;
69-
bool symbolize_keys;
70-
bool freeze;
71-
bool allow_unknown_ext;
72-
bool optimized_symbol_ext_type;
70+
71+
bool use_key_cache: 1;
72+
bool symbolize_keys: 1;
73+
bool freeze: 1;
74+
bool allow_unknown_ext: 1;
75+
bool optimized_symbol_ext_type: 1;
7376
};
7477

7578
#define UNPACKER_BUFFER_(uk) (&(uk)->buffer)
@@ -101,6 +104,11 @@ static inline void msgpack_unpacker_set_symbolized_keys(msgpack_unpacker_t* uk,
101104
uk->symbolize_keys = enable;
102105
}
103106

107+
/*
 * Turn the map key cache of unpacker `uk` on or off.
 */
static inline void msgpack_unpacker_set_key_cache(msgpack_unpacker_t* uk, bool enable)
{
    /* use_key_cache is a 1-bit bool field; `enable` is stored as-is. */
    uk->use_key_cache = enable;
}
111+
104112
static inline void msgpack_unpacker_set_freeze(msgpack_unpacker_t* uk, bool enable)
105113
{
106114
uk->freeze = enable;

ext/msgpack/unpacker_class.c

+5
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ static VALUE eUnknownExtTypeError;
3434
static VALUE mTypeError; // obsoleted. only for backward compatibility. See #86.
3535

3636
static VALUE sym_symbolize_keys;
37+
static VALUE sym_key_cache;
3738
static VALUE sym_freeze;
3839
static VALUE sym_allow_unknown_ext;
3940

@@ -128,6 +129,9 @@ VALUE MessagePack_Unpacker_initialize(int argc, VALUE* argv, VALUE self)
128129
if(options != Qnil) {
129130
VALUE v;
130131

132+
v = rb_hash_aref(options, sym_key_cache);
133+
msgpack_unpacker_set_key_cache(uk, RTEST(v));
134+
131135
v = rb_hash_aref(options, sym_symbolize_keys);
132136
msgpack_unpacker_set_symbolized_keys(uk, RTEST(v));
133137

@@ -413,6 +417,7 @@ void MessagePack_Unpacker_module_init(VALUE mMessagePack)
413417
eUnknownExtTypeError = rb_define_class_under(mMessagePack, "UnknownExtTypeError", eUnpackError);
414418

415419
sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
420+
sym_key_cache = ID2SYM(rb_intern("key_cache"));
416421
sym_freeze = ID2SYM(rb_intern("freeze"));
417422
sym_allow_unknown_ext = ID2SYM(rb_intern("allow_unknown_ext"));
418423

0 commit comments

Comments
 (0)