From 8abee2eeb6896608174ae793f9e6870e1f6092de Mon Sep 17 00:00:00 2001 From: Alan Wu Date: Fri, 23 Nov 2018 14:22:42 -0500 Subject: [PATCH] Hash code memoization for short fstrings For short enough fstrings, we have extra space inside the RString struct we could use to remember the hash code. Applications that work with JSON or YAML benefit from this optimization as they often use string literals as hash keys. The popular ORM ActiveRecord also uses fstring internally and thus would benefit from this optimization. This also provides yet another incentive for users to turn on frozen string literals. * string.c: memoize hash code for short, embedded fstrings after the embedded buffer. * hash_aref_fstr.rb: benchmark demonstrating the best case scenario. About 20% faster with this commit. * hash_aref_long_str.rb: benchmark demonstrating the worst case scenario for this optimization. It shows that this doesn't have visible performance impact for strings that don't benefit from the optimization. * freeze_unique_strings.yml: benchmark that creates many different unique fstrings. We memoize the hash for each string on top of what we already do before this commit, but there is no visible performance impact. --- benchmark/freeze_unique_strings.yml | 7 +++++++ benchmark/hash_aref_fstr.rb | 4 ++++ benchmark/hash_aref_long_str.rb | 4 ++++ string.c | 23 ++++++++++++++++++++++- 4 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 benchmark/freeze_unique_strings.yml create mode 100644 benchmark/hash_aref_fstr.rb create mode 100644 benchmark/hash_aref_long_str.rb diff --git a/benchmark/freeze_unique_strings.yml b/benchmark/freeze_unique_strings.yml new file mode 100644 index 0000000000..af6f4260b1 --- /dev/null +++ b/benchmark/freeze_unique_strings.yml @@ -0,0 +1,7 @@ +prelude: | + str = +"1000000" +benchmark: + freeze_unique_strings: | + str.succ! 
+ -str +loop_count: 1000000 # freeze this many unique strings diff --git a/benchmark/hash_aref_fstr.rb b/benchmark/hash_aref_fstr.rb new file mode 100644 index 0000000000..3cb283f0c3 --- /dev/null +++ b/benchmark/hash_aref_fstr.rb @@ -0,0 +1,4 @@ +h = {} +strs = ('a'..'z').to_a.map!(&:-@) +strs.each { |s| h[s] = s } +200_000.times { strs.each { |s| h[s] } } diff --git a/benchmark/hash_aref_long_str.rb b/benchmark/hash_aref_long_str.rb new file mode 100644 index 0000000000..d96cf7fdb5 --- /dev/null +++ b/benchmark/hash_aref_long_str.rb @@ -0,0 +1,4 @@ +h = {} +strs = ['a' * 100] * 10 +strs.each { |s| h[s] = s } +200_000.times { strs.each { |s| h[s] } } diff --git a/string.c b/string.c index e55c59136a..e1aa88ce04 100644 --- a/string.c +++ b/string.c @@ -100,6 +100,20 @@ VALUE rb_cSymbol; RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\ } while (0) +#define MEMO_HASH_LEN_MAX (int)(sizeof(((struct RString*)0)->as) - sizeof(st_index_t) - 1) +struct memoized_hash_embeded_str { + char ary[MEMO_HASH_LEN_MAX + 1]; + st_index_t memoized_hash; +}; +STATIC_ASSERT(memoized_hash_type_punning, sizeof(struct memoized_hash_embeded_str) == sizeof(((struct RString*)0)->as)); +#define SET_HASH_MEMO(str, hash) do { \ + struct RString *rstr = (struct RString*) (str);\ + ((struct memoized_hash_embeded_str*) &rstr->as)->memoized_hash = hash;\ +} while (0) +#define GET_HASH_MEMO(str) (((struct memoized_hash_embeded_str*) &(((struct RString*) (str))->as))->memoized_hash) +#define CAN_MEMO_HASH(str) \ + (RB_FL_TEST_RAW((str), RSTRING_NOEMBED | RUBY_FL_FREEZE) == RUBY_FL_FREEZE && RSTRING_EMBED_LEN(str) <= MEMO_HASH_LEN_MAX) + #define STR_SET_LEN(str, n) do { \ if (STR_EMBED_P(str)) {\ STR_SET_EMBED_LEN((str), (n));\ @@ -298,6 +312,9 @@ fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existi str = str_new_frozen(rb_cString, str); } } + if (CAN_MEMO_HASH(str)) { + SET_HASH_MEMO(str, rb_str_hash(str)); + } RBASIC(str)->flags |= RSTRING_FSTR; *key = *value = 
*fstr = str; @@ -3133,7 +3150,11 @@ rb_str_prepend_multi(int argc, VALUE *argv, VALUE str) st_index_t rb_str_hash(VALUE str) { - int e = ENCODING_GET(str); + int e; + if (FL_TEST(str, RSTRING_FSTR) && CAN_MEMO_HASH(str)) { + return GET_HASH_MEMO(str); + } + e = ENCODING_GET(str); if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) { e = 0; } -- 2.19.1