Feature #12463
ruby lacks plus-plus
Description
From c0f0d0c5f5c58d56fec95ca4303a0f5db5b54d56 Mon Sep 17 00:00:00 2001
From: "Urabe, Shyouhei" shyouhei@ruby-lang.org
Date: Mon, 6 Jun 2016 17:07:11 +0900
Subject: [PATCH 1/1] ruby lacks plus-plus operation.
This commit adds opt_plusplus VM instruction. Before this, a ruby
snippet foo += 1
was converted to up to 4 VM instructions. By
unifying these instructions into one, we can avoid unnecessary stack
manipulations and can issue dedicated machine instruction, if any.
This speeds up things. A set of microbenchmark results shows that it
gains up to 180% boost on a very tight loop.
raw data:
[["loop_whileloop", [[0.624962, 0.579907, 0.562363, 0.579649, 0.589442, 0.595911, 0.583657], [0.317089, 0.337159, 0.311376, 0.311721, 0.317585, 0.310999, 0.313107]]], ["loop_whileloop2", [[0.129038, 0.137306, 0.13944, 0.121268, 0.120862, 0.121065, 0.135851], [0.072271, 0.074998, 0.080031, 0.078602, 0.07527, 0.07627, 0.072029]]]]
Elapsed time: 7.775304 (sec)
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name | trunk | ours |
---|---|---|
loop_whileloop | 0.562 | 0.311 |
loop_whileloop2 | 0.121 | 0.072 |
Speedup ratio: compare with the result of `trunk' (greater is better)
name | ours |
---|---|
loop_whileloop | 1.808 |
loop_whileloop2 | 1.678 |
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.
raw data:
[["optcarrot", [[30.403074843558894, 30.310258944580042, 32.71944350007326], [31.959367571755536, 31.721515210358586, 32.26915908959967]]]]
* configure.in (__builtin_add_overflow): check existence of this
compiler intrinsic which is available in recent gcc and clang,
with expectedly-optimal implementation of overflow detection.
* insns.def (opt_plusplus): new instruction. This unified
instruction is expected to appear when some local variable is
`+=1`ed. That was converted to 4 separate instructions before
(load variable, load immediate one, call plus, store variable).
* insns.def (opt_plus): refactor move implementation to share with
opt_plusplus.
* vm_insnhelper.c (opt_plus_func): moved to here.
* compile.c (iseq_peephole_optimize): generate opt_plusplus when
possible.
Signed-off-by: Urabe, Shyouhei <shyouhei@ruby-lang.org>
---
compile.c | 46 ++++++++++++++++++++++++
configure.in | 1 +
insns.def | 109 +++++++++++++++++++++++++++++++++++---------------------
vm_insnhelper.c | 62 ++++++++++++++++++++++++++++++++
4 files changed, 177 insertions(+), 41 deletions(-)
diff --git a/compile.c b/compile.c
index cc496cb..6d99d0b 100644
--- a/compile.c
+++ b/compile.c
@@ -2254,6 +2254,52 @@ iseq_peephole_optimize(rb_iseq_t *iseq, LINK_ELEMENT *list, const int do_tailcal
}
}
+ if (IS_INSN_ID(iobj, getlocal)) {
+ /*
+ * getlocal_OP__WC__0
+ * putobject_OP_INT2FIX_O_1_C_
+ * opt_plus
+ * setlocal_OP__WC__0
+ * =>
+ * opt_plusplus
+ */
+ INSN *nobj = (INSN *)iobj->link.next;
+ VALUE idx = OPERAND_AT(iobj, 0);
+ VALUE lev = OPERAND_AT(iobj, 1);
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, putobject)) {
+ if(OPERAND_AT(nobj, 0) == INT2FIX(1)) {
+ nobj = (INSN * )nobj->link.next;
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, send)) {
+ struct rb_call_info *ci = (void *)OPERAND_AT(nobj, 0);
+ struct rb_call_cache *cc = (void *)OPERAND_AT(nobj, 1);
+ if ((ci->flag & VM_CALL_ARGS_SIMPLE) &&
+ (ci->orig_argc == 1) &&
+ (ci->mid == idPLUS)) {
+ nobj = (INSN *)nobj->link.next;
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, setlocal)) {
+ if ((OPERAND_AT(nobj, 0) == idx) &&
+ (OPERAND_AT(nobj, 1) == lev)) {
+ INSN *opt_plusplus = new_insn_body(
+ iseq, iobj->line_no, BIN(opt_plusplus),
+ 4, idx, lev, (VALUE)ci, (VALUE)cc);
+ nobj = (INSN * )nobj->link.next;
+ REMOVE_ELEM(nobj->link.prev->prev->prev->prev);
+ REMOVE_ELEM(nobj->link.prev->prev->prev);
+ REMOVE_ELEM(nobj->link.prev->prev);
+ REMOVE_ELEM(nobj->link.prev);
+ INSERT_ELEM_PREV(
+ &nobj->link, (LINK_ELEMENT *)opt_plusplus);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
if (do_tailcallopt &&
(IS_INSN_ID(iobj, send) ||
IS_INSN_ID(iobj, opt_aref_with) ||
diff --git a/configure.in b/configure.in
index 063d839..e0b7a91 100644
--- a/configure.in
+++ b/configure.in
@@ -2480,6 +2480,7 @@ if test x$rb_cv_builtin___builtin_choose_expr = xyes; then
])
fi
RUBY_CHECK_BUILTIN_FUNC(__builtin_types_compatible_p, [__builtin_types_compatible_p(int, int)])
+RUBY_CHECK_BUILTIN_FUNC(__builtin_add_overflow, [__builtin_add_overflow(0, 0, (int*)0)])
if test "$ac_cv_func_qsort_r" != no; then
AC_CACHE_CHECK(whether qsort_r is GNU version, rb_cv_gnu_qsort_r,
diff --git a/insns.def b/insns.def
index cdc8287..1438e29 100644
--- a/insns.def
+++ b/insns.def
@@ -1377,47 +1377,7 @@ opt_plus
(VALUE recv, VALUE obj)
(VALUE val)
{
- if (FIXNUM_2_P(recv, obj) &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
- /* fixnum + fixnum */
-#ifndef LONG_LONG_VALUE
- val = (recv + (obj & (~1)));
- if ((~(recv ^ obj) & (recv ^ val)) &
- ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
- val = rb_big_plus(rb_int2big(FIX2LONG(recv)),
- rb_int2big(FIX2LONG(obj)));
- }
-#else
- long a, b, c;
- a = FIX2LONG(recv);
- b = FIX2LONG(obj);
- c = a + b;
- val = LONG2NUM(c);
-#endif
- }
- else if (FLONUM_2_P(recv, obj) &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
- val = DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
- }
- else if (!SPECIAL_CONST_P(recv) && !SPECIAL_CONST_P(obj)) {
- if (RBASIC_CLASS(recv) == rb_cFloat && RBASIC_CLASS(obj) == rb_cFloat &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
- val = DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
- }
- else if (RBASIC_CLASS(recv) == rb_cString && RBASIC_CLASS(obj) == rb_cString &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
- val = rb_str_plus(recv, obj);
- }
- else if (RBASIC_CLASS(recv) == rb_cArray &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
- val = rb_ary_plus(recv, obj);
- }
- else {
- goto INSN_LABEL(normal_dispatch);
- }
- }
- else {
- INSN_LABEL(normal_dispatch):
+ if ((val = opt_plus_func(recv, obj)) == Qundef) {
PUSH(recv);
PUSH(obj);
CALL_SIMPLE_METHOD(recv);
@@ -2163,6 +2123,73 @@ opt_call_c_function
}
/**
+ @c optimize
+ @e unified instruction that does getlocal + putobject + opt_plus + setlocal.
+ */
+DEFINE_INSN
+opt_plusplus
+(lindex_t idx, rb_num_t level, CALL_INFO ci, CALL_CACHE cc)
+()
+()
+{
+ int const lv = level;
+ VALUE *ep = GET_EP();
+ VALUE *ptr;
+ VALUE tmp;
+ int i;
+
+ for (i = 0; i < lv; i++) {
+ ep = GET_PREV_EP(ep);
+ }
+ ptr = ep - idx;
+
+ if (FIXNUM_P(*ptr) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+#if defined LONG_LONG_VALUE
+ long a, b;
+ a = FIX2LONG(*ptr);
+ b = a + 1;
+ *ptr = LONG2NUM(b);
+#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
+ const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+ signed long ret;
+ if (UNLIKELY(__builtin_saddl_overflow((signed long)*ptr, one, &ret))) {
+ *ptr = rb_big_plus(rb_int2big(FIX2LONG(*ptr)),
+ rb_int2big(FIX2LONG(1)));
+ }
+ else {
+ *ptr = (VALUE)ret;
+ }
+#else
+ const VALUE a, b, c;
+ a = *ptr;
+ b = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+ c = a + b;
+ if ((~(a ^ b) & (a ^ c)) &
+ ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
+ *ptr = rb_big_plus(rb_int2big(FIX2LONG(a)),
+ rb_int2big(FIX2LONG(b)));
+ }
+ else {
+ *ptr = c;
+ }
+#endif
+ }
+ else if ((tmp = opt_plus_func(*ptr, INT2FIX(1))) != Qundef) {
+ *ptr = tmp;
+ }
+ else {
+ VALUE val;
+ VALUE recv = *ptr;
+
+ PUSH(recv);
+ PUSH(INT2FIX(1));
+ CALL_SIMPLE_METHOD(recv);
+ *ptr = val;
+ }
+}
+
+/**
@c joke
@e BLT
@j BLT
diff --git a/vm_insnhelper.c b/vm_insnhelper.c
index 691c37a..4ff21c5 100644
--- a/vm_insnhelper.c
+++ b/vm_insnhelper.c
@@ -1209,6 +1209,68 @@ rb_equal_opt(VALUE obj1, VALUE obj2)
return opt_eq_func(obj1, obj2, &ci, &cc);
}
+static inline VALUE
+fixnum_plus_fixnum(VALUE i, VALUE j)
+{
+#if defined LONG_LONG_VALUE
+ long a, b, c;
+ a = FIX2LONG(i);
+ b = FIX2LONG(j);
+ c = a + b;
+ return LONG2NUM(c);
+#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
+ signed long ret;
+ if (__builtin_saddl_overflow(i, j & ~RUBY_FIXNUM_FLAG, &ret)) {
+ return rb_big_plus(rb_int2big(FIX2LONG(i)),
+ rb_int2big(FIX2LONG(j)));
+ }
+ else {
+ return (VALUE)ret;
+ }
+#else
+ VALUE val = (i + (j & ~RUBY_FIXNUM_FLAG));
+ if ((~(i ^ j) & (i ^ val)) &
+ ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
+ val = rb_big_plus(rb_int2big(FIX2LONG(i)),
+ rb_int2big(FIX2LONG(j)));
+ }
+ return val;
+#endif
+}
+
+static inline VALUE
+opt_plus_func(VALUE recv, VALUE obj)
+{
+ if (FIXNUM_2_P(recv, obj) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+ return fixnum_plus_fixnum(recv, obj);
+ }
+ else if (FLONUM_2_P(recv, obj) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
+ return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
+ }
+ else if (!SPECIAL_CONST_P(recv) && !SPECIAL_CONST_P(obj)) {
+ if (RBASIC_CLASS(recv) == rb_cFloat && RBASIC_CLASS(obj) == rb_cFloat &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
+ return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
+ }
+ else if (RBASIC_CLASS(recv) == rb_cString && RBASIC_CLASS(obj) == rb_cString &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
+ return rb_str_plus(recv, obj);
+ }
+ else if (RBASIC_CLASS(recv) == rb_cArray &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
+ return rb_ary_plus(recv, obj);
+ }
+ else {
+ return Qundef;
+ }
+ }
+ else {
+ return Qundef;
+ }
+}
+
static VALUE vm_call0(rb_thread_t*, VALUE, ID, int, const VALUE*, const rb_callable_method_entry_t *);
static VALUE
--
2.8.3
Related issues
Updated by nobu (Nobuyoshi Nakada) over 4 years ago
- Description updated (diff)
- Tracker changed from Bug to Feature
Updated by duerst (Martin Dürst) over 4 years ago
This looks like an interesting improvement. But in some way, a
a += 1
in a Ruby program may be a code smell (specifically, it smells of C and similar languages).
Updated by normalperson (Eric Wong) over 4 years ago
duerst@it.aoyama.ac.jp wrote:
This looks like an interesting improvement. But in some way, a
a += 1in a Ruby program may be a code smell (specifically, it smells of C and similar languages).
Agreed, I'm not sure how often I see this to be a useful optimization.
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.
Yes, that is small. What is optcarrot? I can't find it in trunk.
Also, what is performance impact for other benchmarks?
Do you notice a performance change (either micro or other
benchmarks) removing "inline"?
Mainly, I am uncomfortable about making vm_exec loop bigger and
blowing away icache.
Updated by shyouhei (Shyouhei Urabe) over 4 years ago
On 06/06/2016 10:44 PM, Eric Wong wrote:
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.Yes, that is small. What is optcarrot? I can't find it in trunk.
Sorry about it. Optcarrot is a recently-developed pure-ruby NES simulator.
Can be fond here: git@github.com:mame/optcarrot.git
Also, what is performance impact for other benchmarks?
Attached. consult below for full benchmark output. I have not (and have
to) taken a closer look at each results but at a glance no benchmark
gained 200% speed up, nor 50% speed down. To put it better it has no
significant drawbacks.
Do you notice a performance change (either micro or other
benchmarks) removing "inline"?
No, not tested yet. I'll take a look.
Mainly, I am uncomfortable about making vm_exec loop bigger and
blowing away icache.
I understand your concern. I have to point out on the other hand that
this optimization shortens 4 VM instructions into 1, which must
positively impact dcache locality a bit (instruction sequence is very
frequently accessed). There is a tradeoff between them.
----------------------------------------------------------- raw data: [["app_answer", [[0.052123], [0.045075]]], ["app_aobench", [[51.100811], [61.442784]]], ["app_erb", [[0], [0]]], ["app_factorial", [[1.017833], [0.981952]]], ["app_fib", [[0.483506], [0.493121]]], ["app_lc_fizzbuzz", [[86.574658], [83.14404]]], ["app_mandelbrot", [[1.407061], [1.315739]]], ["app_pentomino", [[16.77479], [16.531438]]], ["app_raise", [[0.269551], [0.278264]]], ["app_strconcat", [[0.961209], [0.924087]]], ["app_tak", [[0.607776], [0.603985]]], ["app_tarai", [[0.504795], [0.504447]]], ["app_uri", [[0], [0]]], ["array_shift", [[0], [0]]], ["hash_aref_dsym", [[0.365131], [0.355265]]], ["hash_aref_dsym_long", [[12.73269], [13.330662]]], ["hash_aref_fix", [[0.39919], [0.352738]]], ["hash_aref_flo", [[0.075359], [0.090797]]], ["hash_aref_miss", [[0.517686], [0.529451]]], ["hash_aref_str", [[0.466861], [0.421541]]], ["hash_aref_sym", [[0.359693], [0.339002]]], ["hash_aref_sym_long", [[0.521086], [0.537503]]], ["hash_flatten", [[0.350155], [0.356782]]], ["hash_ident_flo", [[0.047239], [0.058664]]], ["hash_ident_num", [[0.312423], [0.331413]]], ["hash_ident_obj", [[0.343969], [0.30839]]], ["hash_ident_str", [[0.3327], [0.337857]]], ["hash_ident_sym", [[0.366507], [0.345687]]], ["hash_keys", [[0.356622], [0.354485]]], ["hash_shift", [[0.028791], [0.044413]]], ["hash_shift_u16", [[0.147648], [0.137591]]], ["hash_shift_u24", [[0.135581], [0.140135]]], ["hash_shift_u32", [[0.133544], [0.149953]]], ["hash_to_proc", [[0.015097], [0.015503]]], ["hash_values", [[0.326313], [0.338462]]], ["io_file_create", [[2.631183], [2.29139]]], ["io_file_read", [[0], [0]]], ["io_file_write", [[0], [0]]], ["io_nonblock_noex", [[0], [0]]], ["io_nonblock_noex2", [[0], [0]]], ["io_select", [[2.871379], [2.845252]]], ["io_select2", [[3.019507], [2.971023]]], ["io_select3", [[0.018271], [0.018546]]], ["loop_for", [[1.360109], [1.433324]]], ["loop_generator", [[0.644188], [0.651241]]], ["loop_times", [[1.260078], [1.259863]]], ["loop_whileloop", [[0.635374], [0.355271]]], ["loop_whileloop2", [[0.141063], [0.076761]]], ["marshal_dump_flo", [[0.523541], [0.498045]]], ["marshal_dump_load_geniv", [[0.89369], [0.862783]]], ["marshal_dump_load_time", [[1.238441], [1.180119]]], ["require", [[16.165019], [2.615595]]], ["require_thread", [[0.560172], [0.548627]]], ["securerandom", [[0], [0]]], ["so_ackermann", [[0.634471], [0.66092]]], ["so_array", [[1.073883], [1.068649]]], ["so_binary_trees", [[7.228263], [7.137076]]], ["so_concatenate", [[5.055297], [4.822921]]], ["so_count_words", [[0.216728], [0.173223]]], ["so_exception", [[0.348299], [0.312874]]], ["so_fannkuch", [[1.802251], [1.80874]]], ["so_fasta", [[1.57365], [1.562748]]], ["so_k_nucleotide", [[1.186867], [1.174061]]], ["so_lists", [[0.522591], [0.493595]]], ["so_mandelbrot", [[2.446802], [2.483779]]], ["so_matrix", [[0.553252], [0.537407]]], ["so_meteor_contest", [[3.098925], [3.09681]]], ["so_nbody", [[1.351588], [1.303507]]], ["so_nested_loop", [[1.025351], [1.060477]]], ["so_nsieve", [[1.84633], [1.748345]]], ["so_nsieve_bits", [[2.130051], [2.132071]]], ["so_object", [[0.637309], [0.646502]]], ["so_partial_sums", [[1.856853], [1.748393]]], ["so_pidigits", [[1.23942], [1.301111]]], ["so_random", [[0.39408], [0.35076]]], ["so_reverse_complement", [[1.571702], [1.644563]]], ["so_sieve", [[0.547596], [0.496421]]], ["so_spectralnorm", [[1.870886], [1.850356]]], ["vm1_attr_ivar", [[1.218386], [1.012984]]], ["vm1_attr_ivar_set", [[1.337383], [1.103195]]], ["vm1_block", [[1.996893], [1.86192]]], ["vm1_const", [[0.871864], [0.738432]]], ["vm1_ensure", [[0.633504], [0.405527]]], ["vm1_float_simple", [[4.575614], [4.468849]]], ["vm1_gc_short_lived", [[5.659463], [5.823397]]], ["vm1_gc_short_with_complex_long", [[6.430288], [6.456603]]], ["vm1_gc_short_with_long", [[7.00017], [6.849872]]], ["vm1_gc_short_with_symbol", [[5.526827], [5.366768]]], ["vm1_gc_wb_ary", [[1.148627], [0.95045]]], ["vm1_gc_wb_ary_promoted", [[1.177966], [0.945101]]], ["vm1_gc_wb_obj", [[1.024596], [0.767215]]], ["vm1_gc_wb_obj_promoted", [[1.150204], [0.942514]]], ["vm1_ivar", [[0.891796], [0.615384]]], ["vm1_ivar_set", [[0.872773], [0.657297]]], ["vm1_length", [[1.14291], [0.865646]]], ["vm1_lvar_init", [[1.906298], [1.72483]]], ["vm1_lvar_set", [[2.984877], [2.637209]]], ["vm1_neq", [[1.163216], [0.960243]]], ["vm1_not", [[0.949804], [0.650484]]], ["vm1_rescue", [[0.778971], [0.464327]]], ["vm1_simplereturn", [[1.238101], [0.982779]]], ["vm1_swap", [[0.928376], [0.674591]]], ["vm1_yield", [[1.348768], [1.099841]]], ["vm2_array", [[1.398596], [1.350792]]], ["vm2_bigarray", [[9.668123], [9.524274]]], ["vm2_bighash", [[7.155718], [7.151162]]], ["vm2_case", [[0.223288], [0.209747]]], ["vm2_case_lit", [[0.820809], [0.797681]]], ["vm2_defined_method", [[2.721788], [2.654055]]], ["vm2_dstr", [[1.09328], [1.042475]]], ["vm2_eval", [[31.762698], [32.069769]]], ["vm2_method", [[1.154035], [1.077522]]], ["vm2_method_missing", [[2.697381], [2.566442]]], ["vm2_method_with_block", [[1.432468], [1.219831]]], ["vm2_mutex", [[0.797908], [0.761003]]], ["vm2_newlambda", [[1.559347], [1.46937]]], ["vm2_poly_method", [[2.61575], [2.589814]]], ["vm2_poly_method_ov", [[0.295096], [0.23947]]], ["vm2_proc", [[0.570004], [0.53438]]], ["vm2_raise1", [[5.852356], [5.784271]]], ["vm2_raise2", [[8.396749], [8.350987]]], ["vm2_regexp", [[1.203123], [1.170023]]], ["vm2_send", [[0.469301], [0.420171]]], ["vm2_string_literal", [[0.344021], [0.263268]]], ["vm2_struct_big_aref_hi", [[0.284613], [0.240975]]], ["vm2_struct_big_aref_lo", [[0.289707], [0.238292]]], ["vm2_struct_big_aset", [[0.321701], [0.280001]]], ["vm2_struct_big_href_hi", [[0.371876], [0.354926]]], ["vm2_struct_big_href_lo", [[0.410041], [0.359201]]], ["vm2_struct_big_hset", [[0.414453], [0.344909]]], ["vm2_struct_small_aref", [[0.228413], [0.185585]]], ["vm2_struct_small_aset", [[0.304598], [0.273653]]], ["vm2_struct_small_href", [[0.353266], [0.314363]]], ["vm2_struct_small_hset", [[0.348498], [0.328763]]], ["vm2_super", [[0.547899], [0.490276]]], ["vm2_unif1", [[0.256021], [0.222816]]], ["vm2_zsuper", [[0.540286], [0.523564]]], ["vm3_backtrace", [[0.211558], [0.211667]]], ["vm3_clearmethodcache", [[0.518466], [0.546614]]], ["vm3_gc", [[1.47503], [1.481742]]], ["vm3_gc_old_full", [[3.070784], [3.0723]]], ["vm3_gc_old_immediate", [[2.735763], [2.888246]]], ["vm3_gc_old_lazy", [[4.444772], [3.881177]]], ["vm_symbol_block_pass", [[0.954599], [0.967853]]], ["vm_thread_alive_check1", [[0.217786], [0.20758]]], ["vm_thread_close", [[3.568386], [3.668023]]], ["vm_thread_create_join", [[2.599036], [2.579275]]], ["vm_thread_mutex1", [[0.620894], [0.573859]]], ["vm_thread_mutex2", [[0.954779], [0.915577]]], ["vm_thread_mutex3", [[165.541376], [159.570138]]], ["vm_thread_pass", [[0.691981], [0.763412]]], ["vm_thread_pass_flood", [[0.08634], [0.087652]]], ["vm_thread_pipe", [[0.372563], [0.383598]]], ["vm_thread_queue", [[0.127318], [0.11117]]]] Elapsed time: 1162.336444 (sec) ----------------------------------------------------------- benchmark results: Execution time (sec) name trunk ours app_answer 0.052 0.045 app_aobench 51.101 61.443 app_erb 0.000 0.000 app_factorial 1.018 0.982 app_fib 0.484 0.493 app_lc_fizzbuzz 86.575 83.144 app_mandelbrot 1.407 1.316 app_pentomino 16.775 16.531 app_raise 0.270 0.278 app_strconcat 0.961 0.924 app_tak 0.608 0.604 app_tarai 0.505 0.504 app_uri 0.000 0.000 array_shift 0.000 0.000 hash_aref_dsym 0.365 0.355 hash_aref_dsym_long 12.733 13.331 hash_aref_fix 0.399 0.353 hash_aref_flo 0.075 0.091 hash_aref_miss 0.518 0.529 hash_aref_str 0.467 0.422 hash_aref_sym 0.360 0.339 hash_aref_sym_long 0.521 0.538 hash_flatten 0.350 0.357 hash_ident_flo 0.047 0.059 hash_ident_num 0.312 0.331 hash_ident_obj 0.344 0.308 hash_ident_str 0.333 0.338 hash_ident_sym 0.367 0.346 hash_keys 0.357 0.354 hash_shift 0.029 0.044 hash_shift_u16 0.148 0.138 hash_shift_u24 0.136 0.140 hash_shift_u32 0.134 0.150 hash_to_proc 0.015 0.016 hash_values 0.326 0.338 io_file_create 2.631 2.291 io_file_read 0.000 0.000 io_file_write 0.000 0.000 io_nonblock_noex 0.000 0.000 io_nonblock_noex2 0.000 0.000 io_select 2.871 2.845 io_select2 3.020 2.971 io_select3 0.018 0.019 loop_for 1.360 1.433 loop_generator 0.644 0.651 loop_times 1.260 1.260 loop_whileloop 0.635 0.355 loop_whileloop2 0.141 0.077 marshal_dump_flo 0.524 0.498 marshal_dump_load_geniv 0.894 0.863 marshal_dump_load_time 1.238 1.180 require 16.165 2.616 require_thread 0.560 0.549 securerandom 0.000 0.000 so_ackermann 0.634 0.661 so_array 1.074 1.069 so_binary_trees 7.228 7.137 so_concatenate 5.055 4.823 so_count_words 0.217 0.173 so_exception 0.348 0.313 so_fannkuch 1.802 1.809 so_fasta 1.574 1.563 so_k_nucleotide 1.187 1.174 so_lists 0.523 0.494 so_mandelbrot 2.447 2.484 so_matrix 0.553 0.537 so_meteor_contest 3.099 3.097 so_nbody 1.352 1.304 so_nested_loop 1.025 1.060 so_nsieve 1.846 1.748 so_nsieve_bits 2.130 2.132 so_object 0.637 0.647 so_partial_sums 1.857 1.748 so_pidigits 1.239 1.301 so_random 0.394 0.351 so_reverse_complement 1.572 1.645 so_sieve 0.548 0.496 so_spectralnorm 1.871 1.850 vm1_attr_ivar* 0.583 0.658 vm1_attr_ivar_set* 0.702 0.748 vm1_block* 1.362 1.507 vm1_const* 0.236 0.383 vm1_ensure* 0.000 0.050 vm1_float_simple* 3.940 4.114 vm1_gc_short_lived* 5.024 5.468 vm1_gc_short_with_complex_long* 5.795 6.101 vm1_gc_short_with_long* 6.365 6.495 vm1_gc_short_with_symbol* 4.891 5.011 vm1_gc_wb_ary* 0.513 0.595 vm1_gc_wb_ary_promoted* 0.543 0.590 vm1_gc_wb_obj* 0.389 0.412 vm1_gc_wb_obj_promoted* 0.515 0.587 vm1_ivar* 0.256 0.260 vm1_ivar_set* 0.237 0.302 vm1_length* 0.508 0.510 vm1_lvar_init* 1.271 1.370 vm1_lvar_set* 2.350 2.282 vm1_neq* 0.528 0.605 vm1_not* 0.314 0.295 vm1_rescue* 0.144 0.109 vm1_simplereturn* 0.603 0.628 vm1_swap* 0.293 0.319 vm1_yield* 0.713 0.745 vm2_array* 1.258 1.274 vm2_bigarray* 9.527 9.448 vm2_bighash* 7.015 7.074 vm2_case* 0.082 0.133 vm2_case_lit* 0.680 0.721 vm2_defined_method* 2.581 2.577 vm2_dstr* 0.952 0.966 vm2_eval* 31.622 31.993 vm2_method* 1.013 1.001 vm2_method_missing* 2.556 2.490 vm2_method_with_block* 1.291 1.143 vm2_mutex* 0.657 0.684 vm2_newlambda* 1.418 1.393 vm2_poly_method* 2.475 2.513 vm2_poly_method_ov* 0.154 0.163 vm2_proc* 0.429 0.458 vm2_raise1* 5.711 5.708 vm2_raise2* 8.256 8.274 vm2_regexp* 1.062 1.093 vm2_send* 0.328 0.343 vm2_string_literal* 0.203 0.187 vm2_struct_big_aref_hi* 0.144 0.164 vm2_struct_big_aref_lo* 0.149 0.162 vm2_struct_big_aset* 0.181 0.203 vm2_struct_big_href_hi* 0.231 0.278 vm2_struct_big_href_lo* 0.269 0.282 vm2_struct_big_hset* 0.273 0.268 vm2_struct_small_aref* 0.087 0.109 vm2_struct_small_aset* 0.164 0.197 vm2_struct_small_href* 0.212 0.238 vm2_struct_small_hset* 0.207 0.252 vm2_super* 0.407 0.414 vm2_unif1* 0.115 0.146 vm2_zsuper* 0.399 0.447 vm3_backtrace 0.212 0.212 vm3_clearmethodcache 0.518 0.547 vm3_gc 1.475 1.482 vm3_gc_old_full 3.071 3.072 vm3_gc_old_immediate 2.736 2.888 vm3_gc_old_lazy 4.445 3.881 vm_symbol_block_pass 0.955 0.968 vm_thread_alive_check1 0.218 0.208 vm_thread_close 3.568 3.668 vm_thread_create_join 2.599 2.579 vm_thread_mutex1 0.621 0.574 vm_thread_mutex2 0.955 0.916 vm_thread_mutex3 165.541 159.570 vm_thread_pass 0.692 0.763 vm_thread_pass_flood 0.086 0.088 vm_thread_pipe 0.373 0.384 vm_thread_queue 0.127 0.111 Speedup ratio: compare with the result of `trunk' (greater is better) name ours app_answer 1.156 app_aobench 0.832 app_erbError app_factorial 1.037 app_fib 0.981 app_lc_fizzbuzz 1.041 app_mandelbrot 1.069 app_pentomino 1.015 app_raise 0.969 app_strconcat 1.040 app_tak 1.006 app_tarai 1.001 app_uriError array_shiftError hash_aref_dsym 1.028 hash_aref_dsym_long 0.955 hash_aref_fix 1.132 hash_aref_flo 0.830 hash_aref_miss 0.978 hash_aref_str 1.108 hash_aref_sym 1.061 hash_aref_sym_long 0.969 hash_flatten 0.981 hash_ident_flo 0.805 hash_ident_num 0.943 hash_ident_obj 1.115 hash_ident_str 0.985 hash_ident_sym 1.060 hash_keys 1.006 hash_shift 0.648 hash_shift_u16 1.073 hash_shift_u24 0.968 hash_shift_u32 0.891 hash_to_proc 0.974 hash_values 0.964 io_file_create 1.148 io_file_readError io_file_writeError io_nonblock_noexError io_nonblock_noex2Error io_select 1.009 io_select2 1.016 io_select3 0.985 loop_for 0.949 loop_generator 0.989 loop_times 1.000 loop_whileloop 1.788 loop_whileloop2 1.838 marshal_dump_flo 1.051 marshal_dump_load_geniv 1.036 marshal_dump_load_time 1.049 require 6.180 require_thread 1.021 securerandomError so_ackermann 0.960 so_array 1.005 so_binary_trees 1.013 so_concatenate 1.048 so_count_words 1.251 so_exception 1.113 so_fannkuch 0.996 so_fasta 1.007 so_k_nucleotide 1.011 so_lists 1.059 so_mandelbrot 0.985 so_matrix 1.029 so_meteor_contest 1.001 so_nbody 1.037 so_nested_loop 0.967 so_nsieve 1.056 so_nsieve_bits 0.999 so_object 0.986 so_partial_sums 1.062 so_pidigits 0.953 so_random 1.124 so_reverse_complement 0.956 so_sieve 1.103 so_spectralnorm 1.011 vm1_attr_ivar* 0.886 vm1_attr_ivar_set* 0.939 vm1_block* 0.904 vm1_const* 0.617 vm1_ensure* 0.000 vm1_float_simple* 0.958 vm1_gc_short_lived* 0.919 vm1_gc_short_with_complex_long* 0.950 vm1_gc_short_with_long* 0.980 vm1_gc_short_with_symbol* 0.976 vm1_gc_wb_ary* 0.862 vm1_gc_wb_ary_promoted* 0.920 vm1_gc_wb_obj* 0.945 vm1_gc_wb_obj_promoted* 0.877 vm1_ivar* 0.986 vm1_ivar_set* 0.786 vm1_length* 0.994 vm1_lvar_init* 0.928 vm1_lvar_set* 1.030 vm1_neq* 0.873 vm1_not* 1.065 vm1_rescue* 1.317 vm1_simplereturn* 0.961 vm1_swap* 0.918 vm1_yield* 0.958 vm2_array* 0.987 vm2_bigarray* 1.008 vm2_bighash* 0.992 vm2_case* 0.618 vm2_case_lit* 0.943 vm2_defined_method* 1.001 vm2_dstr* 0.986 vm2_eval* 0.988 vm2_method* 1.012 vm2_method_missing* 1.027 vm2_method_with_block* 1.130 vm2_mutex* 0.960 vm2_newlambda* 1.018 vm2_poly_method* 0.985 vm2_poly_method_ov* 0.947 vm2_proc* 0.937 vm2_raise1* 1.001 vm2_raise2* 0.998 vm2_regexp* 0.971 vm2_send* 0.956 vm2_string_literal* 1.088 vm2_struct_big_aref_hi* 0.874 vm2_struct_big_aref_lo* 0.920 vm2_struct_big_aset* 0.889 vm2_struct_big_href_hi* 0.830 vm2_struct_big_href_lo* 0.952 vm2_struct_big_hset* 1.020 vm2_struct_small_aref* 0.803 vm2_struct_small_aset* 0.831 vm2_struct_small_href* 0.893 vm2_struct_small_hset* 0.823 vm2_super* 0.984 vm2_unif1* 0.787 vm2_zsuper* 0.894 vm3_backtrace 0.999 vm3_clearmethodcache 0.949 vm3_gc 0.995 vm3_gc_old_full 1.000 vm3_gc_old_immediate 0.947 vm3_gc_old_lazy 1.145 vm_symbol_block_pass 0.986 vm_thread_alive_check1 1.049 vm_thread_close 0.973 vm_thread_create_join 1.008 vm_thread_mutex1 1.082 vm_thread_mutex2 1.043 vm_thread_mutex3 1.037 vm_thread_pass 0.906 vm_thread_pass_flood 0.985 vm_thread_pipe 0.971 vm_thread_queue 1.145
Updated by ko1 (Koichi Sasada) over 4 years ago
I'm on weak negative with the following two points.
(1) there are only small improvement because only a few program uses "i+=1".
(2) I don't want that people prefer to use "while loop" because of performance.
Updated by naruse (Yui NARUSE) over 4 years ago
Koichi Sasada wrote:
I'm on weak negative with the following two points.
(1) there are only small improvement because only a few program uses "i+=1".
(2) I don't want that people prefer to use "while loop" because of performance.
People should use Integer#times for this while&++ case.
LTO can speed up int_dotimes() in 20%, but while loop is still 2 times faster.
Without LTO: Performance counter stats for './miniruby --disable-gem -e100_000_000.times{}': 5153.515293 task-clock (msec) # 0.997 CPUs utilized 123 context-switches # 0.024 K/sec 1 cpu-migrations # 0.000 K/sec 873 page-faults # 0.169 K/sec 14894914453 cycles # 2.890 GHz 5256063515 stalled-cycles-frontend # 35.29% frontend cycles idle <not supported> stalled-cycles-backend 37123557344 instructions # 2.49 insns per cycle # 0.14 stalled cycles per insn 5304912019 branches # 1029.377 M/sec 202652 branch-misses # 0.00% of all branches 5.170935973 seconds time elapsed With LTO: Performance counter stats for './miniruby --disable-gem -e100_000_000.times{}': 4456.424594 task-clock (msec) # 0.997 CPUs utilized 269 context-switches # 0.060 K/sec 2 cpu-migrations # 0.000 K/sec 823 page-faults # 0.185 K/sec 13179265879 cycles # 2.957 GHz 3793118960 stalled-cycles-frontend # 28.78% frontend cycles idle <not supported> stalled-cycles-backend 36422695065 instructions # 2.76 insns per cycle # 0.10 stalled cycles per insn 5004776248 branches # 1123.047 M/sec 147323 branch-misses # 0.00% of all branches 4.471030258 seconds time elapsed while loop: Performance counter stats for './miniruby --disable-gem -ei=0;while i<100_000_000;i+=1;end': 2071.648179 task-clock (msec) # 0.998 CPUs utilized 35 context-switches # 0.017 K/sec 2 cpu-migrations # 0.001 K/sec 878 page-faults # 0.424 K/sec 6041423503 cycles # 2.916 GHz 2493938546 stalled-cycles-frontend # 41.28% frontend cycles idle <not supported> stalled-cycles-backend 14520079387 instructions # 2.40 insns per cycle # 0.17 stalled cycles per insn 1504280501 branches # 726.127 M/sec 116788 branch-misses # 0.01% of all branches 2.076396582 seconds time elapsed
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 4 years ago
If ++ would be implemented as an atomic operation, then I'd be +1 for this as it's much easier to write the common pattern @counter++ than to @mutex.synchronize{@counter += 1} or being forced to implement a thread-safe counter since Ruby doesn't provide one in stdlib as far as I know.
Updated by shyouhei (Shyouhei Urabe) over 4 years ago
I don't think ++ is going to be atomic because in Ruby an Integer has infinite width in theory. To achieve this property a ++ must be implemented with proper reallocation of underlying memory region(s), which is very difficult (if not impossible) to do atomically.
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 4 years ago
Or if performance is not a concern it could be simply implemented by using a mutex to perform the operation, right?
Updated by shyouhei (Shyouhei Urabe) over 4 years ago
I didn't say we don't need speed; I said it's difficult by design.
Updated by shyouhei (Shyouhei Urabe) over 4 years ago
Anyways, I tested following modification (against the proposed opt_plusplus) that tries to be atomic as far as no reallocation is involved. It works. But surprisingly, it slowed down. The problem was the inserted __atomic_thread_fence() intrinsic which eats a lot of CPU time.
So ++ being atomic is not only difficult-by-design, but also suffers considerable amount of overheads even if we could reroute the design issue.
From 88f11a18bfac404ade8db3673816c01fd2d64672 Mon Sep 17 00:00:00 2001
From: "Urabe, Shyouhei" <shyouhei@ruby-lang.org>
Date: Thu, 21 Jul 2016 12:07:41 +0900
Subject: [PATCH 1/1] increase atomicity of ++ operation if possible
When the receiver is an Integer first try to increment atomically.
If it overflows, give up atomicity and fall back to normal method
calls.
-----------------------------------------------------------
raw data:
[["loop_whileloop",
[[0.592233, 0.610019, 0.609028, 0.600098, 0.605186, 0.631371, 0.596622],
[0.874219, 0.8871, 0.853446, 0.896625, 0.881152, 0.87091, 0.889423]]],
["loop_whileloop2",
[[0.125931, 0.131164, 0.130166, 0.123759, 0.123483, 0.1308, 0.133648],
[0.191824, 0.188386, 0.174099, 0.182529, 0.182059, 0.183854, 0.193107]]]]
Elapsed time: 12.598961 (sec)
-----------------------------------------------------------
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name trunk ours
loop_whileloop 0.592 0.853
loop_whileloop2 0.123 0.174
Speedup ratio: compare with the result of `trunk' (greater is better)
name ours
loop_whileloop 0.694
loop_whileloop2 0.709
Signed-off-by: Urabe, Shyouhei <shyouhei@ruby-lang.org>
---
insns.def | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/insns.def b/insns.def
index 1438e29..6f71135 100644
--- a/insns.def
+++ b/insns.def
@@ -2145,33 +2145,34 @@ opt_plusplus
if (FIXNUM_P(*ptr) &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+ const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+
#if defined LONG_LONG_VALUE
- long a, b;
- a = FIX2LONG(*ptr);
- b = a + 1;
- *ptr = LONG2NUM(b);
+ InterlockedExchangeAdd64(ptr, one);
+ if (*ptr > FIXNUM_MAX) {
+ long a, b;
+ a = FIX2LONG(*ptr);
+ b = a + 1;
+ *ptr = LONG2NUM(b);
+ }
#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
- const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
- signed long ret;
- if (UNLIKELY(__builtin_saddl_overflow((signed long)*ptr, one, &ret))) {
+ signed long *p = (signed long *)ptr;
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ if (UNLIKELY(__builtin_saddl_overflow(*p, one, p))) {
*ptr = rb_big_plus(rb_int2big(FIX2LONG(*ptr)),
rb_int2big(FIX2LONG(1)));
}
- else {
- *ptr = (VALUE)ret;
- }
#else
const VALUE a, b, c;
- a = *ptr;
- b = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
- c = a + b;
+ do {
+ a = *ptr;
+ b = one;
+ c = a + b;
+ } while (ATOMIC_VALUE_CAS(ptr, a, c) != a);
if ((~(a ^ b) & (a ^ c)) &
((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
*ptr = rb_big_plus(rb_int2big(FIX2LONG(a)),
rb_int2big(FIX2LONG(b)));
}
- else {
- *ptr = c;
- }
#endif
}
--
2.9.2
Updated by shyouhei (Shyouhei Urabe) over 4 years ago
- Related to Feature #12607: Ruby needs an atomic integer added
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 4 years ago
Thanks for the investigation. I understand there are trade-offs involved in most decisions. I'm just not sure how Ruby treats them. Would a ++ operator be more useful if it would be faster than +=1 or if it was some syntax sugar over an atomic increment? Anyway, I'm not against your original proposal, which targets performance. I just think I wouldn't ever use ++ for the purpose of improving the performance of my own code as I don't think it would be noticeable for any real use cases. I'd probably use it though but simply because it would be a syntax sugar over += 1 :)
Updated by ko1 (Koichi Sasada) over 3 years ago
- Status changed from Assigned to Rejected
I reject this ticket with the reason #5.