Feature #12463
closedruby lacks plus-plus
Description
From c0f0d0c5f5c58d56fec95ca4303a0f5db5b54d56 Mon Sep 17 00:00:00 2001
From: "Urabe, Shyouhei" shyouhei@ruby-lang.org
Date: Mon, 6 Jun 2016 17:07:11 +0900
Subject: [PATCH 1/1] ruby lacks plus-plus operation.
This commit adds opt_plusplus VM instruction. Before this, a ruby
snippet foo += 1 was converted to up to 4 VM instructions. By
unifying these instructions into one, we can avoid unnecessary stack
manipulations and can issue dedicated machine instruction, if any.
This speeds up things. A set of microbenchmark results shows that it
gains up to 180% boost on a very tight loop.
raw data:
[["loop_whileloop",
[[0.624962, 0.579907, 0.562363, 0.579649, 0.589442, 0.595911, 0.583657],
[0.317089, 0.337159, 0.311376, 0.311721, 0.317585, 0.310999, 0.313107]]],
["loop_whileloop2",
[[0.129038, 0.137306, 0.13944, 0.121268, 0.120862, 0.121065, 0.135851],
[0.072271, 0.074998, 0.080031, 0.078602, 0.07527, 0.07627, 0.072029]]]]
Elapsed time: 7.775304 (sec)
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name | trunk| ours
---------------+------+--------
loop_whileloop | 0.562| 0.311
loop_whileloop2| 0.121| 0.072
Speedup ratio: compare with the result of `trunk' (greater is better)
name | ours
---------------+------
loop_whileloop | 1.808
loop_whileloop2| 1.678
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.
raw data:
[["optcarrot",
[[30.403074843558894, 30.310258944580042, 32.71944350007326],
[31.959367571755536, 31.721515210358586, 32.26915908959967]]]]
* configure.in (__builtin_add_overflow): check existence of this
compiler intrinsic which is available in recent gcc and clang,
with expectedly-optimal implementation of overflow detection.
* insns.def (opt_plusplus): new instruction. This unified
instruction is expected to appear when some local variable is
`+=1`ed. That was converted to 4 separate instructions before
(load variable, load immediate one, call plus, store variable).
* insns.def (opt_plus): refactor move implementation to share with
opt_plusplus.
* vm_insnhelper.c (opt_plus_func): moved to here.
* compile.c (iseq_peephole_optimize): generate opt_plusplus when
possible.
Signed-off-by: Urabe, Shyouhei <shyouhei@ruby-lang.org>
---
compile.c | 46 ++++++++++++++++++++++++
configure.in | 1 +
insns.def | 109 +++++++++++++++++++++++++++++++++++---------------------
vm_insnhelper.c | 62 ++++++++++++++++++++++++++++++++
4 files changed, 177 insertions(+), 41 deletions(-)
diff --git a/compile.c b/compile.c
index cc496cb..6d99d0b 100644
--- a/compile.c
+++ b/compile.c
@@ -2254,6 +2254,52 @@ iseq_peephole_optimize(rb_iseq_t *iseq, LINK_ELEMENT *list, const int do_tailcal
}
}
+ if (IS_INSN_ID(iobj, getlocal)) {
+ /*
+ * getlocal_OP__WC__0
+ * putobject_OP_INT2FIX_O_1_C_
+ * opt_plus
+ * setlocal_OP__WC__0
+ * =>
+ * opt_plusplus
+ */
+ INSN *nobj = (INSN *)iobj->link.next;
+ VALUE idx = OPERAND_AT(iobj, 0);
+ VALUE lev = OPERAND_AT(iobj, 1);
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, putobject)) {
+ if(OPERAND_AT(nobj, 0) == INT2FIX(1)) {
+ nobj = (INSN * )nobj->link.next;
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, send)) {
+ struct rb_call_info *ci = (void *)OPERAND_AT(nobj, 0);
+ struct rb_call_cache *cc = (void *)OPERAND_AT(nobj, 1);
+ if ((ci->flag & VM_CALL_ARGS_SIMPLE) &&
+ (ci->orig_argc == 1) &&
+ (ci->mid == idPLUS)) {
+ nobj = (INSN *)nobj->link.next;
+ if (IS_INSN((LINK_ELEMENT *)nobj) &&
+ IS_INSN_ID(nobj, setlocal)) {
+ if ((OPERAND_AT(nobj, 0) == idx) &&
+ (OPERAND_AT(nobj, 1) == lev)) {
+ INSN *opt_plusplus = new_insn_body(
+ iseq, iobj->line_no, BIN(opt_plusplus),
+ 4, idx, lev, (VALUE)ci, (VALUE)cc);
+ nobj = (INSN * )nobj->link.next;
+ REMOVE_ELEM(nobj->link.prev->prev->prev->prev);
+ REMOVE_ELEM(nobj->link.prev->prev->prev);
+ REMOVE_ELEM(nobj->link.prev->prev);
+ REMOVE_ELEM(nobj->link.prev);
+ INSERT_ELEM_PREV(
+ &nobj->link, (LINK_ELEMENT *)opt_plusplus);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
if (do_tailcallopt &&
(IS_INSN_ID(iobj, send) ||
IS_INSN_ID(iobj, opt_aref_with) ||
diff --git a/configure.in b/configure.in
index 063d839..e0b7a91 100644
--- a/configure.in
+++ b/configure.in
@@ -2480,6 +2480,7 @@ if test x$rb_cv_builtin___builtin_choose_expr = xyes; then
])
fi
RUBY_CHECK_BUILTIN_FUNC(__builtin_types_compatible_p, [__builtin_types_compatible_p(int, int)])
+RUBY_CHECK_BUILTIN_FUNC(__builtin_add_overflow, [__builtin_add_overflow(0, 0, (int*)0)])
if test "$ac_cv_func_qsort_r" != no; then
AC_CACHE_CHECK(whether qsort_r is GNU version, rb_cv_gnu_qsort_r,
diff --git a/insns.def b/insns.def
index cdc8287..1438e29 100644
--- a/insns.def
+++ b/insns.def
@@ -1377,47 +1377,7 @@ opt_plus
(VALUE recv, VALUE obj)
(VALUE val)
{
- if (FIXNUM_2_P(recv, obj) &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
- /* fixnum + fixnum */
-#ifndef LONG_LONG_VALUE
- val = (recv + (obj & (~1)));
- if ((~(recv ^ obj) & (recv ^ val)) &
- ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
- val = rb_big_plus(rb_int2big(FIX2LONG(recv)),
- rb_int2big(FIX2LONG(obj)));
- }
-#else
- long a, b, c;
- a = FIX2LONG(recv);
- b = FIX2LONG(obj);
- c = a + b;
- val = LONG2NUM(c);
-#endif
- }
- else if (FLONUM_2_P(recv, obj) &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
- val = DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
- }
- else if (!SPECIAL_CONST_P(recv) && !SPECIAL_CONST_P(obj)) {
- if (RBASIC_CLASS(recv) == rb_cFloat && RBASIC_CLASS(obj) == rb_cFloat &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
- val = DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
- }
- else if (RBASIC_CLASS(recv) == rb_cString && RBASIC_CLASS(obj) == rb_cString &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
- val = rb_str_plus(recv, obj);
- }
- else if (RBASIC_CLASS(recv) == rb_cArray &&
- BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
- val = rb_ary_plus(recv, obj);
- }
- else {
- goto INSN_LABEL(normal_dispatch);
- }
- }
- else {
- INSN_LABEL(normal_dispatch):
+ if ((val = opt_plus_func(recv, obj)) == Qundef) {
PUSH(recv);
PUSH(obj);
CALL_SIMPLE_METHOD(recv);
@@ -2163,6 +2123,73 @@ opt_call_c_function
}
/**
+ @c optimize
+ @e unified instruction that does getlocal + putobject + opt_plus + setlocal.
+ */
+DEFINE_INSN
+opt_plusplus
+(lindex_t idx, rb_num_t level, CALL_INFO ci, CALL_CACHE cc)
+()
+()
+{
+ int const lv = level;
+ VALUE *ep = GET_EP();
+ VALUE *ptr;
+ VALUE tmp;
+ int i;
+
+ for (i = 0; i < lv; i++) {
+ ep = GET_PREV_EP(ep);
+ }
+ ptr = ep - idx;
+
+ if (FIXNUM_P(*ptr) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+#if defined LONG_LONG_VALUE
+ long a, b;
+ a = FIX2LONG(*ptr);
+ b = a + 1;
+ *ptr = LONG2NUM(b);
+#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
+ const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+ signed long ret;
+ if (UNLIKELY(__builtin_saddl_overflow((signed long)*ptr, one, &ret))) {
+ *ptr = rb_big_plus(rb_int2big(FIX2LONG(*ptr)),
+ rb_int2big(FIX2LONG(1)));
+ }
+ else {
+ *ptr = (VALUE)ret;
+ }
+#else
+ const VALUE a, b, c;
+ a = *ptr;
+ b = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+ c = a + b;
+ if ((~(a ^ b) & (a ^ c)) &
+ ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
+ *ptr = rb_big_plus(rb_int2big(FIX2LONG(a)),
+ rb_int2big(FIX2LONG(b)));
+ }
+ else {
+ *ptr = c;
+ }
+#endif
+ }
+ else if ((tmp = opt_plus_func(*ptr, INT2FIX(1))) != Qundef) {
+ *ptr = tmp;
+ }
+ else {
+ VALUE val;
+ VALUE recv = *ptr;
+
+ PUSH(recv);
+ PUSH(INT2FIX(1));
+ CALL_SIMPLE_METHOD(recv);
+ *ptr = val;
+ }
+}
+
+/**
@c joke
@e BLT
@j BLT
diff --git a/vm_insnhelper.c b/vm_insnhelper.c
index 691c37a..4ff21c5 100644
--- a/vm_insnhelper.c
+++ b/vm_insnhelper.c
@@ -1209,6 +1209,68 @@ rb_equal_opt(VALUE obj1, VALUE obj2)
return opt_eq_func(obj1, obj2, &ci, &cc);
}
+static inline VALUE
+fixnum_plus_fixnum(VALUE i, VALUE j)
+{
+#if defined LONG_LONG_VALUE
+ long a, b, c;
+ a = FIX2LONG(i);
+ b = FIX2LONG(j);
+ c = a + b;
+ return LONG2NUM(c);
+#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
+ signed long ret;
+ if (__builtin_saddl_overflow(i, j & ~RUBY_FIXNUM_FLAG, &ret)) {
+ return rb_big_plus(rb_int2big(FIX2LONG(i)),
+ rb_int2big(FIX2LONG(j)));
+ }
+ else {
+ return (VALUE)ret;
+ }
+#else
+ VALUE val = (i + (j & ~RUBY_FIXNUM_FLAG));
+ if ((~(i ^ j) & (i ^ val)) &
+ ((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
+ val = rb_big_plus(rb_int2big(FIX2LONG(i)),
+ rb_int2big(FIX2LONG(j)));
+ }
+ return val;
+#endif
+}
+
+static inline VALUE
+opt_plus_func(VALUE recv, VALUE obj)
+{
+ if (FIXNUM_2_P(recv, obj) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+ return fixnum_plus_fixnum(recv, obj);
+ }
+ else if (FLONUM_2_P(recv, obj) &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
+ return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
+ }
+ else if (!SPECIAL_CONST_P(recv) && !SPECIAL_CONST_P(obj)) {
+ if (RBASIC_CLASS(recv) == rb_cFloat && RBASIC_CLASS(obj) == rb_cFloat &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
+ return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
+ }
+ else if (RBASIC_CLASS(recv) == rb_cString && RBASIC_CLASS(obj) == rb_cString &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
+ return rb_str_plus(recv, obj);
+ }
+ else if (RBASIC_CLASS(recv) == rb_cArray &&
+ BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
+ return rb_ary_plus(recv, obj);
+ }
+ else {
+ return Qundef;
+ }
+ }
+ else {
+ return Qundef;
+ }
+}
+
static VALUE vm_call0(rb_thread_t*, VALUE, ID, int, const VALUE*, const rb_callable_method_entry_t *);
static VALUE
--
2.8.3
Updated by nobu (Nobuyoshi Nakada) almost 10 years ago
- Tracker changed from Bug to Feature
- Description updated (diff)
Updated by duerst (Martin Dürst) almost 10 years ago
This looks like an interesting improvement. But in some way, a
a += 1
in a Ruby program may be a code smell (specifically, it smells of C and similar languages).
Updated by normalperson (Eric Wong) almost 10 years ago
duerst@it.aoyama.ac.jp wrote:
This looks like an interesting improvement. But in some way, a
a += 1in a Ruby program may be a code smell (specifically, it smells of C and similar languages).
Agreed, I'm not sure how often I see this to be a useful optimization.
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.
Yes, that is small. What is optcarrot? I can't find it in trunk.
Also, what is performance impact for other benchmarks?
Do you notice a performance change (either micro or other
benchmarks) removing "inline"?
Mainly, I am uncomfortable about making vm_exec loop bigger and
blowing away icache.
Updated by shyouhei (Shyouhei Urabe) almost 10 years ago
On 06/06/2016 10:44 PM, Eric Wong wrote:
Also for a nontrivial benchmark, optcarrot speeds up from 31.14 fps to
31.98 fps (102.6%). This is not that huge gain (disappointed), but it
does speeds up more or less.Yes, that is small. What is optcarrot? I can't find it in trunk.
Sorry about it. Optcarrot is a recently-developed pure-ruby NES simulator.
Can be fond here: git@github.com:mame/optcarrot.git
Also, what is performance impact for other benchmarks?
Attached. consult below for full benchmark output. I have not (and have
to) taken a closer look at each results but at a glance no benchmark
gained 200% speed up, nor 50% speed down. To put it better it has no
significant drawbacks.
Do you notice a performance change (either micro or other
benchmarks) removing "inline"?
No, not tested yet. I'll take a look.
Mainly, I am uncomfortable about making vm_exec loop bigger and
blowing away icache.
I understand your concern. I have to point out on the other hand that
this optimization shortens 4 VM instructions into 1, which must
positively impact dcache locality a bit (instruction sequence is very
frequently accessed). There is a tradeoff between them.
-----------------------------------------------------------
raw data:
[["app_answer", [[0.052123], [0.045075]]],
["app_aobench", [[51.100811], [61.442784]]],
["app_erb", [[0], [0]]],
["app_factorial", [[1.017833], [0.981952]]],
["app_fib", [[0.483506], [0.493121]]],
["app_lc_fizzbuzz", [[86.574658], [83.14404]]],
["app_mandelbrot", [[1.407061], [1.315739]]],
["app_pentomino", [[16.77479], [16.531438]]],
["app_raise", [[0.269551], [0.278264]]],
["app_strconcat", [[0.961209], [0.924087]]],
["app_tak", [[0.607776], [0.603985]]],
["app_tarai", [[0.504795], [0.504447]]],
["app_uri", [[0], [0]]],
["array_shift", [[0], [0]]],
["hash_aref_dsym", [[0.365131], [0.355265]]],
["hash_aref_dsym_long", [[12.73269], [13.330662]]],
["hash_aref_fix", [[0.39919], [0.352738]]],
["hash_aref_flo", [[0.075359], [0.090797]]],
["hash_aref_miss", [[0.517686], [0.529451]]],
["hash_aref_str", [[0.466861], [0.421541]]],
["hash_aref_sym", [[0.359693], [0.339002]]],
["hash_aref_sym_long", [[0.521086], [0.537503]]],
["hash_flatten", [[0.350155], [0.356782]]],
["hash_ident_flo", [[0.047239], [0.058664]]],
["hash_ident_num", [[0.312423], [0.331413]]],
["hash_ident_obj", [[0.343969], [0.30839]]],
["hash_ident_str", [[0.3327], [0.337857]]],
["hash_ident_sym", [[0.366507], [0.345687]]],
["hash_keys", [[0.356622], [0.354485]]],
["hash_shift", [[0.028791], [0.044413]]],
["hash_shift_u16", [[0.147648], [0.137591]]],
["hash_shift_u24", [[0.135581], [0.140135]]],
["hash_shift_u32", [[0.133544], [0.149953]]],
["hash_to_proc", [[0.015097], [0.015503]]],
["hash_values", [[0.326313], [0.338462]]],
["io_file_create", [[2.631183], [2.29139]]],
["io_file_read", [[0], [0]]],
["io_file_write", [[0], [0]]],
["io_nonblock_noex", [[0], [0]]],
["io_nonblock_noex2", [[0], [0]]],
["io_select", [[2.871379], [2.845252]]],
["io_select2", [[3.019507], [2.971023]]],
["io_select3", [[0.018271], [0.018546]]],
["loop_for", [[1.360109], [1.433324]]],
["loop_generator", [[0.644188], [0.651241]]],
["loop_times", [[1.260078], [1.259863]]],
["loop_whileloop", [[0.635374], [0.355271]]],
["loop_whileloop2", [[0.141063], [0.076761]]],
["marshal_dump_flo", [[0.523541], [0.498045]]],
["marshal_dump_load_geniv", [[0.89369], [0.862783]]],
["marshal_dump_load_time", [[1.238441], [1.180119]]],
["require", [[16.165019], [2.615595]]],
["require_thread", [[0.560172], [0.548627]]],
["securerandom", [[0], [0]]],
["so_ackermann", [[0.634471], [0.66092]]],
["so_array", [[1.073883], [1.068649]]],
["so_binary_trees", [[7.228263], [7.137076]]],
["so_concatenate", [[5.055297], [4.822921]]],
["so_count_words", [[0.216728], [0.173223]]],
["so_exception", [[0.348299], [0.312874]]],
["so_fannkuch", [[1.802251], [1.80874]]],
["so_fasta", [[1.57365], [1.562748]]],
["so_k_nucleotide", [[1.186867], [1.174061]]],
["so_lists", [[0.522591], [0.493595]]],
["so_mandelbrot", [[2.446802], [2.483779]]],
["so_matrix", [[0.553252], [0.537407]]],
["so_meteor_contest", [[3.098925], [3.09681]]],
["so_nbody", [[1.351588], [1.303507]]],
["so_nested_loop", [[1.025351], [1.060477]]],
["so_nsieve", [[1.84633], [1.748345]]],
["so_nsieve_bits", [[2.130051], [2.132071]]],
["so_object", [[0.637309], [0.646502]]],
["so_partial_sums", [[1.856853], [1.748393]]],
["so_pidigits", [[1.23942], [1.301111]]],
["so_random", [[0.39408], [0.35076]]],
["so_reverse_complement", [[1.571702], [1.644563]]],
["so_sieve", [[0.547596], [0.496421]]],
["so_spectralnorm", [[1.870886], [1.850356]]],
["vm1_attr_ivar", [[1.218386], [1.012984]]],
["vm1_attr_ivar_set", [[1.337383], [1.103195]]],
["vm1_block", [[1.996893], [1.86192]]],
["vm1_const", [[0.871864], [0.738432]]],
["vm1_ensure", [[0.633504], [0.405527]]],
["vm1_float_simple", [[4.575614], [4.468849]]],
["vm1_gc_short_lived", [[5.659463], [5.823397]]],
["vm1_gc_short_with_complex_long", [[6.430288], [6.456603]]],
["vm1_gc_short_with_long", [[7.00017], [6.849872]]],
["vm1_gc_short_with_symbol", [[5.526827], [5.366768]]],
["vm1_gc_wb_ary", [[1.148627], [0.95045]]],
["vm1_gc_wb_ary_promoted", [[1.177966], [0.945101]]],
["vm1_gc_wb_obj", [[1.024596], [0.767215]]],
["vm1_gc_wb_obj_promoted", [[1.150204], [0.942514]]],
["vm1_ivar", [[0.891796], [0.615384]]],
["vm1_ivar_set", [[0.872773], [0.657297]]],
["vm1_length", [[1.14291], [0.865646]]],
["vm1_lvar_init", [[1.906298], [1.72483]]],
["vm1_lvar_set", [[2.984877], [2.637209]]],
["vm1_neq", [[1.163216], [0.960243]]],
["vm1_not", [[0.949804], [0.650484]]],
["vm1_rescue", [[0.778971], [0.464327]]],
["vm1_simplereturn", [[1.238101], [0.982779]]],
["vm1_swap", [[0.928376], [0.674591]]],
["vm1_yield", [[1.348768], [1.099841]]],
["vm2_array", [[1.398596], [1.350792]]],
["vm2_bigarray", [[9.668123], [9.524274]]],
["vm2_bighash", [[7.155718], [7.151162]]],
["vm2_case", [[0.223288], [0.209747]]],
["vm2_case_lit", [[0.820809], [0.797681]]],
["vm2_defined_method", [[2.721788], [2.654055]]],
["vm2_dstr", [[1.09328], [1.042475]]],
["vm2_eval", [[31.762698], [32.069769]]],
["vm2_method", [[1.154035], [1.077522]]],
["vm2_method_missing", [[2.697381], [2.566442]]],
["vm2_method_with_block", [[1.432468], [1.219831]]],
["vm2_mutex", [[0.797908], [0.761003]]],
["vm2_newlambda", [[1.559347], [1.46937]]],
["vm2_poly_method", [[2.61575], [2.589814]]],
["vm2_poly_method_ov", [[0.295096], [0.23947]]],
["vm2_proc", [[0.570004], [0.53438]]],
["vm2_raise1", [[5.852356], [5.784271]]],
["vm2_raise2", [[8.396749], [8.350987]]],
["vm2_regexp", [[1.203123], [1.170023]]],
["vm2_send", [[0.469301], [0.420171]]],
["vm2_string_literal", [[0.344021], [0.263268]]],
["vm2_struct_big_aref_hi", [[0.284613], [0.240975]]],
["vm2_struct_big_aref_lo", [[0.289707], [0.238292]]],
["vm2_struct_big_aset", [[0.321701], [0.280001]]],
["vm2_struct_big_href_hi", [[0.371876], [0.354926]]],
["vm2_struct_big_href_lo", [[0.410041], [0.359201]]],
["vm2_struct_big_hset", [[0.414453], [0.344909]]],
["vm2_struct_small_aref", [[0.228413], [0.185585]]],
["vm2_struct_small_aset", [[0.304598], [0.273653]]],
["vm2_struct_small_href", [[0.353266], [0.314363]]],
["vm2_struct_small_hset", [[0.348498], [0.328763]]],
["vm2_super", [[0.547899], [0.490276]]],
["vm2_unif1", [[0.256021], [0.222816]]],
["vm2_zsuper", [[0.540286], [0.523564]]],
["vm3_backtrace", [[0.211558], [0.211667]]],
["vm3_clearmethodcache", [[0.518466], [0.546614]]],
["vm3_gc", [[1.47503], [1.481742]]],
["vm3_gc_old_full", [[3.070784], [3.0723]]],
["vm3_gc_old_immediate", [[2.735763], [2.888246]]],
["vm3_gc_old_lazy", [[4.444772], [3.881177]]],
["vm_symbol_block_pass", [[0.954599], [0.967853]]],
["vm_thread_alive_check1", [[0.217786], [0.20758]]],
["vm_thread_close", [[3.568386], [3.668023]]],
["vm_thread_create_join", [[2.599036], [2.579275]]],
["vm_thread_mutex1", [[0.620894], [0.573859]]],
["vm_thread_mutex2", [[0.954779], [0.915577]]],
["vm_thread_mutex3", [[165.541376], [159.570138]]],
["vm_thread_pass", [[0.691981], [0.763412]]],
["vm_thread_pass_flood", [[0.08634], [0.087652]]],
["vm_thread_pipe", [[0.372563], [0.383598]]],
["vm_thread_queue", [[0.127318], [0.11117]]]]
Elapsed time: 1162.336444 (sec)
-----------------------------------------------------------
benchmark results:
Execution time (sec)
name trunk ours
app_answer 0.052 0.045
app_aobench 51.101 61.443
app_erb 0.000 0.000
app_factorial 1.018 0.982
app_fib 0.484 0.493
app_lc_fizzbuzz 86.575 83.144
app_mandelbrot 1.407 1.316
app_pentomino 16.775 16.531
app_raise 0.270 0.278
app_strconcat 0.961 0.924
app_tak 0.608 0.604
app_tarai 0.505 0.504
app_uri 0.000 0.000
array_shift 0.000 0.000
hash_aref_dsym 0.365 0.355
hash_aref_dsym_long 12.733 13.331
hash_aref_fix 0.399 0.353
hash_aref_flo 0.075 0.091
hash_aref_miss 0.518 0.529
hash_aref_str 0.467 0.422
hash_aref_sym 0.360 0.339
hash_aref_sym_long 0.521 0.538
hash_flatten 0.350 0.357
hash_ident_flo 0.047 0.059
hash_ident_num 0.312 0.331
hash_ident_obj 0.344 0.308
hash_ident_str 0.333 0.338
hash_ident_sym 0.367 0.346
hash_keys 0.357 0.354
hash_shift 0.029 0.044
hash_shift_u16 0.148 0.138
hash_shift_u24 0.136 0.140
hash_shift_u32 0.134 0.150
hash_to_proc 0.015 0.016
hash_values 0.326 0.338
io_file_create 2.631 2.291
io_file_read 0.000 0.000
io_file_write 0.000 0.000
io_nonblock_noex 0.000 0.000
io_nonblock_noex2 0.000 0.000
io_select 2.871 2.845
io_select2 3.020 2.971
io_select3 0.018 0.019
loop_for 1.360 1.433
loop_generator 0.644 0.651
loop_times 1.260 1.260
loop_whileloop 0.635 0.355
loop_whileloop2 0.141 0.077
marshal_dump_flo 0.524 0.498
marshal_dump_load_geniv 0.894 0.863
marshal_dump_load_time 1.238 1.180
require 16.165 2.616
require_thread 0.560 0.549
securerandom 0.000 0.000
so_ackermann 0.634 0.661
so_array 1.074 1.069
so_binary_trees 7.228 7.137
so_concatenate 5.055 4.823
so_count_words 0.217 0.173
so_exception 0.348 0.313
so_fannkuch 1.802 1.809
so_fasta 1.574 1.563
so_k_nucleotide 1.187 1.174
so_lists 0.523 0.494
so_mandelbrot 2.447 2.484
so_matrix 0.553 0.537
so_meteor_contest 3.099 3.097
so_nbody 1.352 1.304
so_nested_loop 1.025 1.060
so_nsieve 1.846 1.748
so_nsieve_bits 2.130 2.132
so_object 0.637 0.647
so_partial_sums 1.857 1.748
so_pidigits 1.239 1.301
so_random 0.394 0.351
so_reverse_complement 1.572 1.645
so_sieve 0.548 0.496
so_spectralnorm 1.871 1.850
vm1_attr_ivar* 0.583 0.658
vm1_attr_ivar_set* 0.702 0.748
vm1_block* 1.362 1.507
vm1_const* 0.236 0.383
vm1_ensure* 0.000 0.050
vm1_float_simple* 3.940 4.114
vm1_gc_short_lived* 5.024 5.468
vm1_gc_short_with_complex_long* 5.795 6.101
vm1_gc_short_with_long* 6.365 6.495
vm1_gc_short_with_symbol* 4.891 5.011
vm1_gc_wb_ary* 0.513 0.595
vm1_gc_wb_ary_promoted* 0.543 0.590
vm1_gc_wb_obj* 0.389 0.412
vm1_gc_wb_obj_promoted* 0.515 0.587
vm1_ivar* 0.256 0.260
vm1_ivar_set* 0.237 0.302
vm1_length* 0.508 0.510
vm1_lvar_init* 1.271 1.370
vm1_lvar_set* 2.350 2.282
vm1_neq* 0.528 0.605
vm1_not* 0.314 0.295
vm1_rescue* 0.144 0.109
vm1_simplereturn* 0.603 0.628
vm1_swap* 0.293 0.319
vm1_yield* 0.713 0.745
vm2_array* 1.258 1.274
vm2_bigarray* 9.527 9.448
vm2_bighash* 7.015 7.074
vm2_case* 0.082 0.133
vm2_case_lit* 0.680 0.721
vm2_defined_method* 2.581 2.577
vm2_dstr* 0.952 0.966
vm2_eval* 31.622 31.993
vm2_method* 1.013 1.001
vm2_method_missing* 2.556 2.490
vm2_method_with_block* 1.291 1.143
vm2_mutex* 0.657 0.684
vm2_newlambda* 1.418 1.393
vm2_poly_method* 2.475 2.513
vm2_poly_method_ov* 0.154 0.163
vm2_proc* 0.429 0.458
vm2_raise1* 5.711 5.708
vm2_raise2* 8.256 8.274
vm2_regexp* 1.062 1.093
vm2_send* 0.328 0.343
vm2_string_literal* 0.203 0.187
vm2_struct_big_aref_hi* 0.144 0.164
vm2_struct_big_aref_lo* 0.149 0.162
vm2_struct_big_aset* 0.181 0.203
vm2_struct_big_href_hi* 0.231 0.278
vm2_struct_big_href_lo* 0.269 0.282
vm2_struct_big_hset* 0.273 0.268
vm2_struct_small_aref* 0.087 0.109
vm2_struct_small_aset* 0.164 0.197
vm2_struct_small_href* 0.212 0.238
vm2_struct_small_hset* 0.207 0.252
vm2_super* 0.407 0.414
vm2_unif1* 0.115 0.146
vm2_zsuper* 0.399 0.447
vm3_backtrace 0.212 0.212
vm3_clearmethodcache 0.518 0.547
vm3_gc 1.475 1.482
vm3_gc_old_full 3.071 3.072
vm3_gc_old_immediate 2.736 2.888
vm3_gc_old_lazy 4.445 3.881
vm_symbol_block_pass 0.955 0.968
vm_thread_alive_check1 0.218 0.208
vm_thread_close 3.568 3.668
vm_thread_create_join 2.599 2.579
vm_thread_mutex1 0.621 0.574
vm_thread_mutex2 0.955 0.916
vm_thread_mutex3 165.541 159.570
vm_thread_pass 0.692 0.763
vm_thread_pass_flood 0.086 0.088
vm_thread_pipe 0.373 0.384
vm_thread_queue 0.127 0.111
Speedup ratio: compare with the result of `trunk' (greater is better)
name ours
app_answer 1.156
app_aobench 0.832
app_erbError
app_factorial 1.037
app_fib 0.981
app_lc_fizzbuzz 1.041
app_mandelbrot 1.069
app_pentomino 1.015
app_raise 0.969
app_strconcat 1.040
app_tak 1.006
app_tarai 1.001
app_uriError
array_shiftError
hash_aref_dsym 1.028
hash_aref_dsym_long 0.955
hash_aref_fix 1.132
hash_aref_flo 0.830
hash_aref_miss 0.978
hash_aref_str 1.108
hash_aref_sym 1.061
hash_aref_sym_long 0.969
hash_flatten 0.981
hash_ident_flo 0.805
hash_ident_num 0.943
hash_ident_obj 1.115
hash_ident_str 0.985
hash_ident_sym 1.060
hash_keys 1.006
hash_shift 0.648
hash_shift_u16 1.073
hash_shift_u24 0.968
hash_shift_u32 0.891
hash_to_proc 0.974
hash_values 0.964
io_file_create 1.148
io_file_readError
io_file_writeError
io_nonblock_noexError
io_nonblock_noex2Error
io_select 1.009
io_select2 1.016
io_select3 0.985
loop_for 0.949
loop_generator 0.989
loop_times 1.000
loop_whileloop 1.788
loop_whileloop2 1.838
marshal_dump_flo 1.051
marshal_dump_load_geniv 1.036
marshal_dump_load_time 1.049
require 6.180
require_thread 1.021
securerandomError
so_ackermann 0.960
so_array 1.005
so_binary_trees 1.013
so_concatenate 1.048
so_count_words 1.251
so_exception 1.113
so_fannkuch 0.996
so_fasta 1.007
so_k_nucleotide 1.011
so_lists 1.059
so_mandelbrot 0.985
so_matrix 1.029
so_meteor_contest 1.001
so_nbody 1.037
so_nested_loop 0.967
so_nsieve 1.056
so_nsieve_bits 0.999
so_object 0.986
so_partial_sums 1.062
so_pidigits 0.953
so_random 1.124
so_reverse_complement 0.956
so_sieve 1.103
so_spectralnorm 1.011
vm1_attr_ivar* 0.886
vm1_attr_ivar_set* 0.939
vm1_block* 0.904
vm1_const* 0.617
vm1_ensure* 0.000
vm1_float_simple* 0.958
vm1_gc_short_lived* 0.919
vm1_gc_short_with_complex_long* 0.950
vm1_gc_short_with_long* 0.980
vm1_gc_short_with_symbol* 0.976
vm1_gc_wb_ary* 0.862
vm1_gc_wb_ary_promoted* 0.920
vm1_gc_wb_obj* 0.945
vm1_gc_wb_obj_promoted* 0.877
vm1_ivar* 0.986
vm1_ivar_set* 0.786
vm1_length* 0.994
vm1_lvar_init* 0.928
vm1_lvar_set* 1.030
vm1_neq* 0.873
vm1_not* 1.065
vm1_rescue* 1.317
vm1_simplereturn* 0.961
vm1_swap* 0.918
vm1_yield* 0.958
vm2_array* 0.987
vm2_bigarray* 1.008
vm2_bighash* 0.992
vm2_case* 0.618
vm2_case_lit* 0.943
vm2_defined_method* 1.001
vm2_dstr* 0.986
vm2_eval* 0.988
vm2_method* 1.012
vm2_method_missing* 1.027
vm2_method_with_block* 1.130
vm2_mutex* 0.960
vm2_newlambda* 1.018
vm2_poly_method* 0.985
vm2_poly_method_ov* 0.947
vm2_proc* 0.937
vm2_raise1* 1.001
vm2_raise2* 0.998
vm2_regexp* 0.971
vm2_send* 0.956
vm2_string_literal* 1.088
vm2_struct_big_aref_hi* 0.874
vm2_struct_big_aref_lo* 0.920
vm2_struct_big_aset* 0.889
vm2_struct_big_href_hi* 0.830
vm2_struct_big_href_lo* 0.952
vm2_struct_big_hset* 1.020
vm2_struct_small_aref* 0.803
vm2_struct_small_aset* 0.831
vm2_struct_small_href* 0.893
vm2_struct_small_hset* 0.823
vm2_super* 0.984
vm2_unif1* 0.787
vm2_zsuper* 0.894
vm3_backtrace 0.999
vm3_clearmethodcache 0.949
vm3_gc 0.995
vm3_gc_old_full 1.000
vm3_gc_old_immediate 0.947
vm3_gc_old_lazy 1.145
vm_symbol_block_pass 0.986
vm_thread_alive_check1 1.049
vm_thread_close 0.973
vm_thread_create_join 1.008
vm_thread_mutex1 1.082
vm_thread_mutex2 1.043
vm_thread_mutex3 1.037
vm_thread_pass 0.906
vm_thread_pass_flood 0.985
vm_thread_pipe 0.971
vm_thread_queue 1.145
Updated by ko1 (Koichi Sasada) almost 10 years ago
I'm on weak negative with the following two points.
(1) there are only small improvement because only a few program uses "i+=1".
(2) I don't want that people prefer to use "while loop" because of performance.
Updated by naruse (Yui NARUSE) almost 10 years ago
Koichi Sasada wrote:
I'm on weak negative with the following two points.
(1) there are only small improvement because only a few program uses "i+=1".
(2) I don't want that people prefer to use "while loop" because of performance.
People should use Integer#times for this while&++ case.
LTO can speed up int_dotimes() in 20%, but while loop is still 2 times faster.
Without LTO:
Performance counter stats for './miniruby --disable-gem -e100_000_000.times{}':
5153.515293 task-clock (msec) # 0.997 CPUs utilized
123 context-switches # 0.024 K/sec
1 cpu-migrations # 0.000 K/sec
873 page-faults # 0.169 K/sec
14894914453 cycles # 2.890 GHz
5256063515 stalled-cycles-frontend # 35.29% frontend cycles idle
<not supported> stalled-cycles-backend
37123557344 instructions # 2.49 insns per cycle
# 0.14 stalled cycles per insn
5304912019 branches # 1029.377 M/sec
202652 branch-misses # 0.00% of all branches
5.170935973 seconds time elapsed
With LTO:
Performance counter stats for './miniruby --disable-gem -e100_000_000.times{}':
4456.424594 task-clock (msec) # 0.997 CPUs utilized
269 context-switches # 0.060 K/sec
2 cpu-migrations # 0.000 K/sec
823 page-faults # 0.185 K/sec
13179265879 cycles # 2.957 GHz
3793118960 stalled-cycles-frontend # 28.78% frontend cycles idle
<not supported> stalled-cycles-backend
36422695065 instructions # 2.76 insns per cycle
# 0.10 stalled cycles per insn
5004776248 branches # 1123.047 M/sec
147323 branch-misses # 0.00% of all branches
4.471030258 seconds time elapsed
while loop:
Performance counter stats for './miniruby --disable-gem -ei=0;while i<100_000_000;i+=1;end':
2071.648179 task-clock (msec) # 0.998 CPUs utilized
35 context-switches # 0.017 K/sec
2 cpu-migrations # 0.001 K/sec
878 page-faults # 0.424 K/sec
6041423503 cycles # 2.916 GHz
2493938546 stalled-cycles-frontend # 41.28% frontend cycles idle
<not supported> stalled-cycles-backend
14520079387 instructions # 2.40 insns per cycle
# 0.17 stalled cycles per insn
1504280501 branches # 726.127 M/sec
116788 branch-misses # 0.01% of all branches
2.076396582 seconds time elapsed
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) almost 10 years ago
If ++ would be implemented as an atomic operation, then I'd be +1 for this as it's much easier to write the common pattern @counter++ than to @mutex.synchronize{@counter += 1} or being forced to implement a thread-safe counter since Ruby doesn't provide one in stdlib as far as I know.
Updated by shyouhei (Shyouhei Urabe) almost 10 years ago
I don't think ++ is going to be atomic because in Ruby an Integer has infinite width in theory. To achieve this property a ++ must be implemented with proper reallocation of underlying memory region(s), which is very difficult (if not impossible) to do atomically.
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) almost 10 years ago
Or if performance is not a concern it could be simply implemented by using a mutex to perform the operation, right?
Updated by shyouhei (Shyouhei Urabe) almost 10 years ago
I didn't say we don't need speed; I said it's difficult by design.
Updated by shyouhei (Shyouhei Urabe) almost 10 years ago
Anyways, I tested following modification (against the proposed opt_plusplus) that tries to be atomic as far as no reallocation is involved. It works. But surprisingly, it slowed down. The problem was the inserted __atomic_thread_fence() intrinsic which eats a lot of CPU time.
So ++ being atomic is not only difficult-by-design, but also suffers considerable amount of overheads even if we could reroute the design issue.
From 88f11a18bfac404ade8db3673816c01fd2d64672 Mon Sep 17 00:00:00 2001
From: "Urabe, Shyouhei" <shyouhei@ruby-lang.org>
Date: Thu, 21 Jul 2016 12:07:41 +0900
Subject: [PATCH 1/1] increase atomicity of ++ operation if possible
When the receiver is an Integer first try to increment atomically.
If it overflows, give up atomicity and fall back to normal method
calls.
-----------------------------------------------------------
raw data:
[["loop_whileloop",
[[0.592233, 0.610019, 0.609028, 0.600098, 0.605186, 0.631371, 0.596622],
[0.874219, 0.8871, 0.853446, 0.896625, 0.881152, 0.87091, 0.889423]]],
["loop_whileloop2",
[[0.125931, 0.131164, 0.130166, 0.123759, 0.123483, 0.1308, 0.133648],
[0.191824, 0.188386, 0.174099, 0.182529, 0.182059, 0.183854, 0.193107]]]]
Elapsed time: 12.598961 (sec)
-----------------------------------------------------------
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name trunk ours
loop_whileloop 0.592 0.853
loop_whileloop2 0.123 0.174
Speedup ratio: compare with the result of `trunk' (greater is better)
name ours
loop_whileloop 0.694
loop_whileloop2 0.709
Signed-off-by: Urabe, Shyouhei <shyouhei@ruby-lang.org>
---
insns.def | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/insns.def b/insns.def
index 1438e29..6f71135 100644
--- a/insns.def
+++ b/insns.def
@@ -2145,33 +2145,34 @@ opt_plusplus
if (FIXNUM_P(*ptr) &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS,INTEGER_REDEFINED_OP_FLAG)) {
+ const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
+
#if defined LONG_LONG_VALUE
- long a, b;
- a = FIX2LONG(*ptr);
- b = a + 1;
- *ptr = LONG2NUM(b);
+ InterlockedExchangeAdd64(ptr, one);
+ if (*ptr > FIXNUM_MAX) {
+ long a, b;
+ a = FIX2LONG(*ptr);
+ b = a + 1;
+ *ptr = LONG2NUM(b);
+ }
#elif defined HAVE_BUILTIN___BUILTIN_ADD_OVERFLOW
- const signed long one = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
- signed long ret;
- if (UNLIKELY(__builtin_saddl_overflow((signed long)*ptr, one, &ret))) {
+ signed long *p = (signed long *)ptr;
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ if (UNLIKELY(__builtin_saddl_overflow(*p, one, p))) {
*ptr = rb_big_plus(rb_int2big(FIX2LONG(*ptr)),
rb_int2big(FIX2LONG(1)));
}
- else {
- *ptr = (VALUE)ret;
- }
#else
const VALUE a, b, c;
- a = *ptr;
- b = INT2FIX(1) & ~RUBY_FIXNUM_FLAG;
- c = a + b;
+ do {
+ a = *ptr;
+ b = one;
+ c = a + b;
+ } while (ATOMIC_VALUE_CAS(ptr, a, c) != a);
if ((~(a ^ b) & (a ^ c)) &
((VALUE)0x01 << ((sizeof(VALUE) * CHAR_BIT) - 1))) {
*ptr = rb_big_plus(rb_int2big(FIX2LONG(a)),
rb_int2big(FIX2LONG(b)));
}
- else {
- *ptr = c;
- }
#endif
}
--
2.9.2
Updated by shyouhei (Shyouhei Urabe) almost 10 years ago
- Related to Feature #12607: Ruby needs an atomic integer added
Updated by rosenfeld (Rodrigo Rosenfeld Rosas) over 9 years ago
Thanks for the investigation. I understand there are trade-offs involved in most decisions. I'm just not sure how Ruby treats them. Would a ++ operator be more useful if it would be faster than +=1 or if it was some syntax sugar over an atomic increment? Anyway, I'm not against your original proposal, which targets performance. I just think I wouldn't ever use ++ for the purpose of improving the performance of my own code as I don't think it would be noticeable for any real use cases. I'd probably use it though but simply because it would be a syntax sugar over += 1 :)
Updated by ko1 (Koichi Sasada) almost 9 years ago
- Status changed from Assigned to Rejected
I reject this ticket with the reason #5.