Project

General

Profile

Actions

Feature #12225

closed

Remove inline assemblers and always enable USE_MACHINE_REGS

Added by naruse (Yui NARUSE) almost 9 years ago. Updated over 8 years ago.

Status:
Rejected
Assignee:
-
Target version:
-
[ruby-core:74615]

Description

The current vm_exec.c stores pc in an explicitly declared register to get the PC.
Since recent CPUs and compilers are very smart, we can expect them to optimize their own use of registers.

With the following patch, the benchmark results are as follows:

diff --git a/vm_exec.c b/vm_exec.c
index 5e4ff94..6f7c1ad 100644
--- a/vm_exec.c
+++ b/vm_exec.c
@@ -15,23 +15,6 @@
 static void vm_analysis_insn(int insn);
 #endif
 
-#if VMDEBUG > 0
-#define DECL_SC_REG(type, r, reg) register type reg_##r
-
-#elif defined(__GNUC__) && defined(__x86_64__)
-#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg)
-
-#elif defined(__GNUC__) && defined(__i386__)
-#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("e" reg)
-
-#elif defined(__GNUC__) && defined(__powerpc64__)
-#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg)
-
-#else
-#define DECL_SC_REG(type, r, reg) register type reg_##r
-#endif
-/* #define DECL_SC_REG(r, reg) VALUE reg_##r */
-
 #if VM_DEBUG_STACKOVERFLOW
 NORETURN(static void vm_stack_overflow_for_insn(void));
 static void
@@ -49,41 +32,12 @@ vm_exec_core(rb_thread_t *th, VALUE initial)
 {
 
 #if OPT_STACK_CACHING
-#if 0
-#elif __GNUC__ && __x86_64__ && !defined(__native_client__)
-    DECL_SC_REG(VALUE, a, "12");
-    DECL_SC_REG(VALUE, b, "13");
-#else
     register VALUE reg_a;
     register VALUE reg_b;
 #endif
-#endif
 
-#if defined(__GNUC__) && defined(__i386__)
-    DECL_SC_REG(const VALUE *, pc, "di");
-    DECL_SC_REG(rb_control_frame_t *, cfp, "si");
-#define USE_MACHINE_REGS 1
-
-#elif defined(__GNUC__) && defined(__x86_64__)
-    DECL_SC_REG(const VALUE *, pc, "14");
-# if defined(__native_client__)
-    DECL_SC_REG(rb_control_frame_t *, cfp, "13");
-# else
-    DECL_SC_REG(rb_control_frame_t *, cfp, "15");
-# endif
-#define USE_MACHINE_REGS 1
-
-#elif defined(__GNUC__) && defined(__powerpc64__)
-    DECL_SC_REG(const VALUE *, pc, "14");
-    DECL_SC_REG(rb_control_frame_t *, cfp, "15");
-#define USE_MACHINE_REGS 1
-
-#else
     register rb_control_frame_t *reg_cfp;
     const VALUE *reg_pc;
-#endif
-
-#if USE_MACHINE_REGS
 
 #undef  RESTORE_REGS
 #define RESTORE_REGS() \
@@ -98,7 +52,6 @@ vm_exec_core(rb_thread_t *th, VALUE initial)
 #define GET_PC() (reg_pc)
 #undef  SET_PC
 #define SET_PC(x) (reg_cfp->pc = REG_PC = (x))
-#endif
 
 #if OPT_TOKEN_THREADED_CODE || OPT_DIRECT_THREADED_CODE
 #include "vmtc.inc"
Speedup ratio: compare with the result of `ruby 2.4.0dev (2016-03-27 trunk 54303) [x86_64-linux]' (greater is better)
name    built-ruby
loop_whileloop       1.016
vm1_attr_ivar*       0.991
vm1_attr_ivar_set*           0.976
vm1_block*           1.013
vm1_const*           0.924
vm1_ensure*          0.978
vm1_float_simple*            1.006
vm1_gc_short_lived*          1.011
vm1_gc_short_with_complex_long*      1.036
vm1_gc_short_with_long*      1.064
vm1_gc_short_with_symbol*            0.997
vm1_gc_wb_ary*       1.005
vm1_gc_wb_ary_promoted*      1.000
vm1_gc_wb_obj*       0.977
vm1_gc_wb_obj_promoted*      1.029
vm1_ivar*            1.054
vm1_ivar_set*        0.961
vm1_length*          1.019
vm1_lvar_init*       0.962
vm1_lvar_set*        0.991
vm1_neq*             0.976
vm1_not*             0.903
vm1_rescue*          0.983
vm1_simplereturn*            1.005
vm1_swap*            1.000
vm1_yield*           0.979
Additional example micro-benchmark:

BEFORE gcc 4.8:
 Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':

       7218.124555 task-clock (msec)         #    0.998 CPUs utilized
               123 context-switches          #    0.017 K/sec
                 2 cpu-migrations            #    0.000 K/sec
               906 page-faults               #    0.126 K/sec
       21374094581 cycles                    #    2.961 GHz
        4469895839 stalled-cycles-frontend   #   20.91% frontend cycles idle
   <not supported> stalled-cycles-backend
       55226298374 instructions              #    2.58  insns per cycle
                                             #    0.08  stalled cycles per insn
        7805291103 branches                  # 1081.346 M/sec
         200172514 branch-misses             #    2.56% of all branches

       7.230608341 seconds time elapsed

BEFORE gcc version 5.3.0 20151204 (Ubuntu 5.3.0-3ubuntu1~14.04):
 Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':

       8054.736236 task-clock (msec)         #    0.998 CPUs utilized
               128 context-switches          #    0.016 K/sec
                 2 cpu-migrations            #    0.000 K/sec
               895 page-faults               #    0.111 K/sec
       23776261112 cycles                    #    2.952 GHz
        7078686240 stalled-cycles-frontend   #   29.77% frontend cycles idle
   <not supported> stalled-cycles-backend
       53126508523 instructions              #    2.23  insns per cycle
                                             #    0.13  stalled cycles per insn
        7505454893 branches                  #  931.806 M/sec
         201181233 branch-misses             #    2.68% of all branches

       8.074872624 seconds time elapsed

AFTER gcc version 4.8.5 (Ubuntu 4.8.5-2ubuntu1~14.04.1):

 Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':

       7267.867318 task-clock (msec)         #    0.997 CPUs utilized
               169 context-switches          #    0.023 K/sec
                 1 cpu-migrations            #    0.000 K/sec
               899 page-faults               #    0.124 K/sec
       21563673390 cycles                    #    2.967 GHz
        4952119471 stalled-cycles-frontend   #   22.97% frontend cycles idle
   <not supported> stalled-cycles-backend
       53226715304 instructions              #    2.47  insns per cycle
                                             #    0.09  stalled cycles per insn
        7805365852 branches                  # 1073.955 M/sec
         200218594 branch-misses             #    2.57% of all branches

       7.286793973 seconds time elapsed

AFTER gcc version 5.3.0 20151204 (Ubuntu 5.3.0-3ubuntu1~14.04):

 Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':

       7146.899779 task-clock (msec)         #    0.998 CPUs utilized
               166 context-switches          #    0.023 K/sec
                 2 cpu-migrations            #    0.000 K/sec
               899 page-faults               #    0.126 K/sec
       21188099959 cycles                    #    2.965 GHz
        4839187155 stalled-cycles-frontend   #   22.84% frontend cycles idle
   <not supported> stalled-cycles-backend
       52525802838 instructions              #    2.48  insns per cycle
                                             #    0.09  stalled cycles per insn
        7505329721 branches                  # 1050.152 M/sec
         200175714 branch-misses             #    2.67% of all branches

       7.157645157 seconds time elapsed
Actions

Also available in: Atom PDF

Like0
Like0Like0Like0