Actions
Feature #12225
closed: Remove inline assembler and always enable USE_MACHINE_REGS
Status:
Rejected
Assignee:
-
Target version:
-
Description
The current vm_exec.c keeps pc in an explicitly declared register in order to access the PC.
Since recent CPUs and compilers are very smart, we can expect them to optimize their use of registers on their own.
With the following patch, the benchmark results become as follows:
diff --git a/vm_exec.c b/vm_exec.c
index 5e4ff94..6f7c1ad 100644
--- a/vm_exec.c
+++ b/vm_exec.c
@@ -15,23 +15,6 @@
static void vm_analysis_insn(int insn);
#endif
-#if VMDEBUG > 0
-#define DECL_SC_REG(type, r, reg) register type reg_##r
-
-#elif defined(__GNUC__) && defined(__x86_64__)
-#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg)
-
-#elif defined(__GNUC__) && defined(__i386__)
-#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("e" reg)
-
-#elif defined(__GNUC__) && defined(__powerpc64__)
-#define DECL_SC_REG(type, r, reg) register type reg_##r __asm__("r" reg)
-
-#else
-#define DECL_SC_REG(type, r, reg) register type reg_##r
-#endif
-/* #define DECL_SC_REG(r, reg) VALUE reg_##r */
-
#if VM_DEBUG_STACKOVERFLOW
NORETURN(static void vm_stack_overflow_for_insn(void));
static void
@@ -49,41 +32,12 @@ vm_exec_core(rb_thread_t *th, VALUE initial)
{
#if OPT_STACK_CACHING
-#if 0
-#elif __GNUC__ && __x86_64__ && !defined(__native_client__)
- DECL_SC_REG(VALUE, a, "12");
- DECL_SC_REG(VALUE, b, "13");
-#else
register VALUE reg_a;
register VALUE reg_b;
#endif
-#endif
-#if defined(__GNUC__) && defined(__i386__)
- DECL_SC_REG(const VALUE *, pc, "di");
- DECL_SC_REG(rb_control_frame_t *, cfp, "si");
-#define USE_MACHINE_REGS 1
-
-#elif defined(__GNUC__) && defined(__x86_64__)
- DECL_SC_REG(const VALUE *, pc, "14");
-# if defined(__native_client__)
- DECL_SC_REG(rb_control_frame_t *, cfp, "13");
-# else
- DECL_SC_REG(rb_control_frame_t *, cfp, "15");
-# endif
-#define USE_MACHINE_REGS 1
-
-#elif defined(__GNUC__) && defined(__powerpc64__)
- DECL_SC_REG(const VALUE *, pc, "14");
- DECL_SC_REG(rb_control_frame_t *, cfp, "15");
-#define USE_MACHINE_REGS 1
-
-#else
register rb_control_frame_t *reg_cfp;
const VALUE *reg_pc;
-#endif
-
-#if USE_MACHINE_REGS
#undef RESTORE_REGS
#define RESTORE_REGS() \
@@ -98,7 +52,6 @@ vm_exec_core(rb_thread_t *th, VALUE initial)
#define GET_PC() (reg_pc)
#undef SET_PC
#define SET_PC(x) (reg_cfp->pc = REG_PC = (x))
-#endif
#if OPT_TOKEN_THREADED_CODE || OPT_DIRECT_THREADED_CODE
#include "vmtc.inc"
Speedup ratio: compare with the result of `ruby 2.4.0dev (2016-03-27 trunk 54303) [x86_64-linux]' (greater is better)
name built-ruby
loop_whileloop 1.016
vm1_attr_ivar* 0.991
vm1_attr_ivar_set* 0.976
vm1_block* 1.013
vm1_const* 0.924
vm1_ensure* 0.978
vm1_float_simple* 1.006
vm1_gc_short_lived* 1.011
vm1_gc_short_with_complex_long* 1.036
vm1_gc_short_with_long* 1.064
vm1_gc_short_with_symbol* 0.997
vm1_gc_wb_ary* 1.005
vm1_gc_wb_ary_promoted* 1.000
vm1_gc_wb_obj* 0.977
vm1_gc_wb_obj_promoted* 1.029
vm1_ivar* 1.054
vm1_ivar_set* 0.961
vm1_length* 1.019
vm1_lvar_init* 0.962
vm1_lvar_set* 0.991
vm1_neq* 0.976
vm1_not* 0.903
vm1_rescue* 0.983
vm1_simplereturn* 1.005
vm1_swap* 1.000
vm1_yield* 0.979
Additional example micro-benchmark:
BEFORE gcc 4.8:
Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':
7218.124555 task-clock (msec) # 0.998 CPUs utilized
123 context-switches # 0.017 K/sec
2 cpu-migrations # 0.000 K/sec
906 page-faults # 0.126 K/sec
21374094581 cycles # 2.961 GHz
4469895839 stalled-cycles-frontend # 20.91% frontend cycles idle
<not supported> stalled-cycles-backend
55226298374 instructions # 2.58 insns per cycle
# 0.08 stalled cycles per insn
7805291103 branches # 1081.346 M/sec
200172514 branch-misses # 2.56% of all branches
7.230608341 seconds time elapsed
BEFORE gcc version 5.3.0 20151204 (Ubuntu 5.3.0-3ubuntu1~14.04):
Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':
8054.736236 task-clock (msec) # 0.998 CPUs utilized
128 context-switches # 0.016 K/sec
2 cpu-migrations # 0.000 K/sec
895 page-faults # 0.111 K/sec
23776261112 cycles # 2.952 GHz
7078686240 stalled-cycles-frontend # 29.77% frontend cycles idle
<not supported> stalled-cycles-backend
53126508523 instructions # 2.23 insns per cycle
# 0.13 stalled cycles per insn
7505454893 branches # 931.806 M/sec
201181233 branch-misses # 2.68% of all branches
8.074872624 seconds time elapsed
AFTER gcc version 4.8.5 (Ubuntu 4.8.5-2ubuntu1~14.04.1):
Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':
7267.867318 task-clock (msec) # 0.997 CPUs utilized
169 context-switches # 0.023 K/sec
1 cpu-migrations # 0.000 K/sec
899 page-faults # 0.124 K/sec
21563673390 cycles # 2.967 GHz
4952119471 stalled-cycles-frontend # 22.97% frontend cycles idle
<not supported> stalled-cycles-backend
53226715304 instructions # 2.47 insns per cycle
# 0.09 stalled cycles per insn
7805365852 branches # 1073.955 M/sec
200218594 branch-misses # 2.57% of all branches
7.286793973 seconds time elapsed
AFTER gcc version 5.3.0 20151204 (Ubuntu 5.3.0-3ubuntu1~14.04):
Performance counter stats for './miniruby -e@v=42; n=100_000_000;while n>0; x=x|x; x=x|x;n-=1;end':
7146.899779 task-clock (msec) # 0.998 CPUs utilized
166 context-switches # 0.023 K/sec
2 cpu-migrations # 0.000 K/sec
899 page-faults # 0.126 K/sec
21188099959 cycles # 2.965 GHz
4839187155 stalled-cycles-frontend # 22.84% frontend cycles idle
<not supported> stalled-cycles-backend
52525802838 instructions # 2.48 insns per cycle
# 0.09 stalled cycles per insn
7505329721 branches # 1050.152 M/sec
200175714 branch-misses # 2.67% of all branches
7.157645157 seconds time elapsed
Actions
Like0
Like0Like0Like0