Project

General

Profile

Actions

Bug #20629

closed

Ruby infinite loop when raising+rescuing inside a RUBY_INTERNAL_EVENT_NEWOBJ

Added by ivoanjo (Ivo Anjo) 6 days ago. Updated 6 days ago.

Status:
Closed
Assignee:
-
Target version:
-
ruby -v:
ruby 3.3.4 (2024-07-09 revision be1089c8ec) [x86_64-linux]
[ruby-core:118561]

Description

Background

Hey! While working on the Datadog Ruby Profiler I ran into a weird situation where the Ruby VM locks up as it gets stuck in an infinite loop in vm_sync.c:

	        while (recorded_lock_rec < current_lock_rec) {
	            RB_VM_LOCK_LEAVE_LEV(&current_lock_rec);
	        }

...because leave is this:

	static inline void
	rb_vm_lock_leave(unsigned int *lev, const char *file, int line)
	{
	    if (rb_multi_ractor_p()) {
	        rb_vm_lock_leave_body(lev APPEND_LOCATION_PARAMS);
	    }
	}

...and rb_multi_ractor_p() is always false as I'm not using Ractors.

How to reproduce

(I include a zip with all the files needed to reproduce this, no need to copy-paste from here!)

Inside a native extension, I'm creating a RUBY_INTERNAL_EVENT_NEWOBJ. Inside it, I'm raising and catching an exception, to simulate a (handled) bug in our code:

#include <ruby.h>
#include <ruby/thread.h>
#include <ruby/thread_native.h>
#include <ruby/debug.h>

static VALUE raise_exception(VALUE unused) {
  fprintf(stderr, "Raising!\n");
  rb_raise(rb_eRuntimeError, "This is a test!");
}

static VALUE ignore_failure(VALUE self_instance, VALUE exception) {
  fprintf(stderr, "Caught!\n");
  return Qnil;
}

static void on_newobj_event(VALUE tracepoint_data, void *unused) {
  fprintf(stderr, "Inside on_newobj_event\n");
  // VALUE klass = rb_class_of(rb_tracearg_object(rb_tracearg_from_tracepoint(tracepoint_data)));
  // rb_p(klass);
  rb_rescue2(
    raise_exception,
    Qnil,
    ignore_failure,
    Qnil,
    rb_eException, // rb_eException is the base class of all Ruby exceptions
    0 // Required by API to be the last argument
  );
  fprintf(stderr, "Finished on_newobj_event\n");
}

static VALUE install_tracepoint(VALUE unused) {
  VALUE tp = rb_tracepoint_new(Qnil, RUBY_INTERNAL_EVENT_NEWOBJ, on_newobj_event, NULL /* unused */);
  rb_tracepoint_enable(tp);
  fprintf(stderr, "Tracepoint installed!\n");
  return Qnil;
}

void Init_exception_bug_repro(void) {
  rb_define_singleton_method(rb_define_module("Repro"), "install_tracepoint", install_tracepoint, 0);
}

And my test Ruby script is:

require_relative './exception_bug_repro'

Repro.install_tracepoint

while true
  begin
    raise('hello')
  rescue
    print '.'
  end
end

Here's how running it looks:

$ bundle exec rake clean compile
$ bundle exec ruby testcase.rb
Tracepoint installed!
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
Caught!
Finished on_newobj_event
Inside on_newobj_event
Raising!
# <--- Infinite loop

That is, we should always see the raise -> caught, but sometimes the raise doesn't go very well, and from then Ruby eventually locks up.

Here's the full backtrace from gdb:

#0  rb_multi_ractor_p () at ruby-3_3/vm_sync.h:39
#1  0x00005fdde2534665 in rb_vm_lock_leave (lev=0x7ffc2f1b5f70, file=0x5fdde2783bb5 "vm_sync.c", line=231) at ruby-3_3/vm_sync.h:92
#2  0x00005fdde2534b28 in rb_ec_vm_lock_rec_release (ec=0x5fdde3e9b570, recorded_lock_rec=0, current_lock_rec=1) at vm_sync.c:231
#3  0x00005fdde23088f2 in rb_ec_vm_lock_rec_check (ec=0x5fdde3e9b570, recorded_lock_rec=0) at ruby-3_3/eval_intern.h:144
#4  0x00005fdde2308937 in rb_ec_tag_state (ec=0x5fdde3e9b570) at ruby-3_3/eval_intern.h:155
#5  0x00005fdde230c71e in setup_exception (ec=0x5fdde3e9b570, tag=6, mesg=126196681963080, cause=4) at eval.c:564
#6  0x00005fdde230cdc5 in rb_longjmp (ec=0x5fdde3e9b570, tag=6, mesg=126196681963080, cause=4) at eval.c:660
#7  0x00005fdde230d398 in rb_raise_jump (mesg=126196681963080, cause=36) at eval.c:853
#8  0x00005fdde230d0cc in rb_f_raise (argc=1, argv=0x72c690294048) at eval.c:752
#9  0x00005fdde230d0f4 in f_raise (c=1, v=0x72c690294048, _=126197147684320) at eval.c:789
#10 0x00005fdde2502d24 in ractor_safe_call_cfunc_m1 (recv=126197147684320, argc=1, argv=0x72c690294048, func=0x5fdde230d0cc <f_raise>) at ruby-3_3/vm_insnhelper.c:3311
#11 0x00005fdde250395b in vm_call_cfunc_with_frame_ (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0, argc=1, argv=0x72c690294048, stack_bottom=0x72c690294040)
    at ruby-3_3/vm_insnhelper.c:3502
#12 0x00005fdde2503b30 in vm_call_cfunc_with_frame (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3530
#13 0x00005fdde2503c48 in vm_call_cfunc_other (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3556
#14 0x00005fdde2504070 in vm_call_cfunc (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:3638
#15 0x00005fdde2506b9f in vm_call_method_each_type (ec=0x5fdde3e9b570, cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4429
#16 0x00005fdde25076e0 in vm_call_method (ec=0x5fdde3e9b570, cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4566
#17 0x00005fdde250784d in vm_call_general (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, calling=0x7ffc2f1b66a0) at ruby-3_3/vm_insnhelper.c:4599
#18 0x00005fdde2509ed2 in vm_sendish (ec=0x5fdde3e9b570, reg_cfp=0x72c690393fa0, cd=0x5fdde3f4f7e0, block_handler=0, method_explorer=mexp_search_method)
    at ruby-3_3/vm_insnhelper.c:5593
#19 0x00005fdde25114b9 in vm_exec_core (ec=0x5fdde3e9b570) at ruby-3_3/insns.def:834
#20 0x00005fdde252829b in rb_vm_exec (ec=0x5fdde3e9b570) at vm.c:2486
#21 0x00005fdde25290fa in rb_iseq_eval_main (iseq=0x72c674667ad0) at vm.c:2752
#22 0x00005fdde230be59 in rb_ec_exec_node (ec=0x5fdde3e9b570, n=0x72c674667ad0) at eval.c:287
#23 0x00005fdde230bfd6 in ruby_run_node (n=0x72c674667ad0) at eval.c:328
#24 0x00005fdde2307245 in rb_main (argc=2, argv=0x7ffc2f1b7618) at ./main.c:39
#25 0x00005fdde23072bd in main (argc=2, argv=0x7ffc2f1b7618) at ./main.c:58

In this minimal reproducer, usually it's when allocating a Thread::Backtrace object that triggers the issue, but on the original codebase where I was experimenting I saw this issue even with other objects (I think??).

To be honest, maybe the problem is on the extension side -- this code assumes it's ok to raise/rescue in the RUBY_INTERNAL_EVENT_NEWOBJ tracepoint as it usually works fine, but maybe it's not?


Files

exception-bug-repo.zip (3.79 KB) exception-bug-repo.zip ivoanjo (Ivo Anjo), 07/11/2024 11:46 AM
Actions

Also available in: Atom PDF

Like0
Like1Like0