Project

General

Profile

Bug #17540

A segfault due to Clang/LLVM optimization on 32-bit ARM Linux

Added by xtkoba (Tee KOBAYASHI) about 2 months ago. Updated 5 days ago.

Status:
Closed
Priority:
Normal
Assignee:
-
Target version:
-
ruby -v:
ruby 3.0.0p0 (2020-12-25 revision 95aff21468) [armv7a-linux-eabi]
[ruby-core:102083]

Description

When built with optflags=-O3 (which is the default), ruby -e "pp Thread.main" causes a segfault, which seems to be worked around by the following change:

--- a/include/ruby/internal/fl_type.h
+++ b/include/ruby/internal/fl_type.h
@@ -231,7 +231,7 @@
 RBIMPL_ATTR_PURE_UNLESS_DEBUG()
 RBIMPL_ATTR_ARTIFICIAL()
 static inline VALUE
-RB_FL_TEST_RAW(VALUE obj, VALUE flags)
+RB_FL_TEST_RAW(volatile VALUE obj, VALUE flags)
 {
     RBIMPL_ASSERT_OR_ASSUME(RB_FL_ABLE(obj));
     return RBASIC(obj)->flags & flags;

There might be a bug in the optimizer of Clang/LLVM (version 11.0.1).

Updated by shyouhei (Shyouhei Urabe) about 2 months ago

Mmm… That sounds like an LLVM issue. At least in the latest C++, volatile is deprecated. I don’t think that fix works for extension libraries written in that language.

cf: http://wg21.link/P1152

Updated by xtkoba (Tee KOBAYASHI) 7 days ago

Here is an alternative workaround which inserts a memory barrier into the function rb_str_vcatf from sprintf.c:

--- a/sprintf.c
+++ b/sprintf.c
@@ -1227,6 +1227,7 @@
     f._bf._base = (unsigned char *)str;
     f._p = (unsigned char *)RSTRING_END(str);
     klass = RBASIC(str)->klass;
+    __asm__ __volatile__ ("" : : : "memory");
     RBASIC_CLEAR_CLASS(str);
     f.vwrite = ruby__sfvwrite;
     f.vextra = ruby__sfvextra;

Without the memory barrier, the flow of rb_str_vcatf looks as follows:

Breakpoint 1, rb_str_vcatf (str=str@entry=1024669056, fmt=0x3fc64e95 " %s>", ap=...) at ../sprintf.c:1222
1222        StringValue(str);
1223        rb_str_modify(str);
1226        f._w = rb_str_capacity(str);
1225        f._bf._size = 0;
1224        f._flags = __SWR | __SSTR;
1226        f._w = rb_str_capacity(str);
1228        f._p = (unsigned char *)RSTRING_END(str);
1227        f._bf._base = (unsigned char *)str;
1228        f._p = (unsigned char *)RSTRING_END(str);
1226        f._w = rb_str_capacity(str);
1228        f._p = (unsigned char *)RSTRING_END(str);
1232        f.vwrite = ruby__sfvwrite;
1233        f.vextra = ruby__sfvextra;
1228        f._p = (unsigned char *)RSTRING_END(str);
1231        RBASIC_CLEAR_CLASS(str);
1232        f.vwrite = ruby__sfvwrite;
1233        f.vextra = ruby__sfvextra;
1228        f._p = (unsigned char *)RSTRING_END(str);
1229        klass = RBASIC(str)->klass;
1234        buffer.value = 0;
1233        f.vextra = ruby__sfvextra;
1235        BSD_vfprintf(&f, fmt, ap);
1232        f.vwrite = ruby__sfvwrite;
1235        BSD_vfprintf(&f, fmt, ap);
1236        RBASIC_SET_CLASS_RAW(str, klass);
1237        rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1236        RBASIC_SET_CLASS_RAW(str, klass);
1237        rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1240        return str;
(gdb) p *(struct RBasic *)str
$1 = {flags = 8197, klass = 0}

With the memory barrier, the flow becomes as follows:

Breakpoint 1, rb_str_vcatf (str=str@entry=1024669056, fmt=0x3fc64e95 " %s>", ap=...) at ../sprintf.c:1222
1222        StringValue(str);
1223        rb_str_modify(str);
1226        f._w = rb_str_capacity(str);
1225        f._bf._size = 0;
1224        f._flags = __SWR | __SSTR;
1226        f._w = rb_str_capacity(str);
1228        f._p = (unsigned char *)RSTRING_END(str);
1227        f._bf._base = (unsigned char *)str;
1228        f._p = (unsigned char *)RSTRING_END(str);
1226        f._w = rb_str_capacity(str);
1228        f._p = (unsigned char *)RSTRING_END(str);
1229        klass = RBASIC(str)->klass;
1230        __asm__ __volatile__ ("" : : : "memory");
1231        RBASIC_CLEAR_CLASS(str);
1232        f.vwrite = ruby__sfvwrite;
1233        f.vextra = ruby__sfvextra;
1232        f.vwrite = ruby__sfvwrite;
1231        RBASIC_CLEAR_CLASS(str);
1233        f.vextra = ruby__sfvextra;
1234        buffer.value = 0;
1233        f.vextra = ruby__sfvextra;
1232        f.vwrite = ruby__sfvwrite;
1235        BSD_vfprintf(&f, fmt, ap);
1236        RBASIC_SET_CLASS_RAW(str, klass);
1237        rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1236        RBASIC_SET_CLASS_RAW(str, klass);
1237        rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1240        return str;
(gdb) p *(struct RBasic *)str
$1 = {flags = 8197, klass = 1062619968}

And yes, this looks pretty much a bug of Clang/LLVM to me for now. I'm going to create a minimal reproducing example to send to Clang/LLVM maintainers. One more push...

Updated by xtkoba (Tee KOBAYASHI) 6 days ago

MWE:

#include <stdarg.h>
#include <assert.h>

#ifndef WORKAROUND
#define WORKAROUND 0
#endif

typedef unsigned long VALUE;

static void RBASIC_SET_CLASS_RAW(VALUE obj, VALUE klass)
{
    struct { VALUE flags; VALUE klass; } *ptr = (void *)obj;
    ptr->klass = klass;
}

struct RBasic {
    VALUE flags;
    /*const*/ VALUE klass;
};

#define RBASIC(obj) ((struct RBasic *)(obj))

struct RString {
    struct RBasic basic;
    union {
        struct { long len; char *ptr; } heap;
        char ary[8];
    } as;
};

/* dummy function */
#pragma clang optimize off
static int dummy_vfprintf(char *fmt, va_list ap) { return 0; }
#pragma clang optimize on

#define NOP __asm__ __volatile__ ("nop")

VALUE rb_str_catf(VALUE str, char *format, ...)
{
    va_list ap;
    __builtin_va_start(ap, format);
    struct RString buf;
    if (RBASIC(str)->flags & 0x2000) { NOP; }
    else {
        buf.as.heap.ptr = ((struct RString *)str)->as.ary;
    }
    if (__builtin_expect(!!(! buf.as.heap.ptr), 0)) { NOP; }
    VALUE klass = RBASIC(str)->klass;
#if WORKAROUND
    __asm__ __volatile__ ("" : : : "memory");
#endif
    RBASIC_SET_CLASS_RAW(str, 0);
    dummy_vfprintf(format, ap);
    RBASIC_SET_CLASS_RAW(str, klass);
    __builtin_va_end(ap);
    return str;
}

int main()
{
    struct RBasic obj = { .flags = 0, .klass = 1 };
    rb_str_catf((VALUE)&obj, "");
    assert(obj.klass == 1);
    return 0;
}

The assertion fails when compiled with -DWORKAROUND=0, and holds with -DWORKAROUND=1.

Updated by alanwu (Alan Wu) 5 days ago

This seems like a strict aliasing issue. I'm curious if compiling with -fno-strict-aliasing fixes it.

Updated by xtkoba (Tee KOBAYASHI) 5 days ago

alanwu (Alan Wu) Ah yes, it seems to fix the issue, at least for the MWE code. Thanks for the suggestion!

Updated by alanwu (Alan Wu) 5 days ago

If this fixes the Ruby crash too, maybe we should put -fno-strict-aliasing as a default compilation option like the Linux kernel. It's very easy to make strict aliasing mistakes and they are too hard to spot...

Updated by shyouhei (Shyouhei Urabe) 5 days ago

I suspect d0e0c884bb4277e529adbd8d82aae0a651f7edf2 could be the cause of this issue.

Updated by xtkoba (Tee KOBAYASHI) 5 days ago

Compilation with -fno-strict-aliasing seems to resolve the main issue of the ruby crash. It would be nice if this compiler option is added by default, unless it causes a serious performance drawback.

shyouhei (Shyouhei Urabe) That change might have induced the issue, but I don't think it is the fundamental cause. The trouble would be that the type struct RBasic has aliases. In this point of view, yet another workaround would be as follows (although I don't know whether the attribute may_alias works with C++):

--- a/include/ruby/internal/core/rbasic.h
+++ b/include/ruby/internal/core/rbasic.h
@@ -66,7 +66,7 @@ RBasic {
     {
     }
 #endif
-};
+} __attribute__ ((__may_alias__));

 RBIMPL_SYMBOL_EXPORT_BEGIN()
 VALUE rb_obj_hide(VALUE obj);

FYI, some searching told me that CPython had a similar issue until version 2.7 with its object representation in C (cf. https://bugs.python.org/issue30104).

Updated by shyouhei (Shyouhei Urabe) 5 days ago

This is my take:

From 7fb39b1138dfaa3a1502673ac82d6b75401e8f39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9C=E9=83=A8=E6=98=8C=E5=B9=B3?=
 <shyouhei@ruby-lang.org>
Date: Tue, 2 Mar 2021 15:22:22 +0900
Subject: [PATCH] fix strict aliasing

---
 internal/object.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/internal/object.h b/internal/object.h
index aa820128c7..6a5dfcda63 100644
--- a/internal/object.h
+++ b/internal/object.h
@@ -47,8 +47,8 @@ MJIT_SYMBOL_EXPORT_END
 static inline void
 RBASIC_SET_CLASS_RAW(VALUE obj, VALUE klass)
 {
-    struct { VALUE flags; VALUE klass; } *ptr = (void *)obj;
-    ptr->klass = klass;
+    VALUE *ptr =(/* const cast */VALUE *) & RBASIC(obj)->klass;
+    memcpy(ptr, &klass, sizeof(klass));
 }

 static inline void
-- 
2.17.1

I would like to reflain from fixing this issue by adding compiler-specific __asm__ or __attreibute__, because that does not help everyone... The world is not built on top of LLVM. I don't want to kill support for other compiler infrastructures.

It sounds like a nice idea to have -fno-strict-aliasing, though. That is just a matter of compiler flag. Must not hurt the codebase.

Updated by xtkoba (Tee KOBAYASHI) 5 days ago

shyouhei (Shyouhei Urabe) Thanks, it works. I have no objection to avoiding compiler-specific keywords, as long as things go well without them.

#12

Updated by shyouhei (Shyouhei Urabe) 5 days ago

  • Status changed from Open to Closed

Applied in changeset git|33dc0a070a515811e71fccbdc8cf0cd5a5dd784c.


RBASIC_SET_CLASS_RAW: follow strict aliasing rule

Instead of rather euphemistic struct cast, just reomve the const
qualifier and assign directly. According to ISO/IEC 9899:2018 section
6.5 paragraph 7, VALUE and const VALUE are allowed to alias (but two
distinct structs are not, even when their structures are the same).
[Bug #17540]

Also available in: Atom PDF