Bug #17540
closedA segfault due to Clang/LLVM optimization on 32-bit ARM Linux
Description
When built with optflags=-O3
(which is the default), ruby -e "pp Thread.main"
causes a segfault, which seems to be worked around by the following change:
--- a/include/ruby/internal/fl_type.h
+++ b/include/ruby/internal/fl_type.h
@@ -231,7 +231,7 @@
RBIMPL_ATTR_PURE_UNLESS_DEBUG()
RBIMPL_ATTR_ARTIFICIAL()
static inline VALUE
-RB_FL_TEST_RAW(VALUE obj, VALUE flags)
+RB_FL_TEST_RAW(volatile VALUE obj, VALUE flags)
{
RBIMPL_ASSERT_OR_ASSUME(RB_FL_ABLE(obj));
return RBASIC(obj)->flags & flags;
There might be a bug in the optimizer of Clang/LLVM (version 11.0.1).
Updated by shyouhei (Shyouhei Urabe) about 4 years ago
Mmm… That sounds like an LLVM issue. At least in the latest C++, volatile
is deprecated. I don’t think that fix works for extension libraries written in that language.
Updated by xtkoba (Tee KOBAYASHI) almost 4 years ago
Here is an alternative workaround which inserts a memory barrier into the function rb_str_vcatf
from sprintf.c
:
--- a/sprintf.c
+++ b/sprintf.c
@@ -1227,6 +1227,7 @@
f._bf._base = (unsigned char *)str;
f._p = (unsigned char *)RSTRING_END(str);
klass = RBASIC(str)->klass;
+ __asm__ __volatile__ ("" : : : "memory");
RBASIC_CLEAR_CLASS(str);
f.vwrite = ruby__sfvwrite;
f.vextra = ruby__sfvextra;
Without the memory barrier, the flow of rb_str_vcatf
looks as follows:
Breakpoint 1, rb_str_vcatf (str=str@entry=1024669056, fmt=0x3fc64e95 " %s>", ap=...) at ../sprintf.c:1222
1222 StringValue(str);
1223 rb_str_modify(str);
1226 f._w = rb_str_capacity(str);
1225 f._bf._size = 0;
1224 f._flags = __SWR | __SSTR;
1226 f._w = rb_str_capacity(str);
1228 f._p = (unsigned char *)RSTRING_END(str);
1227 f._bf._base = (unsigned char *)str;
1228 f._p = (unsigned char *)RSTRING_END(str);
1226 f._w = rb_str_capacity(str);
1228 f._p = (unsigned char *)RSTRING_END(str);
1232 f.vwrite = ruby__sfvwrite;
1233 f.vextra = ruby__sfvextra;
1228 f._p = (unsigned char *)RSTRING_END(str);
1231 RBASIC_CLEAR_CLASS(str);
1232 f.vwrite = ruby__sfvwrite;
1233 f.vextra = ruby__sfvextra;
1228 f._p = (unsigned char *)RSTRING_END(str);
1229 klass = RBASIC(str)->klass;
1234 buffer.value = 0;
1233 f.vextra = ruby__sfvextra;
1235 BSD_vfprintf(&f, fmt, ap);
1232 f.vwrite = ruby__sfvwrite;
1235 BSD_vfprintf(&f, fmt, ap);
1236 RBASIC_SET_CLASS_RAW(str, klass);
1237 rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1236 RBASIC_SET_CLASS_RAW(str, klass);
1237 rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1240 return str;
(gdb) p *(struct RBasic *)str
$1 = {flags = 8197, klass = 0}
With the memory barrier, the flow becomes as follows:
Breakpoint 1, rb_str_vcatf (str=str@entry=1024669056, fmt=0x3fc64e95 " %s>", ap=...) at ../sprintf.c:1222
1222 StringValue(str);
1223 rb_str_modify(str);
1226 f._w = rb_str_capacity(str);
1225 f._bf._size = 0;
1224 f._flags = __SWR | __SSTR;
1226 f._w = rb_str_capacity(str);
1228 f._p = (unsigned char *)RSTRING_END(str);
1227 f._bf._base = (unsigned char *)str;
1228 f._p = (unsigned char *)RSTRING_END(str);
1226 f._w = rb_str_capacity(str);
1228 f._p = (unsigned char *)RSTRING_END(str);
1229 klass = RBASIC(str)->klass;
1230 __asm__ __volatile__ ("" : : : "memory");
1231 RBASIC_CLEAR_CLASS(str);
1232 f.vwrite = ruby__sfvwrite;
1233 f.vextra = ruby__sfvextra;
1232 f.vwrite = ruby__sfvwrite;
1231 RBASIC_CLEAR_CLASS(str);
1233 f.vextra = ruby__sfvextra;
1234 buffer.value = 0;
1233 f.vextra = ruby__sfvextra;
1232 f.vwrite = ruby__sfvwrite;
1235 BSD_vfprintf(&f, fmt, ap);
1236 RBASIC_SET_CLASS_RAW(str, klass);
1237 rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1236 RBASIC_SET_CLASS_RAW(str, klass);
1237 rb_str_resize(str, (char *)f._p - RSTRING_PTR(str));
1240 return str;
(gdb) p *(struct RBasic *)str
$1 = {flags = 8197, klass = 1062619968}
And yes, this looks pretty much a bug of Clang/LLVM to me for now. I'm going to create a minimal reproducing example to send to Clang/LLVM maintainers. One more push...
Updated by xtkoba (Tee KOBAYASHI) almost 4 years ago
MWE:
#include <stdarg.h>
#include <assert.h>
#ifndef WORKAROUND
#define WORKAROUND 0
#endif
typedef unsigned long VALUE;
static void RBASIC_SET_CLASS_RAW(VALUE obj, VALUE klass)
{
struct { VALUE flags; VALUE klass; } *ptr = (void *)obj;
ptr->klass = klass;
}
struct RBasic {
VALUE flags;
/*const*/ VALUE klass;
};
#define RBASIC(obj) ((struct RBasic *)(obj))
struct RString {
struct RBasic basic;
union {
struct { long len; char *ptr; } heap;
char ary[8];
} as;
};
/* dummy function */
#pragma clang optimize off
static int dummy_vfprintf(char *fmt, va_list ap) { return 0; }
#pragma clang optimize on
#define NOP __asm__ __volatile__ ("nop")
VALUE rb_str_catf(VALUE str, char *format, ...)
{
va_list ap;
__builtin_va_start(ap, format);
struct RString buf;
if (RBASIC(str)->flags & 0x2000) { NOP; }
else {
buf.as.heap.ptr = ((struct RString *)str)->as.ary;
}
if (__builtin_expect(!!(! buf.as.heap.ptr), 0)) { NOP; }
VALUE klass = RBASIC(str)->klass;
#if WORKAROUND
__asm__ __volatile__ ("" : : : "memory");
#endif
RBASIC_SET_CLASS_RAW(str, 0);
dummy_vfprintf(format, ap);
RBASIC_SET_CLASS_RAW(str, klass);
__builtin_va_end(ap);
return str;
}
int main()
{
struct RBasic obj = { .flags = 0, .klass = 1 };
rb_str_catf((VALUE)&obj, "");
assert(obj.klass == 1);
return 0;
}
The assertion fails when compiled with -DWORKAROUND=0
, and holds with -DWORKAROUND=1
.
Updated by xtkoba (Tee KOBAYASHI) almost 4 years ago
A ticket in LLVM Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=49384
Updated by alanwu (Alan Wu) almost 4 years ago
This seems like a strict aliasing issue. I'm curious if compiling with -fno-strict-aliasing
fixes it.
Updated by xtkoba (Tee KOBAYASHI) almost 4 years ago
alanwu (Alan Wu) Ah yes, it seems to fix the issue, at least for the MWE code. Thanks for the suggestion!
Updated by alanwu (Alan Wu) almost 4 years ago
If this fixes the Ruby crash too, maybe we should put -fno-strict-aliasing
as a default compilation option like the Linux kernel. It's very easy to make strict aliasing mistakes and they are too hard to spot...
Updated by shyouhei (Shyouhei Urabe) almost 4 years ago
I suspect d0e0c884bb4277e529adbd8d82aae0a651f7edf2 could be the cause of this issue.
Updated by xtkoba (Tee KOBAYASHI) almost 4 years ago
Compilation with -fno-strict-aliasing
seems to resolve the main issue of the ruby
crash. It would be nice if this compiler option is added by default, unless it causes a serious performance drawback.
shyouhei (Shyouhei Urabe) That change might have induced the issue, but I don't think it is the fundamental cause. The trouble would be that the type struct RBasic
has aliases. In this point of view, yet another workaround would be as follows (although I don't know whether the attribute may_alias
works with C++):
--- a/include/ruby/internal/core/rbasic.h
+++ b/include/ruby/internal/core/rbasic.h
@@ -66,7 +66,7 @@ RBasic {
{
}
#endif
-};
+} __attribute__ ((__may_alias__));
RBIMPL_SYMBOL_EXPORT_BEGIN()
VALUE rb_obj_hide(VALUE obj);
FYI, some searching told me that CPython had a similar issue until version 2.7 with its object representation in C (cf. https://bugs.python.org/issue30104).
Updated by shyouhei (Shyouhei Urabe) almost 4 years ago
This is my take:
From 7fb39b1138dfaa3a1502673ac82d6b75401e8f39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8D=9C=E9=83=A8=E6=98=8C=E5=B9=B3?=
<shyouhei@ruby-lang.org>
Date: Tue, 2 Mar 2021 15:22:22 +0900
Subject: [PATCH] fix strict aliasing
---
internal/object.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/internal/object.h b/internal/object.h
index aa820128c7..6a5dfcda63 100644
--- a/internal/object.h
+++ b/internal/object.h
@@ -47,8 +47,8 @@ MJIT_SYMBOL_EXPORT_END
static inline void
RBASIC_SET_CLASS_RAW(VALUE obj, VALUE klass)
{
- struct { VALUE flags; VALUE klass; } *ptr = (void *)obj;
- ptr->klass = klass;
+ VALUE *ptr =(/* const cast */VALUE *) & RBASIC(obj)->klass;
+ memcpy(ptr, &klass, sizeof(klass));
}
static inline void
--
2.17.1
I would like to reflain from fixing this issue by adding compiler-specific __asm__
or __attreibute__
, because that does not help everyone... The world is not built on top of LLVM. I don't want to kill support for other compiler infrastructures.
It sounds like a nice idea to have -fno-strict-aliasing
, though. That is just a matter of compiler flag. Must not hurt the codebase.
Updated by xtkoba (Tee KOBAYASHI) almost 4 years ago
shyouhei (Shyouhei Urabe) Thanks, it works. I have no objection to avoiding compiler-specific keywords, as long as things go well without them.
Updated by shyouhei (Shyouhei Urabe) almost 4 years ago
- Status changed from Open to Closed
Applied in changeset git|33dc0a070a515811e71fccbdc8cf0cd5a5dd784c.
RBASIC_SET_CLASS_RAW: follow strict aliasing rule
Instead of rather euphemistic struct cast, just reomve the const
qualifier and assign directly. According to ISO/IEC 9899:2018 section
6.5 paragraph 7, VALUE
and const VALUE
are allowed to alias (but two
distinct structs are not, even when their structures are the same).
[Bug #17540]