Project

General

Profile

Feature #22094 » 0001-Speed-up-Array-join-with-a-byte-copy-fast-path.patch

yaroslavmarkin (Yaroslav Markin), 06/07/2026 04:16 PM

View differences:

array.c
#include "internal/object.h"
#include "internal/proc.h"
#include "internal/rational.h"
#include "internal/string.h"
#include "internal/vm.h"
#include "probes.h"
#include "ruby/encoding.h"
......
VALUE
rb_ary_join(VALUE ary, VALUE sep)
{
long len = 1, i;
long len = 1, i, n = RARRAY_LEN(ary);
VALUE val, tmp, result;
if (RARRAY_LEN(ary) == 0) return rb_usascii_str_new(0, 0);
if (n == 0) return rb_usascii_str_new(0, 0);
int sep_cr = ENC_CODERANGE_7BIT, sep_enc = 0;
if (!NIL_P(sep)) {
StringValue(sep);
len += RSTRING_LEN(sep) * (RARRAY_LEN(ary) - 1);
len += RSTRING_LEN(sep) * (n - 1);
sep_cr = rb_enc_str_coderange(sep);
sep_enc = ENCODING_GET(sep);
}
long len_memo = RARRAY_LEN(ary);
for (i=0; i < len_memo; i++) {
/* Measure the result length and note whether every part is 7-bit ASCII. A
non-String element switches to the general per-element append path. */
int ascii_only = NIL_P(sep) || sep_cr == ENC_CODERANGE_7BIT;
long len_memo = n;
for (i = 0; i < len_memo; i++) {
val = RARRAY_AREF(ary, i);
if (RB_UNLIKELY(!RB_TYPE_P(val, T_STRING))) {
tmp = rb_check_string_type(val);
if (NIL_P(tmp) || tmp != val) {
int first;
long n = RARRAY_LEN(ary);
if (i > n) i = n;
result = rb_str_buf_new(len + (n-i)*10);
long m = RARRAY_LEN(ary);
if (i > m) i = m;
result = rb_str_buf_new(len + (m - i) * 10);
rb_enc_associate(result, rb_usascii_encoding());
i = ary_join_0(ary, sep, i, result);
first = i == 0;
first = (i == 0);
ary_join_1(ary, ary, sep, i, result, &first);
return result;
}
len += RSTRING_LEN(tmp);
len_memo = RARRAY_LEN(ary);
ascii_only = 0;
}
else {
len += RSTRING_LEN(val);
if (ascii_only && ENC_CODERANGE(val) != ENC_CODERANGE_7BIT) ascii_only = 0;
}
}
n = RARRAY_LEN(ary);
result = rb_str_new(0, len);
rb_str_set_len(result, 0);
/* Fast path: build the result by copying raw bytes (no per-append bookkeeping)
when no encoding negotiation can occur. That holds when every part is 7-bit
ASCII, or when every element shares one ASCII-compatible single-byte-terminated
encoding and the separator is compatible. The result takes element 0's
encoding; result_cr is its code range, left UNKNOWN to compute lazily. */
int enc = ENCODING_GET(RARRAY_AREF(ary, 0));
int result_cr = ENC_CODERANGE_7BIT;
int copyable = ascii_only;
if (!copyable && rb_str_encindex_fastpath(enc) &&
(n <= 1 || NIL_P(sep) || sep_enc == enc || sep_cr == ENC_CODERANGE_7BIT)) {
/* Every element must share enc; merge code ranges (VALID if any part is
clean multibyte, UNKNOWN if any is UNKNOWN/BROKEN -> compute lazily). */
int valid = 0, uncertain = 0;
copyable = 1;
for (i = 0; i < n; i++) {
val = RARRAY_AREF(ary, i);
if (ENCODING_GET(val) != enc) { copyable = 0; break; }
int c = ENC_CODERANGE(val);
if (c == ENC_CODERANGE_VALID) valid = 1;
else if (c != ENC_CODERANGE_7BIT) uncertain = 1;
}
if (copyable) {
if (n > 1 && !NIL_P(sep)) {
if (sep_cr == ENC_CODERANGE_VALID) valid = 1;
else if (sep_cr != ENC_CODERANGE_7BIT) uncertain = 1;
}
result_cr = uncertain ? ENC_CODERANGE_UNKNOWN :
(valid ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT);
}
}
ary_join_0(ary, sep, RARRAY_LEN(ary), result);
if (copyable) {
result = rb_str_buf_new(len);
rb_enc_associate_index(result, enc);
const char *sep_ptr = NIL_P(sep) ? NULL : RSTRING_PTR(sep);
long sep_len = NIL_P(sep) ? 0 : RSTRING_LEN(sep);
char *const buf = RSTRING_PTR(result);
char *p = buf;
for (i = 0; i < n; i++) {
val = RARRAY_AREF(ary, i);
long vlen = RSTRING_LEN(val);
if (i > 0 && sep_len) {
memcpy(p, sep_ptr, sep_len);
p += sep_len;
}
if (vlen) {
memcpy(p, RSTRING_PTR(val), vlen);
p += vlen;
}
}
ENC_CODERANGE_CLEAR(result); /* so rb_str_set_len does not rescan the bytes */
rb_str_set_len(result, p - buf);
if (result_cr != ENC_CODERANGE_UNKNOWN) ENC_CODERANGE_SET(result, result_cr);
return result;
}
/* General path: per-element append with full encoding handling. */
result = rb_str_new(0, len);
rb_str_set_len(result, 0);
ary_join_0(ary, sep, n, result);
return result;
}
benchmark/array_join.yml
prelude: |
# All elements are 7-bit ASCII Strings (the dominant real-world join case).
# Distinct objects so large-N cases exercise realistic heap/cache behavior.
a100_1 = Array.new(100) { "a" * 1 }
a100_8 = Array.new(100) { "a" * 8 }
a100_40 = Array.new(100) { "a" * 40 }
a1k_1 = Array.new(1000) { "a" * 1 }
a1k_8 = Array.new(1000) { "a" * 8 }
a1k_40 = Array.new(1000) { "a" * 40 }
a100k_1 = Array.new(100000) { "a" * 1 }
a100k_8 = Array.new(100000) { "a" * 8 }
a100k_40 = Array.new(100000) { "a" * 40 }
benchmark:
# separator " " (space)
join_sp_n100_e1: a100_1.join(" ")
join_sp_n100_e8: a100_8.join(" ")
join_sp_n100_e40: a100_40.join(" ")
join_sp_n1k_e1: a1k_1.join(" ")
join_sp_n1k_e8: a1k_8.join(" ")
join_sp_n1k_e40: a1k_40.join(" ")
join_sp_n100k_e1: a100k_1.join(" ")
join_sp_n100k_e8: a100k_8.join(" ")
join_sp_n100k_e40: a100k_40.join(" ")
# separator "\n" (newline)
join_nl_n100_e1: a100_1.join("\n")
join_nl_n100_e8: a100_8.join("\n")
join_nl_n100_e40: a100_40.join("\n")
join_nl_n1k_e1: a1k_1.join("\n")
join_nl_n1k_e8: a1k_8.join("\n")
join_nl_n1k_e40: a1k_40.join("\n")
join_nl_n100k_e1: a100k_1.join("\n")
join_nl_n100k_e8: a100k_8.join("\n")
join_nl_n100k_e40: a100k_40.join("\n")
benchmark/array_join_regression.yml
prelude: |
# --- must NOT take any fast path: prove added precompute-loop checks don't regress ---
# all multibyte UTF-8 (VALID coderange). 7-bit gate bails at elem 0; same-encoding gate TAKES it.
mb_utf8 = Array.new(1000) { "café" }
# ASCII except a trailing multibyte elem: WORST CASE for 7-bit gate (scans all, then falls back).
tail_mb = Array.new(1000) { "a" * 8 }; tail_mb[-1] = "café"
# non-String elements -> ary_join_1 (to_s). Gate bails at type check before coderange.
nonstr_int = Array.new(1000) { |i| i }
# ASCII strings except a trailing Integer: precompute scans strings, then mixed fallback.
mixed_tail = (Array.new(999) { "a" * 8 } << 42)
# UTF-16LE: not ASCII-compatible. Both gates bail immediately.
utf16 = Array.new(1000) { "ab".encode("UTF-16LE") }
# nested arrays -> recursive fallback.
nested = Array.new(500) { [1, 2] }
# 7-bit ASCII elements but a multibyte separator (3-byte em dash). Both gates bail at sep check.
ascii_elems = Array.new(1000) { "a" * 8 }
sep_mb = "—"
# --- fast-path ELIGIBLE variants: confirm benefit generalizes beyond single chars / find crossover ---
# frozen ASCII elements (read-only path).
frozen_elems = Array.new(1000) { ("a" * 8).freeze }
# large elements: where memcpy should dominate and the win decays toward 1x.
big_e256 = Array.new(1000) { "a" * 256 }
big_e1k = Array.new(1000) { "a" * 1024 }
big_e4k = Array.new(1000) { "a" * 4096 }
benchmark:
# regression (fallback / gate worst-case) — all use a 1-byte ASCII separator unless noted
reg_multibyte_utf8: mb_utf8.join(" ")
reg_tail_multibyte: tail_mb.join(" ")
reg_nonstring_int: nonstr_int.join(" ")
reg_mixed_tail_int: mixed_tail.join(" ")
reg_utf16: utf16.join(" ".encode("UTF-16LE"))
reg_nested: nested.join(" ")
reg_multibyte_sep: ascii_elems.join(sep_mb)
# fast-path eligible variants
fp_frozen: frozen_elems.join(" ")
fp_nosep: ascii_elems.join
fp_multichar_sep: ascii_elems.join(", ")
fp_big_e256: big_e256.join(" ")
fp_big_e1k: big_e1k.join(" ")
fp_big_e4k: big_e4k.join(" ")
test/ruby/test_array.rb
assert_equal("b", x.ary.join(""))
end
def test_join_encoding
# Multibyte UTF-8 elements: result is UTF-8, valid, not ASCII-only.
r = @cls["caf\u00e9", "na\u00efve"].join(" ")
assert_equal("caf\u00e9 na\u00efve", r)
assert_equal(Encoding::UTF_8, r.encoding)
assert_equal(true, r.valid_encoding?)
assert_equal(false, r.ascii_only?)
# All 7-bit content stays ASCII-only.
r = @cls["abc", "def"].join(",")
assert_equal("abc,def", r)
assert_equal(true, r.ascii_only?)
# Multibyte separator (same encoding as the elements).
r = @cls["a", "b", "c"].join("\u2014")
assert_equal("a\u2014b\u2014c", r)
assert_equal(Encoding::UTF_8, r.encoding)
assert_equal(false, r.ascii_only?)
assert_equal(true, r.valid_encoding?)
# Mixed ASCII-compatible encodings, all 7-bit: result takes element 0's encoding.
r = @cls["abc".dup.force_encoding("US-ASCII"), "def".dup.force_encoding("UTF-8")].join(" ")
assert_equal("abc def", r)
assert_equal(Encoding::US_ASCII, r.encoding)
assert_equal(true, r.ascii_only?)
# 7-bit content in a non-ASCII encoding (Shift_JIS).
r = @cls["abc".encode("Shift_JIS"), "def".encode("Shift_JIS")].join(" ")
assert_equal(Encoding::Shift_JIS, r.encoding)
assert_equal("abc def".b, r.b)
# Same-encoding multibyte (Shift_JIS): byte-for-byte concatenation.
s = "\u65e5\u672c".encode("Shift_JIS")
sep = "/".encode("Shift_JIS")
r = @cls[s, s].join(sep)
assert_equal(Encoding::Shift_JIS, r.encoding)
assert_equal((s + sep + s).b, r.b)
# Broken bytes: result keeps them and reports invalid.
bad = "\xff\xfe".dup.force_encoding("UTF-8")
r = @cls[bad, bad].join(" ")
assert_equal((bad + " " + bad).b, r.b)
assert_equal(false, r.valid_encoding?)
# Incompatible element/separator encodings still raise.
assert_raise(Encoding::CompatibilityError) do
@cls["a".dup.force_encoding("UTF-8"), "b".encode("UTF-16LE")].join(" ")
end
# Bulk copy: large array, verified against a non-join oracle.
big = @cls.new(500) { "ab" }
assert_equal(("ab," * 500).chomp(","), big.join(","))
end
def test_to_a2
klass = Class.new(Array)
a = klass.new.to_a
    (1-1/1)