0001-Speed-up-Array-join-with-a-byte-copy-fast-path.patch - Ruby - Ruby Issue Tracking System

     #include "internal/object.h"
     #include "internal/proc.h"
     #include "internal/rational.h"
     #include "internal/string.h"
     #include "internal/vm.h"
     #include "probes.h"
     #include "ruby/encoding.h"
-...
     VALUE
     rb_ary_join(VALUE ary, VALUE sep)
+    {
         long len = 1, i;
         long len = 1, i, n = RARRAY_LEN(ary);
         VALUE val, tmp, result;
         if (RARRAY_LEN(ary) == 0) return rb_usascii_str_new(0, 0);
         if (n == 0) return rb_usascii_str_new(0, 0);
         int sep_cr = ENC_CODERANGE_7BIT, sep_enc = 0;
         if (!NIL_P(sep)) {
             StringValue(sep);
             len += RSTRING_LEN(sep) * (RARRAY_LEN(ary) - 1);
             len += RSTRING_LEN(sep) * (n - 1);
             sep_cr = rb_enc_str_coderange(sep);
             sep_enc = ENCODING_GET(sep);
+        }
         long len_memo = RARRAY_LEN(ary);
         for (i=0; i < len_memo; i++) {
         /* Measure the result length and note whether every part is 7-bit ASCII. A
            non-String element switches to the general per-element append path. */
         int ascii_only = NIL_P(sep) || sep_cr == ENC_CODERANGE_7BIT;
         long len_memo = n;
         for (i = 0; i < len_memo; i++) {
             val = RARRAY_AREF(ary, i);
             if (RB_UNLIKELY(!RB_TYPE_P(val, T_STRING))) {
                 tmp = rb_check_string_type(val);
                 if (NIL_P(tmp) || tmp != val) {
                     int first;
                     long n = RARRAY_LEN(ary);
                     if (i > n) i = n;
                     result = rb_str_buf_new(len + (n-i)*10);
                     long m = RARRAY_LEN(ary);
                     if (i > m) i = m;
                     result = rb_str_buf_new(len + (m - i) * 10);
                     rb_enc_associate(result, rb_usascii_encoding());
                     i = ary_join_0(ary, sep, i, result);
                     first = i == 0;
                     first = (i == 0);
                     ary_join_1(ary, ary, sep, i, result, &first);
                     return result;
+                }
                 len += RSTRING_LEN(tmp);
                 len_memo = RARRAY_LEN(ary);
                 ascii_only = 0;
+            }
             else {
                 len += RSTRING_LEN(val);
                 if (ascii_only && ENC_CODERANGE(val) != ENC_CODERANGE_7BIT) ascii_only = 0;
+            }
+        }
         n = RARRAY_LEN(ary);
         result = rb_str_new(0, len);
         rb_str_set_len(result, 0);
         /* Fast path: build the result by copying raw bytes (no per-append bookkeeping)
            when no encoding negotiation can occur. That holds when every part is 7-bit
            ASCII, or when every element shares one ASCII-compatible single-byte-terminated
            encoding and the separator is compatible. The result takes element 0's
            encoding; result_cr is its code range, left UNKNOWN to compute lazily. */
         int enc = ENCODING_GET(RARRAY_AREF(ary, 0));
         int result_cr = ENC_CODERANGE_7BIT;
         int copyable = ascii_only;
         if (!copyable && rb_str_encindex_fastpath(enc) &&
             (n <= 1 || NIL_P(sep) || sep_enc == enc || sep_cr == ENC_CODERANGE_7BIT)) {
             /* Every element must share enc; merge code ranges (VALID if any part is
                clean multibyte, UNKNOWN if any is UNKNOWN/BROKEN -> compute lazily). */
             int valid = 0, uncertain = 0;
             copyable = 1;
             for (i = 0; i < n; i++) {
                 val = RARRAY_AREF(ary, i);
                 if (ENCODING_GET(val) != enc) { copyable = 0; break; }
                 int c = ENC_CODERANGE(val);
                 if (c == ENC_CODERANGE_VALID) valid = 1;
                 else if (c != ENC_CODERANGE_7BIT) uncertain = 1;
+            }
             if (copyable) {
                 if (n > 1 && !NIL_P(sep)) {
                     if (sep_cr == ENC_CODERANGE_VALID) valid = 1;
                     else if (sep_cr != ENC_CODERANGE_7BIT) uncertain = 1;
+                }
                 result_cr = uncertain ? ENC_CODERANGE_UNKNOWN :
                             (valid ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT);
+            }
+        }
         ary_join_0(ary, sep, RARRAY_LEN(ary), result);
         if (copyable) {
             result = rb_str_buf_new(len);
             rb_enc_associate_index(result, enc);
             const char *sep_ptr = NIL_P(sep) ? NULL : RSTRING_PTR(sep);
             long sep_len = NIL_P(sep) ? 0 : RSTRING_LEN(sep);
             char *const buf = RSTRING_PTR(result);
             char *p = buf;
             for (i = 0; i < n; i++) {
                 val = RARRAY_AREF(ary, i);
                 long vlen = RSTRING_LEN(val);
                 if (i > 0 && sep_len) {
                     memcpy(p, sep_ptr, sep_len);
                     p += sep_len;
+                }
                 if (vlen) {
                     memcpy(p, RSTRING_PTR(val), vlen);
                     p += vlen;
+                }
+            }
             ENC_CODERANGE_CLEAR(result);  /* so rb_str_set_len does not rescan the bytes */
             rb_str_set_len(result, p - buf);
             if (result_cr != ENC_CODERANGE_UNKNOWN) ENC_CODERANGE_SET(result, result_cr);
             return result;
+        }
         /* General path: per-element append with full encoding handling. */
         result = rb_str_new(0, len);
         rb_str_set_len(result, 0);
         ary_join_0(ary, sep, n, result);
         return result;
+    }

     prelude: |
       # All elements are 7-bit ASCII Strings (the dominant real-world join case).
       # Distinct objects so large-N cases exercise realistic heap/cache behavior.
       a100_1    = Array.new(100)    { "a" * 1  }
       a100_8    = Array.new(100)    { "a" * 8  }
       a100_40   = Array.new(100)    { "a" * 40 }
       a1k_1     = Array.new(1000)   { "a" * 1  }
       a1k_8     = Array.new(1000)   { "a" * 8  }
       a1k_40    = Array.new(1000)   { "a" * 40 }
       a100k_1   = Array.new(100000) { "a" * 1  }
       a100k_8   = Array.new(100000) { "a" * 8  }
       a100k_40  = Array.new(100000) { "a" * 40 }
     benchmark:
       # separator " " (space)
       join_sp_n100_e1:    a100_1.join(" ")
       join_sp_n100_e8:    a100_8.join(" ")
       join_sp_n100_e40:   a100_40.join(" ")
       join_sp_n1k_e1:     a1k_1.join(" ")
       join_sp_n1k_e8:     a1k_8.join(" ")
       join_sp_n1k_e40:    a1k_40.join(" ")
       join_sp_n100k_e1:   a100k_1.join(" ")
       join_sp_n100k_e8:   a100k_8.join(" ")
       join_sp_n100k_e40:  a100k_40.join(" ")
       # separator "\n" (newline)
       join_nl_n100_e1:    a100_1.join("\n")
       join_nl_n100_e8:    a100_8.join("\n")
       join_nl_n100_e40:   a100_40.join("\n")
       join_nl_n1k_e1:     a1k_1.join("\n")
       join_nl_n1k_e8:     a1k_8.join("\n")
       join_nl_n1k_e40:    a1k_40.join("\n")
       join_nl_n100k_e1:   a100k_1.join("\n")
       join_nl_n100k_e8:   a100k_8.join("\n")
       join_nl_n100k_e40:  a100k_40.join("\n")

     prelude: |
       # --- must NOT take any fast path: prove added precompute-loop checks don't regress ---
       # all multibyte UTF-8 (VALID coderange). 7-bit gate bails at elem 0; same-encoding gate TAKES it.
       mb_utf8       = Array.new(1000) { "café" }
       # ASCII except a trailing multibyte elem: WORST CASE for 7-bit gate (scans all, then falls back).
       tail_mb       = Array.new(1000) { "a" * 8 }; tail_mb[-1] = "café"
       # non-String elements -> ary_join_1 (to_s). Gate bails at type check before coderange.
       nonstr_int    = Array.new(1000) { |i| i }
       # ASCII strings except a trailing Integer: precompute scans strings, then mixed fallback.
       mixed_tail    = (Array.new(999) { "a" * 8 } << 42)
       # UTF-16LE: not ASCII-compatible. Both gates bail immediately.
       utf16         = Array.new(1000) { "ab".encode("UTF-16LE") }
       # nested arrays -> recursive fallback.
       nested        = Array.new(500)  { [1, 2] }
       # 7-bit ASCII elements but a multibyte separator (3-byte em dash). Both gates bail at sep check.
       ascii_elems   = Array.new(1000) { "a" * 8 }
       sep_mb        = "—"
       # --- fast-path ELIGIBLE variants: confirm benefit generalizes beyond single chars / find crossover ---
       # frozen ASCII elements (read-only path).
       frozen_elems  = Array.new(1000) { ("a" * 8).freeze }
       # large elements: where memcpy should dominate and the win decays toward 1x.
       big_e256      = Array.new(1000) { "a" * 256  }
       big_e1k       = Array.new(1000) { "a" * 1024 }
       big_e4k       = Array.new(1000) { "a" * 4096 }
     benchmark:
       # regression (fallback / gate worst-case) — all use a 1-byte ASCII separator unless noted
       reg_multibyte_utf8:   mb_utf8.join(" ")
       reg_tail_multibyte:   tail_mb.join(" ")
       reg_nonstring_int:    nonstr_int.join(" ")
       reg_mixed_tail_int:   mixed_tail.join(" ")
       reg_utf16:            utf16.join(" ".encode("UTF-16LE"))
       reg_nested:           nested.join(" ")
       reg_multibyte_sep:    ascii_elems.join(sep_mb)
       # fast-path eligible variants
       fp_frozen:            frozen_elems.join(" ")
       fp_nosep:             ascii_elems.join
       fp_multichar_sep:     ascii_elems.join(", ")
       fp_big_e256:          big_e256.join(" ")
       fp_big_e1k:           big_e1k.join(" ")
       fp_big_e4k:           big_e4k.join(" ")

         assert_equal("b", x.ary.join(""))
       end
       def test_join_encoding
         # Multibyte UTF-8 elements: result is UTF-8, valid, not ASCII-only.
         r = @cls["caf\u00e9", "na\u00efve"].join(" ")
         assert_equal("caf\u00e9 na\u00efve", r)
         assert_equal(Encoding::UTF_8, r.encoding)
         assert_equal(true, r.valid_encoding?)
         assert_equal(false, r.ascii_only?)
         # All 7-bit content stays ASCII-only.
         r = @cls["abc", "def"].join(",")
         assert_equal("abc,def", r)
         assert_equal(true, r.ascii_only?)
         # Multibyte separator (same encoding as the elements).
         r = @cls["a", "b", "c"].join("\u2014")
         assert_equal("a\u2014b\u2014c", r)
         assert_equal(Encoding::UTF_8, r.encoding)
         assert_equal(false, r.ascii_only?)
         assert_equal(true, r.valid_encoding?)
         # Mixed ASCII-compatible encodings, all 7-bit: result takes element 0's encoding.
         r = @cls["abc".dup.force_encoding("US-ASCII"), "def".dup.force_encoding("UTF-8")].join(" ")
         assert_equal("abc def", r)
         assert_equal(Encoding::US_ASCII, r.encoding)
         assert_equal(true, r.ascii_only?)
         # 7-bit content in a non-ASCII encoding (Shift_JIS).
         r = @cls["abc".encode("Shift_JIS"), "def".encode("Shift_JIS")].join(" ")
         assert_equal(Encoding::Shift_JIS, r.encoding)
         assert_equal("abc def".b, r.b)
         # Same-encoding multibyte (Shift_JIS): byte-for-byte concatenation.
         s = "\u65e5\u672c".encode("Shift_JIS")
         sep = "/".encode("Shift_JIS")
         r = @cls[s, s].join(sep)
         assert_equal(Encoding::Shift_JIS, r.encoding)
         assert_equal((s + sep + s).b, r.b)
         # Broken bytes: result keeps them and reports invalid.
         bad = "\xff\xfe".dup.force_encoding("UTF-8")
         r = @cls[bad, bad].join(" ")
         assert_equal((bad + " " + bad).b, r.b)
         assert_equal(false, r.valid_encoding?)
         # Incompatible element/separator encodings still raise.
         assert_raise(Encoding::CompatibilityError) do
           @cls["a".dup.force_encoding("UTF-8"), "b".encode("UTF-16LE")].join(" ")
         end
         # Bulk copy: large array, verified against a non-join oracle.
         big = @cls.new(500) { "ab" }
         assert_equal(("ab," * 500).chomp(","), big.join(","))
       end
       def test_to_a2
         klass = Class.new(Array)
         a = klass.new.to_a

Project

General

Profile

Ruby

Feature #22094 » 0001-Speed-up-Array-join-with-a-byte-copy-fast-path.patch