Project

General

Profile

Feature #12275 » v1.patch

tad (Tadashi Saito), 11/27/2017 07:56 PM

View differences:

NEWS
112 112
  * String#each_grapheme_cluster and String#grapheme_clusters is added to
113 113
    enumerate grapheme clusters [Feature #13780]
114 114
  * String#start_with? supports regexp [Feature #13712]
115
  * String#undump is added to unescape a String#dump'ed string [Feature #12275]
115 116

  
116 117
* Regexp/String: Update Unicode version from 9.0.0 to 10.0.0 [Feature #13685]
117 118

  
string.c
19 19
#include "ruby_assert.h"
20 20
#include "id.h"
21 21
#include "debug_counter.h"
22
#include "ruby/util.h"
22 23

  
23 24
#define BEG(no) (regs->beg[(no)])
24 25
#define END(no) (regs->end[(no)])
......
3418 3419
    return rb_str_eql(folded_str1, folded_str2);
3419 3420
}
3420 3421

  
3422
static long
3423
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3424
	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3425
{
3426
    const char *search_start = str_ptr;
3427
    long pos, search_len = str_len - offset;
3428

  
3429
    for (;;) {
3430
	const char *t;
3431
	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3432
	if (pos < 0) return pos;
3433
	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3434
	if (t == search_start + pos) break;
3435
	search_len -= t - search_start;
3436
	if (search_len <= 0) return -1;
3437
	offset += t - search_start;
3438
	search_start = t;
3439
    }
3440
    return pos + offset;
3441
}
3442

  
3421 3443
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3422 3444

  
3423 3445
static long
3424 3446
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3425 3447
{
3426
    const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
3427
    long pos, str_len, sub_len, search_len;
3448
    const char *str_ptr, *str_ptr_end, *sub_ptr;
3449
    long str_len, sub_len;
3428 3450
    int single_byte = single_byte_optimizable(str);
3429 3451
    rb_encoding *enc;
3430 3452

  
......
3454 3476
    if (sub_len == 0) return offset;
3455 3477

  
3456 3478
    /* need proceed one character at a time */
3457

  
3458
    search_start = str_ptr;
3459
    search_len = RSTRING_LEN(str) - offset;
3460
    for (;;) {
3461
	const char *t;
3462
	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3463
	if (pos < 0) return pos;
3464
	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3465
	if (t == search_start + pos) break;
3466
	search_len -= t - search_start;
3467
	if (search_len <= 0) return -1;
3468
	offset += t - search_start;
3469
	search_start = t;
3470
    }
3471
    return pos + offset;
3479
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3472 3480
}
3473 3481

  
3474 3482

  
......
6069 6077
    return result;
6070 6078
}
6071 6079

  
6080
/* Is s wrapped with '"'? */
6081
static int
6082
is_wrapped(const char *s, const char *s_end, long len, rb_encoding *enc)
6083
{
6084
    unsigned int cbeg, cend;
6085
    const char *prev;
6086

  
6087
    if (len < 2) return FALSE;
6088

  
6089
    cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc);
6090
    if (cbeg != '"') return FALSE;
6091

  
6092
    prev = rb_enc_prev_char(s, s_end, s_end, enc);
6093
    cend = rb_enc_mbc_to_codepoint(prev, s_end, enc);
6094
    return cend == '"';
6095
}
6096

  
6097
static const char *
6098
unescape_ascii(unsigned int c)
6099
{
6100
    switch (c) {
6101
      case 'n':
6102
	return "\n";
6103
      case 'r':
6104
	return "\r";
6105
      case 't':
6106
	return "\t";
6107
      case 'f':
6108
	return "\f";
6109
      case 'v':
6110
	return "\v";
6111
      case 'b':
6112
	return "\b";
6113
      case 'a':
6114
	return "\a";
6115
      case 'e':
6116
	return "\e";
6117
      default:
6118
	UNREACHABLE;
6119
    }
6120
}
6121

  
6122
static int
6123
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding *enc)
6124
{
6125
    unsigned int c, c2;
6126
    int n, n2, codelen;
6127
    size_t hexlen;
6128
    char buf[6];
6129

  
6130
    c = rb_enc_codepoint_len(s, s_end, &n, enc);
6131
    switch (c) {
6132
      case '\\':
6133
      case '"':
6134
	rb_str_cat(undumped, s, n); /* cat itself */
6135
	n++;
6136
	break;
6137
      case 'n':
6138
      case 'r':
6139
      case 't':
6140
      case 'f':
6141
      case 'v':
6142
      case 'b':
6143
      case 'a':
6144
      case 'e':
6145
	rb_str_cat(undumped, unescape_ascii(c), n);
6146
	n++;
6147
	break;
6148
      case 'u':
6149
	if (s+1 >= s_end) {
6150
	    rb_raise(rb_eArgError, "invalid Unicode escape");
6151
	}
6152
	c2 = rb_enc_codepoint_len(s+1, s_end, NULL, enc);
6153
	if (c2 == '{') { /* handle \u{...} form */
6154
	    const char *hexstr = s + 2;
6155
	    unsigned int hex;
6156
	    static const char* const close_brace = "}";
6157
	    long pos;
6158

  
6159
	    if (hexstr >= s_end) {
6160
		rb_raise(rb_eArgError, "unterminated Unicode escape");
6161
	    }
6162
	    /* find close brace */
6163
	    pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, enc);
6164
	    if (pos < 0) {
6165
		rb_raise(rb_eArgError, "unterminated Unicode escape");
6166
	    }
6167
	    hex = ruby_scan_hex(hexstr, pos, &hexlen);
6168
	    if (hexlen == 0 || hexlen > 6) {
6169
		rb_raise(rb_eArgError, "invalid Unicode escape");
6170
	    }
6171
	    if (hex > 0x10ffffU) {
6172
		rb_raise(rb_eArgError, "invalid Unicode codepoint (too large)");
6173
	    }
6174
	    if ((hex & 0xfffff800U) == 0xd800U) {
6175
		rb_raise(rb_eArgError, "invalid Unicode codepoint");
6176
	    }
6177
	    codelen = rb_enc_codelen(hex, enc);
6178
	    rb_enc_mbcput(hex, buf, enc);
6179
	    rb_str_cat(undumped, buf, codelen);
6180
	    n += rb_strlen_lit("u{}") + hexlen;
6181
	}
6182
	else { /* handle \uXXXX form */
6183
	    unsigned int hex = ruby_scan_hex(s+1, 4, &hexlen);
6184
	    if (hexlen != 4) {
6185
		rb_raise(rb_eArgError, "invalid Unicode escape");
6186
	    }
6187
	    codelen = rb_enc_codelen(hex, enc);
6188
	    rb_enc_mbcput(hex, buf, enc);
6189
	    rb_str_cat(undumped, buf, codelen);
6190
	    n += rb_strlen_lit("uXXXX");
6191
	}
6192
	break;
6193
      case 'x':
6194
	if (s+1 >= s_end) {
6195
	    rb_raise(rb_eArgError, "invalid hex escape");
6196
	}
6197
	c2 = ruby_scan_hex(s+1, 2, &hexlen);
6198
	if (hexlen != 2) {
6199
	    rb_raise(rb_eArgError, "invalid hex escape");
6200
	}
6201
	*buf = (char)c2;
6202
	rb_str_cat(undumped, buf, 1L);
6203
	n += rb_strlen_lit("xXX");
6204
	break;
6205
      case '#':
6206
	if (s+1 >= s_end) {
6207
	    rb_str_cat(undumped, s, 1L); /* just '#' */
6208
	    n++;
6209
	    break;
6210
	}
6211
	n2 = rb_enc_mbclen(s+1, s_end, enc);
6212
	if (n2 == 1 && IS_EVSTR(s+1, s_end)) {
6213
	    rb_str_cat(undumped, s, n);
6214
	    n += n2;
6215
	}
6216
	break;
6217
      default:
6218
	rb_str_cat(undumped, "\\", 1L); /* keep backslash */
6219
    }
6220

  
6221
    return n;
6222
}
6223

  
6224
/*
6225
 *  call-seq:
6226
 *     str.undump   -> new_str
6227
 *
6228
 *  Produces unescaped version of +str+.
6229
 *  See also String#dump because String#undump does inverse of String#dump.
6230
 *
6231
 *    "\"hello \\n ''\"".undump #=> "hello \n ''"
6232
 */
6233

  
6234
static VALUE
6235
str_undump(VALUE str)
6236
{
6237
    const char *s = RSTRING_PTR(str);
6238
    const char *s_end = RSTRING_END(str);
6239
    long len = RSTRING_LEN(str);
6240
    rb_encoding *enc = rb_enc_get(str);
6241
    int n;
6242
    unsigned int c;
6243
    VALUE undumped = rb_enc_str_new(s, 0L, enc);
6244

  
6245
    rb_must_asciicompat(str);
6246

  
6247
    if (is_wrapped(s, s_end, len, enc)) {
6248
	/* strip '"' at the begin and the end */
6249
	s++;
6250
	s_end--;
6251
    }
6252

  
6253
    for (; s < s_end; s += n) {
6254
	c = rb_enc_codepoint_len(s, s_end, &n, enc);
6255
	if (c == '\\') {
6256
	    if (s+1 >= s_end) {
6257
		rb_raise(rb_eArgError, "invalid escape");
6258
	    }
6259
	    n = undump_after_backslash(undumped, s+1, s_end, enc);
6260
	}
6261
	else {
6262
	    rb_str_cat(undumped, s, n);
6263
	}
6264
    }
6265

  
6266
    OBJ_INFECT(undumped, str);
6267
    return undumped;
6268
}
6072 6269

  
6073 6270
static void
6074 6271
rb_str_check_dummy_enc(rb_encoding *enc)
......
10582 10779
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
10583 10780
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
10584 10781
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);
10782
    rb_define_method(rb_cString, "undump", str_undump, 0);
10585 10783

  
10586 10784
    sym_ascii      = ID2SYM(rb_intern("ascii"));
10587 10785
    sym_turkic     = ID2SYM(rb_intern("turkic"));
test/ruby/test_string.rb
753 753
    assert_equal(S('"\\u{10ABCD}"'), b.dump)
754 754
  end
755 755

  
756
  # String#undump decodes the \xXX, C-style, \uXXXX and \u{...} escapes.
  def test_undump
    control = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
    assert_equal(control, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)

    # [expected, dumped] pairs, checked in this order
    [
      ["\u{7F}",     '"\\x7F"'],
      ["\u{AB}",     '"\\u00AB"'],
      ["\u{ABC}",    '"\\u0ABC"'],
      ["\uABCD",     '"\\uABCD"'],
      ["\u{ABCDE}",  '"\\u{ABCDE}"'],
      ["\u{10ABCD}", '"\\u{10ABCD}"'],
      ["äöü",        '"\u00E4\u00F6\u00FC"'],
      ["äöü",        '"\xC3\xA4\xC3\xB6\xC3\xBC"'],
    ].each do |expected, dumped|
      assert_equal(S(expected), S(dumped).undump)
    end
  end
769

  
756 770
  def test_dup
757 771
    for taint in [ false, true ]
758 772
      for frozen in [ false, true ]
759
-