Project

General

Profile

Feature #12275 » v2.patch

tad (Tadashi Saito), 12/09/2017 05:46 PM

View differences:

NEWS
119 119
  * String#each_grapheme_cluster and String#grapheme_clusters are added to
120 120
    enumerate grapheme clusters [Feature #13780]
121 121
  * String#start_with? supports regexp [Feature #13712]
122
  * String#undump is added to unescape a String#dump'ed string [Feature #12275]
122 123

  
123 124
* Regexp/String: Update Unicode version from 9.0.0 to 10.0.0 [Feature #13685]
124 125

  
string.c
19 19
#include "ruby_assert.h"
20 20
#include "id.h"
21 21
#include "debug_counter.h"
22
#include "ruby/util.h"
22 23

  
23 24
#define BEG(no) (regs->beg[(no)])
24 25
#define END(no) (regs->end[(no)])
......
3422 3423
    return rb_str_eql(folded_str1, folded_str2);
3423 3424
}
3424 3425

  
3426
static long
3427
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3428
	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3429
{
3430
    const char *search_start = str_ptr;
3431
    long pos, search_len = str_len - offset;
3432

  
3433
    for (;;) {
3434
	const char *t;
3435
	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3436
	if (pos < 0) return pos;
3437
	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3438
	if (t == search_start + pos) break;
3439
	search_len -= t - search_start;
3440
	if (search_len <= 0) return -1;
3441
	offset += t - search_start;
3442
	search_start = t;
3443
    }
3444
    return pos + offset;
3445
}
3446

  
3425 3447
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3426 3448

  
3427 3449
static long
3428 3450
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3429 3451
{
3430
    const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
3431
    long pos, str_len, sub_len, search_len;
3452
    const char *str_ptr, *str_ptr_end, *sub_ptr;
3453
    long str_len, sub_len;
3432 3454
    int single_byte = single_byte_optimizable(str);
3433 3455
    rb_encoding *enc;
3434 3456

  
......
3458 3480
    if (sub_len == 0) return offset;
3459 3481

  
3460 3482
    /* need to proceed one character at a time */
3461

  
3462
    search_start = str_ptr;
3463
    search_len = RSTRING_LEN(str) - offset;
3464
    for (;;) {
3465
	const char *t;
3466
	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3467
	if (pos < 0) return pos;
3468
	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3469
	if (t == search_start + pos) break;
3470
	search_len -= t - search_start;
3471
	if (search_len <= 0) return -1;
3472
	offset += t - search_start;
3473
	search_start = t;
3474
    }
3475
    return pos + offset;
3483
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3476 3484
}
3477 3485

  
3478 3486

  
......
6073 6081
    return result;
6074 6082
}
6075 6083

  
6084
/* Shape of the text handed to String#undump, as classified by
 * check_undump_source_format(). */
enum undump_source_format {
    /* "..." */
    UNDUMP_SOURCE_SIMPLE,
    /* "...".force_encoding("...") */
    UNDUMP_SOURCE_FORCE_ENCODING,
    /* anything else */
    UNDUMP_SOURCE_INVALID
};
6089

  
6090
static enum undump_source_format
6091
check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
6092
			   VALUE *forced_enc_str, long *forced_enc_str_len)
6093
{
6094
    unsigned int cbeg, cend;
6095
    const char *prev;
6096
    static const long force_encoding_minimum_len = rb_strlen_lit("\"\".force_encoding(\"\")");
6097
    static const char force_encoding_middle_part[] = "\".force_encoding(\"";
6098
    static const long force_encoding_middle_part_len = rb_strlen_lit("\".force_encoding(\"");
6099
    static const char force_encoding_end_part[] = "\")";
6100
    static const long force_encoding_end_part_len = rb_strlen_lit("\")");
6101
    long pos_before_middle_part, pos_before_end_part, pos_after_middle_part;
6102

  
6103
    if (len < 2) return UNDUMP_SOURCE_INVALID;
6104

  
6105
    cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc);
6106
    if (cbeg != '"') return UNDUMP_SOURCE_INVALID;
6107

  
6108
    prev = rb_enc_prev_char(s, s_end, s_end, enc);
6109
    cend = rb_enc_mbc_to_codepoint(prev, s_end, enc);
6110
    if (cend == '"') return UNDUMP_SOURCE_SIMPLE;
6111

  
6112
    if (cend != ')' || len < force_encoding_minimum_len) {
6113
	return UNDUMP_SOURCE_INVALID;
6114
    }
6115

  
6116
    /* find '".force_encoding("' */
6117
    pos_before_middle_part = strseq_core(s, s_end, len,
6118
					 force_encoding_middle_part, force_encoding_middle_part_len,
6119
					 0, enc);
6120
    if (pos_before_middle_part <= 0) {
6121
	return UNDUMP_SOURCE_INVALID;
6122
    }
6123

  
6124
    pos_after_middle_part = pos_before_middle_part + force_encoding_middle_part_len;
6125
    /* find '")' */
6126
    pos_before_end_part = strseq_core(s + pos_after_middle_part, s_end, len - pos_after_middle_part,
6127
				      force_encoding_end_part, force_encoding_end_part_len,
6128
				      0, enc);
6129
    if (pos_before_end_part < 0 || pos_after_middle_part + pos_before_end_part + 2 != len) {
6130
	return UNDUMP_SOURCE_INVALID;
6131
    }
6132

  
6133
    *forced_enc_str_len = pos_before_end_part;
6134
    *forced_enc_str = rb_str_new(s + pos_after_middle_part, *forced_enc_str_len);
6135
    return UNDUMP_SOURCE_FORCE_ENCODING;
6136
}
6137

  
6138
static const char *
6139
unescape_ascii(unsigned int c)
6140
{
6141
    switch (c) {
6142
      case 'n':
6143
	return "\n";
6144
      case 'r':
6145
	return "\r";
6146
      case 't':
6147
	return "\t";
6148
      case 'f':
6149
	return "\f";
6150
      case 'v':
6151
	return "\v";
6152
      case 'b':
6153
	return "\b";
6154
      case 'a':
6155
	return "\a";
6156
      case 'e':
6157
	return "\e";
6158
      default:
6159
	UNREACHABLE;
6160
    }
6161
}
6162

  
6163
static int
6164
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
6165
{
6166
    unsigned int c, c2;
6167
    int n, n2, codelen;
6168
    size_t hexlen;
6169
    char buf[6];
6170
    static const rb_encoding *enc_utf8 = NULL;
6171

  
6172
    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
6173
    switch (c) {
6174
      case '\\':
6175
      case '"':
6176
	rb_str_cat(undumped, s, n); /* cat itself */
6177
	n++;
6178
	break;
6179
      case 'n':
6180
      case 'r':
6181
      case 't':
6182
      case 'f':
6183
      case 'v':
6184
      case 'b':
6185
      case 'a':
6186
      case 'e':
6187
	rb_str_cat(undumped, unescape_ascii(c), n);
6188
	n++;
6189
	break;
6190
      case 'u':
6191
	if (s+1 >= s_end) {
6192
	    rb_raise(rb_eArgError, "invalid Unicode escape");
6193
	}
6194
	if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6195
	if (*penc != enc_utf8) {
6196
	    *penc = enc_utf8;
6197
	    rb_enc_associate(undumped, enc_utf8);
6198
	    ENC_CODERANGE_CLEAR(undumped);
6199
	}
6200
	c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
6201
	if (c2 == '{') { /* handle \u{...} form */
6202
	    const char *hexstr = s + 2;
6203
	    unsigned int hex;
6204
	    static const char* const close_brace = "}";
6205
	    long pos;
6206

  
6207
	    if (hexstr >= s_end) {
6208
		rb_raise(rb_eArgError, "unterminated Unicode escape");
6209
	    }
6210
	    /* find close brace */
6211
	    pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
6212
	    if (pos < 0) {
6213
		rb_raise(rb_eArgError, "unterminated Unicode escape");
6214
	    }
6215
	    hex = ruby_scan_hex(hexstr, pos, &hexlen);
6216
	    if (hexlen == 0 || hexlen > 6) {
6217
		rb_raise(rb_eArgError, "invalid Unicode escape");
6218
	    }
6219
	    if (hex > 0x10ffffU) {
6220
		rb_raise(rb_eArgError, "invalid Unicode codepoint (too large)");
6221
	    }
6222
	    if ((hex & 0xfffff800U) == 0xd800U) {
6223
		rb_raise(rb_eArgError, "invalid Unicode codepoint");
6224
	    }
6225
	    codelen = rb_enc_codelen(hex, *penc);
6226
	    rb_enc_mbcput(hex, buf, *penc);
6227
	    rb_str_cat(undumped, buf, codelen);
6228
	    n += rb_strlen_lit("u{}") + hexlen;
6229
	}
6230
	else { /* handle \uXXXX form */
6231
	    unsigned int hex = ruby_scan_hex(s+1, 4, &hexlen);
6232
	    if (hexlen != 4) {
6233
		rb_raise(rb_eArgError, "invalid Unicode escape");
6234
	    }
6235
	    codelen = rb_enc_codelen(hex, *penc);
6236
	    rb_enc_mbcput(hex, buf, *penc);
6237
	    rb_str_cat(undumped, buf, codelen);
6238
	    n += rb_strlen_lit("uXXXX");
6239
	}
6240
	break;
6241
      case 'x':
6242
	if (s+1 >= s_end) {
6243
	    rb_raise(rb_eArgError, "invalid hex escape");
6244
	}
6245
	c2 = ruby_scan_hex(s+1, 2, &hexlen);
6246
	if (hexlen != 2) {
6247
	    rb_raise(rb_eArgError, "invalid hex escape");
6248
	}
6249
	*buf = (char)c2;
6250
	rb_str_cat(undumped, buf, 1L);
6251
	n += rb_strlen_lit("xXX");
6252
	break;
6253
      case '#':
6254
	if (s+1 >= s_end) {
6255
	    rb_str_cat(undumped, s, 1L); /* just '#' */
6256
	    n++;
6257
	    break;
6258
	}
6259
	n2 = rb_enc_mbclen(s+1, s_end, *penc);
6260
	if (n2 == 1 && IS_EVSTR(s+1, s_end)) {
6261
	    rb_str_cat(undumped, s, n);
6262
	    n += n2;
6263
	}
6264
	break;
6265
      default:
6266
	rb_str_cat(undumped, "\\", 1L); /* keep backslash */
6267
    }
6268

  
6269
    return n;
6270
}
6271

  
6272
/*
6273
 *  call-seq:
6274
 *     str.undump   -> new_str
6275
 *
6276
 *  Produces unescaped version of +str+.
6277
 *  See also String#dump because String#undump does inverse of String#dump.
6278
 *
6279
 *    "\"hello \\n ''\"".undump #=> "hello \n ''"
6280
 */
6281

  
6282
static VALUE
6283
str_undump(VALUE str)
6284
{
6285
    const char *s = RSTRING_PTR(str);
6286
    const char *s_end = RSTRING_END(str);
6287
    long len = RSTRING_LEN(str);
6288
    rb_encoding *enc = rb_enc_get(str), *forced_enc;
6289
    int n;
6290
    unsigned int c;
6291
    enum undump_source_format source_format;
6292
    VALUE undumped = rb_enc_str_new(s, 0L, enc);
6293
    VALUE forced_enc_str;
6294
    long forced_enc_str_len;
6295

  
6296
    rb_must_asciicompat(str);
6297

  
6298
    source_format = check_undump_source_format(s, s_end, len, enc,
6299
					       &forced_enc_str, &forced_enc_str_len);
6300
    if (source_format == UNDUMP_SOURCE_INVALID) {
6301
	rb_raise(rb_eArgError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6302
    }
6303
    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
6304
	forced_enc = rb_to_encoding(forced_enc_str);
6305
    }
6306

  
6307
    /* strip '"' at the start */
6308
    s++;
6309
    if (source_format == UNDUMP_SOURCE_SIMPLE) {
6310
	/* strip '"' at the end */
6311
	s_end--;
6312
    } else { /* source_format == UNDUMP_SOURCE_FORCE_ENCODING */
6313
	/* strip '".force_encoding("...")' */
6314
	s_end -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
6315
    }
6316

  
6317
    for (; s < s_end; s += n) {
6318
	c = rb_enc_codepoint_len(s, s_end, &n, enc);
6319
	if (c == '\\') {
6320
	    if (s+1 >= s_end) {
6321
		rb_raise(rb_eArgError, "invalid escape");
6322
	    }
6323
	    n = undump_after_backslash(undumped, s+1, s_end, &enc);
6324
	}
6325
	else {
6326
	    rb_str_cat(undumped, s, n);
6327
	}
6328
    }
6329

  
6330
    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
6331
	rb_enc_associate(undumped, forced_enc);
6332
	ENC_CODERANGE_CLEAR(undumped);
6333
    }
6334
    OBJ_INFECT(undumped, str);
6335
    return undumped;
6336
}
6076 6337

  
6077 6338
static void
6078 6339
rb_str_check_dummy_enc(rb_encoding *enc)
......
10586 10847
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
10587 10848
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
10588 10849
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);
10850
    rb_define_method(rb_cString, "undump", str_undump, 0);
10589 10851

  
10590 10852
    sym_ascii      = ID2SYM(rb_intern("ascii"));
10591 10853
    sym_turkic     = ID2SYM(rb_intern("turkic"));
test/ruby/test_string.rb
753 753
    assert_equal(S('"\\u{10ABCD}"'), b.dump)
754 754
  end
755 755

  
756
  def test_undump
    # \xXX and short control-character escapes
    expected = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
    assert_equal(expected, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
    assert_equal(S("\u{7F}"), S('"\\x7F"').undump)

    # \uXXXX and \u{...} escapes
    [
      ["\u{AB}", '"\\u00AB"'],
      ["\u{ABC}", '"\\u0ABC"'],
      ["\uABCD", '"\\uABCD"'],
      ["\u{ABCDE}", '"\\u{ABCDE}"'],
      ["\u{10ABCD}", '"\\u{10ABCD}"'],
    ].each do |unescaped, dumped|
      assert_equal(S(unescaped), S(dumped).undump)
    end

    assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
    assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)

    # a \u escape makes the result UTF-8 whatever the source encoding was
    assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)

    # "...".force_encoding("...") form
    assert_equal("abc".encode(Encoding::UTF_16LE),
                 '"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)

    # malformed input
    assert_raise(ArgumentError) { S('\u3042').undump }
    assert_raise(ArgumentError) { S('"".force_encoding()').undump }
    assert_raise(ArgumentError) { S('"".force_encoding("UNKNOWN")').undump }
  end
778

  
756 779
  def test_dup
757 780
    for taint in [ false, true ]
758 781
      for frozen in [ false, true ]
759
-