Project

General

Profile

Feature #12275 » v3.patch

tad (Tadashi Saito), 12/13/2017 08:25 PM

View differences:

NEWS
173 173
    * String#delete_suffix, String#delete_suffix! [Feature #13665]
174 174
    * String#each_grapheme_cluster and String#grapheme_clusters to
175 175
      enumerate grapheme clusters [Feature #13780]
176
    * String#undump to unescape a String#dump'ed string [Feature #12275]
176 177

  
177 178
* Struct
178 179

  
string.c
19 19
#include "ruby_assert.h"
20 20
#include "id.h"
21 21
#include "debug_counter.h"
22
#include "ruby/util.h"
22 23

  
23 24
#define BEG(no) (regs->beg[(no)])
24 25
#define END(no) (regs->end[(no)])
......
3422 3423
    return rb_str_eql(folded_str1, folded_str2);
3423 3424
}
3424 3425

  
3426
static long
3427
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3428
	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3429
{
3430
    const char *search_start = str_ptr;
3431
    long pos, search_len = str_len - offset;
3432

  
3433
    for (;;) {
3434
	const char *t;
3435
	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3436
	if (pos < 0) return pos;
3437
	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3438
	if (t == search_start + pos) break;
3439
	search_len -= t - search_start;
3440
	if (search_len <= 0) return -1;
3441
	offset += t - search_start;
3442
	search_start = t;
3443
    }
3444
    return pos + offset;
3445
}
3446

  
3425 3447
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3426 3448

  
3427 3449
static long
3428 3450
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3429 3451
{
3430
    const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
3431
    long pos, str_len, sub_len, search_len;
3452
    const char *str_ptr, *str_ptr_end, *sub_ptr;
3453
    long str_len, sub_len;
3432 3454
    int single_byte = single_byte_optimizable(str);
3433 3455
    rb_encoding *enc;
3434 3456

  
......
3458 3480
    if (sub_len == 0) return offset;
3459 3481

  
3460 3482
    /* need proceed one character at a time */
3461

  
3462
    search_start = str_ptr;
3463
    search_len = RSTRING_LEN(str) - offset;
3464
    for (;;) {
3465
	const char *t;
3466
	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3467
	if (pos < 0) return pos;
3468
	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3469
	if (t == search_start + pos) break;
3470
	search_len -= t - search_start;
3471
	if (search_len <= 0) return -1;
3472
	offset += t - search_start;
3473
	search_start = t;
3474
    }
3475
    return pos + offset;
3483
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3476 3484
}
3477 3485

  
3478 3486

  
......
6073 6081
    return result;
6074 6082
}
6075 6083

  
6084
enum undump_source_format {
6085
    UNDUMP_SOURCE_SIMPLE, /* "..." */
6086
    UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
6087
    UNDUMP_SOURCE_INVALID
6088
};
6089

  
6090
static enum undump_source_format
6091
check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
6092
			   VALUE *forced_enc_str, long *forced_enc_str_len)
6093
{
6094
    unsigned int cbeg, cend;
6095
    const char *prev;
6096
    static const long force_encoding_minimum_len = rb_strlen_lit("\"\".force_encoding(\"\")");
6097
    static const char force_encoding_middle_part[] = "\".force_encoding(\"";
6098
    static const long force_encoding_middle_part_len = rb_strlen_lit("\".force_encoding(\"");
6099
    static const char force_encoding_end_part[] = "\")";
6100
    static const long force_encoding_end_part_len = rb_strlen_lit("\")");
6101
    long pos_before_middle_part, pos_before_end_part, pos_after_middle_part;
6102

  
6103
    if (len < 2) return UNDUMP_SOURCE_INVALID;
6104

  
6105
    cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc);
6106
    if (cbeg != '"') return UNDUMP_SOURCE_INVALID;
6107

  
6108
    prev = rb_enc_prev_char(s, s_end, s_end, enc);
6109
    cend = rb_enc_mbc_to_codepoint(prev, s_end, enc);
6110
    if (cend == '"') return UNDUMP_SOURCE_SIMPLE;
6111

  
6112
    if (cend != ')' || len < force_encoding_minimum_len) {
6113
	return UNDUMP_SOURCE_INVALID;
6114
    }
6115

  
6116
    /* find '".force_encoding("' */
6117
    pos_before_middle_part = strseq_core(s, s_end, len,
6118
					 force_encoding_middle_part, force_encoding_middle_part_len,
6119
					 0, enc);
6120
    if (pos_before_middle_part <= 0) {
6121
	return UNDUMP_SOURCE_INVALID;
6122
    }
6123

  
6124
    pos_after_middle_part = pos_before_middle_part + force_encoding_middle_part_len;
6125
    /* find '")' */
6126
    pos_before_end_part = strseq_core(s + pos_after_middle_part, s_end, len - pos_after_middle_part,
6127
				      force_encoding_end_part, force_encoding_end_part_len,
6128
				      0, enc);
6129
    if (pos_before_end_part < 0 || pos_after_middle_part + pos_before_end_part + 2 != len) {
6130
	return UNDUMP_SOURCE_INVALID;
6131
    }
6132

  
6133
    *forced_enc_str_len = pos_before_end_part;
6134
    *forced_enc_str = rb_str_new(s + pos_after_middle_part, *forced_enc_str_len);
6135
    return UNDUMP_SOURCE_FORCE_ENCODING;
6136
}
6137

  
6138
static int
6139
unescape_ascii(unsigned int c)
6140
{
6141
    switch (c) {
6142
      case 'n':
6143
	return '\n';
6144
      case 'r':
6145
	return '\r';
6146
      case 't':
6147
	return '\t';
6148
      case 'f':
6149
	return '\f';
6150
      case 'v':
6151
	return '\13';
6152
      case 'b':
6153
	return '\010';
6154
      case 'a':
6155
	return '\007';
6156
      case 'e':
6157
	return 033;
6158
      default:
6159
	UNREACHABLE;
6160
    }
6161
}
6162

  
6163
static int
6164
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
6165
{
6166
    unsigned int c, c2;
6167
    int n, codelen;
6168
    size_t hexlen;
6169
    char buf[6];
6170
    static rb_encoding *enc_utf8 = NULL;
6171

  
6172
    c = rb_enc_codepoint_len(s, s_end, &n, *penc);
6173
    switch (c) {
6174
      case '\\':
6175
      case '"':
6176
      case '#':
6177
	rb_str_cat(undumped, s, n); /* cat itself */
6178
	n++;
6179
	break;
6180
      case 'n':
6181
      case 'r':
6182
      case 't':
6183
      case 'f':
6184
      case 'v':
6185
      case 'b':
6186
      case 'a':
6187
      case 'e':
6188
	*buf = (char)unescape_ascii(c);
6189
	rb_str_cat(undumped, buf, n);
6190
	n++;
6191
	break;
6192
      case 'u':
6193
	if (s+1 >= s_end) {
6194
	    rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6195
	}
6196
	if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
6197
	if (*penc != enc_utf8) {
6198
	    *penc = enc_utf8;
6199
	    rb_enc_associate(undumped, enc_utf8);
6200
	    ENC_CODERANGE_CLEAR(undumped);
6201
	}
6202
	c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
6203
	if (c2 == '{') { /* handle \u{...} form */
6204
	    const char *hexstr = s + 2;
6205
	    int hex;
6206
	    static const char* const close_brace = "}";
6207
	    long pos;
6208

  
6209
	    if (hexstr >= s_end) {
6210
		rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6211
	    }
6212
	    /* find close brace */
6213
	    pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
6214
	    if (pos < 0) {
6215
		rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
6216
	    }
6217
	    hex = scan_hex(hexstr, pos, &hexlen);
6218
	    if (hexlen == 0 || hexlen > 6) {
6219
		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6220
	    }
6221
	    if (hex > 0x10ffff) {
6222
		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
6223
	    }
6224
	    if ((hex & 0xfffff800) == 0xd800) {
6225
		rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
6226
	    }
6227
	    codelen = rb_enc_codelen(hex, *penc);
6228
	    rb_enc_mbcput(hex, buf, *penc);
6229
	    rb_str_cat(undumped, buf, codelen);
6230
	    n += rb_strlen_lit("u{}") + hexlen;
6231
	}
6232
	else { /* handle \uXXXX form */
6233
	    int hex = scan_hex(s+1, 4, &hexlen);
6234
	    if (hexlen != 4) {
6235
		rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6236
	    }
6237
	    codelen = rb_enc_codelen(hex, *penc);
6238
	    rb_enc_mbcput(hex, buf, *penc);
6239
	    rb_str_cat(undumped, buf, codelen);
6240
	    n += rb_strlen_lit("uXXXX");
6241
	}
6242
	break;
6243
      case 'x':
6244
	if (s+1 >= s_end) {
6245
	    rb_raise(rb_eRuntimeError, "invalid hex escape");
6246
	}
6247
	c2 = scan_hex(s+1, 2, &hexlen);
6248
	if (hexlen != 2) {
6249
	    rb_raise(rb_eRuntimeError, "invalid hex escape");
6250
	}
6251
	*buf = (char)c2;
6252
	rb_str_cat(undumped, buf, 1L);
6253
	n += rb_strlen_lit("xXX");
6254
	break;
6255
      default:
6256
	rb_str_cat(undumped, "\\", 1L); /* keep backslash */
6257
    }
6258

  
6259
    return n;
6260
}
6261

  
6262
static VALUE rb_str_is_ascii_only_p(VALUE str);
6263

  
6264
/*
6265
 *  call-seq:
6266
 *     str.undump   -> new_str
6267
 *
6268
 *  Produces an unescaped version of +str+.
6269
 *  See also String#dump, because String#undump does the inverse of String#dump.
6270
 *
6271
 *    "\"hello \\n ''\"".undump #=> "hello \n ''"
6272
 */
6273

  
6274
static VALUE
6275
str_undump(VALUE str)
6276
{
6277
    const char *s = RSTRING_PTR(str);
6278
    const char *s_end = RSTRING_END(str);
6279
    long len = RSTRING_LEN(str);
6280
    rb_encoding *enc = rb_enc_get(str), *forced_enc;
6281
    int n;
6282
    unsigned int c;
6283
    enum undump_source_format source_format;
6284
    VALUE undumped = rb_enc_str_new(s, 0L, enc);
6285
    VALUE forced_enc_str;
6286
    long forced_enc_str_len;
6287
    int w;
6288

  
6289
    rb_must_asciicompat(str);
6290
    if (rb_str_is_ascii_only_p(str) == Qfalse) {
6291
	rb_raise(rb_eRuntimeError, "non-ASCII character detected");
6292
    }
6293
    if (!str_null_check(str, &w)) {
6294
	rb_raise(rb_eRuntimeError, "string contains null byte");
6295
    }
6296

  
6297
    source_format = check_undump_source_format(s, s_end, len, enc,
6298
					       &forced_enc_str, &forced_enc_str_len);
6299
    if (source_format == UNDUMP_SOURCE_INVALID) {
6300
	rb_raise(rb_eRuntimeError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
6301
    }
6302
    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
6303
	forced_enc = rb_find_encoding(forced_enc_str);
6304
	if (forced_enc == NULL) {
6305
	    rb_raise(rb_eRuntimeError, "unknown encoding name - %"PRIsVALUE, forced_enc_str);
6306
	}
6307
    }
6308

  
6309
    /* strip '"' at the start */
6310
    s++;
6311
    if (source_format == UNDUMP_SOURCE_SIMPLE) {
6312
	/* strip '"' at the end */
6313
	s_end--;
6314
    } else { /* source_format == UNDUMP_SOURCE_FORCE_ENCODING */
6315
	/* strip '".force_encoding("...")' */
6316
	s_end -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
6317
    }
6318

  
6319
    for (; s < s_end; s += n) {
6320
	c = rb_enc_codepoint_len(s, s_end, &n, enc);
6321
	if (c == '\\') {
6322
	    if (s+1 >= s_end) {
6323
		rb_raise(rb_eRuntimeError, "invalid escape");
6324
	    }
6325
	    n = undump_after_backslash(undumped, s+1, s_end, &enc);
6326
	}
6327
	else if (c == '"') {
6328
	    rb_raise(rb_eRuntimeError, "non-escaped double quote detected");
6329
	}
6330
	else {
6331
	    rb_str_cat(undumped, s, n);
6332
	}
6333
    }
6334

  
6335
    if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
6336
	rb_enc_associate(undumped, forced_enc);
6337
	ENC_CODERANGE_CLEAR(undumped);
6338
    }
6339
    OBJ_INFECT(undumped, str);
6340
    return undumped;
6341
}
6076 6342

  
6077 6343
static void
6078 6344
rb_str_check_dummy_enc(rb_encoding *enc)
......
10586 10852
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
10587 10853
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
10588 10854
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);
10855
    rb_define_method(rb_cString, "undump", str_undump, 0);
10589 10856

  
10590 10857
    sym_ascii      = ID2SYM(rb_intern("ascii"));
10591 10858
    sym_turkic     = ID2SYM(rb_intern("turkic"));
test/ruby/test_string.rb
753 753
    assert_equal(S('"\\u{10ABCD}"'), b.dump)
754 754
  end
755 755

  
756
  def test_undump
757
    a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
758
    assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
759
    assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
760
    assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
761
    assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
762
    assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
763
    assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
764
    assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)
765

  
766
    assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
767
    assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)
768

  
769
    assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)
770

  
771
    assert_equal("abc".encode(Encoding::UTF_16LE),
772
                 '"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)
773

  
774
    assert_equal('\#', '"\\\\#"'.undump)
775
    assert_equal('\#{', '"\\\\\#{"'.undump)
776

  
777
    assert_raise(RuntimeError) { S('\u3042').undump }
778
    assert_raise(RuntimeError) { S('"".force_encoding()').undump }
779
    assert_raise(RuntimeError) { S('"".force_encoding("UNKNOWN")').undump }
780
    assert_raise(RuntimeError) { S(%("\u00E4")).undump }
781
    assert_raise(RuntimeError) { S('""""').undump }
782

  
783
    assert_raise(RuntimeError) { S('"\u"').undump }
784
    assert_raise(RuntimeError) { S('"\u{"').undump }
785
    assert_raise(RuntimeError) { S('"\u{3042"').undump }
786
    assert_raise(RuntimeError) { S('"\x"').undump }
787
    assert_raise(RuntimeError) { S('"\\"').undump }
788
    assert_raise(RuntimeError) { S(%("\0")).undump }
789
  end
790

  
756 791
  def test_dup
757 792
    for taint in [ false, true ]
758 793
      for frozen in [ false, true ]
759
-