Feature #12275 » v1.patch
| NEWS | ||
|---|---|---|
| 
       * String#each_grapheme_cluster and String#grapheme_clusters are added to 
   | 
||
| 
         enumerate grapheme clusters [Feature #13780] 
   | 
||
| 
       * String#start_with? supports regexp [Feature #13712] 
   | 
||
| 
       * String#undump is added to unescape String#dump'ed string [Feature #12275] 
   | 
||
| 
     * Regexp/String: Update Unicode version from 9.0.0 to 10.0.0 [Feature #13685] 
   | 
||
| string.c | ||
|---|---|---|
| 
     #include "ruby_assert.h" 
   | 
||
| 
     #include "id.h" 
   | 
||
| 
     #include "debug_counter.h" 
   | 
||
| 
     #include "ruby/util.h" 
   | 
||
| 
     #define BEG(no) (regs->beg[(no)]) 
   | 
||
| 
     #define END(no) (regs->end[(no)]) 
   | 
||
| ... | ... | |
| 
         return rb_str_eql(folded_str1, folded_str2); 
   | 
||
| 
     } 
   | 
||
| 
     static long 
   | 
||
| 
     strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len, 
   | 
||
| 
     	    const char *sub_ptr, long sub_len, long offset, rb_encoding *enc) 
   | 
||
| 
     { 
   | 
||
| 
         const char *search_start = str_ptr; 
   | 
||
| 
         long pos, search_len = str_len - offset; 
   | 
||
| 
         for (;;) { 
   | 
||
| 
     	const char *t; 
   | 
||
| 
     	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc); 
   | 
||
| 
     	if (pos < 0) return pos; 
   | 
||
| 
     	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc); 
   | 
||
| 
     	if (t == search_start + pos) break; 
   | 
||
| 
     	search_len -= t - search_start; 
   | 
||
| 
     	if (search_len <= 0) return -1; 
   | 
||
| 
     	offset += t - search_start; 
   | 
||
| 
     	search_start = t; 
   | 
||
| 
         } 
   | 
||
| 
         return pos + offset; 
   | 
||
| 
     } 
   | 
||
| 
     #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0) 
   | 
||
| 
     static long 
   | 
||
| 
     rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte) 
   | 
||
| 
     { 
   | 
||
| 
         const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start; 
   | 
||
| 
         long pos, str_len, sub_len, search_len; 
   | 
||
| 
         const char *str_ptr, *str_ptr_end, *sub_ptr; 
   | 
||
| 
         long str_len, sub_len; 
   | 
||
| 
         int single_byte = single_byte_optimizable(str); 
   | 
||
| 
         rb_encoding *enc; 
   | 
||
| ... | ... | |
| 
         if (sub_len == 0) return offset; 
   | 
||
| 
         /* need proceed one character at a time */ 
   | 
||
| 
         search_start = str_ptr; 
   | 
||
| 
         search_len = RSTRING_LEN(str) - offset; 
   | 
||
| 
         for (;;) { 
   | 
||
| 
     	const char *t; 
   | 
||
| 
     	pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc); 
   | 
||
| 
     	if (pos < 0) return pos; 
   | 
||
| 
     	t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc); 
   | 
||
| 
     	if (t == search_start + pos) break; 
   | 
||
| 
     	search_len -= t - search_start; 
   | 
||
| 
     	if (search_len <= 0) return -1; 
   | 
||
| 
     	offset += t - search_start; 
   | 
||
| 
     	search_start = t; 
   | 
||
| 
         } 
   | 
||
| 
         return pos + offset; 
   | 
||
| 
         return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc); 
   | 
||
| 
     } 
   | 
||
| ... | ... | |
| 
         return result; 
   | 
||
| 
     } 
   | 
||
| 
     /* Is s wrapped with '"'? */ 
   | 
||
| 
     static int 
   | 
||
| 
     is_wrapped(const char *s, const char *s_end, long len, rb_encoding *enc) 
   | 
||
| 
     { 
   | 
||
| 
         unsigned int cbeg, cend; 
   | 
||
| 
         const char *prev; 
   | 
||
| 
         if (len < 2) return FALSE; 
   | 
||
| 
         cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc); 
   | 
||
| 
         if (cbeg != '"') return FALSE; 
   | 
||
| 
         prev = rb_enc_prev_char(s, s_end, s_end, enc); 
   | 
||
| 
         cend = rb_enc_mbc_to_codepoint(prev, s_end, enc); 
   | 
||
| 
         return cend == '"'; 
   | 
||
| 
     } 
   | 
||
| 
     static const char * 
   | 
||
| 
     unescape_ascii(unsigned int c) 
   | 
||
| 
     { 
   | 
||
| 
         switch (c) { 
   | 
||
| 
           case 'n': 
   | 
||
| 
     	return "\n"; 
   | 
||
| 
           case 'r': 
   | 
||
| 
     	return "\r"; 
   | 
||
| 
           case 't': 
   | 
||
| 
     	return "\t"; 
   | 
||
| 
           case 'f': 
   | 
||
| 
     	return "\f"; 
   | 
||
| 
           case 'v': 
   | 
||
| 
     	return "\v"; 
   | 
||
| 
           case 'b': 
   | 
||
| 
     	return "\b"; 
   | 
||
| 
           case 'a': 
   | 
||
| 
     	return "\a"; 
   | 
||
| 
           case 'e': 
   | 
||
| 
     	return "\e"; 
   | 
||
| 
           default: 
   | 
||
| 
     	UNREACHABLE; 
   | 
||
| 
         } 
   | 
||
| 
     } 
   | 
||
| 
     static int 
   | 
||
| 
     undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding *enc) 
   | 
||
| 
     { 
   | 
||
| 
         unsigned int c, c2; 
   | 
||
| 
         int n, n2, codelen; 
   | 
||
| 
         size_t hexlen; 
   | 
||
| 
         char buf[6]; 
   | 
||
| 
         c = rb_enc_codepoint_len(s, s_end, &n, enc); 
   | 
||
| 
         switch (c) { 
   | 
||
| 
           case '\\': 
   | 
||
| 
           case '"': 
   | 
||
| 
     	rb_str_cat(undumped, s, n); /* cat itself */ 
   | 
||
| 
     	n++; 
   | 
||
| 
     	break; 
   | 
||
| 
           case 'n': 
   | 
||
| 
           case 'r': 
   | 
||
| 
           case 't': 
   | 
||
| 
           case 'f': 
   | 
||
| 
           case 'v': 
   | 
||
| 
           case 'b': 
   | 
||
| 
           case 'a': 
   | 
||
| 
           case 'e': 
   | 
||
| 
     	rb_str_cat(undumped, unescape_ascii(c), n); 
   | 
||
| 
     	n++; 
   | 
||
| 
     	break; 
   | 
||
| 
           case 'u': 
   | 
||
| 
     	if (s+1 >= s_end) { 
   | 
||
| 
     	    rb_raise(rb_eArgError, "invalid Unicode escape"); 
   | 
||
| 
     	} 
   | 
||
| 
     	c2 = rb_enc_codepoint_len(s+1, s_end, NULL, enc); 
   | 
||
| 
     	if (c2 == '{') { /* handle \u{...} form */ 
   | 
||
| 
     	    const char *hexstr = s + 2; 
   | 
||
| 
     	    unsigned int hex; 
   | 
||
| 
     	    static const char* const close_brace = "}"; 
   | 
||
| 
     	    long pos; 
   | 
||
| 
     	    if (hexstr >= s_end) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "unterminated Unicode escape"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    /* find close brace */ 
   | 
||
| 
     	    pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, enc); 
   | 
||
| 
     	    if (pos < 0) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "unterminated Unicode escape"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    hex = ruby_scan_hex(hexstr, pos, &hexlen); 
   | 
||
| 
     	    if (hexlen == 0 || hexlen > 6) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "invalid Unicode escape"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    if (hex > 0x10ffffU) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "invalid Unicode codepoint (too large)"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    if ((hex & 0xfffff800U) == 0xd800U) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "invalid Unicode codepoint"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    codelen = rb_enc_codelen(hex, enc); 
   | 
||
| 
     	    rb_enc_mbcput(hex, buf, enc); 
   | 
||
| 
     	    rb_str_cat(undumped, buf, codelen); 
   | 
||
| 
     	    n += rb_strlen_lit("u{}") + hexlen; 
   | 
||
| 
     	} 
   | 
||
| 
     	else { /* handle \uXXXX form */ 
   | 
||
| 
     	    unsigned int hex = ruby_scan_hex(s+1, 4, &hexlen); 
   | 
||
| 
     	    if (hexlen != 4) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "invalid Unicode escape"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    codelen = rb_enc_codelen(hex, enc); 
   | 
||
| 
     	    rb_enc_mbcput(hex, buf, enc); 
   | 
||
| 
     	    rb_str_cat(undumped, buf, codelen); 
   | 
||
| 
     	    n += rb_strlen_lit("uXXXX"); 
   | 
||
| 
     	} 
   | 
||
| 
     	break; 
   | 
||
| 
           case 'x': 
   | 
||
| 
     	if (s+1 >= s_end) { 
   | 
||
| 
     	    rb_raise(rb_eArgError, "invalid hex escape"); 
   | 
||
| 
     	} 
   | 
||
| 
     	c2 = ruby_scan_hex(s+1, 2, &hexlen); 
   | 
||
| 
     	if (hexlen != 2) { 
   | 
||
| 
     	    rb_raise(rb_eArgError, "invalid hex escape"); 
   | 
||
| 
     	} 
   | 
||
| 
     	*buf = (char)c2; 
   | 
||
| 
     	rb_str_cat(undumped, buf, 1L); 
   | 
||
| 
     	n += rb_strlen_lit("xXX"); 
   | 
||
| 
     	break; 
   | 
||
| 
           case '#': 
   | 
||
| 
     	if (s+1 >= s_end) { 
   | 
||
| 
     	    rb_str_cat(undumped, s, 1L); /* just '#' */ 
   | 
||
| 
     	    n++; 
   | 
||
| 
     	    break; 
   | 
||
| 
     	} 
   | 
||
| 
     	n2 = rb_enc_mbclen(s+1, s_end, enc); 
   | 
||
| 
     	if (n2 == 1 && IS_EVSTR(s+1, s_end)) { 
   | 
||
| 
     	    rb_str_cat(undumped, s, n); 
   | 
||
| 
     	    n += n2; 
   | 
||
| 
     	} 
   | 
||
| 
     	break; 
   | 
||
| 
           default: 
   | 
||
| 
     	rb_str_cat(undumped, "\\", 1L); /* keep backslash */ 
   | 
||
| 
         } 
   | 
||
| 
         return n; 
   | 
||
| 
     } 
   | 
||
| 
     /* 
   | 
||
| 
      *  call-seq: 
   | 
||
| 
      *     str.undump   -> new_str 
   | 
||
| 
      * 
   | 
||
| 
      *  Produces unescaped version of +str+. 
   | 
||
| 
      *  See also String#dump because String#undump does inverse of String#dump. 
   | 
||
| 
      * 
   | 
||
| 
      *    "\"hello \\n ''\"".undump #=> "hello \n ''" 
   | 
||
| 
      */ 
   | 
||
| 
     static VALUE 
   | 
||
| 
     str_undump(VALUE str) 
   | 
||
| 
     { 
   | 
||
| 
         const char *s = RSTRING_PTR(str); 
   | 
||
| 
         const char *s_end = RSTRING_END(str); 
   | 
||
| 
         long len = RSTRING_LEN(str); 
   | 
||
| 
         rb_encoding *enc = rb_enc_get(str); 
   | 
||
| 
         int n; 
   | 
||
| 
         unsigned int c; 
   | 
||
| 
         VALUE undumped = rb_enc_str_new(s, 0L, enc); 
   | 
||
| 
         rb_must_asciicompat(str); 
   | 
||
| 
         if (is_wrapped(s, s_end, len, enc)) { 
   | 
||
| 
     	/* strip '"' at the begin and the end */ 
   | 
||
| 
     	s++; 
   | 
||
| 
     	s_end--; 
   | 
||
| 
         } 
   | 
||
| 
         for (; s < s_end; s += n) { 
   | 
||
| 
     	c = rb_enc_codepoint_len(s, s_end, &n, enc); 
   | 
||
| 
     	if (c == '\\') { 
   | 
||
| 
     	    if (s+1 >= s_end) { 
   | 
||
| 
     		rb_raise(rb_eArgError, "invalid escape"); 
   | 
||
| 
     	    } 
   | 
||
| 
     	    n = undump_after_backslash(undumped, s+1, s_end, enc); 
   | 
||
| 
     	} 
   | 
||
| 
     	else { 
   | 
||
| 
     	    rb_str_cat(undumped, s, n); 
   | 
||
| 
     	} 
   | 
||
| 
         } 
   | 
||
| 
         OBJ_INFECT(undumped, str); 
   | 
||
| 
         return undumped; 
   | 
||
| 
     } 
   | 
||
| 
     static void 
   | 
||
| 
     rb_str_check_dummy_enc(rb_encoding *enc) 
   | 
||
| ... | ... | |
| 
         rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); 
   | 
||
| 
         rb_define_method(rb_cString, "inspect", rb_str_inspect, 0); 
   | 
||
| 
         rb_define_method(rb_cString, "dump", rb_str_dump, 0); 
   | 
||
| 
         rb_define_method(rb_cString, "undump", str_undump, 0); 
   | 
||
| 
         sym_ascii      = ID2SYM(rb_intern("ascii")); 
   | 
||
| 
         sym_turkic     = ID2SYM(rb_intern("turkic")); 
   | 
||
| test/ruby/test_string.rb | ||
|---|---|---|
| 
         assert_equal(S('"\\u{10ABCD}"'), b.dump) 
   | 
||
| 
       end 
   | 
||
| 
       def test_undump
         # Control characters given as \xXX and as mnemonic escapes.
         control = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
         assert_equal(control, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)

         # Each row is [expected result, dumped input].
         [
           ["\u{7F}",     '"\\x7F"'],
           ["\u{AB}",     '"\\u00AB"'],
           ["\u{ABC}",    '"\\u0ABC"'],
           ["\uABCD",     '"\\uABCD"'],
           ["\u{ABCDE}",  '"\\u{ABCDE}"'],
           ["\u{10ABCD}", '"\\u{10ABCD}"'],
           ["äöü",        '"\u00E4\u00F6\u00FC"'],
           ["äöü",        '"\xC3\xA4\xC3\xB6\xC3\xBC"'],
         ].each do |expected, dumped|
           assert_equal(S(expected), S(dumped).undump)
         end
       end
   | 
||
| 
       def test_dup 
   | 
||
| 
         for taint in [ false, true ] 
   | 
||
| 
           for frozen in [ false, true ] 
   | 
||