Feature #12275 » v2.patch
NEWS | ||
---|---|---|
* String#each_grapheme_cluster and String#grapheme_clusters is added to
|
||
enumerate grapheme clusters [Feature #13780]
|
||
* String#start_with? supports regexp [Feature #13712]
|
||
* String#undump is added to unescape String#dump'ed string [Feature #12275]
|
||
* Regexp/String: Update Unicode version from 9.0.0 to 10.0.0 [Feature #13685]
|
||
string.c | ||
---|---|---|
#include "ruby_assert.h"
|
||
#include "id.h"
|
||
#include "debug_counter.h"
|
||
#include "ruby/util.h"
|
||
#define BEG(no) (regs->beg[(no)])
|
||
#define END(no) (regs->end[(no)])
|
||
... | ... | |
return rb_str_eql(folded_str1, folded_str2);
|
||
}
|
||
static long
|
||
strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
|
||
const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
|
||
{
|
||
const char *search_start = str_ptr;
|
||
long pos, search_len = str_len - offset;
|
||
for (;;) {
|
||
const char *t;
|
||
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
|
||
if (pos < 0) return pos;
|
||
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
|
||
if (t == search_start + pos) break;
|
||
search_len -= t - search_start;
|
||
if (search_len <= 0) return -1;
|
||
offset += t - search_start;
|
||
search_start = t;
|
||
}
|
||
return pos + offset;
|
||
}
|
||
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
|
||
static long
|
||
rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
|
||
{
|
||
const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
|
||
long pos, str_len, sub_len, search_len;
|
||
const char *str_ptr, *str_ptr_end, *sub_ptr;
|
||
long str_len, sub_len;
|
||
int single_byte = single_byte_optimizable(str);
|
||
rb_encoding *enc;
|
||
... | ... | |
if (sub_len == 0) return offset;
|
||
/* need proceed one character at a time */
|
||
search_start = str_ptr;
|
||
search_len = RSTRING_LEN(str) - offset;
|
||
for (;;) {
|
||
const char *t;
|
||
pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
|
||
if (pos < 0) return pos;
|
||
t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
|
||
if (t == search_start + pos) break;
|
||
search_len -= t - search_start;
|
||
if (search_len <= 0) return -1;
|
||
offset += t - search_start;
|
||
search_start = t;
|
||
}
|
||
return pos + offset;
|
||
return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
|
||
}
|
||
... | ... | |
return result;
|
||
}
|
||
enum undump_source_format {
|
||
UNDUMP_SOURCE_SIMPLE, /* "..." */
|
||
UNDUMP_SOURCE_FORCE_ENCODING, /* "...".force_encoding("...") */
|
||
UNDUMP_SOURCE_INVALID
|
||
};
|
||
static enum undump_source_format
|
||
check_undump_source_format(const char *s, const char *s_end, long len, rb_encoding *enc,
|
||
VALUE *forced_enc_str, long *forced_enc_str_len)
|
||
{
|
||
unsigned int cbeg, cend;
|
||
const char *prev;
|
||
static const long force_encoding_minimum_len = rb_strlen_lit("\"\".force_encoding(\"\")");
|
||
static const char force_encoding_middle_part[] = "\".force_encoding(\"";
|
||
static const long force_encoding_middle_part_len = rb_strlen_lit("\".force_encoding(\"");
|
||
static const char force_encoding_end_part[] = "\")";
|
||
static const long force_encoding_end_part_len = rb_strlen_lit("\")");
|
||
long pos_before_middle_part, pos_before_end_part, pos_after_middle_part;
|
||
if (len < 2) return UNDUMP_SOURCE_INVALID;
|
||
cbeg = rb_enc_mbc_to_codepoint(s, s_end, enc);
|
||
if (cbeg != '"') return UNDUMP_SOURCE_INVALID;
|
||
prev = rb_enc_prev_char(s, s_end, s_end, enc);
|
||
cend = rb_enc_mbc_to_codepoint(prev, s_end, enc);
|
||
if (cend == '"') return UNDUMP_SOURCE_SIMPLE;
|
||
if (cend != ')' || len < force_encoding_minimum_len) {
|
||
return UNDUMP_SOURCE_INVALID;
|
||
}
|
||
/* find '".force_encoding("' */
|
||
pos_before_middle_part = strseq_core(s, s_end, len,
|
||
force_encoding_middle_part, force_encoding_middle_part_len,
|
||
0, enc);
|
||
if (pos_before_middle_part <= 0) {
|
||
return UNDUMP_SOURCE_INVALID;
|
||
}
|
||
pos_after_middle_part = pos_before_middle_part + force_encoding_middle_part_len;
|
||
/* find '")' */
|
||
pos_before_end_part = strseq_core(s + pos_after_middle_part, s_end, len - pos_after_middle_part,
|
||
force_encoding_end_part, force_encoding_end_part_len,
|
||
0, enc);
|
||
if (pos_before_end_part < 0 || pos_after_middle_part + pos_before_end_part + 2 != len) {
|
||
return UNDUMP_SOURCE_INVALID;
|
||
}
|
||
*forced_enc_str_len = pos_before_end_part;
|
||
*forced_enc_str = rb_str_new(s + pos_after_middle_part, *forced_enc_str_len);
|
||
return UNDUMP_SOURCE_FORCE_ENCODING;
|
||
}
|
||
static const char *
|
||
unescape_ascii(unsigned int c)
|
||
{
|
||
switch (c) {
|
||
case 'n':
|
||
return "\n";
|
||
case 'r':
|
||
return "\r";
|
||
case 't':
|
||
return "\t";
|
||
case 'f':
|
||
return "\f";
|
||
case 'v':
|
||
return "\v";
|
||
case 'b':
|
||
return "\b";
|
||
case 'a':
|
||
return "\a";
|
||
case 'e':
|
||
return "\e";
|
||
default:
|
||
UNREACHABLE;
|
||
}
|
||
}
|
||
static int
|
||
undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding **penc)
|
||
{
|
||
unsigned int c, c2;
|
||
int n, n2, codelen;
|
||
size_t hexlen;
|
||
char buf[6];
|
||
static const rb_encoding *enc_utf8 = NULL;
|
||
c = rb_enc_codepoint_len(s, s_end, &n, *penc);
|
||
switch (c) {
|
||
case '\\':
|
||
case '"':
|
||
rb_str_cat(undumped, s, n); /* cat itself */
|
||
n++;
|
||
break;
|
||
case 'n':
|
||
case 'r':
|
||
case 't':
|
||
case 'f':
|
||
case 'v':
|
||
case 'b':
|
||
case 'a':
|
||
case 'e':
|
||
rb_str_cat(undumped, unescape_ascii(c), n);
|
||
n++;
|
||
break;
|
||
case 'u':
|
||
if (s+1 >= s_end) {
|
||
rb_raise(rb_eArgError, "invalid Unicode escape");
|
||
}
|
||
if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
|
||
if (*penc != enc_utf8) {
|
||
*penc = enc_utf8;
|
||
rb_enc_associate(undumped, enc_utf8);
|
||
ENC_CODERANGE_CLEAR(undumped);
|
||
}
|
||
c2 = rb_enc_codepoint_len(s+1, s_end, NULL, *penc);
|
||
if (c2 == '{') { /* handle \u{...} form */
|
||
const char *hexstr = s + 2;
|
||
unsigned int hex;
|
||
static const char* const close_brace = "}";
|
||
long pos;
|
||
if (hexstr >= s_end) {
|
||
rb_raise(rb_eArgError, "unterminated Unicode escape");
|
||
}
|
||
/* find close brace */
|
||
pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, *penc);
|
||
if (pos < 0) {
|
||
rb_raise(rb_eArgError, "unterminated Unicode escape");
|
||
}
|
||
hex = ruby_scan_hex(hexstr, pos, &hexlen);
|
||
if (hexlen == 0 || hexlen > 6) {
|
||
rb_raise(rb_eArgError, "invalid Unicode escape");
|
||
}
|
||
if (hex > 0x10ffffU) {
|
||
rb_raise(rb_eArgError, "invalid Unicode codepoint (too large)");
|
||
}
|
||
if ((hex & 0xfffff800U) == 0xd800U) {
|
||
rb_raise(rb_eArgError, "invalid Unicode codepoint");
|
||
}
|
||
codelen = rb_enc_codelen(hex, *penc);
|
||
rb_enc_mbcput(hex, buf, *penc);
|
||
rb_str_cat(undumped, buf, codelen);
|
||
n += rb_strlen_lit("u{}") + hexlen;
|
||
}
|
||
else { /* handle \uXXXX form */
|
||
unsigned int hex = ruby_scan_hex(s+1, 4, &hexlen);
|
||
if (hexlen != 4) {
|
||
rb_raise(rb_eArgError, "invalid Unicode escape");
|
||
}
|
||
codelen = rb_enc_codelen(hex, *penc);
|
||
rb_enc_mbcput(hex, buf, *penc);
|
||
rb_str_cat(undumped, buf, codelen);
|
||
n += rb_strlen_lit("uXXXX");
|
||
}
|
||
break;
|
||
case 'x':
|
||
if (s+1 >= s_end) {
|
||
rb_raise(rb_eArgError, "invalid hex escape");
|
||
}
|
||
c2 = ruby_scan_hex(s+1, 2, &hexlen);
|
||
if (hexlen != 2) {
|
||
rb_raise(rb_eArgError, "invalid hex escape");
|
||
}
|
||
*buf = (char)c2;
|
||
rb_str_cat(undumped, buf, 1L);
|
||
n += rb_strlen_lit("xXX");
|
||
break;
|
||
case '#':
|
||
if (s+1 >= s_end) {
|
||
rb_str_cat(undumped, s, 1L); /* just '#' */
|
||
n++;
|
||
break;
|
||
}
|
||
n2 = rb_enc_mbclen(s+1, s_end, *penc);
|
||
if (n2 == 1 && IS_EVSTR(s+1, s_end)) {
|
||
rb_str_cat(undumped, s, n);
|
||
n += n2;
|
||
}
|
||
break;
|
||
default:
|
||
rb_str_cat(undumped, "\\", 1L); /* keep backslash */
|
||
}
|
||
return n;
|
||
}
|
||
/*
|
||
* call-seq:
|
||
* str.undump -> new_str
|
||
*
|
||
* Produces unescaped version of +str+.
|
||
* See also String#dump because String#undump does inverse of String#dump.
|
||
*
|
||
* "\"hello \\n ''\"".undump #=> "hello \n ''"
|
||
*/
|
||
static VALUE
|
||
str_undump(VALUE str)
|
||
{
|
||
const char *s = RSTRING_PTR(str);
|
||
const char *s_end = RSTRING_END(str);
|
||
long len = RSTRING_LEN(str);
|
||
rb_encoding *enc = rb_enc_get(str), *forced_enc;
|
||
int n;
|
||
unsigned int c;
|
||
enum undump_source_format source_format;
|
||
VALUE undumped = rb_enc_str_new(s, 0L, enc);
|
||
VALUE forced_enc_str;
|
||
long forced_enc_str_len;
|
||
rb_must_asciicompat(str);
|
||
source_format = check_undump_source_format(s, s_end, len, enc,
|
||
&forced_enc_str, &forced_enc_str_len);
|
||
if (source_format == UNDUMP_SOURCE_INVALID) {
|
||
rb_raise(rb_eArgError, "not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
|
||
}
|
||
if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
|
||
forced_enc = rb_to_encoding(forced_enc_str);
|
||
}
|
||
/* strip '"' at the start */
|
||
s++;
|
||
if (source_format == UNDUMP_SOURCE_SIMPLE) {
|
||
/* strip '"' at the end */
|
||
s_end--;
|
||
} else { /* source_format == UNDUMP_SOURCE_FORCE_ENCODING */
|
||
/* strip '".force_encoding("...")' */
|
||
s_end -= rb_strlen_lit("\".force_encoding(\"\")") + forced_enc_str_len;
|
||
}
|
||
for (; s < s_end; s += n) {
|
||
c = rb_enc_codepoint_len(s, s_end, &n, enc);
|
||
if (c == '\\') {
|
||
if (s+1 >= s_end) {
|
||
rb_raise(rb_eArgError, "invalid escape");
|
||
}
|
||
n = undump_after_backslash(undumped, s+1, s_end, &enc);
|
||
}
|
||
else {
|
||
rb_str_cat(undumped, s, n);
|
||
}
|
||
}
|
||
if (source_format == UNDUMP_SOURCE_FORCE_ENCODING) {
|
||
rb_enc_associate(undumped, forced_enc);
|
||
ENC_CODERANGE_CLEAR(undumped);
|
||
}
|
||
OBJ_INFECT(undumped, str);
|
||
return undumped;
|
||
}
|
||
static void
|
||
rb_str_check_dummy_enc(rb_encoding *enc)
|
||
... | ... | |
rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
|
||
rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
|
||
rb_define_method(rb_cString, "dump", rb_str_dump, 0);
|
||
rb_define_method(rb_cString, "undump", str_undump, 0);
|
||
sym_ascii = ID2SYM(rb_intern("ascii"));
|
||
sym_turkic = ID2SYM(rb_intern("turkic"));
|
test/ruby/test_string.rb | ||
---|---|---|
assert_equal(S('"\\u{10ABCD}"'), b.dump)
|
||
end
|
||
def test_undump
|
||
a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
|
||
assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)
|
||
assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
|
||
assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
|
||
assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
|
||
assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
|
||
assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
|
||
assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)
|
||
assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
|
||
assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)
|
||
assert_equal(Encoding::UTF_8, S('"\\u3042"').encode(Encoding::EUC_JP).undump.encoding)
|
||
assert_equal("abc".encode(Encoding::UTF_16LE),
|
||
'"a\x00b\x00c\x00".force_encoding("UTF-16LE")'.undump)
|
||
assert_raise(ArgumentError) { S('\u3042').undump }
|
||
assert_raise(ArgumentError) { S('"".force_encoding()').undump }
|
||
assert_raise(ArgumentError) { S('"".force_encoding("UNKNOWN")').undump }
|
||
end
|
||
def test_dup
|
||
for taint in [ false, true ]
|
||
for frozen in [ false, true ]
|