String#valid_encoding?: accept an optional encoding argument.

With no argument, or with nil, the method behaves exactly as before and
reports whether the receiver is valid in its own encoding.  With an encoding
(anything rb_to_encoding accepts) the receiver's bytes are validated against
that encoding instead.

Implementation notes:
  - The scan that rb_enc_str_coderange used to do inline is factored out into
    make_coderange(), which never reads or writes the code range cached on
    the string, so it can be used with an arbitrary encoding.
  - When the requested encoding is the receiver's own encoding, the cached
    code range is used rather than rescanning the bytes.
  - The method is now registered with arity -1; the argument count is checked
    explicitly with rb_check_arity(argc, 0, 1).

Patch is relative to revision 53085.

Index: string.c
===================================================================
--- string.c	(revision 53085)
+++ string.c	(working copy)
@@ -561,6 +561,22 @@ rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 }
 
+/*
+ * Scans the bytes of str as if they were encoded in enc and returns the
+ * resulting ENC_CODERANGE_* value.  The code range cached on str is neither
+ * read nor updated, so enc may differ from the string's own encoding.
+ */
+static int
+make_coderange(VALUE str, rb_encoding *enc)
+{
+    if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc)) {
+	/* a dummy encoding with mbminlen > 1 is always reported as broken */
+	return ENC_CODERANGE_BROKEN;
+    }
+    return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
+			  get_actual_encoding(rb_enc_to_index(enc), str));
+}
+
 int
 rb_enc_str_coderange(VALUE str)
 {
@@ -569,13 +585,7 @@ rb_enc_str_coderange(VALUE str)
     if (cr == ENC_CODERANGE_UNKNOWN) {
 	int encidx = ENCODING_GET(str);
 	rb_encoding *enc = rb_enc_from_index(encidx);
-	if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc)) {
-	    cr = ENC_CODERANGE_BROKEN;
-	}
-	else {
-	    cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
-				get_actual_encoding(encidx, str));
-	}
+	cr = make_coderange(str, enc);
         ENC_CODERANGE_SET(str, cr);
     }
     return cr;
@@ -8541,7 +8551,7 @@ rb_str_b(VALUE str)
 
 /*
  *  call-seq:
- *     str.valid_encoding?  -> true or false
+ *     str.valid_encoding?(encoding = nil)  -> true or false
  *
- *  Returns true for a string which encoded correctly.
+ *  Returns true for a string which is encoded correctly.
  *
@@ -8548,13 +8558,41 @@ rb_str_b(VALUE str)
  *    "\xc2\xa1".force_encoding("UTF-8").valid_encoding?  #=> true
  *    "\xc2".force_encoding("UTF-8").valid_encoding?      #=> false
  *    "\x80".force_encoding("UTF-8").valid_encoding?      #=> false
+ *
+ *  If +encoding+ is given and is not +nil+, the bytes of the string are
+ *  validated against that encoding instead of the string's own encoding.
+ *
+ *    "\xc2\xa1".valid_encoding?("UTF-8")  #=> true
+ *    "\xc2".valid_encoding?("UTF-8")      #=> false
+ *    "\x80".valid_encoding?("UTF-8")      #=> false
  */
 
 static VALUE
-rb_str_valid_encoding_p(VALUE str)
+rb_str_valid_encoding_p(int argc, VALUE *argv, VALUE str)
 {
-    int cr = rb_enc_str_coderange(str);
+    VALUE enc;
+    int cr;
 
+    rb_check_arity(argc, 0, 1);
+    enc = argc > 0 ? argv[0] : Qnil;
+
+    if (NIL_P(enc)) {
+	/* no encoding requested: use (and cache) the string's own code range */
+	cr = rb_enc_str_coderange(str);
+    }
+    else {
+	rb_encoding *target = rb_to_encoding(enc);
+
+	if (rb_enc_to_index(target) == ENCODING_GET(str)) {
+	    /* same encoding as the receiver: the cached code range applies */
+	    cr = rb_enc_str_coderange(str);
+	}
+	else {
+	    /* foreign encoding: scan without touching the cached code range */
+	    cr = make_coderange(str, target);
+	}
+    }
+
     return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
 }
 
@@ -9502,7 +9540,7 @@ Init_String(void)
     rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
     rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
     rb_define_method(rb_cString, "b", rb_str_b, 0);
-    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
+    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, -1);
     rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
 
     rb_fs = Qnil;
Index: test/ruby/test_m17n.rb
===================================================================
--- test/ruby/test_m17n.rb	(revision 53085)
+++ test/ruby/test_m17n.rb	(working copy)
@@ -1476,6 +1476,49 @@ class TestM17N < Test::Unit::TestCase
     assert_equal(false, s.valid_encoding?, bug6190)
   end
 
+  def test_valid_encoding_with_arg
+    s = "\xa1"
+    assert_equal(false, s.valid_encoding?("euc-jp"))
+    assert_equal(true, (s+s).valid_encoding?("euc-jp"), "[ruby-dev:33826]")
+    assert_equal(true, (s*2).valid_encoding?("euc-jp"), "[ruby-dev:33826]")
+    assert_equal(true, ("%s%s" % [s, s]).valid_encoding?("euc-jp"))
+    assert_equal(true, (s.dup << s).valid_encoding?("euc-jp"))
+    assert_equal(true, "".center(2, s).valid_encoding?("euc-jp"))
+
+    s = "\xa1\xa1\x8f"
+    assert_equal(false, s.valid_encoding?("euc-jp"))
+    assert_equal(true, s.reverse.valid_encoding?("euc-jp"))
+
+    bug4018 = '[ruby-core:33027]'
+    s = "\xa1\xa1"
+    assert_equal(true, s.valid_encoding?("euc-jp"))
+    s << "\x8f"
+    assert_equal(false, s.valid_encoding?("euc-jp"), bug4018)
+    s = "aa"
+    assert_equal(true, s.valid_encoding?("utf-16be"))
+    s << "\xff"
+    assert_equal(false, s.valid_encoding?("utf-16be"), bug4018)
+
+    bug6190 = '[ruby-core:43557]'
+    s = "\xe9"
+    s = s.encode("utf-8", "utf-8")
+    s.force_encoding("ascii-8bit")
+    assert_equal(false, s.valid_encoding?("utf-8"), bug6190)
+    s = "\xe9"
+    s.encode!("utf-8", "utf-8")
+    s.force_encoding("ascii-8bit")
+    assert_equal(false, s.valid_encoding?("utf-8"), bug6190)
+
+    # Encoding objects, an explicit nil, and the receiver's own encoding
+    s = "\xa1\xa1"
+    assert_equal(true, s.valid_encoding?(Encoding::EUC_JP))
+    s = "\x80".force_encoding("utf-8")
+    assert_equal(false, s.valid_encoding?(nil))
+    assert_equal(false, s.valid_encoding?("utf-8"))
+    assert_equal(true, s.valid_encoding?("ascii-8bit"))
+    assert_raise(ArgumentError) {s.valid_encoding?("utf-8", "utf-8")}
+  end
+
   def test_getbyte
     assert_equal(0x82, u("\xE3\x81\x82\xE3\x81\x84").getbyte(2))
     assert_equal(0x82, u("\xE3\x81\x82\xE3\x81\x84").getbyte(-4))