Project

General

Profile

Actions

Feature #6752

closed

Replacing ill-formed subsequence

Added by naruse (Yui NARUSE) over 12 years ago. Updated over 11 years ago.

Status:
Closed
Target version:
[ruby-dev:45975]

Description

=begin
== 概要
Stringになんらかの理由で不正なバイト列が含まれている時に、それを置換文字で置き換えたい。

== ユースケース
実際に確認されているユースケースは以下の通りです。

== 必要な引数: 置換文字
省略可能、String。
デフォルトは、Unicode系ならU+FFFD、それ以外では「?」。
デフォルトが空文字でない理由は、削除してしまうことで、従来は存在しなかったトークンを作れてしまい、
上位のレイヤーの脆弱性に繋がるからです。
http://unicode.org/reports/tr36/#UTF-8_Exploit

== API
--- str.encode(str.encoding, invalid: :replace, [replace: "〓"])

  • CSI的じゃなくて気持ち悪い
  • iconv でできるのは glibc iconv か GNU libiconv に //IGNORE つけた時で他はできない
  • 実装上のメリットは後述の通り、直感に反してあまりない(と思う)

== 別メソッド

  • 新しいメソッドである
  • fix/repair invalid/illegal bytes/sequence あたりの名前か

== 実装
=== 鬼車ベース
int ret = rb_enc_precise_mbclen(p, e, enc); して、
MBCLEN_INVALID_P(ret) が真な時、何バイト目が不正なのかわからないのが微妙。
ONIGENC_CONSTRUCT_MBCLEN_INVALID() がバイト数を取らないのが原因なので、
鬼車のエンコーディングモジュール全てに影響してしまうため、修正困難。
不正なバイトはほとんど存在しないと仮定して、効率を犠牲にすれば回避は可能。

=== transcodeベース
UCS正規化なglibc iconv, GNU libiconv, Perl Encodeなどと違って、
CSIなtranscodeでは、自分自身に変換する場合、
エンコーディングごとに「何もしない」変換モジュールを用意しないといけない。

とりあえず鬼車ベースのコンセプト実装とテストを添付しておきます。

diff --git a/string.c b/string.c
index d038835..4808f15 100644
--- a/string.c
+++ b/string.c
@@ -7426,6 +7426,199 @@ rb_str_ellipsize(VALUE str, long len)
return ret;
}

+/*

    • call-seq:
    • str.fix_invalid -> new_str
    • If the string is well-formed, it returns self.
    • If the string has invalid byte sequence, repair it with given replacement
    • character.
  • */
    +VALUE
    +rb_str_fix_invalid(VALUE str)
    +{
  • int cr = ENC_CODERANGE(str);
  • rb_encoding *enc;
  • if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
  • return rb_str_dup(str);
  • enc = STR_ENC_GET(str);
  • if (rb_enc_asciicompat(enc)) {
  • const char *p = RSTRING_PTR(str);
  • const char *e = RSTRING_END(str);
  • const char *p1 = p;
  • /* 10 should be enough for the usual use case,
    • fixing a wrongly chopped character at the end of the string
  • */
  • long room = 10;
  • VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room);
  • const char *rep;
  • if (enc == rb_utf8_encoding())
  •  rep = "\xEF\xBF\xBD";
    
  • else
  •  rep = "?";
    
  • cr = ENC_CODERANGE_7BIT;
  • p = search_nonascii(p, e);
  • if (!p) {
  •  p = e;
    
  • }
  • while (p < e) {
  •  int ret = rb_enc_precise_mbclen(p, e, enc);
    
  •  if (MBCLEN_CHARFOUND_P(ret)) {
    
  •  if ((unsigned char)*p > 127) cr = ENC_CODERANGE_VALID;
    
  •  p += MBCLEN_CHARFOUND_LEN(ret);
    
  •  }
    
  •  else if (MBCLEN_INVALID_P(ret)) {
    
  •  const char *q;
    
  •  long clen = rb_enc_mbmaxlen(enc);
    
  •  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
    
  •  q = RSTRING_END(buf);
    
  •  if (e - p < clen) clen = e - p;
    
  •  if (clen < 3) {
    
  •      clen = 1;
    
  •  }
    
  •  else {
    
  •      long len = RSTRING_LEN(buf);
    
  •      clen--;
    
  •      rb_str_buf_cat(buf, p, clen);
    
  •      for (; clen > 1; clen--) {
    
  •  	ret = rb_enc_precise_mbclen(q, q + clen, enc);
    
  •  	if (MBCLEN_NEEDMORE_P(ret)) {
    
  •  	    break;
    
  •  	}
    
  •  	else if (MBCLEN_INVALID_P(ret)) {
    
  •  	    continue;
    
  •  	}
    
  •  	else {
    
  •  	    rb_bug("shouldn't reach here '%s'", q);
    
  •  	}
    
  •      }
    
  •      rb_str_set_len(buf, len);
    
  •  }
    
  •  p += clen;
    
  •  p1 = p;
    
  •  rb_str_buf_cat2(buf, rep);
    
  •  p = search_nonascii(p, e);
    
  •  if (!p) {
    
  •      p = e;
    
  •      break;
    
  •  }
    
  •  }
    
  •  else if (MBCLEN_NEEDMORE_P(ret)) {
    
  •  break;
    
  •  }
    
  •  else {
    
  •  rb_bug("shouldn't reach here");
    
  •  }
    
  • }
  • if (p1 < p) {
  •  rb_str_buf_cat(buf, p1, p - p1);
    
  • }
  • if (p < e) {
  •  rb_str_buf_cat2(buf, rep);
    
  •  cr = ENC_CODERANGE_VALID;
    
  • }
  • ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
  • return buf;
  • }
  • else if (rb_enc_dummy_p(enc)) {
  • return rb_str_dup(str);
  • }
  • else {
  • /* ASCII incompatible */
  • const char *p = RSTRING_PTR(str);
  • const char *e = RSTRING_END(str);
  • const char *p1 = p;
  • /* 10 should be enough for the usual use case,
    • fixing a wrongly chopped character at the end of the string
  • */
  • long room = 10;
  • VALUE buf = rb_str_buf_new(RSTRING_LEN(str) + room);
  • const char *rep;
  • long mbminlen = rb_enc_mbminlen(enc);
  • static rb_encoding *utf16be;
  • static rb_encoding *utf16le;
  • static rb_encoding *utf32be;
  • static rb_encoding *utf32le;
  • if (!utf16be) {
  •  utf16be = rb_enc_find("UTF-16BE");
    
  •  utf16le = rb_enc_find("UTF-16LE");
    
  •  utf32be = rb_enc_find("UTF-32BE");
    
  •  utf32le = rb_enc_find("UTF-32LE");
    
  • }
  • if (enc == utf16be) {
  •  rep = "\xFF\xFD";
    
  • }
  • else if (enc == utf16le) {
  •  rep = "\xFD\xFF";
    
  • }
  • else if (enc == utf32be) {
  •  rep = "\x00\x00\xFF\xFD";
    
  • }
  • else if (enc == utf32le) {
  •  rep = "\xFD\xFF\x00\x00";
    
  • }
  • else {
  •  rep = "?";
    
  • }
  • while (p < e) {
  •  int ret = rb_enc_precise_mbclen(p, e, enc);
    
  •  if (MBCLEN_CHARFOUND_P(ret)) {
    
  •  p += MBCLEN_CHARFOUND_LEN(ret);
    
  •  }
    
  •  else if (MBCLEN_INVALID_P(ret)) {
    
  •  const char *q;
    
  •  long clen = rb_enc_mbmaxlen(enc);
    
  •  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
    
  •  q = RSTRING_END(buf);
    
  •  if (e - p < clen) clen = e - p;
    
  •  if (clen < mbminlen * 3) {
    
  •      clen = mbminlen;
    
  •  }
    
  •  else {
    
  •      long len = RSTRING_LEN(buf);
    
  •      clen -= mbminlen;
    
  •      rb_str_buf_cat(buf, p, clen);
    
  •      for (; clen > mbminlen; clen-=mbminlen) {
    
  •  	ret = rb_enc_precise_mbclen(q, q + clen, enc);
    
  •  	if (MBCLEN_NEEDMORE_P(ret)) {
    
  •  	    break;
    
  •  	}
    
  •  	else if (MBCLEN_INVALID_P(ret)) {
    
  •  	    continue;
    
  •  	}
    
  •  	else {
    
  •  	    rb_bug("shouldn't reach here '%s'", q);
    
  •  	}
    
  •      }
    
  •      rb_str_set_len(buf, len);
    
  •  }
    
  •  p += clen;
    
  •  p1 = p;
    
  •  rb_str_buf_cat2(buf, rep);
    
  •  }
    
  •  else if (MBCLEN_NEEDMORE_P(ret)) {
    
  •  break;
    
  •  }
    
  •  else {
    
  •  rb_bug("shouldn't reach here");
    
  •  }
    
  • }
  • if (p1 < p) {
  •  rb_str_buf_cat(buf, p1, p - p1);
    
  • }
  • if (p < e) {
  •  rb_str_buf_cat2(buf, rep);
    
  • }
  • ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
  • return buf;
  • }
    +}

/**********************************************************************

  • Document-class: Symbol

@@ -7882,6 +8075,7 @@ Init_String(void)
rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);

  • rb_define_method(rb_cString, "fix_invalid", rb_str_fix_invalid, 0);

    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
    index 47f349c..2b0cfeb 100644
    --- a/test/ruby/test_string.rb
    +++ b/test/ruby/test_string.rb
    @@ -2031,6 +2031,29 @@ class TestString < Test::Unit::TestCase

    assert_equal(u("\x82")+("\u3042"*9), ("\u3042"*10).byteslice(2, 28))
    end

  • def test_fix_invalid

  • assert_equal("\uFFFD\uFFFD\uFFFD", "\x80\x80\x80".fix_invalid)

  • assert_equal("\uFFFDA", "\xF4\x80\x80A".fix_invalid)

  • # examples in Unicode 6.1.0 D93b

  • assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41",

  •             "\x41\xC0\xAF\x41\xF4\x80\x80\x41".fix_invalid)
    
  • assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41",

  •             "\x41\xE0\x9F\x80\x41".fix_invalid)
    
  • assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",

  •             "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid)
    
  • assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",

  •             "abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64".fix_invalid)
    
  • assert_equal("\uFFFD\u3042".encode("UTF-16BE"),

  •             "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE).
    
  •             fix_invalid)
    
  • assert_equal("\uFFFD\u3042".encode("UTF-16LE"),

  •             "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE).
    
  •             fix_invalid)
    
  • end
    end

class TestString2 < TestString
=end


Related issues 2 (0 open, 2 closed)

Related to Ruby master - Feature #6321: Find and repair bad bytes in encodings, without transcoding | Closed | naruse (Yui NARUSE) | 04/19/2012 | Actions
Related to Ruby master - Bug #7967: String#encode invalid: :replace doesn't replace invalid chars | Rejected | 02/26/2013 | Actions
Actions

Also available in: Atom PDF

Like0
Like0Like0Like0Like0Like0Like0Like0Like0Like0