From a749456a1b337b183cdbeeb96ac5db7d3d215b50 Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada Date: Sat, 8 Jun 2019 21:38:06 +0900 Subject: [PATCH] Enable BOM detection with non-UTF encodings * io.c (parse_mode_enc): enable encoding detection by BOM with non-UTF encodings. It is quite common to designate the encoding of a file by its BOM. --- io.c | 10 +--------- test/ruby/test_io_m17n.rb | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/io.c b/io.c index 893cabc248..63d5323ac0 100644 --- a/io.c +++ b/io.c @@ -5454,9 +5454,7 @@ rb_io_fmode_modestr(int fmode) } static const char bom_prefix[] = "bom|"; -static const char utf_prefix[] = "utf-"; enum {bom_prefix_len = (int)sizeof(bom_prefix) - 1}; -enum {utf_prefix_len = (int)sizeof(utf_prefix) - 1}; static int io_encname_bom_p(const char *name, long len) @@ -5693,13 +5691,7 @@ parse_mode_enc(const char *estr, rb_encoding *estr_enc, if ((fmode & FMODE_SETENC_BY_BOM) || io_encname_bom_p(estr, len)) { estr += bom_prefix_len; len -= bom_prefix_len; - if (!STRNCASECMP(estr, utf_prefix, utf_prefix_len)) { - fmode |= FMODE_SETENC_BY_BOM; - } - else { - rb_enc_warn(estr_enc, "BOM with non-UTF encoding %s is nonsense", estr); - fmode &= ~FMODE_SETENC_BY_BOM; - } + fmode |= FMODE_SETENC_BY_BOM; } if (len == 0 || len > ENCODING_MAXNAMELEN) { idx = -1; diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index 8dfa5d5500..e10e521895 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -2084,30 +2084,30 @@ with_tmpdir { text = "\uFEFFa" stripped = "a" - %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.each do |name| + %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.product(%w[UTF-8 CP932]) do |name, enc| path = '%s-bom.txt' % name content = text.encode(name) generate_file(path, content) - result = File.read(path, mode: 'rb:BOM|UTF-8') - assert_equal(content[1].force_encoding("ascii-8bit"), - result.force_encoding("ascii-8bit")) - result = 
File.read(path, mode: 'rb:BOM|UTF-8:UTF-8') - assert_equal(Encoding::UTF_8, result.encoding) - assert_equal(stripped, result) + result = File.read(path, mode: "rb:BOM|#{enc}") + assert_equal(Encoding.find(name), result.encoding, name) + assert_equal(stripped.encode(name), result, name) + result = File.read(path, mode: "rb:BOM|#{enc}:UTF-8") + assert_equal(Encoding::UTF_8, result.encoding, name) + assert_equal(stripped, result, name) end bug3407 = '[ruby-core:30641]' path = 'UTF-8-bom.txt' result = File.read(path, encoding: 'BOM|UTF-8') - assert_equal("a", result.force_encoding("ascii-8bit"), bug3407) + assert_equal("a", result, bug3407) bug8323 = '[ruby-core:54563] [Bug #8323]' expected = "a\xff".force_encoding("utf-8") open(path, 'ab') {|f| f.write("\xff")} - result = File.read(path, encoding: 'BOM|UTF-8') + result = File.read(path, encoding: 'BOM|CP932') assert_not_predicate(result, :valid_encoding?, bug8323) assert_equal(expected, result, bug8323) - result = File.read(path, encoding: 'BOM|UTF-8:UTF-8') + result = File.read(path, encoding: 'BOM|CP932:UTF-8') assert_not_predicate(result, :valid_encoding?, bug8323) assert_equal(expected, result, bug8323) @@ -2138,25 +2138,25 @@ def test_bom_non_utf enc = nil - assert_warn(/BOM/) { + assert_warn('') { open(__FILE__, "r:bom|us-ascii") {|f| enc = f.external_encoding} } assert_equal(Encoding::US_ASCII, enc) enc = nil - assert_warn(/BOM/) { + assert_warn('') { open(__FILE__, "r", encoding: "bom|us-ascii") {|f| enc = f.external_encoding} } assert_equal(Encoding::US_ASCII, enc) enc = nil - assert_warn(/BOM/) { + assert_warn('') { open(IO::NULL, "w:bom|us-ascii") {|f| enc = f.external_encoding} } assert_equal(Encoding::US_ASCII, enc) enc = nil - assert_warn(/BOM/) { + assert_warn('') { open(IO::NULL, "w", encoding: "bom|us-ascii") {|f| enc = f.external_encoding} } assert_equal(Encoding::US_ASCII, enc) -- 2.21.0