Project

General

Profile

Backport #8323 » 0001-io.c-conversion-from-bom-encoding.patch

nobu (Nobuyoshi Nakada), 04/25/2013 01:42 PM

View differences:

io.c
4860 4860
 * Qnil => no encoding specified (internal only)
4861 4861
 */
4862 4862
static void
4863
rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2)
4863
rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode)
4864 4864
{
4865 4865
    int default_ext = 0;
4866 4866

  
......
4871 4871
    if (intern == NULL && ext != rb_ascii8bit_encoding())
4872 4872
	/* If external is ASCII-8BIT, no default transcoding */
4873 4873
	intern = rb_default_internal_encoding();
4874
    if (intern == NULL || intern == (rb_encoding *)Qnil || intern == ext) {
4874
    if (intern == NULL || intern == (rb_encoding *)Qnil ||
4875
	(!(fmode & FMODE_SETENC_BY_BOM) && (intern == ext))) {
4875 4876
	/* No internal encoding => use external + no transcoding */
4876 4877
	*enc = (default_ext && intern != ext) ? NULL : ext;
4877 4878
	*enc2 = NULL;
......
4894 4895
    const char *p;
4895 4896
    char encname[ENCODING_MAXNAMELEN+1];
4896 4897
    int idx, idx2;
4898
    int fmode = fmode_p ? *fmode_p : 0;
4897 4899
    rb_encoding *ext_enc, *int_enc;
4898 4900

  
4899 4901
    /* parse estr as "enc" or "enc2:enc" or "enc:-" */
......
4905 4907
	    idx = -1;
4906 4908
	else {
4907 4909
	    if (io_encname_bom_p(estr, len)) {
4908
		if (fmode_p) *fmode_p |= FMODE_SETENC_BY_BOM;
4910
		fmode |= FMODE_SETENC_BY_BOM;
4909 4911
		estr += 4;
4910 4912
                len -= 4;
4911 4913
            }
......
4918 4920
    else {
4919 4921
	long len = strlen(estr);
4920 4922
	if (io_encname_bom_p(estr, len)) {
4921
	    if (fmode_p) *fmode_p |= FMODE_SETENC_BY_BOM;
4923
	    fmode |= FMODE_SETENC_BY_BOM;
4922 4924
	    estr += 4;
4923 4925
            len -= 4;
4924 4926
	    memcpy(encname, estr, len);
......
4927 4929
	}
4928 4930
	idx = rb_enc_find_index(estr);
4929 4931
    }
4932
    if (fmode_p) *fmode_p = fmode;
4930 4933

  
4931 4934
    if (idx >= 0)
4932 4935
	ext_enc = rb_enc_from_index(idx);
......
4946 4949
	    idx2 = rb_enc_find_index(p);
4947 4950
	    if (idx2 < 0)
4948 4951
		unsupported_encoding(p);
4949
	    else if (idx2 == idx) {
4952
	    else if (!(fmode & FMODE_SETENC_BY_BOM) && (idx2 == idx)) {
4950 4953
		int_enc = (rb_encoding *)Qnil;
4951 4954
	    }
4952 4955
	    else
......
4954 4957
	}
4955 4958
    }
4956 4959

  
4957
    rb_io_ext_int_to_encs(ext_enc, int_enc, enc_p, enc2_p);
4960
    rb_io_ext_int_to_encs(ext_enc, int_enc, enc_p, enc2_p, fmode);
4958 4961
}
4959 4962

  
4960 4963
int
......
5015 5018
	    parse_mode_enc(StringValueCStr(tmp), enc_p, enc2_p, fmode_p);
5016 5019
	}
5017 5020
	else {
5018
	    rb_io_ext_int_to_encs(rb_to_encoding(encoding), NULL, enc_p, enc2_p);
5021
	    rb_io_ext_int_to_encs(rb_to_encoding(encoding), NULL, enc_p, enc2_p, 0);
5019 5022
	}
5020 5023
    }
5021 5024
    else if (extenc != Qundef || intenc != Qundef) {
5022 5025
        extracted = 1;
5023
	rb_io_ext_int_to_encs(extencoding, intencoding, enc_p, enc2_p);
5026
	rb_io_ext_int_to_encs(extencoding, intencoding, enc_p, enc2_p, 0);
5024 5027
    }
5025 5028
    return extracted;
5026 5029
}
......
5095 5098
    vmode = *vmode_p;
5096 5099

  
5097 5100
    /* Set to defaults */
5098
    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2);
5101
    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2, 0);
5099 5102

  
5100 5103
  vmode_handle:
5101 5104
    if (NIL_P(vmode)) {
......
5123 5126
	    rb_encoding *e;
5124 5127

  
5125 5128
	    e = (fmode & FMODE_BINMODE) ? rb_ascii8bit_encoding() : NULL;
5126
	    rb_io_ext_int_to_encs(e, NULL, &enc, &enc2);
5129
	    rb_io_ext_int_to_encs(e, NULL, &enc, &enc2, fmode);
5127 5130
	}
5128 5131
    }
5129 5132

  
......
5147 5150
            oflags |= O_BINARY;
5148 5151
#endif
5149 5152
	    if (!has_enc)
5150
		rb_io_ext_int_to_encs(rb_ascii8bit_encoding(), NULL, &enc, &enc2);
5153
		rb_io_ext_int_to_encs(rb_ascii8bit_encoding(), NULL, &enc, &enc2, fmode);
5151 5154
	}
5152 5155
#if DEFAULT_TEXTMODE
5153 5156
	else if (NIL_P(vmode)) {
......
5370 5373
io_set_encoding_by_bom(VALUE io)
5371 5374
{
5372 5375
    int idx = io_strip_bom(io);
5376
    rb_io_t *fptr;
5373 5377

  
5378
    GetOpenFile(io, fptr);
5374 5379
    if (idx) {
5375
	rb_io_t *fptr;
5376
	GetOpenFile(io, fptr);
5377 5380
	io_encoding_set(fptr, rb_enc_from_encoding(rb_enc_from_index(idx)),
5378 5381
		rb_io_internal_encoding(io), Qnil);
5379 5382
    }
5383
    else {
5384
	fptr->encs.enc2 = NULL;
5385
    }
5380 5386
}
5381 5387

  
5382 5388
static VALUE
......
5386 5392
    convconfig_t cc;
5387 5393
    if (!convconfig) {
5388 5394
	/* Set to default encodings */
5389
	rb_io_ext_int_to_encs(NULL, NULL, &cc.enc, &cc.enc2);
5395
	rb_io_ext_int_to_encs(NULL, NULL, &cc.enc, &cc.enc2, fmode);
5390 5396
        cc.ecflags = 0;
5391 5397
        cc.ecopts = Qnil;
5392 5398
        convconfig = &cc;
......
5420 5426
	/* Set to default encodings */
5421 5427

  
5422 5428
	e = (fmode & FMODE_BINMODE) ? rb_ascii8bit_encoding() : NULL;
5423
	rb_io_ext_int_to_encs(e, NULL, &convconfig.enc, &convconfig.enc2);
5429
	rb_io_ext_int_to_encs(e, NULL, &convconfig.enc, &convconfig.enc2, fmode);
5424 5430
        convconfig.ecflags = 0;
5425 5431
        convconfig.ecopts = Qnil;
5426 5432
    }
......
9078 9084
    else {
9079 9085
	if (NIL_P(v1)) {
9080 9086
	    /* Set to default encodings */
9081
	    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2);
9087
	    rb_io_ext_int_to_encs(NULL, NULL, &enc, &enc2, 0);
9082 9088
	    SET_UNIVERSAL_NEWLINE_DECORATOR_IF_ENC2(enc2, ecflags);
9083 9089
            ecopts = Qnil;
9084 9090
	}
......
9090 9096
                ecflags = rb_econv_prepare_options(opt, &ecopts, ecflags);
9091 9097
	    }
9092 9098
	    else {
9093
		rb_io_ext_int_to_encs(find_encoding(v1), NULL, &enc, &enc2);
9099
		rb_io_ext_int_to_encs(find_encoding(v1), NULL, &enc, &enc2, 0);
9094 9100
		SET_UNIVERSAL_NEWLINE_DECORATOR_IF_ENC2(enc2, ecflags);
9095 9101
                ecopts = Qnil;
9096 9102
	    }
test/ruby/test_io_m17n.rb
1997 1997
  def test_strip_bom
1998 1998
    with_tmpdir {
1999 1999
      text = "\uFEFFa"
2000
      stripped = "a"
2000 2001
      %w/UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE/.each do |name|
2001 2002
        path = '%s-bom.txt' % name
2002 2003
        content = text.encode(name)
......
2004 2005
        result = File.read(path, mode: 'rb:BOM|UTF-8')
2005 2006
        assert_equal(content[1].force_encoding("ascii-8bit"),
2006 2007
                     result.force_encoding("ascii-8bit"))
2008
        result = File.read(path, mode: 'rb:BOM|UTF-8:UTF-8')
2009
        assert_equal(Encoding::UTF_8, result.encoding)
2010
        assert_equal(stripped, result)
2007 2011
      end
2008 2012

  
2009 2013
      bug3407 = '[ruby-core:30641]'
2010
      result = File.read('UTF-8-bom.txt', encoding: 'BOM|UTF-8')
2014
      path = 'UTF-8-bom.txt'
2015
      result = File.read(path, encoding: 'BOM|UTF-8')
2011 2016
      assert_equal("a", result.force_encoding("ascii-8bit"), bug3407)
2017

  
2018
      bug8323 = '[ruby-core:54563] [Bug #8323]'
2019
      expected = "a\xff".force_encoding("utf-8")
2020
      open(path, 'ab') {|f| f.write("\xff")}
2021
      result = File.read(path, encoding: 'BOM|UTF-8')
2022
      assert_not_predicate(result, :valid_encoding?, bug8323)
2023
      assert_equal(expected, result, bug8323)
2024
      result = File.read(path, encoding: 'BOM|UTF-8:UTF-8')
2025
      assert_not_predicate(result, :valid_encoding?, bug8323)
2026
      assert_equal(expected, result, bug8323)
2027

  
2028
      path = 'ascii.txt'
2029
      generate_file(path, stripped)
2030
      result = File.read(path, encoding: 'BOM|UTF-8')
2031
      assert_equal(stripped, result, bug8323)
2032
      result = File.read(path, encoding: 'BOM|UTF-8:UTF-8')
2033
      assert_equal(stripped, result, bug8323)
2012 2034
    }
2013 2035
  end
2014 2036

  
2015
-