Project

General

Profile

Bug #14363 » each_grapheme_cluster_size_real.patch

hugopeixoto (Hugo Peixoto), 03/21/2018 04:17 PM

View differences:

string.c
8355 8355
    return rb_str_enumerate_codepoints(str, ary);
8356 8356
}
8357 8357

  
8358
static VALUE
8359
rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
8360
{
8361
    long grapheme_cluster_count = 0;
8362
    regex_t *reg_grapheme_cluster = NULL;
8363
    static regex_t *reg_grapheme_cluster_utf8 = NULL;
8364
    int encidx = ENCODING_GET(str);
8365
    rb_encoding *enc = rb_enc_from_index(encidx);
8366
    int unicode_p = rb_enc_unicode_p(enc);
8367
    const char *ptr, *end;
8368

  
8369
    if (!unicode_p || single_byte_optimizable(str)) {
8370
	return rb_str_length(str);
8371
    }
8372

  
8373
    /* synchronize */
8374
    if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
8375
	reg_grapheme_cluster = reg_grapheme_cluster_utf8;
8376
    }
8377
    if (!reg_grapheme_cluster) {
8378
	const OnigUChar source[] = "\\X";
8379
	int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1,
8380
			 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL);
8381
	if (r) {
8382
	    rb_bug("cannot compile grapheme cluster regexp");
8383
	}
8384
	if (encidx == rb_utf8_encindex()) {
8385
	    reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
8386
	}
8387
    }
8388

  
8389
    ptr = RSTRING_PTR(str);
8390
    end = RSTRING_END(str);
8391

  
8392
    while (ptr < end) {
8393
	OnigPosition len = onig_match(reg_grapheme_cluster,
8394
				      (const OnigUChar *)ptr, (const OnigUChar *)end,
8395
				      (const OnigUChar *)ptr, NULL, 0);
8396
	if (len == 0) break;
8397
	if (len < 0) {
8398
	    break;
8399
	}
8400
	grapheme_cluster_count++;
8401
	ptr += len;
8402
    }
8403
    RB_GC_GUARD(str);
8404

  
8405
    return LONG2NUM(grapheme_cluster_count);
8406
}
8407

  
8358 8408
static VALUE
8359 8409
rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
8360 8410
{
......
8426 8476
static VALUE
8427 8477
rb_str_each_grapheme_cluster(VALUE str)
8428 8478
{
8429
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
8479
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
8430 8480
    return rb_str_enumerate_grapheme_clusters(str, 0);
8431 8481
}
8432 8482

  
test/ruby/test_string.rb
982 982
      "\u{1f469 200d 2764 fe0f 200d 1f469}",
983 983
    ].each do |g|
984 984
      assert_equal [g], g.each_grapheme_cluster.to_a
985
      assert_equal 1, g.each_grapheme_cluster.size
986
    end
987

  
988
    [
989
      ["\u{a 308}", ["\u000A", "\u0308"]],
990
      ["\u{d 308}", ["\u000D", "\u0308"]],
991
      ["abc", ["a", "b", "c"]],
992
    ].each do |str, grapheme_clusters|
993
      assert_equal grapheme_clusters, str.each_grapheme_cluster.to_a
994
      assert_equal grapheme_clusters.size, str.each_grapheme_cluster.size
985 995
    end
986 996

  
987
    assert_equal ["\u000A", "\u0308"], "\u{a 308}".each_grapheme_cluster.to_a
988
    assert_equal ["\u000D", "\u0308"], "\u{d 308}".each_grapheme_cluster.to_a
989
    assert_equal ["a", "b", "c"], "abc".b.each_grapheme_cluster.to_a
990 997
    s = ("x"+"\u{10ABCD}"*250000)
991 998
    assert_empty(s.each_grapheme_cluster {s.clear})
992 999
  end