From fc16f001b795acdb33857cc84b6f6266645c97f1 Mon Sep 17 00:00:00 2001 From: K.Takata Date: Mon, 9 Jan 2012 04:17:06 +0900 Subject: [PATCH] support for Ruby 1.9.3 compatible \b, \B and POSIX brackets. \b, \B and POSIX brackets use Unicode rules in Ruby 1.9.3. ONIG_OPTION_WORD_BOUND_ALL_RANGE and ONIG_OPTION_POSIX_BRACKET_ALL_RANGE are added to support these features. With ONIG_SYNTAX_RUBY, character set modifiers work as follows: /d: Default \d, \s and \w match in the ASCII range only. \b, \B and POSIX brackets use Unicode rules. /a: ASCII \d, \s, \w and POSIX brackets match in the ASCII range only. \b and \B use ASCII rules. /u: Unicode \d, \s, \w and POSIX brackets match in the entire Unicode range. \b and \B use Unicode rules. Cherry-picked from commit 'b8f4216'. Conflicts: regparse.c --- oniguruma.h | 7 ++++--- regint.h | 2 ++ regparse.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/oniguruma.h b/oniguruma.h index ffc0c7c..233334f 100644 --- a/oniguruma.h +++ b/oniguruma.h @@ -367,8 +367,10 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) /* options (ctype range) */ #define ONIG_OPTION_ASCII_RANGE (ONIG_OPTION_POSIX_REGION << 1) +#define ONIG_OPTION_POSIX_BRACKET_ALL_RANGE (ONIG_OPTION_ASCII_RANGE << 1) +#define ONIG_OPTION_WORD_BOUND_ALL_RANGE (ONIG_OPTION_POSIX_BRACKET_ALL_RANGE << 1) /* options (newline) */ -#define ONIG_OPTION_NEWLINE_CRLF (ONIG_OPTION_ASCII_RANGE << 1) +#define ONIG_OPTION_NEWLINE_CRLF (ONIG_OPTION_WORD_BOUND_ALL_RANGE << 1) #define ONIG_OPTION_MAXBIT ONIG_OPTION_NEWLINE_CRLF /* limit */ #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) @@ -452,7 +454,7 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1U<<0) /* \Q...\E */ #define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1U<<1) /* (?...) 
*/ #define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsxadlu), (?-imsx), (?^imsxalu) */ -#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imx), (?-imx) */ +#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imxadu), (?-imx) */ #define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1U<<4) /* ?+,*+,++ */ #define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1U<<5) /* {n,m}+ */ #define ONIG_SYN_OP2_CCLASS_SET_OP (1U<<6) /* [...&&..[..]..] */ @@ -497,7 +499,6 @@ ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL (1U<<10) /* (?)(?)(?&x) */ /* syntax (behavior) in char class [...] */ -#define ONIG_SYN_POSIX_BRACKET_ALWAYS_ALL_RANGE (1U<<19) /* (?a) doesn't affect POSIX brackets */ #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */ #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) diff --git a/regint.h b/regint.h index 25e499f..67650df 100644 --- a/regint.h +++ b/regint.h @@ -370,6 +370,8 @@ typedef unsigned int BitStatusType; #define IS_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL) #define IS_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION) #define IS_ASCII_RANGE(option) ((option) & ONIG_OPTION_ASCII_RANGE) +#define IS_POSIX_BRACKET_ALL_RANGE(option) ((option) & ONIG_OPTION_POSIX_BRACKET_ALL_RANGE) +#define IS_WORD_BOUND_ALL_RANGE(option) ((option) & ONIG_OPTION_WORD_BOUND_ALL_RANGE) #define IS_NEWLINE_CRLF(option) ((option) & ONIG_OPTION_NEWLINE_CRLF) /* OP_SET_OPTION is required for these options. 
diff --git a/regparse.c b/regparse.c index 3cb6bd4..9d0f047 100644 --- a/regparse.c +++ b/regparse.c @@ -63,9 +63,9 @@ const OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_CC_DUP | - ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT | - ONIG_SYN_POSIX_BRACKET_ALWAYS_ALL_RANGE ) - , ONIG_OPTION_ASCII_RANGE + ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) + , ( ONIG_OPTION_ASCII_RANGE | ONIG_OPTION_POSIX_BRACKET_ALL_RANGE | + ONIG_OPTION_WORD_BOUND_ALL_RANGE ) , { (OnigCodePoint )'\\' /* esc */ @@ -3429,14 +3429,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; tok->type = TK_ANCHOR; tok->u.anchor.subtype = ANCHOR_WORD_BOUND; - tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); + tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) + && ! IS_WORD_BOUND_ALL_RANGE(env->option); break; case 'B': if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; tok->type = TK_ANCHOR; tok->u.anchor.subtype = ANCHOR_NOT_WORD_BOUND; - tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option); + tok->u.anchor.ascii_range = IS_ASCII_RANGE(env->option) + && ! 
IS_WORD_BOUND_ALL_RANGE(env->option); break; #ifdef USE_WORD_BEGIN_END @@ -4263,7 +4265,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, - IS_SYNTAX_BV(env->syntax, ONIG_SYN_POSIX_BRACKET_ALWAYS_ALL_RANGE), + IS_POSIX_BRACKET_ALL_RANGE(env->option), env); if (r != 0) return r; @@ -5032,6 +5034,18 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, ONOFF(option, ONIG_OPTION_EXTEND, 1); PFETCH(c); } +#if 0 + else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { + /* d-imx */ + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); + ONOFF(option, ONIG_OPTION_IGNORECASE, 1); + ONOFF(option, ONIG_OPTION_MULTILINE, 1); + ONOFF(option, ONIG_OPTION_EXTEND, 1); + PFETCH(c); + } +#endif else { return ONIGERR_UNDEFINED_GROUP_OPTION; } @@ -5082,6 +5096,8 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); } else return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -5092,12 +5108,29 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + + case 'd': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && + (neg == 0)) { + ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); + } + else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) && + (neg == 0)) { + ONOFF(option, 
ONIG_OPTION_ASCII_RANGE, 0); + ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); + ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); } else return ONIGERR_UNDEFINED_GROUP_OPTION; break; - case 'd': case 'l': + case 'l': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); } -- 1.7.5.1