Index: lib/uri/common.rb =================================================================== --- lib/uri/common.rb (revision 24198) +++ lib/uri/common.rb (working copy) @@ -2,7 +2,7 @@ # # Author:: Akira Yamada # Revision:: $Id$ -# License:: +# License:: # You can redistribute it and/or modify it under the same term as Ruby. # @@ -20,12 +20,15 @@ # alpha = lowalpha | upalpha ALPHA = "a-zA-Z" + # digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | + # "8" | "9" + DIGIT = "0-9" # alphanum = alpha | digit - ALNUM = "#{ALPHA}\\d" + ALNUM = "#{ALPHA}#{DIGIT}" # hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | # "a" | "b" | "c" | "d" | "e" | "f" - HEX = "a-fA-F\\d" + HEX = "#{DIGIT}a-fA-F" # escaped = "%" hex hex ESCAPED = "%[#{HEX}]{2}" # mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | @@ -34,7 +37,7 @@ UNRESERVED = "-_.!~*'()#{ALNUM}" # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | # "$" | "," - # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + # reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | # "$" | "," | "[" | "]" (RFC 2732) RESERVED = ";/?:@&=+$,\\[\\]" @@ -100,114 +103,114 @@ def split(uri) case uri when '' - # null uri + # null uri when @regexp[:ABS_URI] - scheme, opaque, userinfo, host, port, - registry, path, query, fragment = $~[1..-1] + scheme, opaque, userinfo, host, port, + registry, path, query, fragment = $~[1..-1] + + # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + + # absoluteURI = scheme ":" ( hier_part | opaque_part ) + # hier_part = ( net_path | abs_path ) [ "?" query ] + # opaque_part = uric_no_slash *uric - # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + # abs_path = "/" path_segments + # net_path = "//" authority [ abs_path ] - # absoluteURI = scheme ":" ( hier_part | opaque_part ) - # hier_part = ( net_path | abs_path ) [ "?" 
query ] - # opaque_part = uric_no_slash *uric - - # abs_path = "/" path_segments - # net_path = "//" authority [ abs_path ] - - # authority = server | reg_name - # server = [ [ userinfo "@" ] hostport ] - - if !scheme - raise InvalidURIError, - "bad URI(absolute but no scheme): #{uri}" - end - if !opaque && (!path && (!host && !registry)) - raise InvalidURIError, - "bad URI(absolute but no path): #{uri}" - end + # authority = server | reg_name + # server = [ [ userinfo "@" ] hostport ] + + if !scheme + raise InvalidURIError, + "bad URI(absolute but no scheme): #{uri}" + end + if !opaque && (!path && (!host && !registry)) + raise InvalidURIError, + "bad URI(absolute but no path): #{uri}" + end when @regexp[:REL_URI] - scheme = nil - opaque = nil + scheme = nil + opaque = nil + + userinfo, host, port, registry, + rel_segment, abs_path, query, fragment = $~[1..-1] + if rel_segment && abs_path + path = rel_segment + abs_path + elsif rel_segment + path = rel_segment + elsif abs_path + path = abs_path + end + + # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + + # relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] - userinfo, host, port, registry, - rel_segment, abs_path, query, fragment = $~[1..-1] - if rel_segment && abs_path - path = rel_segment + abs_path - elsif rel_segment - path = rel_segment - elsif abs_path - path = abs_path - end - - # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] - - # relativeURI = ( net_path | abs_path | rel_path ) [ "?" 
query ] - - # net_path = "//" authority [ abs_path ] - # abs_path = "/" path_segments - # rel_path = rel_segment [ abs_path ] + # net_path = "//" authority [ abs_path ] + # abs_path = "/" path_segments + # rel_path = rel_segment [ abs_path ] - # authority = server | reg_name - # server = [ [ userinfo "@" ] hostport ] + # authority = server | reg_name + # server = [ [ userinfo "@" ] hostport ] else - raise InvalidURIError, "bad URI(is not URI?): #{uri}" + raise InvalidURIError, "bad URI(is not URI?): #{uri}" end path = '' if !path && !opaque # (see RFC2396 Section 5.2) ret = [ - scheme, - userinfo, host, port, # X - registry, # X - path, # Y - opaque, # Y - query, - fragment + scheme, + userinfo, host, port, # X + registry, # X + path, # Y + opaque, # Y + query, + fragment ] return ret end def parse(uri) - scheme, userinfo, host, port, - registry, path, opaque, query, fragment = self.split(uri) + scheme, userinfo, host, port, + registry, path, opaque, query, fragment = self.split(uri) if scheme && URI.scheme_list.include?(scheme.upcase) - URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, - registry, path, opaque, query, + URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port, + registry, path, opaque, query, fragment, self) else - Generic.new(scheme, userinfo, host, port, - registry, path, opaque, query, - fragment, self) + Generic.new(scheme, userinfo, host, port, + registry, path, opaque, query, + fragment, self) end end def join(*str) u = self.parse(str[0]) str[1 .. -1].each do |x| - u = u.merge(x) + u = u.merge(x) end u end def extract(str, schemes = nil, &block) if block_given? 
- str.scan(make_regexp(schemes)) { yield $& } - nil + str.scan(make_regexp(schemes)) { yield $& } + nil else - result = [] - str.scan(make_regexp(schemes)) { result.push $& } - result + result = [] + str.scan(make_regexp(schemes)) { result.push $& } + result end end def make_regexp(schemes = nil) unless schemes - @regexp[:ABS_URI_REF] + @regexp[:ABS_URI_REF] else - /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x + /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x end end @@ -262,7 +265,7 @@ # hostname = *( domainlabel "." ) toplabel [ "." ] unless hostname - ret[:HOSTNAME] = hostname = "(?:#{domlabel}\\.)*#{toplabel}\\.?" + ret[:HOSTNAME] = hostname = "(?:#{domlabel}\\.)*#{toplabel}\\.?" end # RFC 2373, APPENDIX B: @@ -276,7 +279,9 @@ # allowed too. Here is a replacement. # # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT - ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}" + v4digit = "[#{PATTERN::DIGIT}]{1,3}" + ipv4addr = "(?:#{v4digit}\\.){3}#{v4digit}" + ret[:IPV4ADDR] = ipv4addr # hex4 = 1*4HEXDIG hex4 = "[#{PATTERN::HEX}]{1,4}" # lastpart = hex4 | IPv4address @@ -298,9 +303,9 @@ # host = hostname | IPv4address | IPv6reference (RFC 2732) ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})" # port = *digit - port = '\d*' + ret[:PORT] = port = "[#{PATTERN::DIGIT}]*" # hostport = host [ ":" port ] - ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?" + hostport = "#{host}(?::#{port})?" # userinfo = *( unreserved | escaped | # ";" | ":" | "&" | "=" | "+" | "$" | "," ) @@ -329,7 +334,7 @@ ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+" # scheme = alpha *( alpha | digit | "+" | "-" | "." 
) - ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][-+.#{PATTERN::ALPHA}\\d]*" + ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][-+.#{PATTERN::ALPHA}#{PATTERN::DIGIT}]*" # abs_path = "/" path_segments ret[:ABS_PATH] = abs_path = "/#{path_segments}" @@ -352,23 +357,24 @@ ret[:URI_REF] = uri_ref = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?" ret[:X_ABS_URI] = " - (#{scheme}): (?# 1: scheme) + (#{scheme}): (?# 1: scheme) (?: - (#{opaque_part}) (?# 2: opaque) + (#{opaque_part}) (?# 2: opaque) | (?:(?: //(?: - (?:(?:(#{userinfo})@)? (?# 3: userinfo) - (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port) + (?:(?:(#{userinfo})@)? (?# 3: userinfo) + (?:(#{host}) (?# 4: host) + (?::([#{PATTERN::DIGIT}]*))?))? (?# 5: port) | - (#{reg_name}) (?# 6: registry) + (#{reg_name}) (?# 6: registry) ) | - (?!//)) (?# XXX: '//' is the mark for hostport) - (#{abs_path})? (?# 7: path) - )(?:\\?(#{query}))? (?# 8: query) + (?!//)) (?# XXX: '//' is the mark for hostport) + (#{abs_path})? (?# 7: path) + )(?:\\?(#{query}))? (?# 8: query) ) - (?:\\#(#{fragment}))? (?# 9: fragment) + (?:\\#(#{fragment}))? (?# 9: fragment) " ret[:X_REL_URI] = " @@ -376,18 +382,19 @@ (?: // (?: - (?:(#{userinfo})@)? (?# 1: userinfo) - (#{host})?(?::(\\d*))? (?# 2: host, 3: port) + (?:(#{userinfo})@)? (?# 1: userinfo) + (#{host})? (?# 2: host) + (?::([#{PATTERN::DIGIT}]*))? (?# 3: port) | - (#{reg_name}) (?# 4: registry) + (#{reg_name}) (?# 4: registry) ) ) | - (#{rel_segment}) (?# 5: rel_segment) + (#{rel_segment}) (?# 5: rel_segment) )? - (#{abs_path})? (?# 6: abs_path) - (?:\\?(#{query}))? (?# 7: query) - (?:\\#(#{fragment}))? (?# 8: fragment) + (#{abs_path})? (?# 6: abs_path) + (?:\\?(#{query}))? (?# 7: query) + (?:\\#(#{fragment}))? 
(?# 8: fragment) " ret @@ -457,7 +464,7 @@ end end else - raise ArgumentError, + raise ArgumentError, "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})" end tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase @@ -538,7 +545,7 @@ def self.scheme_list @@schemes end - + # # Base class for all URI exceptions. # @@ -579,7 +586,7 @@ # * Opaque # * Query # * Fragment - # + # # == Usage # # require 'uri' @@ -604,7 +611,7 @@ # == Description # # Creates one of the URI's subclasses instance from the string. - # + # # == Raises # # URI::InvalidURIError @@ -617,11 +624,11 @@ # uri = URI.parse("http://www.ruby-lang.org/") # p uri # # => # - # p uri.scheme - # # => "http" - # p uri.host - # # => "www.ruby-lang.org" - # + # p uri.scheme + # # => "http" + # p uri.host + # # => "www.ruby-lang.org" + # def self.parse(uri) DEFAULT_PARSER.parse(uri) end @@ -658,7 +665,7 @@ # # == Args # - # +str+:: + # +str+:: # String to extract URIs from. # +schemes+:: # Limit URI matching to a specific schemes. @@ -686,25 +693,25 @@ # # == Args # - # +match_schemes+:: + # +match_schemes+:: # Array of schemes. If given, resulting regexp matches to URIs # whose scheme is one of the match_schemes. - # + # # == Description # Returns a Regexp object which matches to URI-like strings. # The Regexp object returned by this method includes arbitrary # number of capture group (parentheses). Never rely on it's number. - # + # # == Usage # # require 'uri' # # # extract first URI from html_string # html_string.slice(URI.regexp) - # + # # # remove ftp URIs # html_string.sub(URI.regexp(['ftp']) - # + # # # You should not rely on the number of parentheses # html_string.scan(URI.regexp) do |*matches| # p $&