Project

General

Profile

Feature #4017 » ruby_19_csv_parser_split_methods.patch

Patch 1/2 - ender672 (Timothy Elliott), 11/03/2010 09:43 AM

View differences:

lib/csv.rb (working copy)
154 154
#   CSV(csv = "")   { |csv_str| csv_str << %w{my data here} }  # to a String
155 155
#   CSV($stderr)    { |csv_err| csv_err << %w{my data here} }  # to $stderr
156 156
#   CSV($stdin)     { |csv_in|  csv_in.each { |row| p row } }  # from $stdin
157
# 
157
#
158 158
# == Advanced Usage
159
# 
159
#
160 160
# === Wrap an IO Object
161
# 
161
#
162 162
#   csv = CSV.new(io, options)
163 163
#   # ... read (with gets() or each()) from and write (with <<) to csv here ...
164 164
#
......
836 836
    #
837 837
    # This method assumes you want the Table.headers(), unless you explicitly
838 838
    # pass <tt>:write_headers => false</tt>.
839
    # 
839
    #
840 840
    def to_csv(options = Hash.new)
841 841
      wh = options.fetch(:write_headers, true)
842 842
      @table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row|
......
965 965
  # <b><tt>:row_sep</tt></b>::            <tt>:auto</tt>
966 966
  # <b><tt>:quote_char</tt></b>::         <tt>'"'</tt>
967 967
  # <b><tt>:field_size_limit</tt></b>::   +nil+
968
  # <b><tt>:io_read_limit</tt></b>::      <tt>2048</tt>
968 969
  # <b><tt>:converters</tt></b>::         +nil+
969 970
  # <b><tt>:unconverted_fields</tt></b>:: +nil+
970 971
  # <b><tt>:headers</tt></b>::            +false+
......
977 978
                      row_sep:            :auto,
978 979
                      quote_char:         '"',
979 980
                      field_size_limit:   nil,
981
                      io_read_limit:      2048,
980 982
                      converters:         nil,
981 983
                      unconverted_fields: nil,
982 984
                      headers:            false,
......
1586 1588

  
1587 1589
    # track our own lineno since IO gets confused about line-ends in CSV fields
1588 1590
    @lineno = 0
1591
    @data_buf = nil
1589 1592
  end
1590 1593

  
1591 1594
  #
......
1681 1684
  def rewind
1682 1685
    @headers = nil
1683 1686
    @lineno  = 0
1687
    @data_buf = nil
1684 1688

  
1685 1689
    @io.rewind
1686 1690
  end
......
1798 1802
  # The data source must be open for reading.
1799 1803
  #
1800 1804
  def shift
1801
    #########################################################################
1802
    ### This method is purposefully kept a bit long as simple conditional ###
1803
    ### checks are faster than numerous (expensive) method calls.         ###
1804
    #########################################################################
1805

  
1806 1805
    # handle headers not based on document content
1807 1806
    if header_row? and @return_headers and
1808 1807
       [Array, String].include? @use_headers.class
......
1813 1812
      end
1814 1813
    end
1815 1814

  
1816
    #
1817
    # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
1818
    # because of \r and/or \n characters embedded in quoted fields
1819
    #
1820
    in_extended_col = false
1821
    csv             = Array.new
1815
    @lineno += 1
1816
    csv = parse_csv_row
1817
    return unless csv
1822 1818

  
1823
    loop do
1824
      # add another read to the line
1825
      unless parse = @io.gets(@row_sep)
1826
        return nil
1819
    if csv == [nil]
1820
      if @skip_blanks
1821
        return shift
1822
      elsif @unconverted_fields
1823
        return add_unconverted_fields(Array.new, Array.new)
1824
      elsif @use_headers
1825
        return self.class::Row.new(Array.new, Array.new)
1826
      else
1827
        return Array.new
1827 1828
      end
1829
    end
1828 1830

  
1829
      parse.sub!(@parsers[:line_end], "")
1831
    # save unconverted fields, if needed...
1832
    unconverted = csv.dup if @unconverted_fields
1830 1833

  
1831
      if csv.empty?
1832
        #
1833
        # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
1834
        # CSV's <tt>[nil]</tt>
1835
        #
1836
        if parse.empty?
1837
          @lineno += 1
1838
          if @skip_blanks
1839
            next
1840
          elsif @unconverted_fields
1841
            return add_unconverted_fields(Array.new, Array.new)
1842
          elsif @use_headers
1843
            return self.class::Row.new(Array.new, Array.new)
1844
          else
1845
            return Array.new
1846
          end
1847
        end
1848
      end
1834
    # convert fields, if needed...
1835
    csv = convert_fields(csv) unless @use_headers or @converters.empty?
1836
    # parse out header rows and handle CSV::Row conversions...
1837
    csv = parse_headers(csv)  if     @use_headers
1849 1838

  
1850
      parts =  parse.split(@col_sep, -1)
1851
      if parts.empty?
1852
        if in_extended_col
1853
          csv[-1] << @col_sep   # will be replaced with a @row_sep after the parts.each loop
1854
        else
1855
          csv << nil
1856
        end
1857
      end
1839
    # inject unconverted fields and accessor, if requested...
1840
    if @unconverted_fields and not csv.respond_to? :unconverted_fields
1841
      add_unconverted_fields(csv, unconverted)
1842
    end
1858 1843

  
1859
      # This loop is the hot path of csv parsing. Some things may be non-dry
1860
      # for a reason. Make sure to benchmark when refactoring.
1861
      parts.each do |part|
1862
        if in_extended_col
1863
          # If we are continuing a previous column
1864
          if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
1865
            # extended column ends
1866
            csv.last << part[0..-2]
1867
            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
1868
            csv.last.gsub!(@quote_char * 2, @quote_char)
1869
            in_extended_col = false
1870
          else
1871
            csv.last << part
1872
            csv.last << @col_sep
1873
          end
1874
        elsif part[0] == @quote_char
1875
          # If we are starting a new quoted column
1876
          if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
1877
            # start an extended column
1878
            csv             << part[1..-1]
1879
            csv.last        << @col_sep
1880
            in_extended_col =  true
1881
          else
1882
            # regular quoted column
1883
            csv << part[1..-2]
1884
            raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
1885
            csv.last.gsub!(@quote_char * 2, @quote_char)
1886
          end
1887
        elsif part =~ @parsers[:quote_or_nl]
1888
          # Unquoted field with bad characters.
1889
          if part =~ @parsers[:nl_or_lf]
1890
            raise MalformedCSVError, "Unquoted fields do not allow " +
1891
                                     "\\r or \\n (line #{lineno + 1})."
1892
          else
1893
            raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
1894
          end
1895
        else
1896
          # Regular ole unquoted field.
1897
          csv << (part.empty? ? nil : part)
1898
        end
1899
      end
1900

  
1901
      # Replace tacked on @col_sep with @row_sep if we are still in an extended
1902
      # column.
1903
      csv[-1][-1] = @row_sep if in_extended_col
1904

  
1905
      if in_extended_col
1906
        # if we're at eof?(), a quoted field wasn't closed...
1907
        if @io.eof?
1908
          raise MalformedCSVError,
1909
                "Unclosed quoted field on line #{lineno + 1}."
1910
        elsif @field_size_limit and csv.last.size >= @field_size_limit
1911
          raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
1912
        end
1913
        # otherwise, we need to loop and pull some more data to complete the row
1914
      else
1915
        @lineno += 1
1916

  
1917
        # save unconverted fields, if needed...
1918
        unconverted = csv.dup if @unconverted_fields
1919

  
1920
        # convert fields, if needed...
1921
        csv = convert_fields(csv) unless @use_headers or @converters.empty?
1922
        # parse out header rows and handle CSV::Row conversions...
1923
        csv = parse_headers(csv)  if     @use_headers
1924

  
1925
        # inject unconverted fields and accessor, if requested...
1926
        if @unconverted_fields and not csv.respond_to? :unconverted_fields
1927
          add_unconverted_fields(csv, unconverted)
1928
        end
1929

  
1930
        # return the results
1931
        break csv
1932
      end
1933
    end
1844
    # return the results
1845
    csv
1934 1846
  end
1935 1847
  alias_method :gets,     :shift
1936 1848
  alias_method :readline, :shift
......
1976 1888

  
1977 1889
  private
1978 1890

  
1891
  def parse_csv_row
1892
    buf = io_get_unquoted
1893
    return unless buf
1894
    line = []
1895

  
1896
    loop do
1897
      case buf
1898
      when @quote_char
1899
        line << io_get_quoted
1900
        buf = io_get_unquoted
1901

  
1902
        case buf
1903
        when nil, @row_sep
1904
          return line
1905
        when @col_sep
1906
          return line << nil
1907
        end
1908

  
1909
        break unless buf.slice!(0, @col_sep.size) == @col_sep
1910
      when @row_sep
1911
        return line << nil
1912
      else
1913
        newline = buf.chomp! @row_sep
1914

  
1915
        if buf.count(@nl_lf) > 0
1916
          raise MalformedCSVError, "Unquoted fields do not allow " +
1917
                                   "\\r or \\n (line #{@lineno})."
1918
        end
1919

  
1920
        buf.split(@col_sep, -1).each{ |c| line << (c.empty? ? nil : c) }
1921

  
1922
        if newline
1923
          return line
1924
        elsif line.last == @quote_char
1925
          buf = line.pop
1926
        elsif data_buf_eof?
1927
          return line
1928
        else
1929
          break
1930
        end
1931
      end
1932
    end
1933

  
1934
    raise MalformedCSVError, "Illegal quoting on line #{@lineno}."
1935
  end
1936

  
1937
  # Read @io and return everything until the next unescaped quote character.
1938
  # Escaped quote characters are automatically unescaped. Raises an error if we
1939
  # hit @io.eof? or @field_size_limit without encountering an ending quote.
1979 1940
  #
1941
  # Only successfully returns if the read ended with a @quote_char. This
1942
  # prevents us from returning only part of a multibyte character.
1943
  def io_get_quoted
1944
    buf = @io.gets @quote_char, @field_size_limit
1945

  
1946
    while buf.chomp!(@quote_char) do
1947
      @data_buf = @io.gets @quote_char, @io_read_limit
1948
      return buf unless @data_buf == @quote_char
1949

  
1950
      break if @io.eof?
1951
      buf << @quote_char + @io.gets(@quote_char, @field_size_limit)
1952
    end
1953

  
1954
    raise MalformedCSVError, "Illegal quoting on line #{@lineno}."
1955
  end
1956

  
1957
  # Read @io and return everything until we hit a newline or a quote character.
1958
  # Raise an error if we exceed @field_size_limit.
1959
  #
1960
  # Only successfully returns if the read contains @row_sep, ends with a
1961
  # @quote_char, or reaches @io.eof? This prevents us from returning only part
1962
  # of a multibyte character.
1963
  #
1964
  # If we have multibyte encoding then it is possible that a @quote_char will be
1965
  # truncated on @io.gets. We only check for this if we exhaust the data buffer.
1966
  def io_get_unquoted
1967
    unless @data_buf
1968
      return unless @data_buf = @io.gets(@quote_char, @io_read_limit)
1969
    end
1970

  
1971
    loop do
1972
      if newline = @data_buf.index(@row_sep)
1973
        break if newline == @data_buf.size - @row_sep.size
1974
        return @data_buf.slice!(0, newline + @row_sep.size)
1975
      end
1976

  
1977
      break if @io.eof? || @data_buf.end_with?(@quote_char)
1978

  
1979
      if @field_size_limit && @data_buf.size > @field_size_limit
1980
        raise MalformedCSVError, "Field size exceeded on line #{@lineno}."
1981
      end
1982

  
1983
      @data_buf += @io.gets(@quote_char, @io_read_limit) unless align_data_buf
1984
    end
1985

  
1986
    return_buf = @data_buf
1987
    @data_buf = nil
1988
    return_buf
1989
  end
1990

  
1991
  # Fetch up to 10 bytes into @data_buf until the string matches its encoding.
1992
  # Returns a value only if we modified the buffer.
1993
  def align_data_buf
1994
    return if @data_buf.valid_encoding? || @io.eof?
1995

  
1996
    10.times do
1997
      break true if @data_buf.valid_encoding? || @io.eof?
1998
      @data_buf += @io.read(1).force_encoding(raw_encoding)
1999
    end
2000
  end
2001

  
2002
  def data_buf_eof?
2003
    @io.eof? && !@data_buf
2004
  end
2005

  
2006
  #
1980 2007
  # Stores the indicated separators for later use.
1981 2008
  #
1982 2009
  # If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
......
2075 2102
    # store the parser behaviors
2076 2103
    @skip_blanks      = options.delete(:skip_blanks)
2077 2104
    @field_size_limit = options.delete(:field_size_limit)
2078

  
2079
    # prebuild Regexps for faster parsing
2080
    esc_row_sep = escape_re(@row_sep)
2081
    esc_quote   = escape_re(@quote_char)
2082
    @parsers = {
2083
      # for detecting parse errors
2084
      quote_or_nl:    encode_re("[", esc_quote, "\r\n]"),
2085
      nl_or_lf:       encode_re("[\r\n]"),
2086
      stray_quote:    encode_re( "[^", esc_quote, "]", esc_quote,
2087
                                 "[^", esc_quote, "]" ),
2088
      # safer than chomp!()
2089
      line_end:       encode_re(esc_row_sep, "\\z"),
2090
      # illegal unquoted characters
2091
      return_newline: encode_str("\r\n")
2092
    }
2105
    @io_read_limit    = options.delete(:io_read_limit)
2106
    @nl_lf            = encode_str("\r\n")
2093 2107
  end
2094 2108

  
2095 2109
  #