Index: lib/csv.rb
===================================================================
--- lib/csv.rb (revision 29650)
+++ lib/csv.rb (working copy)
@@ -154,11 +154,11 @@
# CSV(csv = "") { |csv_str| csv_str << %w{my data here} } # to a String
# CSV($stderr) { |csv_err| csv_err << %w{my data here} } # to $stderr
# CSV($stdin) { |csv_in| csv_in.each { |row| p row } } # from $stdin
-#
+#
# == Advanced Usage
-#
+#
# === Wrap an IO Object
-#
+#
# csv = CSV.new(io, options)
# # ... read (with gets() or each()) from and write (with <<) to csv here ...
#
@@ -836,7 +836,7 @@
#
# This method assumes you want the Table.headers(), unless you explicitly
# pass :write_headers => false.
- #
+ #
def to_csv(options = Hash.new)
wh = options.fetch(:write_headers, true)
@table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row|
@@ -965,6 +965,7 @@
# :row_sep:: :auto
# :quote_char:: '"'
# :field_size_limit:: +nil+
+ # :io_read_limit:: 2048
# :converters:: +nil+
# :unconverted_fields:: +nil+
# :headers:: +false+
@@ -977,6 +978,7 @@
row_sep: :auto,
quote_char: '"',
field_size_limit: nil,
+ io_read_limit: 2048,
converters: nil,
unconverted_fields: nil,
headers: false,
@@ -1586,6 +1588,7 @@
# track our own lineno since IO gets confused about line-ends is CSV fields
@lineno = 0
+ @data_buf = nil
end
#
@@ -1681,6 +1684,7 @@
def rewind
@headers = nil
@lineno = 0
+ @data_buf = nil
@io.rewind
end
@@ -1798,11 +1802,6 @@
# The data source must be open for reading.
#
def shift
- #########################################################################
- ### This method is purposefully kept a bit long as simple conditional ###
- ### checks are faster than numerous (expensive) method calls. ###
- #########################################################################
-
# handle headers not based on document content
if header_row? and @return_headers and
[Array, String].include? @use_headers.class
@@ -1813,124 +1812,37 @@
end
end
- #
- # it can take multiple calls to @io.gets() to get a full line,
- # because of \r and/or \n characters embedded in quoted fields
- #
- in_extended_col = false
- csv = Array.new
+ @lineno += 1
+ csv = parse_csv_row
+ return unless csv
- loop do
- # add another read to the line
- unless parse = @io.gets(@row_sep)
- return nil
+ if csv == [nil]
+ if @skip_blanks
+ return shift
+ elsif @unconverted_fields
+ return add_unconverted_fields(Array.new, Array.new)
+ elsif @use_headers
+ return self.class::Row.new(Array.new, Array.new)
+ else
+ return Array.new
end
+ end
- parse.sub!(@parsers[:line_end], "")
+ # save unconverted fields, if needed...
+ unconverted = csv.dup if @unconverted_fields
- if csv.empty?
- #
- # I believe a blank line should be an Array.new, not Ruby 1.8
- # CSV's [nil]
- #
- if parse.empty?
- @lineno += 1
- if @skip_blanks
- next
- elsif @unconverted_fields
- return add_unconverted_fields(Array.new, Array.new)
- elsif @use_headers
- return self.class::Row.new(Array.new, Array.new)
- else
- return Array.new
- end
- end
- end
+ # convert fields, if needed...
+ csv = convert_fields(csv) unless @use_headers or @converters.empty?
+ # parse out header rows and handle CSV::Row conversions...
+ csv = parse_headers(csv) if @use_headers
- parts = parse.split(@col_sep, -1)
- if parts.empty?
- if in_extended_col
- csv[-1] << @col_sep # will be replaced with a @row_sep after the parts.each loop
- else
- csv << nil
- end
- end
+ # inject unconverted fields and accessor, if requested...
+ if @unconverted_fields and not csv.respond_to? :unconverted_fields
+ add_unconverted_fields(csv, unconverted)
+ end
- # This loop is the hot path of csv parsing. Some things may be non-dry
- # for a reason. Make sure to benchmark when refactoring.
- parts.each do |part|
- if in_extended_col
- # If we are continuing a previous column
- if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0
- # extended column ends
- csv.last << part[0..-2]
- raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
- csv.last.gsub!(@quote_char * 2, @quote_char)
- in_extended_col = false
- else
- csv.last << part
- csv.last << @col_sep
- end
- elsif part[0] == @quote_char
- # If we are staring a new quoted column
- if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0
- # start an extended column
- csv << part[1..-1]
- csv.last << @col_sep
- in_extended_col = true
- else
- # regular quoted column
- csv << part[1..-2]
- raise MalformedCSVError if csv.last =~ @parsers[:stray_quote]
- csv.last.gsub!(@quote_char * 2, @quote_char)
- end
- elsif part =~ @parsers[:quote_or_nl]
- # Unquoted field with bad characters.
- if part =~ @parsers[:nl_or_lf]
- raise MalformedCSVError, "Unquoted fields do not allow " +
- "\\r or \\n (line #{lineno + 1})."
- else
- raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
- end
- else
- # Regular ole unquoted field.
- csv << (part.empty? ? nil : part)
- end
- end
-
- # Replace tacked on @col_sep with @row_sep if we are still in an extended
- # column.
- csv[-1][-1] = @row_sep if in_extended_col
-
- if in_extended_col
- # if we're at eof?(), a quoted field wasn't closed...
- if @io.eof?
- raise MalformedCSVError,
- "Unclosed quoted field on line #{lineno + 1}."
- elsif @field_size_limit and csv.last.size >= @field_size_limit
- raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
- end
- # otherwise, we need to loop and pull some more data to complete the row
- else
- @lineno += 1
-
- # save fields unconverted fields, if needed...
- unconverted = csv.dup if @unconverted_fields
-
- # convert fields, if needed...
- csv = convert_fields(csv) unless @use_headers or @converters.empty?
- # parse out header rows and handle CSV::Row conversions...
- csv = parse_headers(csv) if @use_headers
-
- # inject unconverted fields and accessor, if requested...
- if @unconverted_fields and not csv.respond_to? :unconverted_fields
- add_unconverted_fields(csv, unconverted)
- end
-
- # return the results
- break csv
- end
- end
+ # return the results
+ csv
end
alias_method :gets, :shift
alias_method :readline, :shift
@@ -1976,7 +1888,122 @@
private
+ def parse_csv_row # parses one row into an Array of fields; returns nil when no input is left
+ buf = io_get_unquoted
+ return unless buf
+ line = [] # fields collected so far for the current row
+
+ loop do
+ case buf
+ when @quote_char # buf is exactly one quote character: a quoted field starts here
+ line << io_get_quoted
+ buf = io_get_unquoted # whatever follows the closing quote
+
+ case buf
+ when nil, @row_sep # input ended, or the row ended right after the quoted field
+ return line
+ when @col_sep # only a column separator is left: the row ends with an empty field
+ return line << nil
+ end
+
+ break unless buf.slice!(0, @col_sep.size) == @col_sep # a closed quoted field must be followed by @col_sep
+ when @row_sep # nothing but a row separator: a single empty (nil) field
+ return line << nil
+ else
+ newline = buf.chomp! @row_sep # chomp! returns nil when buf did not end with @row_sep
+
+ if buf.count(@nl_lf) > 0 # any character of @nl_lf still present after the chomp is an error
+ raise MalformedCSVError, "Unquoted fields do not allow " +
+ "\\r or \\n (line #{@lineno})."
+ end
+
+ buf.split(@col_sep, -1).each{ |c| line << (c.empty? ? nil : c) } # -1 keeps trailing empties; empty fields become nil
+
+ if newline # the row separator was seen, so the row is complete
+ return line
+ elsif line.last == @quote_char # last piece is a lone quote: take it back and parse a quoted field next
+ buf = line.pop
+ elsif data_buf_eof? # input ended without a trailing row separator
+ return line
+ else # chunk ended some other way (e.g. a quote inside an unquoted field)
+ break
+ end
+ end
+ end
+
+ raise MalformedCSVError, "Illegal quoting on line #{@lineno}." # reached only through the `break`s above
+ end
+
+ # Read @io and return everything until the next unescaped quote character.
+ # Escaped quote characters are automatically unescaped. Raises an error if we
+ # hit @io.eof? or @field_size_limit without encountering an ending quote.
#
+ # Only successfully returns if the read ended with a @quote_char. This
+ # prevents us from returning only part of a multibyte character.
+  def io_get_quoted
+    # gets returns nil when the opening quote was the last byte of input; the
+    # `buf &&` guard below turns that into MalformedCSVError, not NoMethodError.
+    buf = @io.gets @quote_char, @field_size_limit
+    while buf && buf.chomp!(@quote_char) do
+      @data_buf = @io.gets @quote_char, @io_read_limit # look at what follows the quote
+      return buf unless @data_buf == @quote_char        # not a doubled quote: field is closed
+      # a doubled quote is an escaped quote; the field cannot end at EOF like this
+      break if @io.eof?
+      buf << @quote_char + @io.gets(@quote_char, @field_size_limit)
+    end
+    raise MalformedCSVError, "Illegal quoting on line #{@lineno}." # no closing quote found
+  end
+
+ # Read @io and return everything until we hit a newline or a quote character.
+ # Raise an error if we exceed @field_size_limit.
+ #
+ # Only successfully returns if the read contains @row_sep, ends with a
+ # @quote_char, or reaches @io.eof? This prevents us from returning only part
+ # of a multibyte character.
+ #
+ # If we have multibyte encoding then it is possible that a @quote_char will be
+ # truncated on @io.gets. We only check for this if we exhaust the data buffer.
+ def io_get_unquoted # returns the next chunk of unquoted data, or nil when @io.gets has nothing left
+ unless @data_buf # refill only when nothing is left over from an earlier read
+ return unless @data_buf = @io.gets(@quote_char, @io_read_limit)
+ end
+
+ loop do
+ if newline = @data_buf.index(@row_sep)
+ break if newline == @data_buf.size - @row_sep.size # separator ends the buffer: hand back all of it below
+ return @data_buf.slice!(0, newline + @row_sep.size) # hand back one row's worth, keep the rest buffered
+ end
+
+ break if @io.eof? || @data_buf.end_with?(@quote_char) # nothing more to read, or a quote needs handling
+
+ if @field_size_limit && @data_buf.size > @field_size_limit
+ raise MalformedCSVError, "Field size exceeded on line #{@lineno}."
+ end
+
+ @data_buf += @io.gets(@quote_char, @io_read_limit) unless align_data_buf # read on only if align_data_buf did not extend the buffer
+ end
+
+ return_buf = @data_buf # hand over the whole buffer and reset @data_buf to nil
+ @data_buf = nil
+ return_buf
+ end
+
+ # Fetch up to 10 bytes into @data_buf until the string matches its encoding.
+ # Returns a value only if we modified the buffer.
+ def align_data_buf
+ return if @data_buf.valid_encoding? || @io.eof? # already valid, or nothing left to read: return nil, buffer untouched
+
+ 10.times do # if the loop is never broken, the method returns 10.times' own result (10, truthy)
+ break true if @data_buf.valid_encoding? || @io.eof?
+ @data_buf += @io.read(1).force_encoding(raw_encoding) # append one byte, force_encoding'd to whatever raw_encoding returns
+ end
+ end
+
+  def data_buf_eof? # true only when @io is at EOF and no buffered data is pending
+    @io.eof? and not @data_buf
+  end
+
+ #
# Stores the indicated separators for later use.
#
# If auto-discovery was requested for @row_sep, this method will read
@@ -2075,21 +2102,8 @@
# store the parser behaviors
@skip_blanks = options.delete(:skip_blanks)
@field_size_limit = options.delete(:field_size_limit)
-
- # prebuild Regexps for faster parsing
- esc_row_sep = escape_re(@row_sep)
- esc_quote = escape_re(@quote_char)
- @parsers = {
- # for detecting parse errors
- quote_or_nl: encode_re("[", esc_quote, "\r\n]"),
- nl_or_lf: encode_re("[\r\n]"),
- stray_quote: encode_re( "[^", esc_quote, "]", esc_quote,
- "[^", esc_quote, "]" ),
- # safer than chomp!()
- line_end: encode_re(esc_row_sep, "\\z"),
- # illegal unquoted characters
- return_newline: encode_str("\r\n")
- }
+ @io_read_limit = options.delete(:io_read_limit)
+ @nl_lf = encode_str("\r\n")
end
#